author     Dimitry Andric <dim@FreeBSD.org>    2021-06-13 19:31:46 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2021-06-13 19:37:19 +0000
commit     e8d8bef961a50d4dc22501cde4fb9fb0be1b2532 (patch)
tree       94f04805f47bb7c59ae29690d8952b6074fff602 /contrib/llvm-project/llvm/lib/Target
parent     bb130ff39747b94592cb26d71b7cb097b9a4ea6b (diff)
parent     b60736ec1405bb0a8dd40989f67ef4c93da068ab (diff)
download   src-e8d8bef961a50d4dc22501cde4fb9fb0be1b2532.tar.gz
           src-e8d8bef961a50d4dc22501cde4fb9fb0be1b2532.zip
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target'); each entry gives the file mode, the file path, and the number of lines changed
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64.h8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64.td200
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp401
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp57
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64Combine.td64
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp41
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp44
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp10
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FastISel.cpp18
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp575
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.h30
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp87
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp3151
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.h137
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrFormats.td130
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrGISel.td60
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp527
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.h52
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.td334
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp43
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp82
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h74
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp16
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp111
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.h18
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.td26
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp10
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td684
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA55.td339
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA57.td61
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td19
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA64FX.td3890
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedTSV110.td745
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64StackOffset.h151
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64StackTagging.cpp22
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp167
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64Subtarget.cpp22
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64Subtarget.h52
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SystemOperands.td253
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp57
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetMachine.h6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp196
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h52
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp596
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp45
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp106
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.h29
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp1677
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp336
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp523
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp704
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp187
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp29
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp92
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h15
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/GISel/select-saddo.mir158
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/GISel/select-ssubo.mir158
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp27
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp55
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp34
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp57
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h18
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp37
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/SVEInstrFormats.td557
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp58
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp9
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h20
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h99
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td197
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp58
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h22
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp31
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp21
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp16
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h7
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp162
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h24
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp18
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp856
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h41
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td76
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp39
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td32
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp10
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h3
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td39
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def12
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp12
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h38
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp122
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h23
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp601
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp237
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h24
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp226
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp1075
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp10
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h25
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp1101
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h62
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td159
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp195
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp1259
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h48
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp93
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp10
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp53
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp38
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp7
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp38
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h47
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp62
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp36
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h24
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h3
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h7
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp120
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp154
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp281
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp49
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp10
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp372
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp33
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp172
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h1204
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp252
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h23
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp9
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp365
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h106
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp63
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp66
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp29
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp1973
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td237
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td119
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp435
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h39
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/EXPInstructions.td125
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/EvergreenInstructions.td138
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td704
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp22
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp220
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h3
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNILPSched.cpp1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp21
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp13
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp39
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNProcessors.td44
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp256
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp38
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.h12
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp10
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h1064
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/InstCombineTables.td11
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp9
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp3
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp217
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h12
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp12
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h14
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp81
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h18
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp9
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp16
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td129
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp11
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp36
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Defines.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp20
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp16
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp13
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/R600FrameLowering.h4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp59
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp44
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.h4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Instructions.td8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp15
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp15
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp23
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp7
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp10
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Subtarget.h174
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp13
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp37
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h257
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp144
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp3
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixupVectorISel.cpp239
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp283
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp55
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp406
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.h6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp2013
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h56
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp163
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp241
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrFormats.td52
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp1551
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h104
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td552
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td570
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp52
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp316
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp38
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp109
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp72
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h65
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp42
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.h10
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp586
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegister.cpp24
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp59
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp208
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp76
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp17
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp20
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp56
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.h12
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp744
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h42
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td52
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp3
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td15
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp146
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp204
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td629
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp247
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h15
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp460
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h188
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp50
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h18
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td29
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td225
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td592
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td269
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td38
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARC/ARCISelLowering.cpp1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARC/ARCSubtarget.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARC/ARCSubtarget.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARC/ARCTargetMachine.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/A15SDOptimizer.cpp3
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARM.h5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARM.td109
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp48
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp725
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h221
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp21
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h10
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMBlockPlacement.cpp231
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp68
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.h6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp79
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp92
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp18
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMFeatures.h1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp13
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp190
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMHazardRecognizer.h46
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp731
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h46
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrFormats.td5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td92
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td801
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td147
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td13
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td51
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td51
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp449
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp1505
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp12
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td17
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMSLSHardening.cpp416
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMSchedule.td66
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA57.td147
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA57WriteRes.td11
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA9.td4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleM7.td488
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleR52.td4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleSwift.td4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp14
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h19
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp28
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.h6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp858
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h58
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp28
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp7
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h26
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp132
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h15
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp251
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredUtils.h157
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp503
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp23
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp460
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp47
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AVR/AVRDevices.td21
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp147
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AVR/AVRFrameLowering.cpp33
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AVR/AVRISelLowering.cpp129
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AVR/AVRISelLowering.h7
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AVR/AVRInstrInfo.td35
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AVR/AVRSubtarget.cpp6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AVR/AVRSubtarget.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AVR/AVRTargetMachine.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp14
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/BPF/BPF.h37
-rw-r--r--contrib/llvm-project/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp323
-rw-r--r--contrib/llvm-project/llvm/lib/Target/BPF/BPFAdjustOpt.cpp323
-rw-r--r--contrib/llvm-project/llvm/lib/Target/BPF/BPFCORE.h30
-rw-r--r--contrib/llvm-project/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp130
-rw-r--r--contrib/llvm-project/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/BPF/BPFISelLowering.cpp1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/BPF/BPFInstrFormats.td11
-rw-r--r--contrib/llvm-project/llvm/lib/Target/BPF/BPFInstrInfo.td190
-rw-r--r--contrib/llvm-project/llvm/lib/Target/BPF/BPFMIChecking.cpp78
-rw-r--r--contrib/llvm-project/llvm/lib/Target/BPF/BPFPreserveDIType.cpp122
-rw-r--r--contrib/llvm-project/llvm/lib/Target/BPF/BPFSubtarget.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/BPF/BPFSubtarget.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.cpp57
-rw-r--r--contrib/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.h4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/BPF/BTFDebug.cpp44
-rw-r--r--contrib/llvm-project/llvm/lib/Target/BPF/BTFDebug.h10
-rw-r--r--contrib/llvm-project/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.h1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp11
-rw-r--r--contrib/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/CSKY/CSKY.td32
-rw-r--r--contrib/llvm-project/llvm/lib/Target/CSKY/CSKYInstrFormats.td528
-rw-r--r--contrib/llvm-project/llvm/lib/Target/CSKY/CSKYInstrInfo.td108
-rw-r--r--contrib/llvm-project/llvm/lib/Target/CSKY/CSKYRegisterInfo.td182
-rw-r--r--contrib/llvm-project/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp68
-rw-r--r--contrib/llvm-project/llvm/lib/Target/CSKY/CSKYTargetMachine.h38
-rw-r--r--contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp69
-rw-r--r--contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h39
-rw-r--r--contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp45
-rw-r--r--contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCAsmInfo.cpp25
-rw-r--r--contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCAsmInfo.h29
-rw-r--r--contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.cpp71
-rw-r--r--contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.h61
-rw-r--r--contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp62
-rw-r--r--contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h48
-rw-r--r--contrib/llvm-project/llvm/lib/Target/CSKY/TargetInfo/CSKYTargetInfo.cpp20
-rw-r--r--contrib/llvm-project/llvm/lib/Target/CSKY/TargetInfo/CSKYTargetInfo.h20
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/BitTracker.cpp27
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/BitTracker.h17
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/Hexagon.h5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp76
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitTracker.h4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBlockRanges.h5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp16
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp12
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp41
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp91
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp13
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonFrameLowering.h5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp16
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp17
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp229
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp162
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.h909
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp738
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp7
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonInstrInfo.h4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp265
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.h24
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPatterns.td119
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td240
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPeephole.cpp11
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp23
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp89
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSubtarget.h30
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp71
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp47
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h11
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp1487
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp165
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.h139
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp19
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp3
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Hexagon/RDFDeadCode.cpp3
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Lanai/LanaiISelLowering.cpp5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Lanai/LanaiSubtarget.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Lanai/LanaiSubtarget.h3
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Lanai/LanaiTargetMachine.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h3
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.h1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp12
-rw-r--r--contrib/llvm-project/llvm/lib/Target/MSP430/MSP430Subtarget.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/MSP430/MSP430Subtarget.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/MSP430/MSP430TargetMachine.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h9
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/Mips.td2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsAsmPrinter.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsCallLowering.cpp82
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsCallLowering.h7
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp9
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp19
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsExpandPseudo.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsISelLowering.cpp21
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsISelLowering.h14
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsInstrFPU.td9
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsInstrInfo.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsInstrInfo.td2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp11
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp13
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsRegisterBankInfo.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp11
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.h5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsSEISelLowering.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsSchedule.td3
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsScheduleGeneric.td5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsScheduleP5600.td4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsSubtarget.cpp32
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsSubtarget.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsTargetMachine.cpp29
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Mips/MipsTargetMachine.h8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTX.h19
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp9
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp9
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp37
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.h3
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td32
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td36
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td68
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp9
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp3
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXSubtarget.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp31
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp257
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h3
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp40
-rw-r--r--contrib/llvm-project/llvm/lib/Target/NVPTX/NVVMReflect.cpp15
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp257
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp38
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp53
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.h40
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp92
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.cpp20
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.h28
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp27
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h39
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCRegisterBanks.td15
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp10
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp24
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp118
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h7
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h3
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp163
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h86
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp63
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h12
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp26
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h13
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp11
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/P9InstrResources.td2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPC.h63
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPC.td56
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp668
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp11
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCCState.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp92
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCallingConv.td28
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp26
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCFastISel.cpp19
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp497
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp835
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp3571
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.h126
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstr64Bit.td99
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrAltivec.td170
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrFormats.td55
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrHTM.td5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp1019
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.h106
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.td441
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrPrefix.td1735
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrQPX.td1212
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrSPE.td10
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrVSX.td375
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp40
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp14
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp278
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp34
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h37
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp140
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp240
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp161
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp170
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.h27
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.td106
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCScheduleP9.td11
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.cpp56
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.h44
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp49
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp111
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetMachine.h5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp305
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h11
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp30
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp9
-rw-r--r--contrib/llvm-project/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.h1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp657
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp30
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp (renamed from contrib/llvm-project/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp)65
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h406
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp39
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp15
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp7
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp (renamed from contrib/llvm-project/llvm/lib/Target/RISCV/Utils/RISCVMatInt.cpp)4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h (renamed from contrib/llvm-project/llvm/lib/Target/RISCV/Utils/RISCVMatInt.h)3
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp34
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCV.h5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCV.td92
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVCallLowering.cpp11
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVCallLowering.h6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVCleanupVSETVLI.cpp131
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp58
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp145
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.h5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp1125
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h28
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp1969
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.h125
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrFormats.td85
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td126
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp117
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.h21
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td141
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoB.td1058
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoC.td22
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoD.td22
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoF.td30
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoM.td6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoV.td765
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td4397
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td643
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td371
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp98
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp23
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.td389
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedRocket.td233
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedRocket32.td227
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedRocket64.td228
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td222
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedule.td2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSubtarget.cpp14
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSubtarget.h20
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSystemOperands.td2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp49
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetMachine.h5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.h223
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp158
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Sparc/LeonPasses.cpp7
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Sparc/LeonPasses.h9
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp30
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp52
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp77
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Sparc/SparcFrameLowering.cpp10
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Sparc/SparcFrameLowering.h5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Sparc/SparcISelLowering.cpp56
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Sparc/SparcInstr64Bit.td6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Sparc/SparcInstrFormats.td4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Sparc/SparcInstrInfo.td25
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Sparc/SparcSubtarget.cpp8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Sparc/SparcSubtarget.h4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/Sparc/SparcTargetMachine.cpp14
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp36
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp21
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp16
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFeatures.td2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp88
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.h12
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp44
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp74
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.h5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrFormats.td54
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp12
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.td104
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrVector.td8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZOperators.td35
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp14
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp22
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZSubtarget.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp10
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp33
-rw-r--r--contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/TargetLoweringObjectFile.cpp58
-rw-r--r--contrib/llvm-project/llvm/lib/Target/TargetMachine.cpp85
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp103
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp55
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/LVLGen.cpp137
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.h1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VE.h19
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VE.td5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VEAsmPrinter.cpp48
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VECallingConv.td138
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VEFrameLowering.cpp402
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VEFrameLowering.h9
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VEISelDAGToDAG.cpp40
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VEISelLowering.cpp2238
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VEISelLowering.h150
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VEInstrBuilder.h41
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VEInstrFormats.td89
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.cpp534
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.h32
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.td845
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VEInstrIntrinsicVL.gen.td1604
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VEInstrIntrinsicVL.td64
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VEInstrPatternsVec.td91
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VEInstrVec.td1510
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VEMCInstLower.cpp8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VERegisterInfo.cpp105
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VERegisterInfo.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VERegisterInfo.td104
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VESubtarget.cpp64
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VESubtarget.h21
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VETargetMachine.cpp32
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VETargetTransformInfo.h25
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VVPInstrInfo.td46
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VVPInstrPatternsVec.td71
-rw-r--r--contrib/llvm-project/llvm/lib/Target/VE/VVPNodes.def41
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp140
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp23
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp116
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h7
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp12
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h95
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp13
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp18
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp133
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h3
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp91
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp676
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp14
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp34
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp44
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp45
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISD.def5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp38
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp329
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td212
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td45
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp42
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td39
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td36
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td1199
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td64
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp325
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp110
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp49
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp7
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp22
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp44
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.cpp78
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h91
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp9
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp34
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp18
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h3
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp33
-rw-r--r--contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h28
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp1218
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp11
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp42
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h25
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp9
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp19
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp10
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp225
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp13
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp9
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86.h11
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86.td868
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.cpp10
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.h6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp23
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.cpp70
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.h6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.cpp10
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.td15
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86CmovConversion.cpp6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86CondBrFolding.cpp579
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp48
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp31
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp77
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp96
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86FixupBWInsts.cpp5
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp15
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86FixupSetCC.cpp17
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp168
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h23
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp648
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp4957
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h60
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86IndirectThunks.cpp1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InsertPrefetch.cpp8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InsertWait.cpp6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp2017
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InstrAMX.td54
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td734
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td9
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td181
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA.td22
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InstrFPStack.td8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.cpp14
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td6
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td36
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp255
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h32
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td150
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InstrKL.td86
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InstrMMX.td2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InstrSNP.td47
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td205
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InstrSVM.td28
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td14
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InstrTDX.td39
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InstructionSelector.cpp99
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp24
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86IntrinsicsInfo.h40
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.cpp35
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp184
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp81
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86LowerAMXType.cpp351
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp69
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86PartialReduction.cpp9
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86PreTileConfig.cpp265
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.cpp139
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.h14
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.td12
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp53
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp17
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.cpp102
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.h68
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp117
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.cpp8
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp358
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.h26
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86TileConfig.cpp248
-rw-r--r--contrib/llvm-project/llvm/lib/Target/X86/X86WinEHState.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.h1
-rw-r--r--contrib/llvm-project/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/XCore/XCoreISelLowering.cpp18
-rw-r--r--contrib/llvm-project/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp56
-rw-r--r--contrib/llvm-project/llvm/lib/Target/XCore/XCoreSubtarget.cpp4
-rw-r--r--contrib/llvm-project/llvm/lib/Target/XCore/XCoreSubtarget.h2
-rw-r--r--contrib/llvm-project/llvm/lib/Target/XCore/XCoreTargetMachine.cpp4
912 files changed, 90814 insertions, 35521 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64.h b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64.h
index fd35b530e3ce..d2170a99e0a2 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64.h
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64.h
@@ -58,8 +58,10 @@ ModulePass *createSVEIntrinsicOptsPass();
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &,
AArch64Subtarget &, AArch64RegisterBankInfo &);
-FunctionPass *createAArch64PreLegalizeCombiner(bool IsOptNone);
-FunctionPass *createAArch64PostLegalizeCombiner(bool IsOptNone);
+FunctionPass *createAArch64PreLegalizerCombiner(bool IsOptNone);
+FunctionPass *createAArch64PostLegalizerCombiner(bool IsOptNone);
+FunctionPass *createAArch64PostLegalizerLowering();
+FunctionPass *createAArch64PostSelectOptimize();
FunctionPass *createAArch64StackTaggingPass(bool IsOptNone);
FunctionPass *createAArch64StackTaggingPreRAPass();
@@ -80,6 +82,8 @@ void initializeAArch64LoadStoreOptPass(PassRegistry&);
void initializeAArch64SIMDInstrOptPass(PassRegistry&);
void initializeAArch64PreLegalizerCombinerPass(PassRegistry&);
void initializeAArch64PostLegalizerCombinerPass(PassRegistry &);
+void initializeAArch64PostLegalizerLoweringPass(PassRegistry &);
+void initializeAArch64PostSelectOptimizePass(PassRegistry &);
void initializeAArch64PromoteConstantPass(PassRegistry&);
void initializeAArch64RedundantCopyEliminationPass(PassRegistry&);
void initializeAArch64StorePairSuppressPass(PassRegistry&);
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64.td
index 534af9686af0..762855207d2b 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64.td
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64.td
@@ -61,6 +61,9 @@ def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true",
"Enable ARMv8.1 Large System Extension (LSE) atomic instructions">;
+def FeatureOutlineAtomics : SubtargetFeature<"outline-atomics", "OutlineAtomics", "true",
+ "Enable out of line atomics to support LSE instructions">;
+
def FeatureRDM : SubtargetFeature<"rdm", "HasRDM", "true",
"Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions">;
@@ -72,9 +75,11 @@ def FeatureLOR : SubtargetFeature<
"lor", "HasLOR", "true",
"Enables ARM v8.1 Limited Ordering Regions extension">;
-def FeatureVH : SubtargetFeature<
- "vh", "HasVH", "true",
- "Enables ARM v8.1 Virtual Host extension">;
+def FeatureCONTEXTIDREL2 : SubtargetFeature<"CONTEXTIDREL2", "HasCONTEXTIDREL2",
+ "true", "Enable RW operand CONTEXTIDR_EL2" >;
+
+def FeatureVH : SubtargetFeature<"vh", "HasVH", "true",
+ "Enables ARM v8.1 Virtual Host extension", [FeatureCONTEXTIDREL2] >;
def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
"Enable ARMv8 PMUv3 Performance Monitors extension">;
@@ -213,6 +218,10 @@ def FeatureArithmeticCbzFusion : SubtargetFeature<
"arith-cbz-fusion", "HasArithmeticCbzFusion", "true",
"CPU fuses arithmetic + cbz/cbnz operations">;
+def FeatureCmpBccFusion : SubtargetFeature<
+ "cmp-bcc-fusion", "HasCmpBccFusion", "true",
+ "CPU fuses cmp+bcc operations">;
+
def FeatureFuseAddress : SubtargetFeature<
"fuse-address", "HasFuseAddress", "true",
"CPU fuses address generation and memory operations">;
@@ -256,8 +265,8 @@ def FeatureDotProd : SubtargetFeature<
"dotprod", "HasDotProd", "true",
"Enable dot product support">;
-def FeaturePA : SubtargetFeature<
- "pa", "HasPA", "true",
+def FeaturePAuth : SubtargetFeature<
+ "pauth", "HasPAuth", "true",
"Enable v8.3-A Pointer Authentication extension">;
def FeatureJS : SubtargetFeature<
@@ -278,11 +287,6 @@ def FeatureNV : SubtargetFeature<
"nv", "HasNV", "true",
"Enable v8.4-A Nested Virtualization Enchancement">;
-def FeatureRASv8_4 : SubtargetFeature<
- "rasv8_4", "HasRASv8_4", "true",
- "Enable v8.4-A Reliability, Availability and Serviceability extension",
- [FeatureRAS]>;
-
def FeatureMPAM : SubtargetFeature<
"mpam", "HasMPAM", "true",
"Enable v8.4-A Memory system Partitioning and Monitoring extension">;
@@ -316,8 +320,8 @@ def FeatureTLB_RMI : SubtargetFeature<
"tlb-rmi", "HasTLB_RMI", "true",
"Enable v8.4-A TLB Range and Maintenance Instructions">;
-def FeatureFMI : SubtargetFeature<
- "fmi", "HasFMI", "true",
+def FeatureFlagM : SubtargetFeature<
+ "flagm", "HasFlagM", "true",
"Enable v8.4-A Flag Manipulation Instructions">;
// 8.4 RCPC enhancements: LDAPR & STLR instructions with Immediate Offset
@@ -400,6 +404,24 @@ def FeatureMatMulFP32 : SubtargetFeature<"f32mm", "HasMatMulFP32",
def FeatureMatMulFP64 : SubtargetFeature<"f64mm", "HasMatMulFP64",
"true", "Enable Matrix Multiply FP64 Extension", [FeatureSVE]>;
+def FeatureXS : SubtargetFeature<"xs", "HasXS",
+ "true", "Enable Armv8.7-A limited-TLB-maintenance instruction">;
+
+def FeatureWFxT : SubtargetFeature<"wfxt", "HasWFxT",
+ "true", "Enable Armv8.7-A WFET and WFIT instruction">;
+
+def FeatureHCX : SubtargetFeature<
+ "hcx", "HasHCX", "true", "Enable Armv8.7-A HCRX_EL2 system register">;
+
+def FeatureLS64 : SubtargetFeature<"ls64", "HasLS64",
+ "true", "Enable Armv8.7-A LD64B/ST64B Accelerator Extension">;
+
+def FeatureBRBE : SubtargetFeature<"brbe", "HasBRBE",
+ "true", "Enable Branch Record Buffer Extension">;
+
+def FeatureSPE_EEF : SubtargetFeature<"spe-eef", "HasSPE_EEF",
+ "true", "Enable extra register in the Statistical Profiling Extension">;
+
def FeatureFineGrainedTraps : SubtargetFeature<"fgt", "HasFineGrainedTraps",
"true", "Enable fine grained virtualization traps extension">;
@@ -420,14 +442,14 @@ def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
FeaturePAN_RWV, FeatureRAS, FeatureCCPP]>;
def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true",
- "Support ARM v8.3a instructions", [HasV8_2aOps, FeatureRCPC, FeaturePA,
+ "Support ARM v8.3a instructions", [HasV8_2aOps, FeatureRCPC, FeaturePAuth,
FeatureJS, FeatureCCIDX, FeatureComplxNum]>;
def HasV8_4aOps : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true",
"Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd,
- FeatureNV, FeatureRASv8_4, FeatureMPAM, FeatureDIT,
+ FeatureNV, FeatureMPAM, FeatureDIT,
FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeaturePMU, FeatureTLB_RMI,
- FeatureFMI, FeatureRCPC_IMMO]>;
+ FeatureFlagM, FeatureRCPC_IMMO]>;
def HasV8_5aOps : SubtargetFeature<
"v8.5a", "HasV8_5aOps", "true", "Support ARM v8.5a instructions",
@@ -437,10 +459,29 @@ def HasV8_5aOps : SubtargetFeature<
def HasV8_6aOps : SubtargetFeature<
"v8.6a", "HasV8_6aOps", "true", "Support ARM v8.6a instructions",
-
[HasV8_5aOps, FeatureAMVS, FeatureBF16, FeatureFineGrainedTraps,
FeatureEnhancedCounterVirtualization, FeatureMatMulInt8]>;
+def HasV8_7aOps : SubtargetFeature<
+ "v8.7a", "HasV8_7aOps", "true", "Support ARM v8.7a instructions",
+ [HasV8_6aOps, FeatureXS, FeatureWFxT, FeatureHCX]>;
+
+def HasV8_0rOps : SubtargetFeature<
+ "v8r", "HasV8_0rOps", "true", "Support ARM v8r instructions",
+ [//v8.1
+ FeatureCRC, FeaturePAN, FeatureRDM, FeatureLSE, FeatureCONTEXTIDREL2,
+ //v8.2
+ FeaturePerfMon, FeatureRAS, FeaturePsUAO, FeatureSM4,
+ FeatureSHA3, FeatureCCPP, FeatureFullFP16, FeaturePAN_RWV,
+ //v8.3
+ FeatureComplxNum, FeatureCCIDX, FeatureJS,
+ FeaturePAuth, FeatureRCPC,
+ //v8.4
+ FeatureDotProd, FeatureFP16FML, FeatureTRACEV8_4,
+ FeatureTLB_RMI, FeatureFlagM, FeatureDIT, FeatureSEL2, FeatureRCPC_IMMO,
+ //v8.5
+ FeatureSSBS, FeaturePredRes, FeatureSB, FeatureSpecRestrict]>;
+
//===----------------------------------------------------------------------===//
// Register File Description
//===----------------------------------------------------------------------===//
@@ -502,10 +543,11 @@ def SVEUnsupported : AArch64Unsupported {
}
def PAUnsupported : AArch64Unsupported {
- let F = [HasPA];
+ let F = [HasPAuth];
}
include "AArch64SchedA53.td"
+include "AArch64SchedA55.td"
include "AArch64SchedA57.td"
include "AArch64SchedCyclone.td"
include "AArch64SchedFalkor.td"
@@ -515,7 +557,9 @@ include "AArch64SchedExynosM4.td"
include "AArch64SchedExynosM5.td"
include "AArch64SchedThunderX.td"
include "AArch64SchedThunderX2T99.td"
+include "AArch64SchedA64FX.td"
include "AArch64SchedThunderX3T110.td"
+include "AArch64SchedTSV110.td"
def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
"Cortex-A35 ARM processors", [
@@ -575,6 +619,9 @@ def ProcA65 : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65",
FeatureDotProd,
FeatureFPARMv8,
FeatureFullFP16,
+ FeatureFuseAddress,
+ FeatureFuseAES,
+ FeatureFuseLiterals,
FeatureNEON,
FeatureRAS,
FeatureRCPC,
@@ -587,6 +634,7 @@ def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
FeatureCrypto,
FeatureFPARMv8,
FeatureFuseAES,
+ FeatureFuseLiterals,
FeatureNEON,
FeaturePerfMon
]>;
@@ -618,6 +666,7 @@ def ProcA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
"Cortex-A76 ARM processors", [
HasV8_2aOps,
FeatureFPARMv8,
+ FeatureFuseAES,
FeatureNEON,
FeatureRCPC,
FeatureCrypto,
@@ -629,7 +678,9 @@ def ProcA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
def ProcA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77",
"Cortex-A77 ARM processors", [
HasV8_2aOps,
+ FeatureCmpBccFusion,
FeatureFPARMv8,
+ FeatureFuseAES,
FeatureNEON, FeatureRCPC,
FeatureCrypto,
FeatureFullFP16,
@@ -640,6 +691,7 @@ def ProcA78 : SubtargetFeature<"cortex-a78", "ARMProcFamily",
"CortexA78",
"Cortex-A78 ARM processors", [
HasV8_2aOps,
+ FeatureCmpBccFusion,
FeatureCrypto,
FeatureFPARMv8,
FeatureFuseAES,
@@ -652,9 +704,39 @@ def ProcA78 : SubtargetFeature<"cortex-a78", "ARMProcFamily",
FeatureSSBS,
FeatureDotProd]>;
+def ProcA78C : SubtargetFeature<"cortex-a78c", "ARMProcFamily",
+ "CortexA78C",
+ "Cortex-A78C ARM processors", [
+ HasV8_2aOps,
+ FeatureCmpBccFusion,
+ FeatureCrypto,
+ FeatureDotProd,
+ FeatureFlagM,
+ FeatureFP16FML,
+ FeatureFPARMv8,
+ FeatureFullFP16,
+ FeatureFuseAES,
+ FeatureNEON,
+ FeaturePAuth,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeatureRCPC,
+ FeatureSPE,
+ FeatureSSBS]>;
+
+def ProcR82 : SubtargetFeature<"cortex-r82", "ARMProcFamily",
+ "CortexR82",
+ "Cortex-R82 ARM Processors", [
+ FeaturePostRAScheduler,
+ // TODO: crypto and FuseAES
+ // All other features are implied by v8_0r ops:
+ HasV8_0rOps,
+ ]>;
+
def ProcX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1",
"Cortex-X1 ARM processors", [
HasV8_2aOps,
+ FeatureCmpBccFusion,
FeatureCrypto,
FeatureFPARMv8,
FeatureFuseAES,
@@ -676,7 +758,10 @@ def ProcA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX",
FeatureFullFP16,
FeatureSVE,
FeaturePostRAScheduler,
- FeatureComplxNum
+ FeatureComplxNum,
+ FeatureAggressiveFMA,
+ FeatureArithmeticBccFusion,
+ FeaturePredictableSelectIsExpensive
]>;
def ProcCarmel : SubtargetFeature<"carmel", "ARMProcFamily", "Carmel",
@@ -783,6 +868,38 @@ def ProcAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
HasV8_4aOps
]>;
+def ProcAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
+ "Apple A14", [
+ FeatureAggressiveFMA,
+ FeatureAlternateSExtLoadCVTF32Pattern,
+ FeatureAltFPCmp,
+ FeatureArithmeticBccFusion,
+ FeatureArithmeticCbzFusion,
+ FeatureCrypto,
+ FeatureDisableLatencySchedHeuristic,
+ FeatureFPARMv8,
+ FeatureFRInt3264,
+ FeatureFuseAddress,
+ FeatureFuseAES,
+ FeatureFuseArithmeticLogic,
+ FeatureFuseCCSelect,
+ FeatureFuseCryptoEOR,
+ FeatureFuseLiterals,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeatureSpecRestrict,
+ FeatureSSBS,
+ FeatureSB,
+ FeaturePredRes,
+ FeatureCacheDeepPersist,
+ FeatureZCRegMove,
+ FeatureZCZeroing,
+ FeatureFullFP16,
+ FeatureFP16FML,
+ FeatureSHA3,
+ HasV8_4aOps
+ ]>;
+
def ProcExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
"Samsung Exynos-M3 processors",
[FeatureCRC,
@@ -876,6 +993,38 @@ def ProcNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily",
FeatureSSBS,
]>;
+def ProcNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily",
+ "NeoverseN2",
+ "Neoverse N2 ARM processors", [
+ HasV8_5aOps,
+ FeatureBF16,
+ FeatureETE,
+ FeatureMatMulInt8,
+ FeatureMTE,
+ FeatureSVE2,
+ FeatureSVE2BitPerm,
+ FeatureTRBE]>;
+
+def ProcNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily",
+ "NeoverseV1",
+ "Neoverse V1 ARM processors", [
+ HasV8_4aOps,
+ FeatureBF16,
+ FeatureCacheDeepPersist,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureFP16FML,
+ FeatureFullFP16,
+ FeatureFuseAES,
+ FeatureMatMulInt8,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeatureRandGen,
+ FeatureSPE,
+ FeatureSSBS,
+ FeatureSVE]>;
+
def ProcSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
"Qualcomm Saphira processors", [
FeatureCrypto,
@@ -916,7 +1065,7 @@ def ProcThunderX3T110 : SubtargetFeature<"thunderx3t110", "ARMProcFamily",
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureLSE,
- FeaturePA,
+ FeaturePAuth,
FeatureUseAA,
FeatureBalanceFPOps,
FeaturePerfMon,
@@ -998,7 +1147,7 @@ def : ProcessorModel<"generic", NoSchedModel, [
def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
def : ProcessorModel<"cortex-a34", CortexA53Model, [ProcA35]>;
def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
-def : ProcessorModel<"cortex-a55", CortexA53Model, [ProcA55]>;
+def : ProcessorModel<"cortex-a55", CortexA55Model, [ProcA55]>;
def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
def : ProcessorModel<"cortex-a65", CortexA53Model, [ProcA65]>;
def : ProcessorModel<"cortex-a65ae", CortexA53Model, [ProcA65]>;
@@ -1009,9 +1158,13 @@ def : ProcessorModel<"cortex-a76", CortexA57Model, [ProcA76]>;
def : ProcessorModel<"cortex-a76ae", CortexA57Model, [ProcA76]>;
def : ProcessorModel<"cortex-a77", CortexA57Model, [ProcA77]>;
def : ProcessorModel<"cortex-a78", CortexA57Model, [ProcA78]>;
+def : ProcessorModel<"cortex-a78c", CortexA57Model, [ProcA78C]>;
+def : ProcessorModel<"cortex-r82", CortexA55Model, [ProcR82]>;
def : ProcessorModel<"cortex-x1", CortexA57Model, [ProcX1]>;
def : ProcessorModel<"neoverse-e1", CortexA53Model, [ProcNeoverseE1]>;
def : ProcessorModel<"neoverse-n1", CortexA57Model, [ProcNeoverseN1]>;
+def : ProcessorModel<"neoverse-n2", CortexA57Model, [ProcNeoverseN2]>;
+def : ProcessorModel<"neoverse-v1", CortexA57Model, [ProcNeoverseV1]>;
def : ProcessorModel<"exynos-m3", ExynosM3Model, [ProcExynosM3]>;
def : ProcessorModel<"exynos-m4", ExynosM4Model, [ProcExynosM4]>;
def : ProcessorModel<"exynos-m5", ExynosM5Model, [ProcExynosM4]>;
@@ -1027,8 +1180,7 @@ def : ProcessorModel<"thunderxt83", ThunderXT8XModel, [ProcThunderXT83]>;
def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>;
// Marvell ThunderX3T110 Processors.
def : ProcessorModel<"thunderx3t110", ThunderX3T110Model, [ProcThunderX3T110]>;
-// FIXME: HiSilicon TSV110 is currently modeled as a Cortex-A57.
-def : ProcessorModel<"tsv110", CortexA57Model, [ProcTSV110]>;
+def : ProcessorModel<"tsv110", TSV110Model, [ProcTSV110]>;
// Support cyclone as an alias for apple-a7 so we can still LTO old bitcode.
def : ProcessorModel<"cyclone", CycloneModel, [ProcAppleA7]>;
@@ -1041,6 +1193,7 @@ def : ProcessorModel<"apple-a10", CycloneModel, [ProcAppleA10]>;
def : ProcessorModel<"apple-a11", CycloneModel, [ProcAppleA11]>;
def : ProcessorModel<"apple-a12", CycloneModel, [ProcAppleA12]>;
def : ProcessorModel<"apple-a13", CycloneModel, [ProcAppleA13]>;
+def : ProcessorModel<"apple-a14", CycloneModel, [ProcAppleA14]>;
// watch CPUs.
def : ProcessorModel<"apple-s4", CycloneModel, [ProcAppleA12]>;
@@ -1050,8 +1203,7 @@ def : ProcessorModel<"apple-s5", CycloneModel, [ProcAppleA12]>;
def : ProcessorModel<"apple-latest", CycloneModel, [ProcAppleA13]>;
// Fujitsu A64FX
-// FIXME: Scheduling model is not implemented yet.
-def : ProcessorModel<"a64fx", NoSchedModel, [ProcA64FX]>;
+def : ProcessorModel<"a64fx", A64FXModel, [ProcA64FX]>;
// Nvidia Carmel
def : ProcessorModel<"carmel", NoSchedModel, [ProcCarmel]>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
index 981b366c14b1..c996d2df8c38 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -123,7 +123,7 @@ static bool isFPR64(unsigned Reg, unsigned SubReg,
}
// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64
-// copy instruction. Return zero_reg if the instruction is not a copy.
+// copy instruction. Return nullptr if the instruction is not a copy.
static MachineOperand *getSrcFromCopy(MachineInstr *MI,
const MachineRegisterInfo *MRI,
unsigned &SubReg) {
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 7ec7ffe309f7..a0c5498ee620 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -32,6 +32,7 @@
#include "llvm/BinaryFormat/COFF.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/FaultMaps.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -54,6 +55,7 @@
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -69,12 +71,13 @@ namespace {
class AArch64AsmPrinter : public AsmPrinter {
AArch64MCInstLower MCInstLowering;
StackMaps SM;
+ FaultMaps FM;
const AArch64Subtarget *STI;
public:
AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
: AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this),
- SM(*this) {}
+ SM(*this), FM(*this) {}
StringRef getPassName() const override { return "AArch64 Assembly Printer"; }
@@ -86,17 +89,18 @@ public:
void emitStartOfAsmFile(Module &M) override;
void emitJumpTableInfo() override;
- void emitJumpTableEntry(const MachineJumpTableInfo *MJTI,
- const MachineBasicBlock *MBB, unsigned JTI);
void emitFunctionEntryLabel() override;
- void LowerJumpTableDestSmall(MCStreamer &OutStreamer, const MachineInstr &MI);
+ void LowerJumpTableDest(MCStreamer &OutStreamer, const MachineInstr &MI);
void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
const MachineInstr &MI);
void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
const MachineInstr &MI);
+ void LowerSTATEPOINT(MCStreamer &OutStreamer, StackMaps &SM,
+ const MachineInstr &MI);
+ void LowerFAULTING_OP(const MachineInstr &MI);
void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI);
void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
@@ -191,58 +195,24 @@ void AArch64AsmPrinter::emitStartOfAsmFile(Module &M) {
return;
// Assemble feature flags that may require creation of a note section.
- unsigned Flags = ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI |
- ELF::GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
-
- if (any_of(M, [](const Function &F) {
- return !F.isDeclaration() &&
- !F.hasFnAttribute("branch-target-enforcement");
- })) {
- Flags &= ~ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
- }
+ unsigned Flags = 0;
+ if (const auto *BTE = mdconst::extract_or_null<ConstantInt>(
+ M.getModuleFlag("branch-target-enforcement")))
+ if (BTE->getZExtValue())
+ Flags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
- if ((Flags & ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI) == 0 &&
- any_of(M, [](const Function &F) {
- return F.hasFnAttribute("branch-target-enforcement");
- })) {
- errs() << "warning: some functions compiled with BTI and some compiled "
- "without BTI\n"
- << "warning: not setting BTI in feature flags\n";
- }
-
- if (any_of(M, [](const Function &F) {
- if (F.isDeclaration())
- return false;
- Attribute A = F.getFnAttribute("sign-return-address");
- return !A.isStringAttribute() || A.getValueAsString() == "none";
- })) {
- Flags &= ~ELF::GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
- }
+ if (const auto *Sign = mdconst::extract_or_null<ConstantInt>(
+ M.getModuleFlag("sign-return-address")))
+ if (Sign->getZExtValue())
+ Flags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
if (Flags == 0)
return;
// Emit a .note.gnu.property section with the flags.
- MCSection *Cur = OutStreamer->getCurrentSectionOnly();
- MCSection *Nt = MMI->getContext().getELFSection(
- ".note.gnu.property", ELF::SHT_NOTE, ELF::SHF_ALLOC);
- OutStreamer->SwitchSection(Nt);
-
- // Emit the note header.
- emitAlignment(Align(8));
- OutStreamer->emitInt32(4); // data size for "GNU\0"
- OutStreamer->emitInt32(4 * 4); // Elf_Prop size
- OutStreamer->emitInt32(ELF::NT_GNU_PROPERTY_TYPE_0);
- OutStreamer->emitBytes(StringRef("GNU", 4)); // note name
-
- // Emit the PAC/BTI properties.
- OutStreamer->emitInt32(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND);
- OutStreamer->emitInt32(4); // data size
- OutStreamer->emitInt32(Flags); // data
- OutStreamer->emitInt32(0); // pad
-
- OutStreamer->endSection(Nt);
- OutStreamer->SwitchSection(Cur);
+ if (auto *TS = static_cast<AArch64TargetStreamer *>(
+ OutStreamer->getTargetStreamer()))
+ TS->emitNoteSection(Flags);
}
void AArch64AsmPrinter::emitFunctionHeaderComment() {
@@ -333,7 +303,7 @@ void AArch64AsmPrinter::LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI) {
std::string SymName = "__hwasan_check_x" + utostr(Reg - AArch64::X0) + "_" +
utostr(AccessInfo);
if (IsShort)
- SymName += "_short";
+ SymName += "_short_v2";
Sym = OutContext.getOrCreateSymbol(SymName);
}
@@ -350,6 +320,7 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
assert(TT.isOSBinFormatELF());
std::unique_ptr<MCSubtargetInfo> STI(
TM.getTarget().createMCSubtargetInfo(TT.str(), "", ""));
+ assert(STI && "Unable to create subtarget info");
MCSymbol *HwasanTagMismatchV1Sym =
OutContext.getOrCreateSymbol("__hwasan_tag_mismatch");
@@ -369,6 +340,15 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
IsShort ? HwasanTagMismatchV2Ref : HwasanTagMismatchV1Ref;
MCSymbol *Sym = P.second;
+ bool HasMatchAllTag =
+ (AccessInfo >> HWASanAccessInfo::HasMatchAllShift) & 1;
+ uint8_t MatchAllTag =
+ (AccessInfo >> HWASanAccessInfo::MatchAllShift) & 0xff;
+ unsigned Size =
+ 1 << ((AccessInfo >> HWASanAccessInfo::AccessSizeShift) & 0xf);
+ bool CompileKernel =
+ (AccessInfo >> HWASanAccessInfo::CompileKernelShift) & 1;
+
OutStreamer->SwitchSection(OutContext.getELFSection(
".text.hot", ELF::SHT_PROGBITS,
ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
@@ -379,19 +359,20 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
OutStreamer->emitSymbolAttribute(Sym, MCSA_Hidden);
OutStreamer->emitLabel(Sym);
- OutStreamer->emitInstruction(MCInstBuilder(AArch64::UBFMXri)
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::SBFMXri)
.addReg(AArch64::X16)
.addReg(Reg)
.addImm(4)
.addImm(55),
*STI);
- OutStreamer->emitInstruction(MCInstBuilder(AArch64::LDRBBroX)
- .addReg(AArch64::W16)
- .addReg(AArch64::X9)
- .addReg(AArch64::X16)
- .addImm(0)
- .addImm(0),
- *STI);
+ OutStreamer->emitInstruction(
+ MCInstBuilder(AArch64::LDRBBroX)
+ .addReg(AArch64::W16)
+ .addReg(IsShort ? AArch64::X20 : AArch64::X9)
+ .addReg(AArch64::X16)
+ .addImm(0)
+ .addImm(0),
+ *STI);
OutStreamer->emitInstruction(
MCInstBuilder(AArch64::SUBSXrs)
.addReg(AArch64::XZR)
@@ -412,6 +393,26 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
MCInstBuilder(AArch64::RET).addReg(AArch64::LR), *STI);
OutStreamer->emitLabel(HandleMismatchOrPartialSym);
+ if (HasMatchAllTag) {
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::UBFMXri)
+ .addReg(AArch64::X16)
+ .addReg(Reg)
+ .addImm(56)
+ .addImm(63),
+ *STI);
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::SUBSXri)
+ .addReg(AArch64::XZR)
+ .addReg(AArch64::X16)
+ .addImm(MatchAllTag)
+ .addImm(0),
+ *STI);
+ OutStreamer->emitInstruction(
+ MCInstBuilder(AArch64::Bcc)
+ .addImm(AArch64CC::EQ)
+ .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)),
+ *STI);
+ }
+
if (IsShort) {
OutStreamer->emitInstruction(MCInstBuilder(AArch64::SUBSWri)
.addReg(AArch64::WZR)
@@ -432,7 +433,6 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
.addReg(Reg)
.addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
*STI);
- unsigned Size = 1 << (AccessInfo & 0xf);
if (Size != 1)
OutStreamer->emitInstruction(MCInstBuilder(AArch64::ADDXri)
.addReg(AArch64::X17)
@@ -500,32 +500,41 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
.addReg(Reg)
.addImm(0),
*STI);
- OutStreamer->emitInstruction(MCInstBuilder(AArch64::MOVZXi)
- .addReg(AArch64::X1)
- .addImm(AccessInfo)
- .addImm(0),
- *STI);
-
- // Intentionally load the GOT entry and branch to it, rather than possibly
- // late binding the function, which may clobber the registers before we have
- // a chance to save them.
OutStreamer->emitInstruction(
- MCInstBuilder(AArch64::ADRP)
- .addReg(AArch64::X16)
- .addExpr(AArch64MCExpr::create(
- HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_PAGE,
- OutContext)),
+ MCInstBuilder(AArch64::MOVZXi)
+ .addReg(AArch64::X1)
+ .addImm(AccessInfo & HWASanAccessInfo::RuntimeMask)
+ .addImm(0),
*STI);
- OutStreamer->emitInstruction(
- MCInstBuilder(AArch64::LDRXui)
- .addReg(AArch64::X16)
- .addReg(AArch64::X16)
- .addExpr(AArch64MCExpr::create(
- HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_LO12,
- OutContext)),
- *STI);
- OutStreamer->emitInstruction(
- MCInstBuilder(AArch64::BR).addReg(AArch64::X16), *STI);
+
+ if (CompileKernel) {
+ // The Linux kernel's dynamic loader doesn't support GOT relative
+ // relocations, but it doesn't support late binding either, so just call
+ // the function directly.
+ OutStreamer->emitInstruction(
+ MCInstBuilder(AArch64::B).addExpr(HwasanTagMismatchRef), *STI);
+ } else {
+ // Intentionally load the GOT entry and branch to it, rather than possibly
+ // late binding the function, which may clobber the registers before we
+ // have a chance to save them.
+ OutStreamer->emitInstruction(
+ MCInstBuilder(AArch64::ADRP)
+ .addReg(AArch64::X16)
+ .addExpr(AArch64MCExpr::create(
+ HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_PAGE,
+ OutContext)),
+ *STI);
+ OutStreamer->emitInstruction(
+ MCInstBuilder(AArch64::LDRXui)
+ .addReg(AArch64::X16)
+ .addReg(AArch64::X16)
+ .addExpr(AArch64MCExpr::create(
+ HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_LO12,
+ OutContext)),
+ *STI);
+ OutStreamer->emitInstruction(
+ MCInstBuilder(AArch64::BR).addReg(AArch64::X16), *STI);
+ }
}
}
@@ -541,7 +550,11 @@ void AArch64AsmPrinter::emitEndOfAsmFile(Module &M) {
// generates code that does this, it is always safe to set.
OutStreamer->emitAssemblerFlag(MCAF_SubsectionsViaSymbols);
}
+
+ // Emit stack and fault map information.
emitStackMaps(SM);
+ FM.serializeToFaultMapSection();
+
}
void AArch64AsmPrinter::EmitLOHs() {
@@ -634,7 +647,8 @@ bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
const TargetRegisterInfo *RI = STI->getRegisterInfo();
Register Reg = MO.getReg();
unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg));
- assert(RI->regsOverlap(RegToPrint, Reg));
+ if (!RI->regsOverlap(RegToPrint, Reg))
+ return true;
O << AArch64InstPrinter::getRegisterName(RegToPrint, AltName);
return false;
}
@@ -795,33 +809,25 @@ void AArch64AsmPrinter::emitJumpTableInfo() {
emitAlignment(Align(Size));
OutStreamer->emitLabel(GetJTISymbol(JTI));
- for (auto *JTBB : JTBBs)
- emitJumpTableEntry(MJTI, JTBB, JTI);
- }
-}
+ const MCSymbol *BaseSym = AArch64FI->getJumpTableEntryPCRelSymbol(JTI);
+ const MCExpr *Base = MCSymbolRefExpr::create(BaseSym, OutContext);
-void AArch64AsmPrinter::emitJumpTableEntry(const MachineJumpTableInfo *MJTI,
- const MachineBasicBlock *MBB,
- unsigned JTI) {
- const MCExpr *Value = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
- auto AFI = MF->getInfo<AArch64FunctionInfo>();
- unsigned Size = AFI->getJumpTableEntrySize(JTI);
+ for (auto *JTBB : JTBBs) {
+ const MCExpr *Value =
+ MCSymbolRefExpr::create(JTBB->getSymbol(), OutContext);
- if (Size == 4) {
- // .word LBB - LJTI
- const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
- const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF, JTI, OutContext);
- Value = MCBinaryExpr::createSub(Value, Base, OutContext);
- } else {
- // .byte (LBB - LBB) >> 2 (or .hword)
- const MCSymbol *BaseSym = AFI->getJumpTableEntryPCRelSymbol(JTI);
- const MCExpr *Base = MCSymbolRefExpr::create(BaseSym, OutContext);
- Value = MCBinaryExpr::createSub(Value, Base, OutContext);
- Value = MCBinaryExpr::createLShr(
- Value, MCConstantExpr::create(2, OutContext), OutContext);
- }
+ // Each entry is:
+ // .byte/.hword (LBB - Lbase)>>2
+ // or plain:
+ // .word LBB - Lbase
+ Value = MCBinaryExpr::createSub(Value, Base, OutContext);
+ if (Size != 4)
+ Value = MCBinaryExpr::createLShr(
+ Value, MCConstantExpr::create(2, OutContext), OutContext);
- OutStreamer->emitValue(Value, Size);
+ OutStreamer->emitValue(Value, Size);
+ }
+ }
}
void AArch64AsmPrinter::emitFunctionEntryLabel() {
@@ -845,9 +851,9 @@ void AArch64AsmPrinter::emitFunctionEntryLabel() {
///
/// adr xDest, .LBB0_0
/// ldrb wScratch, [xTable, xEntry] (with "lsl #1" for ldrh).
-/// add xDest, xDest, xScratch, lsl #2
-void AArch64AsmPrinter::LowerJumpTableDestSmall(llvm::MCStreamer &OutStreamer,
- const llvm::MachineInstr &MI) {
+/// add xDest, xDest, xScratch (with "lsl #2" for smaller entries)
+void AArch64AsmPrinter::LowerJumpTableDest(llvm::MCStreamer &OutStreamer,
+ const llvm::MachineInstr &MI) {
Register DestReg = MI.getOperand(0).getReg();
Register ScratchReg = MI.getOperand(1).getReg();
Register ScratchRegW =
@@ -855,33 +861,50 @@ void AArch64AsmPrinter::LowerJumpTableDestSmall(llvm::MCStreamer &OutStreamer,
Register TableReg = MI.getOperand(2).getReg();
Register EntryReg = MI.getOperand(3).getReg();
int JTIdx = MI.getOperand(4).getIndex();
- bool IsByteEntry = MI.getOpcode() == AArch64::JumpTableDest8;
+ int Size = AArch64FI->getJumpTableEntrySize(JTIdx);
// This has to be first because the compression pass based its reachability
// calculations on the start of the JumpTableDest instruction.
auto Label =
MF->getInfo<AArch64FunctionInfo>()->getJumpTableEntryPCRelSymbol(JTIdx);
+
+ // If we don't already have a symbol to use as the base, use the ADR
+ // instruction itself.
+ if (!Label) {
+ Label = MF->getContext().createTempSymbol();
+ AArch64FI->setJumpTableEntryInfo(JTIdx, Size, Label);
+ OutStreamer.emitLabel(Label);
+ }
+
+ auto LabelExpr = MCSymbolRefExpr::create(Label, MF->getContext());
EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::ADR)
.addReg(DestReg)
- .addExpr(MCSymbolRefExpr::create(
- Label, MF->getContext())));
+ .addExpr(LabelExpr));
// Load the number of instruction-steps to offset from the label.
- unsigned LdrOpcode = IsByteEntry ? AArch64::LDRBBroX : AArch64::LDRHHroX;
+ unsigned LdrOpcode;
+ switch (Size) {
+ case 1: LdrOpcode = AArch64::LDRBBroX; break;
+ case 2: LdrOpcode = AArch64::LDRHHroX; break;
+ case 4: LdrOpcode = AArch64::LDRSWroX; break;
+ default:
+ llvm_unreachable("Unknown jump table size");
+ }
+
EmitToStreamer(OutStreamer, MCInstBuilder(LdrOpcode)
- .addReg(ScratchRegW)
+ .addReg(Size == 4 ? ScratchReg : ScratchRegW)
.addReg(TableReg)
.addReg(EntryReg)
.addImm(0)
- .addImm(IsByteEntry ? 0 : 1));
+ .addImm(Size == 1 ? 0 : 1));
- // Multiply the steps by 4 and add to the already materialized base label
- // address.
+ // Add to the already materialized base label address, multiplying by 4 if
+ // compressed.
EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::ADDXrs)
.addReg(DestReg)
.addReg(DestReg)
.addReg(ScratchReg)
- .addImm(2));
+ .addImm(Size == 4 ? 0 : 2));
}
void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
@@ -959,6 +982,83 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
}
+void AArch64AsmPrinter::LowerSTATEPOINT(MCStreamer &OutStreamer, StackMaps &SM,
+ const MachineInstr &MI) {
+ StatepointOpers SOpers(&MI);
+ if (unsigned PatchBytes = SOpers.getNumPatchBytes()) {
+ assert(PatchBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
+ for (unsigned i = 0; i < PatchBytes; i += 4)
+ EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
+ } else {
+ // Lower call target and choose correct opcode
+ const MachineOperand &CallTarget = SOpers.getCallTarget();
+ MCOperand CallTargetMCOp;
+ unsigned CallOpcode;
+ switch (CallTarget.getType()) {
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol:
+ MCInstLowering.lowerOperand(CallTarget, CallTargetMCOp);
+ CallOpcode = AArch64::BL;
+ break;
+ case MachineOperand::MO_Immediate:
+ CallTargetMCOp = MCOperand::createImm(CallTarget.getImm());
+ CallOpcode = AArch64::BL;
+ break;
+ case MachineOperand::MO_Register:
+ CallTargetMCOp = MCOperand::createReg(CallTarget.getReg());
+ CallOpcode = AArch64::BLR;
+ break;
+ default:
+ llvm_unreachable("Unsupported operand type in statepoint call target");
+ break;
+ }
+
+ EmitToStreamer(OutStreamer,
+ MCInstBuilder(CallOpcode).addOperand(CallTargetMCOp));
+ }
+
+ auto &Ctx = OutStreamer.getContext();
+ MCSymbol *MILabel = Ctx.createTempSymbol();
+ OutStreamer.emitLabel(MILabel);
+ SM.recordStatepoint(*MILabel, MI);
+}
+
+void AArch64AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI) {
+  // FAULTING_LOAD_OP <def>, <faulting type>, <MBB handler>,
+ // <opcode>, <operands>
+
+ Register DefRegister = FaultingMI.getOperand(0).getReg();
+ FaultMaps::FaultKind FK =
+ static_cast<FaultMaps::FaultKind>(FaultingMI.getOperand(1).getImm());
+ MCSymbol *HandlerLabel = FaultingMI.getOperand(2).getMBB()->getSymbol();
+ unsigned Opcode = FaultingMI.getOperand(3).getImm();
+ unsigned OperandsBeginIdx = 4;
+
+ auto &Ctx = OutStreamer->getContext();
+ MCSymbol *FaultingLabel = Ctx.createTempSymbol();
+ OutStreamer->emitLabel(FaultingLabel);
+
+ assert(FK < FaultMaps::FaultKindMax && "Invalid Faulting Kind!");
+ FM.recordFaultingOp(FK, FaultingLabel, HandlerLabel);
+
+ MCInst MI;
+ MI.setOpcode(Opcode);
+
+ if (DefRegister != (Register)0)
+ MI.addOperand(MCOperand::createReg(DefRegister));
+
+ for (auto I = FaultingMI.operands_begin() + OperandsBeginIdx,
+ E = FaultingMI.operands_end();
+ I != E; ++I) {
+ MCOperand Dest;
+ lowerOperand(*I, Dest);
+ MI.addOperand(Dest);
+ }
+
+ OutStreamer->AddComment("on-fault: " + HandlerLabel->getName());
+ OutStreamer->emitInstruction(MI, getSubtargetInfo());
+}
+
void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) {
Register DestReg = MI.getOperand(0).getReg();
if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround()) {
@@ -1172,17 +1272,28 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, Adrp);
MCInst Ldr;
- Ldr.setOpcode(AArch64::LDRXui);
- Ldr.addOperand(MCOperand::createReg(AArch64::X1));
+ if (STI->isTargetILP32()) {
+ Ldr.setOpcode(AArch64::LDRWui);
+ Ldr.addOperand(MCOperand::createReg(AArch64::W1));
+ } else {
+ Ldr.setOpcode(AArch64::LDRXui);
+ Ldr.addOperand(MCOperand::createReg(AArch64::X1));
+ }
Ldr.addOperand(MCOperand::createReg(AArch64::X0));
Ldr.addOperand(SymTLSDescLo12);
Ldr.addOperand(MCOperand::createImm(0));
EmitToStreamer(*OutStreamer, Ldr);
MCInst Add;
- Add.setOpcode(AArch64::ADDXri);
- Add.addOperand(MCOperand::createReg(AArch64::X0));
- Add.addOperand(MCOperand::createReg(AArch64::X0));
+ if (STI->isTargetILP32()) {
+ Add.setOpcode(AArch64::ADDWri);
+ Add.addOperand(MCOperand::createReg(AArch64::W0));
+ Add.addOperand(MCOperand::createReg(AArch64::W0));
+ } else {
+ Add.setOpcode(AArch64::ADDXri);
+ Add.addOperand(MCOperand::createReg(AArch64::X0));
+ Add.addOperand(MCOperand::createReg(AArch64::X0));
+ }
Add.addOperand(SymTLSDescLo12);
Add.addOperand(MCOperand::createImm(AArch64_AM::getShiftValue(0)));
EmitToStreamer(*OutStreamer, Add);
@@ -1202,30 +1313,10 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
return;
}
- case AArch64::JumpTableDest32: {
- // We want:
- // ldrsw xScratch, [xTable, xEntry, lsl #2]
- // add xDest, xTable, xScratch
- unsigned DestReg = MI->getOperand(0).getReg(),
- ScratchReg = MI->getOperand(1).getReg(),
- TableReg = MI->getOperand(2).getReg(),
- EntryReg = MI->getOperand(3).getReg();
- EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::LDRSWroX)
- .addReg(ScratchReg)
- .addReg(TableReg)
- .addReg(EntryReg)
- .addImm(0)
- .addImm(1));
- EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::ADDXrs)
- .addReg(DestReg)
- .addReg(TableReg)
- .addReg(ScratchReg)
- .addImm(0));
- return;
- }
+ case AArch64::JumpTableDest32:
case AArch64::JumpTableDest16:
case AArch64::JumpTableDest8:
- LowerJumpTableDestSmall(*OutStreamer, *MI);
+ LowerJumpTableDest(*OutStreamer, *MI);
return;
case AArch64::FMOVH0:
@@ -1240,6 +1331,12 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
case TargetOpcode::PATCHPOINT:
return LowerPATCHPOINT(*OutStreamer, SM, *MI);
+ case TargetOpcode::STATEPOINT:
+ return LowerSTATEPOINT(*OutStreamer, SM, *MI);
+
+ case TargetOpcode::FAULTING_OP:
+ return LowerFAULTING_OP(*MI);
+
case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
LowerPATCHABLE_FUNCTION_ENTER(*MI);
return;
@@ -1284,6 +1381,14 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
return;
case AArch64::SEH_SaveRegP:
+ if (MI->getOperand(1).getImm() == 30 && MI->getOperand(0).getImm() >= 19 &&
+ MI->getOperand(0).getImm() <= 28) {
+ assert((MI->getOperand(0).getImm() - 19) % 2 == 0 &&
+ "Register paired with LR must be odd");
+ TS->EmitARM64WinCFISaveLRPair(MI->getOperand(0).getImm(),
+ MI->getOperand(2).getImm());
+ return;
+ }
assert((MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1) &&
"Non-consecutive registers not allowed for save_regp");
TS->EmitARM64WinCFISaveRegP(MI->getOperand(0).getImm(),
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
index 1956014b738d..d3b5166585c3 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
@@ -16,6 +16,7 @@
//
//===----------------------------------------------------------------------===//
+#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -57,13 +58,13 @@ FunctionPass *llvm::createAArch64BranchTargetsPass() {
}
bool AArch64BranchTargets::runOnMachineFunction(MachineFunction &MF) {
- const Function &F = MF.getFunction();
- if (!F.hasFnAttribute("branch-target-enforcement"))
+ if (!MF.getInfo<AArch64FunctionInfo>()->branchTargetEnforcement())
return false;
LLVM_DEBUG(
dbgs() << "********** AArch64 Branch Targets **********\n"
<< "********** Function: " << MF.getName() << '\n');
+ const Function &F = MF.getFunction();
// LLVM does not consider basic blocks which are the targets of jump tables
// to be address-taken (the address can't escape anywhere else), but they are
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
index 9ae2b465e247..c51dd48cab34 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
@@ -42,6 +42,51 @@ static const MCPhysReg ZRegList[] = {AArch64::Z0, AArch64::Z1, AArch64::Z2,
static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
CCState &State, Align SlotAlign) {
+ if (LocVT.isScalableVector()) {
+ const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>(
+ State.getMachineFunction().getSubtarget());
+ const AArch64TargetLowering *TLI = Subtarget.getTargetLowering();
+
+ // We are about to reinvoke the CCAssignFn auto-generated handler. If we
+ // don't unset these flags we will get stuck in an infinite loop forever
+ // invoking the custom handler.
+ ArgFlags.setInConsecutiveRegs(false);
+ ArgFlags.setInConsecutiveRegsLast(false);
+
+ // The calling convention for passing SVE tuples states that in the event
+ // we cannot allocate enough registers for the tuple we should still leave
+ // any remaining registers unallocated. However, when we call the
+ // CCAssignFn again we want it to behave as if all remaining registers are
+ // allocated. This will force the code to pass the tuple indirectly in
+ // accordance with the PCS.
+ bool RegsAllocated[8];
+ for (int I = 0; I < 8; I++) {
+ RegsAllocated[I] = State.isAllocated(ZRegList[I]);
+ State.AllocateReg(ZRegList[I]);
+ }
+
+ auto &It = PendingMembers[0];
+ CCAssignFn *AssignFn =
+ TLI->CCAssignFnForCall(State.getCallingConv(), /*IsVarArg=*/false);
+ if (AssignFn(It.getValNo(), It.getValVT(), It.getValVT(), CCValAssign::Full,
+ ArgFlags, State))
+ llvm_unreachable("Call operand has unhandled type");
+
+ // Return the flags to how they were before.
+ ArgFlags.setInConsecutiveRegs(true);
+ ArgFlags.setInConsecutiveRegsLast(true);
+
+ // Return the register state back to how it was before, leaving any
+ // unallocated registers available for other smaller types.
+ for (int I = 0; I < 8; I++)
+ if (!RegsAllocated[I])
+ State.DeallocateReg(ZRegList[I]);
+
+ // All pending members have now been allocated
+ PendingMembers.clear();
+ return true;
+ }
+
unsigned Size = LocVT.getSizeInBits() / 8;
const Align StackAlign =
State.getMachineFunction().getDataLayout().getStackAlignment();
@@ -146,13 +191,11 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return true;
}
- if (LocVT.isScalableVector())
- report_fatal_error(
- "Passing consecutive scalable vector registers unsupported");
-
- // Mark all regs in the class as unavailable
- for (auto Reg : RegList)
- State.AllocateReg(Reg);
+ if (!LocVT.isScalableVector()) {
+ // Mark all regs in the class as unavailable
+ for (auto Reg : RegList)
+ State.AllocateReg(Reg);
+ }
const Align SlotAlign = Subtarget.isTargetDarwin() ? Align(1) : Align(8);
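The SVE-tuple handling added above hinges on a small bookkeeping trick: temporarily mark every Z register as allocated, re-run the auto-generated CCAssignFn so the tuple is forced onto the indirect path, then release the registers that were not really taken. Here is a toy model of that pattern, with a plain bool array standing in for CCState and a callback for the re-invoked assignment function; the flag toggling around the call is omitted, and assignSVETupleIndirectly is an invented name.

#include <array>
#include <functional>

constexpr int NumZRegs = 8;

void assignSVETupleIndirectly(std::array<bool, NumZRegs> &Allocated,
                              const std::function<void()> &ReinvokeAssignFn) {
  // Remember which Z registers were genuinely taken before this argument.
  std::array<bool, NumZRegs> WasAllocated = Allocated;

  // Pretend every Z register is taken so the assignment function cannot
  // allocate a partial tuple and is forced down the indirect (stack) path.
  Allocated.fill(true);
  ReinvokeAssignFn();

  // Hand back the registers that were actually free, so later, smaller
  // arguments can still use them.
  Allocated = WasAllocated;
}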
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64Combine.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64Combine.td
index aa41cae289e8..b1e714653f46 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -19,7 +19,6 @@ def fconstant_to_constant : GICombineRule<
def AArch64PreLegalizerCombinerHelper: GICombinerHelper<
"AArch64GenPreLegalizerCombinerHelper", [all_combines,
- elide_br_by_inverting_cond,
fconstant_to_constant]> {
let DisableRuleOption = "aarch64prelegalizercombiner-disable-rule";
let StateClass = "AArch64PreLegalizerCombinerHelperState";
@@ -76,9 +75,68 @@ def ext: GICombineRule <
// instruction.
def shuffle_vector_pseudos : GICombineGroup<[dup, rev, ext, zip, uzp, trn]>;
+def vashr_vlshr_imm_matchdata : GIDefMatchData<"int64_t">;
+def vashr_vlshr_imm : GICombineRule<
+ (defs root:$root, vashr_vlshr_imm_matchdata:$matchinfo),
+ (match (wip_match_opcode G_ASHR, G_LSHR):$root,
+ [{ return matchVAshrLshrImm(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ applyVAshrLshrImm(*${root}, MRI, ${matchinfo}); }])
+>;
+
+def form_duplane_matchdata :
+ GIDefMatchData<"std::pair<unsigned, int>">;
+def form_duplane : GICombineRule <
+ (defs root:$root, form_duplane_matchdata:$matchinfo),
+ (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
+ [{ return matchDupLane(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ applyDupLane(*${root}, MRI, B, ${matchinfo}); }])
+>;
+
+def adjust_icmp_imm_matchdata :
+ GIDefMatchData<"std::pair<uint64_t, CmpInst::Predicate>">;
+def adjust_icmp_imm : GICombineRule <
+ (defs root:$root, adjust_icmp_imm_matchdata:$matchinfo),
+ (match (wip_match_opcode G_ICMP):$root,
+ [{ return matchAdjustICmpImmAndPred(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ applyAdjustICmpImmAndPred(*${root}, ${matchinfo}, B, Observer); }])
+>;
+
+def icmp_lowering : GICombineGroup<[adjust_icmp_imm]>;
+
+def extractvecelt_pairwise_add_matchdata : GIDefMatchData<"std::tuple<unsigned, LLT, Register>">;
+def extractvecelt_pairwise_add : GICombineRule<
+ (defs root:$root, extractvecelt_pairwise_add_matchdata:$matchinfo),
+ (match (wip_match_opcode G_EXTRACT_VECTOR_ELT):$root,
+ [{ return matchExtractVecEltPairwiseAdd(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ applyExtractVecEltPairwiseAdd(*${root}, MRI, B, ${matchinfo}); }])
+>;
+
+def mul_const_matchdata : GIDefMatchData<"std::function<void(MachineIRBuilder&, Register)>">;
+def mul_const : GICombineRule<
+ (defs root:$root, mul_const_matchdata:$matchinfo),
+ (match (wip_match_opcode G_MUL):$root,
+ [{ return matchAArch64MulConstCombine(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ applyAArch64MulConstCombine(*${root}, MRI, B, ${matchinfo}); }])
+>;
+
+// Post-legalization combines which should happen at all optimization levels
+// (e.g. ones that facilitate matching for the selector), such as forming
+// target pseudos.
+def AArch64PostLegalizerLoweringHelper
+ : GICombinerHelper<"AArch64GenPostLegalizerLoweringHelper",
+ [shuffle_vector_pseudos, vashr_vlshr_imm,
+ icmp_lowering, form_duplane]> {
+ let DisableRuleOption = "aarch64postlegalizerlowering-disable-rule";
+}
+
+// Post-legalization combines which are primarily optimizations.
def AArch64PostLegalizerCombinerHelper
: GICombinerHelper<"AArch64GenPostLegalizerCombinerHelper",
- [erase_undef_store, combines_for_extload,
- sext_already_extended, shuffle_vector_pseudos]> {
+ [copy_prop, erase_undef_store, combines_for_extload,
+ sext_trunc_sextload,
+ hoist_logic_op_with_same_opcode_hands,
+ redundant_and, xor_of_and_with_same_reg,
+ extractvecelt_pairwise_add, redundant_or,
+ mul_const]> {
let DisableRuleOption = "aarch64postlegalizercombiner-disable-rule";
}
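Each of the new rules above pairs a match* function that records data with an apply* function that performs the rewrite. As one concrete flavour of what adjust_icmp_imm aims at, here is a hedged standalone sketch of turning an unencodable compare immediate into an equivalent one that fits AArch64's 12-bit form. The predicate/immediate rule shown covers a single case only and is illustrative; it is not the full matchAdjustICmpImmAndPred logic.

#include <cstdint>
#include <optional>
#include <utility>

// Legal AArch64 compare immediates are 12 bits, optionally shifted left by 12.
bool isLegalCmpImm(uint64_t C) {
  return (C & ~0xfffULL) == 0 || (C & ~0xfff000ULL) == 0;
}

enum class Pred { SLT, SLE };

// If the immediate does not fit, try an equivalent predicate whose immediate
// does, e.g. "x < C" is the same as "x <= C - 1" (as long as C - 1 does not
// wrap). Only the signed-less-than case is modelled here.
std::optional<std::pair<uint64_t, Pred>> adjustICmpImm(uint64_t C, Pred P) {
  if (isLegalCmpImm(C))
    return std::nullopt; // already selectable, nothing to do
  if (P == Pred::SLT && C != 0 && isLegalCmpImm(C - 1))
    return std::make_pair(C - 1, Pred::SLE);
  return std::nullopt; // other predicates are handled analogously
}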
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp
index 57dc8a4061f1..2328a8b4deb8 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp
@@ -37,8 +37,13 @@ class AArch64CompressJumpTables : public MachineFunctionPass {
MachineFunction *MF;
SmallVector<int, 8> BlockInfo;
- int computeBlockSize(MachineBasicBlock &MBB);
- void scanFunction();
+ /// Returns the size in bytes of the block \p MBB, or None if we
+ /// couldn't get a safe upper bound.
+ Optional<int> computeBlockSize(MachineBasicBlock &MBB);
+
+ /// Gather information about the function, returning false if we can't perform
+ /// this optimization for some reason.
+ bool scanFunction();
bool compressJumpTable(MachineInstr &MI, int Offset);
@@ -59,19 +64,27 @@ public:
}
};
char AArch64CompressJumpTables::ID = 0;
-}
+} // namespace
INITIALIZE_PASS(AArch64CompressJumpTables, DEBUG_TYPE,
"AArch64 compress jump tables pass", false, false)
-int AArch64CompressJumpTables::computeBlockSize(MachineBasicBlock &MBB) {
+Optional<int>
+AArch64CompressJumpTables::computeBlockSize(MachineBasicBlock &MBB) {
int Size = 0;
- for (const MachineInstr &MI : MBB)
+ for (const MachineInstr &MI : MBB) {
+ // Inline asm may contain some directives like .byte which we don't
+ // currently have the ability to parse accurately. To be safe, just avoid
+ // computing a size and bail out.
+ if (MI.getOpcode() == AArch64::INLINEASM ||
+ MI.getOpcode() == AArch64::INLINEASM_BR)
+ return None;
Size += TII->getInstSizeInBytes(MI);
+ }
return Size;
}
-void AArch64CompressJumpTables::scanFunction() {
+bool AArch64CompressJumpTables::scanFunction() {
BlockInfo.clear();
BlockInfo.resize(MF->getNumBlockIDs());
@@ -84,8 +97,12 @@ void AArch64CompressJumpTables::scanFunction() {
else
AlignedOffset = alignTo(Offset, Alignment);
BlockInfo[MBB.getNumber()] = AlignedOffset;
- Offset = AlignedOffset + computeBlockSize(MBB);
+ auto BlockSize = computeBlockSize(MBB);
+ if (!BlockSize)
+ return false;
+ Offset = AlignedOffset + *BlockSize;
}
+ return true;
}
bool AArch64CompressJumpTables::compressJumpTable(MachineInstr &MI,
@@ -104,7 +121,7 @@ bool AArch64CompressJumpTables::compressJumpTable(MachineInstr &MI,
int MaxOffset = std::numeric_limits<int>::min(),
MinOffset = std::numeric_limits<int>::max();
MachineBasicBlock *MinBlock = nullptr;
- for (auto Block : JT.MBBs) {
+ for (auto *Block : JT.MBBs) {
int BlockOffset = BlockInfo[Block->getNumber()];
assert(BlockOffset % 4 == 0 && "misaligned basic block");
@@ -124,13 +141,14 @@ bool AArch64CompressJumpTables::compressJumpTable(MachineInstr &MI,
}
int Span = MaxOffset - MinOffset;
- auto AFI = MF->getInfo<AArch64FunctionInfo>();
+ auto *AFI = MF->getInfo<AArch64FunctionInfo>();
if (isUInt<8>(Span / 4)) {
AFI->setJumpTableEntryInfo(JTIdx, 1, MinBlock->getSymbol());
MI.setDesc(TII->get(AArch64::JumpTableDest8));
++NumJT8;
return true;
- } else if (isUInt<16>(Span / 4)) {
+ }
+ if (isUInt<16>(Span / 4)) {
AFI->setJumpTableEntryInfo(JTIdx, 2, MinBlock->getSymbol());
MI.setDesc(TII->get(AArch64::JumpTableDest16));
++NumJT16;
@@ -151,7 +169,8 @@ bool AArch64CompressJumpTables::runOnMachineFunction(MachineFunction &MFIn) {
if (ST.force32BitJumpTables() && !MF->getFunction().hasMinSize())
return false;
- scanFunction();
+ if (!scanFunction())
+ return false;
for (MachineBasicBlock &MBB : *MF) {
int Offset = BlockInfo[MBB.getNumber()];
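The compression decision above boils down to the span of block offsets reachable from the jump table. A standalone sketch of that arithmetic follows; jumpTableEntryBytes is an invented helper, and it assumes at least one target block.

#include <algorithm>
#include <limits>
#include <vector>

// Entry-size selection modelled after compressJumpTable above: entries hold
// (BlockOffset - MinOffset) / 4, so only the span between the nearest and
// furthest target block matters. Offsets are byte offsets of basic blocks
// from the start of the function, as collected by scanFunction.
unsigned jumpTableEntryBytes(const std::vector<int> &BlockOffsets) {
  int Min = std::numeric_limits<int>::max();
  int Max = std::numeric_limits<int>::min();
  for (int Off : BlockOffsets) {
    Min = std::min(Min, Off);
    Max = std::max(Max, Off);
  }
  unsigned Span = unsigned(Max - Min) / 4; // AArch64 instructions are 4 bytes
  if (Span <= 0xff)
    return 1; // JumpTableDest8
  if (Span <= 0xffff)
    return 2; // JumpTableDest16
  return 4;   // fall back to the full-width jump table
}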
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 9e65ad2e18f9..e57650ae60b1 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -83,6 +83,8 @@ private:
bool expandSVESpillFill(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, unsigned Opc,
unsigned N);
+ bool expandCALL_RVMARKER(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI);
};
} // end anonymous namespace
@@ -627,6 +629,46 @@ bool AArch64ExpandPseudo::expandSVESpillFill(MachineBasicBlock &MBB,
return true;
}
+bool AArch64ExpandPseudo::expandCALL_RVMARKER(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
+ // Expand CALL_RVMARKER pseudo to a branch, followed by the special `mov x29,
+ // x29` marker. Mark the sequence as a bundle, to avoid passes moving other code
+ // in between.
+ MachineInstr &MI = *MBBI;
+
+ MachineInstr *OriginalCall;
+ MachineOperand &CallTarget = MI.getOperand(0);
+ assert((CallTarget.isGlobal() || CallTarget.isReg()) &&
+ "invalid operand for regular call");
+ unsigned Opc = CallTarget.isGlobal() ? AArch64::BL : AArch64::BLR;
+ OriginalCall = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)).getInstr();
+ OriginalCall->addOperand(CallTarget);
+
+ unsigned RegMaskStartIdx = 1;
+ // Skip register arguments. Those are added during ISel, but are not
+ // needed for the concrete branch.
+ while (!MI.getOperand(RegMaskStartIdx).isRegMask()) {
+ assert(MI.getOperand(RegMaskStartIdx).isReg() &&
+ "should only skip register operands");
+ RegMaskStartIdx++;
+ }
+ for (; RegMaskStartIdx < MI.getNumOperands(); ++RegMaskStartIdx)
+ OriginalCall->addOperand(MI.getOperand(RegMaskStartIdx));
+
+ auto *Marker = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXrs))
+ .addReg(AArch64::FP, RegState::Define)
+ .addReg(AArch64::XZR)
+ .addReg(AArch64::FP)
+ .addImm(0)
+ .getInstr();
+ if (MI.shouldUpdateCallSiteInfo())
+ MBB.getParent()->moveCallSiteInfo(&MI, Marker);
+ MI.eraseFromParent();
+ finalizeBundle(MBB, OriginalCall->getIterator(),
+ std::next(Marker->getIterator()));
+ return true;
+}
+
/// If MBBI references a pseudo instruction that should be expanded here,
/// do the expansion and return true. Otherwise return false.
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
@@ -1014,6 +1056,8 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 3);
case AArch64::LDR_ZZXI:
return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2);
+ case AArch64::BLR_RVMARKER:
+ return expandCALL_RVMARKER(MBB, MBBI);
}
return false;
}
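A rough picture of what the CALL_RVMARKER expansion above produces, reduced to text: a normal call, then the `mov x29, x29` marker used by the Objective-C ARC autorelease-return optimization, kept adjacent by bundling. The helper below is purely illustrative and only renders the sequence as strings; the real code builds MachineInstrs.

#include <string>
#include <vector>

std::vector<std::string> expandCallRVMarker(const std::string &Target,
                                            bool TargetIsRegister) {
  std::vector<std::string> Seq;
  // Direct calls use BL with a symbol, indirect calls use BLR with a register.
  Seq.push_back((TargetIsRegister ? "blr " : "bl ") + Target);
  // The marker instruction, emitted as ORRXrs fp, xzr, fp, lsl #0.
  Seq.push_back("mov x29, x29");
  return Seq;
}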
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
index 538863ebe95a..209f9f7255a5 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
@@ -54,7 +54,7 @@
using namespace llvm;
-#define DEBUG_TYPE "falkor-hwpf-fix"
+#define DEBUG_TYPE "aarch64-falkor-hwpf-fix"
STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked");
STATISTIC(NumCollisionsAvoided,
@@ -146,7 +146,7 @@ bool FalkorMarkStridedAccesses::run() {
bool FalkorMarkStridedAccesses::runOnLoop(Loop &L) {
// Only mark strided loads in the inner-most loop
- if (!L.empty())
+ if (!L.isInnermost())
return false;
bool MadeChange = false;
@@ -224,10 +224,10 @@ struct LoadInfo {
char FalkorHWPFFix::ID = 0;
-INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "falkor-hwpf-fix-late",
+INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "aarch64-falkor-hwpf-fix-late",
"Falkor HW Prefetch Fix Late Phase", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_END(FalkorHWPFFix, "falkor-hwpf-fix-late",
+INITIALIZE_PASS_END(FalkorHWPFFix, "aarch64-falkor-hwpf-fix-late",
"Falkor HW Prefetch Fix Late Phase", false, false)
static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) {
@@ -830,7 +830,7 @@ bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) {
for (MachineLoop *I : LI)
for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L)
// Only process inner-loops
- if (L->empty())
+ if (L->isInnermost())
runOnLoop(**L, Fn);
return Modified;
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index 0f63f4ca62e5..9801036653f7 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -3409,8 +3409,7 @@ bool AArch64FastISel::foldXALUIntrinsic(AArch64CC::CondCode &CC,
const Value *RHS = II->getArgOperand(1);
// Canonicalize immediate to the RHS.
- if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
- isCommutativeIntrinsic(II))
+ if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && II->isCommutative())
std::swap(LHS, RHS);
// Simplify multiplies.
@@ -3652,14 +3651,10 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK))
.addImm(1);
return true;
- case Intrinsic::debugtrap: {
- if (Subtarget->isTargetWindows()) {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK))
- .addImm(0xF000);
- return true;
- }
- break;
- }
+ case Intrinsic::debugtrap:
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK))
+ .addImm(0xF000);
+ return true;
case Intrinsic::sqrt: {
Type *RetTy = II->getCalledFunction()->getReturnType();
@@ -3701,8 +3696,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
const Value *LHS = II->getArgOperand(0);
const Value *RHS = II->getArgOperand(1);
// Canonicalize immediate to the RHS.
- if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
- isCommutativeIntrinsic(II))
+ if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && II->isCommutative())
std::swap(LHS, RHS);
// Simplify multiplies.
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index c6cc6e9e8471..65ee5016042c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -116,7 +116,6 @@
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterInfo.h"
-#include "AArch64StackOffset.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
@@ -176,6 +175,10 @@ static cl::opt<bool> StackTaggingMergeSetTag(
cl::desc("merge settag instruction in function epilog"), cl::init(true),
cl::Hidden);
+static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
+ cl::desc("sort stack allocations"),
+ cl::init(true), cl::Hidden);
+
STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
/// Returns the argument pop size.
@@ -246,7 +249,7 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
TargetStackID::Value
AArch64FrameLowering::getStackIDForScalableVectors() const {
- return TargetStackID::SVEVector;
+ return TargetStackID::ScalableVector;
}
/// Returns the size of the fixed object area (allocated next to sp on entry)
@@ -270,7 +273,7 @@ static unsigned getFixedObjectSize(const MachineFunction &MF,
/// Returns the size of the entire SVE stackframe (calleesaves + spills).
static StackOffset getSVEStackSize(const MachineFunction &MF) {
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- return {(int64_t)AFI->getStackSizeSVE(), MVT::nxv1i8};
+ return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE());
}
bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
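Much of the churn in this file replaces the old `{Bytes, MVT::i8}` / `{Bytes, MVT::nxv1i8}` StackOffset constructors with the named StackOffset::getFixed / StackOffset::getScalable factories. A toy approximation of what such an offset carries is sketched below; ToyStackOffset is not the real class, just a model of its shape. The useful property is that the fixed and scalable components never mix, so they can be lowered separately (the scalable part ends up multiplied by the SVE vector-length multiple).

#include <cstdint>

struct ToyStackOffset {
  int64_t Fixed = 0;    // ordinary bytes
  int64_t Scalable = 0; // bytes that scale with the SVE vector length

  static ToyStackOffset getFixed(int64_t Bytes) { return {Bytes, 0}; }
  static ToyStackOffset getScalable(int64_t Bytes) { return {0, Bytes}; }
  static ToyStackOffset get(int64_t F, int64_t S) { return {F, S}; }

  ToyStackOffset operator+(const ToyStackOffset &O) const {
    return {Fixed + O.Fixed, Scalable + O.Scalable};
  }
  ToyStackOffset operator-(const ToyStackOffset &O) const {
    return {Fixed - O.Fixed, Scalable - O.Scalable};
  }
};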
@@ -362,44 +365,19 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
// Most call frames will be allocated at the start of a function so
// this is OK, but it is a limitation that needs dealing with.
assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
- emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, {Amount, MVT::i8},
- TII);
+ emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
+ StackOffset::getFixed(Amount), TII);
}
} else if (CalleePopAmount != 0) {
// If the calling convention demands that the callee pops arguments from the
// stack, we want to add it back if we have a reserved call frame.
assert(CalleePopAmount < 0xffffff && "call frame too large");
emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
- {-(int64_t)CalleePopAmount, MVT::i8}, TII);
+ StackOffset::getFixed(-(int64_t)CalleePopAmount), TII);
}
return MBB.erase(I);
}
-static bool ShouldSignReturnAddress(MachineFunction &MF) {
- // The function should be signed in the following situations:
- // - sign-return-address=all
- // - sign-return-address=non-leaf and the functions spills the LR
-
- const Function &F = MF.getFunction();
- if (!F.hasFnAttribute("sign-return-address"))
- return false;
-
- StringRef Scope = F.getFnAttribute("sign-return-address").getValueAsString();
- if (Scope.equals("none"))
- return false;
-
- if (Scope.equals("all"))
- return true;
-
- assert(Scope.equals("non-leaf") && "Expected all, none or non-leaf");
-
- for (const auto &Info : MF.getFrameInfo().getCalleeSavedInfo())
- if (Info.getReg() == AArch64::LR)
- return true;
-
- return false;
-}
-
// Convenience function to create a DWARF expression for
// Expr + NumBytes + NumVGScaledBytes * AArch64::VG
static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr,
@@ -435,7 +413,8 @@ static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr,
MCCFIInstruction AArch64FrameLowering::createDefCFAExpressionFromSP(
const TargetRegisterInfo &TRI, const StackOffset &OffsetFromSP) const {
int64_t NumBytes, NumVGScaledBytes;
- OffsetFromSP.getForDwarfOffset(NumBytes, NumVGScaledBytes);
+ AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(OffsetFromSP, NumBytes,
+ NumVGScaledBytes);
std::string CommentBuffer = "sp";
llvm::raw_string_ostream Comment(CommentBuffer);
@@ -462,7 +441,8 @@ MCCFIInstruction AArch64FrameLowering::createCfaOffset(
const TargetRegisterInfo &TRI, unsigned Reg,
const StackOffset &OffsetFromDefCFA) const {
int64_t NumBytes, NumVGScaledBytes;
- OffsetFromDefCFA.getForDwarfOffset(NumBytes, NumVGScaledBytes);
+ AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
+ OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
@@ -516,14 +496,14 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves(
continue;
StackOffset Offset;
- if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::SVEVector) {
+ if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector) {
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- Offset = StackOffset(MFI.getObjectOffset(Info.getFrameIdx()), MVT::nxv1i8) -
- StackOffset(AFI->getCalleeSavedStackSize(MFI), MVT::i8);
+ Offset =
+ StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
+ StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI));
} else {
- Offset = {MFI.getObjectOffset(Info.getFrameIdx()) -
- getOffsetOfLocalArea(),
- MVT::i8};
+ Offset = StackOffset::getFixed(MFI.getObjectOffset(Info.getFrameIdx()) -
+ getOffsetOfLocalArea());
}
unsigned CFIIndex = MF.addFrameInst(createCfaOffset(*TRI, Reg, Offset));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
@@ -604,6 +584,12 @@ static bool windowsRequiresStackProbe(MachineFunction &MF,
!F.hasFnAttribute("no-stack-arg-probe");
}
+static bool needsWinCFI(const MachineFunction &MF) {
+ const Function &F = MF.getFunction();
+ return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
+ F.needsUnwindTableEntry();
+}
+
bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
MachineFunction &MF, uint64_t StackBumpBytes) const {
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
@@ -614,6 +600,18 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
if (AFI->getLocalStackSize() == 0)
return false;
+ // For WinCFI, if optimizing for size, prefer to not combine the stack bump
+ // (to force a stp with predecrement) to match the packed unwind format,
+ // provided that there actually are any callee saved registers to merge the
+ // decrement with.
+ // This is potentially marginally slower, but allows using the packed
+ // unwind format for functions that both have a local area and callee saved
+ // registers. Using the packed unwind format notably reduces the size of
+ // the unwind info.
+ if (needsWinCFI(MF) && AFI->getCalleeSavedStackSize() > 0 &&
+ MF.getFunction().hasOptSize())
+ return false;
+
// 512 is the maximum immediate for stp/ldp that will be used for
// callee-save save/restores
if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes))
@@ -1007,27 +1005,6 @@ static void adaptForLdStOpt(MachineBasicBlock &MBB,
//
}
-static bool ShouldSignWithAKey(MachineFunction &MF) {
- const Function &F = MF.getFunction();
- if (!F.hasFnAttribute("sign-return-address-key"))
- return true;
-
- const StringRef Key =
- F.getFnAttribute("sign-return-address-key").getValueAsString();
- assert(Key.equals_lower("a_key") || Key.equals_lower("b_key"));
- return Key.equals_lower("a_key");
-}
-
-static bool needsWinCFI(const MachineFunction &MF) {
- const Function &F = MF.getFunction();
- return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
- F.needsUnwindTableEntry();
-}
-
-static bool isTargetDarwin(const MachineFunction &MF) {
- return MF.getSubtarget<AArch64Subtarget>().isTargetDarwin();
-}
-
static bool isTargetWindows(const MachineFunction &MF) {
return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
}
@@ -1074,15 +1051,16 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// to determine the end of the prologue.
DebugLoc DL;
- if (ShouldSignReturnAddress(MF)) {
- if (ShouldSignWithAKey(MF))
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIASP))
- .setMIFlag(MachineInstr::FrameSetup);
- else {
+ const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
+ if (MFnI.shouldSignReturnAddress()) {
+ if (MFnI.shouldSignWithBKey()) {
BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY))
.setMIFlag(MachineInstr::FrameSetup);
BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIBSP))
.setMIFlag(MachineInstr::FrameSetup);
+ } else {
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIASP))
+ .setMIFlag(MachineInstr::FrameSetup);
}
unsigned CFIIndex =
@@ -1097,9 +1075,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
if (MF.getFunction().getCallingConv() == CallingConv::GHC)
return;
- // Set tagged base pointer to the bottom of the stack frame.
+ // Set tagged base pointer to the requested stack slot.
// Ideally it should match SP value after prologue.
- AFI->setTaggedBasePointerOffset(MFI.getStackSize());
+ Optional<int> TBPI = AFI->getTaggedBasePointerIndex();
+ if (TBPI)
+ AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
+ else
+ AFI->setTaggedBasePointerOffset(MFI.getStackSize());
const StackOffset &SVEStackSize = getSVEStackSize(MF);
@@ -1126,8 +1108,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
++NumRedZoneFunctions;
} else {
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
- {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup,
- false, NeedsWinCFI, &HasWinCFI);
+ StackOffset::getFixed(-NumBytes), TII,
+ MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
if (!NeedsWinCFI && needsFrameMoves) {
// Label used to tie together the PROLOG_LABEL and the MachineMoves.
MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
@@ -1160,8 +1142,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
if (CombineSPBump) {
assert(!SVEStackSize && "Cannot combine SP bump with SVE");
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
- {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, false,
- NeedsWinCFI, &HasWinCFI);
+ StackOffset::getFixed(-NumBytes), TII,
+ MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
NumBytes = 0;
} else if (PrologueSaveSize != 0) {
MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
@@ -1185,7 +1167,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// For funclets the FP belongs to the containing function.
if (!IsFunclet && HasFP) {
// Only set up FP if we actually need to.
- int64_t FPOffset = isTargetDarwin(MF) ? (AFI->getCalleeSavedStackSize() - 16) : 0;
+ int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset();
if (CombineSPBump)
FPOffset += AFI->getLocalStackSize();
@@ -1195,8 +1177,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// Note: All stores of callee-saved registers are marked as "FrameSetup".
// This code marks the instruction(s) that set the FP also.
emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
- {FPOffset, MVT::i8}, TII, MachineInstr::FrameSetup, false,
- NeedsWinCFI, &HasWinCFI);
+ StackOffset::getFixed(FPOffset), TII,
+ MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
}
if (windowsRequiresStackProbe(MF, NumBytes)) {
@@ -1306,7 +1288,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
++MBBI;
CalleeSavesEnd = MBBI;
- AllocateBefore = {CalleeSavedSize, MVT::nxv1i8};
+ AllocateBefore = StackOffset::getScalable(CalleeSavedSize);
AllocateAfter = SVEStackSize - AllocateBefore;
}
@@ -1338,8 +1320,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// the correct value here, as NumBytes also includes padding bytes,
// which shouldn't be counted here.
emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP,
- {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup,
- false, NeedsWinCFI, &HasWinCFI);
+ StackOffset::getFixed(-NumBytes), TII,
+ MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
if (NeedsRealignment) {
const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
@@ -1409,11 +1391,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
}
if (needsFrameMoves) {
- const DataLayout &TD = MF.getDataLayout();
- const int StackGrowth = isTargetDarwin(MF)
- ? (2 * -TD.getPointerSize(0))
- : -AFI->getCalleeSavedStackSize();
- Register FramePtr = RegInfo->getFrameRegister(MF);
// An example of the prologue:
//
// .globl __foo
@@ -1481,10 +1458,15 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// .cfi_offset w28, -32
if (HasFP) {
+ const int OffsetToFirstCalleeSaveFromFP =
+ AFI->getCalleeSaveBaseToFrameRecordOffset() -
+ AFI->getCalleeSavedStackSize();
+ Register FramePtr = RegInfo->getFrameRegister(MF);
+
// Define the current CFA rule to use the provided FP.
unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - StackGrowth));
+ MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
@@ -1494,7 +1476,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
const TargetSubtargetInfo &STI = MF.getSubtarget();
const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
StackOffset TotalSize =
- SVEStackSize + StackOffset((int64_t)MFI.getStackSize(), MVT::i8);
+ SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize());
CFIIndex = MF.addFrameInst(createDefCFAExpressionFromSP(TRI, TotalSize));
} else {
// Encode the stack size of the leaf function.
@@ -1514,7 +1496,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
static void InsertReturnAddressAuth(MachineFunction &MF,
MachineBasicBlock &MBB) {
- if (!ShouldSignReturnAddress(MF))
+ const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
+ if (!MFI.shouldSignReturnAddress())
return;
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
@@ -1528,16 +1511,16 @@ static void InsertReturnAddressAuth(MachineFunction &MF,
// this instruction can safely be used for any v8a architecture.
// From v8.3a onwards there are optimised authenticate LR and return
// instructions, namely RETA{A,B}, that can be used instead.
- if (Subtarget.hasV8_3aOps() && MBBI != MBB.end() &&
+ if (Subtarget.hasPAuth() && MBBI != MBB.end() &&
MBBI->getOpcode() == AArch64::RET_ReallyLR) {
BuildMI(MBB, MBBI, DL,
- TII->get(ShouldSignWithAKey(MF) ? AArch64::RETAA : AArch64::RETAB))
+ TII->get(MFI.shouldSignWithBKey() ? AArch64::RETAB : AArch64::RETAA))
.copyImplicitOps(*MBBI);
MBB.erase(MBBI);
} else {
BuildMI(
MBB, MBBI, DL,
- TII->get(ShouldSignWithAKey(MF) ? AArch64::AUTIASP : AArch64::AUTIBSP))
+ TII->get(MFI.shouldSignWithBKey() ? AArch64::AUTIBSP : AArch64::AUTIASP))
.setMIFlag(MachineInstr::FrameDestroy);
}
}
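The prologue and epilogue changes above route the return-address-signing decisions through AArch64FunctionInfo. Condensed into a standalone sketch, the epilogue's instruction choice looks roughly like this; pickAuthInstr is an invented name, the real code builds MachineInstrs rather than returning mnemonics, and the prologue analogously picks PACIASP or PACIBSP (plus EMITBKEY) for the signing side.

#include <string>

// Which authenticate instruction is used depends on the chosen key (A vs B)
// and on whether the combined authenticate-and-return forms (RETAA / RETAB,
// available with the PAuth extension) can replace AUTI*SP followed by RET.
std::string pickAuthInstr(bool SignWithBKey, bool HasPAuth,
                          bool NextInstrIsRet) {
  if (HasPAuth && NextInstrIsRet)
    return SignWithBKey ? "retab" : "retaa";   // fold auth into the return
  return SignWithBKey ? "autibsp" : "autiasp"; // authenticate, then plain ret
}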
@@ -1562,10 +1545,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
bool NeedsWinCFI = needsWinCFI(MF);
bool HasWinCFI = false;
bool IsFunclet = false;
- auto WinCFI = make_scope_exit([&]() {
- if (!MF.hasWinCFI())
- MF.setHasWinCFI(HasWinCFI);
- });
+ auto WinCFI = make_scope_exit([&]() { assert(HasWinCFI == MF.hasWinCFI()); });
if (MBB.end() != MBBI) {
DL = MBBI->getDebugLoc();
@@ -1665,7 +1645,13 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
NeedsWinCFI, &HasWinCFI);
}
- if (NeedsWinCFI) {
+ if (MF.hasWinCFI()) {
+ // If the prologue didn't contain any SEH opcodes and didn't set the
+ // MF.hasWinCFI() flag, assume the epilogue won't either, and skip the
+ // EpilogStart - to avoid generating CFI for functions that don't need it.
+ // (And as the prologue didn't emit any WinCFI at all, emitting it only in
+ // the epilogue would be asymmetrical.) By the end of the function, we assert that
+ // HasWinCFI is equal to MF.hasWinCFI(), to verify this assumption.
HasWinCFI = true;
BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
.setMIFlag(MachineInstr::FrameDestroy);
@@ -1677,9 +1663,10 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
if (CombineSPBump) {
assert(!SVEStackSize && "Cannot combine SP bump with SVE");
emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
- {NumBytes + (int64_t)AfterCSRPopSize, MVT::i8}, TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
- if (NeedsWinCFI && HasWinCFI)
+ StackOffset::getFixed(NumBytes + (int64_t)AfterCSRPopSize),
+ TII, MachineInstr::FrameDestroy, false, NeedsWinCFI,
+ &HasWinCFI);
+ if (HasWinCFI)
BuildMI(MBB, MBB.getFirstTerminator(), DL,
TII->get(AArch64::SEH_EpilogEnd))
.setMIFlag(MachineInstr::FrameDestroy);
@@ -1702,7 +1689,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
assert(IsSVECalleeSave(RestoreBegin) &&
IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
- StackOffset CalleeSavedSizeAsOffset = {CalleeSavedSize, MVT::nxv1i8};
+ StackOffset CalleeSavedSizeAsOffset =
+ StackOffset::getScalable(CalleeSavedSize);
DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
DeallocateAfter = CalleeSavedSizeAsOffset;
}
@@ -1715,14 +1703,15 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// be reloaded. The code below will deallocate the stack space
// space by moving FP -> SP.
emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
- {-CalleeSavedSize, MVT::nxv1i8}, TII,
+ StackOffset::getScalable(-CalleeSavedSize), TII,
MachineInstr::FrameDestroy);
} else {
if (AFI->getSVECalleeSavedStackSize()) {
// Deallocate the non-SVE locals first before we can deallocate (and
// restore callee saves) from the SVE area.
emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
- {NumBytes, MVT::i8}, TII, MachineInstr::FrameDestroy);
+ StackOffset::getFixed(NumBytes), TII,
+ MachineInstr::FrameDestroy);
NumBytes = 0;
}
@@ -1755,11 +1744,10 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
- {StackRestoreBytes, MVT::i8}, TII,
+ StackOffset::getFixed(StackRestoreBytes), TII,
MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
if (Done) {
- if (NeedsWinCFI) {
- HasWinCFI = true;
+ if (HasWinCFI) {
BuildMI(MBB, MBB.getFirstTerminator(), DL,
TII->get(AArch64::SEH_EpilogEnd))
.setMIFlag(MachineInstr::FrameDestroy);
@@ -1775,15 +1763,14 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// non-post-indexed loads for the restores if we aren't actually going to
// be able to save any instructions.
if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
- int64_t OffsetToFrameRecord =
- isTargetDarwin(MF) ? (-(int64_t)AFI->getCalleeSavedStackSize() + 16) : 0;
- emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
- {OffsetToFrameRecord, MVT::i8},
- TII, MachineInstr::FrameDestroy, false, NeedsWinCFI);
+ emitFrameOffset(
+ MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
+ StackOffset::getFixed(-AFI->getCalleeSaveBaseToFrameRecordOffset()),
+ TII, MachineInstr::FrameDestroy, false, NeedsWinCFI);
} else if (NumBytes)
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
- {NumBytes, MVT::i8}, TII, MachineInstr::FrameDestroy, false,
- NeedsWinCFI);
+ StackOffset::getFixed(NumBytes), TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI);
// This must be placed after the callee-save restore code because that code
// assumes the SP is at the same location as it was after the callee-save save
@@ -1804,62 +1791,63 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
- {(int64_t)AfterCSRPopSize, MVT::i8}, TII,
+ StackOffset::getFixed((int64_t)AfterCSRPopSize), TII,
MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
}
- if (NeedsWinCFI && HasWinCFI)
+ if (HasWinCFI)
BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
.setMIFlag(MachineInstr::FrameDestroy);
-
- MF.setHasWinCFI(HasWinCFI);
}
/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
/// debug info. It's the same as what we use for resolving the code-gen
/// references for now. FIXME: This can go wrong when references are
/// SP-relative and simple call frames aren't used.
-int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF,
- int FI,
- Register &FrameReg) const {
+StackOffset
+AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const {
return resolveFrameIndexReference(
- MF, FI, FrameReg,
- /*PreferFP=*/
- MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
- /*ForSimm=*/false)
- .getBytes();
+ MF, FI, FrameReg,
+ /*PreferFP=*/
+ MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
+ /*ForSimm=*/false);
}
-int AArch64FrameLowering::getNonLocalFrameIndexReference(
- const MachineFunction &MF, int FI) const {
- return getSEHFrameIndexOffset(MF, FI);
+StackOffset
+AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF,
+ int FI) const {
+ return StackOffset::getFixed(getSEHFrameIndexOffset(MF, FI));
}
-static StackOffset getFPOffset(const MachineFunction &MF, int64_t ObjectOffset) {
+static StackOffset getFPOffset(const MachineFunction &MF,
+ int64_t ObjectOffset) {
const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
bool IsWin64 =
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
-
unsigned FixedObject =
getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
- unsigned FPAdjust = isTargetDarwin(MF)
- ? 16 : AFI->getCalleeSavedStackSize(MF.getFrameInfo());
- return {ObjectOffset + FixedObject + FPAdjust, MVT::i8};
+ int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MF.getFrameInfo());
+ int64_t FPAdjust =
+ CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset();
+ return StackOffset::getFixed(ObjectOffset + FixedObject + FPAdjust);
}
-static StackOffset getStackOffset(const MachineFunction &MF, int64_t ObjectOffset) {
+static StackOffset getStackOffset(const MachineFunction &MF,
+ int64_t ObjectOffset) {
const auto &MFI = MF.getFrameInfo();
- return {ObjectOffset + (int64_t)MFI.getStackSize(), MVT::i8};
+ return StackOffset::getFixed(ObjectOffset + (int64_t)MFI.getStackSize());
}
+ // TODO: This function currently does not work for scalable vectors.
int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
int FI) const {
const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
MF.getSubtarget().getRegisterInfo());
int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
- ? getFPOffset(MF, ObjectOffset).getBytes()
- : getStackOffset(MF, ObjectOffset).getBytes();
+ ? getFPOffset(MF, ObjectOffset).getFixed()
+ : getStackOffset(MF, ObjectOffset).getFixed();
}
StackOffset AArch64FrameLowering::resolveFrameIndexReference(
@@ -1868,7 +1856,7 @@ StackOffset AArch64FrameLowering::resolveFrameIndexReference(
const auto &MFI = MF.getFrameInfo();
int64_t ObjectOffset = MFI.getObjectOffset(FI);
bool isFixed = MFI.isFixedObjectIndex(FI);
- bool isSVE = MFI.getStackID(FI) == TargetStackID::SVEVector;
+ bool isSVE = MFI.getStackID(FI) == TargetStackID::ScalableVector;
return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
PreferFP, ForSimm);
}
@@ -1882,8 +1870,8 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
- int64_t FPOffset = getFPOffset(MF, ObjectOffset).getBytes();
- int64_t Offset = getStackOffset(MF, ObjectOffset).getBytes();
+ int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed();
+ int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed();
bool isCSR =
!isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
@@ -1958,16 +1946,16 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
"non-argument/CSR objects cannot be accessed through the frame pointer");
if (isSVE) {
- int64_t OffsetToSVEArea =
- MFI.getStackSize() - AFI->getCalleeSavedStackSize();
- StackOffset FPOffset = {ObjectOffset, MVT::nxv1i8};
- StackOffset SPOffset = SVEStackSize +
- StackOffset(ObjectOffset, MVT::nxv1i8) +
- StackOffset(OffsetToSVEArea, MVT::i8);
+ StackOffset FPOffset =
+ StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset);
+ StackOffset SPOffset =
+ SVEStackSize +
+ StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
+ ObjectOffset);
// Always use the FP for SVE spills if available and beneficial.
if (hasFP(MF) &&
- (SPOffset.getBytes() ||
- FPOffset.getScalableBytes() < SPOffset.getScalableBytes() ||
+ (SPOffset.getFixed() ||
+ FPOffset.getScalable() < SPOffset.getScalable() ||
RegInfo->needsStackRealignment(MF))) {
FrameReg = RegInfo->getFrameRegister(MF);
return FPOffset;
@@ -1986,7 +1974,7 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
if (UseFP) {
FrameReg = RegInfo->getFrameRegister(MF);
- return StackOffset(FPOffset, MVT::i8) + ScalableOffset;
+ return StackOffset::getFixed(FPOffset) + ScalableOffset;
}
// Use the base pointer if we have one.
@@ -2003,7 +1991,7 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
Offset -= AFI->getLocalStackSize();
}
- return StackOffset(Offset, MVT::i8) + ScalableOffset;
+ return StackOffset::getFixed(Offset) + ScalableOffset;
}
static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
@@ -2025,21 +2013,28 @@ static bool produceCompactUnwindFrame(MachineFunction &MF) {
}
static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
- bool NeedsWinCFI) {
+ bool NeedsWinCFI, bool IsFirst) {
// If we are generating register pairs for a Windows function that requires
// EH support, then pair consecutive registers only. There are no unwind
// opcodes for saves/restores of non-consecutive register pairs.
- // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_frepg_x.
+ // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_fregp_x,
+ // save_lrpair.
// https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
- // TODO: LR can be paired with any register. We don't support this yet in
- // the MCLayer. We need to add support for the save_lrpair unwind code.
if (Reg2 == AArch64::FP)
return true;
if (!NeedsWinCFI)
return false;
if (Reg2 == Reg1 + 1)
return false;
+ // If pairing a GPR with LR, the pair can be described by the save_lrpair
+ // opcode. If this is the first register pair, it would end up with a
+ // predecrement, but there's no save_lrpair_x opcode, so we can only do this
+ // if LR is paired with something other than the first register.
+ // The save_lrpair opcode requires the first register to be an odd one.
+ if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 &&
+ (Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR && !IsFirst)
+ return false;
return true;
}
@@ -2048,9 +2043,10 @@ static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
/// LR and FP need to be allocated together when the frame needs to save
/// the frame-record. This means any other register pairing with LR is invalid.
static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
- bool UsesWinAAPCS, bool NeedsWinCFI, bool NeedsFrameRecord) {
+ bool UsesWinAAPCS, bool NeedsWinCFI,
+ bool NeedsFrameRecord, bool IsFirst) {
if (UsesWinAAPCS)
- return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI);
+ return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst);
// If we need to store the frame record, don't pair any register
// with LR other than FP.
@@ -2114,14 +2110,22 @@ static void computeCalleeSaveRegisterPairs(
(Count & 1) == 0) &&
"Odd number of callee-saved regs to spill!");
int ByteOffset = AFI->getCalleeSavedStackSize();
+ int StackFillDir = -1;
+ int RegInc = 1;
+ unsigned FirstReg = 0;
+ if (NeedsWinCFI) {
+ // For WinCFI, fill the stack from the bottom up.
+ ByteOffset = 0;
+ StackFillDir = 1;
+ // As the CSI array is reversed to match PrologEpilogInserter, iterate
+ // backwards, to pair up registers starting from lower numbered registers.
+ RegInc = -1;
+ FirstReg = Count - 1;
+ }
int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
- // On Linux, we will have either one or zero non-paired register. On Windows
- // with CFI, we can have multiple unpaired registers in order to utilize the
- // available unwind codes. This flag assures that the alignment fixup is done
- // only once, as intened.
- bool FixupDone = false;
- for (unsigned i = 0; i < Count; ++i) {
+ // When iterating backwards, the loop condition relies on unsigned wraparound.
+ for (unsigned i = FirstReg; i < Count; i += RegInc) {
RegPairInfo RPI;
RPI.Reg1 = CSI[i].getReg();
@@ -2139,18 +2143,20 @@ static void computeCalleeSaveRegisterPairs(
llvm_unreachable("Unsupported register class.");
// Add the next reg to the pair if it is in the same register class.
- if (i + 1 < Count) {
- unsigned NextReg = CSI[i + 1].getReg();
+ if (unsigned(i + RegInc) < Count) {
+ unsigned NextReg = CSI[i + RegInc].getReg();
+ bool IsFirst = i == FirstReg;
switch (RPI.Type) {
case RegPairInfo::GPR:
if (AArch64::GPR64RegClass.contains(NextReg) &&
- !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows, NeedsWinCFI,
- NeedsFrameRecord))
+ !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows,
+ NeedsWinCFI, NeedsFrameRecord, IsFirst))
RPI.Reg2 = NextReg;
break;
case RegPairInfo::FPR64:
if (AArch64::FPR64RegClass.contains(NextReg) &&
- !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI))
+ !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI,
+ IsFirst))
RPI.Reg2 = NextReg;
break;
case RegPairInfo::FPR128:
@@ -2179,7 +2185,7 @@ static void computeCalleeSaveRegisterPairs(
// The order of the registers in the list is controlled by
// getCalleeSavedRegs(), so they will always be in-order, as well.
assert((!RPI.isPaired() ||
- (CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) &&
+ (CSI[i].getFrameIdx() + RegInc == CSI[i + RegInc].getFrameIdx())) &&
"Out of order callee saved regs!");
assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP ||
@@ -2201,39 +2207,72 @@ static void computeCalleeSaveRegisterPairs(
"Callee-save registers not saved as adjacent register pair!");
RPI.FrameIdx = CSI[i].getFrameIdx();
+ if (NeedsWinCFI &&
+ RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
+ RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
int Scale = RPI.getScale();
+
+ int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
+ assert(OffsetPre % Scale == 0);
+
if (RPI.isScalable())
- ScalableByteOffset -= Scale;
+ ScalableByteOffset += StackFillDir * Scale;
else
- ByteOffset -= RPI.isPaired() ? 2 * Scale : Scale;
+ ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
assert(!(RPI.isScalable() && RPI.isPaired()) &&
"Paired spill/fill instructions don't exist for SVE vectors");
// Round up size of non-pair to pair size if we need to pad the
// callee-save area to ensure 16-byte alignment.
- if (AFI->hasCalleeSaveStackFreeSpace() && !FixupDone &&
+ if (AFI->hasCalleeSaveStackFreeSpace() && !NeedsWinCFI &&
!RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
!RPI.isPaired()) {
- FixupDone = true;
- ByteOffset -= 8;
+ ByteOffset += 8 * StackFillDir;
assert(ByteOffset % 16 == 0);
assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
+ // A stack frame with a gap looks like this, bottom up:
+ // d9, d8, x21, gap, x20, x19.
+ // Set extra alignment on the x21 object (the only unpaired register)
+ // to create the gap above it.
MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
}
- int Offset = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
- assert(Offset % Scale == 0);
+ int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
+ assert(OffsetPost % Scale == 0);
+ // If filling top down (default), we want the offset after incrementing it.
+ // If filling bottom up (WinCFI), we need the original offset.
+ int Offset = NeedsWinCFI ? OffsetPre : OffsetPost;
RPI.Offset = Offset / Scale;
assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
(RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
"Offset out of bounds for LDP/STP immediate");
+ // Save the offset to frame record so that the FP register can point to the
+ // innermost frame record (spilled FP and LR registers).
+ if (NeedsFrameRecord && ((!IsWindows && RPI.Reg1 == AArch64::LR &&
+ RPI.Reg2 == AArch64::FP) ||
+ (IsWindows && RPI.Reg1 == AArch64::FP &&
+ RPI.Reg2 == AArch64::LR)))
+ AFI->setCalleeSaveBaseToFrameRecordOffset(Offset);
+
RegPairs.push_back(RPI);
if (RPI.isPaired())
- ++i;
+ i += RegInc;
+ }
+ if (NeedsWinCFI) {
+ // If we need an alignment gap in the stack, align the topmost stack
+ // object. A stack frame with a gap looks like this, bottom up:
+ // x19, d8, d9, gap.
+ // Set extra alignment on the topmost stack object (the first element in
+ // CSI, which goes top down), to create the gap above it.
+ if (AFI->hasCalleeSaveStackFreeSpace())
+ MFI.setObjectAlignment(CSI[0].getFrameIdx(), Align(16));
+ // We iterated bottom up over the registers; flip RegPairs back to top
+ // down order.
+ std::reverse(RegPairs.begin(), RegPairs.end());
}
}
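The loop above now supports two fill directions: the default layout counts down from the total callee-save size and records the post-decrement offset, while the WinCFI layout counts up from zero and records the pre-increment offset. A small standalone model of that bookkeeping is sketched below; layoutCSROffsets is an invented helper and ignores scalable slots and pairing details.

#include <vector>

// Each callee-save slot is 'Size' bytes (twice the scale when paired). The
// default layout starts at the total callee-save size and walks down; the
// WinCFI layout starts at zero and walks up, taking the offset before the
// increment instead of after it.
std::vector<int> layoutCSROffsets(const std::vector<int> &SlotSizes,
                                  int TotalSize, bool WinCFI) {
  int FillDir = WinCFI ? 1 : -1;
  int ByteOffset = WinCFI ? 0 : TotalSize;
  std::vector<int> Offsets;
  for (int Size : SlotSizes) {
    int Pre = ByteOffset;
    ByteOffset += FillDir * Size;
    Offsets.push_back(WinCFI ? Pre : ByteOffset);
  }
  return Offsets;
}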
@@ -2373,7 +2412,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
// Update the StackIDs of the SVE stack slots.
MachineFrameInfo &MFI = MF.getFrameInfo();
if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
- MFI.setStackID(RPI.FrameIdx, TargetStackID::SVEVector);
+ MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector);
}
return true;
@@ -2665,6 +2704,21 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
}
+bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
+ MachineFunction &MF, const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const {
+ bool NeedsWinCFI = needsWinCFI(MF);
+ // To match the canonical windows frame layout, reverse the list of
+ // callee saved registers to get them laid out by PrologEpilogInserter
+ // in the right order. (PrologEpilogInserter allocates stack objects top
+ // down. Windows canonical prologs store higher numbered registers at
+ // the top, thus have the CSI array start from the highest registers.)
+ if (NeedsWinCFI)
+ std::reverse(CSI.begin(), CSI.end());
+ // Let the generic code do the rest of the setup.
+ return false;
+}
+
bool AArch64FrameLowering::enableStackSlotScavenging(
const MachineFunction &MF) const {
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
@@ -2707,7 +2761,7 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
#ifndef NDEBUG
// First process all fixed stack objects.
for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
- assert(MFI.getStackID(I) != TargetStackID::SVEVector &&
+ assert(MFI.getStackID(I) != TargetStackID::ScalableVector &&
"SVE vectors should never be passed on the stack by value, only by "
"reference.");
#endif
@@ -2737,7 +2791,7 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
SmallVector<int, 8> ObjectsToAllocate;
for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
unsigned StackID = MFI.getStackID(I);
- if (StackID != TargetStackID::SVEVector)
+ if (StackID != TargetStackID::ScalableVector)
continue;
if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex)
continue;
@@ -2891,12 +2945,12 @@ void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
const int64_t kMaxOffset = 255 * 16;
Register BaseReg = FrameReg;
- int64_t BaseRegOffsetBytes = FrameRegOffset.getBytes();
+ int64_t BaseRegOffsetBytes = FrameRegOffset.getFixed();
if (BaseRegOffsetBytes < kMinOffset ||
BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
- {BaseRegOffsetBytes, MVT::i8}, TII);
+ StackOffset::getFixed(BaseRegOffsetBytes), TII);
BaseReg = ScratchReg;
BaseRegOffsetBytes = 0;
}
@@ -2953,7 +3007,7 @@ void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
LoopI->setFlags(FrameRegUpdateFlags);
int64_t ExtraBaseRegUpdate =
- FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getBytes() - Size) : 0;
+ FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getFixed() - Size) : 0;
if (LoopSize < Size) {
assert(FrameRegUpdate);
assert(Size - LoopSize == 16);
@@ -3057,7 +3111,7 @@ void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
// realistically happens in function epilogue. Also, STGloop is expanded
// before that pass.
if (InsertI != MBB->end() &&
- canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getBytes() + Size,
+ canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getFixed() + Size,
&TotalOffset)) {
UpdateInstr = &*InsertI++;
LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n "
@@ -3220,7 +3274,7 @@ void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
/// before the update. This is easily retrieved as it is exactly the offset
/// that is set in processFunctionBeforeFrameFinalized.
-int AArch64FrameLowering::getFrameIndexReferencePreferSP(
+StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP(
const MachineFunction &MF, int FI, Register &FrameReg,
bool IgnoreSPUpdates) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -3228,7 +3282,7 @@ int AArch64FrameLowering::getFrameIndexReferencePreferSP(
LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
<< MFI.getObjectOffset(FI) << "\n");
FrameReg = AArch64::SP;
- return MFI.getObjectOffset(FI);
+ return StackOffset::getFixed(MFI.getObjectOffset(FI));
}
return getFrameIndexReference(MF, FI, FrameReg);
@@ -3252,3 +3306,162 @@ unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
getStackAlign());
}
+
+namespace {
+struct FrameObject {
+ bool IsValid = false;
+ // Index of the object in MFI.
+ int ObjectIndex = 0;
+ // Group ID this object belongs to.
+ int GroupIndex = -1;
+ // This object should be placed first (closest to SP).
+ bool ObjectFirst = false;
+ // This object's group (which always contains the object with
+ // ObjectFirst==true) should be placed first.
+ bool GroupFirst = false;
+};
+
+class GroupBuilder {
+ SmallVector<int, 8> CurrentMembers;
+ int NextGroupIndex = 0;
+ std::vector<FrameObject> &Objects;
+
+public:
+ GroupBuilder(std::vector<FrameObject> &Objects) : Objects(Objects) {}
+ void AddMember(int Index) { CurrentMembers.push_back(Index); }
+ void EndCurrentGroup() {
+ if (CurrentMembers.size() > 1) {
+ // Create a new group with the current member list. This might remove them
+ // from their pre-existing groups. That's OK, dealing with overlapping
+ // groups is too hard and unlikely to make a difference.
+ LLVM_DEBUG(dbgs() << "group:");
+ for (int Index : CurrentMembers) {
+ Objects[Index].GroupIndex = NextGroupIndex;
+ LLVM_DEBUG(dbgs() << " " << Index);
+ }
+ LLVM_DEBUG(dbgs() << "\n");
+ NextGroupIndex++;
+ }
+ CurrentMembers.clear();
+ }
+};
+
+bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
+ // Objects at a lower index are closer to FP; objects at a higher index are
+ // closer to SP.
+ //
+ // For consistency in our comparison, all invalid objects are placed
+ // at the end. This also allows us to stop walking when we hit the
+ // first invalid item after it's all sorted.
+ //
+ // The "first" object goes first (closest to SP), followed by the members of
+ // the "first" group.
+ //
+ // The rest are sorted by the group index to keep the groups together.
+ // Higher numbered groups are more likely to be around longer (i.e. untagged
+ // in the function epilogue and not at some earlier point). Place them closer
+ // to SP.
+ //
+ // If all else equal, sort by the object index to keep the objects in the
+ // original order.
+ return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex,
+ A.ObjectIndex) <
+ std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex,
+ B.ObjectIndex);
+}
+} // namespace
+
+void AArch64FrameLowering::orderFrameObjects(
+ const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
+ if (!OrderFrameObjects || ObjectsToAllocate.empty())
+ return;
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
+ for (auto &Obj : ObjectsToAllocate) {
+ FrameObjects[Obj].IsValid = true;
+ FrameObjects[Obj].ObjectIndex = Obj;
+ }
+
+ // Identify stack slots that are tagged at the same time.
+ GroupBuilder GB(FrameObjects);
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ if (MI.isDebugInstr())
+ continue;
+ int OpIndex;
+ switch (MI.getOpcode()) {
+ case AArch64::STGloop:
+ case AArch64::STZGloop:
+ OpIndex = 3;
+ break;
+ case AArch64::STGOffset:
+ case AArch64::STZGOffset:
+ case AArch64::ST2GOffset:
+ case AArch64::STZ2GOffset:
+ OpIndex = 1;
+ break;
+ default:
+ OpIndex = -1;
+ }
+
+ int TaggedFI = -1;
+ if (OpIndex >= 0) {
+ const MachineOperand &MO = MI.getOperand(OpIndex);
+ if (MO.isFI()) {
+ int FI = MO.getIndex();
+ if (FI >= 0 && FI < MFI.getObjectIndexEnd() &&
+ FrameObjects[FI].IsValid)
+ TaggedFI = FI;
+ }
+ }
+
+ // If this is a stack tagging instruction for a slot that is not part of a
+ // group yet, either start a new group or add it to the current one.
+ if (TaggedFI >= 0)
+ GB.AddMember(TaggedFI);
+ else
+ GB.EndCurrentGroup();
+ }
+ // Groups should never span multiple basic blocks.
+ GB.EndCurrentGroup();
+ }
+
+ // If the function's tagged base pointer is pinned to a stack slot, we want to
+ // put that slot first when possible. This will likely place it at SP + 0,
+ // and save one instruction when generating the base pointer because IRG does
+ // not allow an immediate offset.
+ const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
+ Optional<int> TBPI = AFI.getTaggedBasePointerIndex();
+ if (TBPI) {
+ FrameObjects[*TBPI].ObjectFirst = true;
+ FrameObjects[*TBPI].GroupFirst = true;
+ int FirstGroupIndex = FrameObjects[*TBPI].GroupIndex;
+ if (FirstGroupIndex >= 0)
+ for (FrameObject &Object : FrameObjects)
+ if (Object.GroupIndex == FirstGroupIndex)
+ Object.GroupFirst = true;
+ }
+
+ llvm::stable_sort(FrameObjects, FrameObjectCompare);
+
+ int i = 0;
+ for (auto &Obj : FrameObjects) {
+ // All invalid items are sorted at the end, so it's safe to stop.
+ if (!Obj.IsValid)
+ break;
+ ObjectsToAllocate[i++] = Obj.ObjectIndex;
+ }
+
+ LLVM_DEBUG(dbgs() << "Final frame order:\n"; for (auto &Obj
+ : FrameObjects) {
+ if (!Obj.IsValid)
+ break;
+ dbgs() << " " << Obj.ObjectIndex << ": group " << Obj.GroupIndex;
+ if (Obj.ObjectFirst)
+ dbgs() << ", first";
+ if (Obj.GroupFirst)
+ dbgs() << ", group-first";
+ dbgs() << "\n";
+ });
+}
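To illustrate the ordering that FrameObjectCompare establishes, a minimal sketch with hypothetical frame objects (the indexes and group numbers below are made up, not taken from any real frame):
  FrameObject A, B, C;
  A.IsValid = true; A.ObjectIndex = 5; A.GroupIndex = 0;
  B.IsValid = true; B.ObjectIndex = 2; B.GroupIndex = 2;
  // C keeps its defaults, i.e. IsValid == false.
  // FrameObjectCompare(A, B) == true: both are valid and neither is marked
  // ObjectFirst/GroupFirst, so the lower GroupIndex (0 < 2) decides.
  // FrameObjectCompare(A, C) == true and FrameObjectCompare(C, A) == false:
  // !IsValid sorts every invalid object after all valid ones, which is why
  // the final copy loop in orderFrameObjects can stop at the first invalid
  // entry.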
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index 1ca8c3e9e2bf..80079a9d9836 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -13,7 +13,7 @@
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
-#include "AArch64StackOffset.h"
+#include "llvm/Support/TypeSize.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
@@ -41,8 +41,8 @@ public:
bool canUseAsPrologue(const MachineBasicBlock &MBB) const override;
- int getFrameIndexReference(const MachineFunction &MF, int FI,
- Register &FrameReg) const override;
+ StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const override;
StackOffset resolveFrameIndexReference(const MachineFunction &MF, int FI,
Register &FrameReg, bool PreferFP,
bool ForSimm) const;
@@ -67,6 +67,11 @@ public:
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
+ bool
+ assignCalleeSavedSpillSlots(MachineFunction &MF,
+ const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const override;
+
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS) const override;
@@ -89,11 +94,12 @@ public:
unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const;
- int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
- Register &FrameReg,
- bool IgnoreSPUpdates) const override;
- int getNonLocalFrameIndexReference(const MachineFunction &MF,
- int FI) const override;
+ StackOffset
+ getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
+ Register &FrameReg,
+ bool IgnoreSPUpdates) const override;
+ StackOffset getNonLocalFrameIndexReference(const MachineFunction &MF,
+ int FI) const override;
int getSEHFrameIndexOffset(const MachineFunction &MF, int FI) const;
bool isSupportedStackID(TargetStackID::Value ID) const override {
@@ -101,7 +107,7 @@ public:
default:
return false;
case TargetStackID::Default:
- case TargetStackID::SVEVector:
+ case TargetStackID::ScalableVector:
case TargetStackID::NoAlloc:
return true;
}
@@ -110,9 +116,13 @@ public:
bool isStackIdSafeForLocalArea(unsigned StackId) const override {
// We don't support putting SVE objects into the pre-allocated local
// frame block at the moment.
- return StackId != TargetStackID::SVEVector;
+ return StackId != TargetStackID::ScalableVector;
}
+ void
+ orderFrameObjects(const MachineFunction &MF,
+ SmallVectorImpl<int> &ObjectsToAllocate) const override;
+
private:
bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
uint64_t StackBumpBytes) const;
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 7799ebfbd68e..94b5d7718d0c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -10,6 +10,7 @@
//
//===----------------------------------------------------------------------===//
+#include "AArch64MachineFunctionInfo.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/APSInt.h"
@@ -190,9 +191,14 @@ public:
return SelectSVELogicalImm(N, VT, Imm);
}
- template <unsigned Low, unsigned High>
- bool SelectSVEShiftImm64(SDValue N, SDValue &Imm) {
- return SelectSVEShiftImm64(N, Low, High, Imm);
+ template <MVT::SimpleValueType VT>
+ bool SelectSVEArithImm(SDValue N, SDValue &Imm) {
+ return SelectSVEArithImm(N, VT, Imm);
+ }
+
+ template <unsigned Low, unsigned High, bool AllowSaturation = false>
+ bool SelectSVEShiftImm(SDValue N, SDValue &Imm) {
+ return SelectSVEShiftImm(N, Low, High, AllowSaturation, Imm);
}
// Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N.
@@ -323,10 +329,10 @@ private:
bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm);
bool SelectSVESignedArithImm(SDValue N, SDValue &Imm);
- bool SelectSVEShiftImm64(SDValue N, uint64_t Low, uint64_t High,
- SDValue &Imm);
+ bool SelectSVEShiftImm(SDValue N, uint64_t Low, uint64_t High,
+ bool AllowSaturation, SDValue &Imm);
- bool SelectSVEArithImm(SDValue N, SDValue &Imm);
+ bool SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm);
bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
SDValue &Offset);
};
@@ -1371,9 +1377,12 @@ void AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
- // Transfer memoperands.
- MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
- CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ld), {MemOp});
+ // Transfer memoperands. In the case of AArch64::LD64B, there won't be one,
+ // because it's too simple to have needed special treatment during lowering.
+ if (auto *MemIntr = dyn_cast<MemIntrinsicSDNode>(N)) {
+ MachineMemOperand *MemOp = MemIntr->getMemOperand();
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ld), {MemOp});
+ }
CurDAG->RemoveDeadNode(N);
}
@@ -3127,13 +3136,28 @@ bool AArch64DAGToDAGISel::SelectSVESignedArithImm(SDValue N, SDValue &Imm) {
return false;
}
-bool AArch64DAGToDAGISel::SelectSVEArithImm(SDValue N, SDValue &Imm) {
+bool AArch64DAGToDAGISel::SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm) {
if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
- uint64_t ImmVal = CNode->getSExtValue();
- SDLoc DL(N);
- ImmVal = ImmVal & 0xFF;
+ uint64_t ImmVal = CNode->getZExtValue();
+
+ switch (VT.SimpleTy) {
+ case MVT::i8:
+ ImmVal &= 0xFF;
+ break;
+ case MVT::i16:
+ ImmVal &= 0xFFFF;
+ break;
+ case MVT::i32:
+ ImmVal &= 0xFFFFFFFF;
+ break;
+ case MVT::i64:
+ break;
+ default:
+ llvm_unreachable("Unexpected type");
+ }
+
if (ImmVal < 256) {
- Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
+ Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i32);
return true;
}
}
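As a sketch of the masking logic above, assuming the operand is a ConstantSDNode (the constants are arbitrary examples):
  // VT == MVT::i8,  constant 0x1F0: masked to 0xF0 (240), 240 < 256, so the
  //                 immediate is accepted as an i32 target constant of 240.
  // VT == MVT::i16, constant 0x1F0: masked to 0x1F0 (496), 496 >= 256, so
  //                 SelectSVEArithImm returns false and this form is not used.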
@@ -3177,19 +3201,30 @@ bool AArch64DAGToDAGISel::SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm) {
return false;
}
-// This method is only needed to "cast" i64s into i32s when the value
-// is a valid shift which has been splatted into a vector with i64 elements.
-// Every other type is fine in tablegen.
-bool AArch64DAGToDAGISel::SelectSVEShiftImm64(SDValue N, uint64_t Low,
- uint64_t High, SDValue &Imm) {
+// SVE shift intrinsics allow shift amounts larger than the element's bitwidth.
+// Rather than attempt to normalise everything, we can sometimes saturate the
+// shift amount during selection. This function also allows for consistent
+// isel patterns by ensuring the resulting "Imm" node is of the i32 type
+// required by the instructions.
+bool AArch64DAGToDAGISel::SelectSVEShiftImm(SDValue N, uint64_t Low,
+ uint64_t High, bool AllowSaturation,
+ SDValue &Imm) {
if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
uint64_t ImmVal = CN->getZExtValue();
- SDLoc DL(N);
- if (ImmVal >= Low && ImmVal <= High) {
- Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
- return true;
+ // Reject shift amounts that are too small.
+ if (ImmVal < Low)
+ return false;
+
+ // Reject or saturate shift amounts that are too big.
+ if (ImmVal > High) {
+ if (!AllowSaturation)
+ return false;
+ ImmVal = High;
}
+
+ Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i32);
+ return true;
}
return false;
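For example, with Low = 1 and High = 64 (a plausible range for right shifts of 64-bit elements; the exact bounds come from the calling pattern), the behaviour is:
  // constant 70, AllowSaturation == true:  clamped to 64 and returned as an
  //                                        i32 target constant.
  // constant 70, AllowSaturation == false: rejected (returns false).
  // constant 0:                            below Low, always rejected.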
@@ -3798,6 +3833,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
}
break;
+ case Intrinsic::aarch64_ld64b:
+ SelectLoad(Node, 8, AArch64::LD64B, AArch64::x8sub_0);
+ return;
}
} break;
case ISD::INTRINSIC_WO_CHAIN: {
@@ -4816,7 +4854,8 @@ static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT,
return EVT();
ElementCount EC = PredVT.getVectorElementCount();
- EVT ScalarVT = EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.Min);
+ EVT ScalarVT =
+ EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC * NumVec);
return MemVT;
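The element width here is AArch64::SVEBitsPerBlock (128) divided by the predicate's minimum element count, for example:
  // PredVT == nxv4i1, NumVec == 1: ScalarVT == i32, MemVT == nxv4i32
  // PredVT == nxv2i1, NumVec == 1: ScalarVT == i64, MemVT == nxv2i64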
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 48ca9039b1bd..1be09186dc0a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -27,7 +27,6 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/VectorUtils.h"
@@ -113,9 +112,76 @@ EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
"optimization"),
cl::init(true));
+// Temporary option added for the purpose of testing functionality added
+// to DAGCombiner.cpp in D92230. It is expected that this can be removed
+// in the future, once both implementations are based on MGATHER rather
+// than on the GLD1 nodes added for the SVE gather load intrinsics.
+static cl::opt<bool>
+EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
+ cl::desc("Combine extends of AArch64 masked "
+ "gather intrinsics"),
+ cl::init(true));
+
/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
+static inline EVT getPackedSVEVectorVT(EVT VT) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("unexpected element type for vector");
+ case MVT::i8:
+ return MVT::nxv16i8;
+ case MVT::i16:
+ return MVT::nxv8i16;
+ case MVT::i32:
+ return MVT::nxv4i32;
+ case MVT::i64:
+ return MVT::nxv2i64;
+ case MVT::f16:
+ return MVT::nxv8f16;
+ case MVT::f32:
+ return MVT::nxv4f32;
+ case MVT::f64:
+ return MVT::nxv2f64;
+ case MVT::bf16:
+ return MVT::nxv8bf16;
+ }
+}
+
+// NOTE: Currently there's only a need to return integer vector types. If this
+// changes then just add an extra "type" parameter.
+static inline EVT getPackedSVEVectorVT(ElementCount EC) {
+ switch (EC.getKnownMinValue()) {
+ default:
+ llvm_unreachable("unexpected element count for vector");
+ case 16:
+ return MVT::nxv16i8;
+ case 8:
+ return MVT::nxv8i16;
+ case 4:
+ return MVT::nxv4i32;
+ case 2:
+ return MVT::nxv2i64;
+ }
+}
+
+static inline EVT getPromotedVTForPredicate(EVT VT) {
+ assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
+ "Expected scalable predicate vector type!");
+ switch (VT.getVectorMinNumElements()) {
+ default:
+ llvm_unreachable("unexpected element count for vector");
+ case 2:
+ return MVT::nxv2i64;
+ case 4:
+ return MVT::nxv4i32;
+ case 8:
+ return MVT::nxv8i16;
+ case 16:
+ return MVT::nxv16i8;
+ }
+}
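A few sample mappings from the helpers above; each result is the packed type whose minimum size fills one 128-bit SVE register increment:
  // getPackedSVEVectorVT(MVT::f16)                          -> MVT::nxv8f16
  // getPackedSVEVectorVT(MVT::i64)                          -> MVT::nxv2i64
  // getPackedSVEVectorVT(ElementCount with known minimum 4) -> MVT::nxv4i32
  // getPromotedVTForPredicate(MVT::nxv2i1)                  -> MVT::nxv2i64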
+
/// Returns true if VT's elements occupy the lowest bit positions of its
/// associated register class without any intervening space.
///
@@ -128,6 +194,42 @@ static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock;
}
+// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
+// predicate and end with a passthru value matching the result type.
+static bool isMergePassthruOpcode(unsigned Opc) {
+ switch (Opc) {
+ default:
+ return false;
+ case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
+ case AArch64ISD::BSWAP_MERGE_PASSTHRU:
+ case AArch64ISD::CTLZ_MERGE_PASSTHRU:
+ case AArch64ISD::CTPOP_MERGE_PASSTHRU:
+ case AArch64ISD::DUP_MERGE_PASSTHRU:
+ case AArch64ISD::ABS_MERGE_PASSTHRU:
+ case AArch64ISD::NEG_MERGE_PASSTHRU:
+ case AArch64ISD::FNEG_MERGE_PASSTHRU:
+ case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
+ case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
+ case AArch64ISD::FCEIL_MERGE_PASSTHRU:
+ case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
+ case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
+ case AArch64ISD::FRINT_MERGE_PASSTHRU:
+ case AArch64ISD::FROUND_MERGE_PASSTHRU:
+ case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
+ case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
+ case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
+ case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
+ case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
+ case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
+ case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
+ case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
+ case AArch64ISD::FSQRT_MERGE_PASSTHRU:
+ case AArch64ISD::FRECPX_MERGE_PASSTHRU:
+ case AArch64ISD::FABS_MERGE_PASSTHRU:
+ return true;
+ }
+}
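The shared operand convention is (governing predicate, source operand(s), passthru); for instance the lowering of Intrinsic::aarch64_sve_fneg later in this patch builds:
  //   FNEG_MERGE_PASSTHRU(Pg, Src, Passthru)
  // where, following the usual SVE merging semantics, inactive lanes of the
  // result are expected to take their values from Passthru.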
+
AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
const AArch64Subtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
@@ -161,7 +263,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
addDRTypeForNEON(MVT::v1i64);
addDRTypeForNEON(MVT::v1f64);
addDRTypeForNEON(MVT::v4f16);
- addDRTypeForNEON(MVT::v4bf16);
+ if (Subtarget->hasBF16())
+ addDRTypeForNEON(MVT::v4bf16);
addQRTypeForNEON(MVT::v4f32);
addQRTypeForNEON(MVT::v2f64);
@@ -170,7 +273,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
addQRTypeForNEON(MVT::v4i32);
addQRTypeForNEON(MVT::v2i64);
addQRTypeForNEON(MVT::v8f16);
- addQRTypeForNEON(MVT::v8bf16);
+ if (Subtarget->hasBF16())
+ addQRTypeForNEON(MVT::v8bf16);
}
if (Subtarget->hasSVE()) {
@@ -199,7 +303,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
}
- if (useSVEForFixedLengthVectors()) {
+ if (Subtarget->useSVEForFixedLengthVectors()) {
for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
if (useSVEForFixedLengthVectorVT(VT))
addRegisterClass(VT, &AArch64::ZPRRegClass);
@@ -230,7 +334,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
MVT::nxv2f64 }) {
setCondCodeAction(ISD::SETO, VT, Expand);
setCondCodeAction(ISD::SETOLT, VT, Expand);
+ setCondCodeAction(ISD::SETLT, VT, Expand);
setCondCodeAction(ISD::SETOLE, VT, Expand);
+ setCondCodeAction(ISD::SETLE, VT, Expand);
setCondCodeAction(ISD::SETULT, VT, Expand);
setCondCodeAction(ISD::SETULE, VT, Expand);
setCondCodeAction(ISD::SETUGE, VT, Expand);
@@ -296,12 +402,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// Virtually no operation on f128 is legal, but LLVM can't expand them when
// there's a valid register class, so we need custom operations in most cases.
setOperationAction(ISD::FABS, MVT::f128, Expand);
- setOperationAction(ISD::FADD, MVT::f128, Custom);
+ setOperationAction(ISD::FADD, MVT::f128, LibCall);
setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
setOperationAction(ISD::FCOS, MVT::f128, Expand);
- setOperationAction(ISD::FDIV, MVT::f128, Custom);
+ setOperationAction(ISD::FDIV, MVT::f128, LibCall);
setOperationAction(ISD::FMA, MVT::f128, Expand);
- setOperationAction(ISD::FMUL, MVT::f128, Custom);
+ setOperationAction(ISD::FMUL, MVT::f128, LibCall);
setOperationAction(ISD::FNEG, MVT::f128, Expand);
setOperationAction(ISD::FPOW, MVT::f128, Expand);
setOperationAction(ISD::FREM, MVT::f128, Expand);
@@ -309,7 +415,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSIN, MVT::f128, Expand);
setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
setOperationAction(ISD::FSQRT, MVT::f128, Expand);
- setOperationAction(ISD::FSUB, MVT::f128, Custom);
+ setOperationAction(ISD::FSUB, MVT::f128, LibCall);
setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
setOperationAction(ISD::SETCC, MVT::f128, Custom);
setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
@@ -345,8 +451,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
@@ -401,6 +509,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTPOP, MVT::i64, Custom);
setOperationAction(ISD::CTPOP, MVT::i128, Custom);
+ setOperationAction(ISD::ABS, MVT::i32, Custom);
+ setOperationAction(ISD::ABS, MVT::i64, Custom);
+
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
@@ -588,6 +699,57 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
+ // Generate outline atomics library calls only if LSE was not specified for
+ // the subtarget.
+ if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
+ setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
+ setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
+ setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
+ setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
+ setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
+ setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
+ setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
+ setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
+ setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
+ setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
+ setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
+ setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
+ setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
+ setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
+ setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
+ setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
+ setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
+ setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
+ setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
+ setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
+#define LCALLNAMES(A, B, N) \
+ setLibcallName(A##N##_RELAX, #B #N "_relax"); \
+ setLibcallName(A##N##_ACQ, #B #N "_acq"); \
+ setLibcallName(A##N##_REL, #B #N "_rel"); \
+ setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
+#define LCALLNAME4(A, B) \
+ LCALLNAMES(A, B, 1) \
+ LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
+#define LCALLNAME5(A, B) \
+ LCALLNAMES(A, B, 1) \
+ LCALLNAMES(A, B, 2) \
+ LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
+ LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
+ LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
+ LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
+ LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
+ LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
+ LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
+#undef LCALLNAMES
+#undef LCALLNAME4
+#undef LCALLNAME5
+ }
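For reference, one expansion of the helper macros above, here LCALLNAMES(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas, 4), produces:
  //   setLibcallName(RTLIB::OUTLINE_ATOMIC_CAS4_RELAX, "__aarch64_cas4_relax");
  //   setLibcallName(RTLIB::OUTLINE_ATOMIC_CAS4_ACQ, "__aarch64_cas4_acq");
  //   setLibcallName(RTLIB::OUTLINE_ATOMIC_CAS4_REL, "__aarch64_cas4_rel");
  //   setLibcallName(RTLIB::OUTLINE_ATOMIC_CAS4_ACQ_REL, "__aarch64_cas4_acq_rel");
  // Only the CAS family uses LCALLNAME5, so only compare-and-swap registers a
  // 16-byte variant.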
+
// 128-bit loads and stores can be done without expanding
setOperationAction(ISD::LOAD, MVT::i128, Custom);
setOperationAction(ISD::STORE, MVT::i128, Custom);
@@ -677,8 +839,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// Trap.
setOperationAction(ISD::TRAP, MVT::Other, Legal);
- if (Subtarget->isTargetWindows())
- setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
+ setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
+ setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
// We combine OR nodes for bitfield operations.
setTargetDAGCombine(ISD::OR);
@@ -688,6 +850,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// Vector add and sub nodes may conceal a high-half opportunity.
// Also, try to fold ADD into CSINC/CSINV..
setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::ABS);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::XOR);
@@ -704,11 +867,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
+ setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::CONCAT_VECTORS);
setTargetDAGCombine(ISD::STORE);
if (Subtarget->supportsAddressTopByteIgnored())
setTargetDAGCombine(ISD::LOAD);
+ setTargetDAGCombine(ISD::MGATHER);
+ setTargetDAGCombine(ISD::MSCATTER);
+
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::SELECT);
@@ -717,6 +884,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::INTRINSIC_VOID);
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::VECREDUCE_ADD);
setTargetDAGCombine(ISD::GlobalAddress);
@@ -836,28 +1005,33 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+ // Saturates
for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
- // Vector reductions
- setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
- setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
- setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
- setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
- setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
-
- // Saturates
setOperationAction(ISD::SADDSAT, VT, Legal);
setOperationAction(ISD::UADDSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);
-
- setOperationAction(ISD::TRUNCATE, VT, Custom);
}
+
+ // Vector reductions
for (MVT VT : { MVT::v4f16, MVT::v2f32,
MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
+
+ if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16())
+ setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
}
+ for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
+ MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
+ setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+ }
+ setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
@@ -918,46 +1092,112 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// FIXME: Add custom lowering of MLOAD to handle different passthrus (not a
// splat of 0 or undef) once vector selects are supported in SVE codegen. See
// D68877 for more details.
- for (MVT VT : MVT::integer_scalable_vector_valuetypes()) {
- if (isTypeLegal(VT)) {
- setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
- setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
- setOperationAction(ISD::SELECT, VT, Custom);
- setOperationAction(ISD::SDIV, VT, Custom);
- setOperationAction(ISD::UDIV, VT, Custom);
- setOperationAction(ISD::SMIN, VT, Custom);
- setOperationAction(ISD::UMIN, VT, Custom);
- setOperationAction(ISD::SMAX, VT, Custom);
- setOperationAction(ISD::UMAX, VT, Custom);
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
- if (VT.getScalarType() == MVT::i1) {
- setOperationAction(ISD::SETCC, VT, Custom);
- setOperationAction(ISD::TRUNCATE, VT, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
- }
- }
+ for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
+ setOperationAction(ISD::BITREVERSE, VT, Custom);
+ setOperationAction(ISD::BSWAP, VT, Custom);
+ setOperationAction(ISD::CTLZ, VT, Custom);
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ setOperationAction(ISD::CTTZ, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::UINT_TO_FP, VT, Custom);
+ setOperationAction(ISD::SINT_TO_FP, VT, Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT, Custom);
+ setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::MGATHER, VT, Custom);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
+ setOperationAction(ISD::MUL, VT, Custom);
+ setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::SDIV, VT, Custom);
+ setOperationAction(ISD::UDIV, VT, Custom);
+ setOperationAction(ISD::SMIN, VT, Custom);
+ setOperationAction(ISD::UMIN, VT, Custom);
+ setOperationAction(ISD::SMAX, VT, Custom);
+ setOperationAction(ISD::UMAX, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::ABS, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
}
- for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32})
+ // Illegal unpacked integer vector types.
+ for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ }
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
-
- for (MVT VT : MVT::fp_scalable_vector_valuetypes()) {
- if (isTypeLegal(VT)) {
- setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
- setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
- setOperationAction(ISD::SELECT, VT, Custom);
- setOperationAction(ISD::FMA, VT, Custom);
+ for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+
+ // There are no legal MVT::nxv16f## based types.
+ if (VT != MVT::nxv16i1) {
+ setOperationAction(ISD::SINT_TO_FP, VT, Custom);
+ setOperationAction(ISD::UINT_TO_FP, VT, Custom);
}
}
+ for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
+ MVT::nxv4f32, MVT::nxv2f64}) {
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::MGATHER, VT, Custom);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
+ setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::FADD, VT, Custom);
+ setOperationAction(ISD::FDIV, VT, Custom);
+ setOperationAction(ISD::FMA, VT, Custom);
+ setOperationAction(ISD::FMAXNUM, VT, Custom);
+ setOperationAction(ISD::FMINNUM, VT, Custom);
+ setOperationAction(ISD::FMUL, VT, Custom);
+ setOperationAction(ISD::FNEG, VT, Custom);
+ setOperationAction(ISD::FSUB, VT, Custom);
+ setOperationAction(ISD::FCEIL, VT, Custom);
+ setOperationAction(ISD::FFLOOR, VT, Custom);
+ setOperationAction(ISD::FNEARBYINT, VT, Custom);
+ setOperationAction(ISD::FRINT, VT, Custom);
+ setOperationAction(ISD::FROUND, VT, Custom);
+ setOperationAction(ISD::FROUNDEVEN, VT, Custom);
+ setOperationAction(ISD::FTRUNC, VT, Custom);
+ setOperationAction(ISD::FSQRT, VT, Custom);
+ setOperationAction(ISD::FABS, VT, Custom);
+ setOperationAction(ISD::FP_EXTEND, VT, Custom);
+ setOperationAction(ISD::FP_ROUND, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
+ }
+
+ for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::MGATHER, VT, Custom);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
+ }
+
+ setOperationAction(ISD::SPLAT_VECTOR, MVT::nxv8bf16, Custom);
+
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
+
// NOTE: Currently this has to happen after computeRegisterProperties rather
// than the preferred option of combining it with the addRegisterClass call.
- if (useSVEForFixedLengthVectors()) {
+ if (Subtarget->useSVEForFixedLengthVectors()) {
for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
if (useSVEForFixedLengthVectorVT(VT))
addTypeForFixedLengthSVE(VT);
@@ -975,6 +1215,61 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::TRUNCATE, VT, Custom);
for (auto VT : {MVT::v8f16, MVT::v4f32})
setOperationAction(ISD::FP_ROUND, VT, Expand);
+
+ // These operations are not supported on NEON but SVE can do them.
+ setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v1i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+ setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
+ setOperationAction(ISD::SDIV, MVT::v16i8, Custom);
+ setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
+ setOperationAction(ISD::SDIV, MVT::v8i16, Custom);
+ setOperationAction(ISD::SDIV, MVT::v2i32, Custom);
+ setOperationAction(ISD::SDIV, MVT::v4i32, Custom);
+ setOperationAction(ISD::SDIV, MVT::v1i64, Custom);
+ setOperationAction(ISD::SDIV, MVT::v2i64, Custom);
+ setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
+ setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
+ setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
+ setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
+ setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
+ setOperationAction(ISD::UDIV, MVT::v16i8, Custom);
+ setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
+ setOperationAction(ISD::UDIV, MVT::v8i16, Custom);
+ setOperationAction(ISD::UDIV, MVT::v2i32, Custom);
+ setOperationAction(ISD::UDIV, MVT::v4i32, Custom);
+ setOperationAction(ISD::UDIV, MVT::v1i64, Custom);
+ setOperationAction(ISD::UDIV, MVT::v2i64, Custom);
+ setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
+ setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
+ setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
+ setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
+ setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom);
+ setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom);
+ setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom);
+ setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
+
+ // Int operations with no NEON support.
+ for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
+ MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
+ setOperationAction(ISD::BITREVERSE, VT, Custom);
+ setOperationAction(ISD::CTTZ, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+ }
+
+ // FP operations with no NEON support.
+ for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32,
+ MVT::v1f64, MVT::v2f64})
+ setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
+
+ // Use SVE for vectors with more than 2 elements.
+ for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
+ setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
}
}
@@ -1046,6 +1341,7 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
// F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
if (VT.isFloatingPoint() &&
+ VT.getVectorElementType() != MVT::bf16 &&
(VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
for (unsigned Opcode :
{ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM})
@@ -1071,11 +1367,64 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
// Lower fixed length vector operations to scalable equivalents.
+ setOperationAction(ISD::ABS, VT, Custom);
setOperationAction(ISD::ADD, VT, Custom);
+ setOperationAction(ISD::AND, VT, Custom);
+ setOperationAction(ISD::ANY_EXTEND, VT, Custom);
+ setOperationAction(ISD::BITREVERSE, VT, Custom);
+ setOperationAction(ISD::BSWAP, VT, Custom);
+ setOperationAction(ISD::CTLZ, VT, Custom);
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ setOperationAction(ISD::CTTZ, VT, Custom);
setOperationAction(ISD::FADD, VT, Custom);
+ setOperationAction(ISD::FCEIL, VT, Custom);
+ setOperationAction(ISD::FDIV, VT, Custom);
+ setOperationAction(ISD::FFLOOR, VT, Custom);
+ setOperationAction(ISD::FMA, VT, Custom);
+ setOperationAction(ISD::FMAXNUM, VT, Custom);
+ setOperationAction(ISD::FMINNUM, VT, Custom);
+ setOperationAction(ISD::FMUL, VT, Custom);
+ setOperationAction(ISD::FNEARBYINT, VT, Custom);
+ setOperationAction(ISD::FNEG, VT, Custom);
+ setOperationAction(ISD::FRINT, VT, Custom);
+ setOperationAction(ISD::FROUND, VT, Custom);
+ setOperationAction(ISD::FSQRT, VT, Custom);
+ setOperationAction(ISD::FSUB, VT, Custom);
+ setOperationAction(ISD::FTRUNC, VT, Custom);
setOperationAction(ISD::LOAD, VT, Custom);
+ setOperationAction(ISD::MUL, VT, Custom);
+ setOperationAction(ISD::OR, VT, Custom);
+ setOperationAction(ISD::SDIV, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
+ setOperationAction(ISD::SMAX, VT, Custom);
+ setOperationAction(ISD::SMIN, VT, Custom);
+ setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::STORE, VT, Custom);
+ setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
+ setOperationAction(ISD::UDIV, VT, Custom);
+ setOperationAction(ISD::UMAX, VT, Custom);
+ setOperationAction(ISD::UMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
+ setOperationAction(ISD::XOR, VT, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
}
void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
@@ -1247,8 +1596,7 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
KnownBits Known2;
Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
- Known.Zero &= Known2.Zero;
- Known.One &= Known2.One;
+ Known = KnownBits::commonBits(Known, Known2);
break;
}
case AArch64ISD::LOADgot:
@@ -1388,15 +1736,38 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::THREAD_POINTER)
MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
MAKE_CASE(AArch64ISD::ADD_PRED)
+ MAKE_CASE(AArch64ISD::MUL_PRED)
MAKE_CASE(AArch64ISD::SDIV_PRED)
+ MAKE_CASE(AArch64ISD::SHL_PRED)
+ MAKE_CASE(AArch64ISD::SMAX_PRED)
+ MAKE_CASE(AArch64ISD::SMIN_PRED)
+ MAKE_CASE(AArch64ISD::SRA_PRED)
+ MAKE_CASE(AArch64ISD::SRL_PRED)
+ MAKE_CASE(AArch64ISD::SUB_PRED)
MAKE_CASE(AArch64ISD::UDIV_PRED)
- MAKE_CASE(AArch64ISD::SMIN_MERGE_OP1)
- MAKE_CASE(AArch64ISD::UMIN_MERGE_OP1)
- MAKE_CASE(AArch64ISD::SMAX_MERGE_OP1)
- MAKE_CASE(AArch64ISD::UMAX_MERGE_OP1)
- MAKE_CASE(AArch64ISD::SHL_MERGE_OP1)
- MAKE_CASE(AArch64ISD::SRL_MERGE_OP1)
- MAKE_CASE(AArch64ISD::SRA_MERGE_OP1)
+ MAKE_CASE(AArch64ISD::UMAX_PRED)
+ MAKE_CASE(AArch64ISD::UMIN_PRED)
+ MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU)
MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO)
MAKE_CASE(AArch64ISD::ADC)
MAKE_CASE(AArch64ISD::SBC)
@@ -1465,10 +1836,14 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::UADDV)
MAKE_CASE(AArch64ISD::SRHADD)
MAKE_CASE(AArch64ISD::URHADD)
+ MAKE_CASE(AArch64ISD::SHADD)
+ MAKE_CASE(AArch64ISD::UHADD)
MAKE_CASE(AArch64ISD::SMINV)
MAKE_CASE(AArch64ISD::UMINV)
MAKE_CASE(AArch64ISD::SMAXV)
MAKE_CASE(AArch64ISD::UMAXV)
+ MAKE_CASE(AArch64ISD::SADDV_PRED)
+ MAKE_CASE(AArch64ISD::UADDV_PRED)
MAKE_CASE(AArch64ISD::SMAXV_PRED)
MAKE_CASE(AArch64ISD::UMAXV_PRED)
MAKE_CASE(AArch64ISD::SMINV_PRED)
@@ -1486,12 +1861,16 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::FADD_PRED)
MAKE_CASE(AArch64ISD::FADDA_PRED)
MAKE_CASE(AArch64ISD::FADDV_PRED)
+ MAKE_CASE(AArch64ISD::FDIV_PRED)
MAKE_CASE(AArch64ISD::FMA_PRED)
MAKE_CASE(AArch64ISD::FMAXV_PRED)
+ MAKE_CASE(AArch64ISD::FMAXNM_PRED)
MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
MAKE_CASE(AArch64ISD::FMINV_PRED)
+ MAKE_CASE(AArch64ISD::FMINNM_PRED)
MAKE_CASE(AArch64ISD::FMINNMV_PRED)
- MAKE_CASE(AArch64ISD::NOT)
+ MAKE_CASE(AArch64ISD::FMUL_PRED)
+ MAKE_CASE(AArch64ISD::FSUB_PRED)
MAKE_CASE(AArch64ISD::BIT)
MAKE_CASE(AArch64ISD::CBZ)
MAKE_CASE(AArch64ISD::CBNZ)
@@ -1603,8 +1982,15 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::LDP)
MAKE_CASE(AArch64ISD::STP)
MAKE_CASE(AArch64ISD::STNP)
+ MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
MAKE_CASE(AArch64ISD::INDEX_VECTOR)
+ MAKE_CASE(AArch64ISD::UABD)
+ MAKE_CASE(AArch64ISD::SABD)
+ MAKE_CASE(AArch64ISD::CALL_RVMARKER)
}
#undef MAKE_CASE
return nullptr;
@@ -1692,6 +2078,7 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
case TargetOpcode::STACKMAP:
case TargetOpcode::PATCHPOINT:
+ case TargetOpcode::STATEPOINT:
return emitPatchPoint(MI, BB);
case AArch64::CATCHRET:
@@ -2517,21 +2904,10 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
return std::make_pair(Value, Overflow);
}
-SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
- RTLIB::Libcall Call) const {
- bool IsStrict = Op->isStrictFPOpcode();
- unsigned Offset = IsStrict ? 1 : 0;
- SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
- SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end());
- MakeLibCallOptions CallOptions;
- SDValue Result;
- SDLoc dl(Op);
- std::tie(Result, Chain) = makeLibCall(DAG, Call, Op.getValueType(), Ops,
- CallOptions, dl, Chain);
- return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
-}
+SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
+ if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+ return LowerToScalableOp(Op, DAG);
-static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
SDValue Sel = Op.getOperand(0);
SDValue Other = Op.getOperand(1);
SDLoc dl(Sel);
@@ -2706,16 +3082,18 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
SelectionDAG &DAG) const {
- assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
-
- RTLIB::Libcall LC;
- LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
+ if (Op.getValueType().isScalableVector())
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
- return LowerF128Call(Op, DAG, LC);
+ assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
+ return SDValue();
}
SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
SelectionDAG &DAG) const {
+ if (Op.getValueType().isScalableVector())
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
+
bool IsStrict = Op->isStrictFPOpcode();
SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
EVT SrcVT = SrcVal.getValueType();
@@ -2729,19 +3107,7 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
return Op;
}
- RTLIB::Libcall LC;
- LC = RTLIB::getFPROUND(SrcVT, Op.getValueType());
-
- // FP_ROUND node has a second operand indicating whether it is known to be
- // precise. That doesn't take part in the LibCall so we can't directly use
- // LowerF128Call.
- MakeLibCallOptions CallOptions;
- SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
- SDValue Result;
- SDLoc dl(Op);
- std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
- CallOptions, dl, Chain);
- return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
+ return SDValue();
}
SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
@@ -2751,6 +3117,14 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
// in the cost tables.
EVT InVT = Op.getOperand(0).getValueType();
EVT VT = Op.getValueType();
+
+ if (VT.isScalableVector()) {
+ unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
+ ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
+ : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
+ return LowerToPredicatedOp(Op, DAG, Opcode);
+ }
+
unsigned NumElts = InVT.getVectorNumElements();
// f16 conversions are promoted to f32 when full fp16 is not supported.
@@ -2763,7 +3137,9 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
}
- if (VT.getSizeInBits() < InVT.getSizeInBits()) {
+ uint64_t VTSize = VT.getFixedSizeInBits();
+ uint64_t InVTSize = InVT.getFixedSizeInBits();
+ if (VTSize < InVTSize) {
SDLoc dl(Op);
SDValue Cv =
DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
@@ -2771,7 +3147,7 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
}
- if (VT.getSizeInBits() > InVT.getSizeInBits()) {
+ if (VTSize > InVTSize) {
SDLoc dl(Op);
MVT ExtVT =
MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
@@ -2806,17 +3182,11 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
return Op;
}
- RTLIB::Libcall LC;
- if (Op.getOpcode() == ISD::FP_TO_SINT ||
- Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
- LC = RTLIB::getFPTOSINT(SrcVal.getValueType(), Op.getValueType());
- else
- LC = RTLIB::getFPTOUINT(SrcVal.getValueType(), Op.getValueType());
-
- return LowerF128Call(Op, DAG, LC);
+ return SDValue();
}
-static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
+ SelectionDAG &DAG) const {
// Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
// Any additional optimization in this function should be recorded
// in the cost tables.
@@ -2824,21 +3194,38 @@ static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
SDValue In = Op.getOperand(0);
EVT InVT = In.getValueType();
+ unsigned Opc = Op.getOpcode();
+ bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
+
+ if (VT.isScalableVector()) {
+ if (InVT.getVectorElementType() == MVT::i1) {
+ // We can't directly extend an SVE predicate; extend it first.
+ unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ EVT CastVT = getPromotedVTForPredicate(InVT);
+ In = DAG.getNode(CastOpc, dl, CastVT, In);
+ return DAG.getNode(Opc, dl, VT, In);
+ }
+
+ unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
+ : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
+ return LowerToPredicatedOp(Op, DAG, Opcode);
+ }
- if (VT.getSizeInBits() < InVT.getSizeInBits()) {
+ uint64_t VTSize = VT.getFixedSizeInBits();
+ uint64_t InVTSize = InVT.getFixedSizeInBits();
+ if (VTSize < InVTSize) {
MVT CastVT =
MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
InVT.getVectorNumElements());
- In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
+ In = DAG.getNode(Opc, dl, CastVT, In);
return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
}
- if (VT.getSizeInBits() > InVT.getSizeInBits()) {
- unsigned CastOpc =
- Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ if (VTSize > InVTSize) {
+ unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
EVT CastVT = VT.changeVectorElementTypeToInteger();
In = DAG.getNode(CastOpc, dl, CastVT, In);
- return DAG.getNode(Op.getOpcode(), dl, VT, In);
+ return DAG.getNode(Opc, dl, VT, In);
}
return Op;
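As a concrete example of the new scalable-vector path, a sitofp from nxv4i1 to nxv4f32 would, under this lowering, be handled roughly as:
  //   1. SIGN_EXTEND the nxv4i1 predicate to nxv4i32 (the promoted predicate
  //      type from getPromotedVTForPredicate).
  //   2. Re-emit SINT_TO_FP on the nxv4i32 value, which is then lowered via
  //      AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU.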
@@ -2871,15 +3258,7 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
// fp128.
if (Op.getValueType() != MVT::f128)
return Op;
-
- RTLIB::Libcall LC;
- if (Op.getOpcode() == ISD::SINT_TO_FP ||
- Op.getOpcode() == ISD::STRICT_SINT_TO_FP)
- LC = RTLIB::getSINTTOFP(SrcVal.getValueType(), Op.getValueType());
- else
- LC = RTLIB::getUINTTOFP(SrcVal.getValueType(), Op.getValueType());
-
- return LowerF128Call(Op, DAG, LC);
+ return SDValue();
}
SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
@@ -2993,7 +3372,8 @@ static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
}
static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
- if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
+ if (N->getOpcode() == ISD::SIGN_EXTEND ||
+ N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
N->getOperand(0)->getValueType(0),
N->getValueType(0),
@@ -3018,11 +3398,13 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
return N->getOpcode() == ISD::SIGN_EXTEND ||
+ N->getOpcode() == ISD::ANY_EXTEND ||
isExtendedBUILD_VECTOR(N, DAG, true);
}
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
return N->getOpcode() == ISD::ZERO_EXTEND ||
+ N->getOpcode() == ISD::ANY_EXTEND ||
isExtendedBUILD_VECTOR(N, DAG, false);
}
@@ -3071,10 +3453,17 @@ SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
return DAG.getMergeValues({AND, Chain}, dl);
}
-static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
+SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ // If SVE is available then i64 vector multiplications can also be made legal.
+ bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64;
+
+ if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED, OverrideNEON);
+
// Multiplications are only custom-lowered for 128-bit vectors so that
// VMULL can be detected. Otherwise v2i64 multiplications are not legal.
- EVT VT = Op.getValueType();
assert(VT.is128BitVector() && VT.isInteger() &&
"unexpected type for custom-lowering ISD::MUL");
SDNode *N0 = Op.getOperand(0).getNode();
@@ -3233,11 +3622,77 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::aarch64_sve_ptrue:
return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(),
Op.getOperand(1));
+ case Intrinsic::aarch64_sve_clz:
+ return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+ case Intrinsic::aarch64_sve_cnt: {
+ SDValue Data = Op.getOperand(3);
+ // CTPOP only supports integer operands.
+ if (Data.getValueType().isFloatingPoint())
+ Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
+ return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Data, Op.getOperand(1));
+ }
case Intrinsic::aarch64_sve_dupq_lane:
return LowerDUPQLane(Op, DAG);
case Intrinsic::aarch64_sve_convert_from_svbool:
return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(),
Op.getOperand(1));
+ case Intrinsic::aarch64_sve_fneg:
+ return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+ case Intrinsic::aarch64_sve_frintp:
+ return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+ case Intrinsic::aarch64_sve_frintm:
+ return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+ case Intrinsic::aarch64_sve_frinti:
+ return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+ case Intrinsic::aarch64_sve_frintx:
+ return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+ case Intrinsic::aarch64_sve_frinta:
+ return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+ case Intrinsic::aarch64_sve_frintn:
+ return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+ case Intrinsic::aarch64_sve_frintz:
+ return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+ case Intrinsic::aarch64_sve_ucvtf:
+ return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
+ Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
+ Op.getOperand(1));
+ case Intrinsic::aarch64_sve_scvtf:
+ return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
+ Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
+ Op.getOperand(1));
+ case Intrinsic::aarch64_sve_fcvtzu:
+ return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
+ Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
+ Op.getOperand(1));
+ case Intrinsic::aarch64_sve_fcvtzs:
+ return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
+ Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
+ Op.getOperand(1));
+ case Intrinsic::aarch64_sve_fsqrt:
+ return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+ case Intrinsic::aarch64_sve_frecpx:
+ return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+ case Intrinsic::aarch64_sve_fabs:
+ return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+ case Intrinsic::aarch64_sve_abs:
+ return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+ case Intrinsic::aarch64_sve_neg:
+ return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
case Intrinsic::aarch64_sve_convert_to_svbool: {
EVT OutVT = Op.getValueType();
EVT InVT = Op.getOperand(1).getValueType();
@@ -3263,6 +3718,49 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
Op.getOperand(1), Scalar);
}
+ case Intrinsic::aarch64_sve_rbit:
+ return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
+ Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
+ Op.getOperand(1));
+ case Intrinsic::aarch64_sve_revb:
+ return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+ case Intrinsic::aarch64_sve_sxtb:
+ return DAG.getNode(
+ AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3),
+ DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
+ Op.getOperand(1));
+ case Intrinsic::aarch64_sve_sxth:
+ return DAG.getNode(
+ AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3),
+ DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
+ Op.getOperand(1));
+ case Intrinsic::aarch64_sve_sxtw:
+ return DAG.getNode(
+ AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3),
+ DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
+ Op.getOperand(1));
+ case Intrinsic::aarch64_sve_uxtb:
+ return DAG.getNode(
+ AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3),
+ DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
+ Op.getOperand(1));
+ case Intrinsic::aarch64_sve_uxth:
+ return DAG.getNode(
+ AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3),
+ DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
+ Op.getOperand(1));
+ case Intrinsic::aarch64_sve_uxtw:
+ return DAG.getNode(
+ AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3),
+ DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
+ Op.getOperand(1));
case Intrinsic::localaddress: {
const auto &MF = DAG.getMachineFunction();
@@ -3302,19 +3800,291 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::aarch64_neon_srhadd:
- case Intrinsic::aarch64_neon_urhadd: {
- bool IsSignedAdd = IntNo == Intrinsic::aarch64_neon_srhadd;
- unsigned Opcode = IsSignedAdd ? AArch64ISD::SRHADD : AArch64ISD::URHADD;
+ case Intrinsic::aarch64_neon_urhadd:
+ case Intrinsic::aarch64_neon_shadd:
+ case Intrinsic::aarch64_neon_uhadd: {
+ bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
+ IntNo == Intrinsic::aarch64_neon_shadd);
+ bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
+ IntNo == Intrinsic::aarch64_neon_urhadd);
+ unsigned Opcode =
+ IsSignedAdd ? (IsRoundingAdd ? AArch64ISD::SRHADD : AArch64ISD::SHADD)
+ : (IsRoundingAdd ? AArch64ISD::URHADD : AArch64ISD::UHADD);
return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2));
}
+
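
For reference, a minimal scalar sketch of the per-lane arithmetic the four halving-add nodes selected above model (helper names are illustrative and not part of the patch itself):

    #include <cstdint>
    // Signed halving add: widen the sum, then arithmetic shift right by one.
    static inline int32_t shadd_s32(int32_t a, int32_t b) {
      return (int32_t)(((int64_t)a + (int64_t)b) >> 1);
    }
    // Signed rounding halving add: add one before the shift so halves round up.
    static inline int32_t srhadd_s32(int32_t a, int32_t b) {
      return (int32_t)(((int64_t)a + (int64_t)b + 1) >> 1);
    }
    // UHADD/URHADD perform the same computations on unsigned lanes.
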
+ case Intrinsic::aarch64_neon_uabd: {
+ return DAG.getNode(AArch64ISD::UABD, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ }
+ case Intrinsic::aarch64_neon_sabd: {
+ return DAG.getNode(AArch64ISD::SABD, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ }
}
}
+bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const {
+ if (VT.getVectorElementType() == MVT::i32 &&
+ VT.getVectorElementCount().getKnownMinValue() >= 4)
+ return true;
+
+ return false;
+}
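
As a worked example of the check above: an index of type nxv4i32 (i32 elements, known minimum of four lanes) returns true, so its explicit extend is removed, while an nxv2i32 index keeps its extension to 64 bits.
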
+
bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
return ExtVal.getValueType().isScalableVector();
}
+unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
+ std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
+ {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
+ AArch64ISD::GLD1_MERGE_ZERO},
+ {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
+ AArch64ISD::GLD1_UXTW_MERGE_ZERO},
+ {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
+ AArch64ISD::GLD1_MERGE_ZERO},
+ {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
+ AArch64ISD::GLD1_SXTW_MERGE_ZERO},
+ {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
+ AArch64ISD::GLD1_SCALED_MERGE_ZERO},
+ {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
+ AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
+ {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
+ AArch64ISD::GLD1_SCALED_MERGE_ZERO},
+ {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
+ AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
+ };
+ auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
+ return AddrModes.find(Key)->second;
+}
+
+unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
+ std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
+ {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
+ AArch64ISD::SST1_PRED},
+ {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
+ AArch64ISD::SST1_UXTW_PRED},
+ {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
+ AArch64ISD::SST1_PRED},
+ {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
+ AArch64ISD::SST1_SXTW_PRED},
+ {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
+ AArch64ISD::SST1_SCALED_PRED},
+ {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
+ AArch64ISD::SST1_UXTW_SCALED_PRED},
+ {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
+ AArch64ISD::SST1_SCALED_PRED},
+ {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
+ AArch64ISD::SST1_SXTW_SCALED_PRED},
+ };
+ auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
+ return AddrModes.find(Key)->second;
+}
+
+unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ llvm_unreachable("unimplemented opcode");
+ return Opcode;
+ case AArch64ISD::GLD1_MERGE_ZERO:
+ return AArch64ISD::GLD1S_MERGE_ZERO;
+ case AArch64ISD::GLD1_IMM_MERGE_ZERO:
+ return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
+ case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
+ return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
+ case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
+ return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
+ case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
+ return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
+ case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
+ return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
+ case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
+ return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
+ }
+}
+
+bool getGatherScatterIndexIsExtended(SDValue Index) {
+ unsigned Opcode = Index.getOpcode();
+ if (Opcode == ISD::SIGN_EXTEND_INREG)
+ return true;
+
+ if (Opcode == ISD::AND) {
+ SDValue Splat = Index.getOperand(1);
+ if (Splat.getOpcode() != ISD::SPLAT_VECTOR)
+ return false;
+ ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Splat.getOperand(0));
+ if (!Mask || Mask->getZExtValue() != 0xFFFFFFFF)
+ return false;
+ return true;
+ }
+
+ return false;
+}
+
+// If the base pointer of a masked gather or scatter is null, we
+// may be able to swap BasePtr & Index and use the vector + register
+// or vector + immediate addressing mode, e.g.
+// VECTOR + REGISTER:
+//   getelementptr nullptr, <vscale x N x T> (splat(%offset) + %indices)
+// -> getelementptr %offset, <vscale x N x T> %indices
+// VECTOR + IMMEDIATE:
+//   getelementptr nullptr, <vscale x N x T> (splat(#x) + %indices)
+// -> getelementptr #x, <vscale x N x T> %indices
+void selectGatherScatterAddrMode(SDValue &BasePtr, SDValue &Index, EVT MemVT,
+ unsigned &Opcode, bool IsGather,
+ SelectionDAG &DAG) {
+ if (!isNullConstant(BasePtr))
+ return;
+
+ ConstantSDNode *Offset = nullptr;
+ if (Index.getOpcode() == ISD::ADD)
+ if (auto SplatVal = DAG.getSplatValue(Index.getOperand(1))) {
+ if (isa<ConstantSDNode>(SplatVal))
+ Offset = cast<ConstantSDNode>(SplatVal);
+ else {
+ BasePtr = SplatVal;
+ Index = Index->getOperand(0);
+ return;
+ }
+ }
+
+ unsigned NewOp =
+ IsGather ? AArch64ISD::GLD1_IMM_MERGE_ZERO : AArch64ISD::SST1_IMM_PRED;
+
+ if (!Offset) {
+ std::swap(BasePtr, Index);
+ Opcode = NewOp;
+ return;
+ }
+
+ uint64_t OffsetVal = Offset->getZExtValue();
+ unsigned ScalarSizeInBytes = MemVT.getScalarSizeInBits() / 8;
+ auto ConstOffset = DAG.getConstant(OffsetVal, SDLoc(Index), MVT::i64);
+
+ if (OffsetVal % ScalarSizeInBytes || OffsetVal / ScalarSizeInBytes > 31) {
+ // Index is out of range for the immediate addressing mode
+ BasePtr = ConstOffset;
+ Index = Index->getOperand(0);
+ return;
+ }
+
+ // Immediate is in range
+ Opcode = NewOp;
+ BasePtr = Index->getOperand(0);
+ Index = ConstOffset;
+}
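
A minimal sketch of the decision selectGatherScatterAddrMode makes once the base pointer is null and the index has the splat(offset) + indices shape; the helper and types below are assumptions for illustration, not code from the patch:

    #include <cstdint>
    struct GatherAddr {
      uint64_t Base;   // scalar base register value
      bool UseImmForm; // true -> vector + immediate addressing form
      uint64_t Imm;    // immediate offset when UseImmForm is true
    };

    // SplatOff is the splatted constant offset; EltBytes is the memory
    // element size in bytes.
    static GatherAddr pickAddrMode(uint64_t SplatOff, unsigned EltBytes) {
      // The immediate form needs an element-size multiple at most 31 elements
      // away, mirroring the OffsetVal checks above; otherwise the offset
      // becomes the new scalar base and the register form is kept.
      if (SplatOff % EltBytes == 0 && SplatOff / EltBytes <= 31)
        return {0, true, SplatOff};
      return {SplatOff, false, 0};
    }
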
+
+SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
+ assert(MGT && "Can only custom lower gather load nodes");
+
+ SDValue Index = MGT->getIndex();
+ SDValue Chain = MGT->getChain();
+ SDValue PassThru = MGT->getPassThru();
+ SDValue Mask = MGT->getMask();
+ SDValue BasePtr = MGT->getBasePtr();
+ ISD::LoadExtType ExtTy = MGT->getExtensionType();
+
+ ISD::MemIndexType IndexType = MGT->getIndexType();
+ bool IsScaled =
+ IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED;
+ bool IsSigned =
+ IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
+ bool IdxNeedsExtend =
+ getGatherScatterIndexIsExtended(Index) ||
+ Index.getSimpleValueType().getVectorElementType() == MVT::i32;
+ bool ResNeedsSignExtend = ExtTy == ISD::EXTLOAD || ExtTy == ISD::SEXTLOAD;
+
+ EVT VT = PassThru.getSimpleValueType();
+ EVT MemVT = MGT->getMemoryVT();
+ SDValue InputVT = DAG.getValueType(MemVT);
+
+ if (VT.getVectorElementType() == MVT::bf16 &&
+ !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
+ return SDValue();
+
+ // Handle FP data by using an integer gather and casting the result.
+ if (VT.isFloatingPoint()) {
+ EVT PassThruVT = getPackedSVEVectorVT(VT.getVectorElementCount());
+ PassThru = getSVESafeBitCast(PassThruVT, PassThru, DAG);
+ InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
+ }
+
+ SDVTList VTs = DAG.getVTList(PassThru.getSimpleValueType(), MVT::Other);
+
+ if (getGatherScatterIndexIsExtended(Index))
+ Index = Index.getOperand(0);
+
+ unsigned Opcode = getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend);
+ selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode,
+ /*isGather=*/true, DAG);
+
+ if (ResNeedsSignExtend)
+ Opcode = getSignExtendedGatherOpcode(Opcode);
+
+ SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru};
+ SDValue Gather = DAG.getNode(Opcode, DL, VTs, Ops);
+
+ if (VT.isFloatingPoint()) {
+ SDValue Cast = getSVESafeBitCast(VT, Gather, DAG);
+ return DAG.getMergeValues({Cast, Gather}, DL);
+ }
+
+ return Gather;
+}
+
+SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
+ assert(MSC && "Can only custom lower scatter store nodes");
+
+ SDValue Index = MSC->getIndex();
+ SDValue Chain = MSC->getChain();
+ SDValue StoreVal = MSC->getValue();
+ SDValue Mask = MSC->getMask();
+ SDValue BasePtr = MSC->getBasePtr();
+
+ ISD::MemIndexType IndexType = MSC->getIndexType();
+ bool IsScaled =
+ IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED;
+ bool IsSigned =
+ IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
+ bool NeedsExtend =
+ getGatherScatterIndexIsExtended(Index) ||
+ Index.getSimpleValueType().getVectorElementType() == MVT::i32;
+
+ EVT VT = StoreVal.getSimpleValueType();
+ SDVTList VTs = DAG.getVTList(MVT::Other);
+ EVT MemVT = MSC->getMemoryVT();
+ SDValue InputVT = DAG.getValueType(MemVT);
+
+ if (VT.getVectorElementType() == MVT::bf16 &&
+ !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
+ return SDValue();
+
+ // Handle FP data by casting the data so an integer scatter can be used.
+ if (VT.isFloatingPoint()) {
+ EVT StoreValVT = getPackedSVEVectorVT(VT.getVectorElementCount());
+ StoreVal = getSVESafeBitCast(StoreValVT, StoreVal, DAG);
+ InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
+ }
+
+ if (getGatherScatterIndexIsExtended(Index))
+ Index = Index.getOperand(0);
+
+ unsigned Opcode = getScatterVecOpcode(IsScaled, IsSigned, NeedsExtend);
+ selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode,
+ /*isGather=*/false, DAG);
+
+ SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT};
+ return DAG.getNode(Opcode, DL, VTs, Ops);
+}
+
// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
EVT VT, EVT MemVT,
@@ -3380,8 +4150,9 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
// 256 bit non-temporal stores can be lowered to STNP. Do this as part of
// the custom lowering, as there are no un-paired non-temporal stores and
// legalization will break up 256 bit inputs.
+ ElementCount EC = MemVT.getVectorElementCount();
if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
- MemVT.getVectorElementCount().Min % 2u == 0 &&
+ EC.isKnownEven() &&
((MemVT.getScalarSizeInBits() == 8u ||
MemVT.getScalarSizeInBits() == 16u ||
MemVT.getScalarSizeInBits() == 32u ||
@@ -3390,11 +4161,11 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
- SDValue Hi = DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, Dl,
- MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
- StoreNode->getValue(),
- DAG.getConstant(MemVT.getVectorElementCount().Min / 2, Dl, MVT::i64));
+ SDValue Hi =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
+ MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
+ StoreNode->getValue(),
+ DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
SDValue Result = DAG.getMemIntrinsicNode(
AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
{StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
@@ -3419,6 +4190,25 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
return SDValue();
}
+// Generate SUBS and CSEL for integer abs.
+SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
+ MVT VT = Op.getSimpleValueType();
+
+ if (VT.isVector())
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
+
+ SDLoc DL(Op);
+ SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
+ Op.getOperand(0));
+ // Generate SUBS & CSEL.
+ SDValue Cmp =
+ DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
+ Op.getOperand(0), DAG.getConstant(0, DL, VT));
+ return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
+ DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
+ Cmp.getValue(1));
+}
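
The scalar pattern the SUBS/CSEL sequence above implements, written out as a minimal sketch (assumed helper, not from the patch):

    #include <cstdint>
    static inline int64_t iabs64(int64_t X) {
      // Negate via unsigned arithmetic so INT64_MIN wraps as it does in hardware.
      int64_t Neg = (int64_t)(0ULL - (uint64_t)X);
      return (X >= 0) ? X : Neg; // CSEL picks X on PL (non-negative), else Neg
    }
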
+
SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
LLVM_DEBUG(dbgs() << "Custom lowering: ");
@@ -3471,17 +4261,35 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::UMULO:
return LowerXALUO(Op, DAG);
case ISD::FADD:
- if (useSVEForFixedLengthVectorVT(Op.getValueType()))
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
- return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
case ISD::FSUB:
- return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
case ISD::FMUL:
- return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
case ISD::FMA:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
case ISD::FDIV:
- return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
+ case ISD::FNEG:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
+ case ISD::FCEIL:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
+ case ISD::FFLOOR:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
+ case ISD::FNEARBYINT:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
+ case ISD::FRINT:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
+ case ISD::FROUND:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
+ case ISD::FROUNDEVEN:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
+ case ISD::FTRUNC:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
+ case ISD::FSQRT:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
+ case ISD::FABS:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
case ISD::FP_ROUND:
case ISD::STRICT_FP_ROUND:
return LowerFP_ROUND(Op, DAG);
@@ -3495,6 +4303,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerRETURNADDR(Op, DAG);
case ISD::ADDROFRETURNADDR:
return LowerADDROFRETURNADDR(Op, DAG);
+ case ISD::CONCAT_VECTORS:
+ return LowerCONCAT_VECTORS(Op, DAG);
case ISD::INSERT_VECTOR_ELT:
return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT:
@@ -3510,17 +4320,20 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::INSERT_SUBVECTOR:
return LowerINSERT_SUBVECTOR(Op, DAG);
case ISD::SDIV:
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::SDIV_PRED);
case ISD::UDIV:
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::UDIV_PRED);
+ return LowerDIV(Op, DAG);
case ISD::SMIN:
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_MERGE_OP1);
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED,
+ /*OverrideNEON=*/true);
case ISD::UMIN:
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_MERGE_OP1);
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED,
+ /*OverrideNEON=*/true);
case ISD::SMAX:
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_MERGE_OP1);
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED,
+ /*OverrideNEON=*/true);
case ISD::UMAX:
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_MERGE_OP1);
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED,
+ /*OverrideNEON=*/true);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL:
@@ -3560,11 +4373,21 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::STORE:
return LowerSTORE(Op, DAG);
+ case ISD::MGATHER:
+ return LowerMGATHER(Op, DAG);
+ case ISD::MSCATTER:
+ return LowerMSCATTER(Op, DAG);
+ case ISD::VECREDUCE_SEQ_FADD:
+ return LowerVECREDUCE_SEQ_FADD(Op, DAG);
case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
case ISD::VECREDUCE_SMAX:
case ISD::VECREDUCE_SMIN:
case ISD::VECREDUCE_UMAX:
case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_FADD:
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMIN:
return LowerVECREDUCE(Op, DAG);
@@ -3576,6 +4399,21 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::VSCALE:
return LowerVSCALE(Op, DAG);
+ case ISD::ANY_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
+ case ISD::SIGN_EXTEND_INREG: {
+ // Only custom lower when ExtraVT has a legal byte based element type.
+ EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ EVT ExtraEltVT = ExtraVT.getVectorElementType();
+ if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
+ (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
+ return SDValue();
+
+ return LowerToPredicatedOp(Op, DAG,
+ AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
+ }
case ISD::TRUNCATE:
return LowerTRUNCATE(Op, DAG);
case ISD::LOAD:
@@ -3583,31 +4421,49 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerFixedLengthVectorLoadToSVE(Op, DAG);
llvm_unreachable("Unexpected request to lower ISD::LOAD");
case ISD::ADD:
- if (useSVEForFixedLengthVectorVT(Op.getValueType()))
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED);
- llvm_unreachable("Unexpected request to lower ISD::ADD");
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED);
+ case ISD::AND:
+ return LowerToScalableOp(Op, DAG);
+ case ISD::SUB:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SUB_PRED);
+ case ISD::FMAXNUM:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
+ case ISD::FMINNUM:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
+ case ISD::VSELECT:
+ return LowerFixedLengthVectorSelectToSVE(Op, DAG);
+ case ISD::ABS:
+ return LowerABS(Op, DAG);
+ case ISD::BITREVERSE:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU,
+ /*OverrideNEON=*/true);
+ case ISD::BSWAP:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
+ case ISD::CTLZ:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU,
+ /*OverrideNEON=*/true);
+ case ISD::CTTZ:
+ return LowerCTTZ(Op, DAG);
}
}
-bool AArch64TargetLowering::useSVEForFixedLengthVectors() const {
- // Prefer NEON unless larger SVE registers are available.
- return Subtarget->hasSVE() && Subtarget->getMinSVEVectorSizeInBits() >= 256;
+bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
+ return !Subtarget->useSVEForFixedLengthVectors();
}
-bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(EVT VT) const {
- if (!useSVEForFixedLengthVectors())
+bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
+ EVT VT, bool OverrideNEON) const {
+ if (!Subtarget->useSVEForFixedLengthVectors())
return false;
if (!VT.isFixedLengthVector())
return false;
- // Fixed length predicates should be promoted to i8.
- // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
- if (VT.getVectorElementType() == MVT::i1)
- return false;
-
// Don't use SVE for vectors we cannot scalarize if required.
switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
+ // Fixed length predicates should be promoted to i8.
+ // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
+ case MVT::i1:
default:
return false;
case MVT::i8:
@@ -3620,12 +4476,16 @@ bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(EVT VT) const {
break;
}
+ // All SVE implementations support NEON sized vectors.
+ if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
+ return true;
+
// Ensure NEON MVTs only belong to a single register class.
- if (VT.getSizeInBits() <= 128)
+ if (VT.getFixedSizeInBits() <= 128)
return false;
// Don't use SVE for types that don't fit.
- if (VT.getSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
+ if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
return false;
// TODO: Perhaps an artificial restriction, but worth having whilst getting
@@ -3724,10 +4584,10 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
assert(!Res && "Call operand has unhandled type");
(void)Res;
}
- assert(ArgLocs.size() == Ins.size());
SmallVector<SDValue, 16> ArgValues;
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
- CCValAssign &VA = ArgLocs[i];
+ unsigned ExtraArgLocs = 0;
+ for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
if (Ins[i].Flags.isByVal()) {
// Byval is used for HFAs in the PCS, but the system should work in a
@@ -3855,16 +4715,44 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
if (VA.getLocInfo() == CCValAssign::Indirect) {
assert(VA.getValVT().isScalableVector() &&
"Only scalable vectors can be passed indirectly");
- // If value is passed via pointer - do a load.
- ArgValue =
- DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo());
- }
- if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
- ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
- ArgValue, DAG.getValueType(MVT::i32));
- InVals.push_back(ArgValue);
+ uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinSize();
+ unsigned NumParts = 1;
+ if (Ins[i].Flags.isInConsecutiveRegs()) {
+ assert(!Ins[i].Flags.isInConsecutiveRegsLast());
+ while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
+ ++NumParts;
+ }
+
+ MVT PartLoad = VA.getValVT();
+ SDValue Ptr = ArgValue;
+
+ // Ensure we generate all loads for each tuple part, whilst updating the
+ // pointer after each load correctly using vscale.
+ while (NumParts > 0) {
+ ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
+ InVals.push_back(ArgValue);
+ NumParts--;
+ if (NumParts > 0) {
+ SDValue BytesIncrement = DAG.getVScale(
+ DL, Ptr.getValueType(),
+ APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(true);
+ Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+ BytesIncrement, Flags);
+ ExtraArgLocs++;
+ i++;
+ }
+ }
+ } else {
+ if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
+ ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
+ ArgValue, DAG.getValueType(MVT::i32));
+ InVals.push_back(ArgValue);
+ }
}
+ assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
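
Each tuple part in the loop above is loaded from an address that advances by vscale * PartSize bytes per part; a minimal model of that address step (assumed helper, not from the patch):

    #include <cstdint>
    static uint64_t tuplePartAddr(uint64_t Base, uint64_t VScale,
                                  uint64_t PartSize, unsigned PartIdx) {
      // Each SVE register part occupies vscale * PartSize bytes in memory.
      return Base + (uint64_t)PartIdx * VScale * PartSize;
    }
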
// varargs
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
@@ -4039,9 +4927,7 @@ SDValue AArch64TargetLowering::LowerCallResult(
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
SDValue ThisVal) const {
- CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
- ? RetCC_AArch64_WebKit_JS
- : RetCC_AArch64_AAPCS;
+ CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
DenseMap<unsigned, SDValue> CopiedRegs;
@@ -4464,8 +5350,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
}
// Walk the register/memloc assignments, inserting copies/loads.
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
- CCValAssign &VA = ArgLocs[i];
+ unsigned ExtraArgLocs = 0;
+ for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
@@ -4507,18 +5394,49 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
case CCValAssign::Indirect:
assert(VA.getValVT().isScalableVector() &&
"Only scalable vectors can be passed indirectly");
+
+ uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinSize();
+ uint64_t PartSize = StoreSize;
+ unsigned NumParts = 1;
+ if (Outs[i].Flags.isInConsecutiveRegs()) {
+ assert(!Outs[i].Flags.isInConsecutiveRegsLast());
+ while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
+ ++NumParts;
+ StoreSize *= NumParts;
+ }
+
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
- int FI = MFI.CreateStackObject(
- VA.getValVT().getStoreSize().getKnownMinSize(), Alignment, false);
- MFI.setStackID(FI, TargetStackID::SVEVector);
+ int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
+ MFI.setStackID(FI, TargetStackID::ScalableVector);
- SDValue SpillSlot = DAG.getFrameIndex(
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+ SDValue Ptr = DAG.getFrameIndex(
FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
- Chain = DAG.getStore(
- Chain, DL, Arg, SpillSlot,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ SDValue SpillSlot = Ptr;
+
+ // Ensure we generate all stores for each tuple part, whilst updating the
+ // pointer after each store correctly using vscale.
+ while (NumParts) {
+ Chain = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
+ NumParts--;
+ if (NumParts > 0) {
+ SDValue BytesIncrement = DAG.getVScale(
+ DL, Ptr.getValueType(),
+ APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(true);
+
+ MPI = MachinePointerInfo(MPI.getAddrSpace());
+ Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+ BytesIncrement, Flags);
+ ExtraArgLocs++;
+ i++;
+ }
+ }
+
Arg = SpillSlot;
break;
}
@@ -4538,20 +5456,18 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// take care of putting the two halves in the right place but we have to
// combine them.
SDValue &Bits =
- std::find_if(RegsToPass.begin(), RegsToPass.end(),
- [=](const std::pair<unsigned, SDValue> &Elt) {
- return Elt.first == VA.getLocReg();
- })
+ llvm::find_if(RegsToPass,
+ [=](const std::pair<unsigned, SDValue> &Elt) {
+ return Elt.first == VA.getLocReg();
+ })
->second;
Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
// Call site info is used for function's parameter entry value
// tracking. For now we track only simple cases when parameter
// is transferred through whole register.
- CSInfo.erase(std::remove_if(CSInfo.begin(), CSInfo.end(),
- [&VA](MachineFunction::ArgRegPair ArgReg) {
- return ArgReg.Reg == VA.getLocReg();
- }),
- CSInfo.end());
+ llvm::erase_if(CSInfo, [&VA](MachineFunction::ArgRegPair ArgReg) {
+ return ArgReg.Reg == VA.getLocReg();
+ });
} else {
RegsToPass.emplace_back(VA.getLocReg(), Arg);
RegsUsed.insert(VA.getLocReg());
@@ -4570,7 +5486,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
uint32_t BEAlign = 0;
unsigned OpSize;
if (VA.getLocInfo() == CCValAssign::Indirect)
- OpSize = VA.getLocVT().getSizeInBits();
+ OpSize = VA.getLocVT().getFixedSizeInBits();
else
OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
: VA.getValVT().getSizeInBits();
@@ -4730,8 +5646,17 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
return Ret;
}
+ unsigned CallOpc = AArch64ISD::CALL;
+  // Calls marked with "rv_marker" are special. They should be expanded to the
+  // call, directly followed by a special marker sequence. Use the
+  // CALL_RVMARKER node to do that.
+ if (CLI.CB && CLI.CB->hasRetAttr("rv_marker")) {
+ assert(!IsTailCall && "tail calls cannot be marked with rv_marker");
+ CallOpc = AArch64ISD::CALL_RVMARKER;
+ }
+
// Returns a chain and a flag for retval copy to use.
- Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
+ Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);
DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
InFlag = Chain.getValue(1);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
@@ -4755,9 +5680,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
bool AArch64TargetLowering::CanLowerReturn(
CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
- CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
- ? RetCC_AArch64_WebKit_JS
- : RetCC_AArch64_AAPCS;
+ CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
return CCInfo.CheckReturn(Outs, RetCC);
@@ -4772,9 +5695,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
auto &MF = DAG.getMachineFunction();
auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
- CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
- ? RetCC_AArch64_WebKit_JS
- : RetCC_AArch64_AAPCS;
+ CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
@@ -4819,11 +5740,9 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
if (RegsUsed.count(VA.getLocReg())) {
SDValue &Bits =
- std::find_if(RetVals.begin(), RetVals.end(),
- [=](const std::pair<unsigned, SDValue> &Elt) {
- return Elt.first == VA.getLocReg();
- })
- ->second;
+ llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
+ return Elt.first == VA.getLocReg();
+ })->second;
Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
} else {
RetVals.emplace_back(VA.getLocReg(), Arg);
@@ -5043,7 +5962,7 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
SDValue FuncTLVGet = DAG.getLoad(
PtrMemVT, DL, Chain, DescAddr,
MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- /* Alignment = */ PtrMemVT.getSizeInBits() / 8,
+ Align(PtrMemVT.getSizeInBits() / 8),
MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
Chain = FuncTLVGet.getValue(1);
@@ -5358,6 +6277,22 @@ SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
llvm_unreachable("Unexpected platform trying to use TLS");
}
+// Looks through \param Val to determine the bit that can be used to
+// check the sign of the value. It returns the unextended value and
+// the sign bit position.
+std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
+ if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
+ return {Val.getOperand(0),
+ cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
+ 1};
+
+ if (Val.getOpcode() == ISD::SIGN_EXTEND)
+ return {Val.getOperand(0),
+ Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
+
+ return {Val, Val.getValueSizeInBits() - 1};
+}
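
Worked example: for %v = sext i8 %b to i64, the helper above returns the unextended %b together with sign-bit position 7, so the branches below can test bit 7 of %b with TBNZ/TBZ instead of bit 63 of %v.
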
+
SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
@@ -5452,9 +6387,10 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
// Don't combine AND since emitComparison converts the AND to an ANDS
// (a.k.a. TST) and the test in the test bit and branch instruction
// becomes redundant. This would also increase register pressure.
- uint64_t Mask = LHS.getValueSizeInBits() - 1;
+ uint64_t SignBitPos;
+ std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
- DAG.getConstant(Mask, dl, MVT::i64), Dest);
+ DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
}
}
if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
@@ -5462,9 +6398,10 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
// Don't combine AND since emitComparison converts the AND to an ANDS
// (a.k.a. TST) and the test in the test bit and branch instruction
// becomes redundant. This would also increase register pressure.
- uint64_t Mask = LHS.getValueSizeInBits() - 1;
+ uint64_t SignBitPos;
+ std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
- DAG.getConstant(Mask, dl, MVT::i64), Dest);
+ DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
}
SDValue CCVal;
@@ -5611,6 +6548,9 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
}
+ if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
+
assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
"Unexpected type for custom ctpop lowering");
@@ -5634,6 +6574,16 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
return Val;
}
+SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isScalableVector() ||
+ useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true));
+
+ SDLoc DL(Op);
+ SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
+ return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
+}
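
LowerCTTZ above uses the identity cttz(x) == ctlz(bitreverse(x)), applied per vector lane. A scalar illustration, assuming Clang's bit-reverse builtin:

    #include <cstdint>
    static uint32_t cttz_via_ctlz(uint32_t X) {
      if (X == 0)
        return 32;                            // both counts are 32 for zero
      uint32_t R = __builtin_bitreverse32(X); // reverse the bit order
      return (uint32_t)__builtin_clz(R);      // leading zeros of the reversal
    }
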
+
SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
if (Op.getValueType().isVector())
@@ -5791,7 +6741,8 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
// instead of a CSEL in that case.
if (TrueVal == ~FalseVal) {
Opcode = AArch64ISD::CSINV;
- } else if (TrueVal == -FalseVal) {
+ } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
+ TrueVal == -FalseVal) {
Opcode = AArch64ISD::CSNEG;
} else if (TVal.getValueType() == MVT::i32) {
// If our operands are only 32-bit wide, make sure we use 32-bit
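
The FalseVal > INT64_MIN guard added above matters because the most negative 64-bit value has no representable negation, so TrueVal == -FalseVal must not be formed for it; a minimal sketch of the corrected condition (assumed standalone helper):

    #include <cstdint>
    #include <limits>
    static bool canUseCSNEG(int64_t TrueVal, int64_t FalseVal) {
      // -FalseVal is only meaningful (and safe to compute) when FalseVal is
      // not the minimum representable value.
      return FalseVal > std::numeric_limits<int64_t>::min() &&
             TrueVal == -FalseVal;
    }
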
@@ -5991,6 +6942,9 @@ SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
SDValue Entry = Op.getOperand(2);
int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
+ auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
+ AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
+
SDNode *Dest =
DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
@@ -6057,11 +7011,13 @@ SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
}
SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
- SelectionDAG &DAG) const {
+ SelectionDAG &DAG) const {
// The layout of the va_list struct is specified in the AArch64 Procedure Call
// Standard, section B.3.
MachineFunction &MF = DAG.getMachineFunction();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
+ auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
@@ -6071,56 +7027,64 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
SmallVector<SDValue, 4> MemOps;
// void *__stack at offset 0
+ unsigned Offset = 0;
SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
+ Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
- MachinePointerInfo(SV), /* Alignment = */ 8));
+ MachinePointerInfo(SV), Align(PtrSize)));
- // void *__gr_top at offset 8
+ // void *__gr_top at offset 8 (4 on ILP32)
+ Offset += PtrSize;
int GPRSize = FuncInfo->getVarArgsGPRSize();
if (GPRSize > 0) {
SDValue GRTop, GRTopAddr;
- GRTopAddr =
- DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT));
+ GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
+ DAG.getConstant(Offset, DL, PtrVT));
GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
DAG.getConstant(GPRSize, DL, PtrVT));
+ GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
- MachinePointerInfo(SV, 8),
- /* Alignment = */ 8));
+ MachinePointerInfo(SV, Offset),
+ Align(PtrSize)));
}
- // void *__vr_top at offset 16
+ // void *__vr_top at offset 16 (8 on ILP32)
+ Offset += PtrSize;
int FPRSize = FuncInfo->getVarArgsFPRSize();
if (FPRSize > 0) {
SDValue VRTop, VRTopAddr;
VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
- DAG.getConstant(16, DL, PtrVT));
+ DAG.getConstant(Offset, DL, PtrVT));
VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
DAG.getConstant(FPRSize, DL, PtrVT));
+ VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
- MachinePointerInfo(SV, 16),
- /* Alignment = */ 8));
- }
-
- // int __gr_offs at offset 24
- SDValue GROffsAddr =
- DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT));
- MemOps.push_back(DAG.getStore(
- Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr,
- MachinePointerInfo(SV, 24), /* Alignment = */ 4));
-
- // int __vr_offs at offset 28
- SDValue VROffsAddr =
- DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT));
- MemOps.push_back(DAG.getStore(
- Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr,
- MachinePointerInfo(SV, 28), /* Alignment = */ 4));
+ MachinePointerInfo(SV, Offset),
+ Align(PtrSize)));
+ }
+
+ // int __gr_offs at offset 24 (12 on ILP32)
+ Offset += PtrSize;
+ SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
+ DAG.getConstant(Offset, DL, PtrVT));
+ MemOps.push_back(
+ DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
+ GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
+
+ // int __vr_offs at offset 28 (16 on ILP32)
+ Offset += 4;
+ SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
+ DAG.getConstant(Offset, DL, PtrVT));
+ MemOps.push_back(
+ DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
+ VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
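
The offsets stored above follow the AAPCS64 va_list layout (Procedure Call Standard, section B.3); a sketch of that structure, with the ILP32 variant using 4-byte pointers as noted in the comments:

    // LP64 offsets 0/8/16/24/28; ILP32 uses 4-byte pointers, giving 0/4/8/12/16.
    struct aapcs64_va_list {
      void *__stack;  // next stacked argument
      void *__gr_top; // end of the general-purpose register save area
      void *__vr_top; // end of the FP/SIMD register save area
      int __gr_offs;  // negative byte offset of the next GPR argument
      int __vr_offs;  // negative byte offset of the next FP/SIMD argument
    };
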
@@ -6143,8 +7107,10 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
// pointer.
SDLoc DL(Op);
unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
- unsigned VaListSize = (Subtarget->isTargetDarwin() ||
- Subtarget->isTargetWindows()) ? PtrSize : 32;
+ unsigned VaListSize =
+ (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
+ ? PtrSize
+ : Subtarget->isTargetILP32() ? 20 : 32;
const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
@@ -6297,17 +7263,34 @@ SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
EVT VT = Op.getValueType();
SDLoc DL(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ SDValue ReturnAddress;
if (Depth) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
- return DAG.getLoad(VT, DL, DAG.getEntryNode(),
- DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
- MachinePointerInfo());
+ ReturnAddress = DAG.getLoad(
+ VT, DL, DAG.getEntryNode(),
+ DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
+ } else {
+ // Return LR, which contains the return address. Mark it an implicit
+ // live-in.
+ unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
+ ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
+ }
+
+  // The XPACLRI instruction assembles to a hint-space instruction before
+  // Armv8.3-A, so it can safely be used on any pre-Armv8.3-A architecture.
+  // On Armv8.3-A and onwards, XPACI is available, so use that instead.
+ SDNode *St;
+ if (Subtarget->hasPAuth()) {
+ St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
+ } else {
+ // XPACLRI operates on LR therefore we must move the operand accordingly.
+ SDValue Chain =
+ DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
+ St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
}
-
- // Return LR, which contains the return address. Mark it an implicit live-in.
- unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
- return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
+ return SDValue(St, 0);
}
/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
@@ -6488,6 +7471,22 @@ static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
return SDValue();
}
+SDValue
+AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
+ const DenormalMode &Mode) const {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
+ return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
+}
+
+SDValue
+AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
+ SelectionDAG &DAG) const {
+ return Op;
+}
+
SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
SelectionDAG &DAG, int Enabled,
int &ExtraSteps,
@@ -6511,17 +7510,8 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
}
- if (!Reciprocal) {
- EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
- VT);
- SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
- SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ);
-
+ if (!Reciprocal)
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
- // Correct the result if the operand is 0.0.
- Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL,
- VT, Eq, Operand, Estimate);
- }
ExtraSteps = 0;
return Estimate;
@@ -6697,23 +7687,30 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'r':
- if (VT.getSizeInBits() == 64)
+ if (VT.isScalableVector())
+ return std::make_pair(0U, nullptr);
+ if (VT.getFixedSizeInBits() == 64)
return std::make_pair(0U, &AArch64::GPR64commonRegClass);
return std::make_pair(0U, &AArch64::GPR32commonRegClass);
- case 'w':
+ case 'w': {
if (!Subtarget->hasFPARMv8())
break;
- if (VT.isScalableVector())
- return std::make_pair(0U, &AArch64::ZPRRegClass);
- if (VT.getSizeInBits() == 16)
+ if (VT.isScalableVector()) {
+ if (VT.getVectorElementType() != MVT::i1)
+ return std::make_pair(0U, &AArch64::ZPRRegClass);
+ return std::make_pair(0U, nullptr);
+ }
+ uint64_t VTSize = VT.getFixedSizeInBits();
+ if (VTSize == 16)
return std::make_pair(0U, &AArch64::FPR16RegClass);
- if (VT.getSizeInBits() == 32)
+ if (VTSize == 32)
return std::make_pair(0U, &AArch64::FPR32RegClass);
- if (VT.getSizeInBits() == 64)
+ if (VTSize == 64)
return std::make_pair(0U, &AArch64::FPR64RegClass);
- if (VT.getSizeInBits() == 128)
+ if (VTSize == 128)
return std::make_pair(0U, &AArch64::FPR128RegClass);
break;
+ }
// The instructions that this constraint is designed for can
// only take 128-bit registers so just use that regclass.
case 'x':
@@ -6734,10 +7731,11 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
} else {
PredicateConstraint PC = parsePredicateConstraint(Constraint);
if (PC != PredicateConstraint::Invalid) {
- assert(VT.isScalableVector());
+ if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
+ return std::make_pair(0U, nullptr);
bool restricted = (PC == PredicateConstraint::Upl);
return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass)
- : std::make_pair(0U, &AArch64::PPRRegClass);
+ : std::make_pair(0U, &AArch64::PPRRegClass);
}
}
if (StringRef("{cc}").equals_lower(Constraint))
@@ -6976,6 +7974,8 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
SDLoc dl(Op);
EVT VT = Op.getValueType();
+ assert(!VT.isScalableVector() &&
+ "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
unsigned NumElts = VT.getVectorNumElements();
struct ShuffleSourceInfo {
@@ -7046,8 +8046,9 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
}
}
unsigned ResMultiplier =
- VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
- NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
+ VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
+ uint64_t VTSize = VT.getFixedSizeInBits();
+ NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
// If the source vector is too wide or too narrow, we may nevertheless be able
@@ -7056,17 +8057,18 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
for (auto &Src : Sources) {
EVT SrcVT = Src.ShuffleVec.getValueType();
- if (SrcVT.getSizeInBits() == VT.getSizeInBits())
+ uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
+ if (SrcVTSize == VTSize)
continue;
// This stage of the search produces a source with the same element type as
// the original, but with a total width matching the BUILD_VECTOR output.
EVT EltVT = SrcVT.getVectorElementType();
- unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
+ unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
- if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
- assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
+ if (SrcVTSize < VTSize) {
+ assert(2 * SrcVTSize == VTSize);
// We can pad out the smaller vector for free, so if it's part of a
// shuffle...
Src.ShuffleVec =
@@ -7075,7 +8077,11 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
continue;
}
- assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());
+ if (SrcVTSize != 2 * VTSize) {
+ LLVM_DEBUG(
+ dbgs() << "Reshuffle failed: result vector too small to extract\n");
+ return SDValue();
+ }
if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
LLVM_DEBUG(
@@ -7104,6 +8110,13 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
DAG.getConstant(NumSrcElts, dl, MVT::i64));
unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
+ if (!SrcVT.is64BitVector()) {
+ LLVM_DEBUG(
+ dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
+ "for SVE vectors.");
+ return SDValue();
+ }
+
Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
VEXTSrc2,
DAG.getConstant(Imm, dl, MVT::i32));
@@ -7120,7 +8133,8 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
continue;
assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
- Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
+ Src.WindowScale =
+ SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
Src.WindowBase *= Src.WindowScale;
}
@@ -7144,8 +8158,8 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
// trunc. So only std::min(SrcBits, DestBits) actually get defined in this
// segment.
EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
- int BitsDefined =
- std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits());
+ int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
+ VT.getScalarSizeInBits());
int LanesDefined = BitsDefined / BitsPerShuffleLane;
// This source is expected to fill ResMultiplier lanes of the final shuffle,
@@ -7209,6 +8223,81 @@ static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
return true;
}
+/// Check if a vector shuffle corresponds to a DUP instruction with a larger
+/// element width than the vector lane type. If that is the case, the function
+/// returns true and writes the value of the DUP instruction lane operand into
+/// DupLaneOp.
+static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
+ unsigned &DupLaneOp) {
+ assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
+ "Only possible block sizes for wide DUP are: 16, 32, 64");
+
+ if (BlockSize <= VT.getScalarSizeInBits())
+ return false;
+ if (BlockSize % VT.getScalarSizeInBits() != 0)
+ return false;
+ if (VT.getSizeInBits() % BlockSize != 0)
+ return false;
+
+ size_t SingleVecNumElements = VT.getVectorNumElements();
+ size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
+ size_t NumBlocks = VT.getSizeInBits() / BlockSize;
+
+ // We are looking for masks like
+ // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
+  // might be replaced by 'undefined'. BlockElts will eventually contain
+  // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
+  // for the above examples).
+ SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
+ for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
+ for (size_t I = 0; I < NumEltsPerBlock; I++) {
+ int Elt = M[BlockIndex * NumEltsPerBlock + I];
+ if (Elt < 0)
+ continue;
+ // For now we don't support shuffles that use the second operand
+ if ((unsigned)Elt >= SingleVecNumElements)
+ return false;
+ if (BlockElts[I] < 0)
+ BlockElts[I] = Elt;
+ else if (BlockElts[I] != Elt)
+ return false;
+ }
+
+ // We found a candidate block (possibly with some undefs). It must be a
+ // sequence of consecutive integers starting with a value divisible by
+ // NumEltsPerBlock with some values possibly replaced by undef-s.
+
+ // Find first non-undef element
+ auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
+ assert(FirstRealEltIter != BlockElts.end() &&
+ "Shuffle with all-undefs must have been caught by previous cases, "
+ "e.g. isSplat()");
+ if (FirstRealEltIter == BlockElts.end()) {
+ DupLaneOp = 0;
+ return true;
+ }
+
+ // Index of FirstRealElt in BlockElts
+ size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
+
+ if ((unsigned)*FirstRealEltIter < FirstRealIndex)
+ return false;
+ // BlockElts[0] must have the following value if it isn't undef:
+ size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
+
+ // Check the first element
+ if (Elt0 % NumEltsPerBlock != 0)
+ return false;
+ // Check that the sequence indeed consists of consecutive integers (modulo
+ // undefs)
+ for (size_t I = 0; I < NumEltsPerBlock; I++)
+ if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
+ return false;
+
+ DupLaneOp = Elt0 / NumEltsPerBlock;
+ return true;
+}
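
Worked example for the mask check above: for a v8i16 shuffle with mask {4,5,6,7,4,5,6,7} and BlockSize 64, NumEltsPerBlock is 4, NumBlocks is 2, BlockElts becomes {4,5,6,7}, Elt0 is 4, and DupLaneOp is 4 / 4 = 1, i.e. a DUPLANE64 of lane 1 of the input viewed as v2i64.
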
+
// check if an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are different.
static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
@@ -7642,6 +8731,60 @@ static unsigned getDUPLANEOp(EVT EltType) {
llvm_unreachable("Invalid vector element type?");
}
+static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
+ unsigned Opcode, SelectionDAG &DAG) {
+ // Try to eliminate a bitcasted extract subvector before a DUPLANE.
+ auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
+ // Match: dup (bitcast (extract_subv X, C)), LaneC
+ if (BitCast.getOpcode() != ISD::BITCAST ||
+ BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
+ return false;
+
+ // The extract index must align in the destination type. That may not
+ // happen if the bitcast is from narrow to wide type.
+ SDValue Extract = BitCast.getOperand(0);
+ unsigned ExtIdx = Extract.getConstantOperandVal(1);
+ unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
+ unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
+ unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
+ if (ExtIdxInBits % CastedEltBitWidth != 0)
+ return false;
+
+ // Update the lane value by offsetting with the scaled extract index.
+ LaneC += ExtIdxInBits / CastedEltBitWidth;
+
+ // Determine the casted vector type of the wide vector input.
+ // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
+ // Examples:
+ // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
+ // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
+ unsigned SrcVecNumElts =
+ Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
+ CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
+ SrcVecNumElts);
+ return true;
+ };
+ MVT CastVT;
+ if (getScaledOffsetDup(V, Lane, CastVT)) {
+ V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
+ } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ // The lane is incremented by the index of the extract.
+ // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
+ Lane += V.getConstantOperandVal(1);
+ V = V.getOperand(0);
+ } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
+ // The lane is decremented if we are splatting from the 2nd operand.
+ // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
+ unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
+ Lane -= Idx * VT.getVectorNumElements() / 2;
+ V = WidenVector(V.getOperand(Idx), DAG);
+ } else if (VT.getSizeInBits() == 64) {
+ // Widen the operand to 128-bit register with undef.
+ V = WidenVector(V, DAG);
+ }
+ return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
+}
+
SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -7675,57 +8818,26 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
// Otherwise, duplicate from the lane of the input vector.
unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
-
- // Try to eliminate a bitcasted extract subvector before a DUPLANE.
- auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
- // Match: dup (bitcast (extract_subv X, C)), LaneC
- if (BitCast.getOpcode() != ISD::BITCAST ||
- BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
- return false;
-
- // The extract index must align in the destination type. That may not
- // happen if the bitcast is from narrow to wide type.
- SDValue Extract = BitCast.getOperand(0);
- unsigned ExtIdx = Extract.getConstantOperandVal(1);
- unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
- unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
- unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
- if (ExtIdxInBits % CastedEltBitWidth != 0)
- return false;
-
- // Update the lane value by offsetting with the scaled extract index.
- LaneC += ExtIdxInBits / CastedEltBitWidth;
-
- // Determine the casted vector type of the wide vector input.
- // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
- // Examples:
- // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
- // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
- unsigned SrcVecNumElts =
- Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
- CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
- SrcVecNumElts);
- return true;
- };
- MVT CastVT;
- if (getScaledOffsetDup(V1, Lane, CastVT)) {
- V1 = DAG.getBitcast(CastVT, V1.getOperand(0).getOperand(0));
- } else if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
- // The lane is incremented by the index of the extract.
- // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
- Lane += V1.getConstantOperandVal(1);
- V1 = V1.getOperand(0);
- } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
- // The lane is decremented if we are splatting from the 2nd operand.
- // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
- unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
- Lane -= Idx * VT.getVectorNumElements() / 2;
- V1 = WidenVector(V1.getOperand(Idx), DAG);
- } else if (VT.getSizeInBits() == 64) {
- // Widen the operand to 128-bit register with undef.
- V1 = WidenVector(V1, DAG);
- }
- return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
+ return constructDup(V1, Lane, dl, VT, Opcode, DAG);
+ }
+
+ // Check if the mask matches a DUP for a wider element
+ for (unsigned LaneSize : {64U, 32U, 16U}) {
+ unsigned Lane = 0;
+ if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
+ unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
+ : LaneSize == 32 ? AArch64ISD::DUPLANE32
+ : AArch64ISD::DUPLANE16;
+ // Cast V1 to an integer vector with required lane size
+ MVT NewEltTy = MVT::getIntegerVT(LaneSize);
+ unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
+ MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
+ V1 = DAG.getBitcast(NewVecTy, V1);
+      // Construct the DUP instruction
+ V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
+ // Cast back to the original type
+ return DAG.getBitcast(VT, V1);
+ }
}
if (isREVMask(ShuffleMask, VT, 64))
@@ -7796,7 +8908,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
EVT ScalarVT = VT.getVectorElementType();
- if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
+ if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
ScalarVT = MVT::i32;
return DAG.getNode(
@@ -7835,9 +8947,11 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
SDLoc dl(Op);
EVT VT = Op.getValueType();
EVT ElemVT = VT.getScalarType();
-
SDValue SplatVal = Op.getOperand(0);
+ if (useSVEForFixedLengthVectorVT(VT))
+ return LowerToScalableOp(Op, DAG);
+
// Extend input splat value where needed to fit into a GPR (32b or 64b only)
// FPRs don't have this restriction.
switch (ElemVT.getSimpleVT().SimpleTy) {
@@ -8267,6 +9381,9 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SelectionDAG &DAG) const {
+ if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+ return LowerToScalableOp(Op, DAG);
+
// Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
return Res;
@@ -8425,14 +9542,18 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
bool isConstant = true;
bool AllLanesExtractElt = true;
unsigned NumConstantLanes = 0;
+ unsigned NumDifferentLanes = 0;
+ unsigned NumUndefLanes = 0;
SDValue Value;
SDValue ConstantValue;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
AllLanesExtractElt = false;
- if (V.isUndef())
+ if (V.isUndef()) {
+ ++NumUndefLanes;
continue;
+ }
if (i > 0)
isOnlyLowElement = false;
if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
@@ -8448,8 +9569,10 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
if (!Value.getNode())
Value = V;
- else if (V != Value)
+ else if (V != Value) {
usesOnlyOneValue = false;
+ ++NumDifferentLanes;
+ }
}
if (!Value.getNode()) {
@@ -8575,11 +9698,20 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
}
}
+ // If we need to insert a small number of different non-constant elements and
+ // the vector width is sufficiently large, prefer using DUP with the common
+ // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
+ // skip the constant lane handling below.
+ bool PreferDUPAndInsert =
+ !isConstant && NumDifferentLanes >= 1 &&
+ NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
+ NumDifferentLanes >= NumConstantLanes;
+
// If there was only one constant value used and for more than one lane,
// start by splatting that value, then replace the non-constant lanes. This
// is better than the default, which will perform a separate initialization
// for each lane.
- if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
+ if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
// Firstly, try to materialize the splat constant.
SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
Val = ConstantBuildVector(Vec, DAG);
@@ -8615,6 +9747,22 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
return shuffle;
}
+ if (PreferDUPAndInsert) {
+ // First, build a splat vector with the common element.
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned I = 0; I < NumElts; ++I)
+ Ops.push_back(Value);
+ SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
+ // Next, insert the elements that do not match the common value.
+ for (unsigned I = 0; I < NumElts; ++I)
+ if (Op.getOperand(I) != Value)
+ NewVector =
+ DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
+ Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
+
+ return NewVector;
+ }
+
// If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
// know the default expansion would otherwise fall back on something even
// worse. For a vector with one or two non-undef values, that's
@@ -8663,6 +9811,18 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
return SDValue();
}
+SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getValueType().isScalableVector() &&
+ isTypeLegal(Op.getValueType()) &&
+ "Expected legal scalable vector type!");
+
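+ // Keep a two-operand concatenation of already-legal operands as-is so that
+ // instruction selection can handle it; give up on anything else.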
+ if (isTypeLegal(Op.getOperand(0).getValueType()) && Op.getNumOperands() == 2)
+ return Op;
+
+ return SDValue();
+}
+
SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
@@ -8758,7 +9918,8 @@ SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
// If this is extracting the upper 64-bits of a 128-bit vector, we match
// that directly.
- if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64)
+ if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
+ InVT.getSizeInBits() == 128)
return Op;
return SDValue();
@@ -8772,9 +9933,34 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
EVT InVT = Op.getOperand(1).getValueType();
unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
- // We don't have any patterns for scalable vector yet.
- if (InVT.isScalableVector() || !useSVEForFixedLengthVectorVT(InVT))
+ if (InVT.isScalableVector()) {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+
+ if (!isTypeLegal(VT) || !VT.isInteger())
+ return SDValue();
+
+ SDValue Vec0 = Op.getOperand(0);
+ SDValue Vec1 = Op.getOperand(1);
+
+ // Ensure the subvector is half the size of the main vector.
+ if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
+ return SDValue();
+
+ // Extend elements of smaller vector...
+ EVT WideVT = InVT.widenIntegerVectorElementType(*(DAG.getContext()));
+ SDValue ExtVec = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
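+ // A UZP1 at the narrow result type then keeps the low half of each widened
+ // element, effectively truncating and concatenating the two halves.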
+
+ if (Idx == 0) {
+ SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
+ return DAG.getNode(AArch64ISD::UZP1, DL, VT, ExtVec, HiVec0);
+ } else if (Idx == InVT.getVectorMinNumElements()) {
+ SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
+ return DAG.getNode(AArch64ISD::UZP1, DL, VT, LoVec0, ExtVec);
+ }
+
return SDValue();
+ }
// This will be matched by custom code during ISelDAGToDAG.
if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef())
@@ -8783,6 +9969,42 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
return SDValue();
}
+SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
+ return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
+
+ assert(VT.isScalableVector() && "Expected a scalable vector.");
+
+ bool Signed = Op.getOpcode() == ISD::SDIV;
+ unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
+
+ if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
+ return LowerToPredicatedOp(Op, DAG, PredOpcode);
+
+ // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
+ // operations, and truncate the result.
+ EVT WidenedVT;
+ if (VT == MVT::nxv16i8)
+ WidenedVT = MVT::nxv8i16;
+ else if (VT == MVT::nxv8i16)
+ WidenedVT = MVT::nxv4i32;
+ else
+ llvm_unreachable("Unexpected Custom DIV operation");
+
+ SDLoc dl(Op);
+ unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
+ unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
+ SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
+ SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
+ SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
+ SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
+ SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
+ SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
+ return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
+}
+
bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
// Currently no fixed length shuffles that require SVE are legal.
if (useSVEForFixedLengthVectorVT(VT))
@@ -8867,14 +10089,6 @@ static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
}
-// Attempt to form urhadd(OpA, OpB) from
-// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1)).
-// The original form of this expression is
-// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and before this function
-// is called the srl will have been lowered to AArch64ISD::VLSHR and the
-// ((OpA + OpB + 1) >> 1) expression will have been changed to (OpB - (~OpA)).
-// This pass can also recognize a variant of this pattern that uses sign
-// extension instead of zero extension and form a srhadd(OpA, OpB) from it.
SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
@@ -8890,66 +10104,12 @@ SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
}
if (!VT.isVector() || VT.isScalableVector())
- return Op;
+ return SDValue();
if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
- // Since we are looking for a right shift by a constant value of 1 and we are
- // operating on types at least 16 bits in length (sign/zero extended OpA and
- // OpB, which are at least 8 bits), it follows that the truncate will always
- // discard the shifted-in bit and therefore the right shift will be logical
- // regardless of the signedness of OpA and OpB.
- SDValue Shift = Op.getOperand(0);
- if (Shift.getOpcode() != AArch64ISD::VLSHR)
- return Op;
-
- // Is the right shift using an immediate value of 1?
- uint64_t ShiftAmount = Shift.getConstantOperandVal(1);
- if (ShiftAmount != 1)
- return Op;
-
- SDValue Sub = Shift->getOperand(0);
- if (Sub.getOpcode() != ISD::SUB)
- return Op;
-
- SDValue Xor = Sub.getOperand(1);
- if (Xor.getOpcode() != ISD::XOR)
- return Op;
-
- SDValue ExtendOpA = Xor.getOperand(0);
- SDValue ExtendOpB = Sub.getOperand(0);
- unsigned ExtendOpAOpc = ExtendOpA.getOpcode();
- unsigned ExtendOpBOpc = ExtendOpB.getOpcode();
- if (!(ExtendOpAOpc == ExtendOpBOpc &&
- (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND)))
- return Op;
-
- // Is the result of the right shift being truncated to the same value type as
- // the original operands, OpA and OpB?
- SDValue OpA = ExtendOpA.getOperand(0);
- SDValue OpB = ExtendOpB.getOperand(0);
- EVT OpAVT = OpA.getValueType();
- assert(ExtendOpA.getValueType() == ExtendOpB.getValueType());
- if (!(VT == OpAVT && OpAVT == OpB.getValueType()))
- return Op;
-
- // Is the XOR using a constant amount of all ones in the right hand side?
- uint64_t C;
- if (!isAllConstantBuildVector(Xor.getOperand(1), C))
- return Op;
-
- unsigned ElemSizeInBits = VT.getScalarSizeInBits();
- APInt CAsAPInt(ElemSizeInBits, C);
- if (CAsAPInt != APInt::getAllOnesValue(ElemSizeInBits))
- return Op;
-
- SDLoc DL(Op);
- bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND;
- unsigned RHADDOpc = IsSignExtend ? AArch64ISD::SRHADD : AArch64ISD::URHADD;
- SDValue ResultURHADD = DAG.getNode(RHADDOpc, DL, VT, OpA, OpB);
-
- return ResultURHADD;
+ return SDValue();
}
SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
@@ -8967,8 +10127,8 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
llvm_unreachable("unexpected shift opcode");
case ISD::SHL:
- if (VT.isScalableVector())
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_MERGE_OP1);
+ if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
@@ -8979,9 +10139,9 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
Op.getOperand(0), Op.getOperand(1));
case ISD::SRA:
case ISD::SRL:
- if (VT.isScalableVector()) {
- unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_MERGE_OP1
- : AArch64ISD::SRL_MERGE_OP1;
+ if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) {
+ unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
+ : AArch64ISD::SRL_PRED;
return LowerToPredicatedOp(Op, DAG, Opc);
}
@@ -9033,7 +10193,7 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
else
Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
- return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
+ return DAG.getNOT(dl, Fcmeq, VT);
}
case AArch64CC::EQ:
if (IsZero)
@@ -9072,7 +10232,7 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
else
Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
- return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq);
+ return DAG.getNOT(dl, Cmeq, VT);
}
case AArch64CC::EQ:
if (IsZero)
@@ -9113,6 +10273,9 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
}
+ if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
+ return LowerFixedLengthVectorSetccToSVE(Op, DAG);
+
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
@@ -9185,6 +10348,51 @@ static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
SelectionDAG &DAG) const {
+ SDValue Src = Op.getOperand(0);
+
+ // Try to lower fixed length reductions to SVE.
+ EVT SrcVT = Src.getValueType();
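+ // Force the SVE path for reductions NEON cannot do directly: logical ops,
+ // FADD and (other than ADD) 64-bit elements.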
+ bool OverrideNEON = Op.getOpcode() == ISD::VECREDUCE_AND ||
+ Op.getOpcode() == ISD::VECREDUCE_OR ||
+ Op.getOpcode() == ISD::VECREDUCE_XOR ||
+ Op.getOpcode() == ISD::VECREDUCE_FADD ||
+ (Op.getOpcode() != ISD::VECREDUCE_ADD &&
+ SrcVT.getVectorElementType() == MVT::i64);
+ if (SrcVT.isScalableVector() ||
+ useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) {
+
+ if (SrcVT.getVectorElementType() == MVT::i1)
+ return LowerPredReductionToSVE(Op, DAG);
+
+ switch (Op.getOpcode()) {
+ case ISD::VECREDUCE_ADD:
+ return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
+ case ISD::VECREDUCE_AND:
+ return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
+ case ISD::VECREDUCE_OR:
+ return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
+ case ISD::VECREDUCE_SMAX:
+ return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
+ case ISD::VECREDUCE_SMIN:
+ return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
+ case ISD::VECREDUCE_UMAX:
+ return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
+ case ISD::VECREDUCE_UMIN:
+ return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
+ case ISD::VECREDUCE_XOR:
+ return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
+ case ISD::VECREDUCE_FADD:
+ return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
+ case ISD::VECREDUCE_FMAX:
+ return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
+ case ISD::VECREDUCE_FMIN:
+ return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
+ default:
+ llvm_unreachable("Unhandled fixed length reduction");
+ }
+ }
+
+ // Lower NEON reductions.
SDLoc dl(Op);
switch (Op.getOpcode()) {
case ISD::VECREDUCE_ADD:
@@ -9198,18 +10406,16 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
case ISD::VECREDUCE_UMIN:
return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
case ISD::VECREDUCE_FMAX: {
- assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag");
return DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32),
- Op.getOperand(0));
+ Src);
}
case ISD::VECREDUCE_FMIN: {
- assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag");
return DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32),
- Op.getOperand(0));
+ Src);
}
default:
llvm_unreachable("Unhandled reduction");
@@ -9219,7 +10425,7 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
SelectionDAG &DAG) const {
auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
- if (!Subtarget.hasLSE())
+ if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
return SDValue();
// LSE has an atomic load-add instruction, but not a load-sub.
@@ -9236,7 +10442,7 @@ SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
SelectionDAG &DAG) const {
auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
- if (!Subtarget.hasLSE())
+ if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
return SDValue();
// LSE has an atomic load-clear instruction, but not a load-and.
@@ -9337,16 +10543,17 @@ SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
template <unsigned NumVecs>
-static bool setInfoSVEStN(AArch64TargetLowering::IntrinsicInfo &Info,
- const CallInst &CI) {
+static bool
+setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
+ AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
Info.opc = ISD::INTRINSIC_VOID;
// Retrieve EC from first vector argument.
- const EVT VT = EVT::getEVT(CI.getArgOperand(0)->getType());
+ const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
ElementCount EC = VT.getVectorElementCount();
#ifndef NDEBUG
// Check the assumption that all input vectors are the same type.
for (unsigned I = 0; I < NumVecs; ++I)
- assert(VT == EVT::getEVT(CI.getArgOperand(I)->getType()) &&
+ assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
"Invalid type.");
#endif
// memVT is `NumVecs * VT`.
@@ -9369,11 +10576,11 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
auto &DL = I.getModule()->getDataLayout();
switch (Intrinsic) {
case Intrinsic::aarch64_sve_st2:
- return setInfoSVEStN<2>(Info, I);
+ return setInfoSVEStN<2>(*this, DL, Info, I);
case Intrinsic::aarch64_sve_st3:
- return setInfoSVEStN<3>(Info, I);
+ return setInfoSVEStN<3>(*this, DL, Info, I);
case Intrinsic::aarch64_sve_st4:
- return setInfoSVEStN<4>(Info, I);
+ return setInfoSVEStN<4>(*this, DL, Info, I);
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4:
@@ -9529,15 +10736,15 @@ bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
- unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
- unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+ uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedSize();
+ uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedSize();
return NumBits1 > NumBits2;
}
bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
return false;
- unsigned NumBits1 = VT1.getSizeInBits();
- unsigned NumBits2 = VT2.getSizeInBits();
+ uint64_t NumBits1 = VT1.getFixedSizeInBits();
+ uint64_t NumBits2 = VT2.getFixedSizeInBits();
return NumBits1 > NumBits2;
}
@@ -9779,6 +10986,43 @@ bool AArch64TargetLowering::shouldSinkOperands(
return true;
}
+ case Instruction::Mul: {
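+ // Sink splat shuffles of sign/zero-extended scalars next to the multiply so
+ // that instruction selection sees them in the same block and can fold them
+ // (e.g. into a multiply by element).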
+ bool IsProfitable = false;
+ for (auto &Op : I->operands()) {
+ // Make sure we are not already sinking this operand
+ if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
+ continue;
+
+ ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
+ if (!Shuffle || !Shuffle->isZeroEltSplat())
+ continue;
+
+ Value *ShuffleOperand = Shuffle->getOperand(0);
+ InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
+ if (!Insert)
+ continue;
+
+ Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
+ if (!OperandInstr)
+ continue;
+
+ ConstantInt *ElementConstant =
+ dyn_cast<ConstantInt>(Insert->getOperand(2));
+ // Check that the insertelement is inserting into element 0
+ if (!ElementConstant || ElementConstant->getZExtValue() != 0)
+ continue;
+
+ unsigned Opcode = OperandInstr->getOpcode();
+ if (Opcode != Instruction::SExt && Opcode != Instruction::ZExt)
+ continue;
+
+ Ops.push_back(&Shuffle->getOperandUse(0));
+ Ops.push_back(&Op);
+ IsProfitable = true;
+ }
+
+ return IsProfitable;
+ }
default:
return false;
}
@@ -10114,11 +11358,12 @@ SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic,
{Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}};
std::tie(N, Opcode) = IntrinsicMap[Intrinsic];
- assert(VT.getVectorElementCount().Min % N == 0 &&
+ assert(VT.getVectorElementCount().getKnownMinValue() % N == 0 &&
"invalid tuple vector type!");
- EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
- VT.getVectorElementCount() / N);
+ EVT SplitVT =
+ EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+ VT.getVectorElementCount().divideCoefficientBy(N));
assert(isTypeLegal(SplitVT));
SmallVector<EVT, 5> VTs(N, SplitVT);
@@ -10409,32 +11654,77 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
}
-// Generate SUBS and CSEL for integer abs.
-static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
- EVT VT = N->getValueType(0);
+// VECREDUCE_ADD( EXTEND(v16i8_type) ) to
+// VECREDUCE_ADD( DOTv16i8(v16i8_type) )
+static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *ST) {
+ SDValue Op0 = N->getOperand(0);
+ if (!ST->hasDotProd() || N->getValueType(0) != MVT::i32)
+ return SDValue();
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- SDLoc DL(N);
+ if (Op0.getValueType().getVectorElementType() != MVT::i32)
+ return SDValue();
- // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
- // and change it to SUB and CSEL.
- if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
- N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
- N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0))
- if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
- if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
- SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
- N0.getOperand(0));
- // Generate SUBS & CSEL.
- SDValue Cmp =
- DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
- N0.getOperand(0), DAG.getConstant(0, DL, VT));
- return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
- DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
- SDValue(Cmp.getNode(), 1));
- }
- return SDValue();
+ unsigned ExtOpcode = Op0.getOpcode();
+ if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
+ return SDValue();
+
+ EVT Op0VT = Op0.getOperand(0).getValueType();
+ if (Op0VT != MVT::v16i8)
+ return SDValue();
+
+ SDLoc DL(Op0);
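+ // A [us]dot of the v16i8 input against a vector of ones accumulates the
+ // extended elements into v4i32 partial sums, which VECREDUCE_ADD then
+ // reduces.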
+ SDValue Ones = DAG.getConstant(1, DL, Op0VT);
+ SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
+ auto DotIntrinsic = (ExtOpcode == ISD::ZERO_EXTEND)
+ ? Intrinsic::aarch64_neon_udot
+ : Intrinsic::aarch64_neon_sdot;
+ SDValue Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Zeros.getValueType(),
+ DAG.getConstant(DotIntrinsic, DL, MVT::i32), Zeros,
+ Ones, Op0.getOperand(0));
+ return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
+}
+
+// Given a ABS node, detect the following pattern:
+// (ABS (SUB (EXTEND a), (EXTEND b))).
+// Generates UABD/SABD instruction.
+static SDValue performABSCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64Subtarget *Subtarget) {
+ SDValue AbsOp1 = N->getOperand(0);
+ SDValue Op0, Op1;
+
+ if (AbsOp1.getOpcode() != ISD::SUB)
+ return SDValue();
+
+ Op0 = AbsOp1.getOperand(0);
+ Op1 = AbsOp1.getOperand(1);
+
+ unsigned Opc0 = Op0.getOpcode();
+ // Check if the operands of the sub are (zero|sign)-extended.
+ if (Opc0 != Op1.getOpcode() ||
+ (Opc0 != ISD::ZERO_EXTEND && Opc0 != ISD::SIGN_EXTEND))
+ return SDValue();
+
+ EVT VectorT1 = Op0.getOperand(0).getValueType();
+ EVT VectorT2 = Op1.getOperand(0).getValueType();
+ // Check if vectors are of same type and valid size.
+ uint64_t Size = VectorT1.getFixedSizeInBits();
+ if (VectorT1 != VectorT2 || (Size != 64 && Size != 128))
+ return SDValue();
+
+ // Check if vector element types are valid.
+ EVT VT1 = VectorT1.getVectorElementType();
+ if (VT1 != MVT::i8 && VT1 != MVT::i16 && VT1 != MVT::i32)
+ return SDValue();
+
+ Op0 = Op0.getOperand(0);
+ Op1 = Op1.getOperand(0);
+ unsigned ABDOpcode =
+ (Opc0 == ISD::SIGN_EXTEND) ? AArch64ISD::SABD : AArch64ISD::UABD;
+ SDValue ABD =
+ DAG.getNode(ABDOpcode, SDLoc(N), Op0->getValueType(0), Op0, Op1);
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), ABD);
}
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
@@ -10443,10 +11733,7 @@ static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalizeOps())
return SDValue();
- if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
- return Cmp;
-
- return performIntegerAbsCombine(N, DAG);
+ return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
}
SDValue
@@ -10505,9 +11792,157 @@ static bool IsSVECntIntrinsic(SDValue S) {
return false;
}
+/// Calculates what the pre-extend type is, based on the extension
+/// operation node provided by \p Extend.
+///
+/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
+/// pre-extend type is pulled directly from the operand, while other extend
+/// operations need a bit more inspection to get this information.
+///
+/// \param Extend The SDNode from the DAG that represents the extend operation
+/// \param DAG The SelectionDAG hosting the \p Extend node
+///
+/// \returns The type representing the \p Extend source type, or \p MVT::Other
+/// if no valid type can be determined
+static EVT calculatePreExtendType(SDValue Extend, SelectionDAG &DAG) {
+ switch (Extend.getOpcode()) {
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ return Extend.getOperand(0).getValueType();
+ case ISD::AssertSext:
+ case ISD::AssertZext:
+ case ISD::SIGN_EXTEND_INREG: {
+ VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
+ if (!TypeNode)
+ return MVT::Other;
+ return TypeNode->getVT();
+ }
+ case ISD::AND: {
+ ConstantSDNode *Constant =
+ dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
+ if (!Constant)
+ return MVT::Other;
+
+ uint32_t Mask = Constant->getZExtValue();
+
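+ // An AND with an all-ones mask of 8, 16 or 32 bits behaves like a
+ // zero-extension from the corresponding narrow type.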
+ if (Mask == UCHAR_MAX)
+ return MVT::i8;
+ else if (Mask == USHRT_MAX)
+ return MVT::i16;
+ else if (Mask == UINT_MAX)
+ return MVT::i32;
+
+ return MVT::Other;
+ }
+ default:
+ return MVT::Other;
+ }
+
+ llvm_unreachable("Code path unhandled in calculatePreExtendType!");
+}
+
+/// Combines a dup(sext/zext) node pattern into sext/zext(dup)
+/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
+static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle,
+ SelectionDAG &DAG) {
+
+ ShuffleVectorSDNode *ShuffleNode =
+ dyn_cast<ShuffleVectorSDNode>(VectorShuffle.getNode());
+ if (!ShuffleNode)
+ return SDValue();
+
+ // Ensure the shuffle is a splat of lane 0 before continuing
+ if (!ShuffleNode->isSplat() || ShuffleNode->getSplatIndex() != 0)
+ return SDValue();
+
+ SDValue InsertVectorElt = VectorShuffle.getOperand(0);
+
+ if (InsertVectorElt.getOpcode() != ISD::INSERT_VECTOR_ELT)
+ return SDValue();
+
+ SDValue InsertLane = InsertVectorElt.getOperand(2);
+ ConstantSDNode *Constant = dyn_cast<ConstantSDNode>(InsertLane.getNode());
+ // Ensure the insert targets lane 0
+ if (!Constant || Constant->getZExtValue() != 0)
+ return SDValue();
+
+ SDValue Extend = InsertVectorElt.getOperand(1);
+ unsigned ExtendOpcode = Extend.getOpcode();
+
+ bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
+ ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
+ ExtendOpcode == ISD::AssertSext;
+ if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
+ ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
+ return SDValue();
+
+ EVT TargetType = VectorShuffle.getValueType();
+ EVT PreExtendType = calculatePreExtendType(Extend, DAG);
+
+ if ((TargetType != MVT::v8i16 && TargetType != MVT::v4i32 &&
+ TargetType != MVT::v2i64) ||
+ (PreExtendType == MVT::Other))
+ return SDValue();
+
+ // Restrict valid pre-extend data type
+ if (PreExtendType != MVT::i8 && PreExtendType != MVT::i16 &&
+ PreExtendType != MVT::i32)
+ return SDValue();
+
+ EVT PreExtendVT = TargetType.changeVectorElementType(PreExtendType);
+
+ if (PreExtendVT.getVectorElementCount() != TargetType.getVectorElementCount())
+ return SDValue();
+
+ if (TargetType.getScalarSizeInBits() != PreExtendVT.getScalarSizeInBits() * 2)
+ return SDValue();
+
+ SDLoc DL(VectorShuffle);
+
+ SDValue InsertVectorNode = DAG.getNode(
+ InsertVectorElt.getOpcode(), DL, PreExtendVT, DAG.getUNDEF(PreExtendVT),
+ DAG.getAnyExtOrTrunc(Extend.getOperand(0), DL, PreExtendType),
+ DAG.getConstant(0, DL, MVT::i64));
+
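+ // A zero-initialised mask re-splats lane 0 of the narrow insert, mirroring
+ // the original shuffle at the pre-extend type.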
+ std::vector<int> ShuffleMask(TargetType.getVectorElementCount().getValue());
+
+ SDValue VectorShuffleNode =
+ DAG.getVectorShuffle(PreExtendVT, DL, InsertVectorNode,
+ DAG.getUNDEF(PreExtendVT), ShuffleMask);
+
+ SDValue ExtendNode = DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
+ DL, TargetType, VectorShuffleNode);
+
+ return ExtendNode;
+}
+
+/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
+/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
+static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
+ // If the value type isn't a vector, none of the operands are going to be dups
+ if (!Mul->getValueType(0).isVector())
+ return SDValue();
+
+ SDValue Op0 = performCommonVectorExtendCombine(Mul->getOperand(0), DAG);
+ SDValue Op1 = performCommonVectorExtendCombine(Mul->getOperand(1), DAG);
+
+ // If neither operand has changed, don't make any further changes
+ if (!Op0 && !Op1)
+ return SDValue();
+
+ SDLoc DL(Mul);
+ return DAG.getNode(Mul->getOpcode(), DL, Mul->getValueType(0),
+ Op0 ? Op0 : Mul->getOperand(0),
+ Op1 ? Op1 : Mul->getOperand(1));
+}
+
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
+
+ if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
+ return Ext;
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -11042,6 +12477,9 @@ static SDValue performSVEAndCombine(SDNode *N,
return DAG.getNode(Opc, DL, N->getValueType(0), And);
}
+ if (!EnableCombineMGatherIntrinsics)
+ return SDValue();
+
SDValue Mask = N->getOperand(1);
if (!Src.hasOneUse())
@@ -11095,6 +12533,11 @@ static SDValue performANDCombine(SDNode *N,
if (VT.isScalableVector())
return performSVEAndCombine(N, DCI);
+ // The combining code below works only for NEON vectors. In particular, it
+ // does not work for SVE when dealing with vectors wider than 128 bits.
+ if (!(VT.is64BitVector() || VT.is128BitVector()))
+ return SDValue();
+
BuildVectorSDNode *BVN =
dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode());
if (!BVN)
@@ -11155,6 +12598,143 @@ static SDValue performSRLCombine(SDNode *N,
return SDValue();
}
+// Attempt to form urhadd(OpA, OpB) from
+// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1))
+// or uhadd(OpA, OpB) from truncate(vlshr(add(zext(OpA), zext(OpB)), 1)).
+// The original form of the first expression is
+// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and the
+// (OpA + OpB + 1) subexpression will have been changed to (OpB - (~OpA)).
+// Before this function is called the srl will have been lowered to
+// AArch64ISD::VLSHR.
+// This pass can also recognize signed variants of the patterns that use sign
+// extension instead of zero extension and form a srhadd(OpA, OpB) or a
+// shadd(OpA, OpB) from them.
+static SDValue
+performVectorTruncateCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+
+ // Since we are looking for a right shift by a constant value of 1 and we are
+ // operating on types at least 16 bits in length (sign/zero extended OpA and
+ // OpB, which are at least 8 bits), it follows that the truncate will always
+ // discard the shifted-in bit and therefore the right shift will be logical
+ // regardless of the signedness of OpA and OpB.
+ SDValue Shift = N->getOperand(0);
+ if (Shift.getOpcode() != AArch64ISD::VLSHR)
+ return SDValue();
+
+ // Is the right shift using an immediate value of 1?
+ uint64_t ShiftAmount = Shift.getConstantOperandVal(1);
+ if (ShiftAmount != 1)
+ return SDValue();
+
+ SDValue ExtendOpA, ExtendOpB;
+ SDValue ShiftOp0 = Shift.getOperand(0);
+ unsigned ShiftOp0Opc = ShiftOp0.getOpcode();
+ if (ShiftOp0Opc == ISD::SUB) {
+
+ SDValue Xor = ShiftOp0.getOperand(1);
+ if (Xor.getOpcode() != ISD::XOR)
+ return SDValue();
+
+ // Is the XOR using a constant amount of all ones in the right hand side?
+ uint64_t C;
+ if (!isAllConstantBuildVector(Xor.getOperand(1), C))
+ return SDValue();
+
+ unsigned ElemSizeInBits = VT.getScalarSizeInBits();
+ APInt CAsAPInt(ElemSizeInBits, C);
+ if (CAsAPInt != APInt::getAllOnesValue(ElemSizeInBits))
+ return SDValue();
+
+ ExtendOpA = Xor.getOperand(0);
+ ExtendOpB = ShiftOp0.getOperand(0);
+ } else if (ShiftOp0Opc == ISD::ADD) {
+ ExtendOpA = ShiftOp0.getOperand(0);
+ ExtendOpB = ShiftOp0.getOperand(1);
+ } else
+ return SDValue();
+
+ unsigned ExtendOpAOpc = ExtendOpA.getOpcode();
+ unsigned ExtendOpBOpc = ExtendOpB.getOpcode();
+ if (!(ExtendOpAOpc == ExtendOpBOpc &&
+ (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND)))
+ return SDValue();
+
+ // Is the result of the right shift being truncated to the same value type as
+ // the original operands, OpA and OpB?
+ SDValue OpA = ExtendOpA.getOperand(0);
+ SDValue OpB = ExtendOpB.getOperand(0);
+ EVT OpAVT = OpA.getValueType();
+ assert(ExtendOpA.getValueType() == ExtendOpB.getValueType());
+ if (!(VT == OpAVT && OpAVT == OpB.getValueType()))
+ return SDValue();
+
+ SDLoc DL(N);
+ bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND;
+ bool IsRHADD = ShiftOp0Opc == ISD::SUB;
+ unsigned HADDOpc = IsSignExtend
+ ? (IsRHADD ? AArch64ISD::SRHADD : AArch64ISD::SHADD)
+ : (IsRHADD ? AArch64ISD::URHADD : AArch64ISD::UHADD);
+ SDValue ResultHADD = DAG.getNode(HADDOpc, DL, VT, OpA, OpB);
+
+ return ResultHADD;
+}
+
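+// Returns true when the scalar type has a pairwise add instruction (FADDP for
+// floating point, ADDP for 64-bit integers) that the combine below can use.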
+static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
+ switch (Opcode) {
+ case ISD::FADD:
+ return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
+ case ISD::ADD:
+ return VT == MVT::i64;
+ default:
+ return false;
+ }
+}
+
+static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+ ConstantSDNode *ConstantN1 = dyn_cast<ConstantSDNode>(N1);
+
+ EVT VT = N->getValueType(0);
+ const bool FullFP16 =
+ static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
+
+ // Rewrite for pairwise fadd pattern
+ // (f32 (extract_vector_elt
+ // (fadd (vXf32 Other)
+ // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
+ // ->
+ // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
+ // (extract_vector_elt (vXf32 Other) 1))
+ if (ConstantN1 && ConstantN1->getZExtValue() == 0 &&
+ hasPairwiseAdd(N0->getOpcode(), VT, FullFP16)) {
+ SDLoc DL(N0);
+ SDValue N00 = N0->getOperand(0);
+ SDValue N01 = N0->getOperand(1);
+
+ ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
+ SDValue Other = N00;
+
+ // And handle the commutative case.
+ if (!Shuffle) {
+ Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
+ Other = N01;
+ }
+
+ if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
+ Other == Shuffle->getOperand(0)) {
+ return DAG.getNode(N0->getOpcode(), DL, VT,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
+ DAG.getConstant(0, DL, MVT::i64)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
+ DAG.getConstant(1, DL, MVT::i64)));
+ }
+ }
+
+ return SDValue();
+}
+
static SDValue performConcatVectorsCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -11200,9 +12780,9 @@ static SDValue performConcatVectorsCombine(SDNode *N,
if (DCI.isBeforeLegalizeOps())
return SDValue();
- // Optimise concat_vectors of two [us]rhadds that use extracted subvectors
- // from the same original vectors. Combine these into a single [us]rhadd that
- // operates on the two original vectors. Example:
+ // Optimise concat_vectors of two [us]rhadds or [us]hadds that use extracted
+ // subvectors from the same original vectors. Combine these into a single
+ // [us]rhadd or [us]hadd that operates on the two original vectors. Example:
// (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>),
// extract_subvector (v16i8 OpB,
// <0>))),
@@ -11212,7 +12792,8 @@ static SDValue performConcatVectorsCombine(SDNode *N,
// ->
// (v16i8(urhadd(v16i8 OpA, v16i8 OpB)))
if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
- (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD)) {
+ (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD ||
+ N0Opc == AArch64ISD::UHADD || N0Opc == AArch64ISD::SHADD)) {
SDValue N00 = N0->getOperand(0);
SDValue N01 = N0->getOperand(1);
SDValue N10 = N1->getOperand(0);
@@ -11517,6 +13098,43 @@ static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
}
+// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
+static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ // Only handle additions whose result is a scalar integer.
+ if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
+ return SDValue();
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
+ return SDValue();
+
+ auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+ auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
+ if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isNullValue())
+ return SDValue();
+
+ SDValue Op1 = LHS->getOperand(0);
+ SDValue Op2 = RHS->getOperand(0);
+ EVT OpVT1 = Op1.getValueType();
+ EVT OpVT2 = Op2.getValueType();
+ if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
+ Op2.getOpcode() != AArch64ISD::UADDV ||
+ OpVT1.getVectorElementType() != VT)
+ return SDValue();
+
+ SDValue Val1 = Op1.getOperand(0);
+ SDValue Val2 = Op2.getOperand(0);
+ EVT ValVT = Val1->getValueType(0);
+ SDLoc DL(N);
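+ // Add the two source vectors first; a single UADDV of the sum then replaces
+ // the two separate reductions.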
+ SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+ DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
+ DAG.getConstant(0, DL, MVT::i64));
+}
+
// The basic add/sub long vector instructions have variants with "2" on the end
// which act on the high-half of their inputs. They are normally matched by
// patterns like:
@@ -11570,6 +13188,16 @@ static SDValue performAddSubLongCombine(SDNode *N,
return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
}
+static SDValue performAddSubCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ // Try to change sum of two reductions.
+ if (SDValue Val = performUADDVCombine(N, DAG))
+ return Val;
+
+ return performAddSubLongCombine(N, DCI, DAG);
+}
+
// Massage DAGs which we can use the high-half "long" operations on into
// something isel will recognize better. E.g.
//
@@ -11583,8 +13211,8 @@ static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
if (DCI.isBeforeLegalizeOps())
return SDValue();
- SDValue LHS = N->getOperand(1);
- SDValue RHS = N->getOperand(2);
+ SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
+ SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
assert(LHS.getValueType().is64BitVector() &&
RHS.getValueType().is64BitVector() &&
"unexpected shape for long operation");
@@ -11602,6 +13230,9 @@ static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
return SDValue();
}
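+ // When called without an intrinsic ID, N is a plain AArch64ISD node
+ // (e.g. [US]ABD), so rebuild it directly from the updated operands.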
+ if (IID == Intrinsic::not_intrinsic)
+ return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
+
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
N->getOperand(0), LHS, RHS);
}
@@ -11700,34 +13331,6 @@ static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
DAG.getConstant(0, dl, MVT::i64));
}
-static SDValue LowerSVEIntReduction(SDNode *N, unsigned Opc,
- SelectionDAG &DAG) {
- SDLoc dl(N);
- LLVMContext &Ctx = *DAG.getContext();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
- EVT VT = N->getValueType(0);
- SDValue Pred = N->getOperand(1);
- SDValue Data = N->getOperand(2);
- EVT DataVT = Data.getValueType();
-
- if (DataVT.getVectorElementType().isScalarInteger() &&
- (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64)) {
- if (!TLI.isTypeLegal(DataVT))
- return SDValue();
-
- EVT OutputVT = EVT::getVectorVT(Ctx, VT,
- AArch64::NeonBitsPerVector / VT.getSizeInBits());
- SDValue Reduce = DAG.getNode(Opc, dl, OutputVT, Pred, Data);
- SDValue Zero = DAG.getConstant(0, dl, MVT::i64);
- SDValue Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Reduce, Zero);
-
- return Result;
- }
-
- return SDValue();
-}
-
static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
SDValue Op1 = N->getOperand(1);
@@ -11770,7 +13373,8 @@ static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
unsigned ByteSize = VT.getSizeInBits().getKnownMinSize() / 8;
- EVT ByteVT = EVT::getVectorVT(Ctx, MVT::i8, { ByteSize, true });
+ EVT ByteVT =
+ EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
// Convert everything to the domain of EXT (i.e bytes).
SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
@@ -11870,6 +13474,25 @@ static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
return DAG.getZExtOrTrunc(Res, DL, VT);
}
+static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
+ SelectionDAG &DAG) {
+ SDLoc DL(N);
+
+ SDValue Pred = N->getOperand(1);
+ SDValue VecToReduce = N->getOperand(2);
+
+ // NOTE: The integer reduction's result type is not always linked to the
+ // operand's element type so we construct it from the intrinsic's result type.
+ EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
+ SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
+
+ // SVE reductions set the whole vector register with the first element
+ // containing the reduction result, which we'll now extract.
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
+ Zero);
+}
+
static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
SelectionDAG &DAG) {
SDLoc DL(N);
@@ -11910,6 +13533,25 @@ static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
Zero);
}
+// If a merged operation has no inactive lanes we can relax it to a predicated
+// or unpredicated operation, which potentially allows better isel (perhaps
+// using immediate forms) or relaxing register reuse requirements.
+static SDValue convertMergedOpToPredOp(SDNode *N, unsigned PredOpc,
+ SelectionDAG &DAG) {
+ assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
+ assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
+ SDValue Pg = N->getOperand(1);
+
+ // ISD way to specify an all active predicate.
+ if ((Pg.getOpcode() == AArch64ISD::PTRUE) &&
+ (Pg.getConstantOperandVal(0) == AArch64SVEPredPattern::all))
+ return DAG.getNode(PredOpc, SDLoc(N), N->getValueType(0), Pg,
+ N->getOperand(2), N->getOperand(3));
+
+ // FUTURE: SplatVector(true)
+ return SDValue();
+}
+
static SDValue performIntrinsicCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
@@ -11964,20 +13606,28 @@ static SDValue performIntrinsicCombine(SDNode *N,
case Intrinsic::aarch64_crc32h:
case Intrinsic::aarch64_crc32ch:
return tryCombineCRC32(0xffff, N, DAG);
+ case Intrinsic::aarch64_sve_saddv:
+ // There is no i64 version of SADDV because the sign is irrelevant.
+ if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
+ return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
+ else
+ return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
+ case Intrinsic::aarch64_sve_uaddv:
+ return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
case Intrinsic::aarch64_sve_smaxv:
- return LowerSVEIntReduction(N, AArch64ISD::SMAXV_PRED, DAG);
+ return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
case Intrinsic::aarch64_sve_umaxv:
- return LowerSVEIntReduction(N, AArch64ISD::UMAXV_PRED, DAG);
+ return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
case Intrinsic::aarch64_sve_sminv:
- return LowerSVEIntReduction(N, AArch64ISD::SMINV_PRED, DAG);
+ return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
case Intrinsic::aarch64_sve_uminv:
- return LowerSVEIntReduction(N, AArch64ISD::UMINV_PRED, DAG);
+ return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
case Intrinsic::aarch64_sve_orv:
- return LowerSVEIntReduction(N, AArch64ISD::ORV_PRED, DAG);
+ return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
case Intrinsic::aarch64_sve_eorv:
- return LowerSVEIntReduction(N, AArch64ISD::EORV_PRED, DAG);
+ return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
case Intrinsic::aarch64_sve_andv:
- return LowerSVEIntReduction(N, AArch64ISD::ANDV_PRED, DAG);
+ return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
case Intrinsic::aarch64_sve_index:
return LowerSVEIntrinsicIndex(N, DAG);
case Intrinsic::aarch64_sve_dup:
@@ -11988,26 +13638,19 @@ static SDValue performIntrinsicCombine(SDNode *N,
case Intrinsic::aarch64_sve_ext:
return LowerSVEIntrinsicEXT(N, DAG);
case Intrinsic::aarch64_sve_smin:
- return DAG.getNode(AArch64ISD::SMIN_MERGE_OP1, SDLoc(N), N->getValueType(0),
- N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ return convertMergedOpToPredOp(N, AArch64ISD::SMIN_PRED, DAG);
case Intrinsic::aarch64_sve_umin:
- return DAG.getNode(AArch64ISD::UMIN_MERGE_OP1, SDLoc(N), N->getValueType(0),
- N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ return convertMergedOpToPredOp(N, AArch64ISD::UMIN_PRED, DAG);
case Intrinsic::aarch64_sve_smax:
- return DAG.getNode(AArch64ISD::SMAX_MERGE_OP1, SDLoc(N), N->getValueType(0),
- N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ return convertMergedOpToPredOp(N, AArch64ISD::SMAX_PRED, DAG);
case Intrinsic::aarch64_sve_umax:
- return DAG.getNode(AArch64ISD::UMAX_MERGE_OP1, SDLoc(N), N->getValueType(0),
- N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ return convertMergedOpToPredOp(N, AArch64ISD::UMAX_PRED, DAG);
case Intrinsic::aarch64_sve_lsl:
- return DAG.getNode(AArch64ISD::SHL_MERGE_OP1, SDLoc(N), N->getValueType(0),
- N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ return convertMergedOpToPredOp(N, AArch64ISD::SHL_PRED, DAG);
case Intrinsic::aarch64_sve_lsr:
- return DAG.getNode(AArch64ISD::SRL_MERGE_OP1, SDLoc(N), N->getValueType(0),
- N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ return convertMergedOpToPredOp(N, AArch64ISD::SRL_PRED, DAG);
case Intrinsic::aarch64_sve_asr:
- return DAG.getNode(AArch64ISD::SRA_MERGE_OP1, SDLoc(N), N->getValueType(0),
- N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ return convertMergedOpToPredOp(N, AArch64ISD::SRA_PRED, DAG);
case Intrinsic::aarch64_sve_cmphs:
if (!N->getOperand(2).getValueType().isFloatingPoint())
return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
@@ -12100,18 +13743,15 @@ static SDValue performExtendCombine(SDNode *N,
// helps the backend to decide that an sabdl2 would be useful, saving a real
// extract_high operation.
if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
- N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
+ (N->getOperand(0).getOpcode() == AArch64ISD::UABD ||
+ N->getOperand(0).getOpcode() == AArch64ISD::SABD)) {
SDNode *ABDNode = N->getOperand(0).getNode();
- unsigned IID = getIntrinsicID(ABDNode);
- if (IID == Intrinsic::aarch64_neon_sabd ||
- IID == Intrinsic::aarch64_neon_uabd) {
- SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
- if (!NewABD.getNode())
- return SDValue();
+ SDValue NewABD =
+ tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
+ if (!NewABD.getNode())
+ return SDValue();
- return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
- NewABD);
- }
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
}
// This is effectively a custom type legalization for AArch64.
@@ -12594,6 +14234,31 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
S->getMemOperand()->getFlags());
}
+static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ EVT ResVT = N->getValueType(0);
+
+ // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
+ if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
+ if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
+ SDValue X = Op0.getOperand(0).getOperand(0);
+ return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
+ }
+ }
+
+ // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
+ if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
+ if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
+ SDValue Z = Op1.getOperand(0).getOperand(1);
+ return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
+ }
+ }
+
+ return SDValue();
+}
+
/// Target-specific DAG combine function for post-increment LD1 (lane) and
/// post-increment LD1R.
static SDValue performPostLD1Combine(SDNode *N,
@@ -12732,6 +14397,54 @@ static SDValue performSTORECombine(SDNode *N,
return SDValue();
}
+static SDValue performMaskedGatherScatterCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
+ assert(MGS && "Can only combine gather load or scatter store nodes");
+
+ SDLoc DL(MGS);
+ SDValue Chain = MGS->getChain();
+ SDValue Scale = MGS->getScale();
+ SDValue Index = MGS->getIndex();
+ SDValue Mask = MGS->getMask();
+ SDValue BasePtr = MGS->getBasePtr();
+ ISD::MemIndexType IndexType = MGS->getIndexType();
+
+ EVT IdxVT = Index.getValueType();
+
+ if (DCI.isBeforeLegalize()) {
+ // SVE gather/scatter requires indices of i32/i64. Promote anything smaller
+ // prior to legalisation so the result can be split if required.
+ if ((IdxVT.getVectorElementType() == MVT::i8) ||
+ (IdxVT.getVectorElementType() == MVT::i16)) {
+ EVT NewIdxVT = IdxVT.changeVectorElementType(MVT::i32);
+ if (MGS->isIndexSigned())
+ Index = DAG.getNode(ISD::SIGN_EXTEND, DL, NewIdxVT, Index);
+ else
+ Index = DAG.getNode(ISD::ZERO_EXTEND, DL, NewIdxVT, Index);
+
+ if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
+ SDValue PassThru = MGT->getPassThru();
+ SDValue Ops[] = { Chain, PassThru, Mask, BasePtr, Index, Scale };
+ return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other),
+ PassThru.getValueType(), DL, Ops,
+ MGT->getMemOperand(),
+ MGT->getIndexType(), MGT->getExtensionType());
+ } else {
+ auto *MSC = cast<MaskedScatterSDNode>(MGS);
+ SDValue Data = MSC->getValue();
+ SDValue Ops[] = { Chain, Data, Mask, BasePtr, Index, Scale };
+ return DAG.getMaskedScatter(DAG.getVTList(MVT::Other),
+ MSC->getMemoryVT(), DL, Ops,
+ MSC->getMemOperand(), IndexType,
+ MSC->isTruncatingStore());
+ }
+ }
+ }
+
+ return SDValue();
+}
/// Target-specific DAG combine function for NEON load/store intrinsics
/// to merge base address updates.
@@ -13703,9 +15416,6 @@ static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
static SDValue
performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
- if (DCI.isBeforeLegalizeOps())
- return SDValue();
-
SDLoc DL(N);
SDValue Src = N->getOperand(0);
unsigned Opc = Src->getOpcode();
@@ -13732,9 +15442,7 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
"Sign extending from an invalid type");
- EVT ExtVT = EVT::getVectorVT(*DAG.getContext(),
- VT.getVectorElementType(),
- VT.getVectorElementCount() * 2);
+ EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
ExtOp, DAG.getValueType(ExtVT));
@@ -13742,6 +15450,12 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
}
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ if (!EnableCombineMGatherIntrinsics)
+ return SDValue();
+
// SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
// for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
unsigned NewOpc;
@@ -13881,9 +15595,11 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
default:
LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
break;
+ case ISD::ABS:
+ return performABSCombine(N, DAG, DCI, Subtarget);
case ISD::ADD:
case ISD::SUB:
- return performAddSubLongCombine(N, DCI, DAG);
+ return performAddSubCombine(N, DCI, DAG);
case ISD::XOR:
return performXorCombine(N, DAG, DCI, Subtarget);
case ISD::MUL:
@@ -13910,6 +15626,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performExtendCombine(N, DCI, DAG);
case ISD::SIGN_EXTEND_INREG:
return performSignExtendInRegCombine(N, DCI, DAG);
+ case ISD::TRUNCATE:
+ return performVectorTruncateCombine(N, DCI, DAG);
case ISD::CONCAT_VECTORS:
return performConcatVectorsCombine(N, DCI, DAG);
case ISD::SELECT:
@@ -13922,6 +15640,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
break;
case ISD::STORE:
return performSTORECombine(N, DCI, DAG, Subtarget);
+ case ISD::MGATHER:
+ case ISD::MSCATTER:
+ return performMaskedGatherScatterCombine(N, DCI, DAG);
case AArch64ISD::BRCOND:
return performBRCONDCombine(N, DCI, DAG);
case AArch64ISD::TBNZ:
@@ -13933,8 +15654,14 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performPostLD1Combine(N, DCI, false);
case AArch64ISD::NVCAST:
return performNVCASTCombine(N);
+ case AArch64ISD::UZP1:
+ return performUzpCombine(N, DAG);
case ISD::INSERT_VECTOR_ELT:
return performPostLD1Combine(N, DCI, true);
+ case ISD::EXTRACT_VECTOR_ELT:
+ return performExtractVectorEltCombine(N, DAG);
+ case ISD::VECREDUCE_ADD:
+ return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
@@ -14083,10 +15810,10 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
EVT ResVT = N->getValueType(0);
- uint64_t NumLanes = ResVT.getVectorElementCount().Min;
+ uint64_t NumLanes = ResVT.getVectorElementCount().getKnownMinValue();
+ SDValue ExtIdx = DAG.getVectorIdxConstant(IdxConst * NumLanes, DL);
SDValue Val =
- DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Src1,
- DAG.getConstant(IdxConst * NumLanes, DL, MVT::i32));
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Src1, ExtIdx);
return DAG.getMergeValues({Val, Chain}, DL);
}
case Intrinsic::aarch64_sve_tuple_set: {
@@ -14097,10 +15824,11 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
SDValue Vec = N->getOperand(4);
EVT TupleVT = Tuple.getValueType();
- uint64_t TupleLanes = TupleVT.getVectorElementCount().Min;
+ uint64_t TupleLanes = TupleVT.getVectorElementCount().getKnownMinValue();
uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
- uint64_t NumLanes = Vec.getValueType().getVectorElementCount().Min;
+ uint64_t NumLanes =
+ Vec.getValueType().getVectorElementCount().getKnownMinValue();
if ((TupleLanes % NumLanes) != 0)
report_fatal_error("invalid tuple vector!");
@@ -14112,9 +15840,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
if (I == IdxConst)
Opnds.push_back(Vec);
else {
- Opnds.push_back(
- DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Vec.getValueType(), Tuple,
- DAG.getConstant(I * NumLanes, DL, MVT::i32)));
+ SDValue ExtIdx = DAG.getVectorIdxConstant(I * NumLanes, DL);
+ Opnds.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
+ Vec.getValueType(), Tuple, ExtIdx));
}
}
SDValue Concat =
@@ -14336,7 +16064,7 @@ void AArch64TargetLowering::ReplaceExtractSubVectorResults(
ElementCount ResEC = VT.getVectorElementCount();
- if (InVT.getVectorElementCount().Min != (ResEC.Min * 2))
+ if (InVT.getVectorElementCount() != (ResEC * 2))
return;
auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
@@ -14344,7 +16072,7 @@ void AArch64TargetLowering::ReplaceExtractSubVectorResults(
return;
unsigned Index = CIndex->getZExtValue();
- if ((Index != 0) && (Index != ResEC.Min))
+ if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
return;
unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
@@ -14379,7 +16107,7 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N,
assert(N->getValueType(0) == MVT::i128 &&
"AtomicCmpSwap on types less than 128 should be legal");
- if (Subtarget->hasLSE()) {
+ if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
// LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
// so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
SDValue Ops[] = {
@@ -14460,7 +16188,8 @@ void AArch64TargetLowering::ReplaceNodeResults(
return;
case ISD::CTPOP:
- Results.push_back(LowerCTPOP(SDValue(N, 0), DAG));
+ if (SDValue Result = LowerCTPOP(SDValue(N, 0), DAG))
+ Results.push_back(Result);
return;
case AArch64ISD::SADDV:
ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
@@ -14608,14 +16337,30 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
// Nand not supported in LSE.
if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC;
// Leave 128 bits to LLSC.
- return (Subtarget->hasLSE() && Size < 128) ? AtomicExpansionKind::None : AtomicExpansionKind::LLSC;
+ if (Subtarget->hasLSE() && Size < 128)
+ return AtomicExpansionKind::None;
+ if (Subtarget->outlineAtomics() && Size < 128) {
+ // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
+ // Don't outline them unless
+ // (1) high level <atomic> support approved:
+ // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
+ // (2) low level libgcc and compiler-rt support implemented by:
+ // min/max outline atomics helpers
+ if (AI->getOperation() != AtomicRMWInst::Min &&
+ AI->getOperation() != AtomicRMWInst::Max &&
+ AI->getOperation() != AtomicRMWInst::UMin &&
+ AI->getOperation() != AtomicRMWInst::UMax) {
+ return AtomicExpansionKind::None;
+ }
+ }
+ return AtomicExpansionKind::LLSC;
}
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
AtomicCmpXchgInst *AI) const {
// If subtarget has LSE, leave cmpxchg intact for codegen.
- if (Subtarget->hasLSE())
+ if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
return AtomicExpansionKind::None;
// At -O0, fast-regalloc cannot cope with the live vregs necessary to
// implement cmpxchg without spilling. If the address being exchanged is also
@@ -15126,6 +16871,92 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
Store->isTruncatingStore());
}
+SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ EVT EltVT = VT.getVectorElementType();
+
+ bool Signed = Op.getOpcode() == ISD::SDIV;
+ unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
+
+ // Scalable vector i32/i64 DIV is supported.
+ if (EltVT == MVT::i32 || EltVT == MVT::i64)
+ return LowerToPredicatedOp(Op, DAG, PredOpcode, /*OverrideNEON=*/true);
+
+ // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+ EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
+ EVT FixedWidenedVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
+ EVT ScalableWidenedVT = getContainerForFixedLengthVector(DAG, FixedWidenedVT);
+
+ // Convert the operands to scalable vectors.
+ SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
+ SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
+
+ // Extend the scalable operands.
+ unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
+ unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
+ SDValue Op0Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op0);
+ SDValue Op1Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op1);
+ SDValue Op0Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op0);
+ SDValue Op1Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op1);
+
+ // Convert back to fixed vectors so the DIV can be further lowered.
+ Op0Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op0Lo);
+ Op1Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op1Lo);
+ Op0Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op0Hi);
+ Op1Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op1Hi);
+ SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
+ Op0Lo, Op1Lo);
+ SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
+ Op0Hi, Op1Hi);
+
+ // Convert again to scalable vectors to truncate.
+ ResultLo = convertToScalableVector(DAG, ScalableWidenedVT, ResultLo);
+ ResultHi = convertToScalableVector(DAG, ScalableWidenedVT, ResultHi);
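+ // UZP1 keeps the even-indexed narrow elements (the low halves of the widened
+ // results), truncating and concatenating the lo/hi halves in one step.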
+ SDValue ScalableResult = DAG.getNode(AArch64ISD::UZP1, dl, ContainerVT,
+ ResultLo, ResultHi);
+
+ return convertFromScalableVector(DAG, VT, ScalableResult);
+}
+
+SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ SDLoc DL(Op);
+ SDValue Val = Op.getOperand(0);
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
+ Val = convertToScalableVector(DAG, ContainerVT, Val);
+
+ bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
+ unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
+
+ // Repeatedly unpack Val until the result is of the desired element type.
+ switch (ContainerVT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("unimplemented container type");
+ case MVT::nxv16i8:
+ Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
+ if (VT.getVectorElementType() == MVT::i16)
+ break;
+ LLVM_FALLTHROUGH;
+ case MVT::nxv8i16:
+ Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
+ if (VT.getVectorElementType() == MVT::i32)
+ break;
+ LLVM_FALLTHROUGH;
+ case MVT::nxv4i32:
+ Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
+ assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
+ break;
+ }
+
+ return convertFromScalableVector(DAG, VT, Val);
+}
+
SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
@@ -15162,17 +16993,21 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
return convertFromScalableVector(DAG, VT, Val);
}
+// Convert vector operation 'Op' to an equivalent predicated operation whereby
+// the original operation's type is used to construct a suitable predicate.
+// NOTE: The results for inactive lanes are undefined.
SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
SelectionDAG &DAG,
- unsigned NewOp) const {
+ unsigned NewOp,
+ bool OverrideNEON) const {
EVT VT = Op.getValueType();
SDLoc DL(Op);
auto Pg = getPredicateForVector(DAG, DL, VT);
- if (useSVEForFixedLengthVectorVT(VT)) {
+ if (useSVEForFixedLengthVectorVT(VT, OverrideNEON)) {
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
- // Create list of operands by convereting existing ones to scalable types.
+ // Create list of operands by converting existing ones to scalable types.
SmallVector<SDValue, 4> Operands = {Pg};
for (const SDValue &V : Op->op_values()) {
if (isa<CondCodeSDNode>(V)) {
@@ -15180,11 +17015,21 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
continue;
}
- assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
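+ // Rebuild value-type operands (e.g. the VT of a SIGN_EXTEND_INREG) so their
+ // element count matches the scalable container.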
+ if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
+ EVT VTArg = VTNode->getVT().getVectorElementType();
+ EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
+ Operands.push_back(DAG.getValueType(NewVTArg));
+ continue;
+ }
+
+ assert(useSVEForFixedLengthVectorVT(V.getValueType(), OverrideNEON) &&
"Only fixed length vectors are supported!");
Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
}
+ if (isMergePassthruOpcode(NewOp))
+ Operands.push_back(DAG.getUNDEF(ContainerVT));
+
auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
return convertFromScalableVector(DAG, VT, ScalableRes);
}
@@ -15193,10 +17038,228 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
SmallVector<SDValue, 4> Operands = {Pg};
for (const SDValue &V : Op->op_values()) {
- assert((isa<CondCodeSDNode>(V) || V.getValueType().isScalableVector()) &&
+ assert((!V.getValueType().isVector() ||
+ V.getValueType().isScalableVector()) &&
"Only scalable vectors are supported!");
Operands.push_back(V);
}
+ if (isMergePassthruOpcode(NewOp))
+ Operands.push_back(DAG.getUNDEF(VT));
+
return DAG.getNode(NewOp, DL, VT, Operands);
}
+
+// If a fixed length vector operation has no side effects when applied to
+// undefined elements, we can safely use scalable vectors to perform the same
+// operation without needing to worry about predication.
+SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(useSVEForFixedLengthVectorVT(VT) &&
+ "Only expected to lower fixed length vector operation!");
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+
+ // Create list of operands by converting existing ones to scalable types.
+ SmallVector<SDValue, 4> Ops;
+ for (const SDValue &V : Op->op_values()) {
+ assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
+
+ // Pass through non-vector operands.
+ if (!V.getValueType().isVector()) {
+ Ops.push_back(V);
+ continue;
+ }
+
+ // "cast" fixed length vector to a scalable vector.
+ assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
+ "Only fixed length vectors are supported!");
+ Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
+ }
+
+ auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
+ return convertFromScalableVector(DAG, VT, ScalableRes);
+}
+
+SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
+ SelectionDAG &DAG) const {
+ SDLoc DL(ScalarOp);
+ SDValue AccOp = ScalarOp.getOperand(0);
+ SDValue VecOp = ScalarOp.getOperand(1);
+ EVT SrcVT = VecOp.getValueType();
+ EVT ResVT = SrcVT.getVectorElementType();
+
+ EVT ContainerVT = SrcVT;
+ if (SrcVT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
+ VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
+ }
+
+ SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+
+ // Insert the scalar accumulator into lane 0 of an otherwise undef scalable vector.
+ AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
+ DAG.getUNDEF(ContainerVT), AccOp, Zero);
+
+ // Perform reduction.
+ SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
+ Pg, AccOp, VecOp);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
+}
+
+SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
+ SelectionDAG &DAG) const {
+ SDLoc DL(ReduceOp);
+ SDValue Op = ReduceOp.getOperand(0);
+ EVT OpVT = Op.getValueType();
+ EVT VT = ReduceOp.getValueType();
+
+ if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
+ return SDValue();
+
+ SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
+
+ switch (ReduceOp.getOpcode()) {
+ default:
+ return SDValue();
+ case ISD::VECREDUCE_OR:
+ return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
+ case ISD::VECREDUCE_AND: {
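+ // The AND reduction is true when no active lane of the inverted input
+ // (Op XOR Pg) is set.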
+ Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
+ return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
+ }
+ case ISD::VECREDUCE_XOR: {
+ SDValue ID =
+ DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
+ SDValue Cntp =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
+ return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
+ SDValue ScalarOp,
+ SelectionDAG &DAG) const {
+ SDLoc DL(ScalarOp);
+ SDValue VecOp = ScalarOp.getOperand(0);
+ EVT SrcVT = VecOp.getValueType();
+
+ if (useSVEForFixedLengthVectorVT(SrcVT, true)) {
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
+ VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
+ }
+
+ // UADDV always returns an i64 result.
+ EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
+ SrcVT.getVectorElementType();
+ EVT RdxVT = SrcVT;
+ if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
+ RdxVT = getPackedSVEVectorVT(ResVT);
+
+ SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
+ SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
+ Rdx, DAG.getConstant(0, DL, MVT::i64));
+
+ // The VEC_REDUCE nodes expect an element size result.
+ if (ResVT != ScalarOp.getValueType())
+ Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
+
+ return Res;
+}
+
+SDValue
+AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+
+ EVT InVT = Op.getOperand(1).getValueType();
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
+ SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
+ SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
+
+ // Convert the mask to a predicate (NOTE: We don't need to worry about
+ // inactive lanes since VSELECT is safe when given undefined elements).
+ EVT MaskVT = Op.getOperand(0).getValueType();
+ EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
+ auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
+ Mask = DAG.getNode(ISD::TRUNCATE, DL,
+ MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
+
+ auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
+ Mask, Op1, Op2);
+
+ return convertFromScalableVector(DAG, VT, ScalableRes);
+}
+
+SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT InVT = Op.getOperand(0).getValueType();
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
+
+ assert(useSVEForFixedLengthVectorVT(InVT) &&
+ "Only expected to lower fixed length vector operation!");
+ assert(Op.getValueType() == InVT.changeTypeToInteger() &&
+ "Expected integer result of the same bit length as the inputs!");
+
+ // Expand floating point vector comparisons.
+ if (InVT.isFloatingPoint())
+ return SDValue();
+
+ auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
+ auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
+ auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
+
+ EVT CmpVT = Pg.getValueType();
+ auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
+ {Pg, Op1, Op2, Op.getOperand(2)});
+
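+ // Extend the i1 comparison result to a full-width integer vector before
+ // converting back to the fixed length result type.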
+ EVT PromoteVT = ContainerVT.changeTypeToInteger();
+ auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
+ return convertFromScalableVector(DAG, Op.getValueType(), Promote);
+}
+
+SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT InVT = Op.getValueType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ (void)TLI;
+
+ assert(VT.isScalableVector() && TLI.isTypeLegal(VT) &&
+ InVT.isScalableVector() && TLI.isTypeLegal(InVT) &&
+ "Only expect to cast between legal scalable vector types!");
+ assert((VT.getVectorElementType() == MVT::i1) ==
+ (InVT.getVectorElementType() == MVT::i1) &&
+ "Cannot cast between data and predicate scalable vector types!");
+
+ if (InVT == VT)
+ return Op;
+
+ if (VT.getVectorElementType() == MVT::i1)
+ return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
+
+ EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
+ EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
+ assert((VT == PackedVT || InVT == PackedInVT) &&
+ "Cannot cast between unpacked scalable vector types!");
+
+ // Pack input if required.
+ if (InVT != PackedInVT)
+ Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
+
+ Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
+
+ // Unpack result if required.
+ if (VT != PackedVT)
+ Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
+
+ return Op;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 4fe77481706b..9550197159e6 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -72,19 +72,50 @@ enum NodeType : unsigned {
ADC,
SBC, // adc, sbc instructions
- // Arithmetic instructions
+ // Predicated instructions where inactive lanes produce undefined results.
ADD_PRED,
FADD_PRED,
+ FDIV_PRED,
+ FMA_PRED,
+ FMAXNM_PRED,
+ FMINNM_PRED,
+ FMUL_PRED,
+ FSUB_PRED,
+ MUL_PRED,
SDIV_PRED,
+ SHL_PRED,
+ SMAX_PRED,
+ SMIN_PRED,
+ SRA_PRED,
+ SRL_PRED,
+ SUB_PRED,
UDIV_PRED,
- FMA_PRED,
- SMIN_MERGE_OP1,
- UMIN_MERGE_OP1,
- SMAX_MERGE_OP1,
- UMAX_MERGE_OP1,
- SHL_MERGE_OP1,
- SRL_MERGE_OP1,
- SRA_MERGE_OP1,
+ UMAX_PRED,
+ UMIN_PRED,
+
+ // Predicated instructions with the result of inactive lanes provided by the
+ // last operand.
+ FABS_MERGE_PASSTHRU,
+ FCEIL_MERGE_PASSTHRU,
+ FFLOOR_MERGE_PASSTHRU,
+ FNEARBYINT_MERGE_PASSTHRU,
+ FNEG_MERGE_PASSTHRU,
+ FRECPX_MERGE_PASSTHRU,
+ FRINT_MERGE_PASSTHRU,
+ FROUND_MERGE_PASSTHRU,
+ FROUNDEVEN_MERGE_PASSTHRU,
+ FSQRT_MERGE_PASSTHRU,
+ FTRUNC_MERGE_PASSTHRU,
+ FP_ROUND_MERGE_PASSTHRU,
+ FP_EXTEND_MERGE_PASSTHRU,
+ UINT_TO_FP_MERGE_PASSTHRU,
+ SINT_TO_FP_MERGE_PASSTHRU,
+ FCVTZU_MERGE_PASSTHRU,
+ FCVTZS_MERGE_PASSTHRU,
+ SIGN_EXTEND_INREG_MERGE_PASSTHRU,
+ ZERO_EXTEND_INREG_MERGE_PASSTHRU,
+ ABS_MERGE_PASSTHRU,
+ NEG_MERGE_PASSTHRU,
SETCC_MERGE_ZERO,
@@ -188,10 +219,18 @@ enum NodeType : unsigned {
SADDV,
UADDV,
+ // Vector halving addition
+ SHADD,
+ UHADD,
+
// Vector rounding halving addition
SRHADD,
URHADD,
+ // Absolute difference
+ UABD,
+ SABD,
+
// Vector across-lanes min/max
// Only the lower result lane is defined.
SMINV,
@@ -199,6 +238,8 @@ enum NodeType : unsigned {
SMAXV,
UMAXV,
+ SADDV_PRED,
+ UADDV_PRED,
SMAXV_PRED,
UMAXV_PRED,
SMINV_PRED,
@@ -207,9 +248,6 @@ enum NodeType : unsigned {
EORV_PRED,
ANDV_PRED,
- // Vector bitwise negation
- NOT,
-
// Vector bitwise insertion
BIT,
@@ -269,9 +307,14 @@ enum NodeType : unsigned {
PTEST,
PTRUE,
+ BITREVERSE_MERGE_PASSTHRU,
+ BSWAP_MERGE_PASSTHRU,
+ CTLZ_MERGE_PASSTHRU,
+ CTPOP_MERGE_PASSTHRU,
DUP_MERGE_PASSTHRU,
INDEX_VECTOR,
+ // Cast between vectors of the same element type that differ in length.
REINTERPRET_CAST,
LD1_MERGE_ZERO,
@@ -381,7 +424,11 @@ enum NodeType : unsigned {
LDP,
STP,
- STNP
+ STNP,
+
+ // Pseudo for an ObjC call that gets emitted together with a special `mov
+ // x29, x29` marker instruction.
+ CALL_RVMARKER
};
} // end namespace AArch64ISD
@@ -391,12 +438,14 @@ namespace {
// Any instruction that defines a 32-bit result zeros out the high half of the
// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
// be copying from a truncate. But any other 32-bit operation will zero-extend
-// up to 64 bits.
+// up to 64 bits. AssertSext/AssertZext aren't saying anything about the upper
+// 32 bits; they're probably just qualifying a CopyFromReg.
// FIXME: X86 also checks for CMOV here. Do we need something similar?
static inline bool isDef32(const SDNode &N) {
unsigned Opc = N.getOpcode();
return Opc != ISD::TRUNCATE && Opc != TargetOpcode::EXTRACT_SUBREG &&
- Opc != ISD::CopyFromReg;
+ Opc != ISD::CopyFromReg && Opc != ISD::AssertSext &&
+ Opc != ISD::AssertZext;
}
} // end anonymous namespace
@@ -455,12 +504,6 @@ public:
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
- /// Returns true if a cast between SrcAS and DestAS is a noop.
- bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
- // Addrspacecasts are always noops.
- return true;
- }
-
/// This method returns a target specific FastISel object, or null if the
/// target does not support "fast" ISel.
FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
@@ -741,9 +784,7 @@ public:
/// illegal as the original, thus leading to an infinite legalisation loop.
/// NOTE: Once BUILD_VECTOR is legal or can be custom lowered for all legal
/// vector types this override can be removed.
- bool mergeStoresAfterLegalization(EVT VT) const override {
- return !useSVEForFixedLengthVectors();
- }
+ bool mergeStoresAfterLegalization(EVT VT) const override;
private:
/// Keep a pointer to the AArch64Subtarget around so that we can
@@ -774,6 +815,10 @@ private:
SDValue ThisVal) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerMGATHER(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerMSCATTER(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
@@ -858,24 +903,28 @@ private:
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG,
- unsigned NewOp) const;
+ SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, unsigned NewOp,
+ bool OverrideNEON = false) const;
+ SDValue LowerToScalableOp(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG,
- RTLIB::Libcall Call) const;
+ SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;
@@ -890,7 +939,17 @@ private:
SDValue LowerSVEStructLoad(unsigned Intrinsic, ArrayRef<SDValue> LoadOps,
EVT VT, SelectionDAG &DAG, const SDLoc &DL) const;
+ SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op,
+ SelectionDAG &DAG) const;
+ SDValue LowerFixedLengthVectorIntExtendToSVE(SDValue Op,
+ SelectionDAG &DAG) const;
SDValue LowerFixedLengthVectorLoadToSVE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp, SelectionDAG &DAG) const;
+ SDValue LowerPredReductionToSVE(SDValue ScalarOp, SelectionDAG &DAG) const;
+ SDValue LowerReductionToSVE(unsigned Opcode, SDValue ScalarOp,
+ SelectionDAG &DAG) const;
+ SDValue LowerFixedLengthVectorSelectToSVE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFixedLengthVectorSetccToSVE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFixedLengthVectorStoreToSVE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFixedLengthVectorTruncateToSVE(SDValue Op,
SelectionDAG &DAG) const;
@@ -902,6 +961,10 @@ private:
bool Reciprocal) const override;
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &ExtraSteps) const override;
+ SDValue getSqrtInputTest(SDValue Operand, SelectionDAG &DAG,
+ const DenormalMode &Mode) const override;
+ SDValue getSqrtResultForDenormInput(SDValue Operand,
+ SelectionDAG &DAG) const override;
unsigned combineRepeatedFPDivisors() const override;
ConstraintType getConstraintType(StringRef Constraint) const override;
@@ -933,6 +996,7 @@ private:
return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
}
+ bool shouldRemoveExtendFromGSIndex(EVT VT) const override;
bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
@@ -959,8 +1023,21 @@ private:
bool shouldLocalize(const MachineInstr &MI,
const TargetTransformInfo *TTI) const override;
- bool useSVEForFixedLengthVectors() const;
- bool useSVEForFixedLengthVectorVT(EVT VT) const;
+ // Normally SVE is only used for fixed length vectors whose byte size does
+ // not fit within a NEON vector. This changes when OverrideNEON is true,
+ // allowing SVE to be used for 64-bit and 128-bit vectors as well.
+ bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON = false) const;
+
+ // With the exception of data-predicate transitions, no instructions are
+ // required to cast between legal scalable vector types. However:
+ // 1. Packed and unpacked types have different bit lengths, meaning BITCAST
+ // is not universally useable.
+ // 2. Most unpacked integer types are not legal and thus integer extends
+ // cannot be used to convert between unpacked and packed types.
+ // These can make "bitcasting" a multiphase process. REINTERPRET_CAST is used
+ // to transition between unpacked and packed types of the same element type,
+ // with BITCAST used otherwise.
+ SDValue getSVESafeBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) const;
};
namespace AArch64 {
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 4f4ba692c2db..cf08f56e5b08 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -60,10 +60,14 @@ class AArch64Inst<Format f, string cstr> : Instruction {
bits<2> Form = F.Value;
// Defaults
+ bit isWhile = 0;
+ bit isPTestLike = 0;
FalseLanesEnum FalseLanes = FalseLanesNone;
DestructiveInstTypeEnum DestructiveInstType = NotDestructive;
ElementSizeEnum ElementSize = ElementSizeNone;
+ let TSFlags{10} = isPTestLike;
+ let TSFlags{9} = isWhile;
let TSFlags{8-7} = FalseLanes.Value;
let TSFlags{6-3} = DestructiveInstType.Value;
let TSFlags{2-0} = ElementSize.Value;
@@ -263,6 +267,7 @@ def adrplabel : Operand<i64> {
let EncoderMethod = "getAdrLabelOpValue";
let PrintMethod = "printAdrpLabel";
let ParserMatchClass = AdrpOperand;
+ let OperandType = "OPERAND_PCREL";
}
def AdrOperand : AsmOperandClass {
@@ -325,7 +330,7 @@ def simm9 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -256 && Imm < 256; }]> {
}
def SImm8Operand : SImmOperand<8>;
-def simm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= -128 && Imm < 127; }]> {
+def simm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= -128 && Imm < 128; }]> {
let ParserMatchClass = SImm8Operand;
let DecoderMethod = "DecodeSImm<8>";
}
@@ -914,6 +919,13 @@ def imm0_1 : Operand<i64>, ImmLeaf<i64, [{
let ParserMatchClass = Imm0_1Operand;
}
+// timm0_1 - as above, but use TargetConstant (TImmLeaf)
+def timm0_1 : Operand<i64>, TImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 2;
+}]> {
+ let ParserMatchClass = Imm0_1Operand;
+}
+
// imm0_15 predicate - True if the immediate is in the range [0,15]
def imm0_15 : Operand<i64>, ImmLeaf<i64, [{
return ((uint64_t)Imm) < 16;
@@ -1289,8 +1301,9 @@ class SimpleSystemI<bit L, dag iops, string asm, string operands,
}
// System instructions which have an Rt register.
-class RtSystemI<bit L, dag oops, dag iops, string asm, string operands>
- : BaseSystemI<L, oops, iops, asm, operands>,
+class RtSystemI<bit L, dag oops, dag iops, string asm, string operands,
+ list<dag> pattern = []>
+ : BaseSystemI<L, oops, iops, asm, operands, pattern>,
Sched<[WriteSys]> {
bits<5> Rt;
let Inst{4-0} = Rt;
@@ -1318,6 +1331,16 @@ class TMSystemI<bits<4> CRm, string asm, list<dag> pattern>
let Inst{4-0} = Rt;
}
+// System instructions that pass a register argument
+// This class assumes the register is for input rather than output.
+class RegInputSystemI<bits<4> CRm, bits<3> Op2, string asm,
+ list<dag> pattern = []>
+ : RtSystemI<0, (outs), (ins GPR64:$Rt), asm, "\t$Rt", pattern> {
+ let Inst{20-12} = 0b000110001;
+ let Inst{11-8} = CRm;
+ let Inst{7-5} = Op2;
+}
+
// System instructions for transactional memory - no operand
class TMSystemINoOperand<bits<4> CRm, string asm, list<dag> pattern>
: TMBaseSystemI<0b0, CRm, 0b011, (outs), (ins), asm, "", pattern> {
@@ -1358,6 +1381,14 @@ def barrier_op : Operand<i32> {
let PrintMethod = "printBarrierOption";
let ParserMatchClass = BarrierAsmOperand;
}
+def BarriernXSAsmOperand : AsmOperandClass {
+ let Name = "BarriernXS";
+ let ParserMethod = "tryParseBarriernXSOperand";
+}
+def barrier_nxs_op : Operand<i32> {
+ let PrintMethod = "printBarriernXSOption";
+ let ParserMatchClass = BarriernXSAsmOperand;
+}
class CRmSystemI<Operand crmtype, bits<3> opc, string asm,
list<dag> pattern = []>
: SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm", pattern>,
@@ -1439,6 +1470,7 @@ class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg),
"mrs", "\t$Rt, $systemreg"> {
bits<16> systemreg;
let Inst{20-5} = systemreg;
+ let DecoderNamespace = "Fallback";
}
// FIXME: Some of these def NZCV, others don't. Best way to model that?
@@ -1448,6 +1480,7 @@ class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt),
"msr", "\t$systemreg, $Rt"> {
bits<16> systemreg;
let Inst{20-5} = systemreg;
+ let DecoderNamespace = "Fallback";
}
def SystemPStateFieldWithImm0_15Operand : AsmOperandClass {
@@ -1937,11 +1970,21 @@ class SignAuthTwoOperand<bits<4> opc, string asm,
let Inst{4-0} = Rd;
}
+class ClearAuth<bits<1> data, string asm>
+ : I<(outs GPR64:$Rd), (ins GPR64:$Rn), asm, "\t$Rd", "$Rd = $Rn", []>, Sched<[]> {
+ bits<5> Rd;
+ let Inst{31-11} = 0b110110101100000101000;
+ let Inst{10} = data;
+ let Inst{9-5} = 0b11111;
+ let Inst{4-0} = Rd;
+}
+
// Base class for the Armv8.4-A 8 and 16-bit flag manipulation instructions
class BaseFlagManipulation<bit sf, bit sz, dag iops, string asm, string ops>
: I<(outs), iops, asm, ops, "", []>,
Sched<[WriteI, ReadI, ReadI]> {
let Uses = [NZCV];
+ let Defs = [NZCV];
bits<5> Rn;
let Inst{31} = sf;
let Inst{30-15} = 0b0111010000000000;
@@ -3929,7 +3972,7 @@ class LoadPreIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
(outs GPR64sp:$wback, regtype:$Rt),
(ins GPR64sp:$Rn, simm9:$offset), asm,
"$Rn = $wback,@earlyclobber $wback", []>,
- Sched<[WriteLD, WriteAdr]>;
+ Sched<[WriteAdr, WriteLD]>;
let mayStore = 1, mayLoad = 0 in
class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
@@ -3975,7 +4018,7 @@ class LoadPostIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
(outs GPR64sp:$wback, regtype:$Rt),
(ins GPR64sp:$Rn, simm9:$offset),
asm, "$Rn = $wback,@earlyclobber $wback", []>,
- Sched<[WriteLD, WriteAdr]>;
+ Sched<[WriteAdr, WriteLD]>;
let mayStore = 1, mayLoad = 0 in
class StorePostIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
@@ -4072,7 +4115,7 @@ class LoadPairPreIdx<bits<2> opc, bit V, RegisterOperand regtype,
: BaseLoadStorePairPreIdx<opc, V, 1,
(outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2),
(ins GPR64sp:$Rn, indextype:$offset), asm>,
- Sched<[WriteLD, WriteLDHi, WriteAdr]>;
+ Sched<[WriteAdr, WriteLD, WriteLDHi]>;
let mayStore = 1, mayLoad = 0 in
class StorePairPreIdx<bits<2> opc, bit V, RegisterOperand regtype,
@@ -4113,7 +4156,7 @@ class LoadPairPostIdx<bits<2> opc, bit V, RegisterOperand regtype,
: BaseLoadStorePairPostIdx<opc, V, 1,
(outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2),
(ins GPR64sp:$Rn, idxtype:$offset), asm>,
- Sched<[WriteLD, WriteLDHi, WriteAdr]>;
+ Sched<[WriteAdr, WriteLD, WriteLDHi]>;
let mayStore = 1, mayLoad = 0 in
class StorePairPostIdx<bits<2> opc, bit V, RegisterOperand regtype,
@@ -7831,9 +7874,9 @@ class BaseSIMDThreeSameVectorBFDot<bit Q, bit U, string asm, string kind1,
multiclass SIMDThreeSameVectorBFDot<bit U, string asm> {
def v4bf16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64,
- v2f32, v8i8>;
+ v2f32, v4bf16>;
def v8bf16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128,
- v4f32, v16i8>;
+ v4f32, v8bf16>;
}
class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
@@ -7851,7 +7894,7 @@ class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
(InputType RegType:$Rn),
(InputType (bitconvert (AccumType
(AArch64duplane32 (v4f32 V128:$Rm),
- VectorIndexH:$idx)))))))]> {
+ VectorIndexS:$idx)))))))]> {
bits<2> idx;
let Inst{21} = idx{0}; // L
@@ -7861,16 +7904,16 @@ class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
multiclass SIMDThreeSameVectorBF16DotI<bit U, string asm> {
def v4bf16 : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h",
- ".2h", V64, v2f32, v8i8>;
+ ".2h", V64, v2f32, v4bf16>;
def v8bf16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h",
- ".2h", V128, v4f32, v16i8>;
+ ".2h", V128, v4f32, v8bf16>;
}
class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode>
: BaseSIMDThreeSameVectorTied<Q, 0b1, 0b110, 0b11111, V128, asm, ".4s",
[(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
- (v16i8 V128:$Rn),
- (v16i8 V128:$Rm)))]> {
+ (v8bf16 V128:$Rn),
+ (v8bf16 V128:$Rm)))]> {
let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}");
}
@@ -7880,10 +7923,10 @@ class SIMDBF16MLALIndex<bit Q, string asm, SDPatternOperator OpNode>
"{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst",
[(set (v4f32 V128:$dst),
(v4f32 (OpNode (v4f32 V128:$Rd),
- (v16i8 V128:$Rn),
- (v16i8 (bitconvert (v8bf16
+ (v8bf16 V128:$Rn),
+ (v8bf16
(AArch64duplane16 (v8bf16 V128_lo:$Rm),
- VectorIndexH:$idx)))))))]>,
+ VectorIndexH:$idx)))))]>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
@@ -7907,8 +7950,8 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm>
V128, asm, ".4s",
[(set (v4f32 V128:$dst),
(int_aarch64_neon_bfmmla (v4f32 V128:$Rd),
- (v16i8 V128:$Rn),
- (v16i8 V128:$Rm)))]> {
+ (v8bf16 V128:$Rn),
+ (v8bf16 V128:$Rm)))]> {
let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h",
", $Rm", ".8h", "}");
}
@@ -10586,14 +10629,14 @@ multiclass SIMDThreeSameVectorComplexHSD<bit U, bits<3> opcode, Operand rottype,
[(set (v4f16 V64:$dst), (OpNode (v4f16 V64:$Rd),
(v4f16 V64:$Rn),
(v4f16 V64:$Rm),
- (rottype i32:$rot)))]>;
+ (i32 rottype:$rot)))]>;
def v8f16 : BaseSIMDThreeSameVectorComplex<1, U, 0b01, opcode, V128, rottype,
asm, ".8h",
[(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd),
(v8f16 V128:$Rn),
(v8f16 V128:$Rm),
- (rottype i32:$rot)))]>;
+ (i32 rottype:$rot)))]>;
}
let Predicates = [HasComplxNum, HasNEON] in {
@@ -10602,21 +10645,21 @@ multiclass SIMDThreeSameVectorComplexHSD<bit U, bits<3> opcode, Operand rottype,
[(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd),
(v2f32 V64:$Rn),
(v2f32 V64:$Rm),
- (rottype i32:$rot)))]>;
+ (i32 rottype:$rot)))]>;
def v4f32 : BaseSIMDThreeSameVectorComplex<1, U, 0b10, opcode, V128, rottype,
asm, ".4s",
[(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
(v4f32 V128:$Rn),
(v4f32 V128:$Rm),
- (rottype i32:$rot)))]>;
+ (i32 rottype:$rot)))]>;
def v2f64 : BaseSIMDThreeSameVectorComplex<1, U, 0b11, opcode, V128, rottype,
asm, ".2d",
[(set (v2f64 V128:$dst), (OpNode (v2f64 V128:$Rd),
(v2f64 V128:$Rn),
(v2f64 V128:$Rm),
- (rottype i32:$rot)))]>;
+ (i32 rottype:$rot)))]>;
}
}
@@ -10658,14 +10701,14 @@ multiclass SIMDThreeSameVectorTiedComplexHSD<bit U, bits<3> opcode,
[(set (v4f16 V64:$dst), (OpNode (v4f16 V64:$Rd),
(v4f16 V64:$Rn),
(v4f16 V64:$Rm),
- (rottype i32:$rot)))]>;
+ (i32 rottype:$rot)))]>;
def v8f16 : BaseSIMDThreeSameVectorTiedComplex<1, U, 0b01, opcode, V128,
rottype, asm, ".8h",
[(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd),
(v8f16 V128:$Rn),
(v8f16 V128:$Rm),
- (rottype i32:$rot)))]>;
+ (i32 rottype:$rot)))]>;
}
let Predicates = [HasComplxNum, HasNEON] in {
@@ -10674,21 +10717,21 @@ multiclass SIMDThreeSameVectorTiedComplexHSD<bit U, bits<3> opcode,
[(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd),
(v2f32 V64:$Rn),
(v2f32 V64:$Rm),
- (rottype i32:$rot)))]>;
+ (i32 rottype:$rot)))]>;
def v4f32 : BaseSIMDThreeSameVectorTiedComplex<1, U, 0b10, opcode, V128,
rottype, asm, ".4s",
[(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
(v4f32 V128:$Rn),
(v4f32 V128:$Rm),
- (rottype i32:$rot)))]>;
+ (i32 rottype:$rot)))]>;
def v2f64 : BaseSIMDThreeSameVectorTiedComplex<1, U, 0b11, opcode, V128,
rottype, asm, ".2d",
[(set (v2f64 V128:$dst), (OpNode (v2f64 V128:$Rd),
(v2f64 V128:$Rn),
(v2f64 V128:$Rm),
- (rottype i32:$rot)))]>;
+ (i32 rottype:$rot)))]>;
}
}
@@ -11216,6 +11259,35 @@ multiclass STOPregister<string asm, string instr> {
!cast<Instruction>(instr # "X")>;
}
+class LoadStore64B_base<bits<3> opc, string asm_inst, string asm_ops,
+ dag iops, dag oops, list<dag> pat>
+ : I<oops, iops, asm_inst, asm_ops, "", pat>,
+ Sched<[]> /* FIXME: fill in scheduling details once known */ {
+ bits<5> Rt;
+ bits<5> Rn;
+ let Inst{31-21} = 0b11111000001;
+ let Inst{15} = 1;
+ let Inst{14-12} = opc;
+ let Inst{11-10} = 0b00;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rt;
+
+ let Predicates = [HasV8_7a];
+}
+
+class LoadStore64B<bits<3> opc, string asm_inst, dag iops, dag oops,
+ list<dag> pat = []>
+ : LoadStore64B_base<opc, asm_inst, "\t$Rt, [$Rn]", iops, oops, pat> {
+ let Inst{20-16} = 0b11111;
+}
+
+class Store64BV<bits<3> opc, string asm_inst, list<dag> pat = []>
+ : LoadStore64B_base<opc, asm_inst, "\t$Rs, $Rt, [$Rn]",
+ (ins GPR64x8:$Rt, GPR64sp:$Rn), (outs GPR64:$Rs), pat> {
+ bits<5> Rs;
+ let Inst{20-16} = Rs;
+}
+
//----------------------------------------------------------------------------
// Allow the size specifier tokens to be upper case, not just lower.
def : TokenAlias<".4B", ".4b">; // Add dot product
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index a0e7c782f68c..25656fac1d2f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -88,6 +88,29 @@ def G_DUP: AArch64GenericInstruction {
let InOperandList = (ins type1:$lane);
let hasSideEffects = 0;
}
+
+// Represents a lane duplicate operation.
+def G_DUPLANE8 : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src, type1:$lane);
+ let hasSideEffects = 0;
+}
+def G_DUPLANE16 : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src, type1:$lane);
+ let hasSideEffects = 0;
+}
+def G_DUPLANE32 : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src, type1:$lane);
+ let hasSideEffects = 0;
+}
+def G_DUPLANE64 : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src, type1:$lane);
+ let hasSideEffects = 0;
+}
+
// Represents a trn1 instruction. Produced post-legalization from
// G_SHUFFLE_VECTORs with appropriate masks.
def G_TRN1 : AArch64GenericInstruction {
@@ -111,6 +134,28 @@ def G_EXT: AArch64GenericInstruction {
let InOperandList = (ins type0:$v1, type0:$v2, untyped_imm_0:$imm);
}
+// Represents a vector G_ASHR with an immediate.
+def G_VASHR : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src1, untyped_imm_0:$imm);
+}
+
+// Represents a vector G_LSHR with an immediate.
+def G_VLSHR : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src1, untyped_imm_0:$imm);
+}
+
+// Represents an integer to FP conversion on the FPR bank.
+def G_SITOF : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src);
+}
+def G_UITOF : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src);
+}
+
def : GINodeEquiv<G_REV16, AArch64rev16>;
def : GINodeEquiv<G_REV32, AArch64rev32>;
def : GINodeEquiv<G_REV64, AArch64rev64>;
@@ -119,6 +164,21 @@ def : GINodeEquiv<G_UZP2, AArch64uzp2>;
def : GINodeEquiv<G_ZIP1, AArch64zip1>;
def : GINodeEquiv<G_ZIP2, AArch64zip2>;
def : GINodeEquiv<G_DUP, AArch64dup>;
+def : GINodeEquiv<G_DUPLANE8, AArch64duplane8>;
+def : GINodeEquiv<G_DUPLANE16, AArch64duplane16>;
+def : GINodeEquiv<G_DUPLANE32, AArch64duplane32>;
+def : GINodeEquiv<G_DUPLANE64, AArch64duplane64>;
def : GINodeEquiv<G_TRN1, AArch64trn1>;
def : GINodeEquiv<G_TRN2, AArch64trn2>;
def : GINodeEquiv<G_EXT, AArch64ext>;
+def : GINodeEquiv<G_VASHR, AArch64vashr>;
+def : GINodeEquiv<G_VLSHR, AArch64vlshr>;
+def : GINodeEquiv<G_SITOF, AArch64sitof>;
+def : GINodeEquiv<G_UITOF, AArch64uitof>;
+
+def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;
+
+// These are patterns that we only use for GlobalISel via the importer.
+def : Pat<(f32 (fadd (vector_extract (v2f32 FPR64:$Rn), (i64 0)),
+ (vector_extract (v2f32 FPR64:$Rn), (i64 1)))),
+ (f32 (FADDPv2i32p (v2f32 FPR64:$Rn)))>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 08f80c9aa361..6b38e216a854 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -107,6 +107,13 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
break;
+ case TargetOpcode::STATEPOINT:
+ NumBytes = StatepointOpers(&MI).getNumPatchBytes();
+ assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
+ // No patch bytes means a normal call inst is emitted
+ if (NumBytes == 0)
+ NumBytes = 4;
+ break;
case AArch64::TLSDESC_CALLSEQ:
// This gets lowered to an instruction sequence which takes 16 bytes
NumBytes = 16;
@@ -287,6 +294,31 @@ bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
}
}
+ // If we're allowed to modify and the block ends in an unconditional branch
+ // which could simply fall through, remove the branch. (Note: This case only
+ // matters when we can't understand the whole sequence; otherwise it's also
+ // handled by BranchFolding.cpp.)
+ if (AllowModify && isUncondBranchOpcode(LastOpc) &&
+ MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
+ LastInst->eraseFromParent();
+ LastInst = SecondLastInst;
+ LastOpc = LastInst->getOpcode();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
+ assert(!isUncondBranchOpcode(LastOpc) &&
+ "unreachable unconditional branches removed above");
+
+ if (isCondBranchOpcode(LastOpc)) {
+ // Block ends with fall-through condbranch.
+ parseCondBranch(LastInst, TBB, Cond);
+ return false;
+ }
+ return true; // Can't handle indirect branch.
+ } else {
+ SecondLastInst = &*I;
+ SecondLastOpc = SecondLastInst->getOpcode();
+ }
+ }
+
// If there are three terminators, we don't know what sort of block this is.
if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
return true;
@@ -321,6 +353,56 @@ bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
return true;
}
+bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
+ MachineBranchPredicate &MBP,
+ bool AllowModify) const {
+ // For the moment, handle only a block which ends with a cb(n)zx followed by
+ // a fallthrough. Why this? Because it is a common form.
+ // TODO: Should we handle b.cc?
+
+ MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+ if (I == MBB.end())
+ return true;
+
+ // Skip over SpeculationBarrierEndBB terminators
+ if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
+ I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
+ --I;
+ }
+
+ if (!isUnpredicatedTerminator(*I))
+ return true;
+
+ // Get the last instruction in the block.
+ MachineInstr *LastInst = &*I;
+ unsigned LastOpc = LastInst->getOpcode();
+ if (!isCondBranchOpcode(LastOpc))
+ return true;
+
+ switch (LastOpc) {
+ default:
+ return true;
+ case AArch64::CBZW:
+ case AArch64::CBZX:
+ case AArch64::CBNZW:
+ case AArch64::CBNZX:
+ break;
+ };
+
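+ // For CB(N)Z, operand 0 is the register being tested and operand 1 is the
+ // branch destination.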
+ MBP.TrueDest = LastInst->getOperand(1).getMBB();
+ assert(MBP.TrueDest && "expected!");
+ MBP.FalseDest = MBB.getNextNode();
+
+ MBP.ConditionDef = nullptr;
+ MBP.SingleUseCondition = false;
+
+ MBP.LHS = LastInst->getOperand(0);
+ MBP.RHS = MachineOperand::CreateImm(0);
+ MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
+ : MachineBranchPredicate::PRED_EQ;
+ return false;
+}
+
bool AArch64InstrInfo::reverseBranchCondition(
SmallVectorImpl<MachineOperand> &Cond) const {
if (Cond[0].getImm() != -1) {
@@ -1037,6 +1119,13 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
switch (MI.getOpcode()) {
default:
break;
+ case AArch64::PTEST_PP:
+ SrcReg = MI.getOperand(0).getReg();
+ SrcReg2 = MI.getOperand(1).getReg();
+ // Not sure about the mask and value for now...
+ CmpMask = ~0;
+ CmpValue = 0;
+ return true;
case AArch64::SUBSWrr:
case AArch64::SUBSWrs:
case AArch64::SUBSWrx:
@@ -1192,10 +1281,9 @@ static bool areCFlagsAccessedBetweenInstrs(
return true;
// From must be above To.
- assert(std::find_if(++To.getReverse(), To->getParent()->rend(),
- [From](MachineInstr &MI) {
- return MI.getIterator() == From;
- }) != To->getParent()->rend());
+ assert(std::any_of(
+ ++To.getReverse(), To->getParent()->rend(),
+ [From](MachineInstr &MI) { return MI.getIterator() == From; }));
// We iterate backward starting at \p To until we hit \p From.
for (const MachineInstr &Instr :
@@ -1208,6 +1296,127 @@ static bool areCFlagsAccessedBetweenInstrs(
return false;
}
+/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
+/// operation which could set the flags in an identical manner
+bool AArch64InstrInfo::optimizePTestInstr(
+ MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
+ const MachineRegisterInfo *MRI) const {
+ auto *Mask = MRI->getUniqueVRegDef(MaskReg);
+ auto *Pred = MRI->getUniqueVRegDef(PredReg);
+ auto NewOp = Pred->getOpcode();
+ bool OpChanged = false;
+
+ unsigned MaskOpcode = Mask->getOpcode();
+ unsigned PredOpcode = Pred->getOpcode();
+ bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
+ bool PredIsWhileLike = isWhileOpcode(PredOpcode);
+
+ if (isPTrueOpcode(MaskOpcode) && (PredIsPTestLike || PredIsWhileLike)) {
+ // For PTEST(PTRUE, OTHER_INST), PTEST is redundant when PTRUE doesn't
+ // deactivate any lanes OTHER_INST might set.
+ uint64_t MaskElementSize = getElementSizeForOpcode(MaskOpcode);
+ uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
+
+ // Must be an all active predicate of matching element size.
+ if ((PredElementSize != MaskElementSize) ||
+ (Mask->getOperand(1).getImm() != 31))
+ return false;
+
+ // Fall through to simply remove the PTEST.
+ } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike)) {
+ // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
+ // instruction that sets the flags as PTEST would.
+
+ // Fall through to simply remove the PTEST.
+ } else if (PredIsPTestLike) {
+ // For PTEST(PG_1, PTEST_LIKE(PG2, ...)), PTEST is redundant when both
+ // instructions use the same predicate.
+ auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
+ if (Mask != PTestLikeMask)
+ return false;
+
+ // Fall through to simply remove the PTEST.
+ } else {
+ switch (Pred->getOpcode()) {
+ case AArch64::BRKB_PPzP:
+ case AArch64::BRKPB_PPzPP: {
+ // Op 0 is chain, 1 is the mask, 2 the previous predicate to
+ // propagate, 3 the new predicate.
+
+ // Check to see if our mask is the same as the brkpb's. If
+ // not the resulting flag bits may be different and we
+ // can't remove the ptest.
+ auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
+ if (Mask != PredMask)
+ return false;
+
+ // Switch to the new opcode
+ NewOp = Pred->getOpcode() == AArch64::BRKB_PPzP ? AArch64::BRKBS_PPzP
+ : AArch64::BRKPBS_PPzPP;
+ OpChanged = true;
+ break;
+ }
+ case AArch64::BRKN_PPzP: {
+ auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
+ if (Mask != PredMask)
+ return false;
+
+ NewOp = AArch64::BRKNS_PPzP;
+ OpChanged = true;
+ break;
+ }
+ default:
+ // Bail out if we don't recognize the input
+ return false;
+ }
+ }
+
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+ // If the predicate is in a different block (possibly because it's been
+ // hoisted out), then assume the flags are set in between statements.
+ if (Pred->getParent() != PTest->getParent())
+ return false;
+
+ // If another instruction between the propagation and test sets the
+ // flags, don't remove the ptest.
+ MachineBasicBlock::iterator I = Pred, E = PTest;
+ ++I; // Skip past the predicate op itself.
+ for (; I != E; ++I) {
+ const MachineInstr &Inst = *I;
+
+ // TODO: If the ptest flags are unused, we could still remove it.
+ if (Inst.modifiesRegister(AArch64::NZCV, TRI))
+ return false;
+ }
+
+ // If we pass all the checks, it's safe to remove the PTEST and use the flags
+ // as they are prior to PTEST. Sometimes this requires the tested PTEST
+ // operand to be replaced with an equivalent instruction that also sets the
+ // flags.
+ Pred->setDesc(get(NewOp));
+ PTest->eraseFromParent();
+ if (OpChanged) {
+ bool succeeded = UpdateOperandRegClass(*Pred);
+ (void)succeeded;
+ assert(succeeded && "Operands have incompatible register classes!");
+ Pred->addRegisterDefined(AArch64::NZCV, TRI);
+ }
+
+ // Ensure that the flags def is live.
+ if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
+ unsigned i = 0, e = Pred->getNumOperands();
+ for (; i != e; ++i) {
+ MachineOperand &MO = Pred->getOperand(i);
+ if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
+ MO.setIsDead(false);
+ break;
+ }
+ }
+ }
+ return true;
+}
+
/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It can be truly compare
/// instruction
@@ -1246,6 +1455,9 @@ bool AArch64InstrInfo::optimizeCompareInstr(
return true;
}
+ if (CmpInstr.getOpcode() == AArch64::PTEST_PP)
+ return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
+
// Continue only if we have a "ri" where immediate is zero.
// FIXME:CmpValue has already been converted to 0 or 1 in analyzeCompare
// function.
@@ -2062,6 +2274,24 @@ bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
return true;
}
+Optional<ExtAddrMode>
+AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
+ const TargetRegisterInfo *TRI) const {
+ const MachineOperand *Base; // Filled with the base operand of MI.
+ int64_t Offset; // Filled with the offset of MI.
+ bool OffsetIsScalable;
+ if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
+ return None;
+
+ if (!Base->isReg())
+ return None;
+ ExtAddrMode AM;
+ AM.BaseReg = Base->getReg();
+ AM.Displacement = Offset;
+ AM.ScaledReg = 0;
+ return AM;
+}
+
bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
bool &OffsetIsScalable, unsigned &Width,
@@ -3060,7 +3290,7 @@ void AArch64InstrInfo::storeRegToStackSlot(
else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
Opc = AArch64::STR_PXI;
- StackID = TargetStackID::SVEVector;
+ StackID = TargetStackID::ScalableVector;
}
break;
case 4:
@@ -3104,7 +3334,7 @@ void AArch64InstrInfo::storeRegToStackSlot(
} else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
Opc = AArch64::STR_ZXI;
- StackID = TargetStackID::SVEVector;
+ StackID = TargetStackID::ScalableVector;
}
break;
case 24:
@@ -3126,7 +3356,7 @@ void AArch64InstrInfo::storeRegToStackSlot(
} else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
Opc = AArch64::STR_ZZXI;
- StackID = TargetStackID::SVEVector;
+ StackID = TargetStackID::ScalableVector;
}
break;
case 48:
@@ -3137,7 +3367,7 @@ void AArch64InstrInfo::storeRegToStackSlot(
} else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
Opc = AArch64::STR_ZZZXI;
- StackID = TargetStackID::SVEVector;
+ StackID = TargetStackID::ScalableVector;
}
break;
case 64:
@@ -3148,7 +3378,7 @@ void AArch64InstrInfo::storeRegToStackSlot(
} else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
Opc = AArch64::STR_ZZZZXI;
- StackID = TargetStackID::SVEVector;
+ StackID = TargetStackID::ScalableVector;
}
break;
}
@@ -3214,7 +3444,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(
else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
Opc = AArch64::LDR_PXI;
- StackID = TargetStackID::SVEVector;
+ StackID = TargetStackID::ScalableVector;
}
break;
case 4:
@@ -3258,7 +3488,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(
} else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
Opc = AArch64::LDR_ZXI;
- StackID = TargetStackID::SVEVector;
+ StackID = TargetStackID::ScalableVector;
}
break;
case 24:
@@ -3280,7 +3510,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(
} else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
Opc = AArch64::LDR_ZZXI;
- StackID = TargetStackID::SVEVector;
+ StackID = TargetStackID::ScalableVector;
}
break;
case 48:
@@ -3291,7 +3521,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(
} else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
Opc = AArch64::LDR_ZZZXI;
- StackID = TargetStackID::SVEVector;
+ StackID = TargetStackID::ScalableVector;
}
break;
case 64:
@@ -3302,7 +3532,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(
} else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
Opc = AArch64::LDR_ZZZZXI;
- StackID = TargetStackID::SVEVector;
+ StackID = TargetStackID::ScalableVector;
}
break;
}
@@ -3329,6 +3559,47 @@ bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
});
}
+void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
+ const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
+ // The smallest scalable element supported by scaled SVE addressing
+ // modes is a predicate, which is 2 scalable bytes in size. So the scalable
+ // byte offset must always be a multiple of 2.
+ assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
+
+ // VGSized offsets are divided by '2', because the VG register is the
+ // number of 64-bit granules as opposed to 128-bit vector chunks,
+ // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
+ // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
+ // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
+ ByteSized = Offset.getFixed();
+ VGSized = Offset.getScalable() / 2;
+}
+
+/// Returns the offset in parts to which this frame offset can be
+/// decomposed for the purpose of describing a frame offset.
+/// For non-scalable offsets this is simply its byte size.
+void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
+ const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
+ int64_t &NumDataVectors) {
+ // The smallest scalable element supported by scaled SVE addressing
+ // modes is a predicate, which is 2 scalable bytes in size. So the scalable
+ // byte offset must always be a multiple of 2.
+ assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
+
+ NumBytes = Offset.getFixed();
+ NumDataVectors = 0;
+ NumPredicateVectors = Offset.getScalable() / 2;
+ // This method is used to get the offsets to adjust the frame offset.
+ // If the function requires ADDPL to be used and needs more than two ADDPL
+ // instructions, part of the offset is folded into NumDataVectors so that it
+ // uses ADDVL for part of it, reducing the number of ADDPL instructions.
+ if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
+ NumPredicateVectors > 62) {
+ NumDataVectors = NumPredicateVectors / 8;
+ NumPredicateVectors -= NumDataVectors * 8;
+ }
+}
+
// Helper function to emit a frame offset adjustment from a given
// pointer (SrcReg), stored into DestReg. This function is explicit
// in that it requires the opcode.
@@ -3438,12 +3709,13 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
MachineInstr::MIFlag Flag, bool SetNZCV,
bool NeedsWinCFI, bool *HasWinCFI) {
int64_t Bytes, NumPredicateVectors, NumDataVectors;
- Offset.getForFrameOffset(Bytes, NumPredicateVectors, NumDataVectors);
+ AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
+ Offset, Bytes, NumPredicateVectors, NumDataVectors);
// First emit non-scalable frame offsets, or a simple 'mov'.
if (Bytes || (!Offset && SrcReg != DestReg)) {
- assert((DestReg != AArch64::SP || Bytes % 16 == 0) &&
- "SP increment/decrement not 16-byte aligned");
+ assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
+ "SP increment/decrement not 8-byte aligned");
unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
if (Bytes < 0) {
Bytes = -Bytes;
@@ -3698,7 +3970,7 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
// Construct the complete offset.
bool IsMulVL = ScaleValue.isScalable();
unsigned Scale = ScaleValue.getKnownMinSize();
- int64_t Offset = IsMulVL ? SOffset.getScalableBytes() : SOffset.getBytes();
+ int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
const MachineOperand &ImmOpnd =
MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
@@ -3740,11 +4012,9 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
*OutUnscaledOp = *UnscaledOp;
if (IsMulVL)
- SOffset = StackOffset(Offset, MVT::nxv1i8) +
- StackOffset(SOffset.getBytes(), MVT::i8);
+ SOffset = StackOffset::get(SOffset.getFixed(), Offset);
else
- SOffset = StackOffset(Offset, MVT::i8) +
- StackOffset(SOffset.getScalableBytes(), MVT::nxv1i8);
+ SOffset = StackOffset::get(Offset, SOffset.getScalable());
return AArch64FrameOffsetCanUpdate |
(SOffset ? 0 : AArch64FrameOffsetIsLegal);
}
@@ -3756,7 +4026,7 @@ bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
unsigned ImmIdx = FrameRegIdx + 1;
if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
- Offset += StackOffset(MI.getOperand(ImmIdx).getImm(), MVT::i8);
+ Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
MI.getOperand(0).getReg(), FrameReg, Offset, TII,
MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
@@ -3861,7 +4131,7 @@ static bool isCombineInstrCandidate64(unsigned Opc) {
return false;
}
-// FP Opcodes that can be combined with a FMUL
+// FP Opcodes that can be combined with a FMUL.
static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
switch (Inst.getOpcode()) {
default:
@@ -3883,8 +4153,12 @@ static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
case AArch64::FSUBv2f64:
case AArch64::FSUBv4f32:
TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
- return (Options.UnsafeFPMath ||
- Options.AllowFPOpFusion == FPOpFusion::Fast);
+ // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
+ // the target options or if FADD/FSUB has the contract fast-math flag.
+ return Options.UnsafeFPMath ||
+ Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ Inst.getFlag(MachineInstr::FmContract);
}
return false;
}
@@ -4364,8 +4638,8 @@ bool AArch64InstrInfo::isThroughputPattern(
/// pattern evaluator stops checking as soon as it finds a faster sequence.
bool AArch64InstrInfo::getMachineCombinerPatterns(
- MachineInstr &Root,
- SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+ MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
+ bool DoRegPressureReduce) const {
// Integer patterns
if (getMaddPatterns(Root, Patterns))
return true;
@@ -4373,7 +4647,8 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
if (getFMAPatterns(Root, Patterns))
return true;
- return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
+ return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
+ DoRegPressureReduce);
}
enum class FMAInstKind { Default, Indexed, Accumulator };
@@ -4596,7 +4871,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
- MachineInstr *MUL;
+ MachineInstr *MUL = nullptr;
const TargetRegisterClass *RC;
unsigned Opc;
switch (Pattern) {
@@ -5417,6 +5692,9 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
+ // FIXME: This assertion fails in CodeGen/AArch64/tailmerging_in_mbp.ll and
+ // CodeGen/AArch64/urem-seteq-nonzero.ll.
+ // assert(MUL && "MUL was never set");
DelInstrs.push_back(MUL);
DelInstrs.push_back(&Root);
}
@@ -5756,84 +6034,20 @@ AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
static bool
outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
const outliner::Candidate &b) {
- const Function &Fa = a.getMF()->getFunction();
- const Function &Fb = b.getMF()->getFunction();
-
- // If none of the functions have the "sign-return-address" attribute their
- // signing behaviour is equal
- if (!Fa.hasFnAttribute("sign-return-address") &&
- !Fb.hasFnAttribute("sign-return-address")) {
- return true;
- }
+ const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
+ const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
- // If both functions have the "sign-return-address" attribute their signing
- // behaviour is equal, if the values of the attributes are equal
- if (Fa.hasFnAttribute("sign-return-address") &&
- Fb.hasFnAttribute("sign-return-address")) {
- StringRef ScopeA =
- Fa.getFnAttribute("sign-return-address").getValueAsString();
- StringRef ScopeB =
- Fb.getFnAttribute("sign-return-address").getValueAsString();
- return ScopeA.equals(ScopeB);
- }
-
- // If function B doesn't have the "sign-return-address" attribute but A does,
- // the functions' signing behaviour is equal if A's value for
- // "sign-return-address" is "none" and vice versa.
- if (Fa.hasFnAttribute("sign-return-address")) {
- StringRef ScopeA =
- Fa.getFnAttribute("sign-return-address").getValueAsString();
- return ScopeA.equals("none");
- }
-
- if (Fb.hasFnAttribute("sign-return-address")) {
- StringRef ScopeB =
- Fb.getFnAttribute("sign-return-address").getValueAsString();
- return ScopeB.equals("none");
- }
-
- llvm_unreachable("Unkown combination of sign-return-address attributes");
+ return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) &&
+ MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true);
}
static bool
outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
const outliner::Candidate &b) {
- const Function &Fa = a.getMF()->getFunction();
- const Function &Fb = b.getMF()->getFunction();
-
- // If none of the functions have the "sign-return-address-key" attribute
- // their keys are equal
- if (!Fa.hasFnAttribute("sign-return-address-key") &&
- !Fb.hasFnAttribute("sign-return-address-key")) {
- return true;
- }
-
- // If both functions have the "sign-return-address-key" attribute their
- // keys are equal if the values of "sign-return-address-key" are equal
- if (Fa.hasFnAttribute("sign-return-address-key") &&
- Fb.hasFnAttribute("sign-return-address-key")) {
- StringRef KeyA =
- Fa.getFnAttribute("sign-return-address-key").getValueAsString();
- StringRef KeyB =
- Fb.getFnAttribute("sign-return-address-key").getValueAsString();
- return KeyA.equals(KeyB);
- }
-
- // If B doesn't have the "sign-return-address-key" attribute, both keys are
- // equal, if function a has the default key (a_key)
- if (Fa.hasFnAttribute("sign-return-address-key")) {
- StringRef KeyA =
- Fa.getFnAttribute("sign-return-address-key").getValueAsString();
- return KeyA.equals_lower("a_key");
- }
-
- if (Fb.hasFnAttribute("sign-return-address-key")) {
- StringRef KeyB =
- Fb.getFnAttribute("sign-return-address-key").getValueAsString();
- return KeyB.equals_lower("a_key");
- }
+ const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
+ const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
- llvm_unreachable("Unkown combination of sign-return-address-key attributes");
+ return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
}
static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
@@ -5889,9 +6103,10 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
// v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
// necessary. However, at this point we don't know if the outlined function
// will have a RET instruction so we assume the worst.
- const Function &FCF = FirstCand.getMF()->getFunction();
const TargetRegisterInfo &TRI = getRegisterInfo();
- if (FCF.hasFnAttribute("sign-return-address")) {
+ if (FirstCand.getMF()
+ ->getInfo<AArch64FunctionInfo>()
+ ->shouldSignReturnAddress(true)) {
// One PAC and one AUT instructions
NumBytesToCreateFrame += 8;
@@ -5948,10 +6163,7 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
return false;
};
// Remove candidates with illegal stack modifying instructions
- RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
- RepeatedSequenceLocs.end(),
- hasIllegalSPModification),
- RepeatedSequenceLocs.end());
+ llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
// If the sequence doesn't have enough candidates left, then we're done.
if (RepeatedSequenceLocs.size() < 2)
@@ -5994,10 +6206,7 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
// Erase every candidate that violates the restrictions above. (It could be
// true that we have viable candidates, so it's not worth bailing out in
  // the case that, say, 1 out of 20 candidates violates the restrictions.)
- RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
- RepeatedSequenceLocs.end(),
- CantGuaranteeValueAcrossCall),
- RepeatedSequenceLocs.end());
+ llvm::erase_if(RepeatedSequenceLocs, CantGuaranteeValueAcrossCall);
// If the sequence doesn't have enough candidates left, then we're done.
if (RepeatedSequenceLocs.size() < 2)
@@ -6020,7 +6229,7 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
NumBytesToCreateFrame += 4;
bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
- return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement");
+ return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
});
// We check to see if CFI Instructions are present, and if they are
@@ -6189,6 +6398,60 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
FrameID = MachineOutlinerNoLRSave;
} else {
SetCandidateCallInfo(MachineOutlinerDefault, 12);
+
+ // Bugzilla ID: 46767
+ // TODO: Check if fixing up the stack more than once is safe so we can
+ // outline these.
+ //
+ // An outline resulting in a caller that requires stack fixups at the
+ // callsite to a callee that also requires stack fixups can happen when
+ // there are no available registers at the candidate callsite for a
+ // candidate that itself also has calls.
+ //
+ // In other words if function_containing_sequence in the following pseudo
+ // assembly requires that we save LR at the point of the call, but there
+ // are no available registers: in this case we save using SP and as a
+  // result the SP offsets require stack fixups by multiples of 16.
+ //
+ // function_containing_sequence:
+ // ...
+ // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
+ // call OUTLINED_FUNCTION_N
+ // restore LR from SP
+ // ...
+ //
+ // OUTLINED_FUNCTION_N:
+ // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
+ // ...
+ // bl foo
+ // restore LR from SP
+ // ret
+ //
+ // Because the code to handle more than one stack fixup does not
+ // currently have the proper checks for legality, these cases will assert
+ // in the AArch64 MachineOutliner. This is because the code to do this
+ // needs more hardening, testing, better checks that generated code is
+  // legal, etc., and because it is only verified to handle a single pass of
+ // stack fixup.
+ //
+ // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
+ // these cases until they are known to be handled. Bugzilla 46767 is
+ // referenced in comments at the assert site.
+ //
+ // To avoid asserting (or generating non-legal code on noassert builds)
+ // we remove all candidates which would need more than one stack fixup by
+ // pruning the cases where the candidate has calls while also having no
+ // available LR and having no available general purpose registers to copy
+ // LR to (ie one extra stack save/restore).
+ //
+ if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
+ erase_if(RepeatedSequenceLocs, [this](outliner::Candidate &C) {
+ return (std::any_of(
+ C.front(), std::next(C.back()),
+ [](const MachineInstr &MI) { return MI.isCall(); })) &&
+ (!C.LRU.available(AArch64::LR) || !findRegisterToSaveLRTo(C));
+ });
+ }
}
// If we dropped all of the candidates, bail out here.
@@ -6557,7 +6820,7 @@ static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
// If v8.3a features are available we can replace a RET instruction by
// RETAA or RETAB and omit the AUT instructions
- if (Subtarget.hasV8_3aOps() && MBBAUT != MBB.end() &&
+ if (Subtarget.hasPAuth() && MBBAUT != MBB.end() &&
MBBAUT->getOpcode() == AArch64::RET) {
BuildMI(MBB, MBBAUT, DL,
TII->get(ShouldSignReturnAddrWithAKey ? AArch64::RETAA
@@ -6609,9 +6872,12 @@ void AArch64InstrInfo::buildOutlinedFrame(
return MI.isCall() && !MI.isReturn();
};
- if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) {
+ if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
// Fix up the instructions in the range, since we're going to modify the
// stack.
+
+ // Bugzilla ID: 46767
+ // TODO: Check if fixing up twice is safe so we can outline these.
assert(OF.FrameConstructionID != MachineOutlinerDefault &&
"Can only fix up stack references once");
fixupPostOutline(MBB);
@@ -6668,27 +6934,11 @@ void AArch64InstrInfo::buildOutlinedFrame(
// If a bunch of candidates reach this point they must agree on their return
// address signing. It is therefore enough to just consider the signing
// behaviour of one of them
- const Function &CF = OF.Candidates.front().getMF()->getFunction();
- bool ShouldSignReturnAddr = false;
- if (CF.hasFnAttribute("sign-return-address")) {
- StringRef Scope =
- CF.getFnAttribute("sign-return-address").getValueAsString();
- if (Scope.equals("all"))
- ShouldSignReturnAddr = true;
- else if (Scope.equals("non-leaf") && !IsLeafFunction)
- ShouldSignReturnAddr = true;
- }
+ const auto &MFI = *OF.Candidates.front().getMF()->getInfo<AArch64FunctionInfo>();
+ bool ShouldSignReturnAddr = MFI.shouldSignReturnAddress(!IsLeafFunction);
// a_key is the default
- bool ShouldSignReturnAddrWithAKey = true;
- if (CF.hasFnAttribute("sign-return-address-key")) {
- const StringRef Key =
- CF.getFnAttribute("sign-return-address-key").getValueAsString();
- // Key can either be a_key or b_key
- assert((Key.equals_lower("a_key") || Key.equals_lower("b_key")) &&
- "Return address signing key must be either a_key or b_key");
- ShouldSignReturnAddrWithAKey = Key.equals_lower("a_key");
- }
+ bool ShouldSignReturnAddrWithAKey = !MFI.shouldSignWithBKey();
// If this is a tail call outlined function, then there's already a return.
if (OF.FrameConstructionID == MachineOutlinerTailCall ||
@@ -6847,10 +7097,9 @@ Optional<RegImmPair> AArch64InstrInfo::isAddImmediate(const MachineInstr &MI,
if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
!MI.getOperand(2).isImm())
return None;
- Offset = MI.getOperand(2).getImm() * Sign;
int Shift = MI.getOperand(3).getImm();
assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
- Offset = Offset << Shift;
+ Offset = Sign * (MI.getOperand(2).getImm() << Shift);
}
}
return RegImmPair{MI.getOperand(1).getReg(), Offset};
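A quick standalone check of the rewritten offset arithmetic in isAddImmediate (illustrative values only): the immediate is shifted first and the sign applied afterwards, which yields the same magnitude as before while keeping the left shift on a non-negative operand.

#include <cassert>
#include <cstdint>

int main() {
  // "SUB x0, x1, #3, lsl #12" describes x1 - (3 << 12) = x1 - 12288.
  int64_t Imm = 3, Shift = 12, Sign = -1;
  int64_t Offset = Sign * (Imm << Shift);
  assert(Offset == -12288);
  return 0;
}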
@@ -6926,6 +7175,14 @@ uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
return get(Opc).TSFlags & AArch64::ElementSizeMask;
}
+bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
+ return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
+}
+
+bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
+ return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
+}
+
unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
return AArch64::BLRNoIP;
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 298c04d81708..7434987e0617 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -15,7 +15,6 @@
#include "AArch64.h"
#include "AArch64RegisterInfo.h"
-#include "AArch64StackOffset.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
@@ -113,6 +112,10 @@ public:
/// Hint that pairing the given load or store is unprofitable.
static void suppressLdStPair(MachineInstr &MI);
+ Optional<ExtAddrMode>
+ getAddrModeFromMemoryOp(const MachineInstr &MemI,
+ const TargetRegisterInfo *TRI) const override;
+
bool getMemOperandsWithOffsetWidth(
const MachineInstr &MI, SmallVectorImpl<const MachineOperand *> &BaseOps,
int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
@@ -188,6 +191,9 @@ public:
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify = false) const override;
+ bool analyzeBranchPredicate(MachineBasicBlock &MBB,
+ MachineBranchPredicate &MBP,
+ bool AllowModify) const override;
unsigned removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved = nullptr) const override;
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
@@ -229,9 +235,10 @@ public:
/// Return true when there is potentially a faster code sequence
/// for an instruction chain ending in ``Root``. All potential patterns are
/// listed in the ``Patterns`` array.
- bool getMachineCombinerPatterns(
- MachineInstr &Root,
- SmallVectorImpl<MachineCombinerPattern> &Patterns) const override;
+ bool
+ getMachineCombinerPatterns(MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern> &Patterns,
+ bool DoRegPressureReduce) const override;
/// Return true when Inst is associative and commutative so that it can be
/// reassociated.
bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
@@ -273,6 +280,12 @@ public:
bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override;
/// Returns the vector element size (B, H, S or D) of an SVE opcode.
uint64_t getElementSizeForOpcode(unsigned Opc) const;
+ /// Returns true if the opcode is for an SVE instruction that sets the
+  /// condition codes as if its results had been fed to a PTEST instruction
+ /// along with the same general predicate.
+ bool isPTestLikeOpcode(unsigned Opc) const;
+ /// Returns true if the opcode is for an SVE WHILE## instruction.
+ bool isWhileOpcode(unsigned Opc) const;
/// Returns true if the instruction has a shift by immediate that can be
/// executed in one cycle less.
static bool isFalkorShiftExtFast(const MachineInstr &MI);
@@ -286,6 +299,13 @@ public:
Optional<ParamLoadedValue> describeLoadedValue(const MachineInstr &MI,
Register Reg) const override;
+ static void decomposeStackOffsetForFrameOffsets(const StackOffset &Offset,
+ int64_t &NumBytes,
+ int64_t &NumPredicateVectors,
+ int64_t &NumDataVectors);
+ static void decomposeStackOffsetForDwarfOffsets(const StackOffset &Offset,
+ int64_t &ByteSized,
+ int64_t &VGSized);
#define GET_INSTRINFO_HELPER_DECLS
#include "AArch64GenInstrInfo.inc"
@@ -314,6 +334,12 @@ private:
/// Returns an unused general-purpose register which can be used for
/// constructing an outlined call if one exists. Returns 0 otherwise.
unsigned findRegisterToSaveLRTo(const outliner::Candidate &C) const;
+
+ /// Remove a ptest of a predicate-generating operation that already sets, or
+ /// can be made to set, the condition codes in an identical manner
+ bool optimizePTestInstr(MachineInstr *PTest, unsigned MaskReg,
+ unsigned PredReg,
+ const MachineRegisterInfo *MRI) const;
};
/// Return true if there is an instruction /after/ \p DefMI and before \p UseMI
@@ -397,6 +423,18 @@ static inline bool isIndirectBranchOpcode(int Opc) {
return false;
}
+static inline bool isPTrueOpcode(unsigned Opc) {
+ switch (Opc) {
+ case AArch64::PTRUE_B:
+ case AArch64::PTRUE_H:
+ case AArch64::PTRUE_S:
+ case AArch64::PTRUE_D:
+ return true;
+ default:
+ return false;
+ }
+}
+
/// Return opcode to be used for indirect calls.
unsigned getBLRCallOpcode(const MachineFunction &MF);
@@ -404,6 +442,7 @@ unsigned getBLRCallOpcode(const MachineFunction &MF);
#define TSFLAG_ELEMENT_SIZE_TYPE(X) (X) // 3-bits
#define TSFLAG_DESTRUCTIVE_INST_TYPE(X) ((X) << 3) // 4-bit
#define TSFLAG_FALSE_LANE_TYPE(X) ((X) << 7) // 2-bits
+#define TSFLAG_INSTR_FLAGS(X) ((X) << 9) // 2-bits
// }
namespace AArch64 {
@@ -436,9 +475,14 @@ enum FalseLaneType {
FalseLanesUndef = TSFLAG_FALSE_LANE_TYPE(0x2),
};
+// NOTE: This is a bit field.
+static const uint64_t InstrFlagIsWhile = TSFLAG_INSTR_FLAGS(0x1);
+static const uint64_t InstrFlagIsPTestLike = TSFLAG_INSTR_FLAGS(0x2);
+
#undef TSFLAG_ELEMENT_SIZE_TYPE
#undef TSFLAG_DESTRUCTIVE_INST_TYPE
#undef TSFLAG_FALSE_LANE_TYPE
+#undef TSFLAG_INSTR_FLAGS
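For orientation, a standalone sketch of the resulting TSFlags layout (the *_Bits constants below are assumptions reconstructed from the bit widths noted next to the TSFLAG_* macros, not names from the header): the new 2-bit field sits in bits 9-10, above the element-size, destructive-type and false-lane fields.

#include <cstdint>

// Assumed field masks, derived from the TSFLAG_* shift macros above.
static constexpr uint64_t ElementSizeBits      = 0x7;      // bits 0-2
static constexpr uint64_t DestructiveTypeBits  = 0xf << 3; // bits 3-6
static constexpr uint64_t FalseLaneTypeBits    = 0x3 << 7; // bits 7-8
static constexpr uint64_t InstrFlagIsWhile     = 0x1 << 9; // bit 9
static constexpr uint64_t InstrFlagIsPTestLike = 0x2 << 9; // bit 10

static_assert(InstrFlagIsWhile == 0x200, "WHILE flag is bit 9");
static_assert(InstrFlagIsPTestLike == 0x400, "PTEST-like flag is bit 10");
static_assert((InstrFlagIsWhile & FalseLaneTypeBits) == 0 &&
                  (InstrFlagIsPTestLike & DestructiveTypeBits) == 0,
              "the new field does not overlap the existing ones");

int main() { return 0; }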
int getSVEPseudoMap(uint16_t Opcode);
int getSVERevInstr(uint16_t Opcode);
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index f4a5f639e497..171d3dbaa814 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -25,14 +25,16 @@ def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">,
AssemblerPredicate<(all_of HasV8_5aOps), "armv8.5a">;
def HasV8_6a : Predicate<"Subtarget->hasV8_6aOps()">,
AssemblerPredicate<(all_of HasV8_6aOps), "armv8.6a">;
+def HasV8_7a : Predicate<"Subtarget->hasV8_7aOps()">,
+ AssemblerPredicate<(all_of HasV8_7aOps), "armv8.7a">;
def HasVH : Predicate<"Subtarget->hasVH()">,
AssemblerPredicate<(all_of FeatureVH), "vh">;
def HasLOR : Predicate<"Subtarget->hasLOR()">,
AssemblerPredicate<(all_of FeatureLOR), "lor">;
-def HasPA : Predicate<"Subtarget->hasPA()">,
- AssemblerPredicate<(all_of FeaturePA), "pa">;
+def HasPAuth : Predicate<"Subtarget->hasPAuth()">,
+ AssemblerPredicate<(all_of FeaturePAuth), "pauth">;
def HasJS : Predicate<"Subtarget->hasJS()">,
AssemblerPredicate<(all_of FeatureJS), "jsconv">;
@@ -46,9 +48,6 @@ def HasComplxNum : Predicate<"Subtarget->hasComplxNum()">,
def HasNV : Predicate<"Subtarget->hasNV()">,
AssemblerPredicate<(all_of FeatureNV), "nv">;
-def HasRASv8_4 : Predicate<"Subtarget->hasRASv8_4()">,
- AssemblerPredicate<(all_of FeatureRASv8_4), "rasv8_4">;
-
def HasMPAM : Predicate<"Subtarget->hasMPAM()">,
AssemblerPredicate<(all_of FeatureMPAM), "mpam">;
@@ -70,8 +69,8 @@ def HasPMU : Predicate<"Subtarget->hasPMU()">,
def HasTLB_RMI : Predicate<"Subtarget->hasTLB_RMI()">,
AssemblerPredicate<(all_of FeatureTLB_RMI), "tlb-rmi">;
-def HasFMI : Predicate<"Subtarget->hasFMI()">,
- AssemblerPredicate<(all_of FeatureFMI), "fmi">;
+def HasFlagM : Predicate<"Subtarget->hasFlagM()">,
+ AssemblerPredicate<(all_of FeatureFlagM), "flagm">;
def HasRCPC_IMMO : Predicate<"Subtarget->hasRCPCImm()">,
AssemblerPredicate<(all_of FeatureRCPC_IMMO), "rcpc-immo">;
@@ -152,6 +151,16 @@ def HasMatMulFP32 : Predicate<"Subtarget->hasMatMulFP32()">,
AssemblerPredicate<(all_of FeatureMatMulFP32), "f32mm">;
def HasMatMulFP64 : Predicate<"Subtarget->hasMatMulFP64()">,
AssemblerPredicate<(all_of FeatureMatMulFP64), "f64mm">;
+def HasXS : Predicate<"Subtarget->hasXS()">,
+ AssemblerPredicate<(all_of FeatureXS), "xs">;
+def HasWFxT : Predicate<"Subtarget->hasWFxT()">,
+ AssemblerPredicate<(all_of FeatureWFxT), "wfxt">;
+def HasLS64 : Predicate<"Subtarget->hasLS64()">,
+ AssemblerPredicate<(all_of FeatureLS64), "ls64">;
+def HasBRBE : Predicate<"Subtarget->hasBRBE()">,
+ AssemblerPredicate<(all_of FeatureBRBE), "brbe">;
+def HasSPE_EEF : Predicate<"Subtarget->hasSPE_EEF()">,
+ AssemblerPredicate<(all_of FeatureSPE_EEF), "spe-eef">;
def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
def IsWindows : Predicate<"Subtarget->isTargetWindows()">;
@@ -402,6 +411,12 @@ def AArch64call : SDNode<"AArch64ISD::CALL",
SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
SDNPVariadic]>;
+
+def AArch64call_rvmarker: SDNode<"AArch64ISD::CALL_RVMARKER",
+ SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+
def AArch64brcond : SDNode<"AArch64ISD::BRCOND", SDT_AArch64Brcond,
[SDNPHasChain]>;
def AArch64cbz : SDNode<"AArch64ISD::CBZ", SDT_AArch64cbz,
@@ -484,7 +499,6 @@ def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>;
def AArch64vsli : SDNode<"AArch64ISD::VSLI", SDT_AArch64vshiftinsert>;
def AArch64vsri : SDNode<"AArch64ISD::VSRI", SDT_AArch64vshiftinsert>;
-def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>;
def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>;
def AArch64bsp: SDNode<"AArch64ISD::BSP", SDT_AArch64trivec>;
@@ -504,7 +518,7 @@ def AArch64cmgtz: SDNode<"AArch64ISD::CMGTz", SDT_AArch64unvec>;
def AArch64cmlez: SDNode<"AArch64ISD::CMLEz", SDT_AArch64unvec>;
def AArch64cmltz: SDNode<"AArch64ISD::CMLTz", SDT_AArch64unvec>;
def AArch64cmtst : PatFrag<(ops node:$LHS, node:$RHS),
- (AArch64not (AArch64cmeqz (and node:$LHS, node:$RHS)))>;
+ (vnot (AArch64cmeqz (and node:$LHS, node:$RHS)))>;
def AArch64fcmeqz: SDNode<"AArch64ISD::FCMEQz", SDT_AArch64fcmpz>;
def AArch64fcmgez: SDNode<"AArch64ISD::FCMGEz", SDT_AArch64fcmpz>;
@@ -556,6 +570,18 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
def AArch64srhadd : SDNode<"AArch64ISD::SRHADD", SDT_AArch64binvec>;
def AArch64urhadd : SDNode<"AArch64ISD::URHADD", SDT_AArch64binvec>;
+def AArch64shadd : SDNode<"AArch64ISD::SHADD", SDT_AArch64binvec>;
+def AArch64uhadd : SDNode<"AArch64ISD::UHADD", SDT_AArch64binvec>;
+
+def AArch64uabd_n : SDNode<"AArch64ISD::UABD", SDT_AArch64binvec>;
+def AArch64sabd_n : SDNode<"AArch64ISD::SABD", SDT_AArch64binvec>;
+
+def AArch64uabd : PatFrags<(ops node:$lhs, node:$rhs),
+ [(AArch64uabd_n node:$lhs, node:$rhs),
+ (int_aarch64_neon_uabd node:$lhs, node:$rhs)]>;
+def AArch64sabd : PatFrags<(ops node:$lhs, node:$rhs),
+ [(AArch64sabd_n node:$lhs, node:$rhs),
+ (int_aarch64_neon_sabd node:$lhs, node:$rhs)]>;
def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
@@ -591,8 +617,8 @@ let RecomputePerFunction = 1 in {
// Avoid generating STRQro if it is slow, unless we're optimizing for code size.
def UseSTRQro : Predicate<"!Subtarget->isSTRQroSlow() || shouldOptForSize(MF)">;
- def UseBTI : Predicate<[{ MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
- def NotUseBTI : Predicate<[{ !MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
+ def UseBTI : Predicate<[{ MF->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement() }]>;
+ def NotUseBTI : Predicate<[{ !MF->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement() }]>;
def SLSBLRMitigation : Predicate<[{ MF->getSubtarget<AArch64Subtarget>().hardenSlsBlr() }]>;
def NoSLSBLRMitigation : Predicate<[{ !MF->getSubtarget<AArch64Subtarget>().hardenSlsBlr() }]>;
@@ -690,7 +716,8 @@ def : Pat<(AArch64LOADgot tconstpool:$addr),
// 32-bit jump table destination is actually only 2 instructions since we can
// use the table itself as a PC-relative base. But optimization occurs after
// branch relaxation so be pessimistic.
-let Size = 12, Constraints = "@earlyclobber $dst,@earlyclobber $scratch" in {
+let Size = 12, Constraints = "@earlyclobber $dst,@earlyclobber $scratch",
+ isNotDuplicable = 1 in {
def JumpTableDest32 : Pseudo<(outs GPR64:$dst, GPR64sp:$scratch),
(ins GPR64:$table, GPR64:$entry, i32imm:$jti), []>,
Sched<[]>;
@@ -774,8 +801,34 @@ def TSB : CRmSystemI<barrier_op, 0b010, "tsb", []> {
let Inst{12} = 0;
let Predicates = [HasTRACEV8_4];
}
+
+def DSBnXS : CRmSystemI<barrier_nxs_op, 0b001, "dsb"> {
+ let CRm{1-0} = 0b11;
+ let Inst{9-8} = 0b10;
+ let Predicates = [HasXS];
+}
+
+let Predicates = [HasWFxT] in {
+def WFET : RegInputSystemI<0b0000, 0b000, "wfet">;
+def WFIT : RegInputSystemI<0b0000, 0b001, "wfit">;
+}
+
+// Branch Record Buffer two-word mnemonic instructions
+class BRBEI<bits<3> op2, string keyword>
+ : SimpleSystemI<0, (ins), "brb", keyword>, Sched<[WriteSys]> {
+ let Inst{31-8} = 0b110101010000100101110010;
+ let Inst{7-5} = op2;
+ let Predicates = [HasBRBE];
+}
+def BRB_IALL: BRBEI<0b100, "\tiall">;
+def BRB_INJ: BRBEI<0b101, "\tinj">;
+
}
+// Allow uppercase and lowercase keyword arguments for BRB IALL and BRB INJ
+def : TokenAlias<"INJ", "inj">;
+def : TokenAlias<"IALL", "iall">;
+
// ARMv8.2-A Dot Product
let Predicates = [HasDotProd] in {
defm SDOT : SIMDThreeSameVectorDot<0, 0, "sdot", int_aarch64_neon_sdot>;
@@ -796,6 +849,23 @@ def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
def BFCVTN : SIMD_BFCVTN;
def BFCVTN2 : SIMD_BFCVTN2;
def BFCVT : BF16ToSinglePrecision<"bfcvt">;
+
+// Vector-scalar BFDOT:
+// The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit
+// register (the instruction uses a single 32-bit lane from it), so the pattern
+// is a bit tricky.
+def : Pat<(v2f32 (int_aarch64_neon_bfdot
+ (v2f32 V64:$Rd), (v4bf16 V64:$Rn),
+ (v4bf16 (bitconvert
+ (v2i32 (AArch64duplane32
+ (v4i32 (bitconvert
+ (v8bf16 (insert_subvector undef,
+ (v4bf16 V64:$Rm),
+ (i64 0))))),
+ VectorIndexS:$idx)))))),
+ (BF16DOTlanev4bf16 (v2f32 V64:$Rd), (v4bf16 V64:$Rn),
+ (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
+ VectorIndexS:$idx)>;
}
// ARMv8.6A AArch64 matrix multiplication
@@ -895,6 +965,7 @@ let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
def : Pat<(v8f16 (int_aarch64_neon_vcadd_rot270 (v8f16 V128:$Rn), (v8f16 V128:$Rm))),
(FCADDv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm), (i32 1))>;
}
+
let Predicates = [HasComplxNum, HasNEON] in {
def : Pat<(v2f32 (int_aarch64_neon_vcadd_rot90 (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
(FCADDv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm), (i32 0))>;
@@ -908,6 +979,47 @@ let Predicates = [HasComplxNum, HasNEON] in {
}
}
+multiclass FCMLA_PATS<ValueType ty, RegisterClass Reg> {
+ def : Pat<(ty (int_aarch64_neon_vcmla_rot0 (ty Reg:$Rd), (ty Reg:$Rn), (ty Reg:$Rm))),
+ (!cast<Instruction>("FCMLA" # ty) $Rd, $Rn, $Rm, 0)>;
+ def : Pat<(ty (int_aarch64_neon_vcmla_rot90 (ty Reg:$Rd), (ty Reg:$Rn), (ty Reg:$Rm))),
+ (!cast<Instruction>("FCMLA" # ty) $Rd, $Rn, $Rm, 1)>;
+ def : Pat<(ty (int_aarch64_neon_vcmla_rot180 (ty Reg:$Rd), (ty Reg:$Rn), (ty Reg:$Rm))),
+ (!cast<Instruction>("FCMLA" # ty) $Rd, $Rn, $Rm, 2)>;
+ def : Pat<(ty (int_aarch64_neon_vcmla_rot270 (ty Reg:$Rd), (ty Reg:$Rn), (ty Reg:$Rm))),
+ (!cast<Instruction>("FCMLA" # ty) $Rd, $Rn, $Rm, 3)>;
+}
+
+multiclass FCMLA_LANE_PATS<ValueType ty, RegisterClass Reg, dag RHSDup> {
+ def : Pat<(ty (int_aarch64_neon_vcmla_rot0 (ty Reg:$Rd), (ty Reg:$Rn), RHSDup)),
+ (!cast<Instruction>("FCMLA" # ty # "_indexed") $Rd, $Rn, $Rm, VectorIndexS:$idx, 0)>;
+ def : Pat<(ty (int_aarch64_neon_vcmla_rot90 (ty Reg:$Rd), (ty Reg:$Rn), RHSDup)),
+ (!cast<Instruction>("FCMLA" # ty # "_indexed") $Rd, $Rn, $Rm, VectorIndexS:$idx, 1)>;
+ def : Pat<(ty (int_aarch64_neon_vcmla_rot180 (ty Reg:$Rd), (ty Reg:$Rn), RHSDup)),
+ (!cast<Instruction>("FCMLA" # ty # "_indexed") $Rd, $Rn, $Rm, VectorIndexS:$idx, 2)>;
+ def : Pat<(ty (int_aarch64_neon_vcmla_rot270 (ty Reg:$Rd), (ty Reg:$Rn), RHSDup)),
+ (!cast<Instruction>("FCMLA" # ty # "_indexed") $Rd, $Rn, $Rm, VectorIndexS:$idx, 3)>;
+}
+
+
+let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
+ defm : FCMLA_PATS<v4f16, V64>;
+ defm : FCMLA_PATS<v8f16, V128>;
+
+ defm : FCMLA_LANE_PATS<v4f16, V64,
+ (v4f16 (bitconvert (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexD:$idx))))>;
+ defm : FCMLA_LANE_PATS<v8f16, V128,
+ (v8f16 (bitconvert (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))>;
+}
+let Predicates = [HasComplxNum, HasNEON] in {
+ defm : FCMLA_PATS<v2f32, V64>;
+ defm : FCMLA_PATS<v4f32, V128>;
+ defm : FCMLA_PATS<v2f64, V128>;
+
+ defm : FCMLA_LANE_PATS<v4f32, V128,
+ (v4f32 (bitconvert (v2i64 (AArch64duplane64 (v2i64 V128:$Rm), VectorIndexD:$idx))))>;
+}
+
// v8.3a Pointer Authentication
// These instructions inhabit part of the hint space and so can be used for
// armv8 targets. Keeping the old HINT mnemonic when compiling without PA is
@@ -961,7 +1073,7 @@ def : InstAlias<"autib1716", (AUTIB1716), 0>;
def : InstAlias<"xpaclri", (XPACLRI), 0>;
// These pointer authentication instructions require armv8.3a
-let Predicates = [HasPA] in {
+let Predicates = [HasPAuth] in {
// When PA is enabled, a better mnemonic should be emitted.
def : InstAlias<"paciaz", (PACIAZ), 1>;
@@ -992,8 +1104,8 @@ let Predicates = [HasPA] in {
defm PAC : SignAuth<0b000, 0b010, "pac">;
defm AUT : SignAuth<0b001, 0b011, "aut">;
- def XPACI : SignAuthZero<0b100, 0b00, "xpaci">;
- def XPACD : SignAuthZero<0b100, 0b01, "xpacd">;
+ def XPACI : ClearAuth<0, "xpaci">;
+ def XPACD : ClearAuth<1, "xpacd">;
def PACGA : SignAuthTwoOperand<0b1100, "pacga", null_frag>;
// Combined Instructions
@@ -1028,7 +1140,7 @@ let Predicates = [HasPA] in {
}
// v8.3a floating point conversion for javascript
-let Predicates = [HasJS, HasFPARMv8] in
+let Predicates = [HasJS, HasFPARMv8], Defs = [NZCV] in
def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32,
"fjcvtzs",
[(set GPR32:$Rd,
@@ -1037,7 +1149,7 @@ def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32,
} // HasJS, HasFPARMv8
// v8.4 Flag manipulation instructions
-let Predicates = [HasFMI] in {
+let Predicates = [HasFlagM], Defs = [NZCV], Uses = [NZCV] in {
def CFINV : SimpleSystemI<0, (ins), "cfinv", "">, Sched<[WriteSys]> {
let Inst{20-5} = 0b0000001000000000;
}
@@ -1045,7 +1157,7 @@ def SETF8 : BaseFlagManipulation<0, 0, (ins GPR32:$Rn), "setf8", "{\t$Rn}">;
def SETF16 : BaseFlagManipulation<0, 1, (ins GPR32:$Rn), "setf16", "{\t$Rn}">;
def RMIF : FlagRotate<(ins GPR64:$Rn, uimm6:$imm, imm0_15:$mask), "rmif",
"{\t$Rn, $imm, $mask}">;
-} // HasFMI
+} // HasFlagM
// v8.5 flag manipulation instructions
let Predicates = [HasAltNZCV], Uses = [NZCV], Defs = [NZCV] in {
@@ -1094,9 +1206,12 @@ def HWASAN_CHECK_MEMACCESS : Pseudo<
(outs), (ins GPR64noip:$ptr, i32imm:$accessinfo),
[(int_hwasan_check_memaccess X9, GPR64noip:$ptr, (i32 timm:$accessinfo))]>,
Sched<[]>;
+}
+
+let Uses = [ X20 ], Defs = [ X16, X17, LR, NZCV ] in {
def HWASAN_CHECK_MEMACCESS_SHORTGRANULES : Pseudo<
(outs), (ins GPR64noip:$ptr, i32imm:$accessinfo),
- [(int_hwasan_check_memaccess_shortgranules X9, GPR64noip:$ptr, (i32 timm:$accessinfo))]>,
+ [(int_hwasan_check_memaccess_shortgranules X20, GPR64noip:$ptr, (i32 timm:$accessinfo))]>,
Sched<[]>;
}
@@ -1443,8 +1558,16 @@ def SMSUBLrrr : WideMulAccum<1, 0b001, "smsubl", sub, sext>;
def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>;
def UMSUBLrrr : WideMulAccum<1, 0b101, "umsubl", sub, zext>;
+def : Pat<(i64 (mul (sext_inreg GPR64:$Rn, i32), (sext_inreg GPR64:$Rm, i32))),
+ (SMADDLrrr (EXTRACT_SUBREG $Rn, sub_32), (EXTRACT_SUBREG $Rm, sub_32), XZR)>;
+def : Pat<(i64 (mul (sext_inreg GPR64:$Rn, i32), (sext GPR32:$Rm))),
+ (SMADDLrrr (EXTRACT_SUBREG $Rn, sub_32), $Rm, XZR)>;
def : Pat<(i64 (mul (sext GPR32:$Rn), (sext GPR32:$Rm))),
(SMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+def : Pat<(i64 (mul (and GPR64:$Rn, 0xFFFFFFFF), (and GPR64:$Rm, 0xFFFFFFFF))),
+ (UMADDLrrr (EXTRACT_SUBREG $Rn, sub_32), (EXTRACT_SUBREG $Rm, sub_32), XZR)>;
+def : Pat<(i64 (mul (and GPR64:$Rn, 0xFFFFFFFF), (zext GPR32:$Rm))),
+ (UMADDLrrr (EXTRACT_SUBREG $Rn, sub_32), $Rm, XZR)>;
def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))),
(UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
@@ -2031,6 +2154,8 @@ let isCall = 1, Defs = [LR], Uses = [SP] in {
def BLRNoIP : Pseudo<(outs), (ins GPR64noip:$Rn), []>,
Sched<[WriteBrReg]>,
PseudoInstExpansion<(BLR GPR64:$Rn)>;
+ def BLR_RVMARKER : Pseudo<(outs), (ins variable_ops), []>,
+ Sched<[WriteBrReg]>;
} // isCall
def : Pat<(AArch64call GPR64:$Rn),
@@ -2040,6 +2165,10 @@ def : Pat<(AArch64call GPR64noip:$Rn),
(BLRNoIP GPR64noip:$Rn)>,
Requires<[SLSBLRMitigation]>;
+def : Pat<(AArch64call_rvmarker GPR64:$Rn),
+ (BLR_RVMARKER GPR64:$Rn)>,
+ Requires<[NoSLSBLRMitigation]>;
+
let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>;
} // isBranch, isTerminator, isBarrier, isIndirectBranch
@@ -3701,18 +3830,6 @@ def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))),
def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))),
(FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-// And here "(-a) + b*(-c)"
-
-let Predicates = [HasNEON, HasFullFP16] in
-def : Pat<(f16 (fma FPR16:$Rn, (fneg FPR16:$Rm), (fneg FPR16:$Ra))),
- (FNMADDHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>;
-
-def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))),
- (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-
-def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))),
- (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-
//===----------------------------------------------------------------------===//
// Floating point comparison instructions.
//===----------------------------------------------------------------------===//
@@ -3783,7 +3900,7 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
// Floating point immediate move.
//===----------------------------------------------------------------------===//
-let isReMaterializable = 1 in {
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
defm FMOV : FPMoveImmediate<"fmov">;
}
@@ -3792,7 +3909,7 @@ defm FMOV : FPMoveImmediate<"fmov">;
//===----------------------------------------------------------------------===//
defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
- int_aarch64_neon_uabd>;
+ AArch64uabd>;
// Match UABDL in log2-shuffle patterns.
def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)),
(zext (v8i8 V64:$opB))))),
@@ -3920,19 +4037,11 @@ def : Pat<(AArch64neg (v2i32 V64:$Rn)), (NEGv2i32 V64:$Rn)>;
def : Pat<(AArch64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>;
def : Pat<(AArch64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>;
-def : Pat<(AArch64not (v8i8 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
-def : Pat<(AArch64not (v16i8 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
-def : Pat<(AArch64not (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
-def : Pat<(AArch64not (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
-def : Pat<(AArch64not (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
-def : Pat<(AArch64not (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
-def : Pat<(AArch64not (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
-def : Pat<(AArch64not (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
-
def : Pat<(vnot (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
def : Pat<(vnot (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(vnot (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_aarch64_neon_rbit>;
@@ -4038,17 +4147,6 @@ defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b001, "fmla",
defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b001, "fmls",
TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
-// The following def pats catch the case where the LHS of an FMA is negated.
-// The TriOpFrag above catches the case where the middle operand is negated.
-def : Pat<(v2f32 (fma (fneg V64:$Rn), V64:$Rm, V64:$Rd)),
- (FMLSv2f32 V64:$Rd, V64:$Rn, V64:$Rm)>;
-
-def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
- (FMLSv4f32 V128:$Rd, V128:$Rn, V128:$Rm)>;
-
-def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
- (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>;
-
defm FMULX : SIMDThreeSameVectorFP<0,0,0b011,"fmulx", int_aarch64_neon_fmulx>;
defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>;
defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>;
@@ -4062,9 +4160,9 @@ defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", null_frag>;
defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>;
defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
- TriOpFrag<(add node:$LHS, (int_aarch64_neon_sabd node:$MHS, node:$RHS))> >;
-defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>;
-defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_aarch64_neon_shadd>;
+ TriOpFrag<(add node:$LHS, (AArch64sabd node:$MHS, node:$RHS))> >;
+defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", AArch64sabd>;
+defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", AArch64shadd>;
defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>;
defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>;
defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", smax>;
@@ -4081,9 +4179,9 @@ defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>;
defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>;
defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>;
defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba",
- TriOpFrag<(add node:$LHS, (int_aarch64_neon_uabd node:$MHS, node:$RHS))> >;
-defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>;
-defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>;
+ TriOpFrag<(add node:$LHS, (AArch64uabd node:$MHS, node:$RHS))> >;
+defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", AArch64uabd>;
+defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", AArch64uhadd>;
defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>;
defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>;
defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>;
@@ -4481,6 +4579,10 @@ def : Pat<(v1i64 (int_aarch64_neon_fcvtps (v1f64 FPR64:$Rn))),
(FCVTPSv1i64 FPR64:$Rn)>;
def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))),
(FCVTPUv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtzs (v1f64 FPR64:$Rn))),
+ (FCVTZSv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtzu (v1f64 FPR64:$Rn))),
+ (FCVTZUv1i64 FPR64:$Rn)>;
def : Pat<(f16 (int_aarch64_neon_frecpe (f16 FPR16:$Rn))),
(FRECPEv1f16 FPR16:$Rn)>;
@@ -4652,9 +4754,9 @@ defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn
defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>;
defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_aarch64_neon_pmull>;
defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal",
- int_aarch64_neon_sabd>;
+ AArch64sabd>;
defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl",
- int_aarch64_neon_sabd>;
+ AArch64sabd>;
defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl",
BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>;
defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw",
@@ -4675,20 +4777,58 @@ defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl",
defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>;
defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal",
- int_aarch64_neon_uabd>;
+ AArch64uabd>;
defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
- BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>;
+ BinOpFrag<(add (zanyext node:$LHS), (zanyext node:$RHS))>>;
defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
- BinOpFrag<(add node:$LHS, (zext node:$RHS))>>;
+ BinOpFrag<(add node:$LHS, (zanyext node:$RHS))>>;
defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal",
TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl",
TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>;
defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
- BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>;
+ BinOpFrag<(sub (zanyext node:$LHS), (zanyext node:$RHS))>>;
defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw",
- BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>;
+ BinOpFrag<(sub node:$LHS, (zanyext node:$RHS))>>;
+
+// Additional patterns for [SU]ML[AS]L
+multiclass Neon_mul_acc_widen_patterns<SDPatternOperator opnode, SDPatternOperator vecopnode,
+ Instruction INST8B, Instruction INST4H, Instruction INST2S> {
+ def : Pat<(v4i16 (opnode
+ V64:$Ra,
+ (v4i16 (extract_subvector
+ (vecopnode (v8i8 V64:$Rn),(v8i8 V64:$Rm)),
+ (i64 0))))),
+ (EXTRACT_SUBREG (v8i16 (INST8B
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), V64:$Ra, dsub),
+ V64:$Rn, V64:$Rm)), dsub)>;
+ def : Pat<(v2i32 (opnode
+ V64:$Ra,
+ (v2i32 (extract_subvector
+ (vecopnode (v4i16 V64:$Rn),(v4i16 V64:$Rm)),
+ (i64 0))))),
+ (EXTRACT_SUBREG (v4i32 (INST4H
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), V64:$Ra, dsub),
+ V64:$Rn, V64:$Rm)), dsub)>;
+ def : Pat<(v1i64 (opnode
+ V64:$Ra,
+ (v1i64 (extract_subvector
+ (vecopnode (v2i32 V64:$Rn),(v2i32 V64:$Rm)),
+ (i64 0))))),
+ (EXTRACT_SUBREG (v2i64 (INST2S
+ (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), V64:$Ra, dsub),
+ V64:$Rn, V64:$Rm)), dsub)>;
+}
+
+defm : Neon_mul_acc_widen_patterns<add, int_aarch64_neon_umull,
+ UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>;
+defm : Neon_mul_acc_widen_patterns<add, int_aarch64_neon_smull,
+ SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>;
+defm : Neon_mul_acc_widen_patterns<sub, int_aarch64_neon_umull,
+ UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>;
+defm : Neon_mul_acc_widen_patterns<sub, int_aarch64_neon_smull,
+ SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
// Additional patterns for SMULL and UMULL
multiclass Neon_mul_widen_patterns<SDPatternOperator opnode,
@@ -4901,6 +5041,26 @@ defm FMAXNMP : SIMDFPPairwiseScalar<0, 0b01100, "fmaxnmp">;
defm FMAXP : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">;
defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">;
defm FMINP : SIMDFPPairwiseScalar<1, 0b01111, "fminp">;
+
+let Predicates = [HasFullFP16] in {
+def : Pat<(f16 (vecreduce_fadd (v8f16 V128:$Rn))),
+ (FADDPv2i16p
+ (EXTRACT_SUBREG
+ (FADDPv8f16 (FADDPv8f16 V128:$Rn, (v8f16 (IMPLICIT_DEF))), (v8f16 (IMPLICIT_DEF))),
+ dsub))>;
+def : Pat<(f16 (vecreduce_fadd (v4f16 V64:$Rn))),
+ (FADDPv2i16p (FADDPv4f16 V64:$Rn, (v4f16 (IMPLICIT_DEF))))>;
+}
+def : Pat<(f32 (vecreduce_fadd (v4f32 V128:$Rn))),
+ (FADDPv2i32p
+ (EXTRACT_SUBREG
+ (FADDPv4f32 V128:$Rn, (v4f32 (IMPLICIT_DEF))),
+ dsub))>;
+def : Pat<(f32 (vecreduce_fadd (v2f32 V64:$Rn))),
+ (FADDPv2i32p V64:$Rn)>;
+def : Pat<(f64 (vecreduce_fadd (v2f64 V128:$Rn))),
+ (FADDPv2i64p V128:$Rn)>;
+
def : Pat<(v2i64 (AArch64saddv V128:$Rn)),
(INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>;
def : Pat<(v2i64 (AArch64uaddv V128:$Rn)),
@@ -5152,6 +5312,16 @@ def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn),
(i64 0)),
dsub)>;
+def : Pat<(vector_insert (v8f16 v8f16:$Rn), (f16 fpimm0),
+ (i64 VectorIndexH:$imm)),
+ (INSvi16gpr V128:$Rn, VectorIndexH:$imm, WZR)>;
+def : Pat<(vector_insert v4f32:$Rn, (f32 fpimm0),
+ (i64 VectorIndexS:$imm)),
+ (INSvi32gpr V128:$Rn, VectorIndexS:$imm, WZR)>;
+def : Pat<(vector_insert v2f64:$Rn, (f64 fpimm0),
+ (i64 VectorIndexD:$imm)),
+          (INSvi64gpr V128:$Rn, VectorIndexD:$imm, XZR)>;
+
def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn),
(f16 FPR16:$Rm), (i64 VectorIndexH:$imm))),
(INSvi16lane
@@ -6663,7 +6833,17 @@ def : Pat<(i32 (trunc GPR64sp:$src)),
// __builtin_trap() uses the BRK instruction on AArch64.
def : Pat<(trap), (BRK 1)>;
-def : Pat<(debugtrap), (BRK 0xF000)>, Requires<[IsWindows]>;
+def : Pat<(debugtrap), (BRK 0xF000)>;
+
+def ubsan_trap_xform : SDNodeXForm<timm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() | ('U' << 8), SDLoc(N), MVT::i32);
+}]>;
+
+def ubsan_trap_imm : TImmLeaf<i32, [{
+ return isUInt<8>(Imm);
+}], ubsan_trap_xform>;
+
+def : Pat<(ubsantrap ubsan_trap_imm:$kind), (BRK ubsan_trap_imm:$kind)>;
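The SDNodeXForm above packs the 8-bit UBSan check kind into the low byte and the ASCII code of 'U' (0x55) into the next byte. A tiny standalone check of the resulting BRK immediates (the helper name is made up for illustration):

#include <cassert>
#include <cstdint>

// Mirrors ubsan_trap_xform: BRK immediate = kind | ('U' << 8).
static uint32_t ubsanTrapImm(uint32_t Kind) {
  assert(Kind <= 0xff && "same guard as isUInt<8> in ubsan_trap_imm");
  return Kind | ('U' << 8);
}

int main() {
  assert(ubsanTrapImm(0x00) == 0x5500); // "brk #0x5500"
  assert(ubsanTrapImm(0x01) == 0x5501); // "brk #0x5501"
  return 0;
}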
// Multiply high patterns which multiply the lower subvector using smull/umull
// and the upper subvector with smull2/umull2. Then shuffle the high
@@ -7459,6 +7639,9 @@ def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)),
def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)),
(vector_extract (v4f32 FPR128:$Rn), (i64 1))),
(f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>;
+def : Pat<(fadd (vector_extract (v8f16 FPR128:$Rn), (i64 0)),
+ (vector_extract (v8f16 FPR128:$Rn), (i64 1))),
+ (f16 (FADDPv2i16p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>;
// Scalar 64-bit shifts in FPR64 registers.
def : Pat<(i64 (int_aarch64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
@@ -7661,6 +7844,23 @@ let AddedComplexity = 10 in {
// FIXME: add SVE dot-product patterns.
}
+let Predicates = [HasLS64] in {
+ def LD64B: LoadStore64B<0b101, "ld64b", (ins GPR64sp:$Rn),
+ (outs GPR64x8:$Rt)>;
+ def ST64B: LoadStore64B<0b001, "st64b", (ins GPR64x8:$Rt, GPR64sp:$Rn),
+ (outs)>;
+ def ST64BV: Store64BV<0b011, "st64bv">;
+ def ST64BV0: Store64BV<0b010, "st64bv0">;
+
+ class ST64BPattern<Intrinsic intrinsic, Instruction instruction>
+ : Pat<(intrinsic GPR64sp:$addr, GPR64:$x0, GPR64:$x1, GPR64:$x2, GPR64:$x3, GPR64:$x4, GPR64:$x5, GPR64:$x6, GPR64:$x7),
+ (instruction (REG_SEQUENCE GPR64x8Class, $x0, x8sub_0, $x1, x8sub_1, $x2, x8sub_2, $x3, x8sub_3, $x4, x8sub_4, $x5, x8sub_5, $x6, x8sub_6, $x7, x8sub_7), $addr)>;
+
+ def : ST64BPattern<int_aarch64_st64b, ST64B>;
+ def : ST64BPattern<int_aarch64_st64bv, ST64BV>;
+ def : ST64BPattern<int_aarch64_st64bv0, ST64BV0>;
+}
+
include "AArch64InstrAtomics.td"
include "AArch64SVEInstrInfo.td"
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index d975b8bd04fe..ad180cb2935e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -1186,8 +1186,10 @@ bool AArch64LoadStoreOpt::findMatchingStore(
// store instruction writes and the stored value is not modified, we can
// promote the load. Since we do not handle stores with pre-/post-index,
// it's unnecessary to check if BaseReg is modified by the store itself.
+ // Also we can't handle stores without an immediate offset operand,
+    // since the operand might be the address of a global variable.
if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
- BaseReg == getLdStBaseOp(MI).getReg() &&
+ BaseReg == getLdStBaseOp(MI).getReg() && getLdStOffsetOp(MI).isImm() &&
isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
StoreI = MBBI;
@@ -1550,16 +1552,27 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
continue;
}
}
- // If the destination register of the loads is the same register, bail
- // and keep looking. A load-pair instruction with both destination
- // registers the same is UNPREDICTABLE and will result in an exception.
- if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
+ // If the destination register of one load is the same register or a
+ // sub/super register of the other load, bail and keep looking. A
+ // load-pair instruction with both destination registers the same is
+ // UNPREDICTABLE and will result in an exception.
+ if (MayLoad &&
+ TRI->isSuperOrSubRegisterEq(Reg, getLdStRegOp(MI).getReg())) {
LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
TRI);
MemInsns.push_back(&MI);
continue;
}
+ // If the BaseReg has been modified, then we cannot do the optimization.
+    // For example, in the following pattern:
+    //   ldr x1, [x2]
+    //   ldr x2, [x3]
+    //   ldr x4, [x2, #8]
+    // the first and third ldr cannot be converted to ldp x1, x4, [x2].
+ if (!ModifiedRegUnits.available(BaseReg))
+ return E;
+
// If the Rt of the second instruction was not modified or used between
// the two instructions and none of the instructions between the second
// and first alias with the second, we can combine the second into the
@@ -1750,6 +1763,11 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
return false;
}
+static bool needsWinCFI(const MachineFunction *MF) {
+ return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
+ MF->getFunction().needsUnwindTableEntry();
+}
+
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
MachineBasicBlock::iterator E = I->getParent()->end();
@@ -1790,14 +1808,11 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
// the memory access (I) and the increment (MBBI) can access the memory
// region defined by [SP, MBBI].
const bool BaseRegSP = BaseReg == AArch64::SP;
- if (BaseRegSP) {
+ if (BaseRegSP && needsWinCFI(I->getMF())) {
    // FIXME: For now, we always block the optimization over SP in Windows
    // targets as it requires adjusting the unwind/debug info; messing up
    // the unwind info can actually cause a miscompile.
- const MCAsmInfo *MAI = I->getMF()->getTarget().getMCAsmInfo();
- if (MAI->usesWindowsCFI() &&
- I->getMF()->getFunction().needsUnwindTableEntry())
- return E;
+ return E;
}
for (unsigned Count = 0; MBBI != E && Count < Limit;
@@ -1853,6 +1868,14 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
}
}
+ const bool BaseRegSP = BaseReg == AArch64::SP;
+ if (BaseRegSP && needsWinCFI(I->getMF())) {
+    // FIXME: For now, we always block the optimization over SP in Windows
+    // targets as it requires adjusting the unwind/debug info; messing up
+    // the unwind info can actually cause a miscompile.
+ return E;
+ }
+
// Track which register units have been modified and used between the first
// insn (inclusive) and the second insn.
ModifiedRegUnits.clear();
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
index afd5ae6bcbf2..10e191ff44cf 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -203,6 +203,12 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandCOFF(const MachineOperand &MO,
RefFlags |= AArch64MCExpr::VK_SABS;
} else {
RefFlags |= AArch64MCExpr::VK_ABS;
+
+ if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE)
+ RefFlags |= AArch64MCExpr::VK_PAGE;
+ else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) ==
+ AArch64II::MO_PAGEOFF)
+ RefFlags |= AArch64MCExpr::VK_PAGEOFF | AArch64MCExpr::VK_NC;
}
if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G3)
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index a37e38072554..41343ba9700c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -14,6 +14,9 @@
//===----------------------------------------------------------------------===//
#include "AArch64MachineFunctionInfo.h"
+#include "AArch64InstrInfo.h"
+#include <llvm/IR/Metadata.h>
+#include <llvm/IR/Module.h>
using namespace llvm;
@@ -30,3 +33,82 @@ void AArch64FunctionInfo::initializeBaseYamlFields(
if (YamlMFI.HasRedZone.hasValue())
HasRedZone = YamlMFI.HasRedZone;
}
+
+static std::pair<bool, bool> GetSignReturnAddress(const Function &F) {
+ // The function should be signed in the following situations:
+ // - sign-return-address=all
+  //  - sign-return-address=non-leaf and the function spills the LR
+ if (!F.hasFnAttribute("sign-return-address")) {
+ const Module &M = *F.getParent();
+ if (const auto *Sign = mdconst::extract_or_null<ConstantInt>(
+ M.getModuleFlag("sign-return-address"))) {
+ if (Sign->getZExtValue()) {
+ if (const auto *All = mdconst::extract_or_null<ConstantInt>(
+ M.getModuleFlag("sign-return-address-all")))
+ return {true, All->getZExtValue()};
+ return {true, false};
+ }
+ }
+ return {false, false};
+ }
+
+ StringRef Scope = F.getFnAttribute("sign-return-address").getValueAsString();
+ if (Scope.equals("none"))
+ return {false, false};
+
+ if (Scope.equals("all"))
+ return {true, true};
+
+ assert(Scope.equals("non-leaf"));
+ return {true, false};
+}
+
+static bool ShouldSignWithBKey(const Function &F) {
+ if (!F.hasFnAttribute("sign-return-address-key")) {
+ if (const auto *BKey = mdconst::extract_or_null<ConstantInt>(
+ F.getParent()->getModuleFlag("sign-return-address-with-bkey")))
+ return BKey->getZExtValue();
+ return false;
+ }
+
+ const StringRef Key =
+ F.getFnAttribute("sign-return-address-key").getValueAsString();
+ assert(Key.equals_lower("a_key") || Key.equals_lower("b_key"));
+ return Key.equals_lower("b_key");
+}
+
+AArch64FunctionInfo::AArch64FunctionInfo(MachineFunction &MF) : MF(MF) {
+ // If we already know that the function doesn't have a redzone, set
+ // HasRedZone here.
+ if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone))
+ HasRedZone = false;
+
+ const Function &F = MF.getFunction();
+ std::tie(SignReturnAddress, SignReturnAddressAll) = GetSignReturnAddress(F);
+ SignWithBKey = ShouldSignWithBKey(F);
+
+ if (!F.hasFnAttribute("branch-target-enforcement")) {
+ if (const auto *BTE = mdconst::extract_or_null<ConstantInt>(
+ F.getParent()->getModuleFlag("branch-target-enforcement")))
+ BranchTargetEnforcement = BTE->getZExtValue();
+ return;
+ }
+
+ const StringRef BTIEnable = F.getFnAttribute("branch-target-enforcement").getValueAsString();
+ assert(BTIEnable.equals_lower("true") || BTIEnable.equals_lower("false"));
+ BranchTargetEnforcement = BTIEnable.equals_lower("true");
+}
+
+bool AArch64FunctionInfo::shouldSignReturnAddress(bool SpillsLR) const {
+ if (!SignReturnAddress)
+ return false;
+ if (SignReturnAddressAll)
+ return true;
+ return SpillsLR;
+}
+
+bool AArch64FunctionInfo::shouldSignReturnAddress() const {
+ return shouldSignReturnAddress(llvm::any_of(
+ MF.getFrameInfo().getCalleeSavedInfo(),
+ [](const auto &Info) { return Info.getReg() == AArch64::LR; }));
+}
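The two helpers added above resolve the PAC-RET scope from function attributes, falling back to module flags, and shouldSignReturnAddress(bool) then applies that scope. A minimal standalone sketch of the resulting decision (illustrative C++ only; the enum and function names are stand-ins, not part of the patch):

// Sketch of the PAC-RET decision implemented by shouldSignReturnAddress above.
// Scope mirrors the "sign-return-address" attribute: "none", "non-leaf", "all".
#include <cassert>

enum class SignScope { None, NonLeaf, All };

static bool shouldSign(SignScope Scope, bool SpillsLR) {
  if (Scope == SignScope::None)
    return false;
  if (Scope == SignScope::All)
    return true;
  // "non-leaf": sign only when the link register is spilled to the stack.
  return SpillsLR;
}

int main() {
  assert(!shouldSign(SignScope::NonLeaf, /*SpillsLR=*/false));
  assert(shouldSign(SignScope::NonLeaf, /*SpillsLR=*/true));
  assert(shouldSign(SignScope::All, /*SpillsLR=*/false));
}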
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 84aa53f2bece..f60e2b6c316e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -20,7 +20,6 @@
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCLinkerOptimizationHint.h"
#include <cassert>
@@ -36,6 +35,9 @@ class MachineInstr;
/// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and
/// contains private AArch64-specific information for each MachineFunction.
class AArch64FunctionInfo final : public MachineFunctionInfo {
+ /// Backreference to the machine function.
+ MachineFunction &MF;
+
/// Number of bytes of arguments this function has on the stack. If the callee
/// is expected to restore the argument stack this should be a multiple of 16,
/// all usable during a tail call.
@@ -126,26 +128,40 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
/// that must be forwarded to every musttail call.
SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms;
- // Offset from SP-at-entry to the tagged base pointer.
- // Tagged base pointer is set up to point to the first (lowest address) tagged
- // stack slot.
- unsigned TaggedBasePointerOffset = 0;
+ /// FrameIndex for the tagged base pointer.
+ Optional<int> TaggedBasePointerIndex;
+
+ /// Offset from SP-at-entry to the tagged base pointer.
+ /// Tagged base pointer is set up to point to the first (lowest address)
+ /// tagged stack slot.
+ unsigned TaggedBasePointerOffset;
/// OutliningStyle denotes, if a function was outlined, how it was outlined,
/// e.g. Tail Call, Thunk, or Function if none apply.
Optional<std::string> OutliningStyle;
-public:
- AArch64FunctionInfo() = default;
+ // Offset from SP-after-callee-saved-spills (i.e. SP-at-entry minus
+ // CalleeSavedStackSize) to the address of the frame record.
+ int CalleeSaveBaseToFrameRecordOffset = 0;
- explicit AArch64FunctionInfo(MachineFunction &MF) {
- (void)MF;
+ /// SignReturnAddress is true if PAC-RET is enabled for the function with
+ /// defaults being sign non-leaf functions only, with the B key.
+ bool SignReturnAddress = false;
+
+ /// SignReturnAddressAll modifies the default PAC-RET mode to signing leaf
+ /// functions as well.
+ bool SignReturnAddressAll = false;
+
+ /// SignWithBKey modifies the default PAC-RET mode to signing with the B key.
+ bool SignWithBKey = false;
+
+ /// BranchTargetEnforcement enables placing BTI instructions at potential
+ /// indirect branch destinations.
+ bool BranchTargetEnforcement = false;
+
+public:
+ explicit AArch64FunctionInfo(MachineFunction &MF);
- // If we already know that the function doesn't have a redzone, set
- // HasRedZone here.
- if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone))
- HasRedZone = false;
- }
void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI);
unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; }
@@ -281,15 +297,14 @@ public:
void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
unsigned getJumpTableEntrySize(int Idx) const {
- auto It = JumpTableEntryInfo.find(Idx);
- if (It != JumpTableEntryInfo.end())
- return It->second.first;
- return 4;
+ return JumpTableEntryInfo[Idx].first;
}
MCSymbol *getJumpTableEntryPCRelSymbol(int Idx) const {
- return JumpTableEntryInfo.find(Idx)->second.second;
+ return JumpTableEntryInfo[Idx].second;
}
void setJumpTableEntryInfo(int Idx, unsigned Size, MCSymbol *PCRelSym) {
+ if ((unsigned)Idx >= JumpTableEntryInfo.size())
+ JumpTableEntryInfo.resize(Idx+1);
JumpTableEntryInfo[Idx] = std::make_pair(Size, PCRelSym);
}
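The jump-table info above moves from a DenseMap to a SmallVector, so setJumpTableEntryInfo has to grow the container before indexing; unlike a map's operator[], indexing a vector past its size is undefined behaviour. A generic illustration of the resize-before-assign pattern (plain C++, not the LLVM types):

#include <cstddef>
#include <utility>
#include <vector>

// Grow-on-write indexed store, as done for JumpTableEntryInfo above.
template <typename T>
void setAt(std::vector<T> &V, std::size_t Idx, T Value) {
  if (Idx >= V.size())
    V.resize(Idx + 1);        // default-construct any gap first
  V[Idx] = std::move(Value);  // now the index is in range
}

Since jump-table indices are small and dense, a vector avoids hashing on every lookup, which is why getJumpTableEntrySize can drop its find/fallback path.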
@@ -331,6 +346,11 @@ public:
return ForwardedMustTailRegParms;
}
+ Optional<int> getTaggedBasePointerIndex() const {
+ return TaggedBasePointerIndex;
+ }
+ void setTaggedBasePointerIndex(int Index) { TaggedBasePointerIndex = Index; }
+
unsigned getTaggedBasePointerOffset() const {
return TaggedBasePointerOffset;
}
@@ -338,12 +358,26 @@ public:
TaggedBasePointerOffset = Offset;
}
+ int getCalleeSaveBaseToFrameRecordOffset() const {
+ return CalleeSaveBaseToFrameRecordOffset;
+ }
+ void setCalleeSaveBaseToFrameRecordOffset(int Offset) {
+ CalleeSaveBaseToFrameRecordOffset = Offset;
+ }
+
+ bool shouldSignReturnAddress() const;
+ bool shouldSignReturnAddress(bool SpillsLR) const;
+
+ bool shouldSignWithBKey() const { return SignWithBKey; }
+
+ bool branchTargetEnforcement() const { return BranchTargetEnforcement; }
+
private:
// Hold the lists of LOHs.
MILOHContainer LOHContainerSet;
SetOfInstructions LOHRelated;
- DenseMap<int, std::pair<unsigned, MCSymbol *>> JumpTableEntryInfo;
+ SmallVector<std::pair<unsigned, MCSymbol *>, 2> JumpTableEntryInfo;
};
namespace yaml {
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
index 9a2103579a6a..f3b8ef16d6f9 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -21,7 +21,7 @@ namespace {
/// CMN, CMP, TST followed by Bcc
static bool isArithmeticBccPair(const MachineInstr *FirstMI,
- const MachineInstr &SecondMI) {
+ const MachineInstr &SecondMI, bool CmpOnly) {
if (SecondMI.getOpcode() != AArch64::Bcc)
return false;
@@ -29,6 +29,13 @@ static bool isArithmeticBccPair(const MachineInstr *FirstMI,
if (FirstMI == nullptr)
return true;
+ // If we're in CmpOnly mode, we only fuse arithmetic instructions that
+ // discard their result.
+ if (CmpOnly && !(FirstMI->getOperand(0).getReg() == AArch64::XZR ||
+ FirstMI->getOperand(0).getReg() == AArch64::WZR)) {
+ return false;
+ }
+
switch (FirstMI->getOpcode()) {
case AArch64::ADDSWri:
case AArch64::ADDSWrr:
@@ -380,8 +387,11 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
// All checking functions assume that the 1st instr is a wildcard if it is
// unspecified.
- if (ST.hasArithmeticBccFusion() && isArithmeticBccPair(FirstMI, SecondMI))
- return true;
+ if (ST.hasCmpBccFusion() || ST.hasArithmeticBccFusion()) {
+ bool CmpOnly = !ST.hasArithmeticBccFusion();
+ if (isArithmeticBccPair(FirstMI, SecondMI, CmpOnly))
+ return true;
+ }
if (ST.hasArithmeticCbzFusion() && isArithmeticCbzPair(FirstMI, SecondMI))
return true;
if (ST.hasFuseAES() && isAESPair(FirstMI, SecondMI))
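The shouldScheduleAdjacent change above merges two subtarget features: cores with full arithmetic/Bcc fusion behave as before, while cores that only fuse compare+branch restrict the pair to flag-setting instructions whose destination is XZR/WZR, which is how CMP, CMN and TST are encoded. A condensed sketch of that gate (hypothetical helper, plain C++; the real code also checks the concrete opcode list):

// Condensed Bcc-fusion gate corresponding to the hunk above.
static bool mayFuseWithBcc(bool HasArithBccFusion, bool HasCmpBccFusion,
                           bool FirstWritesZeroReg /* dest is XZR/WZR */) {
  if (!HasArithBccFusion && !HasCmpBccFusion)
    return false;              // no flag-setting + branch fusion at all
  if (HasArithBccFusion)
    return true;               // any flag-setting arithmetic may fuse
  return FirstWritesZeroReg;   // cmp-only cores: the result must be discarded
}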
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
index 0d75ab7ac8a9..019220e3a527 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
@@ -408,6 +408,11 @@ bool AArch64RedundantCopyElimination::optimizeBlock(MachineBasicBlock *MBB) {
O.getReg() != CmpReg;
}))
continue;
+
+ // Don't remove a move immediate that implicitly defines the upper
+ // bits as different.
+ if (TRI->isSuperRegister(DefReg, KnownReg.Reg) && KnownReg.Imm < 0)
+ continue;
}
if (IsCopy)
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 3e9c8c7b6df2..f90856d14b2f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -15,7 +15,6 @@
#include "AArch64FrameLowering.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
-#include "AArch64StackOffset.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
@@ -25,6 +24,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/raw_ostream.h"
@@ -240,6 +240,14 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return SCS ? CSR_AArch64_AAPCS_SCS_RegMask : CSR_AArch64_AAPCS_RegMask;
}
+const uint32_t *AArch64RegisterInfo::getCustomEHPadPreservedMask(
+ const MachineFunction &MF) const {
+ if (MF.getSubtarget<AArch64Subtarget>().isTargetLinux())
+ return CSR_AArch64_AAPCS_RegMask;
+
+ return nullptr;
+}
+
const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const {
if (TT.isOSDarwin())
return CSR_Darwin_AArch64_TLS_RegMask;
@@ -326,16 +334,16 @@ bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
}
bool AArch64RegisterInfo::isAnyArgRegReserved(const MachineFunction &MF) const {
- return std::any_of(std::begin(*AArch64::GPR64argRegClass.MC),
- std::end(*AArch64::GPR64argRegClass.MC),
- [this, &MF](MCPhysReg r){return isReservedReg(MF, r);});
+ return llvm::any_of(*AArch64::GPR64argRegClass.MC, [this, &MF](MCPhysReg r) {
+ return isReservedReg(MF, r);
+ });
}
void AArch64RegisterInfo::emitReservedArgRegCallError(
const MachineFunction &MF) const {
const Function &F = MF.getFunction();
- F.getContext().diagnose(DiagnosticInfoUnsupported{F, "AArch64 doesn't support"
- " function calls if any of the argument registers is reserved."});
+ F.getContext().diagnose(DiagnosticInfoUnsupported{F, ("AArch64 doesn't support"
+ " function calls if any of the argument registers is reserved.")});
}
bool AArch64RegisterInfo::isAsmClobberable(const MachineFunction &MF,
@@ -517,16 +525,16 @@ bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
Register BaseReg,
int64_t Offset) const {
assert(MI && "Unable to get the legal offset for nil instruction.");
- StackOffset SaveOffset(Offset, MVT::i8);
+ StackOffset SaveOffset = StackOffset::getFixed(Offset);
return isAArch64FrameOffsetLegal(*MI, SaveOffset) & AArch64FrameOffsetIsLegal;
}
/// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx
/// at the beginning of the basic block.
-void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
- Register BaseReg,
- int FrameIdx,
- int64_t Offset) const {
+Register
+AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
+ int FrameIdx,
+ int64_t Offset) const {
MachineBasicBlock::iterator Ins = MBB->begin();
DebugLoc DL; // Defaults to "unknown"
if (Ins != MBB->end())
@@ -536,6 +544,7 @@ void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
const MCInstrDesc &MCID = TII->get(AArch64::ADDXri);
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ Register BaseReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this, MF));
unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
@@ -543,19 +552,21 @@ void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
.addFrameIndex(FrameIdx)
.addImm(Offset)
.addImm(Shifter);
+
+ return BaseReg;
}
void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const {
// ARM doesn't need the general 64-bit offsets
- StackOffset Off(Offset, MVT::i8);
+ StackOffset Off = StackOffset::getFixed(Offset);
unsigned i = 0;
-
while (!MI.getOperand(i).isFI()) {
++i;
assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
}
+
const MachineFunction *MF = MI.getParent()->getParent();
const AArch64InstrInfo *TII =
MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
@@ -585,6 +596,33 @@ createScratchRegisterForInstruction(MachineInstr &MI,
}
}
+void AArch64RegisterInfo::getOffsetOpcodes(
+ const StackOffset &Offset, SmallVectorImpl<uint64_t> &Ops) const {
+ // The smallest scalable element supported by scaled SVE addressing
+ // The smallest scalable element supported by scaled SVE addressing
+ // modes is the predicate, which is 2 scalable bytes in size, so the
+ // scalable byte offset must always be a multiple of 2.
+
+ // Add fixed-sized offset using existing DIExpression interface.
+ DIExpression::appendOffset(Ops, Offset.getFixed());
+
+ unsigned VG = getDwarfRegNum(AArch64::VG, true);
+ int64_t VGSized = Offset.getScalable() / 2;
+ if (VGSized > 0) {
+ Ops.push_back(dwarf::DW_OP_constu);
+ Ops.push_back(VGSized);
+ Ops.append({dwarf::DW_OP_bregx, VG, 0ULL});
+ Ops.push_back(dwarf::DW_OP_mul);
+ Ops.push_back(dwarf::DW_OP_plus);
+ } else if (VGSized < 0) {
+ Ops.push_back(dwarf::DW_OP_constu);
+ Ops.push_back(-VGSized);
+ Ops.append({dwarf::DW_OP_bregx, VG, 0ULL});
+ Ops.push_back(dwarf::DW_OP_mul);
+ Ops.push_back(dwarf::DW_OP_minus);
+ }
+}
+
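getOffsetOpcodes above encodes the scalable part of a StackOffset as a DWARF expression that multiplies by the VG register (the SVE vector length in 64-bit units), so the value a debugger or unwinder computes is equivalent to the arithmetic below (standalone sketch; at debug time VG is read from the target, here it is just a parameter):

#include <cstdint>

// Byte offset described by the DW_OP_constu / DW_OP_bregx VG / DW_OP_mul /
// DW_OP_plus (or DW_OP_minus) sequence built above:
//   Fixed + (ScalableBytes / 2) * VG
static int64_t effectiveByteOffset(int64_t Fixed, int64_t ScalableBytes,
                                   uint64_t VG) {
  // ScalableBytes is always a multiple of 2: the smallest scalable unit is a
  // predicate, which is 2 scalable bytes (see the assertion above).
  return Fixed + (ScalableBytes / 2) * static_cast<int64_t>(VG);
}

For example, with a 256-bit vector length VG is 4, so a StackOffset of 16 fixed and 32 scalable bytes resolves to 16 + 16 * 4 = 80 bytes.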
void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
@@ -597,29 +635,31 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
const AArch64InstrInfo *TII =
MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
const AArch64FrameLowering *TFI = getFrameLowering(MF);
-
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
bool Tagged =
MI.getOperand(FIOperandNum).getTargetFlags() & AArch64II::MO_TAGGED;
Register FrameReg;
- // Special handling of dbg_value, stackmap and patchpoint instructions.
- if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP ||
- MI.getOpcode() == TargetOpcode::PATCHPOINT) {
+ // Special handling of dbg_value, stackmap, patchpoint, and statepoint instructions.
+ if (MI.getOpcode() == TargetOpcode::STACKMAP ||
+ MI.getOpcode() == TargetOpcode::PATCHPOINT ||
+ MI.getOpcode() == TargetOpcode::STATEPOINT) {
StackOffset Offset =
TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg,
/*PreferFP=*/true,
/*ForSimm=*/false);
- Offset += StackOffset(MI.getOperand(FIOperandNum + 1).getImm(), MVT::i8);
+ Offset += StackOffset::getFixed(MI.getOperand(FIOperandNum + 1).getImm());
MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/);
- MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getBytes());
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getFixed());
return;
}
if (MI.getOpcode() == TargetOpcode::LOCAL_ESCAPE) {
MachineOperand &FI = MI.getOperand(FIOperandNum);
- int Offset = TFI->getNonLocalFrameIndexReference(MF, FrameIndex);
- FI.ChangeToImmediate(Offset);
+ StackOffset Offset = TFI->getNonLocalFrameIndexReference(MF, FrameIndex);
+ assert(!Offset.getScalable() &&
+ "Frame offsets with a scalable component are not supported");
+ FI.ChangeToImmediate(Offset.getFixed());
return;
}
@@ -628,12 +668,11 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// TAGPstack must use the virtual frame register in its 3rd operand.
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
FrameReg = MI.getOperand(3).getReg();
- Offset = {MFI.getObjectOffset(FrameIndex) +
- AFI->getTaggedBasePointerOffset(),
- MVT::i8};
+ Offset = StackOffset::getFixed(MFI.getObjectOffset(FrameIndex) +
+ AFI->getTaggedBasePointerOffset());
} else if (Tagged) {
- StackOffset SPOffset = {
- MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize(), MVT::i8};
+ StackOffset SPOffset = StackOffset::getFixed(
+ MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize());
if (MFI.hasVarSizedObjects() ||
isAArch64FrameOffsetLegal(MI, SPOffset, nullptr, nullptr, nullptr) !=
(AArch64FrameOffsetCanUpdate | AArch64FrameOffsetIsLegal)) {
@@ -654,8 +693,8 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
return;
}
FrameReg = AArch64::SP;
- Offset = {MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize(),
- MVT::i8};
+ Offset = StackOffset::getFixed(MFI.getObjectOffset(FrameIndex) +
+ (int64_t)MFI.getStackSize());
} else {
Offset = TFI->resolveFrameIndexReference(
MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true);
@@ -726,3 +765,19 @@ unsigned AArch64RegisterInfo::getLocalAddressRegister(
return getBaseRegister();
return getFrameRegister(MF);
}
+
+/// SrcRC and DstRC will be morphed into NewRC if this returns true
+bool AArch64RegisterInfo::shouldCoalesce(
+ MachineInstr *MI, const TargetRegisterClass *SrcRC, unsigned SubReg,
+ const TargetRegisterClass *DstRC, unsigned DstSubReg,
+ const TargetRegisterClass *NewRC, LiveIntervals &LIS) const {
+ if (MI->isCopy() &&
+ ((DstRC->getID() == AArch64::GPR64RegClassID) ||
+ (DstRC->getID() == AArch64::GPR64commonRegClassID)) &&
+ MI->getOperand(0).getSubReg() && MI->getOperand(1).getSubReg())
+ // Do not coalesce in the case of a 32-bit subregister copy that
+ // implements a 32-to-64-bit zero extension and relies on the upper
+ // 32 bits being zeroed.
+ return false;
+ return true;
+}
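The shouldCoalesce hook added above protects 32-bit subregister copies that implement a 32-to-64-bit zero extension: on AArch64 a 32-bit register write implicitly clears bits [63:32], so the copy is only correct while it remains a real instruction. At the source level the guarded pattern is simply a widening conversion (hedged illustration, not taken from the patch):

#include <cstdint>

// Typically lowers to a single 32-bit register-to-register move whose write
// implicitly zeroes the upper half of the 64-bit destination; coalescing that
// W-subregister copy away would leave the upper 32 bits undefined.
uint64_t widen(uint32_t X) {
  return static_cast<uint64_t>(X); // 32 -> 64-bit zero extension
}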
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index 7b20f181e76d..0c871ac089a7 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -72,6 +72,10 @@ public:
// Funclets on ARM64 Windows don't preserve any registers.
const uint32_t *getNoPreservedMask() const override;
+ // Unwinders may not preserve all Neon and SVE registers.
+ const uint32_t *
+ getCustomEHPadPreservedMask(const MachineFunction &MF) const override;
+
/// getThisReturnPreservedMask - Returns a call preserved mask specific to the
/// case that 'returned' is on an i64 first argument if the calling convention
/// is one that can (partially) model this attribute with a preserved mask
@@ -103,9 +107,8 @@ public:
bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg,
int64_t Offset) const override;
- void materializeFrameBaseRegister(MachineBasicBlock *MBB, Register BaseReg,
- int FrameIdx,
- int64_t Offset) const override;
+ Register materializeFrameBaseRegister(MachineBasicBlock *MBB, int FrameIdx,
+ int64_t Offset) const override;
void resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const override;
void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
@@ -125,6 +128,15 @@ public:
unsigned getLocalAddressRegister(const MachineFunction &MF) const;
bool regNeedsCFI(unsigned Reg, unsigned &RegToUseForCFI) const;
+
+ /// SrcRC and DstRC will be morphed into NewRC if this returns true
+ bool shouldCoalesce(MachineInstr *MI, const TargetRegisterClass *SrcRC,
+ unsigned SubReg, const TargetRegisterClass *DstRC,
+ unsigned DstSubReg, const TargetRegisterClass *NewRC,
+ LiveIntervals &LIS) const override;
+
+ void getOffsetOpcodes(const StackOffset &Offset,
+ SmallVectorImpl<uint64_t> &Ops) const override;
};
} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index 54b351fda053..28d1988b8a5f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -711,6 +711,32 @@ def XSeqPairClassOperand :
//===----- END: v8.1a atomic CASP register operands -----------------------===//
+//===----------------------------------------------------------------------===//
+// Armv8.7a accelerator extension register operands: 8 consecutive GPRs
+// starting with an even one
+
+let Namespace = "AArch64" in {
+ foreach i = 0-7 in
+ def "x8sub_"#i : SubRegIndex<64, !mul(64, i)>;
+}
+
+def Tuples8X : RegisterTuples<
+ !foreach(i, [0,1,2,3,4,5,6,7], !cast<SubRegIndex>("x8sub_"#i)),
+ !foreach(i, [0,1,2,3,4,5,6,7], (trunc (decimate (rotl GPR64, i), 2), 12))>;
+
+def GPR64x8Class : RegisterClass<"AArch64", [i64], 64, (trunc Tuples8X, 12)>;
+def GPR64x8AsmOp : AsmOperandClass {
+ let Name = "GPR64x8";
+ let ParserMethod = "tryParseGPR64x8";
+ let RenderMethod = "addRegOperands";
+}
+def GPR64x8 : RegisterOperand<GPR64x8Class, "printGPR64x8"> {
+ let ParserMatchClass = GPR64x8AsmOp;
+ let PrintMethod = "printGPR64x8";
+}
+
+//===----- END: v8.7a accelerator extension register operands -------------===//
+
// SVE predicate registers
def P0 : AArch64Reg<0, "p0">, DwarfRegNum<[48]>;
def P1 : AArch64Reg<1, "p1">, DwarfRegNum<[49]>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
index fc31e701d3af..03b32967a212 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
@@ -221,8 +221,9 @@ shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
// if so, return it.
std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
- if (SIMDInstrTable.find(InstID) != SIMDInstrTable.end())
- return SIMDInstrTable[InstID];
+ auto It = SIMDInstrTable.find(InstID);
+ if (It != SIMDInstrTable.end())
+ return It->second;
unsigned SCIdx = InstDesc->getSchedClass();
const MCSchedClassDesc *SCDesc =
@@ -290,8 +291,9 @@ bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
case Interleave:
std::string Subtarget =
std::string(SchedModel.getSubtargetInfo()->getCPU());
- if (InterlEarlyExit.find(Subtarget) != InterlEarlyExit.end())
- return InterlEarlyExit[Subtarget];
+ auto It = InterlEarlyExit.find(Subtarget);
+ if (It != InterlEarlyExit.end())
+ return It->second;
for (auto &I : IRT) {
OriginalMCID = &TII->get(I.OrigOpc);
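Both hunks above replace a find-then-operator[] pair with a single find whose iterator is reused, so the key is hashed and located only once. The same idiom in isolation (generic C++, unrelated to the LLVM data structures):

#include <optional>
#include <string>
#include <unordered_map>

// Look the key up once and reuse the iterator instead of calling find() and
// then operator[], which would locate the key a second time.
static std::optional<bool>
lookupCached(const std::unordered_map<std::string, bool> &Table,
             const std::string &Key) {
  auto It = Table.find(Key);
  if (It != Table.end())
    return It->second; // cached entry
  return std::nullopt; // caller computes and inserts the value itself
}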
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 4f29f2f18185..e09b8401c0e0 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -152,6 +152,8 @@ def AArch64fmaxv_p : SDNode<"AArch64ISD::FMAXV_PRED", SDT_AArch64Reduce>;
def AArch64fmaxnmv_p : SDNode<"AArch64ISD::FMAXNMV_PRED", SDT_AArch64Reduce>;
def AArch64fminv_p : SDNode<"AArch64ISD::FMINV_PRED", SDT_AArch64Reduce>;
def AArch64fminnmv_p : SDNode<"AArch64ISD::FMINNMV_PRED", SDT_AArch64Reduce>;
+def AArch64saddv_p : SDNode<"AArch64ISD::SADDV_PRED", SDT_AArch64Reduce>;
+def AArch64uaddv_p : SDNode<"AArch64ISD::UADDV_PRED", SDT_AArch64Reduce>;
def AArch64smaxv_p : SDNode<"AArch64ISD::SMAXV_PRED", SDT_AArch64Reduce>;
def AArch64umaxv_p : SDNode<"AArch64ISD::UMAXV_PRED", SDT_AArch64Reduce>;
def AArch64sminv_p : SDNode<"AArch64ISD::SMINV_PRED", SDT_AArch64Reduce>;
@@ -164,29 +166,83 @@ def AArch64lastb : SDNode<"AArch64ISD::LASTB", SDT_AArch64Reduce>;
def SDT_AArch64Arith : SDTypeProfile<1, 3, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>,
- SDTCVecEltisVT<1,i1>, SDTCisSameAs<2,3>
+ SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>, SDTCisSameAs<2,3>
]>;
def SDT_AArch64FMA : SDTypeProfile<1, 4, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCisVec<4>,
- SDTCVecEltisVT<1,i1>, SDTCisSameAs<2,3>, SDTCisSameAs<3,4>
+ SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>, SDTCisSameAs<2,3>, SDTCisSameAs<3,4>
]>;
// Predicated operations with the result of inactive lanes being unspecified.
def AArch64add_p : SDNode<"AArch64ISD::ADD_PRED", SDT_AArch64Arith>;
+def AArch64asr_p : SDNode<"AArch64ISD::SRA_PRED", SDT_AArch64Arith>;
def AArch64fadd_p : SDNode<"AArch64ISD::FADD_PRED", SDT_AArch64Arith>;
+def AArch64fdiv_p : SDNode<"AArch64ISD::FDIV_PRED", SDT_AArch64Arith>;
def AArch64fma_p : SDNode<"AArch64ISD::FMA_PRED", SDT_AArch64FMA>;
+def AArch64fmaxnm_p : SDNode<"AArch64ISD::FMAXNM_PRED", SDT_AArch64Arith>;
+def AArch64fminnm_p : SDNode<"AArch64ISD::FMINNM_PRED", SDT_AArch64Arith>;
+def AArch64fmul_p : SDNode<"AArch64ISD::FMUL_PRED", SDT_AArch64Arith>;
+def AArch64fsub_p : SDNode<"AArch64ISD::FSUB_PRED", SDT_AArch64Arith>;
+def AArch64lsl_p : SDNode<"AArch64ISD::SHL_PRED", SDT_AArch64Arith>;
+def AArch64lsr_p : SDNode<"AArch64ISD::SRL_PRED", SDT_AArch64Arith>;
+def AArch64mul_p : SDNode<"AArch64ISD::MUL_PRED", SDT_AArch64Arith>;
def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>;
+def AArch64smax_p : SDNode<"AArch64ISD::SMAX_PRED", SDT_AArch64Arith>;
+def AArch64smin_p : SDNode<"AArch64ISD::SMIN_PRED", SDT_AArch64Arith>;
+def AArch64sub_p : SDNode<"AArch64ISD::SUB_PRED", SDT_AArch64Arith>;
def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>;
+def AArch64umax_p : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>;
+def AArch64umin_p : SDNode<"AArch64ISD::UMIN_PRED", SDT_AArch64Arith>;
-// Merging op1 into the inactive lanes.
-def AArch64smin_m1 : SDNode<"AArch64ISD::SMIN_MERGE_OP1", SDT_AArch64Arith>;
-def AArch64umin_m1 : SDNode<"AArch64ISD::UMIN_MERGE_OP1", SDT_AArch64Arith>;
-def AArch64smax_m1 : SDNode<"AArch64ISD::SMAX_MERGE_OP1", SDT_AArch64Arith>;
-def AArch64umax_m1 : SDNode<"AArch64ISD::UMAX_MERGE_OP1", SDT_AArch64Arith>;
-def AArch64lsl_m1 : SDNode<"AArch64ISD::SHL_MERGE_OP1", SDT_AArch64Arith>;
-def AArch64lsr_m1 : SDNode<"AArch64ISD::SRL_MERGE_OP1", SDT_AArch64Arith>;
-def AArch64asr_m1 : SDNode<"AArch64ISD::SRA_MERGE_OP1", SDT_AArch64Arith>;
+def SDT_AArch64IntExtend : SDTypeProfile<1, 4, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVT<3, OtherVT>, SDTCisVec<4>,
+ SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>, SDTCisVTSmallerThanOp<3, 2>, SDTCisSameAs<0,4>
+]>;
+
+// Predicated operations with the result of inactive lanes provided by the last operand.
+def AArch64clz_mt : SDNode<"AArch64ISD::CTLZ_MERGE_PASSTHRU", SDT_AArch64Arith>;
+def AArch64cnt_mt : SDNode<"AArch64ISD::CTPOP_MERGE_PASSTHRU", SDT_AArch64Arith>;
+def AArch64fneg_mt : SDNode<"AArch64ISD::FNEG_MERGE_PASSTHRU", SDT_AArch64Arith>;
+def AArch64fabs_mt : SDNode<"AArch64ISD::FABS_MERGE_PASSTHRU", SDT_AArch64Arith>;
+def AArch64abs_mt : SDNode<"AArch64ISD::ABS_MERGE_PASSTHRU", SDT_AArch64Arith>;
+def AArch64neg_mt : SDNode<"AArch64ISD::NEG_MERGE_PASSTHRU", SDT_AArch64Arith>;
+def AArch64sxt_mt : SDNode<"AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU", SDT_AArch64IntExtend>;
+def AArch64uxt_mt : SDNode<"AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU", SDT_AArch64IntExtend>;
+def AArch64frintp_mt : SDNode<"AArch64ISD::FCEIL_MERGE_PASSTHRU", SDT_AArch64Arith>;
+def AArch64frintm_mt : SDNode<"AArch64ISD::FFLOOR_MERGE_PASSTHRU", SDT_AArch64Arith>;
+def AArch64frinti_mt : SDNode<"AArch64ISD::FNEARBYINT_MERGE_PASSTHRU", SDT_AArch64Arith>;
+def AArch64frintx_mt : SDNode<"AArch64ISD::FRINT_MERGE_PASSTHRU", SDT_AArch64Arith>;
+def AArch64frinta_mt : SDNode<"AArch64ISD::FROUND_MERGE_PASSTHRU", SDT_AArch64Arith>;
+def AArch64frintn_mt : SDNode<"AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU", SDT_AArch64Arith>;
+def AArch64frintz_mt : SDNode<"AArch64ISD::FTRUNC_MERGE_PASSTHRU", SDT_AArch64Arith>;
+def AArch64fsqrt_mt : SDNode<"AArch64ISD::FSQRT_MERGE_PASSTHRU", SDT_AArch64Arith>;
+def AArch64frecpx_mt : SDNode<"AArch64ISD::FRECPX_MERGE_PASSTHRU", SDT_AArch64Arith>;
+def AArch64rbit_mt : SDNode<"AArch64ISD::BITREVERSE_MERGE_PASSTHRU", SDT_AArch64Arith>;
+def AArch64revb_mt : SDNode<"AArch64ISD::BSWAP_MERGE_PASSTHRU", SDT_AArch64Arith>;
+
+// These are like the above but we don't yet have need for ISD nodes. They allow
+// a single pattern to match intrinsic and ISD operand layouts.
+def AArch64cls_mt : PatFrags<(ops node:$pg, node:$op, node:$pt), [(int_aarch64_sve_cls node:$pt, node:$pg, node:$op)]>;
+def AArch64cnot_mt : PatFrags<(ops node:$pg, node:$op, node:$pt), [(int_aarch64_sve_cnot node:$pt, node:$pg, node:$op)]>;
+def AArch64not_mt : PatFrags<(ops node:$pg, node:$op, node:$pt), [(int_aarch64_sve_not node:$pt, node:$pg, node:$op)]>;
+
+def SDT_AArch64FCVT : SDTypeProfile<1, 3, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>,
+ SDTCVecEltisVT<1,i1>
+]>;
+
+def SDT_AArch64FCVTR : SDTypeProfile<1, 4, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVec<4>,
+ SDTCVecEltisVT<1,i1>
+]>;
+
+def AArch64fcvtr_mt : SDNode<"AArch64ISD::FP_ROUND_MERGE_PASSTHRU", SDT_AArch64FCVTR>;
+def AArch64fcvte_mt : SDNode<"AArch64ISD::FP_EXTEND_MERGE_PASSTHRU", SDT_AArch64FCVT>;
+def AArch64ucvtf_mt : SDNode<"AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU", SDT_AArch64FCVT>;
+def AArch64scvtf_mt : SDNode<"AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU", SDT_AArch64FCVT>;
+def AArch64fcvtzu_mt : SDNode<"AArch64ISD::FCVTZU_MERGE_PASSTHRU", SDT_AArch64FCVT>;
+def AArch64fcvtzs_mt : SDNode<"AArch64ISD::FCVTZS_MERGE_PASSTHRU", SDT_AArch64FCVT>;
def SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3, [SDTCisVec<1>, SDTCisVec<3>]>;
def AArch64clasta_n : SDNode<"AArch64ISD::CLASTA_N", SDT_AArch64ReduceWithInit>;
@@ -207,6 +263,24 @@ def index_vector : SDNode<"AArch64ISD::INDEX_VECTOR", SDT_IndexVector, []>;
def reinterpret_cast : SDNode<"AArch64ISD::REINTERPRET_CAST", SDTUnaryOp>;
+def setoge_or_setge : PatFrags<(ops node:$lhs, node:$rhs),
+ [(setoge node:$lhs, node:$rhs),
+ (setge node:$lhs, node:$rhs)]>;
+def setogt_or_setgt : PatFrags<(ops node:$lhs, node:$rhs),
+ [(setogt node:$lhs, node:$rhs),
+ (setgt node:$lhs, node:$rhs)]>;
+def setoeq_or_seteq : PatFrags<(ops node:$lhs, node:$rhs),
+ [(setoeq node:$lhs, node:$rhs),
+ (seteq node:$lhs, node:$rhs)]>;
+def setone_or_setne : PatFrags<(ops node:$lhs, node:$rhs),
+ [(setone node:$lhs, node:$rhs),
+ (setne node:$lhs, node:$rhs)]>;
+def AArch64mul_p_oneuse : PatFrag<(ops node:$pred, node:$src1, node:$src2),
+ (AArch64mul_p node:$pred, node:$src1, node:$src2), [{
+ return N->hasOneUse();
+}]>;
+
+
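AArch64mul_p_oneuse above restricts the new MLA/MLS patterns (used further down for MLA_ZPmZZ/MLS_ZPmZZ) to multiplies whose result has a single use: if the product is needed elsewhere, folding it into the accumulate would not remove the MUL. A scalar C++ illustration of why the one-use check matters (illustrative only):

// In f() the product is consumed only by the add, so folding it into a
// multiply-accumulate removes the separate multiply. In g() the product has a
// second use, so the multiply must be emitted anyway and folding gains nothing.
int f(int A, int B, int C) { return C + A * B; }

int g(int A, int B, int C, int *Out) {
  int M = A * B; // second use below keeps the MUL alive
  *Out = M;
  return C + M;
}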
let Predicates = [HasSVE] in {
defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>;
def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">;
@@ -231,6 +305,7 @@ let Predicates = [HasSVE] in {
defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", int_aarch64_sve_subr, DestructiveBinaryCommWithRev, "SUB_ZPmZ", /*isReverseInstr*/ 1>;
defm ADD_ZPZZ : sve_int_bin_pred_bhsd<AArch64add_p>;
+ defm SUB_ZPZZ : sve_int_bin_pred_bhsd<AArch64sub_p>;
let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
defm ADD_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_add>;
@@ -253,12 +328,12 @@ let Predicates = [HasSVE] in {
defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad", int_aarch64_sve_mad>;
defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb", int_aarch64_sve_msb>;
- defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla", int_aarch64_sve_mla>;
- defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls", int_aarch64_sve_mls>;
+ defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla", int_aarch64_sve_mla, add, AArch64mul_p_oneuse>;
+ defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls", int_aarch64_sve_mls, sub, AArch64mul_p_oneuse>;
// SVE predicated integer reductions.
- defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv", int_aarch64_sve_saddv>;
- defm UADDV_VPZ : sve_int_reduce_0_uaddv<0b001, "uaddv", int_aarch64_sve_uaddv, int_aarch64_sve_saddv>;
+ defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv", AArch64saddv_p>;
+ defm UADDV_VPZ : sve_int_reduce_0_uaddv<0b001, "uaddv", AArch64uaddv_p>;
defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv", AArch64smaxv_p>;
defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv", AArch64umaxv_p>;
defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv", AArch64sminv_p>;
@@ -271,25 +346,17 @@ let Predicates = [HasSVE] in {
defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon", xor>;
defm AND_ZI : sve_int_log_imm<0b10, "and", "bic", and>;
- defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", AArch64smax_m1>;
- defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", AArch64smin_m1>;
- defm UMAX_ZI : sve_int_arith_imm1_unsigned<0b01, "umax", AArch64umax_m1>;
- defm UMIN_ZI : sve_int_arith_imm1_unsigned<0b11, "umin", AArch64umin_m1>;
-
- defm MUL_ZI : sve_int_arith_imm2<"mul", mul>;
- defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul", int_aarch64_sve_mul>;
- defm SMULH_ZPmZ : sve_int_bin_pred_arit_2<0b010, "smulh", int_aarch64_sve_smulh>;
- defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh", int_aarch64_sve_umulh>;
-
- // Add unpredicated alternative for the mul instruction.
- def : Pat<(mul nxv16i8:$Op1, nxv16i8:$Op2),
- (MUL_ZPmZ_B (PTRUE_B 31), $Op1, $Op2)>;
- def : Pat<(mul nxv8i16:$Op1, nxv8i16:$Op2),
- (MUL_ZPmZ_H (PTRUE_H 31), $Op1, $Op2)>;
- def : Pat<(mul nxv4i32:$Op1, nxv4i32:$Op2),
- (MUL_ZPmZ_S (PTRUE_S 31), $Op1, $Op2)>;
- def : Pat<(mul nxv2i64:$Op1, nxv2i64:$Op2),
- (MUL_ZPmZ_D (PTRUE_D 31), $Op1, $Op2)>;
+ defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", AArch64smax_p>;
+ defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", AArch64smin_p>;
+ defm UMAX_ZI : sve_int_arith_imm1_unsigned<0b01, "umax", AArch64umax_p>;
+ defm UMIN_ZI : sve_int_arith_imm1_unsigned<0b11, "umin", AArch64umin_p>;
+
+ defm MUL_ZI : sve_int_arith_imm2<"mul", AArch64mul_p>;
+ defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul", "MUL_ZPZZ", int_aarch64_sve_mul, DestructiveBinaryComm>;
+ defm SMULH_ZPmZ : sve_int_bin_pred_arit_2<0b010, "smulh", "SMULH_ZPZZ", int_aarch64_sve_smulh, DestructiveBinaryComm>;
+ defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh", "UMULH_ZPZZ", int_aarch64_sve_umulh, DestructiveBinaryComm>;
+
+ defm MUL_ZPZZ : sve_int_bin_pred_bhsd<AArch64mul_p>;
defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv", "SDIV_ZPZZ", int_aarch64_sve_sdiv, DestructiveBinaryCommWithRev, "SDIVR_ZPmZ">;
defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv", "UDIV_ZPZZ", int_aarch64_sve_udiv, DestructiveBinaryCommWithRev, "UDIVR_ZPmZ">;
@@ -305,34 +372,34 @@ let Predicates = [HasSVE] in {
defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot", int_aarch64_sve_sdot_lane>;
defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot", int_aarch64_sve_udot_lane>;
- defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb", int_aarch64_sve_sxtb>;
- defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb", int_aarch64_sve_uxtb>;
- defm SXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b010, "sxth", int_aarch64_sve_sxth>;
- defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth", int_aarch64_sve_uxth>;
- defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw", int_aarch64_sve_sxtw>;
- defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw", int_aarch64_sve_uxtw>;
- defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs", int_aarch64_sve_abs>;
- defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg", int_aarch64_sve_neg>;
-
- defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls", int_aarch64_sve_cls>;
- defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz", int_aarch64_sve_clz>;
- defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt", int_aarch64_sve_cnt>;
-
- let Predicates = [HasSVE, HasBF16] in {
- def : SVE_3_Op_Pat<nxv8i16, int_aarch64_sve_cnt, nxv8i16, nxv8i1, nxv8bf16, !cast<Instruction>(CNT_ZPmZ_H)>;
- }
-
- defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot", int_aarch64_sve_cnot>;
- defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not", int_aarch64_sve_not>;
- defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", int_aarch64_sve_fabs>;
- defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", int_aarch64_sve_fneg>;
-
- defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", AArch64smax_m1>;
- defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", AArch64umax_m1>;
- defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", AArch64smin_m1>;
- defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin", AArch64umin_m1>;
- defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd", int_aarch64_sve_sabd>;
- defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd", int_aarch64_sve_uabd>;
+ defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb", AArch64sxt_mt>;
+ defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb", AArch64uxt_mt>;
+ defm SXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b010, "sxth", AArch64sxt_mt>;
+ defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth", AArch64uxt_mt>;
+ defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw", AArch64sxt_mt>;
+ defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw", AArch64uxt_mt>;
+ defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs", AArch64abs_mt>;
+ defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg", AArch64neg_mt>;
+
+ defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls", AArch64cls_mt>;
+ defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz", AArch64clz_mt>;
+ defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt", AArch64cnt_mt>;
+ defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot", AArch64cnot_mt>;
+ defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not", AArch64not_mt>;
+ defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", AArch64fabs_mt>;
+ defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", AArch64fneg_mt>;
+
+ defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", "SMAX_ZPZZ", int_aarch64_sve_smax, DestructiveBinaryComm>;
+ defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", "UMAX_ZPZZ", int_aarch64_sve_umax, DestructiveBinaryComm>;
+ defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", "SMIN_ZPZZ", int_aarch64_sve_smin, DestructiveBinaryComm>;
+ defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin", "UMIN_ZPZZ", int_aarch64_sve_umin, DestructiveBinaryComm>;
+ defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd", "SABD_ZPZZ", int_aarch64_sve_sabd, DestructiveBinaryComm>;
+ defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd", "UABD_ZPZZ", int_aarch64_sve_uabd, DestructiveBinaryComm>;
+
+ defm SMAX_ZPZZ : sve_int_bin_pred_bhsd<AArch64smax_p>;
+ defm UMAX_ZPZZ : sve_int_bin_pred_bhsd<AArch64umax_p>;
+ defm SMIN_ZPZZ : sve_int_bin_pred_bhsd<AArch64smin_p>;
+ defm UMIN_ZPZZ : sve_int_bin_pred_bhsd<AArch64umin_p>;
defm FRECPE_ZZ : sve_fp_2op_u_zd<0b110, "frecpe", int_aarch64_sve_frecpe_x>;
defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte", int_aarch64_sve_frsqrte_x>;
@@ -361,6 +428,11 @@ let Predicates = [HasSVE] in {
defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv", "FDIV_ZPZZ", int_aarch64_sve_fdiv, DestructiveBinaryCommWithRev, "FDIVR_ZPmZ">;
defm FADD_ZPZZ : sve_fp_bin_pred_hfd<AArch64fadd_p>;
+ defm FSUB_ZPZZ : sve_fp_bin_pred_hfd<AArch64fsub_p>;
+ defm FMUL_ZPZZ : sve_fp_bin_pred_hfd<AArch64fmul_p>;
+ defm FMAXNM_ZPZZ : sve_fp_bin_pred_hfd<AArch64fmaxnm_p>;
+ defm FMINNM_ZPZZ : sve_fp_bin_pred_hfd<AArch64fminnm_p>;
+ defm FDIV_ZPZZ : sve_fp_bin_pred_hfd<AArch64fdiv_p>;
let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
defm FADD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fadd>;
@@ -377,10 +449,10 @@ let Predicates = [HasSVE] in {
defm FDIV_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdiv>;
}
- defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd>;
- defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", fsub>;
- defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", fmul>;
- defm FTSMUL_ZZZ : sve_fp_3op_u_zd_ftsmul<0b011, "ftsmul", int_aarch64_sve_ftsmul_x>;
+ defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd, AArch64fadd_p>;
+ defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", fsub, AArch64fsub_p>;
+ defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", fmul, AArch64fmul_p>;
+ defm FTSMUL_ZZZ : sve_fp_3op_u_zd_ftsmul<0b011, "ftsmul", int_aarch64_sve_ftsmul_x>;
defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", int_aarch64_sve_frecps_x>;
defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", int_aarch64_sve_frsqrts_x>;
@@ -404,8 +476,14 @@ let Predicates = [HasSVE] in {
// regalloc.
def : Pat<(nxv8f16 (AArch64fma_p nxv8i1:$P, nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3)),
(FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>;
+ def : Pat<(nxv4f16 (AArch64fma_p nxv4i1:$P, nxv4f16:$Op1, nxv4f16:$Op2, nxv4f16:$Op3)),
+ (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>;
+ def : Pat<(nxv2f16 (AArch64fma_p nxv2i1:$P, nxv2f16:$Op1, nxv2f16:$Op2, nxv2f16:$Op3)),
+ (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>;
def : Pat<(nxv4f32 (AArch64fma_p nxv4i1:$P, nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3)),
(FMLA_ZPmZZ_S $P, $Op3, $Op1, $Op2)>;
+ def : Pat<(nxv2f32 (AArch64fma_p nxv2i1:$P, nxv2f32:$Op1, nxv2f32:$Op2, nxv2f32:$Op3)),
+ (FMLA_ZPmZZ_S $P, $Op3, $Op1, $Op2)>;
def : Pat<(nxv2f64 (AArch64fma_p nxv2i1:$P, nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3)),
(FMLA_ZPmZZ_D $P, $Op3, $Op1, $Op2)>;
@@ -425,15 +503,6 @@ let Predicates = [HasSVE] in {
defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv", AArch64fmaxv_p>;
defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv", AArch64fminv_p>;
- // Use more efficient NEON instructions to extract elements within the NEON
- // part (first 128bits) of an SVE register.
- def : Pat<(vector_extract (nxv8f16 ZPR:$Zs), (i64 0)),
- (f16 (EXTRACT_SUBREG (v8f16 (EXTRACT_SUBREG ZPR:$Zs, zsub)), hsub))>;
- def : Pat<(vector_extract (nxv4f32 ZPR:$Zs), (i64 0)),
- (f32 (EXTRACT_SUBREG (v4f32 (EXTRACT_SUBREG ZPR:$Zs, zsub)), ssub))>;
- def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)),
- (f64 (EXTRACT_SUBREG (v2f64 (EXTRACT_SUBREG ZPR:$Zs, zsub)), dsub))>;
-
// Splat immediate (unpredicated)
defm DUP_ZI : sve_int_dup_imm<"dup">;
defm FDUP_ZI : sve_int_dup_fpimm<"fdup">;
@@ -452,11 +521,6 @@ let Predicates = [HasSVE] in {
defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy", AArch64dup_mt>;
defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy", AArch64dup_mt>;
- let Predicates = [HasSVE, HasBF16] in {
- def : Pat<(nxv8bf16 (AArch64dup_mt nxv8i1:$pg, bf16:$splat, nxv8bf16:$passthru)),
- (CPY_ZPmV_H $passthru, $pg, $splat)>;
- }
-
// Duplicate FP scalar into all vector elements
def : Pat<(nxv8f16 (AArch64dup (f16 FPR16:$src))),
(DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
@@ -470,10 +534,8 @@ let Predicates = [HasSVE] in {
(DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>;
def : Pat<(nxv2f64 (AArch64dup (f64 FPR64:$src))),
(DUP_ZZI_D (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), 0)>;
- let Predicates = [HasSVE, HasBF16] in {
- def : Pat<(nxv8bf16 (AArch64dup (bf16 FPR16:$src))),
- (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
- }
+ def : Pat<(nxv8bf16 (AArch64dup (bf16 FPR16:$src))),
+ (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
// Duplicate +0.0 into all vector elements
def : Pat<(nxv8f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
@@ -482,9 +544,7 @@ let Predicates = [HasSVE] in {
def : Pat<(nxv4f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
def : Pat<(nxv2f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
def : Pat<(nxv2f64 (AArch64dup (f64 fpimm0))), (DUP_ZI_D 0, 0)>;
- let Predicates = [HasSVE, HasBF16] in {
- def : Pat<(nxv8bf16 (AArch64dup (bf16 fpimm0))), (DUP_ZI_H 0, 0)>;
- }
+ def : Pat<(nxv8bf16 (AArch64dup (bf16 fpimm0))), (DUP_ZI_H 0, 0)>;
// Duplicate Int immediate into all vector elements
def : Pat<(nxv16i8 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))),
@@ -513,36 +573,23 @@ let Predicates = [HasSVE] in {
}
// Select elements from either vector (predicated)
- defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>;
+ defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>;
defm SPLICE_ZPZ : sve_int_perm_splice<"splice", int_aarch64_sve_splice>;
- let Predicates = [HasSVE, HasBF16] in {
- def : SVE_3_Op_Pat<nxv8bf16, vselect, nxv8i1, nxv8bf16, nxv8bf16, SEL_ZPZZ_H>;
- def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_splice, nxv8i1, nxv8bf16, nxv8bf16, SPLICE_ZPZ_H>;
- }
-
defm COMPACT_ZPZ : sve_int_perm_compact<"compact", int_aarch64_sve_compact>;
defm INSR_ZR : sve_int_perm_insrs<"insr", AArch64insr>;
defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>;
defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>;
- let Predicates = [HasSVE, HasBF16] in {
- def : SVE_2_Op_Pat<nxv8bf16, AArch64insr, nxv8bf16, bf16, INSR_ZV_H>;
- }
-
- defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", int_aarch64_sve_rbit>;
- defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", int_aarch64_sve_revb, bswap>;
+ defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", AArch64rbit_mt>;
+ defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", AArch64revb_mt>;
defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", int_aarch64_sve_revh>;
defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw", int_aarch64_sve_revw>;
defm REV_PP : sve_int_perm_reverse_p<"rev", AArch64rev>;
defm REV_ZZ : sve_int_perm_reverse_z<"rev", AArch64rev>;
- let Predicates = [HasSVE, HasBF16] in {
- def : SVE_1_Op_Pat<nxv8bf16, AArch64rev, nxv8bf16, REV_ZZ_H>;
- }
-
defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo", AArch64sunpklo>;
defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi", AArch64sunpkhi>;
defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo", AArch64uunpklo>;
@@ -599,23 +646,11 @@ let Predicates = [HasSVE] in {
defm CLASTA_ZPZ : sve_int_perm_clast_zz<0, "clasta", int_aarch64_sve_clasta>;
defm CLASTB_ZPZ : sve_int_perm_clast_zz<1, "clastb", int_aarch64_sve_clastb>;
- let Predicates = [HasSVE, HasBF16] in {
- def : SVE_3_Op_Pat<bf16, AArch64clasta_n, nxv8i1, bf16, nxv8bf16, CLASTA_VPZ_H>;
- def : SVE_3_Op_Pat<bf16, AArch64clastb_n, nxv8i1, bf16, nxv8bf16, CLASTB_VPZ_H>;
- def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_clasta, nxv8i1, nxv8bf16, nxv8bf16, CLASTA_ZPZ_H>;
- def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_clastb, nxv8i1, nxv8bf16, nxv8bf16, CLASTB_ZPZ_H>;
- }
-
defm LASTA_RPZ : sve_int_perm_last_r<0, "lasta", AArch64lasta>;
defm LASTB_RPZ : sve_int_perm_last_r<1, "lastb", AArch64lastb>;
defm LASTA_VPZ : sve_int_perm_last_v<0, "lasta", AArch64lasta>;
defm LASTB_VPZ : sve_int_perm_last_v<1, "lastb", AArch64lastb>;
- let Predicates = [HasSVE, HasBF16] in {
- def : SVE_2_Op_Pat<bf16, AArch64lasta, nxv8i1, nxv8bf16, LASTA_VPZ_H>;
- def : SVE_2_Op_Pat<bf16, AArch64lastb, nxv8i1, nxv8bf16, LASTB_VPZ_H>;
- }
-
// continuous load with reg+immediate
defm LD1B_IMM : sve_mem_cld_si<0b0000, "ld1b", Z_b, ZPR8>;
defm LD1B_H_IMM : sve_mem_cld_si<0b0001, "ld1b", Z_h, ZPR16>;
@@ -1000,7 +1035,7 @@ let Predicates = [HasSVE] in {
def PRFS_PRR : sve_mem_prfm_ss<0b101, "prfw", GPR64NoXZRshifted32>;
def PRFD_PRR : sve_mem_prfm_ss<0b111, "prfd", GPR64NoXZRshifted64>;
-multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instruction RegImmInst, Instruction RegRegInst, int scale, ComplexPattern AddrCP> {
+ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instruction RegImmInst, Instruction RegRegInst, int scale, ComplexPattern AddrCP> {
// reg + imm
let AddedComplexity = 2 in {
def _reg_imm : Pat<(prefetch (PredTy PPR_3b:$gp), (am_sve_indexed_s6 GPR64sp:$base, simm6s1:$offset), (i32 sve_prfop:$prfop)),
@@ -1082,10 +1117,6 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>;
- let Predicates = [HasSVE, HasBF16] in {
- def : SVE_2_Op_Pat<nxv8bf16, AArch64tbl, nxv8bf16, nxv8i16, TBL_ZZZ_H>;
- }
-
defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>;
defm ZIP2_ZZZ : sve_int_perm_bin_perm_zz<0b001, "zip2", AArch64zip2>;
defm UZP1_ZZZ : sve_int_perm_bin_perm_zz<0b010, "uzp1", AArch64uzp1>;
@@ -1093,15 +1124,6 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
defm TRN1_ZZZ : sve_int_perm_bin_perm_zz<0b100, "trn1", AArch64trn1>;
defm TRN2_ZZZ : sve_int_perm_bin_perm_zz<0b101, "trn2", AArch64trn2>;
- let Predicates = [HasSVE, HasBF16] in {
- def : SVE_2_Op_Pat<nxv8bf16, AArch64zip1, nxv8bf16, nxv8bf16, ZIP1_ZZZ_H>;
- def : SVE_2_Op_Pat<nxv8bf16, AArch64zip2, nxv8bf16, nxv8bf16, ZIP2_ZZZ_H>;
- def : SVE_2_Op_Pat<nxv8bf16, AArch64uzp1, nxv8bf16, nxv8bf16, UZP1_ZZZ_H>;
- def : SVE_2_Op_Pat<nxv8bf16, AArch64uzp2, nxv8bf16, nxv8bf16, UZP2_ZZZ_H>;
- def : SVE_2_Op_Pat<nxv8bf16, AArch64trn1, nxv8bf16, nxv8bf16, TRN1_ZZZ_H>;
- def : SVE_2_Op_Pat<nxv8bf16, AArch64trn2, nxv8bf16, nxv8bf16, TRN2_ZZZ_H>;
- }
-
defm ZIP1_PPP : sve_int_perm_bin_perm_pp<0b000, "zip1", AArch64zip1>;
defm ZIP2_PPP : sve_int_perm_bin_perm_pp<0b001, "zip2", AArch64zip2>;
defm UZP1_PPP : sve_int_perm_bin_perm_pp<0b010, "uzp1", AArch64uzp1>;
@@ -1123,6 +1145,29 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))),
(ZIP2_PPP_B PPR:$Ps, (PFALSE))>;
+ // Extract subvectors from FP SVE vectors
+ def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 0))),
+ (UUNPKLO_ZZ_D ZPR:$Zs)>;
+ def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 2))),
+ (UUNPKHI_ZZ_D ZPR:$Zs)>;
+ def : Pat<(nxv4f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 0))),
+ (UUNPKLO_ZZ_S ZPR:$Zs)>;
+ def : Pat<(nxv4f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 4))),
+ (UUNPKHI_ZZ_S ZPR:$Zs)>;
+ def : Pat<(nxv2f32 (extract_subvector (nxv4f32 ZPR:$Zs), (i64 0))),
+ (UUNPKLO_ZZ_D ZPR:$Zs)>;
+ def : Pat<(nxv2f32 (extract_subvector (nxv4f32 ZPR:$Zs), (i64 2))),
+ (UUNPKHI_ZZ_D ZPR:$Zs)>;
+
+ def : Pat<(nxv2bf16 (extract_subvector (nxv4bf16 ZPR:$Zs), (i64 0))),
+ (UUNPKLO_ZZ_D ZPR:$Zs)>;
+ def : Pat<(nxv2bf16 (extract_subvector (nxv4bf16 ZPR:$Zs), (i64 2))),
+ (UUNPKHI_ZZ_D ZPR:$Zs)>;
+ def : Pat<(nxv4bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 0))),
+ (UUNPKLO_ZZ_S ZPR:$Zs)>;
+ def : Pat<(nxv4bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 4))),
+ (UUNPKHI_ZZ_S ZPR:$Zs)>;
+
// Concatenate two predicates.
def : Pat<(nxv4i1 (concat_vectors nxv2i1:$p1, nxv2i1:$p2)),
(UZP1_PPP_S $p1, $p2)>;
@@ -1131,6 +1176,18 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
def : Pat<(nxv16i1 (concat_vectors nxv8i1:$p1, nxv8i1:$p2)),
(UZP1_PPP_B $p1, $p2)>;
+ // Concatenate two floating point vectors.
+ def : Pat<(nxv4f16 (concat_vectors nxv2f16:$v1, nxv2f16:$v2)),
+ (UZP1_ZZZ_S $v1, $v2)>;
+ def : Pat<(nxv8f16 (concat_vectors nxv4f16:$v1, nxv4f16:$v2)),
+ (UZP1_ZZZ_H $v1, $v2)>;
+ def : Pat<(nxv4f32 (concat_vectors nxv2f32:$v1, nxv2f32:$v2)),
+ (UZP1_ZZZ_S $v1, $v2)>;
+ def : Pat<(nxv4bf16 (concat_vectors nxv2bf16:$v1, nxv2bf16:$v2)),
+ (UZP1_ZZZ_S $v1, $v2)>;
+ def : Pat<(nxv8bf16 (concat_vectors nxv4bf16:$v1, nxv4bf16:$v2)),
+ (UZP1_ZZZ_H $v1, $v2)>;
+
defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>;
defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>;
defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>;
@@ -1160,10 +1217,10 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo", SETULT, SETUGT>;
defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls", SETULE, SETUGE>;
- defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", int_aarch64_sve_fcmpge, setoge>;
- defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", int_aarch64_sve_fcmpgt, setogt>;
- defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", int_aarch64_sve_fcmpeq, setoeq>;
- defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", int_aarch64_sve_fcmpne, setone>;
+ defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", int_aarch64_sve_fcmpge, setoge_or_setge>;
+ defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", int_aarch64_sve_fcmpgt, setogt_or_setgt>;
+ defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", int_aarch64_sve_fcmpeq, setoeq_or_seteq>;
+ defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", int_aarch64_sve_fcmpne, setone_or_setne>;
defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", int_aarch64_sve_fcmpuo, setuo>;
defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge", int_aarch64_sve_facge>;
defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt", int_aarch64_sve_facgt>;
@@ -1288,82 +1345,145 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
defm INDEX_II : sve_int_index_ii<"index", index_vector>;
// Unpredicated shifts
- defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr", AArch64asr_m1>;
- defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", AArch64lsr_m1>;
- defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", AArch64lsl_m1>;
+ defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr", AArch64asr_p>;
+ defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", AArch64lsr_p>;
+ defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", AArch64lsl_p>;
defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">;
defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">;
defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl">;
// Predicated shifts
- defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr", "ASR_ZPZI">;
- defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr", "LSR_ZPZI">;
- defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0011, "lsl">;
- defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd", "ASRD_ZPZI", int_aarch64_sve_asrd>;
+ defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right_dup<0b0000, "asr", "ASR_ZPZI", int_aarch64_sve_asr>;
+ defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right_dup<0b0001, "lsr", "LSR_ZPZI", int_aarch64_sve_lsr>;
+ defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left_dup< 0b0011, "lsl", "LSL_ZPZI", int_aarch64_sve_lsl>;
+ defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right< 0b0100, "asrd", "ASRD_ZPZI", int_aarch64_sve_asrd>;
+
+ defm ASR_ZPZI : sve_int_shift_pred_bhsd<AArch64asr_p, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>;
+ defm LSR_ZPZI : sve_int_shift_pred_bhsd<AArch64lsr_p, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>;
+ defm LSL_ZPZI : sve_int_shift_pred_bhsd<AArch64lsl_p, SVEShiftImmL8, SVEShiftImmL16, SVEShiftImmL32, SVEShiftImmL64>;
let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
- defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<AArch64asr_m1>;
- defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<AArch64lsr_m1>;
- defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd<AArch64lsl_m1>;
+ defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_asr>;
+ defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsr>;
+ defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsl>;
defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_asrd>;
}
- defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", AArch64asr_m1, "ASRR_ZPmZ">;
- defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", "LSR_ZPZZ", AArch64lsr_m1, "LSRR_ZPmZ">;
- defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", "LSL_ZPZZ", AArch64lsl_m1, "LSLR_ZPmZ">;
+ defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", int_aarch64_sve_asr, "ASRR_ZPmZ">;
+ defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", "LSR_ZPZZ", int_aarch64_sve_lsr, "LSRR_ZPmZ">;
+ defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", "LSL_ZPZZ", int_aarch64_sve_lsl, "LSLR_ZPmZ">;
defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr", "ASRR_ZPZZ", null_frag, "ASR_ZPmZ", /*isReverseInstr*/ 1>;
defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", "LSRR_ZPZZ", null_frag, "LSR_ZPmZ", /*isReverseInstr*/ 1>;
defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", "LSLR_ZPZZ", null_frag, "LSL_ZPmZ", /*isReverseInstr*/ 1>;
+ defm ASR_ZPZZ : sve_int_bin_pred_bhsd<AArch64asr_p>;
+ defm LSR_ZPZZ : sve_int_bin_pred_bhsd<AArch64lsr_p>;
+ defm LSL_ZPZZ : sve_int_bin_pred_bhsd<AArch64lsl_p>;
+
defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr", int_aarch64_sve_asr_wide>;
defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr", int_aarch64_sve_lsr_wide>;
defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl", int_aarch64_sve_lsl_wide>;
- defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, nxv8f16, nxv4i1, nxv4f32, ElementSizeS>;
- defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, nxv4f32, nxv4i1, nxv8f16, ElementSizeS>;
- defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, int_aarch64_sve_scvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>;
- defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, int_aarch64_sve_scvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>;
- defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, int_aarch64_sve_ucvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>;
- defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, int_aarch64_sve_ucvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>;
- defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, int_aarch64_sve_fcvtzs, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>;
- defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, int_aarch64_sve_fcvtzs, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>;
- defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, int_aarch64_sve_fcvtzu, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>;
- defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, int_aarch64_sve_fcvtzu, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>;
- defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, nxv8f16, nxv2i1, nxv2f64, ElementSizeD>;
- defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, nxv2f64, nxv2i1, nxv8f16, ElementSizeD>;
- defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, nxv4f32, nxv2i1, nxv2f64, ElementSizeD>;
- defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, nxv2f64, nxv2i1, nxv4f32, ElementSizeD>;
- defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
- defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
- defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, nxv8f16, nxv4i1, nxv4i32, ElementSizeS>;
- defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, nxv4f32, nxv2i1, nxv2i64, ElementSizeD>;
- defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, nxv8f16, nxv4i1, nxv4i32, ElementSizeS>;
- defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, nxv8f16, nxv2i1, nxv2i64, ElementSizeD>;
- defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, nxv4f32, nxv2i1, nxv2i64, ElementSizeD>;
- defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, nxv8f16, nxv2i1, nxv2i64, ElementSizeD>;
- defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, int_aarch64_sve_scvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
- defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, int_aarch64_sve_ucvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
- defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
- defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
- defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, nxv2i64, nxv2i1, nxv4f32, ElementSizeD>;
- defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, nxv4i32, nxv4i1, nxv8f16, ElementSizeS>;
- defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, nxv2i64, nxv2i1, nxv8f16, ElementSizeD>;
- defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, nxv4i32, nxv4i1, nxv8f16, ElementSizeS>;
- defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, nxv2i64, nxv2i1, nxv8f16, ElementSizeD>;
- defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, nxv2i64, nxv2i1, nxv4f32, ElementSizeD>;
- defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, int_aarch64_sve_fcvtzs, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;
- defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, int_aarch64_sve_fcvtzu, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;
-
- defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn", int_aarch64_sve_frintn>;
- defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp", int_aarch64_sve_frintp>;
- defm FRINTM_ZPmZ : sve_fp_2op_p_zd_HSD<0b00010, "frintm", int_aarch64_sve_frintm>;
- defm FRINTZ_ZPmZ : sve_fp_2op_p_zd_HSD<0b00011, "frintz", int_aarch64_sve_frintz>;
- defm FRINTA_ZPmZ : sve_fp_2op_p_zd_HSD<0b00100, "frinta", int_aarch64_sve_frinta>;
- defm FRINTX_ZPmZ : sve_fp_2op_p_zd_HSD<0b00110, "frintx", int_aarch64_sve_frintx>;
- defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti", int_aarch64_sve_frinti>;
- defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx", int_aarch64_sve_frecpx>;
- defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", int_aarch64_sve_fsqrt>;
+ defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zdr<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, AArch64fcvtr_mt, nxv4f16, nxv4i1, nxv4f32, ElementSizeS>;
+ defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd< 0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, AArch64fcvte_mt, nxv4f32, nxv4i1, nxv4f16, ElementSizeS>;
+ defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0110010, "scvtf", ZPR16, ZPR16, null_frag, AArch64scvtf_mt, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>;
+ defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1010100, "scvtf", ZPR32, ZPR32, null_frag, AArch64scvtf_mt, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>;
+ defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1010101, "ucvtf", ZPR32, ZPR32, null_frag, AArch64ucvtf_mt, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>;
+ defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0110011, "ucvtf", ZPR16, ZPR16, null_frag, AArch64ucvtf_mt, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>;
+ defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0111010, "fcvtzs", ZPR16, ZPR16, null_frag, AArch64fcvtzs_mt, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>;
+ defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1011100, "fcvtzs", ZPR32, ZPR32, null_frag, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>;
+ defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0111011, "fcvtzu", ZPR16, ZPR16, null_frag, AArch64fcvtzu_mt, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>;
+ defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1011101, "fcvtzu", ZPR32, ZPR32, null_frag, AArch64fcvtzu_mt, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>;
+ defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zdr<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, AArch64fcvtr_mt, nxv2f16, nxv2i1, nxv2f64, ElementSizeD>;
+ defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f16, ElementSizeD>;
+ defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zdr<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, AArch64fcvtr_mt, nxv2f32, nxv2i1, nxv2f64, ElementSizeD>;
+ defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f32, ElementSizeD>;
+ defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
+ defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
+ defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, AArch64ucvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>;
+ defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, AArch64scvtf_mt, nxv2f32, nxv2i1, nxv2i64, ElementSizeD>;
+ defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, AArch64scvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>;
+ defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd< 0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, AArch64scvtf_mt, nxv2f16, nxv2i1, nxv2i64, ElementSizeD>;
+ defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, AArch64ucvtf_mt, nxv2f32, nxv2i1, nxv2i64, ElementSizeD>;
+ defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd< 0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, AArch64ucvtf_mt, nxv2f16, nxv2i1, nxv2i64, ElementSizeD>;
+ defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1110110, "scvtf", ZPR64, ZPR64, null_frag, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
+ defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1110111, "ucvtf", ZPR64, ZPR64, null_frag, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
+ defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, null_frag, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
+ defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, null_frag, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
+ defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>;
+ defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd< 0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>;
+ defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>;
+ defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd< 0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, AArch64fcvtzu_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>;
+ defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>;
+ defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>;
+ defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1111110, "fcvtzs", ZPR64, ZPR64, null_frag, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;
+ defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1111111, "fcvtzu", ZPR64, ZPR64, null_frag, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;
+
+ def : Pat<(nxv2f32 (AArch64fcvte_mt (nxv2i1 PPR:$Pg), (nxv2f16 ZPR:$Zs), (nxv2f32 ZPR:$Zd))),
+ (FCVT_ZPmZ_HtoS ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+
+ // FP_ROUND has an additional 'precise' flag which indicates the type of rounding.
+ // This is ignored by the pattern below where it is matched by (i64 timm0_1)
+ def : Pat<(nxv2f16 (AArch64fcvtr_mt (nxv2i1 PPR:$Pg), (nxv2f32 ZPR:$Zs), (i64 timm0_1), (nxv2f16 ZPR:$Zd))),
+ (FCVT_ZPmZ_StoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+
+ // Signed integer -> Floating-point
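+ // Note: unpacked integer vectors keep their payload in the low bits of each
+ // wider container lane, so a narrower signed source type is written as a
+ // sign-extend-in-register of the container (e.g. sext_inreg of an nxv2i64
+ // from nxv2i16); each pattern below selects the matching convert instruction.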
+ def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 PPR:$Pg),
+ (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (nxv2f16 ZPR:$Zd))),
+ (SCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+
+ def : Pat<(nxv4f16 (AArch64scvtf_mt (nxv4i1 PPR:$Pg),
+ (sext_inreg (nxv4i32 ZPR:$Zs), nxv4i16), (nxv4f16 ZPR:$Zd))),
+ (SCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+
+ def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 PPR:$Pg),
+ (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f16 ZPR:$Zd))),
+ (SCVTF_ZPmZ_StoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+
+ def : Pat<(nxv2f32 (AArch64scvtf_mt (nxv2i1 PPR:$Pg),
+ (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f32 ZPR:$Zd))),
+ (SCVTF_ZPmZ_StoS ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+
+ def : Pat<(nxv2f64 (AArch64scvtf_mt (nxv2i1 PPR:$Pg),
+ (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f64 ZPR:$Zd))),
+ (SCVTF_ZPmZ_StoD ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+
+ // Unsigned integer -> Floating-point
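+ // Note: the unsigned patterns below express the narrower source type by
+ // masking each container lane with a dup'd 0xFFFF / 0xFFFFFFFF, playing the
+ // same role as the sext_inreg in the signed patterns above.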
+ def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg),
+ (and (nxv2i64 ZPR:$Zs),
+ (nxv2i64 (AArch64dup (i64 0xFFFF)))), (nxv2f16 ZPR:$Zd))),
+ (UCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+
+ def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg),
+ (and (nxv2i64 ZPR:$Zs),
+ (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))), (nxv2f16 ZPR:$Zd))),
+ (UCVTF_ZPmZ_StoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+
+ def : Pat<(nxv4f16 (AArch64ucvtf_mt (nxv4i1 PPR:$Pg),
+ (and (nxv4i32 ZPR:$Zs),
+ (nxv4i32 (AArch64dup (i32 0xFFFF)))), (nxv4f16 ZPR:$Zd))),
+ (UCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+
+ def : Pat<(nxv2f32 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg),
+ (and (nxv2i64 ZPR:$Zs),
+ (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))), (nxv2f32 ZPR:$Zd))),
+ (UCVTF_ZPmZ_StoS ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+
+ def : Pat<(nxv2f64 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg),
+ (and (nxv2i64 ZPR:$Zs),
+ (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))), (nxv2f64 ZPR:$Zd))),
+ (UCVTF_ZPmZ_StoD ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
+
+ defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn", AArch64frintn_mt>;
+ defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp", AArch64frintp_mt>;
+ defm FRINTM_ZPmZ : sve_fp_2op_p_zd_HSD<0b00010, "frintm", AArch64frintm_mt>;
+ defm FRINTZ_ZPmZ : sve_fp_2op_p_zd_HSD<0b00011, "frintz", AArch64frintz_mt>;
+ defm FRINTA_ZPmZ : sve_fp_2op_p_zd_HSD<0b00100, "frinta", AArch64frinta_mt>;
+ defm FRINTX_ZPmZ : sve_fp_2op_p_zd_HSD<0b00110, "frintx", AArch64frintx_mt>;
+ defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti", AArch64frinti_mt>;
+ defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx", AArch64frecpx_mt>;
+ defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", AArch64fsqrt_mt>;
let Predicates = [HasBF16, HasSVE] in {
defm BFDOT_ZZZ : sve_bfloat_dot<"bfdot", int_aarch64_sve_bfdot>;
@@ -1528,6 +1648,9 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
def : Pat<(vscale (sve_cntd_imm_neg i32:$imm)), (SUBXrs XZR, (CNTD_XPiI 31, $imm), 0)>;
}
+ def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))),
+ (ADDVL_XXI GPR64:$op, $imm)>;
+
// FIXME: BigEndian requires an additional REV instruction to satisfy the
// constraint that none of the bits change when stored to memory as one
// type, and reloaded as another type.
@@ -1581,15 +1704,6 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
- }
-
- let Predicates = [IsLE, HasBF16, HasSVE] in {
- def : Pat<(nxv2i64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
- def : Pat<(nxv8bf16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
- def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
- }
-
- let Predicates = [IsLE, HasSVE, HasBF16] in {
def : Pat<(nxv8bf16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
def : Pat<(nxv8bf16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
@@ -1607,6 +1721,7 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
def : Pat<(nxv2f64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
}
+ // These allow casting from/to unpacked predicate types.
def : Pat<(nxv16i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv16i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv16i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
@@ -1621,6 +1736,18 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
def : Pat<(nxv2i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv2i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ // These allow casting from/to unpacked floating-point types.
+ def : Pat<(nxv2f16 (reinterpret_cast (nxv8f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+ def : Pat<(nxv8f16 (reinterpret_cast (nxv2f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+ def : Pat<(nxv4f16 (reinterpret_cast (nxv8f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+ def : Pat<(nxv8f16 (reinterpret_cast (nxv4f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+ def : Pat<(nxv2f32 (reinterpret_cast (nxv4f32 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+ def : Pat<(nxv4f32 (reinterpret_cast (nxv2f32 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+ def : Pat<(nxv2bf16 (reinterpret_cast (nxv8bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+ def : Pat<(nxv8bf16 (reinterpret_cast (nxv2bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+ def : Pat<(nxv4bf16 (reinterpret_cast (nxv8bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+ def : Pat<(nxv8bf16 (reinterpret_cast (nxv4bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+
def : Pat<(nxv16i1 (and PPR:$Ps1, PPR:$Ps2)),
(AND_PPzPP (PTRUE_B 31), PPR:$Ps1, PPR:$Ps2)>;
def : Pat<(nxv8i1 (and PPR:$Ps1, PPR:$Ps2)),
@@ -1673,10 +1800,7 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H, LD1SB_H_IMM, am_sve_regreg_lsl0>;
defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
-
- let Predicates = [HasBF16, HasSVE] in {
- defm : pred_load<nxv8bf16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
- }
+ defm : pred_load<nxv8bf16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
// 16-element contiguous loads
defm : pred_load<nxv16i8, nxv16i1, nonext_masked_load, LD1B, LD1B_IMM, am_sve_regreg_lsl0>;
@@ -1714,13 +1838,10 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;
// 8-element contiguous stores
- defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H, ST1B_H_IMM, am_sve_regreg_lsl0>;
- defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
- defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
-
- let Predicates = [HasBF16, HasSVE] in {
- defm : pred_store<nxv8bf16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
- }
+ defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H, ST1B_H_IMM, am_sve_regreg_lsl0>;
+ defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv8bf16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
// 16-element contiguous stores
defm : pred_store<nxv16i8, nxv16i1, nontrunc_masked_store, ST1B, ST1B_IMM, am_sve_regreg_lsl0>;
@@ -1882,10 +2003,7 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
defm : ld1<LD1SB_H, LD1SB_H_IMM, nxv8i16, AArch64ld1s_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
defm : ld1<LD1H, LD1H_IMM, nxv8i16, AArch64ld1_z, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
defm : ld1<LD1H, LD1H_IMM, nxv8f16, AArch64ld1_z, nxv8i1, nxv8f16, am_sve_regreg_lsl1>;
-
- let Predicates = [HasBF16, HasSVE] in {
- defm : ld1<LD1H, LD1H_IMM, nxv8bf16, AArch64ld1_z, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>;
- }
+ defm : ld1<LD1H, LD1H_IMM, nxv8bf16, AArch64ld1_z, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>;
// 16-element contiguous loads
defm : ld1<LD1B, LD1B_IMM, nxv16i8, AArch64ld1_z, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
@@ -1925,10 +2043,7 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
defm : ldnf1<LDNF1SB_H_IMM, nxv8i16, AArch64ldnf1s_z, nxv8i1, nxv8i8>;
defm : ldnf1<LDNF1H_IMM, nxv8i16, AArch64ldnf1_z, nxv8i1, nxv8i16>;
defm : ldnf1<LDNF1H_IMM, nxv8f16, AArch64ldnf1_z, nxv8i1, nxv8f16>;
-
- let Predicates = [HasBF16, HasSVE] in {
- defm : ldnf1<LDNF1H_IMM, nxv8bf16, AArch64ldnf1_z, nxv8i1, nxv8bf16>;
- }
+ defm : ldnf1<LDNF1H_IMM, nxv8bf16, AArch64ldnf1_z, nxv8i1, nxv8bf16>;
// 16-element contiguous non-faulting loads
defm : ldnf1<LDNF1B_IMM, nxv16i8, AArch64ldnf1_z, nxv16i1, nxv16i8>;
@@ -1969,10 +2084,7 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
defm : ldff1<LDFF1SB_H, nxv8i16, AArch64ldff1s_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
defm : ldff1<LDFF1H, nxv8i16, AArch64ldff1_z, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
defm : ldff1<LDFF1H, nxv8f16, AArch64ldff1_z, nxv8i1, nxv8f16, am_sve_regreg_lsl1>;
-
- let Predicates = [HasBF16, HasSVE] in {
- defm : ldff1<LDFF1H, nxv8bf16, AArch64ldff1_z, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>;
- }
+ defm : ldff1<LDFF1H, nxv8bf16, AArch64ldff1_z, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>;
// 16-element contiguous first faulting loads
defm : ldff1<LDFF1B, nxv16i8, AArch64ldff1_z, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
@@ -2023,6 +2135,19 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
def : Pat<(nxv2i64 (vector_insert (nxv2i64 (undef)), (i64 FPR64:$src), 0)),
(INSERT_SUBREG (nxv2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+ def : Pat<(nxv8f16 (vector_insert (nxv8f16 (undef)), (f16 FPR16:$src), 0)),
+ (INSERT_SUBREG (nxv8f16 (IMPLICIT_DEF)), FPR16:$src, hsub)>;
+ def : Pat<(nxv4f16 (vector_insert (nxv4f16 (undef)), (f16 FPR16:$src), 0)),
+ (INSERT_SUBREG (nxv4f16 (IMPLICIT_DEF)), FPR16:$src, hsub)>;
+ def : Pat<(nxv2f16 (vector_insert (nxv2f16 (undef)), (f16 FPR16:$src), 0)),
+ (INSERT_SUBREG (nxv2f16 (IMPLICIT_DEF)), FPR16:$src, hsub)>;
+ def : Pat<(nxv4f32 (vector_insert (nxv4f32 (undef)), (f32 FPR32:$src), 0)),
+ (INSERT_SUBREG (nxv4f32 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
+ def : Pat<(nxv2f32 (vector_insert (nxv2f32 (undef)), (f32 FPR32:$src), 0)),
+ (INSERT_SUBREG (nxv2f32 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
+ def : Pat<(nxv2f64 (vector_insert (nxv2f64 (undef)), (f64 FPR64:$src), 0)),
+ (INSERT_SUBREG (nxv2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+
// Insert scalar into vector[0]
def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), (i32 GPR32:$src), 0)),
(CPY_ZPmR_B ZPR:$vec, (PTRUE_B 1), GPR32:$src)>;
@@ -2086,6 +2211,28 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
(DUP_ZR_D $index)),
$src)>;
+ // Extract element from vector with scalar index
+ def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), GPR64:$index)),
+ (LASTB_RPZ_B (WHILELS_PXX_B XZR, GPR64:$index), ZPR:$vec)>;
+ def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), GPR64:$index)),
+ (LASTB_RPZ_H (WHILELS_PXX_H XZR, GPR64:$index), ZPR:$vec)>;
+ def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), GPR64:$index)),
+ (LASTB_RPZ_S (WHILELS_PXX_S XZR, GPR64:$index), ZPR:$vec)>;
+ def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), GPR64:$index)),
+ (LASTB_RPZ_D (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>;
+ def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), GPR64:$index)),
+ (LASTB_VPZ_H (WHILELS_PXX_H XZR, GPR64:$index), ZPR:$vec)>;
+ def : Pat<(f16 (vector_extract (nxv4f16 ZPR:$vec), GPR64:$index)),
+ (LASTB_VPZ_H (WHILELS_PXX_S XZR, GPR64:$index), ZPR:$vec)>;
+ def : Pat<(f16 (vector_extract (nxv2f16 ZPR:$vec), GPR64:$index)),
+ (LASTB_VPZ_H (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>;
+ def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), GPR64:$index)),
+ (LASTB_VPZ_S (WHILELS_PXX_S XZR, GPR64:$index), ZPR:$vec)>;
+ def : Pat<(f32 (vector_extract (nxv2f32 ZPR:$vec), GPR64:$index)),
+ (LASTB_VPZ_S (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>;
+ def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), GPR64:$index)),
+ (LASTB_VPZ_D (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>;
+
// Extract element from vector with immediate index
def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)),
(EXTRACT_SUBREG (DUP_ZZI_B ZPR:$vec, sve_elm_idx_extdup_b:$index), ssub)>;
@@ -2097,34 +2244,54 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
(EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
(EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>;
+ def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>;
+ def : Pat<(f16 (vector_extract (nxv4f16 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), hsub)>;
+ def : Pat<(f16 (vector_extract (nxv2f16 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), hsub)>;
def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
(EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
+ def : Pat<(f32 (vector_extract (nxv2f32 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), ssub)>;
def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
(EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
- // Extract element from vector with scalar index
- def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), GPR64:$index)),
- (LASTB_RPZ_B (WHILELS_PXX_B XZR, GPR64:$index),
- ZPR:$vec)>;
- def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), GPR64:$index)),
- (LASTB_RPZ_H (WHILELS_PXX_H XZR, GPR64:$index),
- ZPR:$vec)>;
- def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), GPR64:$index)),
- (LASTB_RPZ_S (WHILELS_PXX_S XZR, GPR64:$index),
- ZPR:$vec)>;
- def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), GPR64:$index)),
- (LASTB_RPZ_D (WHILELS_PXX_D XZR, GPR64:$index),
- ZPR:$vec)>;
+ // Extract element from vector with immediate index that's within the bottom 128-bits.
+ let AddedComplexity = 1 in {
+ def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index)),
+ (i32 (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>;
+ def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index)),
+ (i32 (UMOVvi16 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index))>;
+ def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), VectorIndexS:$index)),
+ (i32 (UMOVvi32 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index))>;
+ def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), VectorIndexD:$index)),
+ (i64 (UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index))>;
+ }
- def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), GPR64:$index)),
- (LASTB_VPZ_H (WHILELS_PXX_H XZR, GPR64:$index),
- ZPR:$vec)>;
- def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), GPR64:$index)),
- (LASTB_VPZ_S (WHILELS_PXX_S XZR, GPR64:$index),
- ZPR:$vec)>;
- def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), GPR64:$index)),
- (LASTB_VPZ_D (WHILELS_PXX_D XZR, GPR64:$index),
- ZPR:$vec)>;
+ // Extract first element from vector.
+ let AddedComplexity = 2 in {
+ def : Pat<(vector_extract (nxv16i8 ZPR:$Zs), (i64 0)),
+ (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
+ def : Pat<(vector_extract (nxv8i16 ZPR:$Zs), (i64 0)),
+ (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
+ def : Pat<(vector_extract (nxv4i32 ZPR:$Zs), (i64 0)),
+ (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
+ def : Pat<(vector_extract (nxv2i64 ZPR:$Zs), (i64 0)),
+ (i64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>;
+ def : Pat<(vector_extract (nxv8f16 ZPR:$Zs), (i64 0)),
+ (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>;
+ def : Pat<(vector_extract (nxv4f16 ZPR:$Zs), (i64 0)),
+ (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>;
+ def : Pat<(vector_extract (nxv2f16 ZPR:$Zs), (i64 0)),
+ (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>;
+ def : Pat<(vector_extract (nxv4f32 ZPR:$Zs), (i64 0)),
+ (f32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
+ def : Pat<(vector_extract (nxv2f32 ZPR:$Zs), (i64 0)),
+ (f32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
+ def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)),
+ (f64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>;
+ }
}
let Predicates = [HasSVE, HasMatMulInt8] in {
@@ -2158,15 +2325,6 @@ let Predicates = [HasSVE, HasMatMulFP64] in {
defm TRN2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 1, "trn2", int_aarch64_sve_trn2q>;
}
-let Predicates = [HasSVE, HasMatMulFP64, HasBF16] in {
- def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_zip1q, nxv8bf16, nxv8bf16, ZIP1_ZZZ_Q>;
- def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_zip2q, nxv8bf16, nxv8bf16, ZIP2_ZZZ_Q>;
- def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_uzp1q, nxv8bf16, nxv8bf16, UZP1_ZZZ_Q>;
- def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_uzp2q, nxv8bf16, nxv8bf16, UZP2_ZZZ_Q>;
- def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_trn1q, nxv8bf16, nxv8bf16, TRN1_ZZZ_Q>;
- def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_trn2q, nxv8bf16, nxv8bf16, TRN2_ZZZ_Q>;
-}
-
let Predicates = [HasSVE2] in {
// SVE2 integer multiply-add (indexed)
defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla", int_aarch64_sve_mla_lane>;
@@ -2192,10 +2350,10 @@ let Predicates = [HasSVE2] in {
defm SQRDMULH_ZZZ : sve2_int_mul<0b101, "sqrdmulh", int_aarch64_sve_sqrdmulh>;
// SVE2 integer multiply vectors (unpredicated)
- defm MUL_ZZZ : sve2_int_mul<0b000, "mul", mul>;
+ defm MUL_ZZZ : sve2_int_mul<0b000, "mul", null_frag, AArch64mul_p>;
defm SMULH_ZZZ : sve2_int_mul<0b010, "smulh", null_frag>;
defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh", null_frag>;
- defm PMUL_ZZZ : sve2_int_mul_single<0b001, "pmul", int_aarch64_sve_pmul>;
+ defm PMUL_ZZZ : sve2_int_mul_single<0b001, "pmul", int_aarch64_sve_pmul>;
// Add patterns for unpredicated version of smulh and umulh.
def : Pat<(nxv16i8 (int_aarch64_sve_smulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)),
@@ -2214,6 +2372,7 @@ let Predicates = [HasSVE2] in {
(UMULH_ZZZ_S $Op1, $Op2)>;
def : Pat<(nxv2i64 (int_aarch64_sve_umulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)),
(UMULH_ZZZ_D $Op1, $Op2)>;
+
// SVE2 complex integer dot product (indexed)
defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot", int_aarch64_sve_cdot_lane>;
@@ -2335,11 +2494,11 @@ let Predicates = [HasSVE2] in {
}
// SVE2 predicated shifts
- defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl", "SQSHL_ZPZI">;
- defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl", "UQSHL_ZPZI">;
- defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr", "SRSHR_ZPZI", int_aarch64_sve_srshr>;
- defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr", "URSHR_ZPZI", int_aarch64_sve_urshr>;
- defm SQSHLU_ZPmI : sve2_int_bin_pred_shift_imm_left< 0b1111, "sqshlu", "SQSHLU_ZPZI", int_aarch64_sve_sqshlu>;
+ defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl", "SQSHL_ZPZI">;
+ defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl", "UQSHL_ZPZI">;
+ defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr", "SRSHR_ZPZI", int_aarch64_sve_srshr>;
+ defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr", "URSHR_ZPZI", int_aarch64_sve_urshr>;
+ defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu", "SQSHLU_ZPZI", int_aarch64_sve_sqshlu>;
// SVE2 integer add/subtract long
defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb", int_aarch64_sve_saddlb>;
@@ -2546,13 +2705,6 @@ let Predicates = [HasSVE2] in {
defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl", int_aarch64_sve_tbl2>;
defm TBX_ZZZ : sve2_int_perm_tbx<"tbx", int_aarch64_sve_tbx>;
- let Predicates = [HasSVE, HasBF16] in {
- def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_tbx, nxv8bf16, nxv8bf16, nxv8i16, TBX_ZZZ_H>;
- def : Pat<(nxv8bf16 (int_aarch64_sve_tbl2 nxv8bf16:$Op1, nxv8bf16:$Op2, nxv8i16:$Op3)),
- (nxv8bf16 (TBL_ZZZZ_H (REG_SEQUENCE ZPR2, nxv8bf16:$Op1, zsub0, nxv8bf16:$Op2, zsub1),
- nxv8i16:$Op3))>;
- }
-
// SVE2 integer compare scalar count and limit
defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege", int_aarch64_sve_whilege>;
defm WHILEGT_PWW : sve_int_while4_rr<0b001, "whilegt", int_aarch64_sve_whilegt>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA55.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA55.td
new file mode 100644
index 000000000000..50911fd22bc9
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA55.td
@@ -0,0 +1,339 @@
+//==- AArch64SchedA55.td - ARM Cortex-A55 Scheduling Definitions -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for the ARM Cortex-A55 processors.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the per-operand machine model.
+// This works with MachineScheduler. See MCSchedModel.h for details.
+
+// Cortex-A55 machine model for scheduling and other instruction cost heuristics.
+def CortexA55Model : SchedMachineModel {
+ let MicroOpBufferSize = 0; // The Cortex-A55 is an in-order processor
+ let IssueWidth = 2; // It dual-issues under most circumstances
+ let LoadLatency = 4; // Cycles for loads to access the cache. The
+ // optimisation guide shows that most loads have
+ // a latency of 3, but some have a latency of 4
+ // or 5. Setting it to 4 looked to be a good trade-off.
+ let MispredictPenalty = 8; // A branch direction mispredict.
+ let PostRAScheduler = 1; // Enable PostRA scheduler pass.
+ let CompleteModel = 0; // Covers instructions applicable to Cortex-A55.
+
+ list<Predicate> UnsupportedFeatures = [HasSVE];
+
+ // FIXME: Remove when all errors have been fixed.
+ let FullInstRWOverlapCheck = 0;
+}
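+
+// Note: a subtarget opts into this model via a ProcessorModel definition in
+// AArch64.td; illustratively (the real record and feature list live there):
+//   def : ProcessorModel<"cortex-a55", CortexA55Model, [ProcA55]>;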
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+
+// Model each pipeline as a ProcResource with BufferSize = 0, since the
+// Cortex-A55 is in-order.
+
+def CortexA55UnitALU : ProcResource<2> { let BufferSize = 0; } // Int ALU
+def CortexA55UnitMAC : ProcResource<1> { let BufferSize = 0; } // Int MAC, 64-bit wide
+def CortexA55UnitDiv : ProcResource<1> { let BufferSize = 0; } // Int Division, not pipelined
+def CortexA55UnitLd : ProcResource<1> { let BufferSize = 0; } // Load pipe
+def CortexA55UnitSt : ProcResource<1> { let BufferSize = 0; } // Store pipe
+def CortexA55UnitB : ProcResource<1> { let BufferSize = 0; } // Branch
+
+// The FP DIV/SQRT instructions execute totally differently from the FP ALU
+// instructions, which can mostly be dual-issued; that's why for now we model
+// them with 2 resources.
+def CortexA55UnitFPALU : ProcResource<2> { let BufferSize = 0; } // FP ALU
+def CortexA55UnitFPMAC : ProcResource<2> { let BufferSize = 0; } // FP MAC
+def CortexA55UnitFPDIV : ProcResource<1> { let BufferSize = 0; } // FP Div/SQRT, 64/128
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types
+
+let SchedModel = CortexA55Model in {
+
+// These latencies are modeled without taking into account forwarding paths
+// (the software optimisation guide lists latencies taking into account
+// typical forwarding paths).
+def : WriteRes<WriteImm, [CortexA55UnitALU]> { let Latency = 3; } // MOVN, MOVZ
+def : WriteRes<WriteI, [CortexA55UnitALU]> { let Latency = 3; } // ALU
+def : WriteRes<WriteISReg, [CortexA55UnitALU]> { let Latency = 3; } // ALU of Shifted-Reg
+def : WriteRes<WriteIEReg, [CortexA55UnitALU]> { let Latency = 3; } // ALU of Extended-Reg
+def : WriteRes<WriteExtr, [CortexA55UnitALU]> { let Latency = 3; } // EXTR from a reg pair
+def : WriteRes<WriteIS, [CortexA55UnitALU]> { let Latency = 3; } // Shift/Scale
+
+// MAC
+def : WriteRes<WriteIM32, [CortexA55UnitMAC]> { let Latency = 4; } // 32-bit Multiply
+def : WriteRes<WriteIM64, [CortexA55UnitMAC]> { let Latency = 4; } // 64-bit Multiply
+
+// Div
+def : WriteRes<WriteID32, [CortexA55UnitDiv]> {
+ let Latency = 8; let ResourceCycles = [8];
+}
+def : WriteRes<WriteID64, [CortexA55UnitDiv]> {
+ let Latency = 8; let ResourceCycles = [8];
+}
+
+// Load
+def : WriteRes<WriteLD, [CortexA55UnitLd]> { let Latency = 3; }
+def : WriteRes<WriteLDIdx, [CortexA55UnitLd]> { let Latency = 4; }
+def : WriteRes<WriteLDHi, [CortexA55UnitLd]> { let Latency = 5; }
+
+// Vector Load - Vector loads take 1-5 cycles to issue. For the WriteVLD
+// below, we choose the median of 3, which makes the latency 6.
+// An extra cycle is needed to get the swizzling right.
+def : WriteRes<WriteVLD, [CortexA55UnitLd]> { let Latency = 6;
+ let ResourceCycles = [3]; }
+def CortexA55WriteVLD1 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 4; }
+def CortexA55WriteVLD2 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 5;
+ let ResourceCycles = [2]; }
+def CortexA55WriteVLD3 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 6;
+ let ResourceCycles = [3]; }
+def CortexA55WriteVLD4 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 7;
+ let ResourceCycles = [4]; }
+def CortexA55WriteVLD5 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 8;
+ let ResourceCycles = [5]; }
+def CortexA55WriteVLD6 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 9;
+ let ResourceCycles = [6]; }
+def CortexA55WriteVLD7 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 10;
+ let ResourceCycles = [7]; }
+def CortexA55WriteVLD8 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 11;
+ let ResourceCycles = [8]; }
+
+// Pre/Post Indexing - Performed as part of address generation
+def : WriteRes<WriteAdr, []> { let Latency = 0; }
+
+// Store
+def : WriteRes<WriteST, [CortexA55UnitSt]> { let Latency = 4; }
+def : WriteRes<WriteSTP, [CortexA55UnitSt]> { let Latency = 4; }
+def : WriteRes<WriteSTIdx, [CortexA55UnitSt]> { let Latency = 4; }
+def : WriteRes<WriteSTX, [CortexA55UnitSt]> { let Latency = 4; }
+
+// Vector Store - Similar to vector loads, can take 1-3 cycles to issue.
+def : WriteRes<WriteVST, [CortexA55UnitSt]> { let Latency = 5;
+ let ResourceCycles = [2];}
+def CortexA55WriteVST1 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 4; }
+def CortexA55WriteVST2 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 5;
+ let ResourceCycles = [2]; }
+def CortexA55WriteVST3 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 6;
+ let ResourceCycles = [3]; }
+def CortexA55WriteVST4 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 5;
+ let ResourceCycles = [4]; }
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// Branch
+def : WriteRes<WriteBr, [CortexA55UnitB]>;
+def : WriteRes<WriteBrReg, [CortexA55UnitB]>;
+def : WriteRes<WriteSys, [CortexA55UnitB]>;
+def : WriteRes<WriteBarrier, [CortexA55UnitB]>;
+def : WriteRes<WriteHint, [CortexA55UnitB]>;
+
+// FP ALU
+// As the WriteF result is produced in F5 and can mostly be forwarded
+// to the consumer at F1, the effective latency is set to 4.
+def : WriteRes<WriteF, [CortexA55UnitFPALU]> { let Latency = 4; }
+def : WriteRes<WriteFCmp, [CortexA55UnitFPALU]> { let Latency = 3; }
+def : WriteRes<WriteFCvt, [CortexA55UnitFPALU]> { let Latency = 4; }
+def : WriteRes<WriteFCopy, [CortexA55UnitFPALU]> { let Latency = 3; }
+def : WriteRes<WriteFImm, [CortexA55UnitFPALU]> { let Latency = 3; }
+def : WriteRes<WriteV, [CortexA55UnitFPALU]> { let Latency = 4; }
+
+// FP ALU-specific new SchedWrite definitions
+def CortexA55WriteFPALU_F3 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 3;}
+def CortexA55WriteFPALU_F4 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 4;}
+def CortexA55WriteFPALU_F5 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 5;}
+
+// FP Mul, Div, Sqrt. Div/Sqrt are not pipelined
+def : WriteRes<WriteFMul, [CortexA55UnitFPMAC]> { let Latency = 4; }
+def : WriteRes<WriteFDiv, [CortexA55UnitFPDIV]> { let Latency = 22;
+ let ResourceCycles = [29]; }
+def CortexA55WriteFMAC : SchedWriteRes<[CortexA55UnitFPMAC]> { let Latency = 4; }
+def CortexA55WriteFDivHP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 8;
+ let ResourceCycles = [5]; }
+def CortexA55WriteFDivSP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 13;
+ let ResourceCycles = [10]; }
+def CortexA55WriteFDivDP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 22;
+ let ResourceCycles = [19]; }
+def CortexA55WriteFSqrtHP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 8;
+ let ResourceCycles = [5]; }
+def CortexA55WriteFSqrtSP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 12;
+ let ResourceCycles = [9]; }
+def CortexA55WriteFSqrtDP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 22;
+ let ResourceCycles = [19]; }
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedRead types.
+
+def : ReadAdvance<ReadVLD, 0>;
+def : ReadAdvance<ReadExtrHi, 1>;
+def : ReadAdvance<ReadAdrBase, 1>;
+
+// ALU - ALU input operands are generally needed in EX1. An operand produced
+// in, say, EX2 can be forwarded to the ALU in EX1, thereby allowing
+// back-to-back ALU operations such as add. If an operand requires a shift,
+// however, it is required already in the ISS stage.
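+// For example (illustrative): WriteI has a latency of 3 and ReadI below is
+// advanced by 2, so a plain ALU consumer sees an effective latency of
+// 3 - 2 = 1 cycle, which is what allows the back-to-back adds mentioned above.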
+def : ReadAdvance<ReadI, 2, [WriteImm,WriteI,
+ WriteISReg, WriteIEReg,WriteIS,
+ WriteID32,WriteID64,
+ WriteIM32,WriteIM64]>;
+// Shifted operand
+def CortexA55ReadShifted : SchedReadAdvance<1, [WriteImm,WriteI,
+ WriteISReg, WriteIEReg,WriteIS,
+ WriteID32,WriteID64,
+ WriteIM32,WriteIM64]>;
+def CortexA55ReadNotShifted : SchedReadAdvance<2, [WriteImm,WriteI,
+ WriteISReg, WriteIEReg,WriteIS,
+ WriteID32,WriteID64,
+ WriteIM32,WriteIM64]>;
+def CortexA55ReadISReg : SchedReadVariant<[
+ SchedVar<RegShiftedPred, [CortexA55ReadShifted]>,
+ SchedVar<NoSchedPred, [CortexA55ReadNotShifted]>]>;
+def : SchedAlias<ReadISReg, CortexA55ReadISReg>;
+
+def CortexA55ReadIEReg : SchedReadVariant<[
+ SchedVar<RegExtendedPred, [CortexA55ReadShifted]>,
+ SchedVar<NoSchedPred, [CortexA55ReadNotShifted]>]>;
+def : SchedAlias<ReadIEReg, CortexA55ReadIEReg>;
+
+// MUL
+def : ReadAdvance<ReadIM, 1, [WriteImm,WriteI,
+ WriteISReg, WriteIEReg,WriteIS,
+ WriteID32,WriteID64,
+ WriteIM32,WriteIM64]>;
+def : ReadAdvance<ReadIMA, 2, [WriteImm,WriteI,
+ WriteISReg, WriteIEReg,WriteIS,
+ WriteID32,WriteID64,
+ WriteIM32,WriteIM64]>;
+
+// Div
+def : ReadAdvance<ReadID, 1, [WriteImm,WriteI,
+ WriteISReg, WriteIEReg,WriteIS,
+ WriteID32,WriteID64,
+ WriteIM32,WriteIM64]>;
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific InstRWs.
+
+//---
+// Miscellaneous
+//---
+def : InstRW<[CortexA55WriteVLD2,CortexA55WriteVLD1], (instregex "LDP.*")>;
+def : InstRW<[WriteI], (instrs COPY)>;
+//---
+// Vector Loads - 64-bit per cycle
+//---
+// 1-element structures
+def : InstRW<[CortexA55WriteVLD1], (instregex "LD1i(8|16|32|64)$")>; // single element
+def : InstRW<[CortexA55WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // replicate
+def : InstRW<[CortexA55WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[CortexA55WriteVLD2], (instregex "LD1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA55WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d)$")>; // multiple structures
+def : InstRW<[CortexA55WriteVLD4], (instregex "LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA55WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[CortexA55WriteVLD6], (instregex "LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA55WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[CortexA55WriteVLD8], (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
+
+def : InstRW<[CortexA55WriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
+def : InstRW<[CortexA55WriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[CortexA55WriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[CortexA55WriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[CortexA55WriteVLD6, WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[CortexA55WriteVLD8, WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
+
+// 2-element structures
+def : InstRW<[CortexA55WriteVLD2], (instregex "LD2i(8|16|32|64)$")>;
+def : InstRW<[CortexA55WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA55WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[CortexA55WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+
+def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>;
+def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>;
+
+// 3-element structures
+def : InstRW<[CortexA55WriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
+def : InstRW<[CortexA55WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA55WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[CortexA55WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)$")>;
+
+def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[CortexA55WriteVLD3, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[CortexA55WriteVLD6, WriteAdr], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;
+
+// 4-element structures
+def : InstRW<[CortexA55WriteVLD2], (instregex "LD4i(8|16|32|64)$")>; // load single 4-el structure to one lane of 4 regs.
+def : InstRW<[CortexA55WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // load single 4-el structure, replicate to all lanes of 4 regs.
+def : InstRW<[CortexA55WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)$")>; // load multiple 4-el structures to 4 regs.
+def : InstRW<[CortexA55WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
+
+def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>;
+def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[CortexA55WriteVLD8, WriteAdr], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
+
+//---
+// Vector Stores
+//---
+def : InstRW<[CortexA55WriteVST1], (instregex "ST1i(8|16|32|64)$")>;
+def : InstRW<[CortexA55WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA55WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA55WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA55WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA55WriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[CortexA55WriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[CortexA55WriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[CortexA55WriteVST2], (instregex "ST2i(8|16|32|64)$")>;
+def : InstRW<[CortexA55WriteVST2], (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[CortexA55WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[CortexA55WriteVST2], (instregex "ST3i(8|16|32|64)$")>;
+def : InstRW<[CortexA55WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|2d|16b|8h|4s|4d)_POST$")>;
+
+def : InstRW<[CortexA55WriteVST2], (instregex "ST4i(8|16|32|64)$")>;
+def : InstRW<[CortexA55WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>;
+def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+//---
+// Floating Point Conversions, MAC, DIV, SQRT
+//---
+def : InstRW<[CortexA55WriteFPALU_F3], (instregex "^FCVT[ALMNPZ][SU](S|U)?(W|X)")>;
+def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^FCVT(X)?[ALMNPXZ](S|U|N)?v")>;
+
+def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^(S|U)CVTF(S|U)(W|X)(H|S|D)")>;
+def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^(S|U)CVTF(h|s|d)")>;
+def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^(S|U)CVTFv")>;
+
+def : InstRW<[CortexA55WriteFMAC], (instregex "^FN?M(ADD|SUB).*")>;
+def : InstRW<[CortexA55WriteFMAC], (instregex "^FML(A|S).*")>;
+def : InstRW<[CortexA55WriteFDivHP], (instrs FDIVHrr)>;
+def : InstRW<[CortexA55WriteFDivSP], (instrs FDIVSrr)>;
+def : InstRW<[CortexA55WriteFDivDP], (instrs FDIVDrr)>;
+def : InstRW<[CortexA55WriteFDivHP], (instregex "^FDIVv.*16$")>;
+def : InstRW<[CortexA55WriteFDivSP], (instregex "^FDIVv.*32$")>;
+def : InstRW<[CortexA55WriteFDivDP], (instregex "^FDIVv.*64$")>;
+def : InstRW<[CortexA55WriteFSqrtHP], (instregex "^.*SQRT.*16$")>;
+def : InstRW<[CortexA55WriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
+def : InstRW<[CortexA55WriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA57.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA57.td
index 7c40da05c305..aa5bec8088e4 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA57.td
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA57.td
@@ -93,7 +93,7 @@ def : SchedAlias<WriteFCmp, A57Write_3cyc_1V>;
def : SchedAlias<WriteFCvt, A57Write_5cyc_1V>;
def : SchedAlias<WriteFCopy, A57Write_5cyc_1L>;
def : SchedAlias<WriteFImm, A57Write_3cyc_1V>;
-def : SchedAlias<WriteFMul, A57Write_5cyc_1V>;
+def : WriteRes<WriteFMul, [A57UnitV]> { let Latency = 5;}
def : SchedAlias<WriteFDiv, A57Write_17cyc_1W>;
def : SchedAlias<WriteV, A57Write_3cyc_1V>;
def : SchedAlias<WriteVLD, A57Write_5cyc_1L>;
@@ -350,12 +350,16 @@ def : InstRW<[A57Write_8cyc_8S, WriteAdr], (instregex "ST4Fourv(2d)_POST$")
// D form - v8i8_v8i16, v4i16_v4i32, v2i32_v2i64
// Q form - v16i8_v8i16, v8i16_v4i32, v4i32_v2i64
+// Cortex A57 Software Optimization Guide Sec 3.14
+// Advance for absolute diff accum, pairwise add and accumulate, shift accumulate
+def A57ReadIVA3 : SchedReadAdvance<3, [A57Write_4cyc_1X_NonMul_Forward, A57Write_5cyc_2X_NonMul_Forward]>;
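+// Illustrative effect: the accumulator operand of a D-form [SU]ABA is read 3
+// cycles late, so against the 4-cycle NonMul_Forward write the accumulation
+// chain behaves as if it had a latency of 1 cycle.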
+
// ASIMD absolute diff accum, D-form
-def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>;
+def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>;
// ASIMD absolute diff accum, Q-form
-def : InstRW<[A57Write_5cyc_2X], (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>;
+def : InstRW<[A57Write_5cyc_2X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>;
// ASIMD absolute diff accum long
-def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABAL")>;
+def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABAL")>;
// ASIMD arith, reduce, 4H/4S
def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>;
@@ -372,32 +376,41 @@ def : InstRW<[A57Write_7cyc_1V_1X], (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>
def : InstRW<[A57Write_8cyc_2X], (instregex "^[SU](MIN|MAX)Vv16i8v$")>;
// ASIMD multiply, D-form
-def : InstRW<[A57Write_5cyc_1W], (instregex "^(P?MUL|SQR?DMULH)(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>;
+// MUL
+def : InstRW<[A57Write_5cyc_1W_Mul_Forward], (instregex "^MUL(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>;
+// PMUL, SQDMULH, SQRDMULH
+def : InstRW<[A57Write_5cyc_1W], (instregex "^(PMUL|SQR?DMULH)(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>;
+
// ASIMD multiply, Q-form
-def : InstRW<[A57Write_6cyc_2W], (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+// MUL
+def : InstRW<[A57Write_6cyc_2W_Mul_Forward], (instregex "^MUL(v16i8|v8i16|v4i32)(_indexed)?$")>;
+// PMUL, SQDMULH, SQRDMULH
+def : InstRW<[A57Write_6cyc_2W], (instregex "^(PMUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+
+// Cortex A57 Software Optimization Guide Sec 3.14
+def A57ReadIVMA4 : SchedReadAdvance<4 , [A57Write_5cyc_1W_Mul_Forward, A57Write_6cyc_2W_Mul_Forward]>;
+def A57ReadIVMA3 : SchedReadAdvance<3 , [A57Write_5cyc_1W_Mul_Forward, A57Write_6cyc_2W_Mul_Forward]>;
// ASIMD multiply accumulate, D-form
-def : InstRW<[A57Write_5cyc_1W], (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>;
+def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA4], (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>;
// ASIMD multiply accumulate, Q-form
-def : InstRW<[A57Write_6cyc_2W], (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>;
+def : InstRW<[A57Write_6cyc_2W_Mul_Forward, A57ReadIVMA4], (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>;
// ASIMD multiply accumulate long
// ASIMD multiply accumulate saturating long
-def A57WriteIVMA : SchedWriteRes<[A57UnitW]> { let Latency = 5; }
-def A57ReadIVMA4 : SchedReadAdvance<4, [A57WriteIVMA]>;
-def : InstRW<[A57WriteIVMA, A57ReadIVMA4], (instregex "^(S|U|SQD)ML[AS]L")>;
+def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA4], (instregex "^(S|U)ML[AS]L")>;
+def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA3], (instregex "^SQDML[AS]L")>;
// ASIMD multiply long
-def : InstRW<[A57Write_5cyc_1W], (instregex "^(S|U|SQD)MULL")>;
+def : InstRW<[A57Write_5cyc_1W_Mul_Forward], (instregex "^(S|U)MULL")>;
+def : InstRW<[A57Write_5cyc_1W], (instregex "^SQDMULL")>;
def : InstRW<[A57Write_5cyc_1W], (instregex "^PMULL(v8i8|v16i8)")>;
def : InstRW<[A57Write_3cyc_1W], (instregex "^PMULL(v1i64|v2i64)")>;
// ASIMD pairwise add and accumulate
// ASIMD shift accumulate
-def A57WriteIVA : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
-def A57ReadIVA3 : SchedReadAdvance<3, [A57WriteIVA]>;
-def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^[SU]ADALP")>;
-def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^(S|SR|U|UR)SRA")>;
+def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ADALP")>;
+def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^(S|SR|U|UR)SRA")>;
// ASIMD shift by immed, complex
def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?(Q|R){1,2}SHR")>;
@@ -474,17 +487,22 @@ def : InstRW<[A57Write_9cyc_3V], (instregex "^(FMAX|FMIN)(NM)?P(v4f32|v2f64|v2i6
def : InstRW<[A57Write_10cyc_3V], (instregex "^(FMAX|FMIN)(NM)?Vv")>;
// ASIMD FP multiply, D-form, FZ
-def : InstRW<[A57Write_5cyc_1V], (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>;
+def : InstRW<[A57Write_5cyc_1V_FP_Forward], (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>;
// ASIMD FP multiply, Q-form, FZ
-def : InstRW<[A57Write_5cyc_2V], (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>;
+def : InstRW<[A57Write_5cyc_2V_FP_Forward], (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>;
// ASIMD FP multiply accumulate, D-form, FZ
// ASIMD FP multiply accumulate, Q-form, FZ
def A57WriteFPVMAD : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
def A57WriteFPVMAQ : SchedWriteRes<[A57UnitV, A57UnitV]> { let Latency = 10; }
-def A57ReadFPVMA5 : SchedReadAdvance<5, [A57WriteFPVMAD, A57WriteFPVMAQ]>;
+
+// Cortex A57 Software Optimization Guide Sec 3.15
+// Advances from FP mul and mul-accum to mul-accum
+def A57ReadFPVMA5 : SchedReadAdvance<5, [A57WriteFPVMAD, A57WriteFPVMAQ, A57Write_5cyc_1V_FP_Forward, A57Write_5cyc_2V_FP_Forward]>;
+def A57ReadFPVMA6 : SchedReadAdvance<6, [A57WriteFPVMAD, A57WriteFPVMAQ, A57Write_5cyc_1V_FP_Forward, A57Write_5cyc_2V_FP_Forward]>;
+
def : InstRW<[A57WriteFPVMAD, A57ReadFPVMA5], (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>;
-def : InstRW<[A57WriteFPVMAQ, A57ReadFPVMA5], (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>;
+def : InstRW<[A57WriteFPVMAQ, A57ReadFPVMA6], (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>;
// ASIMD FP round, D-form
def : InstRW<[A57Write_5cyc_1V], (instregex "^FRINT[AIMNPXZ](v2f32)")>;
@@ -547,8 +565,9 @@ def : InstRW<[A57Write_6cyc_3V], (instregex "^(UZP|ZIP)(1|2)(v16i8|v8i16|v4i32|v
def : InstRW<[A57Write_5cyc_1V], (instregex "^F(ADD|SUB)[DS]rr")>;
+// Cortex A57 Software Optimization Guide Sec 3.10
def A57WriteFPMA : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
-def A57ReadFPMA5 : SchedReadAdvance<5, [A57WriteFPMA]>;
+def A57ReadFPMA5 : SchedReadAdvance<5, [A57WriteFPMA, WriteFMul]>;
def A57ReadFPM : SchedReadAdvance<0>;
def : InstRW<[A57WriteFPMA, A57ReadFPM, A57ReadFPM, A57ReadFPMA5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td
index 987ed3c4ebfb..a4c090d439db 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td
@@ -13,6 +13,10 @@
// Prefix: A57Write
// Latency: #cyc
// MicroOp Count/Types: #(B|I|M|L|S|X|W|V)
+// Postfix (optional): (XYZ)_Forward
+//
+// The postfix is added to differentiate SchedWriteRes that are used in
+// subsequent SchedReadAdvances.
//
// e.g. A57Write_6cyc_1I_6S_4V means the total latency is 6 and there are
// 11 micro-ops to be issued down one I pipe, six S pipes and four V pipes.
@@ -25,7 +29,9 @@
def A57Write_5cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 5; }
def A57Write_5cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 5; }
def A57Write_5cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
+def A57Write_5cyc_1V_FP_Forward : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
def A57Write_5cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 5; }
+def A57Write_5cyc_1W_Mul_Forward : SchedWriteRes<[A57UnitW]> { let Latency = 5; }
def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; }
def A57Write_17cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 17;
let ResourceCycles = [17]; }
@@ -45,6 +51,7 @@ def A57Write_3cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 3; }
def A57Write_3cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 3; }
def A57Write_4cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 4; }
def A57Write_4cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
+def A57Write_4cyc_1X_NonMul_Forward : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
def A57Write_9cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
def A57Write_6cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 6; }
def A57Write_6cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 6; }
@@ -93,6 +100,10 @@ def A57Write_6cyc_2W : SchedWriteRes<[A57UnitW, A57UnitW]> {
let Latency = 6;
let NumMicroOps = 2;
}
+def A57Write_6cyc_2W_Mul_Forward : SchedWriteRes<[A57UnitW, A57UnitW]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
def A57Write_5cyc_1I_1L : SchedWriteRes<[A57UnitI,
A57UnitL]> {
let Latency = 5;
@@ -102,10 +113,18 @@ def A57Write_5cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
let Latency = 5;
let NumMicroOps = 2;
}
+def A57Write_5cyc_2V_FP_Forward : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
def A57Write_5cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
let Latency = 5;
let NumMicroOps = 2;
}
+def A57Write_5cyc_2X_NonMul_Forward : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
def A57Write_10cyc_1L_1V : SchedWriteRes<[A57UnitL,
A57UnitV]> {
let Latency = 10;
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA64FX.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA64FX.td
new file mode 100644
index 000000000000..b6741d418ef0
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedA64FX.td
@@ -0,0 +1,3890 @@
+//=- AArch64SchedA64FX.td - Fujitsu A64FX Scheduling Defs -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the scheduling model for the Fujitsu A64FX processors.
+//
+//===----------------------------------------------------------------------===//
+
+def A64FXModel : SchedMachineModel {
+ let IssueWidth = 6; // 6 micro-ops dispatched at a time.
+ let MicroOpBufferSize = 180; // 180 entries in micro-op re-order buffer.
+ let LoadLatency = 5; // Optimistic load latency.
+ let MispredictPenalty = 12; // Extra cycles for mispredicted branch.
+ // Determined via a mix of micro-arch details and experimentation.
+ let LoopMicroOpBufferSize = 128;
+ let PostRAScheduler = 1; // Using PostRA sched.
+ let CompleteModel = 1;
+
+ list<Predicate> UnsupportedFeatures =
+ [HasSVE2, HasSVE2AES, HasSVE2SM4, HasSVE2SHA3, HasSVE2BitPerm, HasPAuth];
+
+ let FullInstRWOverlapCheck = 0;
+}
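A SchedMachineModel only takes effect once a processor definition selects it. That hook-up is not part of this file; a minimal sketch of how it is typically done in AArch64.td follows, where the feature list is a placeholder (FeatureSVE stands in for the real A64FX set):

    // Sketch only; the real processor definition lives in AArch64.td and
    // carries the full A64FX feature list rather than this placeholder.
    def : ProcessorModel<"a64fx", A64FXModel, [FeatureSVE]>;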
+
+let SchedModel = A64FXModel in {
+
+// Define the issue ports.
+
+// A64FXIP*
+
+// Port 0
+def A64FXIPFLA : ProcResource<1>;
+
+// Port 1
+def A64FXIPPR : ProcResource<1>;
+
+// Port 2
+def A64FXIPEXA : ProcResource<1>;
+
+// Port 3
+def A64FXIPFLB : ProcResource<1>;
+
+// Port 4
+def A64FXIPEXB : ProcResource<1>;
+
+// Port 5
+def A64FXIPEAGA : ProcResource<1>;
+
+// Port 6
+def A64FXIPEAGB : ProcResource<1>;
+
+// Port 7
+def A64FXIPBR : ProcResource<1>;
+
+// Define groups for the functional units on each issue port. Each group
+// created will be used by a WriteRes later on.
+
+def A64FXGI7 : ProcResGroup<[A64FXIPBR]>;
+
+def A64FXGI0 : ProcResGroup<[A64FXIPFLA]>;
+
+def A64FXGI1 : ProcResGroup<[A64FXIPPR]>;
+
+def A64FXGI2 : ProcResGroup<[A64FXIPEXA]>;
+
+def A64FXGI3 : ProcResGroup<[A64FXIPFLB]>;
+
+def A64FXGI4 : ProcResGroup<[A64FXIPEXB]>;
+
+def A64FXGI5 : ProcResGroup<[A64FXIPEAGA]>;
+
+def A64FXGI6 : ProcResGroup<[A64FXIPEAGB]>;
+
+def A64FXGI03 : ProcResGroup<[A64FXIPFLA, A64FXIPFLB]>;
+
+def A64FXGI01 : ProcResGroup<[A64FXIPFLA, A64FXIPPR]>;
+
+def A64FXGI02 : ProcResGroup<[A64FXIPFLA, A64FXIPEXA]>;
+
+def A64FXGI12 : ProcResGroup<[A64FXIPEXA, A64FXIPPR]>;
+
+def A64FXGI15 : ProcResGroup<[A64FXIPEAGA, A64FXIPPR]>;
+
+def A64FXGI05 : ProcResGroup<[A64FXIPFLA, A64FXIPEAGA]>;
+
+def A64FXGI24 : ProcResGroup<[A64FXIPEXA, A64FXIPEXB]>;
+
+def A64FXGI124 : ProcResGroup<[A64FXIPEXA, A64FXIPEXB, A64FXIPPR]>;
+
+def A64FXGI056 : ProcResGroup<[A64FXIPFLA, A64FXIPEAGA, A64FXIPEAGB]>;
+
+def A64FXGI0256 : ProcResGroup<[A64FXIPFLA, A64FXIPEXA, A64FXIPEAGA, A64FXIPEAGB]>;
+
+def A64FXGI56 : ProcResGroup<[A64FXIPEAGA, A64FXIPEAGB]>;
+
+def A64FXGI2456 : ProcResGroup<[A64FXIPEXA, A64FXIPEXB, A64FXIPEAGA, A64FXIPEAGB]>;
+
+def A64FXAny : ProcResGroup<[A64FXIPFLA, A64FXIPPR, A64FXIPEXA, A64FXIPFLB,
+ A64FXIPEXB, A64FXIPEAGA, A64FXIPEAGB, A64FXIPBR]> {
+ let BufferSize = 60;
+}
+
+def A64FXWrite_6Cyc : SchedWriteRes<[]> {
+ let Latency = 6;
+}
+
+def A64FXWrite_1Cyc_GI7 : SchedWriteRes<[A64FXGI7]> {
+ let Latency = 1;
+}
+
+def A64FXWrite_2Cyc_GI0 : SchedWriteRes<[A64FXGI0]> {
+ let Latency = 2;
+}
+
+def A64FXWrite_4Cyc_GI0 : SchedWriteRes<[A64FXGI0]> {
+ let Latency = 4;
+}
+
+def A64FXWrite_5Cyc_GI0 : SchedWriteRes<[A64FXGI0]> {
+ let Latency = 5;
+}
+
+def A64FXWrite_6Cyc_GI0 : SchedWriteRes<[A64FXGI0]> {
+ let Latency = 6;
+}
+
+def A64FXWrite_8Cyc_GI0 : SchedWriteRes<[A64FXGI0]> {
+ let Latency = 8;
+}
+
+def A64FXWrite_9Cyc_GI0 : SchedWriteRes<[A64FXGI0]> {
+ let Latency = 9;
+}
+
+def A64FXWrite_13Cyc_GI0 : SchedWriteRes<[A64FXGI0]> {
+ let Latency = 13;
+}
+
+def A64FXWrite_37Cyc_GI0 : SchedWriteRes<[A64FXGI0]> {
+ let Latency = 37;
+}
+
+def A64FXWrite_98Cyc_GI0 : SchedWriteRes<[A64FXGI0]> {
+ let Latency = 98;
+}
+
+def A64FXWrite_134Cyc_GI0 : SchedWriteRes<[A64FXGI0]> {
+ let Latency = 134;
+}
+
+def A64FXWrite_154Cyc_GI0 : SchedWriteRes<[A64FXGI0]> {
+ let Latency = 154;
+}
+
+def A64FXWrite_4Cyc_GI01 : SchedWriteRes<[A64FXGI01]> {
+ let Latency = 4;
+}
+
+def A64FXWrite_6Cyc_GI01 : SchedWriteRes<[A64FXGI01]> {
+ let Latency = 6;
+}
+
+def A64FXWrite_8Cyc_GI01 : SchedWriteRes<[A64FXGI01]> {
+ let Latency = 8;
+}
+
+def A64FXWrite_12Cyc_GI01 : SchedWriteRes<[A64FXGI01]> {
+ let Latency = 12;
+}
+
+def A64FXWrite_10Cyc_GI02 : SchedWriteRes<[A64FXGI02]> {
+ let Latency = 10;
+}
+
+def A64FXWrite_17Cyc_GI02 : SchedWriteRes<[A64FXGI02]> {
+ let Latency = 17;
+}
+
+def A64FXWrite_21Cyc_GI02 : SchedWriteRes<[A64FXGI02]> {
+ let Latency = 21;
+}
+
+def A64FXWrite_3Cyc_GI1 : SchedWriteRes<[A64FXGI1]> {
+ let Latency = 3;
+}
+
+def A64FXWrite_6Cyc_NGI1 : SchedWriteRes<[A64FXGI1]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def A64FXWrite_4Cyc_GI12 : SchedWriteRes<[A64FXGI12]> {
+ let Latency = 4;
+}
+
+def A64FXWrite_3Cyc_GI2 : SchedWriteRes<[A64FXGI2]> {
+ let Latency = 3;
+}
+
+def A64FXWrite_5Cyc_GI2 : SchedWriteRes<[A64FXGI2]> {
+ let Latency = 5;
+}
+
+def A64FXWrite_6Cyc_GI2 : SchedWriteRes<[A64FXGI2]> {
+ let Latency = 6;
+}
+
+def A64FXWrite_4Cyc_GI3 : SchedWriteRes<[A64FXGI3]> {
+ let Latency = 4;
+}
+
+def A64FXWrite_6Cyc_GI3 : SchedWriteRes<[A64FXGI3]> {
+ let Latency = 6;
+}
+
+def A64FXWrite_6Cyc_GI15 : SchedWriteRes<[A64FXGI15]> {
+ let Latency = 6;
+}
+
+def A64FXWrite_3Cyc_GI03 : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 3;
+}
+
+def A64FXWrite_4Cyc_GI03 : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 4;
+}
+
+def A64FXWrite_6Cyc_GI03 : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 6;
+}
+
+def A64FXWrite_8Cyc_GI03 : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 8;
+}
+
+def A64FXWrite_9Cyc_GI03 : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 9;
+}
+
+def A64FXWrite_10Cyc_GI03 : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 10;
+}
+
+def A64FXWrite_12Cyc_GI03 : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 12;
+}
+
+def A64FXWrite_14Cyc_GI03 : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 14;
+}
+
+def A64FXWrite_15Cyc_GI03 : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 15;
+}
+
+def A64FXWrite_15Cyc_NGI03 : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 15;
+ let NumMicroOps = 2;
+}
+
+def A64FXWrite_18Cyc_GI03 : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 18;
+}
+
+def A64FXWrite_45Cyc_GI03 : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 45;
+}
+
+def A64FXWrite_60Cyc_GI03 : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 60;
+}
+
+def A64FXWrite_75Cyc_GI03 : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 75;
+}
+
+def A64FXWrite_6Cyc_GI05 : SchedWriteRes<[A64FXGI05]> {
+ let Latency = 6;
+}
+
+def A64FXWrite_10Cyc_GI4 : SchedWriteRes<[A64FXGI4]> {
+ let Latency = 10;
+}
+
+def A64FXWrite_12Cyc_GI4 : SchedWriteRes<[A64FXGI4]> {
+ let Latency = 12;
+}
+
+def A64FXWrite_20Cyc_GI4 : SchedWriteRes<[A64FXGI4]> {
+ let Latency = 20;
+}
+
+def A64FXWrite_5Cyc_GI5 : SchedWriteRes<[A64FXGI5]> {
+ let Latency = 5;
+}
+
+def A64FXWrite_11Cyc_GI5 : SchedWriteRes<[A64FXGI5]> {
+ let Latency = 11;
+}
+
+def A64FXWrite_5Cyc_GI6 : SchedWriteRes<[A64FXGI6]> {
+ let Latency = 5;
+}
+
+def A64FXWrite_1Cyc_GI24 : SchedWriteRes<[A64FXGI24]> {
+ let Latency = 1;
+}
+
+def A64FXWrite_2Cyc_GI24 : SchedWriteRes<[A64FXGI24]> {
+ let Latency = 2;
+}
+
+def A64FXWrite_4Cyc_NGI24 : SchedWriteRes<[A64FXGI24]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+
+def A64FXWrite_6Cyc_GI124 : SchedWriteRes<[A64FXGI124]> {
+ let Latency = 6;
+}
+
+def A64FXWrite_8Cyc_GI124 : SchedWriteRes<[A64FXGI124]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+def A64FXWrite_6Cyc_GI56 : SchedWriteRes<[A64FXGI56]> {
+ let Latency = 0;
+}
+
+def A64FXWrite_1Cyc_GI56 : SchedWriteRes<[A64FXGI56]> {
+ let Latency = 1;
+}
+
+def A64FXWrite_5Cyc_GI56 : SchedWriteRes<[A64FXGI56]> {
+ let Latency = 5;
+}
+
+def A64FXWrite_8Cyc_GI56 : SchedWriteRes<[A64FXGI56]> {
+ let Latency = 8;
+}
+
+def A64FXWrite_11Cyc_GI56 : SchedWriteRes<[A64FXGI56]> {
+ let Latency = 11;
+}
+
+def A64FXWrite_44Cyc_GI56 : SchedWriteRes<[A64FXGI56]> {
+ let Latency = 44;
+}
+
+def A64FXWrite_10Cyc_GI056 : SchedWriteRes<[A64FXGI056]> {
+ let Latency = 10;
+}
+
+def A64FXWrite_15Cyc_GI056 : SchedWriteRes<[A64FXGI056]> {
+ let Latency = 15;
+}
+
+def A64FXWrite_19Cyc_GI056 : SchedWriteRes<[A64FXGI056]> {
+ let Latency = 19;
+}
+
+def A64FXWrite_25Cyc_GI056 : SchedWriteRes<[A64FXGI056]> {
+ let Latency = 25;
+}
+
+def A64FXWrite_14Cyc_GI0256 : SchedWriteRes<[A64FXGI0256]> {
+ let Latency = 14;
+}
+
+def A64FXWrite_19Cyc_GI0256 : SchedWriteRes<[A64FXGI0256]> {
+ let Latency = 19;
+}
+
+def A64FXWrite_29Cyc_GI0256 : SchedWriteRes<[A64FXGI0256]> {
+ let Latency = 29;
+}
+
+def A64FXWrite_LDNP: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def A64FXWrite_LDP01: SchedWriteRes<[A64FXGI2456]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+def A64FXWrite_LDR01: SchedWriteRes<[A64FXGI2456]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def A64FXWrite_LD102: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+def A64FXWrite_LD103: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+}
+
+def A64FXWrite_LD104: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+def A64FXWrite_LD105: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+}
+
+def A64FXWrite_LD106: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+
+def A64FXWrite_LD107: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 11;
+ let NumMicroOps = 4;
+}
+
+def A64FXWrite_LD108: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+def A64FXWrite_LD109: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+}
+
+def A64FXWrite_LD110: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+def A64FXWrite_LD111: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+}
+
+def A64FXWrite_LD112: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+
+def A64FXWrite_LD113: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 11;
+ let NumMicroOps = 4;
+}
+
+def A64FXWrite_LD114: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+}
+
+def A64FXWrite_LD115: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 11;
+ let NumMicroOps = 5;
+}
+
+def A64FXWrite_LD1I0: SchedWriteRes<[A64FXGI056]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+def A64FXWrite_LD1I1: SchedWriteRes<[A64FXGI056]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+def A64FXWrite_LD2I0: SchedWriteRes<[A64FXGI056]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+
+def A64FXWrite_LD2I1: SchedWriteRes<[A64FXGI056]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+}
+
+def A64FXWrite_LD3I0: SchedWriteRes<[A64FXGI056]> {
+ let Latency = 8;
+ let NumMicroOps = 6;
+}
+
+def A64FXWrite_LD3I1: SchedWriteRes<[A64FXGI056]> {
+ let Latency = 8;
+ let NumMicroOps = 7;
+}
+
+def A64FXWrite_LD4I0: SchedWriteRes<[A64FXGI056]> {
+ let Latency = 8;
+ let NumMicroOps = 8;
+}
+
+def A64FXWrite_LD4I1: SchedWriteRes<[A64FXGI056]> {
+ let Latency = 8;
+ let NumMicroOps = 9;
+}
+
+def A64FXWrite_1Cyc_GI2456 : SchedWriteRes<[A64FXGI2456]> {
+ let Latency = 1;
+}
+
+def A64FXWrite_FMOV_GV : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 10;
+}
+
+def A64FXWrite_FMOV_VG14 : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 14;
+}
+
+def A64FXWrite_FMOV_VG : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 25;
+}
+
+def A64FXWrite_ADDLV : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 12;
+}
+
+def A64FXWrite_MULLE : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 14;
+}
+
+def A64FXWrite_MULLV : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 14;
+}
+
+def A64FXWrite_MADDL : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 6;
+}
+
+def A64FXWrite_ABA : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 8;
+}
+
+def A64FXWrite_ABAL : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 10;
+}
+
+def A64FXWrite_ADDLV1 : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 12;
+ let NumMicroOps = 6;
+}
+
+def A64FXWrite_MINMAXV : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 14;
+ let NumMicroOps = 6;
+}
+
+def A64FXWrite_SQRDMULH : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 9;
+}
+
+def A64FXWrite_PMUL : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 8;
+}
+
+
+def A64FXWrite_SRSRAV : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+def A64FXWrite_SSRAV : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+def A64FXWrite_RSHRN : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+}
+
+def A64FXWrite_SHRN : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+
+
+def A64FXWrite_ADDP : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+}
+
+def A64FXWrite_FMULXE : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 15;
+ let NumMicroOps = 2;
+}
+
+def A64FXWrite_FADDPV : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 15;
+ let NumMicroOps = 3;
+}
+
+def A64FXWrite_SADALP : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+}
+
+def A64FXWrite_SADDLP : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+
+def A64FXWrite_FCVTXNV : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 15;
+ let NumMicroOps = 2;
+}
+
+def A64FXWrite_FMAXVVH : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 14;
+ let NumMicroOps = 7;
+}
+
+def A64FXWrite_FMAXVVS : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 14;
+}
+
+def A64FXWrite_BIF : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 5;
+}
+
+def A64FXWrite_DUPGENERAL : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 10;
+}
+
+def A64FXWrite_SHA00 : SchedWriteRes<[A64FXGI0]> {
+ let Latency = 9;
+}
+
+def A64FXWrite_SHA01 : SchedWriteRes<[A64FXGI0]> {
+ let Latency = 12;
+}
+
+def A64FXWrite_SMOV : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 25;
+}
+
+def A64FXWrite_TBX1 : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+}
+
+def A64FXWrite_TBX2 : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 10;
+ let NumMicroOps = 5;
+}
+
+def A64FXWrite_TBX3 : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 10;
+ let NumMicroOps = 7;
+}
+
+def A64FXWrite_TBX4 : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 10;
+ let NumMicroOps = 9;
+}
+
+def A64FXWrite_PREF0: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 0;
+}
+
+def A64FXWrite_PREF1: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 0;
+}
+
+def A64FXWrite_SWP: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 0;
+}
+
+def A64FXWrite_STUR: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 0;
+}
+
+def A64FXWrite_STNP: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 0;
+}
+
+def A64FXWrite_STP01: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 0;
+}
+
+def A64FXWrite_ST10: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 0;
+}
+
+def A64FXWrite_ST11: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 0;
+}
+
+def A64FXWrite_ST12: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 0;
+}
+
+def A64FXWrite_ST13: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 0;
+}
+
+def A64FXWrite_ST14: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 1;
+}
+
+def A64FXWrite_ST15: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 1;
+}
+
+def A64FXWrite_ST16: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 1;
+}
+
+def A64FXWrite_ST17: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 1;
+}
+
+def A64FXWrite_ST1W_6: SchedWriteRes<[A64FXGI056]> {
+ let Latency = 6;
+}
+
+def A64FXWrite_ST2W_7: SchedWriteRes<[A64FXGI056]> {
+ let Latency = 7;
+}
+
+def A64FXWrite_ST3W_8: SchedWriteRes<[A64FXGI056]> {
+ let Latency = 8;
+}
+
+def A64FXWrite_ST4W_9: SchedWriteRes<[A64FXGI056]> {
+ let Latency = 9;
+}
+
+def A64FXWrite_ST1W_15: SchedWriteRes<[A64FXGI056]> {
+ let Latency = 15;
+}
+
+def A64FXWrite_ST1W_19: SchedWriteRes<[A64FXGI056]> {
+ let Latency = 19;
+}
+
+def A64FXWrite_CAS: SchedWriteRes<[A64FXGI56]> {
+ let Latency = 7;
+}
+
+// Define commonly used read types.
+
+// No forwarding is provided for these types.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
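A ReadAdvance of 0 keeps the full producer latency for these generic operand reads. If forwarding were to be modelled for one of them, the zero entry would be replaced by a non-zero advance scoped to the producing write types, as in this hypothetical sketch (the 2-cycle value is an assumption, not an A64FX figure):

    // Hypothetical, not part of this model: advance the integer
    // multiply-accumulate accumulator read by 2 cycles when it is fed by an
    // integer multiply (this would replace the ReadIMA entry above).
    def : ReadAdvance<ReadIMA, 2, [WriteIM32, WriteIM64]>;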
+
+//===----------------------------------------------------------------------===//
+// 3. Instruction Tables.
+
+//---
+// 3.1 Branch Instructions
+//---
+
+// Branch, immed
+// Branch and link, immed
+// Compare and branch
+def : WriteRes<WriteBr, [A64FXGI7]> {
+ let Latency = 1;
+}
+
+// Branch, register
+// Branch and link, register != LR
+// Branch and link, register = LR
+def : WriteRes<WriteBrReg, [A64FXGI7]> {
+ let Latency = 1;
+}
+
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+
+def : WriteRes<WriteAtomic, []> {
+ let Latency = 4;
+}
+
+//---
+// Branch
+//---
+def : InstRW<[A64FXWrite_1Cyc_GI7], (instrs B, BL, BR, BLR)>;
+def : InstRW<[A64FXWrite_1Cyc_GI7], (instrs RET)>;
+def : InstRW<[A64FXWrite_1Cyc_GI7], (instregex "^B..$")>;
+def : InstRW<[A64FXWrite_1Cyc_GI7],
+ (instregex "^CBZ", "^CBNZ", "^TBZ", "^TBNZ")>;
+
+//---
+// 3.2 Arithmetic and Logical Instructions
+// 3.3 Move and Shift Instructions
+//---
+
+// ALU, basic
+// Conditional compare
+// Conditional select
+// Address generation
+def : WriteRes<WriteI, [A64FXGI2456]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+}
+
+def : InstRW<[WriteI],
+ (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
+ "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
+ "ADC(W|X)r",
+ "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
+ "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
+ "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
+ "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r",
+ "SBCS(W|X)r", "CCMN(W|X)(i|r)",
+ "CCMP(W|X)(i|r)", "CSEL(W|X)r",
+ "CSINC(W|X)r", "CSINV(W|X)r",
+ "CSNEG(W|X)r")>;
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+// ALU, extend and/or shift
+def : WriteRes<WriteISReg, [A64FXGI2456]> {
+ let Latency = 2;
+ let ResourceCycles = [1];
+}
+
+def : InstRW<[WriteISReg],
+ (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
+ "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
+ "ADC(W|X)r",
+ "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
+ "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
+ "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
+ "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r",
+ "SBCS(W|X)r", "CCMN(W|X)(i|r)",
+ "CCMP(W|X)(i|r)", "CSEL(W|X)r",
+ "CSINC(W|X)r", "CSINV(W|X)r",
+ "CSNEG(W|X)r")>;
+
+def : WriteRes<WriteIEReg, [A64FXGI2456]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+}
+
+def : InstRW<[WriteIEReg],
+ (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
+ "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
+ "ADC(W|X)r",
+ "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
+ "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
+ "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
+ "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r",
+ "SBCS(W|X)r", "CCMN(W|X)(i|r)",
+ "CCMP(W|X)(i|r)", "CSEL(W|X)r",
+ "CSINC(W|X)r", "CSINV(W|X)r",
+ "CSNEG(W|X)r")>;
+
+// Move immed
+def : WriteRes<WriteImm, [A64FXGI2456]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+}
+
+def : InstRW<[A64FXWrite_1Cyc_GI2456],
+ (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>;
+
+def : InstRW<[A64FXWrite_2Cyc_GI24],
+ (instrs ASRVWr, ASRVXr, LSLVWr, LSLVXr, RORVWr, RORVXr)>;
+
+// Variable shift
+def : WriteRes<WriteIS, [A64FXGI2456]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+}
+
+//---
+// 3.4 Divide and Multiply Instructions
+//---
+
+// Divide, W-form
+def : WriteRes<WriteID32, [A64FXGI4]> {
+ let Latency = 39;
+ let ResourceCycles = [39];
+}
+
+// Divide, X-form
+def : WriteRes<WriteID64, [A64FXGI4]> {
+ let Latency = 23;
+ let ResourceCycles = [23];
+}
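Note that the divide entries set ResourceCycles equal to Latency: the issue port stays busy for the whole operation, which models an unpipelined divider, whereas a fully pipelined write holds its port for only a single cycle. A contrasting sketch with illustrative names (ExamplePipelined, ExampleUnpipelined):

    // Illustrative only.
    def ExamplePipelined : SchedWriteRes<[A64FXGI4]> {
      let Latency = 5;            // result available after 5 cycles
      let ResourceCycles = [1];   // port free again next cycle
    }
    def ExampleUnpipelined : SchedWriteRes<[A64FXGI4]> {
      let Latency = 39;           // result available after 39 cycles
      let ResourceCycles = [39];  // port blocked for all 39 cycles
    }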
+
+// Multiply accumulate, W-form
+def : WriteRes<WriteIM32, [A64FXGI2456]> {
+ let Latency = 5;
+ let ResourceCycles = [1];
+}
+
+// Multiply accumulate, X-form
+def : WriteRes<WriteIM64, [A64FXGI2456]> {
+ let Latency = 5;
+ let ResourceCycles = [1];
+}
+
+def : InstRW<[WriteIM32], (instrs MADDWrrr, MSUBWrrr)>;
+def : InstRW<[WriteIM32], (instrs MADDXrrr, MSUBXrrr)>;
+def : InstRW<[A64FXWrite_MADDL],
+ (instregex "(S|U)(MADDL|MSUBL)rrr")>;
+
+def : InstRW<[WriteID32], (instrs SDIVWr, UDIVWr)>;
+def : InstRW<[WriteID64], (instrs SDIVXr, UDIVXr)>;
+
+// Bitfield extract, two reg
+def : WriteRes<WriteExtr, [A64FXGI2456]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+}
+
+// Multiply high
+def : InstRW<[A64FXWrite_5Cyc_GI2], (instrs SMULHrr, UMULHrr)>;
+
+// Miscellaneous Data-Processing Instructions
+// Bitfield extract
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs EXTRWrri, EXTRXrri)>;
+
+// Bitfield move, basic
+def : InstRW<[A64FXWrite_1Cyc_GI24],
+ (instrs SBFMWri, SBFMXri, UBFMWri, UBFMXri)>;
+
+// Bitfield move, insert
+def : InstRW<[A64FXWrite_4Cyc_NGI24], (instregex "^BFM")>;
+def : InstRW<[A64FXWrite_1Cyc_GI24], (instregex "(S|U)?BFM.*")>;
+
+// Count leading
+def : InstRW<[A64FXWrite_2Cyc_GI0], (instregex "^CLS(W|X)r$",
+ "^CLZ(W|X)r$")>;
+
+// Reverse bits
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs RBITWr, RBITXr)>;
+
+// Cryptography Extensions
+def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^AES[DE]")>;
+def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^AESI?MC")>;
+def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^PMULL")>;
+def : InstRW<[A64FXWrite_SHA00], (instregex "^SHA1SU0")>;
+def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^SHA1(H|SU1)")>;
+def : InstRW<[A64FXWrite_SHA01], (instregex "^SHA1[CMP]")>;
+def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^SHA256SU0")>;
+def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^SHA256SU1")>;
+def : InstRW<[A64FXWrite_SHA01], (instregex "^SHA256(H|H2)")>;
+
+// CRC Instructions
+def : InstRW<[A64FXWrite_10Cyc_GI4], (instrs CRC32Brr, CRC32Hrr)>;
+def : InstRW<[A64FXWrite_12Cyc_GI4], (instrs CRC32Wrr)>;
+def : InstRW<[A64FXWrite_20Cyc_GI4], (instrs CRC32Xrr)>;
+
+def : InstRW<[A64FXWrite_10Cyc_GI4], (instrs CRC32CBrr, CRC32CHrr)>;
+def : InstRW<[A64FXWrite_12Cyc_GI4], (instrs CRC32CWrr)>;
+def : InstRW<[A64FXWrite_20Cyc_GI4], (instrs CRC32CXrr)>;
+
+// Reverse bits/bytes
+// NOTE: Handled by WriteI.
+
+//---
+// 3.6 Load Instructions
+// 3.10 FP Load Instructions
+//---
+
+// Load register, literal
+// Load register, unscaled immed
+// Load register, immed unprivileged
+// Load register, unsigned immed
+def : WriteRes<WriteLD, [A64FXGI56]> {
+ let Latency = 4;
+ let ResourceCycles = [3];
+}
+
+// Load register, immed post-index
+// NOTE: Handled by WriteLD, WriteI.
+// Load register, immed pre-index
+// NOTE: Handled by WriteLD, WriteAdr.
+def : WriteRes<WriteAdr, [A64FXGI2456]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+}
+
+// Load pair, immed offset, normal
+// Load pair, immed offset, signed words, base != SP
+// Load pair, immed offset, signed words, base = SP
+// LDP only breaks into *one* LS micro-op. Thus
+// the resources are handled by WriteLD.
+def : WriteRes<WriteLDHi, []> {
+ let Latency = 5;
+}
+
+// Load register offset, basic
+// Load register, register offset, scale by 4/8
+// Load register, register offset, scale by 2
+// Load register offset, extend
+// Load register, register offset, extend, scale by 4/8
+// Load register, register offset, extend, scale by 2
+def A64FXWriteLDIdx : SchedWriteVariant<[
+ SchedVar<ScaledIdxPred, [A64FXWrite_1Cyc_GI56]>,
+ SchedVar<NoSchedPred, [A64FXWrite_1Cyc_GI56]>]>;
+def : SchedAlias<WriteLDIdx, A64FXWriteLDIdx>;
+
+def A64FXReadAdrBase : SchedReadVariant<[
+ SchedVar<ScaledIdxPred, [ReadDefault]>,
+ SchedVar<NoSchedPred, [ReadDefault]>]>;
+def : SchedAlias<ReadAdrBase, A64FXReadAdrBase>;
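Both SchedVar arms above resolve to the same write, so the variant currently makes no distinction between scaled and unscaled register offsets; the mechanism exists so a predicate can select different resources per instruction at scheduling time. A hypothetical refinement, with assumed names (A64FXWriteLDIdxScaled, ExampleLDIdx) and an assumed 2-cycle cost, would look like:

    // Hypothetical, not part of this model: charge scaled register-offset
    // loads one extra cycle and keep the unscaled case unchanged. ExampleLDIdx
    // would then be aliased to WriteLDIdx in place of A64FXWriteLDIdx.
    def A64FXWriteLDIdxScaled : SchedWriteRes<[A64FXGI56]> { let Latency = 2; }
    def ExampleLDIdx : SchedWriteVariant<[
      SchedVar<ScaledIdxPred, [A64FXWriteLDIdxScaled]>,
      SchedVar<NoSchedPred,   [A64FXWrite_1Cyc_GI56]>]>;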
+
+// Load pair, immed pre-index, normal
+// Load pair, immed pre-index, signed words
+// Load pair, immed post-index, normal
+// Load pair, immed post-index, signed words
+// NOTE: Handled by WriteLD, WriteLDHi, WriteAdr.
+
+def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDNPDi)>;
+def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDNPQi)>;
+def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDNPSi)>;
+def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDNPWi)>;
+def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDNPXi)>;
+
+def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPDi)>;
+def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPQi)>;
+def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPSi)>;
+def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPSWi)>;
+def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPWi)>;
+def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPXi)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDRBui)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDRDui)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDRHui)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDRQui)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDRSui)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI6], (instrs LDRDl)>;
+def : InstRW<[A64FXWrite_5Cyc_GI6], (instrs LDRQl)>;
+def : InstRW<[A64FXWrite_5Cyc_GI6], (instrs LDRWl)>;
+def : InstRW<[A64FXWrite_5Cyc_GI6], (instrs LDRXl)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRBi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRHi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRWi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRXi)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRSBWi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRSBXi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRSHWi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRSHXi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRSWi)>;
+
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+ (instrs LDPDpre)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+ (instrs LDPQpre)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+ (instrs LDPSpre)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+ (instrs LDPWpre)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+ (instrs LDPWpre)>;
+
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRBpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRDpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRHpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRQpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRWpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRXpre)>;
+
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSBWpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSBXpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSBWpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSBXpost)>;
+
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSHWpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSHXpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSHWpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSHXpost)>;
+
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRBBpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRBBpost)>;
+
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRHHpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRHHpost)>;
+
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+ (instrs LDPDpost)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+ (instrs LDPQpost)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+ (instrs LDPSpost)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+ (instrs LDPWpost)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+ (instrs LDPXpost)>;
+
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRBpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRDpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRHpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRQpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRSpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRWpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRXpost)>;
+
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+ (instrs LDPDpre)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+ (instrs LDPQpre)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+ (instrs LDPSpre)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+ (instrs LDPWpre)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+ (instrs LDPXpre)>;
+
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRBpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRDpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRHpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRQpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRWpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRXpre)>;
+
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+ (instrs LDPDpost)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+ (instrs LDPQpost)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+ (instrs LDPSpost)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+ (instrs LDPWpost)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+ (instrs LDPXpost)>;
+
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRBpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRDpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRHpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRQpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRSpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRWpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRXpost)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRBroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRDroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRHroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRHHroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRQroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSHWroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSHXroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRWroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRXroW)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRBroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRDroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRHHroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRHroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRQroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSHWroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSHXroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRWroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRXroX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+ (instrs LDRBroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+ (instrs LDRBroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+ (instrs LDRDroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+ (instrs LDRHroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+ (instrs LDRHHroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+ (instrs LDRQroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+ (instrs LDRSroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+ (instrs LDRSHWroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+ (instrs LDRSHXroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+ (instrs LDRWroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+ (instrs LDRXroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+ (instrs LDRBroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+ (instrs LDRDroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+ (instrs LDRHroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+ (instrs LDRHHroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+ (instrs LDRQroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+ (instrs LDRSroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+ (instrs LDRSHWroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+ (instrs LDRSHXroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+ (instrs LDRWroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+ (instrs LDRXroX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURBi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURBBi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURDi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURHi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURHHi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURQi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURXi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSBWi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSBXi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSHWi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSHXi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSWi)>;
+
+//---
+// Prefetch
+//---
+def : InstRW<[A64FXWrite_PREF0], (instrs PRFMl)>;
+def : InstRW<[A64FXWrite_PREF1], (instrs PRFUMi)>;
+def : InstRW<[A64FXWrite_PREF1], (instrs PRFMui)>;
+def : InstRW<[A64FXWrite_PREF1], (instrs PRFMroW)>;
+def : InstRW<[A64FXWrite_PREF1], (instrs PRFMroX)>;
+
+//--
+// 3.7 Store Instructions
+// 3.11 FP Store Instructions
+//--
+
+// Store register, unscaled immed
+// Store register, immed unprivileged
+// Store register, unsigned immed
+def : WriteRes<WriteST, [A64FXGI56]> {
+ let Latency = 1;
+}
+
+// Store register, immed post-index
+// NOTE: Handled by WriteAdr, WriteST, ReadAdrBase
+
+// Store register, immed pre-index
+// NOTE: Handled by WriteAdr, WriteST
+
+// Store register, register offset, basic
+// Store register, register offset, scaled by 4/8
+// Store register, register offset, scaled by 2
+// Store register, register offset, extend
+// Store register, register offset, extend, scale by 4/8
+// Store register, register offset, extend, scale by 1
+def : WriteRes<WriteSTIdx, [A64FXGI56, A64FXGI2456]> {
+ let Latency = 1;
+}
+
+// Store pair, immed offset, W-form
+// Store pair, immed offset, X-form
+def : WriteRes<WriteSTP, [A64FXGI56]> {
+ let Latency = 1;
+}
+
+// Store pair, immed post-index, W-form
+// Store pair, immed post-index, X-form
+// Store pair, immed pre-index, W-form
+// Store pair, immed pre-index, X-form
+// NOTE: Handled by WriteAdr, WriteSTP.
+
+def : InstRW<[A64FXWrite_STUR], (instrs STURBi)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STURBBi)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STURDi)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STURHi)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STURHHi)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STURQi)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STURSi)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STURWi)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STURXi)>;
+
+def : InstRW<[WriteAdr, A64FXWrite_STUR], (instrs STTRBi)>;
+def : InstRW<[WriteAdr, A64FXWrite_STUR], (instrs STTRHi)>;
+def : InstRW<[WriteAdr, A64FXWrite_STUR], (instrs STTRWi)>;
+def : InstRW<[WriteAdr, A64FXWrite_STUR], (instrs STTRXi)>;
+
+def : InstRW<[A64FXWrite_STNP], (instrs STNPDi)>;
+def : InstRW<[A64FXWrite_STNP], (instrs STNPQi)>;
+def : InstRW<[A64FXWrite_STNP], (instrs STNPXi)>;
+def : InstRW<[A64FXWrite_STNP], (instrs STNPWi)>;
+
+def : InstRW<[A64FXWrite_STNP], (instrs STPDi)>;
+def : InstRW<[A64FXWrite_STNP], (instrs STPQi)>;
+def : InstRW<[A64FXWrite_STNP], (instrs STPXi)>;
+def : InstRW<[A64FXWrite_STNP], (instrs STPWi)>;
+
+def : InstRW<[A64FXWrite_STUR], (instrs STRBui)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STRBui)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STRDui)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STRDui)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STRHui)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STRHui)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STRQui)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STRQui)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STRXui)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STRXui)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STRWui)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STRWui)>;
+
+def : InstRW<[A64FXWrite_STP01],
+ (instrs STPDpre, STPDpost)>;
+def : InstRW<[A64FXWrite_STP01, ReadAdrBase],
+ (instrs STPDpre, STPDpost)>;
+def : InstRW<[A64FXWrite_STP01],
+ (instrs STPDpre, STPDpost)>;
+def : InstRW<[A64FXWrite_STP01, ReadAdrBase],
+ (instrs STPDpre, STPDpost)>;
+def : InstRW<[A64FXWrite_STP01],
+ (instrs STPQpre, STPQpost)>;
+def : InstRW<[A64FXWrite_STP01, ReadAdrBase],
+ (instrs STPQpre, STPQpost)>;
+def : InstRW<[A64FXWrite_STP01],
+ (instrs STPQpre, STPQpost)>;
+def : InstRW<[A64FXWrite_STP01, ReadAdrBase],
+ (instrs STPQpre, STPQpost)>;
+def : InstRW<[A64FXWrite_STP01],
+ (instrs STPSpre, STPSpost)>;
+def : InstRW<[A64FXWrite_STP01, ReadAdrBase],
+ (instrs STPSpre, STPSpost)>;
+def : InstRW<[A64FXWrite_STP01],
+ (instrs STPSpre, STPSpost)>;
+def : InstRW<[A64FXWrite_STP01, ReadAdrBase],
+ (instrs STPSpre, STPSpost)>;
+def : InstRW<[A64FXWrite_STP01],
+ (instrs STPWpre, STPWpost)>;
+def : InstRW<[A64FXWrite_STP01, ReadAdrBase],
+ (instrs STPWpre, STPWpost)>;
+def : InstRW<[A64FXWrite_STP01],
+ (instrs STPWpre, STPWpost)>;
+def : InstRW<[A64FXWrite_STP01, ReadAdrBase],
+ (instrs STPWpre, STPWpost)>;
+def : InstRW<[A64FXWrite_STP01],
+ (instrs STPXpre, STPXpost)>;
+def : InstRW<[A64FXWrite_STP01, ReadAdrBase],
+ (instrs STPXpre, STPXpost)>;
+def : InstRW<[A64FXWrite_STP01],
+ (instrs STPXpre, STPXpost)>;
+def : InstRW<[A64FXWrite_STP01, ReadAdrBase],
+ (instrs STPXpre, STPXpost)>;
+
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+ (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+ (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+ (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+ (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+ (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+ (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+ (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+ (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+ (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+ (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+ (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+ (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+ (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+ (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+ (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+ (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+ (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+ (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+ (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+ (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+ (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+ (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+ (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+ (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+ (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+ (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+ (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+ (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+ (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+ (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+ (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+ (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+ (instrs STRXpre, STRXpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+ (instrs STRXpre, STRXpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+ (instrs STRXpre, STRXpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+ (instrs STRXpre, STRXpost)>;
+
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+ (instrs STRBroW, STRBroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+ (instrs STRBroW, STRBroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+ (instrs STRBBroW, STRBBroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+ (instrs STRBBroW, STRBBroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+ (instrs STRDroW, STRDroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+ (instrs STRDroW, STRDroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+ (instrs STRHroW, STRHroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+ (instrs STRHroW, STRHroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+ (instrs STRHHroW, STRHHroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+ (instrs STRHHroW, STRHHroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+ (instrs STRQroW, STRQroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+ (instrs STRQroW, STRQroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+ (instrs STRSroW, STRSroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+ (instrs STRSroW, STRSroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+ (instrs STRWroW, STRWroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+ (instrs STRWroW, STRWroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+ (instrs STRXroW, STRXroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+ (instrs STRXroW, STRXroX)>;
+
+//---
+// 3.8 FP Data Processing Instructions
+//---
+
+// FP absolute value
+// FP min/max
+// FP negate
+def : WriteRes<WriteF, [A64FXGI03]> {
+ let Latency = 4;
+ let ResourceCycles = [2];
+}
+
+// FP arithmetic
+
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FADDDrr, FADDHrr)>;
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FSUBDrr, FSUBHrr)>;
+
+// FP compare
+def : WriteRes<WriteFCmp, [A64FXGI03]> {
+ let Latency = 4;
+ let ResourceCycles = [2];
+}
+
+// FP Div, Sqrt
+def : WriteRes<WriteFDiv, [A64FXGI0]> {
+ let Latency = 43;
+}
+
+def A64FXXWriteFDiv : SchedWriteRes<[A64FXGI0]> {
+ let Latency = 38;
+}
+
+def A64FXXWriteFDivSP : SchedWriteRes<[A64FXGI0]> {
+ let Latency = 29;
+}
+
+def A64FXXWriteFDivDP : SchedWriteRes<[A64FXGI0]> {
+ let Latency = 43;
+}
+
+def A64FXXWriteFSqrtSP : SchedWriteRes<[A64FXGI0]> {
+ let Latency = 29;
+}
+
+def A64FXXWriteFSqrtDP : SchedWriteRes<[A64FXGI0]> {
+ let Latency = 43;
+}
+
+// FP divide, S-form
+// FP square root, S-form
+def : InstRW<[A64FXXWriteFDivSP], (instrs FDIVSrr)>;
+def : InstRW<[A64FXXWriteFSqrtSP], (instrs FSQRTSr)>;
+def : InstRW<[A64FXXWriteFDivSP], (instregex "^FDIVv.*32$")>;
+def : InstRW<[A64FXXWriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
+def : InstRW<[A64FXXWriteFDivSP], (instregex "^FDIVSrr")>;
+def : InstRW<[A64FXXWriteFSqrtSP], (instregex "^FSQRTSr")>;
+
+// FP divide, D-form
+// FP square root, D-form
+def : InstRW<[A64FXXWriteFDivDP], (instrs FDIVDrr)>;
+def : InstRW<[A64FXXWriteFSqrtDP], (instrs FSQRTDr)>;
+def : InstRW<[A64FXXWriteFDivDP], (instregex "^FDIVv.*64$")>;
+def : InstRW<[A64FXXWriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+def : InstRW<[A64FXXWriteFDivDP], (instregex "^FDIVDrr")>;
+def : InstRW<[A64FXXWriteFSqrtDP], (instregex "^FSQRTDr")>;
+
+// FP multiply
+// FP multiply accumulate
+def : WriteRes<WriteFMul, [A64FXGI03]> {
+ let Latency = 9;
+ let ResourceCycles = [2];
+}
+
+def A64FXXWriteFMul : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 9;
+ let ResourceCycles = [2];
+}
+
+def A64FXXWriteFMulAcc : SchedWriteRes<[A64FXGI03]> {
+ let Latency = 9;
+ let ResourceCycles = [2];
+}
+
+def : InstRW<[A64FXXWriteFMul], (instregex "^FMUL", "^FNMUL")>;
+def : InstRW<[A64FXXWriteFMulAcc],
+ (instregex "^FMADD", "^FMSUB", "^FNMADD", "^FNMSUB")>;
+
+// FP round to integral
+def : InstRW<[A64FXWrite_9Cyc_GI03],
+ (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>;
+
+// FP select
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex "^FCSEL")>;
+
+//---
+// 3.9 FP Miscellaneous Instructions
+//---
+
+// FP convert, from vec to vec reg
+// FP convert, from gen to vec reg
+// FP convert, from vec to gen reg
+def : WriteRes<WriteFCvt, [A64FXGI03]> {
+ let Latency = 9;
+ let ResourceCycles = [2];
+}
+
+// FP move, immed
+// FP move, register
+def : WriteRes<WriteFImm, [A64FXGI0]> {
+ let Latency = 4;
+ let ResourceCycles = [2];
+}
+
+// FP transfer, from gen to vec reg
+// FP transfer, from vec to gen reg
+def : WriteRes<WriteFCopy, [A64FXGI0]> {
+ let Latency = 4;
+ let ResourceCycles = [2];
+}
+
+def : InstRW<[A64FXWrite_FMOV_GV], (instrs FMOVXDHighr)>;
+def : InstRW<[A64FXWrite_FMOV_VG14], (instrs FMOVDXHighr)>;
+
+//---
+// 3.12 ASIMD Integer Instructions
+//---
+
+// ASIMD absolute diff, D-form
+// ASIMD absolute diff, Q-form
+// ASIMD absolute diff accum, D-form
+// ASIMD absolute diff accum, Q-form
+// ASIMD absolute diff accum long
+// ASIMD absolute diff long
+// ASIMD arith, basic
+// ASIMD arith, complex
+// ASIMD compare
+// ASIMD logical (AND, BIC, EOR)
+// ASIMD max/min, basic
+// ASIMD max/min, reduce, 4H/4S
+// ASIMD max/min, reduce, 8B/8H
+// ASIMD max/min, reduce, 16B
+// ASIMD multiply, D-form
+// ASIMD multiply, Q-form
+// ASIMD multiply accumulate long
+// ASIMD multiply accumulate saturating long
+// ASIMD multiply long
+// ASIMD pairwise add and accumulate
+// ASIMD shift accumulate
+// ASIMD shift by immed, basic
+// ASIMD shift by immed and insert, basic, D-form
+// ASIMD shift by immed and insert, basic, Q-form
+// ASIMD shift by immed, complex
+// ASIMD shift by register, basic, D-form
+// ASIMD shift by register, basic, Q-form
+// ASIMD shift by register, complex, D-form
+// ASIMD shift by register, complex, Q-form
+def : WriteRes<WriteV, [A64FXGI03]> {
+ let Latency = 4;
+ let ResourceCycles = [1];
+}
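WriteV is the fallback for ASIMD writes reached through the generic SchedRW classes; the InstRW entries that follow override it for the instructions their patterns match, and anything left unmatched keeps the 4-cycle default above. A standalone sketch of that precedence, using an assumed write name (ExampleWrite_7Cyc_GI03) and an assumed latency:

    // Illustrative only; not part of the model.
    def ExampleWrite_7Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { let Latency = 7; }
    // Instructions matched here would use the 7-cycle entry instead of WriteV.
    def : InstRW<[ExampleWrite_7Cyc_GI03], (instregex "^ADDPv")>;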
+
+// ASIMD arith, reduce, 4H/4S
+// ASIMD arith, reduce, 8B/8H
+// ASIMD arith, reduce, 16B
+
+// ASIMD logical (MVN (alias for NOT), ORN, ORR)
+def : InstRW<[A64FXWrite_4Cyc_GI03],
+ (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>;
+
+// ASIMD arith, reduce
+def : InstRW<[A64FXWrite_ADDLV],
+ (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>;
+
+// ASIMD polynomial (8x8) multiply long
+def : InstRW<[A64FXWrite_MULLE], (instregex "^(S|U|SQD)MULL")>;
+def : InstRW<[A64FXWrite_MULLV],
+ (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>;
+def : InstRW<[A64FXWrite_8Cyc_GI03], (instregex "^PMULL(v8i8|v16i8)")>;
+def : InstRW<[A64FXWrite_8Cyc_GI03], (instregex "^PMULL(v1i64|v2i64)")>;
+
+// ASIMD absolute diff accum, D-form
+def : InstRW<[A64FXWrite_ABA],
+ (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>;
+// ASIMD absolute diff accum, Q-form
+def : InstRW<[A64FXWrite_ABA],
+ (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>;
+// ASIMD absolute diff accum long
+def : InstRW<[A64FXWrite_ABAL],
+ (instregex "^[SU]ABAL")>;
+// ASIMD arith, reduce, 4H/4S
+def : InstRW<[A64FXWrite_ADDLV1],
+ (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>;
+// ASIMD arith, reduce, 8B
+def : InstRW<[A64FXWrite_ADDLV1],
+ (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>;
+// ASIMD arith, reduce, 16B/16H
+def : InstRW<[A64FXWrite_ADDLV1],
+ (instregex "^[SU]?ADDL?Vv16i8v$")>;
+// ASIMD max/min, reduce, 4H/4S
+def : InstRW<[A64FXWrite_MINMAXV],
+ (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>;
+// ASIMD max/min, reduce, 8B/8H
+def : InstRW<[A64FXWrite_MINMAXV],
+ (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>;
+// ASIMD max/min, reduce, 16B/16H
+def : InstRW<[A64FXWrite_MINMAXV],
+ (instregex "^[SU](MIN|MAX)Vv16i8v$")>;
+// ASIMD multiply, D-form
+def : InstRW<[A64FXWrite_PMUL],
+ (instregex "^(P?MUL|SQR?DMUL)" #
+ "(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)" #
+ "(_indexed)?$")>;
+
+// ASIMD multiply, Q-form
+def : InstRW<[A64FXWrite_PMUL],
+ (instregex "^(P?MUL)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+
+// ASIMD multiply, Q-form
+def : InstRW<[A64FXWrite_SQRDMULH],
+ (instregex "^(SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+
+// ASIMD multiply accumulate, D-form
+def : InstRW<[A64FXWrite_9Cyc_GI03],
+ (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>;
+// ASIMD multiply accumulate, Q-form
+def : InstRW<[A64FXWrite_9Cyc_GI03],
+ (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>;
+// ASIMD shift accumulate
+def : InstRW<[A64FXWrite_SRSRAV],
+ (instregex "SRSRAv", "URSRAv")>;
+def : InstRW<[A64FXWrite_SSRAV],
+ (instregex "SSRAv", "USRAv")>;
+
+// ASIMD shift by immed, basic
+def : InstRW<[A64FXWrite_RSHRN],
+ (instregex "RSHRNv", "SQRSHRNv", "SQRSHRUNv", "UQRSHRNv")>;
+def : InstRW<[A64FXWrite_SHRN],
+ (instregex "SHRNv", "SQSHRNv", "SQSHRUNv", "UQSHRNv")>;
+
+def : InstRW<[A64FXWrite_6Cyc_GI3],
+ (instregex "SQXTNv", "SQXTUNv", "UQXTNv")>;
+
+// ASIMD shift by immed, complex
+def : InstRW<[A64FXWrite_ABA], (instregex "^[SU]?(Q|R){1,2}SHR")>;
+def : InstRW<[A64FXWrite_6Cyc_GI3], (instregex "^SQSHLU")>;
+// ASIMD shift by register, basic, Q-form
+def : InstRW<[A64FXWrite_6Cyc_GI3],
+ (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
+// ASIMD shift by register, complex, D-form
+def : InstRW<[A64FXWrite_6Cyc_GI3],
+ (instregex "^[SU][QR]{1,2}SHL" #
+ "(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>;
+// ASIMD shift by register, complex, Q-form
+def : InstRW<[A64FXWrite_6Cyc_GI3],
+ (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>;
+
+// ASIMD Arithmetic
+def : InstRW<[A64FXWrite_4Cyc_GI03],
+ (instregex "(ADD|SUB)(v8i8|v4i16|v2i32|v1i64)")>;
+def : InstRW<[A64FXWrite_4Cyc_GI03],
+ (instregex "(ADD|SUB)(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[A64FXWrite_SHRN], (instregex "(ADD|SUB)HNv.*")>;
+def : InstRW<[A64FXWrite_RSHRN], (instregex "(RADD|RSUB)HNv.*")>;
+def : InstRW<[A64FXWrite_4Cyc_GI03],
+ (instregex "^SQADD", "^SQNEG", "^SQSUB", "^SRHADD",
+ "^SUQADD", "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>;
+def : InstRW<[A64FXWrite_ADDP],
+ (instregex "ADDP(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[A64FXWrite_4Cyc_GI03],
+ (instregex "((AND|ORN|EOR|EON)S?(Xr[rsi]|v16i8|v8i16|v4i32)|" #
+ "(ORR|BIC)S?(Xr[rs]|v16i8|v8i16|v4i32))")>;
+def : InstRW<[A64FXWrite_4Cyc_GI0],
+ (instregex "(CLS|CLZ|CNT)(v4i32|v8i16|v16i8)")>;
+def : InstRW<[A64FXWrite_SADALP], (instregex "^SADALP", "^UADALP")>;
+def : InstRW<[A64FXWrite_SADDLP], (instregex "^SADDLPv", "^UADDLPv")>;
+def : InstRW<[A64FXWrite_ADDLV1], (instregex "^SADDLV", "^UADDLV")>;
+def : InstRW<[A64FXWrite_MINMAXV],
+ (instregex "^ADDVv", "^SMAXVv", "^UMAXVv", "^SMINVv", "^UMINVv")>;
+def : InstRW<[A64FXWrite_ABA],
+ (instregex "^SABAv", "^UABAv", "^SABALv", "^UABALv")>;
+def : InstRW<[A64FXWrite_4Cyc_GI03],
+ (instregex "^SQADDv", "^SQSUBv", "^UQADDv", "^UQSUBv")>;
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex "^SUQADDv", "^USQADDv")>;
+def : InstRW<[A64FXWrite_SHRN],
+ (instregex "^ADDHNv", "^SUBHNv")>;
+def : InstRW<[A64FXWrite_RSHRN],
+ (instregex "^RADDHNv", "^RSUBHNv")>;
+def : InstRW<[A64FXWrite_4Cyc_GI03],
+ (instregex "^SQABS", "^SQADD", "^SQNEG", "^SQSUB",
+ "^SRHADD", "^SUQADD", "^UQADD", "^UQSUB",
+ "^URHADD", "^USQADD")>;
+
+def : InstRW<[A64FXWrite_4Cyc_GI03],
+ (instregex "^CMEQv", "^CMGEv", "^CMGTv",
+ "^CMLEv", "^CMLTv", "^CMHIv", "^CMHSv")>;
+def : InstRW<[A64FXWrite_MINMAXV],
+ (instregex "^SMAXv", "^SMINv", "^UMAXv", "^UMINv")>;
+def : InstRW<[A64FXWrite_ADDP],
+ (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>;
+def : InstRW<[A64FXWrite_4Cyc_GI03],
+ (instregex "^SABDv", "^UABDv")>;
+def : InstRW<[A64FXWrite_TBX1],
+ (instregex "^SABDLv", "^UABDLv")>;
+
+//---
+// 3.13 ASIMD Floating-point Instructions
+//---
+
+// ASIMD FP absolute value
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex "^FABSv")>;
+
+// ASIMD FP arith, normal, D-form
+// ASIMD FP arith, normal, Q-form
+def : InstRW<[A64FXWrite_9Cyc_GI03],
+ (instregex "^FABDv", "^FADDv", "^FSUBv")>;
+
+// ASIMD FP arith, pairwise, D-form
+// ASIMD FP arith, pairwise, Q-form
+def : InstRW<[A64FXWrite_FADDPV], (instregex "^FADDPv")>;
+
+// ASIMD FP compare, D-form
+// ASIMD FP compare, Q-form
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex "^FACGEv", "^FACGTv")>;
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex "^FCMEQv", "^FCMGEv",
+ "^FCMGTv", "^FCMLEv",
+ "^FCMLTv")>;
+// ASIMD FP round, D-form
+def : InstRW<[A64FXWrite_9Cyc_GI03],
+ (instregex "^FRINT[AIMNPXZ](v2f32)")>;
+// ASIMD FP round, Q-form
+def : InstRW<[A64FXWrite_9Cyc_GI03],
+ (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>;
+
+// ASIMD FP convert, long
+// ASIMD FP convert, narrow
+// ASIMD FP convert, other, D-form
+// ASIMD FP convert, other, Q-form
+
+// ASIMD FP convert, long and narrow
+def : InstRW<[A64FXWrite_FCVTXNV], (instregex "^FCVT(L|N|XN)v")>;
+// ASIMD FP convert, other, D-form
+def : InstRW<[A64FXWrite_FCVTXNV],
+ (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>;
+// ASIMD FP convert, other, Q-form
+def : InstRW<[A64FXWrite_FCVTXNV],
+ (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP divide, D-form, F32
+def : InstRW<[A64FXXWriteFDivSP], (instrs FDIVv2f32)>;
+def : InstRW<[A64FXXWriteFDivSP], (instregex "FDIVv2f32")>;
+
+// ASIMD FP divide, Q-form, F32
+def : InstRW<[A64FXXWriteFDiv], (instrs FDIVv4f32)>;
+def : InstRW<[A64FXXWriteFDiv], (instregex "FDIVv4f32")>;
+
+// ASIMD FP divide, Q-form, F64
+def : InstRW<[A64FXXWriteFDivDP], (instrs FDIVv2f64)>;
+def : InstRW<[A64FXXWriteFDivDP], (instregex "FDIVv2f64")>;
+
+// ASIMD FP max/min, normal, D-form
+// ASIMD FP max/min, normal, Q-form
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instregex "^FMAXv", "^FMAXNMv",
+ "^FMINv", "^FMINNMv")>;
+
+// ASIMD FP max/min, pairwise, D-form
+// ASIMD FP max/min, pairwise, Q-form
+def : InstRW<[A64FXWrite_ADDP], (instregex "^FMAXPv", "^FMAXNMPv",
+ "^FMINPv", "^FMINNMPv")>;
+
+// ASIMD FP max/min, reduce
+def : InstRW<[A64FXWrite_FMAXVVH], (instregex "^FMAXVv", "^FMAXNMVv",
+ "^FMINVv", "^FMINNMVv")>;
+
+// ASIMD FP multiply, D-form, FZ
+// ASIMD FP multiply, D-form, no FZ
+// ASIMD FP multiply, Q-form, FZ
+// ASIMD FP multiply, Q-form, no FZ
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instregex "^FMULv", "^FMULXv")>;
+def : InstRW<[A64FXWrite_FMULXE],
+ (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>;
+def : InstRW<[A64FXWrite_FMULXE],
+ (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP multiply accumulate, D-form, FZ
+// ASIMD FP multiply accumulate, D-form, no FZ
+// ASIMD FP multiply accumulate, Q-form, FZ
+// ASIMD FP multiply accumulate, Q-form, no FZ
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instregex "^FMLAv", "^FMLSv")>;
+def : InstRW<[A64FXWrite_FMULXE],
+ (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>;
+def : InstRW<[A64FXWrite_FMULXE],
+ (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP negate
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex "^FNEGv")>;
+
+//--
+// 3.14 ASIMD Miscellaneous Instructions
+//--
+
+// ASIMD bit reverse
+def : InstRW<[A64FXWrite_1Cyc_GI2456], (instregex "^RBITv")>;
+
+// ASIMD bitwise insert, D-form
+// ASIMD bitwise insert, Q-form
+def : InstRW<[A64FXWrite_BIF],
+ (instregex "^BIFv", "^BITv", "^BSLv")>;
+
+// ASIMD count, D-form
+// ASIMD count, Q-form
+def : InstRW<[A64FXWrite_4Cyc_GI0],
+ (instregex "^CLSv", "^CLZv", "^CNTv")>;
+
+// ASIMD duplicate, gen reg
+// ASIMD duplicate, element
+def : InstRW<[A64FXWrite_DUPGENERAL], (instregex "^DUPv")>;
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^CPY")>;
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^DUPv.+gpr")>;
+
+// ASIMD extract
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^EXTv")>;
+
+// ASIMD extract narrow
+def : InstRW<[A64FXWrite_6Cyc_GI3], (instregex "^XTNv")>;
+
+// ASIMD extract narrow, saturating
+def : InstRW<[A64FXWrite_6Cyc_GI3],
+ (instregex "^SQXTNv", "^SQXTUNv", "^UQXTNv")>;
+
+// ASIMD insert, element to element
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^INSv")>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[A64FXWrite_SMOV], (instregex "^[SU]MOVv")>;
+
+// ASIMD move, integer immed
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instregex "^MOVIv")>;
+
+// ASIMD move, FP immed
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instregex "^FMOVv")>;
+
+// ASIMD table lookup, D-form
+def : InstRW<[A64FXWrite_6Cyc_GI3], (instregex "^TBLv8i8One")>;
+def : InstRW<[A64FXWrite_TBX1], (instregex "^TBLv8i8Two")>;
+def : InstRW<[A64FXWrite_TBX2], (instregex "^TBLv8i8Three")>;
+def : InstRW<[A64FXWrite_TBX3], (instregex "^TBLv8i8Four")>;
+def : InstRW<[A64FXWrite_TBX1], (instregex "^TBXv8i8One")>;
+def : InstRW<[A64FXWrite_TBX2], (instregex "^TBXv8i8Two")>;
+def : InstRW<[A64FXWrite_TBX3], (instregex "^TBXv8i8Three")>;
+def : InstRW<[A64FXWrite_TBX4], (instregex "^TBXv8i8Four")>;
+
+// ASIMD table lookup, Q-form
+def : InstRW<[A64FXWrite_6Cyc_GI3], (instregex "^TBLv16i8One")>;
+def : InstRW<[A64FXWrite_TBX1], (instregex "^TBLv16i8Two")>;
+def : InstRW<[A64FXWrite_TBX2], (instregex "^TBLv16i8Three")>;
+def : InstRW<[A64FXWrite_TBX3], (instregex "^TBLv16i8Four")>;
+def : InstRW<[A64FXWrite_TBX1], (instregex "^TBXv16i8One")>;
+def : InstRW<[A64FXWrite_TBX2], (instregex "^TBXv16i8Two")>;
+def : InstRW<[A64FXWrite_TBX3], (instregex "^TBXv16i8Three")>;
+def : InstRW<[A64FXWrite_TBX4], (instregex "^TBXv16i8Four")>;
+
+// ASIMD transpose
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^TRN1", "^TRN2")>;
+
+// ASIMD unzip/zip
+def : InstRW<[A64FXWrite_6Cyc_GI0],
+ (instregex "^UZP1", "^UZP2", "^ZIP1", "^ZIP2")>;
+
+// ASIMD reciprocal estimate, D-form
+// ASIMD reciprocal estimate, Q-form
+def : InstRW<[A64FXWrite_4Cyc_GI03],
+ (instregex "^FRECPEv", "^FRECPXv", "^URECPEv",
+ "^FRSQRTEv", "^URSQRTEv")>;
+
+// ASIMD reciprocal step, D-form, FZ
+// ASIMD reciprocal step, D-form, no FZ
+// ASIMD reciprocal step, Q-form, FZ
+// ASIMD reciprocal step, Q-form, no FZ
+def : InstRW<[A64FXWrite_9Cyc_GI0], (instregex "^FRECPSv", "^FRSQRTSv")>;
+
+// ASIMD reverse
+def : InstRW<[A64FXWrite_4Cyc_GI03],
+ (instregex "^REV16v", "^REV32v", "^REV64v")>;
+
+// ASIMD table lookup, D-form
+// ASIMD table lookup, Q-form
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^TBLv", "^TBXv")>;
+
+// ASIMD transfer, element to word or dword
+def : InstRW<[A64FXWrite_SMOV], (instregex "^[SU]MOVv")>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[A64FXWrite_SMOV], (instregex "(S|U)MOVv.*")>;
+
+// ASIMD transfer, gen reg to element
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^INSv")>;
+
+// ASIMD transpose
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^TRN1v", "^TRN2v",
+ "^UZP1v", "^UZP2v")>;
+
+// ASIMD unzip/zip
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^ZIP1v", "^ZIP2v")>;
+
+//--
+// 3.15 ASIMD Load Instructions
+//--
+
+// ASIMD load, 1 element, multiple, 1 reg, D-form
+// ASIMD load, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[A64FXWrite_8Cyc_GI56],
+ (instregex "^LD1Onev(8b|4h|2s|1d|2d)$")>;
+def : InstRW<[A64FXWrite_11Cyc_GI56],
+ (instregex "^LD1Onev(16b|8h|4s)$")>;
+def : InstRW<[A64FXWrite_LD108, WriteAdr],
+ (instregex "^LD1Onev(8b|4h|2s|1d|2d)_POST$")>;
+def : InstRW<[A64FXWrite_LD109, WriteAdr],
+ (instregex "^LD1Onev(16b|8h|4s)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, D-form
+// ASIMD load, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[A64FXWrite_LD102],
+ (instregex "^LD1Twov(8b|4h|2s|1d|2d)$")>;
+def : InstRW<[A64FXWrite_LD103],
+ (instregex "^LD1Twov(16b|8h|4s)$")>;
+def : InstRW<[A64FXWrite_LD110, WriteAdr],
+ (instregex "^LD1Twov(8b|4h|2s|1d|2d)_POST$")>;
+def : InstRW<[A64FXWrite_LD111, WriteAdr],
+ (instregex "^LD1Twov(16b|8h|4s)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, D-form
+// ASIMD load, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[A64FXWrite_LD104],
+ (instregex "^LD1Threev(8b|4h|2s|1d|2d)$")>;
+def : InstRW<[A64FXWrite_LD105],
+ (instregex "^LD1Threev(16b|8h|4s)$")>;
+def : InstRW<[A64FXWrite_LD112, WriteAdr],
+ (instregex "^LD1Threev(8b|4h|2s|1d|2d)_POST$")>;
+def : InstRW<[A64FXWrite_LD113, WriteAdr],
+ (instregex "^LD1Threev(16b|8h|4s)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, D-form
+// ASIMD load, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[A64FXWrite_LD106],
+ (instregex "^LD1Fourv(8b|4h|2s|1d|2d)$")>;
+def : InstRW<[A64FXWrite_LD107],
+ (instregex "^LD1Fourv(16b|8h|4s)$")>;
+def : InstRW<[A64FXWrite_LD114, WriteAdr],
+ (instregex "^LD1Fourv(8b|4h|2s|1d|2d)_POST$")>;
+def : InstRW<[A64FXWrite_LD115, WriteAdr],
+ (instregex "^LD1Fourv(16b|8h|4s)_POST$")>;
+
+// ASIMD load, 1 element, one lane, B/H/S
+// ASIMD load, 1 element, one lane, D
+def : InstRW<[A64FXWrite_LD1I0], (instregex "^LD1i(8|16|32|64)$")>;
+def : InstRW<[A64FXWrite_LD1I1, WriteAdr],
+ (instregex "^LD1i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, D-form, B/H/S
+// ASIMD load, 1 element, all lanes, D-form, D
+// ASIMD load, 1 element, all lanes, Q-form
+def : InstRW<[A64FXWrite_8Cyc_GI03],
+ (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A64FXWrite_LD108, WriteAdr],
+ (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, multiple, D-form, B/H/S
+// ASIMD load, 2 element, multiple, Q-form, B/H/S
+// ASIMD load, 2 element, multiple, Q-form, D
+def : InstRW<[A64FXWrite_LD103],
+ (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[A64FXWrite_LD111, WriteAdr],
+ (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, one lane, B/H
+// ASIMD load, 2 element, one lane, S
+// ASIMD load, 2 element, one lane, D
+def : InstRW<[A64FXWrite_LD2I0], (instregex "^LD2i(8|16|32|64)$")>;
+def : InstRW<[A64FXWrite_LD2I1, WriteAdr],
+ (instregex "^LD2i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 2 element, all lanes, D-form, B/H/S
+// ASIMD load, 2 element, all lanes, D-form, D
+// ASIMD load, 2 element, all lanes, Q-form
+def : InstRW<[A64FXWrite_LD102],
+ (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A64FXWrite_LD110, WriteAdr],
+ (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, multiple, D-form, B/H/S
+// ASIMD load, 3 element, multiple, Q-form, B/H/S
+// ASIMD load, 3 element, multiple, Q-form, D
+def : InstRW<[A64FXWrite_LD105],
+ (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[A64FXWrite_LD113, WriteAdr],
+ (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, one lane, B/H
+// ASIMD load, 3 element, one lane, S
+// ASIMD load, 3 element, one lane, D
+def : InstRW<[A64FXWrite_LD3I0], (instregex "^LD3i(8|16|32|64)$")>;
+def : InstRW<[A64FXWrite_LD3I1, WriteAdr],
+ (instregex "^LD3i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 3 element, all lanes, D-form, B/H/S
+// ASIMD load, 3 element, all lanes, D-form, D
+// ASIMD load, 3 element, all lanes, Q-form, B/H/S
+// ASIMD load, 3 element, all lanes, Q-form, D
+def : InstRW<[A64FXWrite_LD104],
+ (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A64FXWrite_LD112, WriteAdr],
+ (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, multiple, D-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, D
+def : InstRW<[A64FXWrite_LD107],
+ (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[A64FXWrite_LD115, WriteAdr],
+ (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, one lane, B/H
+// ASIMD load, 4 element, one lane, S
+// ASIMD load, 4 element, one lane, D
+def : InstRW<[A64FXWrite_LD4I0], (instregex "^LD4i(8|16|32|64)$")>;
+def : InstRW<[A64FXWrite_LD4I1, WriteAdr],
+ (instregex "^LD4i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 4 element, all lanes, D-form, B/H/S
+// ASIMD load, 4 element, all lanes, D-form, D
+// ASIMD load, 4 element, all lanes, Q-form, B/H/S
+// ASIMD load, 4 element, all lanes, Q-form, D
+def : InstRW<[A64FXWrite_LD106],
+ (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A64FXWrite_LD114, WriteAdr],
+ (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+//--
+// 3.16 ASIMD Store Instructions
+//--
+
+// ASIMD store, 1 element, multiple, 1 reg, D-form
+// ASIMD store, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[A64FXWrite_ST10],
+ (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A64FXWrite_ST14, WriteAdr],
+ (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 2 reg, D-form
+// ASIMD store, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[A64FXWrite_ST11],
+ (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A64FXWrite_ST15, WriteAdr],
+ (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 3 reg, D-form
+// ASIMD store, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[A64FXWrite_ST12],
+ (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A64FXWrite_ST16, WriteAdr],
+ (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 4 reg, D-form
+// ASIMD store, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[A64FXWrite_ST13],
+ (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A64FXWrite_ST17, WriteAdr],
+ (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, one lane, B/H/S
+// ASIMD store, 1 element, one lane, D
+def : InstRW<[A64FXWrite_ST10],
+ (instregex "^ST1i(8|16|32|64)$")>;
+def : InstRW<[A64FXWrite_ST14, WriteAdr],
+ (instregex "^ST1i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 2 element, multiple, D-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, D
+def : InstRW<[A64FXWrite_ST11],
+ (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[A64FXWrite_ST15, WriteAdr],
+ (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 2 element, one lane, B/H/S
+// ASIMD store, 2 element, one lane, D
+def : InstRW<[A64FXWrite_ST11],
+ (instregex "^ST2i(8|16|32|64)$")>;
+def : InstRW<[A64FXWrite_ST15, WriteAdr],
+ (instregex "^ST2i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 3 element, multiple, D-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, D
+def : InstRW<[A64FXWrite_ST12],
+ (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[A64FXWrite_ST16, WriteAdr],
+ (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 3 element, one lane, B/H
+// ASIMD store, 3 element, one lane, S
+// ASIMD store, 3 element, one lane, D
+def : InstRW<[A64FXWrite_ST12], (instregex "^ST3i(8|16|32|64)$")>;
+def : InstRW<[A64FXWrite_ST16, WriteAdr],
+ (instregex "^ST3i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 4 element, multiple, D-form, B/H/S
+// ASIMD store, 4 element, multiple, Q-form, B/H/S
+// ASIMD store, 4 element, multiple, Q-form, D
+def : InstRW<[A64FXWrite_ST13],
+ (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[A64FXWrite_ST17, WriteAdr],
+ (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 4 element, one lane, B/H
+// ASIMD store, 4 element, one lane, S
+// ASIMD store, 4 element, one lane, D
+def : InstRW<[A64FXWrite_ST13], (instregex "^ST4i(8|16|32|64)$")>;
+def : InstRW<[A64FXWrite_ST17, WriteAdr],
+ (instregex "^ST4i(8|16|32|64)_POST$")>;
+
+// V8.1a Atomics (LSE)
+def : InstRW<[A64FXWrite_CAS, WriteAtomic],
+ (instrs CASB, CASH, CASW, CASX)>;
+
+def : InstRW<[A64FXWrite_CAS, WriteAtomic],
+ (instrs CASAB, CASAH, CASAW, CASAX)>;
+
+def : InstRW<[A64FXWrite_CAS, WriteAtomic],
+ (instrs CASLB, CASLH, CASLW, CASLX)>;
+
+def : InstRW<[A64FXWrite_CAS, WriteAtomic],
+ (instrs CASALB, CASALH, CASALW, CASALX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+ (instrs LDLARB, LDLARH, LDLARW, LDLARX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+ (instrs LDADDB, LDADDH, LDADDW, LDADDX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+ (instrs LDADDAB, LDADDAH, LDADDAW, LDADDAX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+ (instrs LDADDLB, LDADDLH, LDADDLW, LDADDLX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+ (instrs LDADDALB, LDADDALH, LDADDALW, LDADDALX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+ (instrs LDCLRB, LDCLRH, LDCLRW, LDCLRX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+ (instrs LDCLRAB, LDCLRAH, LDCLRAW, LDCLRAX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+ (instrs LDCLRLB, LDCLRLH, LDCLRLW, LDCLRLX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+ (instrs LDCLRALB, LDCLRALH, LDCLRALW, LDCLRALX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+ (instrs LDEORB, LDEORH, LDEORW, LDEORX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+ (instrs LDEORAB, LDEORAH, LDEORAW, LDEORAX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+ (instrs LDEORLB, LDEORLH, LDEORLW, LDEORLX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+ (instrs LDEORALB, LDEORALH, LDEORALW, LDEORALX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+ (instrs LDSETB, LDSETH, LDSETW, LDSETX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+ (instrs LDSETAB, LDSETAH, LDSETAW, LDSETAX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+ (instrs LDSETLB, LDSETLH, LDSETLW, LDSETLX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+ (instrs LDSETALB, LDSETALH, LDSETALW, LDSETALX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+ (instrs LDSMAXB, LDSMAXH, LDSMAXW, LDSMAXX,
+ LDSMAXAB, LDSMAXAH, LDSMAXAW, LDSMAXAX,
+ LDSMAXLB, LDSMAXLH, LDSMAXLW, LDSMAXLX,
+ LDSMAXALB, LDSMAXALH, LDSMAXALW, LDSMAXALX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+ (instrs LDSMINB, LDSMINH, LDSMINW, LDSMINX,
+ LDSMINAB, LDSMINAH, LDSMINAW, LDSMINAX,
+ LDSMINLB, LDSMINLH, LDSMINLW, LDSMINLX,
+ LDSMINALB, LDSMINALH, LDSMINALW, LDSMINALX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+ (instrs LDUMAXB, LDUMAXH, LDUMAXW, LDUMAXX,
+ LDUMAXAB, LDUMAXAH, LDUMAXAW, LDUMAXAX,
+ LDUMAXLB, LDUMAXLH, LDUMAXLW, LDUMAXLX,
+ LDUMAXALB, LDUMAXALH, LDUMAXALW, LDUMAXALX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+ (instrs LDUMINB, LDUMINH, LDUMINW, LDUMINX,
+ LDUMINAB, LDUMINAH, LDUMINAW, LDUMINAX,
+ LDUMINLB, LDUMINLH, LDUMINLW, LDUMINLX,
+ LDUMINALB, LDUMINALH, LDUMINALW, LDUMINALX)>;
+
+def : InstRW<[A64FXWrite_SWP, WriteAtomic],
+ (instrs SWPB, SWPH, SWPW, SWPX)>;
+
+def : InstRW<[A64FXWrite_SWP, WriteAtomic],
+ (instrs SWPAB, SWPAH, SWPAW, SWPAX)>;
+
+def : InstRW<[A64FXWrite_SWP, WriteAtomic],
+ (instrs SWPLB, SWPLH, SWPLW, SWPLX)>;
+
+def : InstRW<[A64FXWrite_SWP, WriteAtomic],
+ (instrs SWPALB, SWPALH, SWPALW, SWPALX)>;
+
+def : InstRW<[A64FXWrite_STUR, WriteAtomic],
+ (instrs STLLRB, STLLRH, STLLRW, STLLRX)>;
+
+// [ 1] "abs $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ABS_ZPmZ_B, ABS_ZPmZ_D, ABS_ZPmZ_H, ABS_ZPmZ_S)>;
+
+// [ 2] "add $Zd, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ADD_ZZZ_B, ADD_ZZZ_D, ADD_ZZZ_H, ADD_ZZZ_S)>;
+
+// [ 3] "add $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ADD_ZPmZ_B, ADD_ZPmZ_D, ADD_ZPmZ_H, ADD_ZPmZ_S)>;
+
+// [ 4] "add $Zdn, $_Zdn, $imm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ADD_ZI_B, ADD_ZI_D, ADD_ZI_H, ADD_ZI_S)>;
+
+// [ 5] "addpl $Rd, $Rn, $imm6";
+def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs ADDPL_XXI)>;
+
+// [ 6] "addvl $Rd, $Rn, $imm6";
+def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs ADDVL_XXI)>;
+
+// [ 7] "adr $Zd, [$Zn, $Zm]";
+def : InstRW<[A64FXWrite_5Cyc_GI0], (instrs ADR_LSL_ZZZ_D_0, ADR_LSL_ZZZ_D_1, ADR_LSL_ZZZ_D_2, ADR_LSL_ZZZ_D_3, ADR_LSL_ZZZ_S_0, ADR_LSL_ZZZ_S_1, ADR_LSL_ZZZ_S_2, ADR_LSL_ZZZ_S_3, ADR_SXTW_ZZZ_D_0, ADR_SXTW_ZZZ_D_1, ADR_SXTW_ZZZ_D_2, ADR_SXTW_ZZZ_D_3, ADR_UXTW_ZZZ_D_0, ADR_UXTW_ZZZ_D_1, ADR_UXTW_ZZZ_D_2, ADR_UXTW_ZZZ_D_3)>;
+
+// [ 8] "and $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs AND_PPzPP)>;
+
+// [ 9] "and $Zd, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs AND_ZZZ)>;
+
+// [10] "and $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs AND_ZPmZ_B, AND_ZPmZ_D, AND_ZPmZ_H, AND_ZPmZ_S)>;
+
+// [11] "and $Zdn, $_Zdn, $imms13";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs AND_ZI)>;
+
+// [12] "ands $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs ANDS_PPzPP)>;
+
+// [13] "andv $Vd, $Pg, $Zn";
+def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs ANDV_VPZ_B, ANDV_VPZ_D, ANDV_VPZ_H, ANDV_VPZ_S)>;
+
+// [14] "asr $Zd, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASR_WIDE_ZZZ_B, ASR_WIDE_ZZZ_H, ASR_WIDE_ZZZ_S)>;
+
+// [15] "asr $Zd, $Zn, $imm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASR_ZZI_B, ASR_ZZI_D, ASR_ZZI_H, ASR_ZZI_S)>;
+
+// [16] "asr $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASR_WIDE_ZPmZ_B, ASR_WIDE_ZPmZ_H, ASR_WIDE_ZPmZ_S, ASR_ZPmZ_B, ASR_ZPmZ_D, ASR_ZPmZ_H, ASR_ZPmZ_S)>;
+
+// [17] "asr $Zdn, $Pg/m, $_Zdn, $imm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASR_ZPmI_B, ASR_ZPmI_D, ASR_ZPmI_H, ASR_ZPmI_S)>;
+
+// [18] "asrd $Zdn, $Pg/m, $_Zdn, $imm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASRD_ZPmI_B, ASRD_ZPmI_D, ASRD_ZPmI_H, ASRD_ZPmI_S)>;
+
+// [19] "asrr $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASRR_ZPmZ_B, ASRR_ZPmZ_D, ASRR_ZPmZ_H, ASRR_ZPmZ_S)>;
+
+// [20] "bic $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BIC_PPzPP)>;
+
+// [21] "bic $Zd, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs BIC_ZZZ)>;
+
+// [22] "bic $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs BIC_ZPmZ_B, BIC_ZPmZ_D, BIC_ZPmZ_H, BIC_ZPmZ_S)>;
+
+// [23] "bics $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BICS_PPzPP)>;
+
+// [24] "brka $Pd, $Pg/m, $Pn";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKA_PPmP)>;
+
+// [25] "brka $Pd, $Pg/z, $Pn";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKA_PPzP)>;
+
+// [26] "brkas $Pd, $Pg/z, $Pn";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKAS_PPzP)>;
+
+// [27] "brkb $Pd, $Pg/m, $Pn";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKB_PPmP)>;
+
+// [28] "brkb $Pd, $Pg/z, $Pn";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKB_PPzP)>;
+
+// [29] "brkbs $Pd, $Pg/z, $Pn";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKBS_PPzP)>;
+
+// [30] "brkn $Pdm, $Pg/z, $Pn, $_Pdm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKN_PPzP)>;
+
+// [31] "brkns $Pdm, $Pg/z, $Pn, $_Pdm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKNS_PPzP)>;
+
+// [32] "brkpa $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKPA_PPzPP)>;
+
+// [33] "brkpas $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKPAS_PPzPP)>;
+
+// [34] "brkpb $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKPB_PPzPP)>;
+
+// [35] "brkpbs $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKPBS_PPzPP)>;
+
+// [36] "clasta $Rdn, $Pg, $_Rdn, $Zm";
+def : InstRW<[A64FXWrite_29Cyc_GI0256], (instrs CLASTA_RPZ_B, CLASTA_RPZ_D, CLASTA_RPZ_H, CLASTA_RPZ_S)>;
+
+// [37] "clasta $Vdn, $Pg, $_Vdn, $Zm";
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CLASTA_VPZ_B, CLASTA_VPZ_D, CLASTA_VPZ_H, CLASTA_VPZ_S)>;
+
+// [38] "clasta $Zdn, $Pg, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CLASTA_ZPZ_B, CLASTA_ZPZ_D, CLASTA_ZPZ_H, CLASTA_ZPZ_S)>;
+
+// [39] "clastb $Rdn, $Pg, $_Rdn, $Zm";
+def : InstRW<[A64FXWrite_29Cyc_GI0256], (instrs CLASTB_RPZ_B, CLASTB_RPZ_D, CLASTB_RPZ_H, CLASTB_RPZ_S)>;
+
+// [40] "clastb $Vdn, $Pg, $_Vdn, $Zm";
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CLASTB_VPZ_B, CLASTB_VPZ_D, CLASTB_VPZ_H, CLASTB_VPZ_S)>;
+
+// [41] "clastb $Zdn, $Pg, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CLASTB_ZPZ_B, CLASTB_ZPZ_D, CLASTB_ZPZ_H, CLASTB_ZPZ_S)>;
+
+// [42] "cls $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs CLS_ZPmZ_B, CLS_ZPmZ_D, CLS_ZPmZ_H, CLS_ZPmZ_S)>;
+
+// [43] "clz $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs CLZ_ZPmZ_B, CLZ_ZPmZ_D, CLZ_ZPmZ_H, CLZ_ZPmZ_S)>;
+
+// [44] "cmpeq $Pd, $Pg/z, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPEQ_PPzZZ_B, CMPEQ_PPzZZ_D, CMPEQ_PPzZZ_H, CMPEQ_PPzZZ_S, CMPEQ_WIDE_PPzZZ_B, CMPEQ_WIDE_PPzZZ_H, CMPEQ_WIDE_PPzZZ_S)>;
+
+// [45] "cmpeq $Pd, $Pg/z, $Zn, $imm5";
+def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPEQ_PPzZI_B, CMPEQ_PPzZI_D, CMPEQ_PPzZI_H, CMPEQ_PPzZI_S)>;
+
+// [46] "cmpge $Pd, $Pg/z, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPGE_PPzZZ_B, CMPGE_PPzZZ_D, CMPGE_PPzZZ_H, CMPGE_PPzZZ_S, CMPGE_WIDE_PPzZZ_B, CMPGE_WIDE_PPzZZ_H, CMPGE_WIDE_PPzZZ_S)>;
+
+// [47] "cmpge $Pd, $Pg/z, $Zn, $imm5";
+def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPGE_PPzZI_B, CMPGE_PPzZI_D, CMPGE_PPzZI_H, CMPGE_PPzZI_S)>;
+
+// [48] "cmpgt $Pd, $Pg/z, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPGT_PPzZZ_B, CMPGT_PPzZZ_D, CMPGT_PPzZZ_H, CMPGT_PPzZZ_S, CMPGT_WIDE_PPzZZ_B, CMPGT_WIDE_PPzZZ_H, CMPGT_WIDE_PPzZZ_S)>;
+
+// [49] "cmpgt $Pd, $Pg/z, $Zn, $imm5";
+def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPGT_PPzZI_B, CMPGT_PPzZI_D, CMPGT_PPzZI_H, CMPGT_PPzZI_S)>;
+
+// [50] "cmphi $Pd, $Pg/z, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPHI_PPzZZ_B, CMPHI_PPzZZ_D, CMPHI_PPzZZ_H, CMPHI_PPzZZ_S, CMPHI_WIDE_PPzZZ_B, CMPHI_WIDE_PPzZZ_H, CMPHI_WIDE_PPzZZ_S)>;
+
+// [51] "cmphi $Pd, $Pg/z, $Zn, $imm7";
+def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPHI_PPzZI_B, CMPHI_PPzZI_D, CMPHI_PPzZI_H, CMPHI_PPzZI_S)>;
+
+// [52] "cmphs $Pd, $Pg/z, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPHS_PPzZZ_B, CMPHS_PPzZZ_D, CMPHS_PPzZZ_H, CMPHS_PPzZZ_S, CMPHS_WIDE_PPzZZ_B, CMPHS_WIDE_PPzZZ_H, CMPHS_WIDE_PPzZZ_S)>;
+
+// [53] "cmphs $Pd, $Pg/z, $Zn, $imm7";
+def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPHS_PPzZI_B, CMPHS_PPzZI_D, CMPHS_PPzZI_H, CMPHS_PPzZI_S)>;
+
+// [54] "cmple $Pd, $Pg/z, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLE_WIDE_PPzZZ_B, CMPLE_WIDE_PPzZZ_H, CMPLE_WIDE_PPzZZ_S)>;
+
+// [55] "cmple $Pd, $Pg/z, $Zn, $imm5";
+def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLE_PPzZI_B, CMPLE_PPzZI_D, CMPLE_PPzZI_H, CMPLE_PPzZI_S)>;
+
+// [56] "cmplo $Pd, $Pg/z, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLO_WIDE_PPzZZ_B, CMPLO_WIDE_PPzZZ_H, CMPLO_WIDE_PPzZZ_S)>;
+
+// [57] "cmplo $Pd, $Pg/z, $Zn, $imm7";
+def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLO_PPzZI_B, CMPLO_PPzZI_D, CMPLO_PPzZI_H, CMPLO_PPzZI_S)>;
+
+// [58] "cmpls $Pd, $Pg/z, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLS_WIDE_PPzZZ_B, CMPLS_WIDE_PPzZZ_H, CMPLS_WIDE_PPzZZ_S)>;
+
+// [59] "cmpls $Pd, $Pg/z, $Zn, $imm7";
+def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLS_PPzZI_B, CMPLS_PPzZI_D, CMPLS_PPzZI_H, CMPLS_PPzZI_S)>;
+
+// [60] "cmplt $Pd, $Pg/z, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLT_WIDE_PPzZZ_B, CMPLT_WIDE_PPzZZ_H, CMPLT_WIDE_PPzZZ_S)>;
+
+// [61] "cmplt $Pd, $Pg/z, $Zn, $imm5";
+def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLT_PPzZI_B, CMPLT_PPzZI_D, CMPLT_PPzZI_H, CMPLT_PPzZI_S)>;
+
+// [62] "cmpne $Pd, $Pg/z, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPNE_PPzZZ_B, CMPNE_PPzZZ_D, CMPNE_PPzZZ_H, CMPNE_PPzZZ_S, CMPNE_WIDE_PPzZZ_B, CMPNE_WIDE_PPzZZ_H, CMPNE_WIDE_PPzZZ_S)>;
+
+// [63] "cmpne $Pd, $Pg/z, $Zn, $imm5";
+def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPNE_PPzZI_B, CMPNE_PPzZI_D, CMPNE_PPzZI_H, CMPNE_PPzZI_S)>;
+
+// [64] "cnot $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs CNOT_ZPmZ_B, CNOT_ZPmZ_D, CNOT_ZPmZ_H, CNOT_ZPmZ_S)>;
+
+// [65] "cnt $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI3], (instrs CNT_ZPmZ_B, CNT_ZPmZ_D, CNT_ZPmZ_H, CNT_ZPmZ_S)>;
+
+// [66] "cntb $Rd, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs CNTB_XPiI)>;
+
+// [67] "cntd $Rd, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs CNTD_XPiI)>;
+
+// [68] "cnth $Rd, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs CNTH_XPiI)>;
+
+// [69] "cntp $Rd, $Pg, $Pn";
+def : InstRW<[A64FXWrite_6Cyc_GI01], (instrs CNTP_XPP_B, CNTP_XPP_D, CNTP_XPP_H, CNTP_XPP_S)>;
+
+// [70] "cntw $Rd, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs CNTW_XPiI)>;
+
+// [71] "compact $Zd, $Pg, $Zn";
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs COMPACT_ZPZ_D, COMPACT_ZPZ_S)>;
+
+// [72] "cpy $Zd, $Pg/m, $Rn";
+//@@@ def : InstRW<[XXXXXX], (instrs CPY_ZPmR_B, CPY_ZPmR_D, CPY_ZPmR_H, CPY_ZPmR_S)>;
+
+// [73] "cpy $Zd, $Pg/m, $Vn";
+//@@@ def : InstRW<[XXXXXX], (instrs CPY_ZPmV_B, CPY_ZPmV_D, CPY_ZPmV_H, CPY_ZPmV_S)>;
+
+// [74] "cpy $Zd, $Pg/m, $imm";
+//@@@ def : InstRW<[XXXXXX], (instrs CPY_ZPmI_B, CPY_ZPmI_D, CPY_ZPmI_H, CPY_ZPmI_S)>;
+
+// [75] "cpy $Zd, $Pg/z, $imm";
+//@@@ def : InstRW<[XXXXXX], (instrs CPY_ZPzI_B, CPY_ZPzI_D, CPY_ZPzI_H, CPY_ZPzI_S)>;
+
+// [76] "ctermeq $Rn, $Rm";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs CTERMEQ_WW, CTERMEQ_XX)>;
+
+// [77] "ctermne $Rn, $Rm";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs CTERMNE_WW, CTERMNE_XX)>;
+
+// [78] "decb $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs DECB_XPiI)>;
+
+// [79] "decd $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs DECD_XPiI)>;
+
+// [80] "decd $Zdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs DECD_ZPiI)>;
+
+// [81] "dech $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs DECH_XPiI)>;
+
+// [82] "dech $Zdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs DECH_ZPiI)>;
+
+// [83] "decp $Rdn, $Pg";
+def : InstRW<[A64FXWrite_6Cyc_GI124], (instrs DECP_XP_B, DECP_XP_D, DECP_XP_H, DECP_XP_S)>;
+
+// [84] "decp $Zdn, $Pg";
+def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs DECP_ZP_D, DECP_ZP_H, DECP_ZP_S)>;
+
+// [85] "decw $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs DECW_XPiI)>;
+
+// [86] "decw $Zdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs DECW_ZPiI)>;
+
+// [87] "dup $Zd, $Rn";
+def : InstRW<[A64FXWrite_8Cyc_GI01], (instrs DUP_ZR_B, DUP_ZR_D, DUP_ZR_H, DUP_ZR_S)>;
+
+// [88] "dup $Zd, $Zn$idx";
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs DUP_ZZI_B, DUP_ZZI_D, DUP_ZZI_H, DUP_ZZI_Q, DUP_ZZI_S)>;
+
+// [89] "dup $Zd, $imm";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs DUP_ZI_B, DUP_ZI_D, DUP_ZI_H, DUP_ZI_S)>;
+
+// [90] "dupm $Zd, $imms";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs DUPM_ZI)>;
+
+// [91] "eor $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs EOR_PPzPP)>;
+
+// [92] "eor $Zd, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs EOR_ZZZ)>;
+
+// [93] "eor $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs EOR_ZPmZ_B, EOR_ZPmZ_D, EOR_ZPmZ_H, EOR_ZPmZ_S)>;
+
+// [94] "eor $Zdn, $_Zdn, $imms13";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs EOR_ZI)>;
+
+// [95] "eors $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs EORS_PPzPP)>;
+
+// [96] "eorv $Vd, $Pg, $Zn";
+def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs EORV_VPZ_B, EORV_VPZ_D, EORV_VPZ_H, EORV_VPZ_S)>;
+
+// [97] "ext $Zdn, $_Zdn, $Zm, $imm8";
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs EXT_ZZI)>;
+
+// [99] "fabd $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FABD_ZPmZ_D, FABD_ZPmZ_H, FABD_ZPmZ_S)>;
+
+// [100] "fabs $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FABS_ZPmZ_D, FABS_ZPmZ_H, FABS_ZPmZ_S)>;
+
+// [101] "facge $Pd, $Pg/z, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FACGE_PPzZZ_D, FACGE_PPzZZ_H, FACGE_PPzZZ_S)>;
+
+// [102] "facgt $Pd, $Pg/z, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FACGT_PPzZZ_D, FACGT_PPzZZ_H, FACGT_PPzZZ_S)>;
+
+// [103] "fadd $Zd, $Zn, $Zm"; def is line 1638
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FADD_ZZZ_D, FADD_ZZZ_H, FADD_ZZZ_S)>;
+
+// [104] "fadd $Zdn, $Pg/m, $_Zdn, $Zm"; def is line 1638
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FADD_ZPmZ_D, FADD_ZPmZ_H, FADD_ZPmZ_S)>;
+
+// [105] "fadd $Zdn, $Pg/m, $_Zdn, $i1"; def is line 1638
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FADD_ZPmI_D, FADD_ZPmI_H, FADD_ZPmI_S)>;
+
+// [106] "fadda $Vdn, $Pg, $_Vdn, $Zm";
+def : InstRW<[A64FXWrite_18Cyc_GI03], (instrs FADDA_VPZ_D, FADDA_VPZ_H, FADDA_VPZ_S)>;
+
+// [107] "faddv $Vd, $Pg, $Zn";
+// H : 4 / 6 / ([1,2]9 / [1]6) x 4 / [1,2]9 = 75 cycle
+// S : 4 / 6 / ([1,2]9 / [1]6) x 3 / [1,2]9 = 60 cycle
+// D : 4 / 6 / ([1,2]9 / [1]6) x 2 / [1,2]9 = 45 cycle
+def : InstRW<[A64FXWrite_75Cyc_GI03], (instrs FADDV_VPZ_H)>;
+def : InstRW<[A64FXWrite_60Cyc_GI03], (instrs FADDV_VPZ_S)>;
+def : InstRW<[A64FXWrite_45Cyc_GI03], (instrs FADDV_VPZ_D)>;
+
+// [108] "fcadd $Zdn, $Pg/m, $_Zdn, $Zm, $imm";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FCADD_ZPmZ_D, FCADD_ZPmZ_H, FCADD_ZPmZ_S)>;
+
+// [109] "fcmeq $Pd, $Pg/z, $Zn, #0.0";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMEQ_PPzZ0_D, FCMEQ_PPzZ0_H, FCMEQ_PPzZ0_S)>;
+
+// [110] "fcmeq $Pd, $Pg/z, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMEQ_PPzZZ_D, FCMEQ_PPzZZ_H, FCMEQ_PPzZZ_S)>;
+
+// [111] "fcmge $Pd, $Pg/z, $Zn, #0.0";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMGE_PPzZ0_D, FCMGE_PPzZ0_H, FCMGE_PPzZ0_S)>;
+
+// [112] "fcmge $Pd, $Pg/z, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMGE_PPzZZ_D, FCMGE_PPzZZ_H, FCMGE_PPzZZ_S)>;
+
+// [113] "fcmgt $Pd, $Pg/z, $Zn, #0.0";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMGT_PPzZ0_D, FCMGT_PPzZ0_H, FCMGT_PPzZ0_S)>;
+
+// [114] "fcmgt $Pd, $Pg/z, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMGT_PPzZZ_D, FCMGT_PPzZZ_H, FCMGT_PPzZZ_S)>;
+
+// [115] "fcmla $Zda, $Pg/m, $Zn, $Zm, $imm";
+def : InstRW<[A64FXWrite_15Cyc_GI03], (instrs FCMLA_ZPmZZ_D, FCMLA_ZPmZZ_H, FCMLA_ZPmZZ_S)>;
+
+// [116] "fcmla $Zda, $Zn, $Zm$iop, $imm";
+def : InstRW<[A64FXWrite_15Cyc_GI03], (instrs FCMLA_ZZZI_H, FCMLA_ZZZI_S)>;
+
+// [117] "fcmle $Pd, $Pg/z, $Zn, #0.0";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMLE_PPzZ0_D, FCMLE_PPzZ0_H, FCMLE_PPzZ0_S)>;
+
+// [118] "fcmlt $Pd, $Pg/z, $Zn, #0.0";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMLT_PPzZ0_D, FCMLT_PPzZ0_H, FCMLT_PPzZ0_S)>;
+
+// [119] "fcmne $Pd, $Pg/z, $Zn, #0.0";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMNE_PPzZ0_D, FCMNE_PPzZ0_H, FCMNE_PPzZ0_S)>;
+
+// [120] "fcmne $Pd, $Pg/z, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMNE_PPzZZ_D, FCMNE_PPzZZ_H, FCMNE_PPzZZ_S)>;
+
+// [121] "fcmuo $Pd, $Pg/z, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMUO_PPzZZ_D, FCMUO_PPzZZ_H, FCMUO_PPzZZ_S)>;
+
+// [122] "fcpy $Zd, $Pg/m, $imm8";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCPY_ZPmI_D, FCPY_ZPmI_H, FCPY_ZPmI_S)>;
+
+// [123] "fcvt $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FCVT_ZPmZ_DtoH, FCVT_ZPmZ_DtoS, FCVT_ZPmZ_HtoD, FCVT_ZPmZ_HtoS, FCVT_ZPmZ_StoD, FCVT_ZPmZ_StoH)>;
+
+// [124] "fcvtzs $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FCVTZS_ZPmZ_DtoD, FCVTZS_ZPmZ_DtoS, FCVTZS_ZPmZ_HtoD, FCVTZS_ZPmZ_HtoH, FCVTZS_ZPmZ_HtoS, FCVTZS_ZPmZ_StoD, FCVTZS_ZPmZ_StoS)>;
+
+// [125] "fcvtzu $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FCVTZU_ZPmZ_DtoD, FCVTZU_ZPmZ_DtoS, FCVTZU_ZPmZ_HtoD, FCVTZU_ZPmZ_HtoH, FCVTZU_ZPmZ_HtoS, FCVTZU_ZPmZ_StoD, FCVTZU_ZPmZ_StoS)>;
+
+// [126] "fdiv $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_154Cyc_GI0], (instrs FDIV_ZPmZ_D)>;
+def : InstRW<[A64FXWrite_134Cyc_GI0], (instrs FDIV_ZPmZ_H)>;
+def : InstRW<[A64FXWrite_98Cyc_GI0], (instrs FDIV_ZPmZ_S)>;
+
+// [127] "fdivr $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_154Cyc_GI0], (instrs FDIVR_ZPmZ_D)>;
+def : InstRW<[A64FXWrite_134Cyc_GI0], (instrs FDIVR_ZPmZ_H)>;
+def : InstRW<[A64FXWrite_98Cyc_GI0], (instrs FDIVR_ZPmZ_S)>;
+
+// [128] "fdup $Zd, $imm8";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FDUP_ZI_D, FDUP_ZI_H, FDUP_ZI_S)>;
+
+// [129] "fexpa $Zd, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FEXPA_ZZ_D, FEXPA_ZZ_H, FEXPA_ZZ_S)>;
+
+// [130] "fmad $Zdn, $Pg/m, $Zm, $Za";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FMAD_ZPmZZ_D, FMAD_ZPmZZ_H, FMAD_ZPmZZ_S)>;
+
+// [131] "fmax $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FMAX_ZPmZ_D, FMAX_ZPmZ_H, FMAX_ZPmZ_S)>;
+
+// [132] "fmax $Zdn, $Pg/m, $_Zdn, $i1";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FMAX_ZPmI_D, FMAX_ZPmI_H, FMAX_ZPmI_S)>;
+
+// [133] "fmaxnm $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FMAXNM_ZPmZ_D, FMAXNM_ZPmZ_H, FMAXNM_ZPmZ_S)>;
+
+// [134] "fmaxnm $Zdn, $Pg/m, $_Zdn, $i1";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FMAXNM_ZPmI_D, FMAXNM_ZPmI_H, FMAXNM_ZPmI_S)>;
+
+// [135] "fmaxnmv $Vd, $Pg, $Zn";
+def : InstRW<[A64FXWrite_10Cyc_GI03], (instrs FMAXNMV_VPZ_D, FMAXNMV_VPZ_H, FMAXNMV_VPZ_S)>;
+
+// [136] "fmaxv $Vd, $Pg, $Zn";
+def : InstRW<[A64FXWrite_10Cyc_GI03], (instrs FMAXV_VPZ_D, FMAXV_VPZ_H, FMAXV_VPZ_S)>;
+
+// [137] "fmin $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FMIN_ZPmZ_D, FMIN_ZPmZ_H, FMIN_ZPmZ_S)>;
+
+// [138] "fmin $Zdn, $Pg/m, $_Zdn, $i1";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FMIN_ZPmI_D, FMIN_ZPmI_H, FMIN_ZPmI_S)>;
+
+// [139] "fminnm $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FMINNM_ZPmZ_D, FMINNM_ZPmZ_H, FMINNM_ZPmZ_S)>;
+
+// [140] "fminnm $Zdn, $Pg/m, $_Zdn, $i1";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FMINNM_ZPmI_D, FMINNM_ZPmI_H, FMINNM_ZPmI_S)>;
+
+// [141] "fminnmv $Vd, $Pg, $Zn";
+def : InstRW<[A64FXWrite_10Cyc_GI03], (instrs FMINNMV_VPZ_D, FMINNMV_VPZ_H, FMINNMV_VPZ_S)>;
+
+// [142] "fminv $Vd, $Pg, $Zn";
+def : InstRW<[A64FXWrite_10Cyc_GI03], (instrs FMINV_VPZ_D, FMINV_VPZ_H, FMINV_VPZ_S)>;
+
+// [143] "fmla $Zda, $Pg/m, $Zn, $Zm";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FMLA_ZPmZZ_D, FMLA_ZPmZZ_H, FMLA_ZPmZZ_S)>;
+
+// [144] "fmla $Zda, $Zn, $Zm$iop";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FMLA_ZZZI_D, FMLA_ZZZI_H, FMLA_ZZZI_S)>;
+
+// [145] "fmls $Zda, $Pg/m, $Zn, $Zm";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FMLS_ZPmZZ_D, FMLS_ZPmZZ_H, FMLS_ZPmZZ_S)>;
+
+// [146] "fmls $Zda, $Zn, $Zm$iop";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FMLS_ZZZI_D, FMLS_ZZZI_H, FMLS_ZZZI_S)>;
+
+// [147] "fmsb $Zdn, $Pg/m, $Zm, $Za";
+
+// [148] "fmul $Zd, $Zn, $Zm";
+
+// [149] "fmul $Zd, $Zn, $Zm$iop";
+
+// [150] "fmul $Zdn, $Pg/m, $_Zdn, $Zm";
+
+// [151] "fmul $Zdn, $Pg/m, $_Zdn, $i1";
+
+// [152] "fmulx $Zdn, $Pg/m, $_Zdn, $Zm";
+
+// [153] "fneg $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FNEG_ZPmZ_D, FNEG_ZPmZ_H, FNEG_ZPmZ_S)>;
+
+// [154] "fnmad $Zdn, $Pg/m, $Zm, $Za";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FNMAD_ZPmZZ_D, FNMAD_ZPmZZ_H, FNMAD_ZPmZZ_S)>;
+
+// [155] "fnmla $Zda, $Pg/m, $Zn, $Zm";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FNMLA_ZPmZZ_D, FNMLA_ZPmZZ_H, FNMLA_ZPmZZ_S)>;
+
+// [156] "fnmls $Zda, $Pg/m, $Zn, $Zm";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FNMLS_ZPmZZ_D, FNMLS_ZPmZZ_H, FNMLS_ZPmZZ_S)>;
+
+// [157] "fnmsb $Zdn, $Pg/m, $Zm, $Za";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FNMSB_ZPmZZ_D, FNMSB_ZPmZZ_H, FNMSB_ZPmZZ_S)>;
+
+// [158] "frecpe $Zd, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FRECPE_ZZ_D, FRECPE_ZZ_H, FRECPE_ZZ_S)>;
+
+// [159] "frecps $Zd, $Zn, $Zm";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRECPS_ZZZ_D, FRECPS_ZZZ_H, FRECPS_ZZZ_S)>;
+
+// [160] "frecpx $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FRECPX_ZPmZ_D, FRECPX_ZPmZ_H, FRECPX_ZPmZ_S)>;
+
+// [161] "frinta $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTA_ZPmZ_D, FRINTA_ZPmZ_H, FRINTA_ZPmZ_S)>;
+
+// [162] "frinti $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTI_ZPmZ_D, FRINTI_ZPmZ_H, FRINTI_ZPmZ_S)>;
+
+// [163] "frintm $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTM_ZPmZ_D, FRINTM_ZPmZ_H, FRINTM_ZPmZ_S)>;
+
+// [164] "frintn $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTN_ZPmZ_D, FRINTN_ZPmZ_H, FRINTN_ZPmZ_S)>;
+
+// [165] "frintp $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTP_ZPmZ_D, FRINTP_ZPmZ_H, FRINTP_ZPmZ_S)>;
+
+// [166] "frintx $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTX_ZPmZ_D, FRINTX_ZPmZ_H, FRINTX_ZPmZ_S)>;
+
+// [167] "frintz $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTZ_ZPmZ_D, FRINTZ_ZPmZ_H, FRINTZ_ZPmZ_S)>;
+
+// [168] "frsqrte $Zd, $Zn";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRSQRTE_ZZ_D, FRSQRTE_ZZ_H, FRSQRTE_ZZ_S)>;
+
+// [169] "frsqrts $Zd, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FRSQRTS_ZZZ_D, FRSQRTS_ZZZ_H, FRSQRTS_ZZZ_S)>;
+
+// [170] "fscale $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FSCALE_ZPmZ_D, FSCALE_ZPmZ_H, FSCALE_ZPmZ_S)>;
+
+// [171] "fsqrt $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_154Cyc_GI0], (instrs FSQRT_ZPmZ_D)>;
+def : InstRW<[A64FXWrite_134Cyc_GI0], (instrs FSQRT_ZPmZ_H)>;
+def : InstRW<[A64FXWrite_98Cyc_GI0], (instrs FSQRT_ZPmZ_S)>;
+
+// [172] "fsub $Zd, $Zn, $Zm";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FSUB_ZZZ_D, FSUB_ZZZ_H, FSUB_ZZZ_S)>;
+
+// [173] "fsub $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FSUB_ZPmZ_D, FSUB_ZPmZ_H, FSUB_ZPmZ_S)>;
+
+// [174] "fsub $Zdn, $Pg/m, $_Zdn, $i1";
+def : InstRW<[A64FXWrite_9Cyc_GI0], (instrs FSUB_ZPmI_D, FSUB_ZPmI_H, FSUB_ZPmI_S)>;
+
+// [175] "fsubr $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FSUBR_ZPmZ_D, FSUBR_ZPmZ_H, FSUBR_ZPmZ_S)>;
+
+// [176] "fsubr $Zdn, $Pg/m, $_Zdn, $i1";
+def : InstRW<[A64FXWrite_9Cyc_GI0], (instrs FSUBR_ZPmI_D, FSUBR_ZPmI_H, FSUBR_ZPmI_S)>;
+
+// [177] "ftmad $Zdn, $_Zdn, $Zm, $imm3";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FTMAD_ZZI_D, FTMAD_ZZI_H, FTMAD_ZZI_S)>;
+
+// [178] "ftsmul $Zd, $Zn, $Zm";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FTSMUL_ZZZ_D, FTSMUL_ZZZ_H, FTSMUL_ZZZ_S)>;
+
+// [180] "incb $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs INCB_XPiI)>;
+
+// [181] "incd $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs INCD_XPiI)>;
+
+// [182] "incd $Zdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs INCD_ZPiI)>;
+
+// [183] "inch $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs INCH_XPiI)>;
+
+// [184] "inch $Zdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs INCH_ZPiI)>;
+
+// [185] "incp $Rdn, $Pg";
+def : InstRW<[A64FXWrite_6Cyc_GI124], (instrs INCP_XP_B, INCP_XP_D, INCP_XP_H, INCP_XP_S)>;
+
+// [186] "incp $Zdn, $Pg";
+def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs INCP_ZP_D, INCP_ZP_H, INCP_ZP_S)>;
+
+// [187] "incw $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs INCW_XPiI)>;
+
+// [188] "incw $Zdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs INCW_ZPiI)>;
+
+// [189] "index $Zd, $Rn, $Rm";
+def : InstRW<[A64FXWrite_17Cyc_GI02], (instrs INDEX_RR_B, INDEX_RR_D, INDEX_RR_H, INDEX_RR_S)>;
+
+// [190] "index $Zd, $Rn, $imm5";
+def : InstRW<[A64FXWrite_21Cyc_GI02], (instrs INDEX_RI_B, INDEX_RI_D, INDEX_RI_H, INDEX_RI_S)>;
+
+// [191] "index $Zd, $imm5, $Rm";
+def : InstRW<[A64FXWrite_21Cyc_GI02], (instrs INDEX_IR_B, INDEX_IR_D, INDEX_IR_H, INDEX_IR_S)>;
+
+// [192] "index $Zd, $imm5, $imm5b";
+def : InstRW<[A64FXWrite_13Cyc_GI0], (instrs INDEX_II_B, INDEX_II_D, INDEX_II_H, INDEX_II_S)>;
+
+// [193] "insr $Zdn, $Rm";
+def : InstRW<[A64FXWrite_10Cyc_GI02], (instrs INSR_ZR_B, INSR_ZR_D, INSR_ZR_H, INSR_ZR_S)>;
+
+// [194] "insr $Zdn, $Vm";
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs INSR_ZV_B, INSR_ZV_D, INSR_ZV_H, INSR_ZV_S)>;
+
+// [195] "lasta $Rd, $Pg, $Zn";
+def : InstRW<[A64FXWrite_25Cyc_GI056], (instrs LASTA_RPZ_B, LASTA_RPZ_D, LASTA_RPZ_H, LASTA_RPZ_S)>;
+
+// [196] "lasta $Vd, $Pg, $Zn";
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs LASTA_VPZ_B, LASTA_VPZ_D, LASTA_VPZ_H, LASTA_VPZ_S)>;
+
+// [197] "lastb $Rd, $Pg, $Zn";
+def : InstRW<[A64FXWrite_25Cyc_GI056], (instrs LASTB_RPZ_B, LASTB_RPZ_D, LASTB_RPZ_H, LASTB_RPZ_S)>;
+
+// [198] "lastb $Vd, $Pg, $Zn";
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs LASTB_VPZ_B, LASTB_VPZ_D, LASTB_VPZ_H, LASTB_VPZ_S)>;
+
+// [199] "ld1b $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1B, LD1B_D, LD1B_H, LD1B_S)>;
+
+// [200] "ld1b $Zt, $Pg/z, [$Rn, $Zm]";
+def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1B_D_REAL, GLD1B_D_SXTW_REAL, GLD1B_D_UXTW_REAL, GLD1B_S_SXTW_REAL, GLD1B_S_UXTW_REAL)>;
+
+// [201] "ld1b $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1B_D_IMM_REAL, LD1B_H_IMM_REAL, LD1B_IMM_REAL, LD1B_S_IMM_REAL)>;
+
+// [202] "ld1b $Zt, $Pg/z, [$Zn, $imm5]";
+def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1B_D_IMM_REAL, GLD1B_S_IMM_REAL)>;
+
+// [203] "ld1d $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1D)>;
+
+// [204] "ld1d $Zt, $Pg/z, [$Rn, $Zm]";
+def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1D_REAL, GLD1D_SCALED_REAL, GLD1D_SXTW_REAL, GLD1D_SXTW_SCALED_REAL, GLD1D_UXTW_REAL, GLD1D_UXTW_SCALED_REAL)>;
+
+// [205] "ld1d $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1D_IMM_REAL)>;
+
+// [206] "ld1d $Zt, $Pg/z, [$Zn, $imm5]";
+def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1D_IMM_REAL)>;
+
+// [207] "ld1h $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1H, LD1H_D, LD1H_S)>;
+
+// [208] "ld1h $Zt, $Pg/z, [$Rn, $Zm]";
+def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1H_D_REAL, GLD1H_D_SCALED_REAL, GLD1H_D_SXTW_REAL, GLD1H_D_SXTW_SCALED_REAL, GLD1H_D_UXTW_REAL, GLD1H_D_UXTW_SCALED_REAL, GLD1H_S_SXTW_REAL, GLD1H_S_SXTW_SCALED_REAL, GLD1H_S_UXTW_REAL, GLD1H_S_UXTW_SCALED_REAL)>;
+
+// [209] "ld1h $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1H_D_IMM_REAL, LD1H_IMM_REAL, LD1H_S_IMM_REAL)>;
+
+// [210] "ld1h $Zt, $Pg/z, [$Zn, $imm5]";
+def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1H_D_IMM_REAL, GLD1H_S_IMM_REAL)>;
+
+// [211] "ld1rb $Zt, $Pg/z, [$Rn, $imm6]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RB_D_IMM, LD1RB_H_IMM, LD1RB_IMM, LD1RB_S_IMM)>;
+
+// [212] "ld1rd $Zt, $Pg/z, [$Rn, $imm6]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RD_IMM)>;
+
+// [213] "ld1rh $Zt, $Pg/z, [$Rn, $imm6]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RH_D_IMM, LD1RH_IMM, LD1RH_S_IMM)>;
+
+// [214] "ld1rqb $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_B)>;
+
+// [215] "ld1rqb $Zt, $Pg/z, [$Rn, $imm4]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_B_IMM)>;
+
+// [216] "ld1rqd $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_D)>;
+
+// [217] "ld1rqd $Zt, $Pg/z, [$Rn, $imm4]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_D_IMM)>;
+
+// [218] "ld1rqh $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_H)>;
+
+// [219] "ld1rqh $Zt, $Pg/z, [$Rn, $imm4]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_H_IMM)>;
+
+// [220] "ld1rqw $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_W)>;
+
+// [221] "ld1rqw $Zt, $Pg/z, [$Rn, $imm4]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_W_IMM)>;
+
+// [222] "ld1rsb $Zt, $Pg/z, [$Rn, $imm6]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RSB_D_IMM, LD1RSB_H_IMM, LD1RSB_S_IMM)>;
+
+// [223] "ld1rsh $Zt, $Pg/z, [$Rn, $imm6]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RSH_D_IMM, LD1RSH_S_IMM)>;
+
+// [224] "ld1rsw $Zt, $Pg/z, [$Rn, $imm6]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RSW_IMM)>;
+
+// [225] "ld1rw $Zt, $Pg/z, [$Rn, $imm6]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RW_D_IMM, LD1RW_IMM)>;
+
+// [226] "ld1sb $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SB_D, LD1SB_H, LD1SB_S)>;
+
+// [227] "ld1sb $Zt, $Pg/z, [$Rn, $Zm]";
+def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1SB_D_REAL, GLD1SB_D_SXTW_REAL, GLD1SB_D_UXTW_REAL, GLD1SB_S_SXTW_REAL, GLD1SB_S_UXTW_REAL)>;
+
+// [228] "ld1sb $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SB_D_IMM_REAL, LD1SB_H_IMM_REAL, LD1SB_S_IMM_REAL)>;
+
+// [229] "ld1sb $Zt, $Pg/z, [$Zn, $imm5]";
+def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1SB_D_IMM_REAL, GLD1SB_S_IMM_REAL)>;
+
+// [230] "ld1sh $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SH_D, LD1SH_S)>;
+
+// [231] "ld1sh $Zt, $Pg/z, [$Rn, $Zm]";
+def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1SH_D_REAL, GLD1SH_D_SCALED_REAL, GLD1SH_D_SXTW_REAL, GLD1SH_D_SXTW_SCALED_REAL, GLD1SH_D_UXTW_REAL, GLD1SH_D_UXTW_SCALED_REAL, GLD1SH_S_SXTW_REAL, GLD1SH_S_SXTW_SCALED_REAL, GLD1SH_S_UXTW_REAL, GLD1SH_S_UXTW_SCALED_REAL)>;
+
+// [232] "ld1sh $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SH_D_IMM_REAL, LD1SH_S_IMM_REAL)>;
+
+// [233] "ld1sh $Zt, $Pg/z, [$Zn, $imm5]";
+def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1SH_D_IMM_REAL, GLD1SH_S_IMM_REAL)>;
+
+// [234] "ld1sw $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SW_D)>;
+
+// [235] "ld1sw $Zt, $Pg/z, [$Rn, $Zm]";
+def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1SW_D_REAL, GLD1SW_D_SCALED_REAL, GLD1SW_D_SXTW_REAL, GLD1SW_D_SXTW_SCALED_REAL, GLD1SW_D_UXTW_REAL, GLD1SW_D_UXTW_SCALED_REAL)>;
+
+// [236] "ld1sw $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SW_D_IMM_REAL)>;
+
+// [237] "ld1sw $Zt, $Pg/z, [$Zn, $imm5]";
+def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1SW_D_IMM_REAL)>;
+
+// [238] "ld1w $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1W, LD1W_D)>;
+
+// [239] "ld1w $Zt, $Pg/z, [$Rn, $Zm]";
+def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1W_D_REAL, GLD1W_D_SCALED_REAL, GLD1W_D_SXTW_REAL, GLD1W_D_SXTW_SCALED_REAL, GLD1W_D_UXTW_REAL, GLD1W_D_UXTW_SCALED_REAL, GLD1W_SXTW_REAL, GLD1W_SXTW_SCALED_REAL, GLD1W_UXTW_REAL, GLD1W_UXTW_SCALED_REAL)>;
+
+// [240] "ld1w $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1W_D_IMM_REAL, LD1W_IMM_REAL)>;
+
+// [241] "ld1w $Zt, $Pg/z, [$Zn, $imm5]";
+def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1W_D_IMM_REAL, GLD1W_IMM_REAL)>;
+
+// [242] "ld2b $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD2B)>;
+
+// [243] "ld2b $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD2B_IMM)>;
+
+// [244] "ld2d $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD2D)>;
+
+// [245] "ld2d $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD2D_IMM)>;
+
+// [246] "ld2h $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD2H)>;
+
+// [247] "ld2h $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD2H_IMM)>;
+
+// [248] "ld2w $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD2W)>;
+
+// [249] "ld2w $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD2W_IMM)>;
+
+// [250] "ld3b $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD3B)>;
+
+// [251] "ld3b $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD3B_IMM)>;
+
+// [252] "ld3d $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD3D)>;
+
+// [253] "ld3d $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD3D_IMM)>;
+
+// [254] "ld3h $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD3H)>;
+
+// [255] "ld3h $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD3H_IMM)>;
+
+// [256] "ld3w $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD3W)>;
+
+// [257] "ld3w $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD3W_IMM)>;
+
+// [258] "ld4b $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD4B)>;
+
+// [259] "ld4b $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD4B_IMM)>;
+
+// [260] "ld4d $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4D)>;
+
+// [261] "ld4d $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4D_IMM)>;
+
+// [262] "ld4h $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4H)>;
+
+// [263] "ld4h $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4H_IMM)>;
+
+// [264] "ld4w $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4W)>;
+
+// [265] "ld4w $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4W_IMM)>;
+
+// [266] "ldff1b $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1B_D_REAL, LDFF1B_H_REAL, LDFF1B_REAL, LDFF1B_S_REAL)>;
+
+// [267] "ldff1b $Zt, $Pg/z, [$Rn, $Zm]";
+def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1B_D_REAL, GLDFF1B_D_SXTW_REAL, GLDFF1B_D_UXTW_REAL, GLDFF1B_S_SXTW_REAL, GLDFF1B_S_UXTW_REAL)>;
+
+// [268] "ldff1b $Zt, $Pg/z, [$Zn, $imm5]";
+def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1B_D_IMM_REAL, GLDFF1B_S_IMM_REAL)>;
+
+// [269] "ldff1d $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1D_REAL)>;
+
+// [270] "ldff1d $Zt, $Pg/z, [$Rn, $Zm]";
+def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1D_REAL, GLDFF1D_SCALED_REAL, GLDFF1D_SXTW_REAL, GLDFF1D_SXTW_SCALED_REAL, GLDFF1D_UXTW_REAL, GLDFF1D_UXTW_SCALED_REAL)>;
+
+// [271] "ldff1d $Zt, $Pg/z, [$Zn, $imm5]";
+def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1D_IMM_REAL)>;
+
+// [272] "ldff1h $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1H_D_REAL, LDFF1H_REAL, LDFF1H_S_REAL)>;
+
+// [273] "ldff1h $Zt, $Pg/z, [$Rn, $Zm]";
+def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1H_D_REAL, GLDFF1H_D_SCALED_REAL, GLDFF1H_D_SXTW_REAL, GLDFF1H_D_SXTW_SCALED_REAL, GLDFF1H_D_UXTW_REAL, GLDFF1H_D_UXTW_SCALED_REAL, GLDFF1H_S_SXTW_REAL, GLDFF1H_S_SXTW_SCALED_REAL, GLDFF1H_S_UXTW_REAL, GLDFF1H_S_UXTW_SCALED_REAL)>;
+
+// [274] "ldff1h $Zt, $Pg/z, [$Zn, $imm5]";
+def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1H_D_IMM_REAL, GLDFF1H_S_IMM_REAL)>;
+
+// [275] "ldff1sb $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1SB_D_REAL, LDFF1SB_H_REAL, LDFF1SB_S_REAL)>;
+
+// [276] "ldff1sb $Zt, $Pg/z, [$Rn, $Zm]";
+def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1SB_D_REAL, GLDFF1SB_D_SXTW_REAL, GLDFF1SB_D_UXTW_REAL, GLDFF1SB_S_SXTW_REAL, GLDFF1SB_S_UXTW_REAL)>;
+
+// [277] "ldff1sb $Zt, $Pg/z, [$Zn, $imm5]";
+def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1SB_D_IMM_REAL, GLDFF1SB_S_IMM_REAL)>;
+
+// [278] "ldff1sh $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1SH_D_REAL, LDFF1SH_S_REAL)>;
+
+// [279] "ldff1sh $Zt, $Pg/z, [$Rn, $Zm]";
+def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1SH_D_REAL, GLDFF1SH_D_SCALED_REAL, GLDFF1SH_D_SXTW_REAL, GLDFF1SH_D_SXTW_SCALED_REAL, GLDFF1SH_D_UXTW_REAL, GLDFF1SH_D_UXTW_SCALED_REAL, GLDFF1SH_S_SXTW_REAL, GLDFF1SH_S_SXTW_SCALED_REAL, GLDFF1SH_S_UXTW_REAL, GLDFF1SH_S_UXTW_SCALED_REAL)>;
+
+// [280] "ldff1sh $Zt, $Pg/z, [$Zn, $imm5]";
+def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1SH_D_IMM_REAL, GLDFF1SH_S_IMM_REAL)>;
+
+// [281] "ldff1sw $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1SW_D_REAL)>;
+
+// [282] "ldff1sw $Zt, $Pg/z, [$Rn, $Zm]";
+def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1SW_D_REAL, GLDFF1SW_D_SCALED_REAL, GLDFF1SW_D_SXTW_REAL, GLDFF1SW_D_SXTW_SCALED_REAL, GLDFF1SW_D_UXTW_REAL, GLDFF1SW_D_UXTW_SCALED_REAL)>;
+
+// [283] "ldff1sw $Zt, $Pg/z, [$Zn, $imm5]";
+def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1SW_D_IMM_REAL)>;
+
+// [284] "ldff1w $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1W_D_REAL, LDFF1W_REAL)>;
+
+// [285] "ldff1w $Zt, $Pg/z, [$Rn, $Zm]";
+def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1W_D_REAL, GLDFF1W_D_SCALED_REAL, GLDFF1W_D_SXTW_REAL, GLDFF1W_D_SXTW_SCALED_REAL, GLDFF1W_D_UXTW_REAL, GLDFF1W_D_UXTW_SCALED_REAL, GLDFF1W_SXTW_REAL, GLDFF1W_SXTW_SCALED_REAL, GLDFF1W_UXTW_REAL, GLDFF1W_UXTW_SCALED_REAL)>;
+
+// [286] "ldff1w $Zt, $Pg/z, [$Zn, $imm5]";
+def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1W_D_IMM_REAL, GLDFF1W_IMM_REAL)>;
+
+// [287] "ldnf1b $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1B_D_IMM_REAL, LDNF1B_H_IMM_REAL, LDNF1B_IMM_REAL, LDNF1B_S_IMM_REAL)>;
+
+// [288] "ldnf1d $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1D_IMM_REAL)>;
+
+// [289] "ldnf1h $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1H_D_IMM_REAL, LDNF1H_IMM_REAL, LDNF1H_S_IMM_REAL)>;
+
+// [290] "ldnf1sb $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1SB_D_IMM_REAL, LDNF1SB_H_IMM_REAL, LDNF1SB_S_IMM_REAL)>;
+
+// [291] "ldnf1sh $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1SH_D_IMM_REAL, LDNF1SH_S_IMM_REAL)>;
+
+// [292] "ldnf1sw $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1SW_D_IMM_REAL)>;
+
+// [293] "ldnf1w $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1W_D_IMM_REAL, LDNF1W_IMM_REAL)>;
+
+// [294] "ldnt1b $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1B_ZRR)>;
+
+// [295] "ldnt1b $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1B_ZRI)>;
+
+// [296] "ldnt1d $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1D_ZRR)>;
+
+// [297] "ldnt1d $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1D_ZRI)>;
+
+// [298] "ldnt1h $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1H_ZRR)>;
+
+// [299] "ldnt1h $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1H_ZRI)>;
+
+// [300] "ldnt1w $Zt, $Pg/z, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1W_ZRR)>;
+
+// [301] "ldnt1w $Zt, $Pg/z, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1W_ZRI)>;
+
+// [302] "ldr $Pt, [$Rn, $imm9, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI5], (instrs LDR_PXI)>;
+
+// [303] "ldr $Zt, [$Rn, $imm9, mul vl]";
+def : InstRW<[A64FXWrite_11Cyc_GI5], (instrs LDR_ZXI)>;
+
+// [304] "lsl $Zd, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSL_WIDE_ZZZ_B, LSL_WIDE_ZZZ_H, LSL_WIDE_ZZZ_S)>;
+
+// [305] "lsl $Zd, $Zn, $imm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSL_ZZI_B, LSL_ZZI_D, LSL_ZZI_H, LSL_ZZI_S)>;
+
+// [306] "lsl $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSL_WIDE_ZPmZ_B, LSL_WIDE_ZPmZ_H, LSL_WIDE_ZPmZ_S, LSL_ZPmZ_B, LSL_ZPmZ_D, LSL_ZPmZ_H, LSL_ZPmZ_S)>;
+
+// [307] "lsl $Zdn, $Pg/m, $_Zdn, $imm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSL_ZPmI_B, LSL_ZPmI_D, LSL_ZPmI_H, LSL_ZPmI_S)>;
+
+// [308] "lslr $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSLR_ZPmZ_B, LSLR_ZPmZ_D, LSLR_ZPmZ_H, LSLR_ZPmZ_S)>;
+
+// [309] "lsr $Zd, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSR_WIDE_ZZZ_B, LSR_WIDE_ZZZ_H, LSR_WIDE_ZZZ_S)>;
+
+// [310] "lsr $Zd, $Zn, $imm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSR_ZZI_B, LSR_ZZI_D, LSR_ZZI_H, LSR_ZZI_S)>;
+
+// [311] "lsr $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSR_WIDE_ZPmZ_B, LSR_WIDE_ZPmZ_H, LSR_WIDE_ZPmZ_S, LSR_ZPmZ_B, LSR_ZPmZ_D, LSR_ZPmZ_H, LSR_ZPmZ_S)>;
+
+// [312] "lsr $Zdn, $Pg/m, $_Zdn, $imm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSR_ZPmI_B, LSR_ZPmI_D, LSR_ZPmI_H, LSR_ZPmI_S)>;
+
+// [313] "lsrr $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSRR_ZPmZ_B, LSRR_ZPmZ_D, LSRR_ZPmZ_H, LSRR_ZPmZ_S)>;
+
+// [314] "mad $Zdn, $Pg/m, $Zm, $Za";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs MAD_ZPmZZ_B, MAD_ZPmZZ_D, MAD_ZPmZZ_H, MAD_ZPmZZ_S)>;
+
+// [315] "mla $Zda, $Pg/m, $Zn, $Zm";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs MLA_ZPmZZ_B, MLA_ZPmZZ_D, MLA_ZPmZZ_H, MLA_ZPmZZ_S)>;
+
+// [316] "mls $Zda, $Pg/m, $Zn, $Zm";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs MLS_ZPmZZ_B, MLS_ZPmZZ_D, MLS_ZPmZZ_H, MLS_ZPmZZ_S)>;
+
+// [317] "movprfx $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs MOVPRFX_ZPmZ_B, MOVPRFX_ZPmZ_D, MOVPRFX_ZPmZ_H, MOVPRFX_ZPmZ_S)>;
+
+// [318] "movprfx $Zd, $Pg/z, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs MOVPRFX_ZPzZ_B, MOVPRFX_ZPzZ_D, MOVPRFX_ZPzZ_H, MOVPRFX_ZPzZ_S)>;
+
+// [319] "movprfx $Zd, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs MOVPRFX_ZZ)>;
+
+// [320] "msb $Zdn, $Pg/m, $Zm, $Za";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs MSB_ZPmZZ_B, MSB_ZPmZZ_D, MSB_ZPmZZ_H, MSB_ZPmZZ_S)>;
+
+// [321] "mul $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs MUL_ZPmZ_B, MUL_ZPmZ_D, MUL_ZPmZ_H, MUL_ZPmZ_S)>;
+
+// [322] "mul $Zdn, $_Zdn, $imm";
+def : InstRW<[A64FXWrite_9Cyc_GI0], (instrs MUL_ZI_B, MUL_ZI_D, MUL_ZI_H, MUL_ZI_S)>;
+
+// [323] "nand $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs NAND_PPzPP)>;
+
+// [324] "nands $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs NANDS_PPzPP)>;
+
+// [325] "neg $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs NEG_ZPmZ_B, NEG_ZPmZ_D, NEG_ZPmZ_H, NEG_ZPmZ_S)>;
+
+// [326] "nor $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs NOR_PPzPP)>;
+
+// [327] "nors $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs NORS_PPzPP)>;
+
+// [328] "not $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs NOT_ZPmZ_B, NOT_ZPmZ_D, NOT_ZPmZ_H, NOT_ZPmZ_S)>;
+
+// [329] "orn $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs ORN_PPzPP)>;
+
+// [330] "orns $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs ORNS_PPzPP)>;
+
+// [331] "orr $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs ORR_PPzPP)>;
+
+// [332] "orr $Zd, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ORR_ZZZ)>;
+
+// [333] "orr $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ORR_ZPmZ_B, ORR_ZPmZ_D, ORR_ZPmZ_H, ORR_ZPmZ_S)>;
+
+// [334] "orr $Zdn, $_Zdn, $imms13";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs ORR_ZI)>;
+
+// [335] "orrs $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs ORRS_PPzPP)>;
+
+// [336] "orv $Vd, $Pg, $Zn";
+def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs ORV_VPZ_B, ORV_VPZ_D, ORV_VPZ_H, ORV_VPZ_S)>;
+
+// [337] "pfalse $Pd";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PFALSE)>;
+
+// [338] "pnext $Pdn, $Pg, $_Pdn";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PNEXT_B, PNEXT_D, PNEXT_H, PNEXT_S)>;
+
+// [339] "prfb $prfop, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFB_PRR)>;
+
+// [340] "prfb $prfop, $Pg, [$Rn, $Zm]";
+def : InstRW<[A64FXWrite_14Cyc_GI0256], (instrs PRFB_D_SCALED, PRFB_D_SXTW_SCALED, PRFB_D_UXTW_SCALED, PRFB_S_SXTW_SCALED, PRFB_S_UXTW_SCALED)>;
+
+// [341] "prfb $prfop, $Pg, [$Rn, $imm6, mul vl]";
+def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFB_PRI)>;
+
+// [342] "prfb $prfop, $Pg, [$Zn, $imm5]";
+def : InstRW<[A64FXWrite_10Cyc_GI056], (instrs PRFB_D_PZI, PRFB_S_PZI)>;
+
+// [343] "prfd $prfop, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFD_PRR)>;
+
+// [344] "prfd $prfop, $Pg, [$Rn, $Zm]";
+def : InstRW<[A64FXWrite_14Cyc_GI0256], (instrs PRFD_D_SCALED, PRFD_D_SXTW_SCALED, PRFD_D_UXTW_SCALED, PRFD_S_SXTW_SCALED, PRFD_S_UXTW_SCALED)>;
+
+// [345] "prfd $prfop, $Pg, [$Rn, $imm6, mul vl]";
+def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFD_PRI)>;
+
+// [346] "prfd $prfop, $Pg, [$Zn, $imm5]";
+def : InstRW<[A64FXWrite_10Cyc_GI056], (instrs PRFD_D_PZI, PRFD_S_PZI)>;
+
+// [347] "prfh $prfop, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFH_PRR)>;
+
+// [348] "prfh $prfop, $Pg, [$Rn, $Zm]";
+def : InstRW<[A64FXWrite_14Cyc_GI0256], (instrs PRFH_D_SCALED, PRFH_D_SXTW_SCALED, PRFH_D_UXTW_SCALED, PRFH_S_SXTW_SCALED, PRFH_S_UXTW_SCALED)>;
+
+// [349] "prfh $prfop, $Pg, [$Rn, $imm6, mul vl]";
+def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFH_PRI)>;
+
+// [350] "prfh $prfop, $Pg, [$Zn, $imm5]";
+def : InstRW<[A64FXWrite_10Cyc_GI056], (instrs PRFH_D_PZI, PRFH_S_PZI)>;
+
+// [351] "prfw $prfop, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFS_PRR)>;
+
+// [352] "prfw $prfop, $Pg, [$Rn, $Zm]";
+def : InstRW<[A64FXWrite_14Cyc_GI0256], (instrs PRFW_D_SCALED, PRFW_D_SXTW_SCALED, PRFW_D_UXTW_SCALED, PRFW_S_SXTW_SCALED, PRFW_S_UXTW_SCALED)>;
+
+// [353] "prfw $prfop, $Pg, [$Rn, $imm6, mul vl]";
+def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFW_PRI)>;
+
+// [354] "prfw $prfop, $Pg, [$Zn, $imm5]";
+def : InstRW<[A64FXWrite_10Cyc_GI056], (instrs PRFW_D_PZI, PRFW_S_PZI)>;
+
+// [355] "ptest $Pg, $Pn";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PTEST_PP)>;
+
+// [356] "ptrue $Pd, $pattern";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PTRUE_B, PTRUE_D, PTRUE_H, PTRUE_S)>;
+
+// [357] "ptrues $Pd, $pattern";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PTRUES_B, PTRUES_D, PTRUES_H, PTRUES_S)>;
+
+// [358] "punpkhi $Pd, $Pn";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PUNPKHI_PP)>;
+
+// [359] "punpklo $Pd, $Pn";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PUNPKLO_PP)>;
+
+// [360] "rbit $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs RBIT_ZPmZ_B, RBIT_ZPmZ_D, RBIT_ZPmZ_H, RBIT_ZPmZ_S)>;
+
+// [361] "rdffr $Pd";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs RDFFR_P)>;
+
+// [362] "rdffr $Pd, $Pg/z";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs RDFFR_PPz)>;
+
+// [363] "rdffrs $Pd, $Pg/z";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs RDFFRS_PPz)>;
+
+// [364] "rdvl $Rd, $imm6";
+def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs RDVLI_XI)>;
+
+// [365] "rev $Pd, $Pn";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs REV_PP_B, REV_PP_D, REV_PP_H, REV_PP_S)>;
+
+// [366] "rev $Zd, $Zn";
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs REV_ZZ_B, REV_ZZ_D, REV_ZZ_H, REV_ZZ_S)>;
+
+// [367] "revb $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs REVB_ZPmZ_D, REVB_ZPmZ_H, REVB_ZPmZ_S)>;
+
+// [368] "revh $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs REVH_ZPmZ_D, REVH_ZPmZ_S)>;
+
+// [369] "revw $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs REVW_ZPmZ_D)>;
+
+// [370] "sabd $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SABD_ZPmZ_B, SABD_ZPmZ_D, SABD_ZPmZ_H, SABD_ZPmZ_S)>;
+
+// [371] "saddv $Vd, $Pg, $Zn";
+def : InstRW<[A64FXWrite_12Cyc_GI03], (instrs SADDV_VPZ_B, SADDV_VPZ_H, SADDV_VPZ_S)>;
+
+// [372] "scvtf $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs SCVTF_ZPmZ_DtoD, SCVTF_ZPmZ_DtoH, SCVTF_ZPmZ_DtoS, SCVTF_ZPmZ_HtoH, SCVTF_ZPmZ_StoD, SCVTF_ZPmZ_StoH, SCVTF_ZPmZ_StoS)>;
+
+// [373] "sdiv $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_37Cyc_GI0], (instrs SDIV_ZPmZ_D, SDIV_ZPmZ_S)>;
+
+// [374] "sdivr $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_37Cyc_GI0], (instrs SDIVR_ZPmZ_D, SDIVR_ZPmZ_S)>;
+
+// [375] "sdot $Zda, $Zn, $Zm";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs SDOT_ZZZ_D, SDOT_ZZZ_S)>;
+
+// [376] "sdot $Zda, $Zn, $Zm$iop";
+def : InstRW<[A64FXWrite_15Cyc_NGI03], (instrs SDOT_ZZZI_D, SDOT_ZZZI_S)>;
+
+// [377] "sel $Pd, $Pg, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs SEL_PPPP)>;
+
+// [378] "sel $Zd, $Pg, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SEL_ZPZZ_B, SEL_ZPZZ_D, SEL_ZPZZ_H, SEL_ZPZZ_S)>;
+
+// [379] "setffr";
+def : InstRW<[A64FXWrite_6Cyc], (instrs SETFFR)>;
+
+// [380] "smax $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SMAX_ZPmZ_B, SMAX_ZPmZ_D, SMAX_ZPmZ_H, SMAX_ZPmZ_S)>;
+
+// [381] "smax $Zdn, $_Zdn, $imm";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs SMAX_ZI_B, SMAX_ZI_D, SMAX_ZI_H, SMAX_ZI_S)>;
+
+// [382] "smaxv $Vd, $Pg, $Zn";
+def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs SMAXV_VPZ_B, SMAXV_VPZ_D, SMAXV_VPZ_H, SMAXV_VPZ_S)>;
+
+// [383] "smin $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SMIN_ZPmZ_B, SMIN_ZPmZ_D, SMIN_ZPmZ_H, SMIN_ZPmZ_S)>;
+
+// [384] "smin $Zdn, $_Zdn, $imm";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs SMIN_ZI_B, SMIN_ZI_D, SMIN_ZI_H, SMIN_ZI_S)>;
+
+// [385] "sminv $Vd, $Pg, $Zn";
+def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs SMINV_VPZ_B, SMINV_VPZ_D, SMINV_VPZ_H, SMINV_VPZ_S)>;
+
+// [386] "smulh $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs SMULH_ZPmZ_B, SMULH_ZPmZ_D, SMULH_ZPmZ_H, SMULH_ZPmZ_S)>;
+
+// [387] "splice $Zdn, $Pg, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs SPLICE_ZPZ_B, SPLICE_ZPZ_D, SPLICE_ZPZ_H, SPLICE_ZPZ_S)>;
+
+// [388] "sqadd $Zd, $Zn, $Zm";
+
+// [389] "sqadd $Zdn, $_Zdn, $imm";
+
+// [390] "sqdecb $Rdn, $_Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECB_XPiWdI)>;
+
+// [391] "sqdecb $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECB_XPiI)>;
+
+// [392] "sqdecd $Rdn, $_Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECD_XPiWdI)>;
+
+// [393] "sqdecd $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECD_XPiI)>;
+
+// [394] "sqdecd $Zdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQDECD_ZPiI)>;
+
+// [395] "sqdech $Rdn, $_Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECH_XPiWdI)>;
+
+// [396] "sqdech $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECH_XPiI)>;
+
+// [397] "sqdech $Zdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQDECH_ZPiI)>;
+
+// [398] "sqdecp $Rdn, $Pg";
+def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs SQDECP_XP_B, SQDECP_XP_D, SQDECP_XP_H, SQDECP_XP_S)>;
+
+// [399] "sqdecp $Rdn, $Pg, $_Rdn";
+def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs SQDECP_XPWd_B, SQDECP_XPWd_D, SQDECP_XPWd_H, SQDECP_XPWd_S)>;
+
+// [400] "sqdecp $Zdn, $Pg";
+def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs SQDECP_ZP_D, SQDECP_ZP_H, SQDECP_ZP_S)>;
+
+// [401] "sqdecw $Rdn, $_Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECW_XPiWdI)>;
+
+// [402] "sqdecw $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECW_XPiI)>;
+
+// [403] "sqdecw $Zdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQDECW_ZPiI)>;
+
+// [404] "sqincb $Rdn, $_Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCB_XPiWdI)>;
+
+// [405] "sqincb $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCB_XPiI)>;
+
+// [406] "sqincd $Rdn, $_Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCD_XPiWdI)>;
+
+// [407] "sqincd $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCD_XPiI)>;
+
+// [408] "sqincd $Zdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQINCD_ZPiI)>;
+
+// [409] "sqinch $Rdn, $_Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCH_XPiWdI)>;
+
+// [410] "sqinch $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCH_XPiI)>;
+
+// [411] "sqinch $Zdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQINCH_ZPiI)>;
+
+// [412] "sqincp $Rdn, $Pg";
+def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs SQINCP_XP_B, SQINCP_XP_D, SQINCP_XP_H, SQINCP_XP_S)>;
+
+// [413] "sqincp $Rdn, $Pg, $_Rdn";
+def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs SQINCP_XPWd_B, SQINCP_XPWd_D, SQINCP_XPWd_H, SQINCP_XPWd_S)>;
+
+// [414] "sqincp $Zdn, $Pg";
+def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs SQINCP_ZP_D, SQINCP_ZP_H, SQINCP_ZP_S)>;
+
+// [415] "sqincw $Rdn, $_Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCW_XPiWdI)>;
+
+// [416] "sqincw $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCW_XPiI)>;
+
+// [417] "sqincw $Zdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQINCW_ZPiI)>;
+
+// [418] "sqsub $Zd, $Zn, $Zm";
+
+// [419] "sqsub $Zdn, $_Zdn, $imm";
+
+// [420] "st1b $Zt, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1B, ST1B_D, ST1B_H, ST1B_S)>;
+
+// [421] "st1b $Zt, $Pg, [$Rn, $Zm]";
+def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1B_D_REAL, SST1B_D_SXTW, SST1B_D_UXTW, SST1B_S_SXTW, SST1B_S_UXTW)>;
+
+// [422] "st1b $Zt, $Pg, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1B_D_IMM, ST1B_H_IMM, ST1B_IMM, ST1B_S_IMM)>;
+
+// [423] "st1b $Zt, $Pg, [$Zn, $imm5]";
+def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1B_D_IMM, SST1B_S_IMM)>;
+
+// [424] "st1d $Zt, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1D)>;
+
+// [425] "st1d $Zt, $Pg, [$Rn, $Zm]";
+def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1D_REAL, SST1D_SCALED_SCALED_REAL, SST1D_SXTW, SST1D_SXTW_SCALED, SST1D_UXTW, SST1D_UXTW_SCALED)>;
+
+// [426] "st1d $Zt, $Pg, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1D_IMM)>;
+
+// [427] "st1d $Zt, $Pg, [$Zn, $imm5]";
+def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1D_IMM)>;
+
+// [428] "st1h $Zt, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1H, ST1H_D, ST1H_S)>;
+
+// [429] "st1h $Zt, $Pg, [$Rn, $Zm]";
+def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1H_D_REAL, SST1H_D_SCALED_SCALED_REAL, SST1H_D_SXTW, SST1H_D_SXTW_SCALED, SST1H_D_UXTW, SST1H_D_UXTW_SCALED, SST1H_S_SXTW, SST1H_S_SXTW_SCALED, SST1H_S_UXTW, SST1H_S_UXTW_SCALED)>;
+
+// [430] "st1h $Zt, $Pg, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1H_D_IMM, ST1H_IMM, ST1H_S_IMM)>;
+
+// [431] "st1h $Zt, $Pg, [$Zn, $imm5]";
+def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1H_D_IMM, SST1H_S_IMM)>;
+
+// [432] "st1w $Zt, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1W, ST1W_D)>;
+
+// [433] "st1w $Zt, $Pg, [$Rn, $Zm]";
+def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1W_D_REAL, SST1W_D_SCALED_SCALED_REAL, SST1W_D_SXTW, SST1W_D_SXTW_SCALED, SST1W_D_UXTW, SST1W_D_UXTW_SCALED, SST1W_SXTW, SST1W_SXTW_SCALED, SST1W_UXTW, SST1W_UXTW_SCALED)>;
+
+// [434] "st1w $Zt, $Pg, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1W_D_IMM, ST1W_IMM)>;
+
+// [435] "st1w $Zt, $Pg, [$Zn, $imm5]";
+def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1W_D_IMM, SST1W_IMM)>;
+
+// [436] "st2b $Zt, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2B)>;
+
+// [437] "st2b $Zt, $Pg, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2B_IMM)>;
+
+// [438] "st2d $Zt, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2D)>;
+
+// [439] "st2d $Zt, $Pg, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2D_IMM)>;
+
+// [440] "st2h $Zt, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2H)>;
+
+// [441] "st2h $Zt, $Pg, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2H_IMM)>;
+
+// [442] "st2w $Zt, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2W)>;
+
+// [443] "st2w $Zt, $Pg, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2W_IMM)>;
+
+// [444] "st3b $Zt, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3B)>;
+
+// [445] "st3b $Zt, $Pg, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3B_IMM)>;
+
+// [446] "st3d $Zt, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3D)>;
+
+// [447] "st3d $Zt, $Pg, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3D_IMM)>;
+
+// [448] "st3h $Zt, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3H)>;
+
+// [449] "st3h $Zt, $Pg, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3H_IMM)>;
+
+// [450] "st3w $Zt, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3W)>;
+
+// [451] "st3w $Zt, $Pg, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3W_IMM)>;
+
+// [452] "st4b $Zt, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4B)>;
+
+// [453] "st4b $Zt, $Pg, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4B_IMM)>;
+
+// [454] "st4d $Zt, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4D)>;
+
+// [455] "st4d $Zt, $Pg, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4D_IMM)>;
+
+// [456] "st4h $Zt, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4H)>;
+
+// [457] "st4h $Zt, $Pg, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4H_IMM)>;
+
+// [458] "st4w $Zt, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4W)>;
+
+// [459] "st4w $Zt, $Pg, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4W_IMM)>;
+
+// [460] "stnt1b $Zt, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1B_ZRR)>;
+
+// [461] "stnt1b $Zt, $Pg, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1B_ZRI)>;
+
+// [462] "stnt1d $Zt, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1D_ZRR)>;
+
+// [463] "stnt1d $Zt, $Pg, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1D_ZRI)>;
+
+// [464] "stnt1h $Zt, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1H_ZRR)>;
+
+// [465] "stnt1h $Zt, $Pg, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1H_ZRI)>;
+
+// [466] "stnt1w $Zt, $Pg, [$Rn, $Rm]";
+def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1W_ZRR)>;
+
+// [467] "stnt1w $Zt, $Pg, [$Rn, $imm4, mul vl]";
+def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1W_ZRI)>;
+
+// [468] "str $Pt, [$Rn, $imm9, mul vl]";
+def : InstRW<[A64FXWrite_6Cyc_GI15], (instrs STR_PXI)>;
+
+// [469] "str $Zt, [$Rn, $imm9, mul vl]";
+def : InstRW<[A64FXWrite_6Cyc_GI05], (instrs STR_ZXI)>;
+
+// [470] "sub $Zd, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SUB_ZZZ_B, SUB_ZZZ_D, SUB_ZZZ_H, SUB_ZZZ_S)>;
+
+// [471] "sub $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SUB_ZPmZ_B, SUB_ZPmZ_D, SUB_ZPmZ_H, SUB_ZPmZ_S)>;
+
+// [472] "sub $Zdn, $_Zdn, $imm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SUB_ZI_B, SUB_ZI_D, SUB_ZI_H, SUB_ZI_S)>;
+
+// [473] "subr $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SUBR_ZPmZ_B, SUBR_ZPmZ_D, SUBR_ZPmZ_H, SUBR_ZPmZ_S)>;
+
+// [474] "subr $Zdn, $_Zdn, $imm";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs SUBR_ZI_B, SUBR_ZI_D, SUBR_ZI_H, SUBR_ZI_S)>;
+
+// [475] "sunpkhi $Zd, $Zn";
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs SUNPKHI_ZZ_D, SUNPKHI_ZZ_H, SUNPKHI_ZZ_S)>;
+
+// [476] "sunpklo $Zd, $Zn";
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs SUNPKLO_ZZ_D, SUNPKLO_ZZ_H, SUNPKLO_ZZ_S)>;
+
+// [477] "sxtb $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SXTB_ZPmZ_D, SXTB_ZPmZ_H, SXTB_ZPmZ_S)>;
+
+// [478] "sxth $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SXTH_ZPmZ_D, SXTH_ZPmZ_S)>;
+
+// [479] "sxtw $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SXTW_ZPmZ_D)>;
+
+// [480] "tbl $Zd, $Zn, $Zm";
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs TBL_ZZZ_B, TBL_ZZZ_D, TBL_ZZZ_H, TBL_ZZZ_S)>;
+
+// [481] "trn1 $Pd, $Pn, $Pm";
+
+// [482] "trn1 $Zd, $Zn, $Zm";
+
+// [483] "trn2 $Pd, $Pn, $Pm";
+
+// [484] "trn2 $Zd, $Zn, $Zm";
+
+// [486] "uabd $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UABD_ZPmZ_B, UABD_ZPmZ_D, UABD_ZPmZ_H, UABD_ZPmZ_S)>;
+
+// [487] "uaddv $Vd, $Pg, $Zn";
+def : InstRW<[A64FXWrite_12Cyc_GI03], (instrs UADDV_VPZ_B, UADDV_VPZ_D, UADDV_VPZ_H, UADDV_VPZ_S)>;
+
+// [488] "ucvtf $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs UCVTF_ZPmZ_DtoD, UCVTF_ZPmZ_DtoH, UCVTF_ZPmZ_DtoS, UCVTF_ZPmZ_HtoH, UCVTF_ZPmZ_StoD, UCVTF_ZPmZ_StoH, UCVTF_ZPmZ_StoS)>;
+
+// [489] "udiv $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_37Cyc_GI0], (instrs UDIV_ZPmZ_D, UDIV_ZPmZ_S)>;
+
+// [490] "udivr $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_37Cyc_GI0], (instrs UDIVR_ZPmZ_D, UDIVR_ZPmZ_S)>;
+
+// [491] "udot $Zda, $Zn, $Zm";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs UDOT_ZZZ_D, UDOT_ZZZ_S)>;
+
+// [492] "udot $Zda, $Zn, $Zm$iop";
+def : InstRW<[A64FXWrite_15Cyc_NGI03], (instrs UDOT_ZZZI_D, UDOT_ZZZI_S)>;
+
+// [493] "umax $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UMAX_ZPmZ_B, UMAX_ZPmZ_D, UMAX_ZPmZ_H, UMAX_ZPmZ_S)>;
+
+// [494] "umax $Zdn, $_Zdn, $imm";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs UMAX_ZI_B, UMAX_ZI_D, UMAX_ZI_H, UMAX_ZI_S)>;
+
+// [495] "umaxv $Vd, $Pg, $Zn";
+def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs UMAXV_VPZ_B, UMAXV_VPZ_D, UMAXV_VPZ_H, UMAXV_VPZ_S)>;
+
+// [496] "umin $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UMIN_ZPmZ_B, UMIN_ZPmZ_D, UMIN_ZPmZ_H, UMIN_ZPmZ_S)>;
+
+// [497] "umin $Zdn, $_Zdn, $imm";
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs UMIN_ZI_B, UMIN_ZI_D, UMIN_ZI_H, UMIN_ZI_S)>;
+
+// [498] "uminv $Vd, $Pg, $Zn";
+def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs UMINV_VPZ_B, UMINV_VPZ_D, UMINV_VPZ_H, UMINV_VPZ_S)>;
+
+// [499] "umulh $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs UMULH_ZPmZ_B, UMULH_ZPmZ_D, UMULH_ZPmZ_H, UMULH_ZPmZ_S)>;
+
+// [500] "uqadd $Zd, $Zn, $Zm";
+
+// [501] "uqadd $Zdn, $_Zdn, $imm";
+
+// [502] "uqdecb $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQDECB_WPiI, UQDECB_XPiI)>;
+
+// [503] "uqdecd $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQDECD_WPiI, UQDECD_XPiI)>;
+
+// [504] "uqdecd $Zdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQDECD_ZPiI)>;
+
+// [505] "uqdech $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQDECH_WPiI, UQDECH_XPiI)>;
+
+// [506] "uqdech $Zdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQDECH_ZPiI)>;
+
+// [507] "uqdecp $Rdn, $Pg";
+def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs UQDECP_WP_B, UQDECP_WP_D, UQDECP_WP_H, UQDECP_WP_S, UQDECP_XP_B, UQDECP_XP_D, UQDECP_XP_H, UQDECP_XP_S)>;
+
+// [508] "uqdecp $Zdn, $Pg";
+def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs UQDECP_ZP_D, UQDECP_ZP_H, UQDECP_ZP_S)>;
+
+// [509] "uqdecw $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQDECW_WPiI, UQDECW_XPiI)>;
+
+// [510] "uqdecw $Zdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQDECW_ZPiI)>;
+
+// [511] "uqincb $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQINCB_WPiI, UQINCB_XPiI)>;
+
+// [512] "uqincd $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQINCD_WPiI, UQINCD_XPiI)>;
+
+// [513] "uqincd $Zdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQINCD_ZPiI)>;
+
+// [514] "uqinch $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQINCH_WPiI, UQINCH_XPiI)>;
+
+// [515] "uqinch $Zdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQINCH_ZPiI)>;
+
+// [516] "uqincp $Rdn, $Pg";
+def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs UQINCP_WP_B, UQINCP_WP_D, UQINCP_WP_H, UQINCP_WP_S, UQINCP_XP_B, UQINCP_XP_D, UQINCP_XP_H, UQINCP_XP_S)>;
+
+// [517] "uqincp $Zdn, $Pg";
+def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs UQINCP_ZP_D, UQINCP_ZP_H, UQINCP_ZP_S)>;
+
+// [518] "uqincw $Rdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQINCW_WPiI, UQINCW_XPiI)>;
+
+// [519] "uqincw $Zdn, $pattern, mul $imm4";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQINCW_ZPiI)>;
+
+// [520] "uqsub $Zd, $Zn, $Zm";
+//@@@ def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQSUB_ZZZ_B, UQSUB_ZZZ_D, UQSUB_ZZZ_H, UQSUB_ZZZ_S)>;
+
+// [521] "uqsub $Zdn, $_Zdn, $imm";
+//@@@ def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQSUB_ZI_B, UQSUB_ZI_D, UQSUB_ZI_H, UQSUB_ZI_S)>;
+
+// [522] "uunpkhi $Zd, $Zn";
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs UUNPKHI_ZZ_D, UUNPKHI_ZZ_H, UUNPKHI_ZZ_S)>;
+
+// [523] "uunpklo $Zd, $Zn";
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs UUNPKLO_ZZ_D, UUNPKLO_ZZ_H, UUNPKLO_ZZ_S)>;
+
+// [524] "uxtb $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UXTB_ZPmZ_D, UXTB_ZPmZ_H, UXTB_ZPmZ_S)>;
+
+// [525] "uxth $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UXTH_ZPmZ_D, UXTH_ZPmZ_S)>;
+
+// [526] "uxtw $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UXTW_ZPmZ_D)>;
+
+// [527] "uzp1 $Pd, $Pn, $Pm";
+
+// [528] "uzp1 $Zd, $Zn, $Zm";
+
+// [529] "uzp2 $Pd, $Pn, $Pm";
+
+// [530] "uzp2 $Zd, $Zn, $Zm";
+
+// [531] "whilele $Pd, $Rn, $Rm";
+def : InstRW<[A64FXWrite_4Cyc_GI12], (instrs WHILELE_PWW_B, WHILELE_PWW_D, WHILELE_PWW_H, WHILELE_PWW_S, WHILELE_PXX_B, WHILELE_PXX_D, WHILELE_PXX_H, WHILELE_PXX_S)>;
+
+// [532] "whilelo $Pd, $Rn, $Rm";
+def : InstRW<[A64FXWrite_4Cyc_GI12], (instrs WHILELO_PWW_B, WHILELO_PWW_D, WHILELO_PWW_H, WHILELO_PWW_S, WHILELO_PXX_B, WHILELO_PXX_D, WHILELO_PXX_H, WHILELO_PXX_S)>;
+
+// [533] "whilels $Pd, $Rn, $Rm";
+def : InstRW<[A64FXWrite_4Cyc_GI12], (instrs WHILELS_PWW_B, WHILELS_PWW_D, WHILELS_PWW_H, WHILELS_PWW_S, WHILELS_PXX_B, WHILELS_PXX_D, WHILELS_PXX_H, WHILELS_PXX_S)>;
+
+// [534] "whilelt $Pd, $Rn, $Rm";
+def : InstRW<[A64FXWrite_4Cyc_GI12], (instrs WHILELT_PWW_B, WHILELT_PWW_D, WHILELT_PWW_H, WHILELT_PWW_S, WHILELT_PXX_B, WHILELT_PXX_D, WHILELT_PXX_H, WHILELT_PXX_S)>;
+
+// [535] "wrffr $Pn";
+def : InstRW<[A64FXWrite_6Cyc_NGI1], (instrs WRFFR)>;
+
+// [536] "zip1 $Pd, $Pn, $Pm";
+
+// [537] "zip1 $Zd, $Zn, $Zm";
+
+// [538] "zip2 $Pd, $Pn, $Pm";
+
+// [539] "zip2 $Zd, $Zn, $Zm";
+
+} // SchedModel = A64FXModel
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedTSV110.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
new file mode 100644
index 000000000000..438371c1b6a8
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
@@ -0,0 +1,745 @@
+//==- AArch64SchedTSV110.td - Huawei TSV110 Scheduling Definitions -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Huawei TSV110 to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler. See llvm/MC/MCSchedule.h for details.
+
+// Huawei TSV110 scheduling machine model.
+def TSV110Model : SchedMachineModel {
+ let IssueWidth = 4; // 4 micro-ops dispatched per cycle.
+ let MicroOpBufferSize = 128; // 128 micro-op re-order buffer
+ let LoopMicroOpBufferSize = 16;
+ let LoadLatency = 4; // Optimistic load latency.
+ let MispredictPenalty = 14; // Fetch + Decode/Rename/Dispatch + Branch
+ let CompleteModel = 1;
+
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
+}
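+// With CompleteModel set, TableGen checks that every instruction not covered
+// by the UnsupportedFeatures list has scheduling information in this model.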
+
+// Define each kind of processor resource and the number available on the
+// TSV110, which has eight execution pipelines, each with its own queue where
+// micro-ops wait for their operands before issuing out of order.
+let SchedModel = TSV110Model in {
+ def TSV110UnitALU : ProcResource<1>; // Int ALU
+ def TSV110UnitAB : ProcResource<2>; // Int ALU/BRU
+ def TSV110UnitMDU : ProcResource<1>; // Multi-Cycle
+ def TSV110UnitFSU1 : ProcResource<1>; // FP/ASIMD
+ def TSV110UnitFSU2 : ProcResource<1>; // FP/ASIMD
+ def TSV110UnitLdSt : ProcResource<2>; // Load/Store
+
+ def TSV110UnitF : ProcResGroup<[TSV110UnitFSU1, TSV110UnitFSU2]>;
+ def TSV110UnitALUAB : ProcResGroup<[TSV110UnitALU, TSV110UnitAB]>;
+ def TSV110UnitFLdSt : ProcResGroup<[TSV110UnitFSU1, TSV110UnitFSU2, TSV110UnitLdSt]>;
+}
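+// A write that names one of the groups above may be issued to any member
+// unit: a micro-op scheduled on TSV110UnitF can use either FP/ASIMD pipeline,
+// and one scheduled on TSV110UnitALUAB can use the ALU or either ALU/BRU pipe.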
+
+let SchedModel = TSV110Model in {
+
+//===----------------------------------------------------------------------===//
+// Map the target-defined scheduler read/write resources and latency for
+// TSV110
+
+// Integer ALU
+def : WriteRes<WriteImm, [TSV110UnitALUAB]> { let Latency = 1; }
+def : WriteRes<WriteI, [TSV110UnitALUAB]> { let Latency = 1; }
+def : WriteRes<WriteISReg, [TSV110UnitMDU]> { let Latency = 2; }
+def : WriteRes<WriteIEReg, [TSV110UnitMDU]> { let Latency = 2; }
+def : WriteRes<WriteExtr, [TSV110UnitALUAB]> { let Latency = 1; }
+def : WriteRes<WriteIS, [TSV110UnitALUAB]> { let Latency = 1; }
+
+// Integer Mul/MAC/Div
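+// The ResourceCycles on the divide writes below keep the MDU pipe busy for
+// the whole operation, i.e. integer division is modelled as unpipelined.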
+def : WriteRes<WriteID32, [TSV110UnitMDU]> { let Latency = 12;
+ let ResourceCycles = [12]; }
+def : WriteRes<WriteID64, [TSV110UnitMDU]> { let Latency = 20;
+ let ResourceCycles = [20]; }
+def : WriteRes<WriteIM32, [TSV110UnitMDU]> { let Latency = 3; }
+def : WriteRes<WriteIM64, [TSV110UnitMDU]> { let Latency = 4; }
+
+// Load
+def : WriteRes<WriteLD, [TSV110UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteLDIdx, [TSV110UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteLDHi, []> { let Latency = 4; }
+
+// Pre/Post Indexing
+def : WriteRes<WriteAdr, [TSV110UnitALUAB]> { let Latency = 1; }
+
+// Store
+def : WriteRes<WriteST, [TSV110UnitLdSt]> { let Latency = 1; }
+def : WriteRes<WriteSTP, [TSV110UnitLdSt]> { let Latency = 1; }
+def : WriteRes<WriteSTIdx, [TSV110UnitLdSt]> { let Latency = 1; }
+
+// FP
+def : WriteRes<WriteF, [TSV110UnitF]> { let Latency = 2; }
+def : WriteRes<WriteFCmp, [TSV110UnitF]> { let Latency = 3; }
+def : WriteRes<WriteFCvt, [TSV110UnitF]> { let Latency = 3; }
+def : WriteRes<WriteFCopy, [TSV110UnitF]> { let Latency = 2; }
+def : WriteRes<WriteFImm, [TSV110UnitF]> { let Latency = 2; }
+def : WriteRes<WriteFMul, [TSV110UnitF]> { let Latency = 5; }
+
+// FP Div, Sqrt
+def : WriteRes<WriteFDiv, [TSV110UnitFSU1]> { let Latency = 18; }
+
+def : WriteRes<WriteV, [TSV110UnitF]> { let Latency = 4; }
+def : WriteRes<WriteVLD, [TSV110UnitFLdSt]> { let Latency = 5; }
+def : WriteRes<WriteVST, [TSV110UnitF]> { let Latency = 1; }
+
+// Branch
+def : WriteRes<WriteBr, [TSV110UnitAB]> { let Latency = 1; }
+def : WriteRes<WriteBrReg, [TSV110UnitAB]> { let Latency = 1; }
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// Forwarding logic is modeled only for multiply and accumulate.
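+// The ReadAdvance on ReadIMA means the accumulator of a multiply-accumulate
+// is not read until two cycles after issue, which shortens the effective
+// latency of a feeding WriteIM32/WriteIM64 by two cycles and models the MAC
+// forwarding path.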
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 2, [WriteIM32, WriteIM64]>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+// Detailed Refinements
+//===----------------------------------------------------------------------===//
+
+// Contains all of the TSV110 specific SchedWriteRes types. The approach
+// below is to define a generic SchedWriteRes for every combination of
+// latency and microOps. The naming convention is to use a prefix, one field
+// for latency, and one or more microOp count/type designators.
+// Prefix: TSV110Wr
+// Latency: #cyc
+// MicroOp Count/Types: #(ALU|AB|MDU|FSU1|FSU2|LdSt|ALUAB|F|FLdSt)
+//
+// e.g. TSV110Wr_6cyc_1ALU_6MDU_4LdSt means the total latency is 6 and there
+// are 11 micro-ops in total: one issued to the ALU pipe, six to the MDU pipe
+// and four to the LdSt pipes.
+//
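+// As a sketch only (this def is not part of the model), such a name would
+// expand to one ALU, six MDU and four LdSt micro-ops:
+//
+//   def TSV110Wr_6cyc_1ALU_6MDU_4LdSt
+//       : SchedWriteRes<[TSV110UnitALU,
+//                        TSV110UnitMDU, TSV110UnitMDU, TSV110UnitMDU,
+//                        TSV110UnitMDU, TSV110UnitMDU, TSV110UnitMDU,
+//                        TSV110UnitLdSt, TSV110UnitLdSt,
+//                        TSV110UnitLdSt, TSV110UnitLdSt]> {
+//     let Latency = 6;
+//     let NumMicroOps = 11;
+//   }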
+
+//===----------------------------------------------------------------------===//
+// Define Generic 1 micro-op types
+
+def TSV110Wr_1cyc_1AB : SchedWriteRes<[TSV110UnitAB]> { let Latency = 1; }
+def TSV110Wr_1cyc_1ALU : SchedWriteRes<[TSV110UnitALU]> { let Latency = 1; }
+def TSV110Wr_1cyc_1ALUAB : SchedWriteRes<[TSV110UnitALUAB]> { let Latency = 1; }
+def TSV110Wr_1cyc_1LdSt : SchedWriteRes<[TSV110UnitLdSt]> { let Latency = 1; }
+
+def TSV110Wr_2cyc_1AB : SchedWriteRes<[TSV110UnitAB]> { let Latency = 2; }
+def TSV110Wr_2cyc_1ALU : SchedWriteRes<[TSV110UnitALU]> { let Latency = 2; }
+def TSV110Wr_2cyc_1LdSt : SchedWriteRes<[TSV110UnitLdSt]> { let Latency = 2; }
+def TSV110Wr_2cyc_1MDU : SchedWriteRes<[TSV110UnitMDU]> { let Latency = 2; }
+def TSV110Wr_2cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 2; }
+def TSV110Wr_2cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 2; }
+
+def TSV110Wr_3cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 3; }
+def TSV110Wr_3cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 3; }
+def TSV110Wr_3cyc_1MDU : SchedWriteRes<[TSV110UnitMDU]> { let Latency = 3; }
+
+def TSV110Wr_4cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 4; }
+def TSV110Wr_4cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 4; }
+def TSV110Wr_4cyc_1LdSt : SchedWriteRes<[TSV110UnitLdSt]> { let Latency = 4; }
+def TSV110Wr_4cyc_1MDU : SchedWriteRes<[TSV110UnitMDU]> { let Latency = 4; }
+
+def TSV110Wr_5cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 5; }
+def TSV110Wr_5cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 5; }
+def TSV110Wr_5cyc_1FSU2 : SchedWriteRes<[TSV110UnitFSU2]> { let Latency = 5; }
+def TSV110Wr_5cyc_1LdSt : SchedWriteRes<[TSV110UnitLdSt]> { let Latency = 5; }
+
+def TSV110Wr_6cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 6; }
+
+def TSV110Wr_7cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 7; }
+
+def TSV110Wr_8cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 8; }
+
+def TSV110Wr_11cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 11; }
+
+def TSV110Wr_12cyc_1MDU : SchedWriteRes<[TSV110UnitMDU]> { let Latency = 12; }
+
+def TSV110Wr_17cyc_1FSU2 : SchedWriteRes<[TSV110UnitFSU2]> { let Latency = 17; }
+
+def TSV110Wr_18cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 18; }
+
+def TSV110Wr_20cyc_1MDU : SchedWriteRes<[TSV110UnitMDU]> { let Latency = 20; }
+
+def TSV110Wr_24cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 24; }
+
+def TSV110Wr_31cyc_1FSU2 : SchedWriteRes<[TSV110UnitFSU2]> { let Latency = 31; }
+
+def TSV110Wr_36cyc_1FSU2 : SchedWriteRes<[TSV110UnitFSU2]> { let Latency = 36; }
+
+def TSV110Wr_38cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 38; }
+
+def TSV110Wr_64cyc_1FSU2 : SchedWriteRes<[TSV110UnitFSU2]> { let Latency = 64; }
+
+//===----------------------------------------------------------------------===//
+// Define Generic 2 micro-op types
+
+def TSV110Wr_1cyc_1LdSt_1ALUAB : SchedWriteRes<[TSV110UnitLdSt,
+ TSV110UnitALUAB]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def TSV110Wr_2cyc_1LdSt_1ALUAB : SchedWriteRes<[TSV110UnitLdSt,
+ TSV110UnitALUAB]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def TSV110Wr_2cyc_2LdSt : SchedWriteRes<[TSV110UnitLdSt,
+ TSV110UnitLdSt]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def TSV110Wr_2cyc_2F : SchedWriteRes<[TSV110UnitF,
+ TSV110UnitF]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def TSV110Wr_2cyc_1FSU1_1FSU2 : SchedWriteRes<[TSV110UnitFSU1,
+ TSV110UnitFSU2]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def TSV110Wr_4cyc_2F : SchedWriteRes<[TSV110UnitF,
+ TSV110UnitF]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def TSV110Wr_4cyc_1FSU1_1FSU2 : SchedWriteRes<[TSV110UnitFSU1,
+ TSV110UnitFSU2]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def TSV110Wr_4cyc_1LdSt_1ALUAB : SchedWriteRes<[TSV110UnitLdSt,
+ TSV110UnitALUAB]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def TSV110Wr_5cyc_1ALU_1F : SchedWriteRes<[TSV110UnitALU,
+ TSV110UnitF]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def TSV110Wr_6cyc_2LdSt : SchedWriteRes<[TSV110UnitLdSt,
+ TSV110UnitLdSt]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def TSV110Wr_6cyc_1LdSt_1ALUAB : SchedWriteRes<[TSV110UnitLdSt,
+ TSV110UnitALUAB]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def TSV110Wr_7cyc_1F_1LdSt : SchedWriteRes<[TSV110UnitF,
+ TSV110UnitLdSt]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+
+def TSV110Wr_8cyc_2FSU1 : SchedWriteRes<[TSV110UnitFSU1,
+ TSV110UnitFSU1]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+
+def TSV110Wr_8cyc_1FSU1_1FSU2 : SchedWriteRes<[TSV110UnitFSU1,
+ TSV110UnitFSU2]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+//===----------------------------------------------------------------------===//
+// Define Generic 3 micro-op types
+
+def TSV110Wr_6cyc_3F : SchedWriteRes<[TSV110UnitF, TSV110UnitF,
+ TSV110UnitF]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+def TSV110Wr_6cyc_3LdSt : SchedWriteRes<[TSV110UnitLdSt, TSV110UnitLdSt,
+ TSV110UnitLdSt]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+def TSV110Wr_7cyc_2F_1LdSt : SchedWriteRes<[TSV110UnitF, TSV110UnitF,
+ TSV110UnitLdSt]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+
+//===----------------------------------------------------------------------===//
+// Define Generic 4 micro-op types
+
+def TSV110Wr_8cyc_4F : SchedWriteRes<[TSV110UnitF, TSV110UnitF,
+ TSV110UnitF, TSV110UnitF]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+
+def TSV110Wr_8cyc_3F_1LdSt : SchedWriteRes<[TSV110UnitF, TSV110UnitF,
+ TSV110UnitF, TSV110UnitLdSt]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+
+//===----------------------------------------------------------------------===//
+// Define Generic 5 micro-op types
+
+def TSV110Wr_8cyc_3F_2LdSt : SchedWriteRes<[TSV110UnitF, TSV110UnitF, TSV110UnitF,
+ TSV110UnitLdSt, TSV110UnitLdSt]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+}
+
+//===----------------------------------------------------------------------===//
+// Define Generic 8 micro-op types
+
+def TSV110Wr_10cyc_4F_4LdSt : SchedWriteRes<[TSV110UnitF, TSV110UnitF,
+ TSV110UnitF, TSV110UnitF,
+ TSV110UnitLdSt, TSV110UnitLdSt,
+ TSV110UnitLdSt, TSV110UnitLdSt]> {
+ let Latency = 10;
+ let NumMicroOps = 8;
+}
+
+
+// Branch Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[TSV110Wr_1cyc_1AB], (instrs B)>;
+def : InstRW<[TSV110Wr_1cyc_1AB], (instrs BL)>;
+def : InstRW<[TSV110Wr_1cyc_1AB], (instrs BLR)>;
+def : InstRW<[TSV110Wr_1cyc_1AB], (instregex "^(BR|RET|(CBZ|CBNZ|TBZ|TBNZ))$")>;
+
+
+// Cryptography Extensions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[TSV110Wr_3cyc_1FSU1], (instregex "^AES[DE]")>;
+def : InstRW<[TSV110Wr_3cyc_1FSU1], (instregex "^AESI?MC")>;
+def : InstRW<[TSV110Wr_2cyc_1FSU1], (instregex "^SHA1SU1")>;
+def : InstRW<[TSV110Wr_2cyc_2F], (instregex "^SHA1(H|SU0)")>;
+def : InstRW<[TSV110Wr_5cyc_1FSU1], (instregex "^SHA1[CMP]")>;
+def : InstRW<[TSV110Wr_2cyc_1FSU1], (instregex "^SHA256SU0")>;
+def : InstRW<[TSV110Wr_3cyc_1FSU1], (instregex "^SHA256SU1")>;
+def : InstRW<[TSV110Wr_5cyc_1FSU1], (instregex "^SHA256(H|H2)")>;
+def TSV110ReadCRC: SchedReadAdvance<1, [TSV110Wr_2cyc_1MDU]>;
+def : InstRW<[TSV110Wr_2cyc_1MDU, TSV110ReadCRC], (instregex "^CRC32.*$")>;
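+// The read advance on the accumulator operand lets a dependent chain of
+// CRC32 instructions run with an effective 1-cycle latency (the 2-cycle MDU
+// write minus the 1-cycle advance).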
+
+
+// Arithmetic and Logical Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "(BIC|EON|ORN)[WX]rr")>;
+def : InstRW<[TSV110Wr_1cyc_1AB], (instregex "(BIC)S[WX]rr")>;
+
+def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "(ADD|AND|EOR|ORR|SUB)[WX]r(r|i)")>;
+def : InstRW<[TSV110Wr_1cyc_1AB], (instregex "(ADD|AND|EOR|ORR|SUB)S[WX]r(r|i)")>;
+
+def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^(ADC|SBC|BIC)[WX]r$")>;
+def : InstRW<[TSV110Wr_1cyc_1AB], (instregex "^(ADC|SBC)S[WX]r$")>;
+
+def : InstRW<[TSV110Wr_2cyc_1MDU], (instregex "^(AND|BIC|EON|EOR|ORN|ORR)[WX]rs$")>;
+def : InstRW<[TSV110Wr_2cyc_1AB], (instregex "^(AND|BIC|EON|EOR|ORN|ORR)S[WX]rs$")>;
+def : InstRW<[TSV110Wr_2cyc_1MDU], (instregex "^(ADD|SUB)[WX]r(s|x|x64)$")>;
+def : InstRW<[TSV110Wr_2cyc_1AB], (instregex "^(ADD|SUB)S[WX]r(s|x|x64)$")>;
+
+def : InstRW<[TSV110Wr_1cyc_1AB], (instregex "^(CCMN|CCMP)(W|X)(r|i)$")>;
+def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^(CSEL|CSINC|CSINV|CSNEG)(W|X)r$")>;
+
+
+// Move and Shift Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instrs ADR, ADRP)>;
+def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^MOV[NZK][WX]i")>;
+def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "(LSLV|LSRV|ASRV|RORV)(W|X)r")>;
+
+
+// Divide and Multiply Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[TSV110Wr_12cyc_1MDU], (instregex "^(S|U)DIVWr$")>;
+def : InstRW<[TSV110Wr_20cyc_1MDU], (instregex "^(S|U)DIVXr$")>;
+
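+// The SchedReadAdvance defs below model the multiply-accumulate forwarding
+// path: the accumulator input is read two cycles (TSV110ReadMAW) or three
+// cycles (TSV110ReadMAQ) late, so back-to-back multiply-accumulates see an
+// effective latency of 1 cycle on that operand.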
+def TSV110ReadMAW : SchedReadAdvance<2, [TSV110Wr_3cyc_1MDU]>;
+def : InstRW<[TSV110Wr_3cyc_1MDU, TSV110ReadMAW], (instrs MADDWrrr, MSUBWrrr)>;
+def TSV110ReadMAQ : SchedReadAdvance<3, [TSV110Wr_4cyc_1MDU]>;
+def : InstRW<[TSV110Wr_4cyc_1MDU, TSV110ReadMAQ], (instrs MADDXrrr, MSUBXrrr)>;
+def : InstRW<[TSV110Wr_3cyc_1MDU, TSV110ReadMAW], (instregex "(S|U)(MADDL|MSUBL)rrr")>;
+def : InstRW<[TSV110Wr_4cyc_1MDU], (instregex "^(S|U)MULHrr$")>;
+
+
+// Miscellaneous Data-Processing Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^EXTR(W|X)rri$")>;
+def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^(S|U)?BFM(W|X)ri$")>;
+def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^(CLS|CLZ|RBIT|REV(16|32)?)(W|X)r$")>;
+
+
+// Load Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDR(W|X)l$")>;
+def : InstRW<[TSV110Wr_4cyc_1LdSt], (instrs LDRSWl)>;
+
+def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDR(BB|HH|W|X)ui$")>;
+def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDRS(BW|BX|HW|HX|W)ui$")>;
+
+def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteAdr], (instregex "^LDR(BB|HH|W|X)(post|pre)$")>;
+def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteAdr], (instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>;
+
+def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDTR(B|H|W|X)i$")>;
+def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDUR(BB|HH|W|X)i$")>;
+def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDTRS(BW|BX|HW|HX|W)i$")>;
+def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDURS(BW|BX|HW|HX|W)i$")>;
+
+def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteLDHi], (instregex "^LDNP(W|X)i$")>;
+def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteLDHi], (instregex "^LDP(W|X)i$")>;
+def : InstRW<[TSV110Wr_4cyc_1LdSt_1ALUAB, WriteLDHi, WriteAdr],(instregex "^LDP(W|X)(post|pre)$")>;
+
+def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteLDHi], (instrs LDPSWi)>;
+def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteLDHi, WriteAdr], (instrs LDPSWpost)>;
+def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteLDHi, WriteAdr], (instrs LDPSWpre)>;
+
+def : InstRW<[TSV110Wr_4cyc_1LdSt], (instrs PRFMl)>;
+def : InstRW<[TSV110Wr_4cyc_1LdSt], (instrs PRFUMi)>;
+def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^PRFMui$")>;
+def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^PRFMro(W|X)$")>;
+
+
+// Store Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STN?P(W|X)i$")>;
+def : InstRW<[TSV110Wr_1cyc_1LdSt, WriteAdr], (instregex "^STP(W|X)(post|pre)$")>;
+def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STUR(BB|HH|W|X)i$")>;
+def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STTR(B|H|W|X)i$")>;
+def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STR(BB|HH|W|X)ui$")>;
+
+def : InstRW<[TSV110Wr_1cyc_1LdSt, WriteAdr], (instregex "^STR(BB|HH|W|X)(post|pre)$")>;
+def : InstRW<[TSV110Wr_1cyc_1LdSt, WriteAdr], (instregex "^STR(BB|HH|W|X)ro(W|X)$")>;
+
+
+// FP Data Processing Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[TSV110Wr_2cyc_1F], (instregex "F(ABS|NEG)(D|S)r")>;
+def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FCCMP(E)?(S|D)rr$")>;
+def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FCMP(E)?(S|D)r(r|i)$")>;
+def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FCSEL(S|D)rrr$")>;
+
+def : InstRW<[TSV110Wr_11cyc_1FSU1], (instrs FDIVSrr)>;
+def : InstRW<[TSV110Wr_18cyc_1FSU1], (instrs FDIVDrr)>;
+def : InstRW<[TSV110Wr_17cyc_1FSU2], (instrs FSQRTSr)>;
+def : InstRW<[TSV110Wr_31cyc_1FSU2], (instrs FSQRTDr)>;
+
+def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^F(MAX|MIN).+rr")>;
+
+def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^FN?M(ADD|SUB)Hrrr")>;
+def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^FN?M(ADD|SUB)Srrr")>;
+def : InstRW<[TSV110Wr_7cyc_1F], (instregex "^FN?M(ADD|SUB)Drrr")>;
+
+def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^F(ADD|SUB)Hrr")>;
+def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^F(ADD|SUB)Srr")>;
+def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^F(ADD|SUB)Drr")>;
+
+def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^F(N)?MULHrr$")>;
+def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^F(N)?MULSrr$")>;
+def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^F(N)?MULDrr$")>;
+
+def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FRINT.+r")>;
+
+
+// FP Miscellaneous Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[TSV110Wr_5cyc_1ALU_1F], (instregex "^[SU]CVTF[SU][WX][SD]ri")>;
+def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^FCVT(A|M|N|P|Z)(S|U)U(W|X)(S|D)r$")>;
+def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FCVT[HSD][HSD]r")>;
+
+def : InstRW<[TSV110Wr_2cyc_1FSU1], (instregex "^FMOV(DX|WS|XD|SW|DXHigh|XDHigh)r$")>;
+def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^FMOV[SD][ir]$")>;
+
+
+// FP Load Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[TSV110Wr_5cyc_1LdSt], (instregex "^LDR[DSQ]l")>;
+def : InstRW<[TSV110Wr_5cyc_1LdSt], (instregex "^LDUR[BDHSQ]i")>;
+def : InstRW<[TSV110Wr_5cyc_1LdSt, WriteAdr], (instregex "^LDR[BDHSQ](post|pre)")>;
+def : InstRW<[TSV110Wr_5cyc_1LdSt], (instregex "^LDR[BDHSQ]ui")>;
+def : InstRW<[TSV110Wr_6cyc_1LdSt_1ALUAB, ReadAdrBase], (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>;
+def : InstRW<[TSV110Wr_5cyc_1LdSt, WriteLDHi], (instregex "^LDN?P[DQS]i")>;
+def : InstRW<[TSV110Wr_5cyc_1LdSt, WriteLDHi, WriteAdr], (instregex "^LDP[DQS](post|pre)")>;
+
+
+// FP Store Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STUR[BHSDQ]i")>;
+def : InstRW<[TSV110Wr_1cyc_1LdSt_1ALUAB, ReadAdrBase], (instregex "^STR[BHSDQ](post|pre)")>;
+def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STR[BHSDQ]ui")>;
+def : InstRW<[TSV110Wr_2cyc_1LdSt_1ALUAB, ReadAdrBase], (instregex "^STR[BHSDQ]ro[WX]")>;
+def : InstRW<[TSV110Wr_2cyc_2LdSt], (instregex "^STN?P[SDQ]i")>;
+def : InstRW<[TSV110Wr_2cyc_2LdSt, WriteAdr], (instregex "^STP[SDQ](post|pre)")>;
+
+
+// ASIMD Integer Instructions
+// -----------------------------------------------------------------------------
+
+// Reference for forms in this group
+// D form - v8i8, v4i16, v2i32
+// Q form - v16i8, v8i16, v4i32
+// D form - v1i8, v1i16, v1i32, v1i64
+// Q form - v16i8, v8i16, v4i32, v2i64
+// D form - v8i8_v8i16, v4i16_v4i32, v2i32_v2i64
+// Q form - v16i8_v8i16, v8i16_v4i32, v4i32_v2i64
+
+// ASIMD simple arithmetic
+def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^(ABS|ADD(P)?|NEG|SUB)v")>;
+def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^[SU](ADD(L|LP|W)|SUB(L|W))v")>;
+
+// ASIMD complex arithmetic
+def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]H(ADD|SUB)v")>;
+def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^R?(ADD|SUB)HN2?v")>;
+def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]Q(ADD|SUB)v")>;
+def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^(SU|US)QADDv")>;
+def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]RHADDv")>;
+def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]ABAL?v")>;
+def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]ABDL?v")>;
+def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]ADALPv")>;
+def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^((SQ)(ABS|NEG))v")>;
+
+// ASIMD compare
+def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT|TST)v")>;
+
+// ASIMD max/min
+def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^[SU](MIN|MAX)P?v")>;
+
+// ASIMD logical
+def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^(AND|BIC|BIF|BIT|BSL|EOR|MVN|NOT|ORN|ORR)v")>;
+
+// ASIMD multiply accumulate, D-form
+def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^(MUL|ML[AS]|SQR?D(MULH))(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)")>;
+// ASIMD multiply accumulate, Q-form
+def : InstRW<[TSV110Wr_8cyc_2FSU1], (instregex "^(MUL|ML[AS]|SQR?D(MULH))(v16i8|v8i16|v4i32)")>;
+
+// ASIMD multiply accumulate long
+def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>;
+def : InstRW<[TSV110Wr_2cyc_1FSU1], (instregex "^PMULL(v8i8|v16i8)")>;
+def : InstRW<[TSV110Wr_2cyc_1FSU1], (instregex "^PMULL(v1i64|v2i64)")>;
+
+// ASIMD shift
+// ASIMD shift accumulate
+def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^(S|SR|U|UR)SRA")>;
+// ASIMD shift by immed, basic
+def : InstRW<[TSV110Wr_4cyc_1FSU1],
+ (instregex "SHLv","SLIv","SRIv","SHRNv","SQXTNv","SQXTUNv","UQXTNv")>;
+// ASIMD shift by immed, complex
+def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^[SU]?(Q|R){1,2}SHR")>;
+def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^SQSHLU")>;
+// ASIMD shift by register, basic, Q-form
+def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
+// ASIMD shift by register, complex, D-form
+def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^[SU][QR]{1,2}SHL(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>;
+// ASIMD shift by register, complex, Q-form
+def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>;
+
+// ASIMD reduction
+// ASIMD arith, reduce, 4H/4S
+def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>;
+// ASIMD arith, reduce, 8B/8H
+def : InstRW<[TSV110Wr_8cyc_1FSU1_1FSU2], (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>;
+// ASIMD arith, reduce, 16B
+def : InstRW<[TSV110Wr_8cyc_1FSU1_1FSU2], (instregex "^[SU]?ADDL?Vv16i8v$")>;
+
+// ASIMD max/min, reduce, 4H/4S
+def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>;
+// ASIMD max/min, reduce, 8B/8H
+def : InstRW<[TSV110Wr_8cyc_1FSU1_1FSU2], (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>;
+// ASIMD max/min, reduce, 16B
+def : InstRW<[TSV110Wr_8cyc_1FSU1_1FSU2], (instregex "^[SU](MIN|MAX)Vv16i8v$")>;
+
+
+// Vector - Floating Point
+// -----------------------------------------------------------------------------
+
+// Reference for forms in this group
+// D form - v2f32
+// Q form - v4f32, v2f64
+// D form - 32, 64
+// D form - v1i32, v1i64
+// D form - v2i32
+// Q form - v4i32, v2i64
+
+// ASIMD FP sign manipulation
+def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^FABSv")>;
+def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^FNEGv")>;
+
+// ASIMD FP compare
+def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v")>;
+
+// ASIMD FP convert
+def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^FCVT[AMNPZ][SU]v")>;
+def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FCVT(L)v")>;
+def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^FCVT(N|XN)v")>;
+
+// ASIMD FP divide, D-form, F32
+def : InstRW<[TSV110Wr_11cyc_1FSU1], (instregex "FDIVv2f32")>;
+// ASIMD FP divide, Q-form, F32
+def : InstRW<[TSV110Wr_24cyc_1FSU1], (instregex "FDIVv4f32")>;
+// ASIMD FP divide, Q-form, F64
+def : InstRW<[TSV110Wr_38cyc_1FSU1], (instregex "FDIVv2f64")>;
+
+// ASIMD FP SQRT
+def : InstRW<[TSV110Wr_17cyc_1FSU2], (instrs FSQRTv2f32)>;
+def : InstRW<[TSV110Wr_36cyc_1FSU2], (instrs FSQRTv4f32)>;
+def : InstRW<[TSV110Wr_64cyc_1FSU2], (instrs FSQRTv2f64)>;
+
+// ASIMD FP max,min
+def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^F(MAX|MIN)(NM)?v")>;
+def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^F(MAX|MIN)(NM)?Pv")>;
+def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^F(MAX|MIN)(NM)?Vv")>;
+
+// ASIMD FP add
+def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^F(ADD|ADDP|SUB)v")>;
+
+// ASIMD FP multiply
+def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^FMULX?v")>;
+
+
+// ASIMD Miscellaneous Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^(CLS|CLZ|CNT)v")>;
+def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^(DUP|INS)v.+lane")>;
+def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^REV(16|32|64)v")>;
+def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^(UZP|ZIP)[12]v")>;
+
+def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^EXTv")>;
+def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^XTNv")>;
+def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^RBITv")>;
+
+def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^(INS|DUP)v.+gpr")>;
+
+def : InstRW<[TSV110Wr_3cyc_1FSU1], (instregex "^[SU]MOVv")>;
+
+// ASIMD table lookup, D-form
+def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^TB[LX]v8i8One")>;
+def : InstRW<[TSV110Wr_4cyc_2F], (instregex "^TB[LX]v8i8Two")>;
+def : InstRW<[TSV110Wr_6cyc_3F], (instregex "^TB[LX]v8i8Three")>;
+def : InstRW<[TSV110Wr_8cyc_4F], (instregex "^TB[LX]v8i8Four")>;
+// ASIMD table lookup, Q-form
+def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^TB[LX]v16i8One")>;
+def : InstRW<[TSV110Wr_4cyc_2F], (instregex "^TB[LX]v16i8Two")>;
+def : InstRW<[TSV110Wr_6cyc_3F], (instregex "^TB[LX]v16i8Three")>;
+def : InstRW<[TSV110Wr_8cyc_4F], (instregex "^TB[LX]v16i8Four")>;
+
+def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^FMOVv")>;
+
+def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FRINT[AIMNPXZ]v")>;
+def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^[SU]CVTFv")>;
+def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^[FU](RECP|RSQRT)(E|X)v")>;
+
+
+// ASIMD Load Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[TSV110Wr_7cyc_1F_1LdSt], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[TSV110Wr_7cyc_1F_1LdSt, WriteAdr], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[TSV110Wr_7cyc_2F_1LdSt], (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[TSV110Wr_7cyc_2F_1LdSt, WriteAdr], (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[TSV110Wr_8cyc_3F_1LdSt], (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[TSV110Wr_8cyc_3F_1LdSt, WriteAdr], (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[TSV110Wr_8cyc_3F_2LdSt], (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[TSV110Wr_8cyc_3F_2LdSt, WriteAdr], (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[TSV110Wr_7cyc_1F_1LdSt], (instregex "LD1i(8|16|32|64)$")>;
+def : InstRW<[TSV110Wr_7cyc_1F_1LdSt, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
+def : InstRW<[TSV110Wr_7cyc_2F_1LdSt], (instregex "LD2i(8|16|32|64)$")>;
+def : InstRW<[TSV110Wr_7cyc_2F_1LdSt, WriteAdr], (instregex "LD2i(8|16|32|64)_POST$")>;
+def : InstRW<[TSV110Wr_8cyc_3F_1LdSt], (instregex "LD3i(8|16|32|64)$")>;
+def : InstRW<[TSV110Wr_8cyc_3F_1LdSt, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[TSV110Wr_8cyc_3F_2LdSt], (instregex "LD4i(8|16|32|64)$")>;
+def : InstRW<[TSV110Wr_8cyc_3F_2LdSt, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>;
+
+def : InstRW<[TSV110Wr_5cyc_1LdSt], (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[TSV110Wr_5cyc_1LdSt, WriteAdr], (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[TSV110Wr_5cyc_1LdSt], (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[TSV110Wr_5cyc_1LdSt, WriteAdr], (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[TSV110Wr_6cyc_3LdSt], (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[TSV110Wr_6cyc_3LdSt, WriteAdr], (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[TSV110Wr_6cyc_2LdSt], (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[TSV110Wr_6cyc_2LdSt, WriteAdr], (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[TSV110Wr_7cyc_2F_1LdSt], (instregex "^LD2Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[TSV110Wr_7cyc_2F_1LdSt, WriteAdr], (instregex "^LD2Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[TSV110Wr_8cyc_3F_1LdSt], (instregex "^LD3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[TSV110Wr_8cyc_3F_1LdSt, WriteAdr], (instregex "^LD3Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[TSV110Wr_10cyc_4F_4LdSt], (instregex "^LD4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[TSV110Wr_10cyc_4F_4LdSt, WriteAdr], (instregex "^LD4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+
+// ASIMD Store Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[TSV110Wr_3cyc_1F], (instregex "ST1i(8|16|32|64)$")>;
+def : InstRW<[TSV110Wr_3cyc_1F, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[TSV110Wr_4cyc_1F], (instregex "ST2i(8|16|32|64)$")>;
+def : InstRW<[TSV110Wr_4cyc_1F, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[TSV110Wr_5cyc_1F], (instregex "ST3i(8|16|32|64)$")>;
+def : InstRW<[TSV110Wr_5cyc_1F, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[TSV110Wr_6cyc_1F], (instregex "ST4i(8|16|32|64)$")>;
+def : InstRW<[TSV110Wr_6cyc_1F, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>;
+
+def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[TSV110Wr_3cyc_1F, WriteAdr], (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[TSV110Wr_4cyc_1F, WriteAdr], (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[TSV110Wr_5cyc_1F, WriteAdr], (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[TSV110Wr_6cyc_1F], (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[TSV110Wr_6cyc_1F, WriteAdr], (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^ST2Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[TSV110Wr_4cyc_1F, WriteAdr], (instregex "^ST2Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[TSV110Wr_5cyc_1F, WriteAdr], (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[TSV110Wr_8cyc_1F], (instregex "^ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[TSV110Wr_8cyc_1F, WriteAdr], (instregex "^ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+} // SchedModel = TSV110Model
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 8f814d185e85..a5bc3668ed54 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -82,7 +82,8 @@ static SDValue EmitUnrolledSetTag(SelectionDAG &DAG, const SDLoc &dl,
unsigned OffsetScaled = 0;
while (OffsetScaled < ObjSizeScaled) {
if (ObjSizeScaled - OffsetScaled >= 2) {
- SDValue AddrNode = DAG.getMemBasePlusOffset(Ptr, OffsetScaled * 16, dl);
+ SDValue AddrNode =
+ DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(OffsetScaled * 16), dl);
SDValue St = DAG.getMemIntrinsicNode(
OpCode2, dl, DAG.getVTList(MVT::Other),
{Chain, TagSrc, AddrNode},
@@ -94,7 +95,8 @@ static SDValue EmitUnrolledSetTag(SelectionDAG &DAG, const SDLoc &dl,
}
if (ObjSizeScaled - OffsetScaled > 0) {
- SDValue AddrNode = DAG.getMemBasePlusOffset(Ptr, OffsetScaled * 16, dl);
+ SDValue AddrNode =
+ DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(OffsetScaled * 16), dl);
SDValue St = DAG.getMemIntrinsicNode(
OpCode1, dl, DAG.getVTList(MVT::Other),
{Chain, TagSrc, AddrNode},
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64StackOffset.h b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64StackOffset.h
deleted file mode 100644
index 24751a81797d..000000000000
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64StackOffset.h
+++ /dev/null
@@ -1,151 +0,0 @@
-//==--AArch64StackOffset.h ---------------------------------------*- C++ -*-==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the declaration of the StackOffset class, which is used to
-// describe scalable and non-scalable offsets during frame lowering.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64STACKOFFSET_H
-#define LLVM_LIB_TARGET_AARCH64_AARCH64STACKOFFSET_H
-
-#include "llvm/Support/MachineValueType.h"
-#include "llvm/Support/TypeSize.h"
-#include <cassert>
-
-namespace llvm {
-
-/// StackOffset is a wrapper around scalable and non-scalable offsets and is
-/// used in several functions such as 'isAArch64FrameOffsetLegal' and
-/// 'emitFrameOffset()'. StackOffsets are described by MVTs, e.g.
-//
-/// StackOffset(1, MVT::nxv16i8)
-//
-/// would describe an offset as being the size of a single SVE vector.
-///
-/// The class also implements simple arithmetic (addition/subtraction) on these
-/// offsets, e.g.
-//
-/// StackOffset(1, MVT::nxv16i8) + StackOffset(1, MVT::i64)
-//
-/// describes an offset that spans the combined storage required for an SVE
-/// vector and a 64bit GPR.
-class StackOffset {
- int64_t Bytes;
- int64_t ScalableBytes;
-
- explicit operator int() const;
-
-public:
- using Part = std::pair<int64_t, MVT>;
-
- StackOffset() : Bytes(0), ScalableBytes(0) {}
-
- StackOffset(int64_t Offset, MVT::SimpleValueType T) : StackOffset() {
- assert(MVT(T).isByteSized() && "Offset type is not a multiple of bytes");
- *this += Part(Offset, T);
- }
-
- StackOffset(const StackOffset &Other)
- : Bytes(Other.Bytes), ScalableBytes(Other.ScalableBytes) {}
-
- StackOffset &operator=(const StackOffset &) = default;
-
- StackOffset &operator+=(const StackOffset::Part &Other) {
- const TypeSize Size = Other.second.getSizeInBits();
- if (Size.isScalable())
- ScalableBytes += Other.first * ((int64_t)Size.getKnownMinSize() / 8);
- else
- Bytes += Other.first * ((int64_t)Size.getFixedSize() / 8);
- return *this;
- }
-
- StackOffset &operator+=(const StackOffset &Other) {
- Bytes += Other.Bytes;
- ScalableBytes += Other.ScalableBytes;
- return *this;
- }
-
- StackOffset operator+(const StackOffset &Other) const {
- StackOffset Res(*this);
- Res += Other;
- return Res;
- }
-
- StackOffset &operator-=(const StackOffset &Other) {
- Bytes -= Other.Bytes;
- ScalableBytes -= Other.ScalableBytes;
- return *this;
- }
-
- StackOffset operator-(const StackOffset &Other) const {
- StackOffset Res(*this);
- Res -= Other;
- return Res;
- }
-
- StackOffset operator-() const {
- StackOffset Res = {};
- const StackOffset Other(*this);
- Res -= Other;
- return Res;
- }
-
- /// Returns the scalable part of the offset in bytes.
- int64_t getScalableBytes() const { return ScalableBytes; }
-
- /// Returns the non-scalable part of the offset in bytes.
- int64_t getBytes() const { return Bytes; }
-
- /// Returns the offset in parts to which this frame offset can be
- /// decomposed for the purpose of describing a frame offset.
- /// For non-scalable offsets this is simply its byte size.
- void getForFrameOffset(int64_t &NumBytes, int64_t &NumPredicateVectors,
- int64_t &NumDataVectors) const {
- assert(isValid() && "Invalid frame offset");
-
- NumBytes = Bytes;
- NumDataVectors = 0;
- NumPredicateVectors = ScalableBytes / 2;
- // This method is used to get the offsets to adjust the frame offset.
- // If the function requires ADDPL to be used and needs more than two ADDPL
- // instructions, part of the offset is folded into NumDataVectors so that it
- // uses ADDVL for part of it, reducing the number of ADDPL instructions.
- if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
- NumPredicateVectors > 62) {
- NumDataVectors = NumPredicateVectors / 8;
- NumPredicateVectors -= NumDataVectors * 8;
- }
- }
-
- void getForDwarfOffset(int64_t &ByteSized, int64_t &VGSized) const {
- assert(isValid() && "Invalid frame offset");
-
- // VGSized offsets are divided by '2', because the VG register is the
- // the number of 64bit granules as opposed to 128bit vector chunks,
- // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
- // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
- // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
- ByteSized = Bytes;
- VGSized = ScalableBytes / 2;
- }
-
- /// Returns whether the offset is known zero.
- explicit operator bool() const { return Bytes || ScalableBytes; }
-
- bool isValid() const {
- // The smallest scalable element supported by scaled SVE addressing
- // modes are predicates, which are 2 scalable bytes in size. So the scalable
- // byte offset must always be a multiple of 2.
- return ScalableBytes % 2 == 0;
- }
-};
-
-} // end namespace llvm
-
-#endif
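
The deleted class is superseded in this import by the target-independent llvm::StackOffset (llvm/Support/TypeSize.h), which tracks the same fixed and scalable byte components. The sketch below is a standalone reconstruction, not LLVM API, of the ADDVL/ADDPL folding the deleted getForFrameOffset() performed: scalable bytes are first counted in 2-byte predicate granules, and when more than two ADDPL instructions would be needed, whole data vectors (eight granules each) are peeled off so ADDVL can cover them.

#include <cassert>
#include <cstdint>

// Standalone sketch of the deleted StackOffset::getForFrameOffset() logic.
static void decomposeFrameOffset(int64_t Bytes, int64_t ScalableBytes,
                                 int64_t &NumBytes,
                                 int64_t &NumPredicateVectors,
                                 int64_t &NumDataVectors) {
  assert(ScalableBytes % 2 == 0 && "predicates are 2 scalable bytes each");
  NumBytes = Bytes;
  NumDataVectors = 0;
  NumPredicateVectors = ScalableBytes / 2;
  // Fold multiples of eight predicate granules into data vectors so that
  // ADDVL covers them and at most two ADDPL instructions remain.
  if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
      NumPredicateVectors > 62) {
    NumDataVectors = NumPredicateVectors / 8;
    NumPredicateVectors -= NumDataVectors * 8;
  }
}

// Example: 16 fixed bytes plus 160 scalable bytes (ten 16-scalable-byte SVE
// vectors) decomposes into NumBytes=16, NumDataVectors=10 (covered by ADDVL)
// and NumPredicateVectors=0 (no ADDPL needed).
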
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index 61f27cbc3b29..ab49e0c3f937 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -59,7 +59,7 @@
using namespace llvm;
-#define DEBUG_TYPE "stack-tagging"
+#define DEBUG_TYPE "aarch64-stack-tagging"
static cl::opt<bool> ClMergeInit(
"stack-tagging-merge-init", cl::Hidden, cl::init(true), cl::ZeroOrMore,
@@ -73,6 +73,10 @@ static cl::opt<bool>
static cl::opt<unsigned> ClScanLimit("stack-tagging-merge-init-scan-limit",
cl::init(40), cl::Hidden);
+static cl::opt<unsigned>
+ ClMergeInitSizeLimit("stack-tagging-merge-init-size-limit", cl::init(272),
+ cl::Hidden);
+
static const Align kTagGranuleSize = Align(16);
namespace {
@@ -103,9 +107,10 @@ public:
SetTagZeroFn(SetTagZeroFn), StgpFn(StgpFn) {}
bool addRange(uint64_t Start, uint64_t End, Instruction *Inst) {
- auto I = std::lower_bound(
- Ranges.begin(), Ranges.end(), Start,
- [](const Range &LHS, uint64_t RHS) { return LHS.End <= RHS; });
+ auto I =
+ llvm::lower_bound(Ranges, Start, [](const Range &LHS, uint64_t RHS) {
+ return LHS.End <= RHS;
+ });
if (I != Ranges.end() && End > I->Start) {
// Overlap - bail.
return false;
@@ -434,7 +439,8 @@ void AArch64StackTagging::tagAlloca(AllocaInst *AI, Instruction *InsertBefore,
bool LittleEndian =
Triple(AI->getModule()->getTargetTriple()).isLittleEndian();
// Current implementation of initializer merging assumes little endianness.
- if (MergeInit && !F->hasOptNone() && LittleEndian) {
+ if (MergeInit && !F->hasOptNone() && LittleEndian &&
+ Size < ClMergeInitSizeLimit) {
LLVM_DEBUG(dbgs() << "collecting initializers for " << *AI
<< ", size = " << Size << "\n");
InsertBefore = collectInitializers(InsertBefore, Ptr, Size, IB);
@@ -544,7 +550,6 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
MapVector<AllocaInst *, AllocaInfo> Allocas; // need stable iteration order
SmallVector<Instruction *, 8> RetVec;
- DenseMap<Value *, AllocaInst *> AllocaForValue;
SmallVector<Instruction *, 4> UnrecognizedLifetimes;
for (auto &BB : *F) {
@@ -566,8 +571,7 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
auto *II = dyn_cast<IntrinsicInst>(I);
if (II && (II->getIntrinsicID() == Intrinsic::lifetime_start ||
II->getIntrinsicID() == Intrinsic::lifetime_end)) {
- AllocaInst *AI =
- llvm::findAllocaForValue(II->getArgOperand(1), AllocaForValue);
+ AllocaInst *AI = findAllocaForValue(II->getArgOperand(1));
if (!AI) {
UnrecognizedLifetimes.push_back(I);
continue;
@@ -655,7 +659,7 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
IntrinsicInst *Start = Info.LifetimeStart[0];
IntrinsicInst *End = Info.LifetimeEnd[0];
uint64_t Size =
- dyn_cast<ConstantInt>(Start->getArgOperand(0))->getZExtValue();
+ cast<ConstantInt>(Start->getArgOperand(0))->getZExtValue();
Size = alignTo(Size, kTagGranuleSize);
tagAlloca(AI, Start->getNextNode(), Start->getArgOperand(1), Size);
// We need to ensure that if we tag some object, we certainly untag it
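
Two of the hunks above are API modernisation: std::lower_bound over explicit iterators becomes the range-based llvm::lower_bound, and the lifetime-size dyn_cast (whose null result would have been dereferenced) becomes a checked cast<>. A minimal sketch of the range-based idiom, assuming only LLVM ADT headers and a hypothetical Range type mirroring the one in addRange() above:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cstdint>

namespace {
struct Range {
  uint64_t Start, End;
};
} // namespace

// Mirrors the comparator used by addRange() above: find the first existing
// range whose End lies beyond the new range's Start.
static const Range *
firstOverlapCandidate(const llvm::SmallVectorImpl<Range> &Ranges,
                      uint64_t Start) {
  auto I = llvm::lower_bound(
      Ranges, Start,
      [](const Range &LHS, uint64_t RHS) { return LHS.End <= RHS; });
  return I == Ranges.end() ? nullptr : &*I;
}
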
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
index 73bd434ef123..41096a961330 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
@@ -13,7 +13,6 @@
#include "AArch64InstrInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -50,6 +49,12 @@ cl::opt<UncheckedLdStMode> ClUncheckedLdSt(
"apply unchecked-ld-st when the target is definitely within range"),
clEnumValN(UncheckedAlways, "always", "always apply unchecked-ld-st")));
+static cl::opt<bool>
+ ClFirstSlot("stack-tagging-first-slot-opt", cl::Hidden, cl::init(true),
+ cl::ZeroOrMore,
+ cl::desc("Apply first slot optimization for stack tagging "
+ "(eliminate ADDG Rt, Rn, 0, 0)."));
+
namespace {
class AArch64StackTaggingPreRA : public MachineFunctionPass {
@@ -71,6 +76,7 @@ public:
bool mayUseUncheckedLoadStore();
void uncheckUsesOf(unsigned TaggedReg, int FI);
void uncheckLoadsAndStores();
+ Optional<int> findFirstSlotCandidate();
bool runOnMachineFunction(MachineFunction &Func) override;
StringRef getPassName() const override {
@@ -197,6 +203,141 @@ void AArch64StackTaggingPreRA::uncheckLoadsAndStores() {
}
}
+struct SlotWithTag {
+ int FI;
+ int Tag;
+ SlotWithTag(int FI, int Tag) : FI(FI), Tag(Tag) {}
+ explicit SlotWithTag(const MachineInstr &MI)
+ : FI(MI.getOperand(1).getIndex()), Tag(MI.getOperand(4).getImm()) {}
+ bool operator==(const SlotWithTag &Other) const {
+ return FI == Other.FI && Tag == Other.Tag;
+ }
+};
+
+namespace llvm {
+template <> struct DenseMapInfo<SlotWithTag> {
+ static inline SlotWithTag getEmptyKey() { return {-2, -2}; }
+ static inline SlotWithTag getTombstoneKey() { return {-3, -3}; }
+ static unsigned getHashValue(const SlotWithTag &V) {
+ return hash_combine(DenseMapInfo<int>::getHashValue(V.FI),
+ DenseMapInfo<int>::getHashValue(V.Tag));
+ }
+ static bool isEqual(const SlotWithTag &A, const SlotWithTag &B) {
+ return A == B;
+ }
+};
+} // namespace llvm
+
+static bool isSlotPreAllocated(MachineFrameInfo *MFI, int FI) {
+ return MFI->getUseLocalStackAllocationBlock() &&
+ MFI->isObjectPreAllocated(FI);
+}
+
+// Pin one of the tagged slots to offset 0 from the tagged base pointer.
+// This would make its address available in a virtual register (IRG's def), as
+// opposed to requiring an ADDG instruction to materialize. This effectively
+// eliminates a vreg (by replacing it with direct uses of IRG, which is usually
+// live almost everywhere anyway), and therefore needs to happen before
+// regalloc.
+Optional<int> AArch64StackTaggingPreRA::findFirstSlotCandidate() {
+ // Find the best (FI, Tag) pair to pin to offset 0.
+ // Looking at the possible uses of a tagged address, the advantage of pinning
+ // is:
+ // - COPY to physical register.
+ // Does not matter, this would trade a MOV instruction for an ADDG.
+  // - ST*G instructions matter, but those mostly appear near the function
+  //   prologue, where all tagged addresses need to be materialized anyway;
+  //   also, counting ST*G uses would overweight large allocas that require
+  //   more than one ST*G instruction.
+  // - Uses in the address operand of a load or store do not require a tagged
+  //   pointer, so they do not benefit either. These operands have already
+  //   been eliminated (see uncheckLoadsAndStores), so all remaining
+  //   load/store instructions count.
+ // - Any other instruction may benefit from being pinned to offset 0.
+ LLVM_DEBUG(dbgs() << "AArch64StackTaggingPreRA::findFirstSlotCandidate\n");
+ if (!ClFirstSlot)
+ return None;
+
+ DenseMap<SlotWithTag, int> RetagScore;
+ SlotWithTag MaxScoreST{-1, -1};
+ int MaxScore = -1;
+ for (auto *I : ReTags) {
+ SlotWithTag ST{*I};
+ if (isSlotPreAllocated(MFI, ST.FI))
+ continue;
+
+ Register RetagReg = I->getOperand(0).getReg();
+ if (!Register::isVirtualRegister(RetagReg))
+ continue;
+
+ int Score = 0;
+ SmallVector<Register, 8> WorkList;
+ WorkList.push_back(RetagReg);
+
+ while (!WorkList.empty()) {
+ Register UseReg = WorkList.back();
+ WorkList.pop_back();
+ for (auto &UseI : MRI->use_instructions(UseReg)) {
+ unsigned Opcode = UseI.getOpcode();
+ if (Opcode == AArch64::STGOffset || Opcode == AArch64::ST2GOffset ||
+ Opcode == AArch64::STZGOffset || Opcode == AArch64::STZ2GOffset ||
+ Opcode == AArch64::STGPi || Opcode == AArch64::STGloop ||
+ Opcode == AArch64::STZGloop || Opcode == AArch64::STGloop_wback ||
+ Opcode == AArch64::STZGloop_wback)
+ continue;
+ if (UseI.isCopy()) {
+ Register DstReg = UseI.getOperand(0).getReg();
+ if (Register::isVirtualRegister(DstReg))
+ WorkList.push_back(DstReg);
+ continue;
+ }
+ LLVM_DEBUG(dbgs() << "[" << ST.FI << ":" << ST.Tag << "] use of %"
+ << Register::virtReg2Index(UseReg) << " in " << UseI
+ << "\n");
+ Score++;
+ }
+ }
+
+ int TotalScore = RetagScore[ST] += Score;
+ if (TotalScore > MaxScore ||
+ (TotalScore == MaxScore && ST.FI > MaxScoreST.FI)) {
+ MaxScore = TotalScore;
+ MaxScoreST = ST;
+ }
+ }
+
+ if (MaxScoreST.FI < 0)
+ return None;
+
+ // If FI's tag is already 0, we are done.
+ if (MaxScoreST.Tag == 0)
+ return MaxScoreST.FI;
+
+ // Otherwise, find a random victim pair (FI, Tag) where Tag == 0.
+ SlotWithTag SwapST{-1, -1};
+ for (auto *I : ReTags) {
+ SlotWithTag ST{*I};
+ if (ST.Tag == 0) {
+ SwapST = ST;
+ break;
+ }
+ }
+
+ // Swap tags between the victim and the highest scoring pair.
+  // If SwapST is still (-1, -1), that's fine, too - we'll simply take tag 0
+  // for the highest score slot without changing anything else.
+ for (auto *&I : ReTags) {
+ SlotWithTag ST{*I};
+ MachineOperand &TagOp = I->getOperand(4);
+ if (ST == MaxScoreST) {
+ TagOp.setImm(0);
+ } else if (ST == SwapST) {
+ TagOp.setImm(MaxScoreST.Tag);
+ }
+ }
+ return MaxScoreST.FI;
+}
+
bool AArch64StackTaggingPreRA::runOnMachineFunction(MachineFunction &Func) {
MF = &Func;
MRI = &MF->getRegInfo();
@@ -225,11 +366,35 @@ bool AArch64StackTaggingPreRA::runOnMachineFunction(MachineFunction &Func) {
}
}
+ // Take over from SSP. It does nothing for tagged slots, and should not really
+ // have been enabled in the first place.
+ for (int FI : TaggedSlots)
+ MFI->setObjectSSPLayout(FI, MachineFrameInfo::SSPLK_None);
+
if (ReTags.empty())
return false;
if (mayUseUncheckedLoadStore())
uncheckLoadsAndStores();
+ // Find a slot that is used with zero tag offset, like ADDG #fi, 0.
+ // If the base tagged pointer is set up to the address of this slot,
+ // the ADDG instruction can be eliminated.
+ Optional<int> BaseSlot = findFirstSlotCandidate();
+ if (BaseSlot)
+ AFI->setTaggedBasePointerIndex(*BaseSlot);
+
+ for (auto *I : ReTags) {
+ int FI = I->getOperand(1).getIndex();
+ int Tag = I->getOperand(4).getImm();
+ Register Base = I->getOperand(3).getReg();
+ if (Tag == 0 && FI == BaseSlot) {
+ BuildMI(*I->getParent(), I, {}, TII->get(AArch64::COPY),
+ I->getOperand(0).getReg())
+ .addReg(Base);
+ I->eraseFromParent();
+ }
+ }
+
return true;
}
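
The DenseMapInfo<SlotWithTag> specialisation added above follows the standard recipe for using a custom key type in an llvm::DenseMap: an empty key and a tombstone key that can never collide with a real key, a hash, and an equality predicate. A self-contained sketch of the same pattern with a hypothetical key type (FrameSlotKey and bumpScore are illustrative names, not part of the patch):

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"

struct FrameSlotKey {
  int FI;
  int Tag;
  bool operator==(const FrameSlotKey &O) const {
    return FI == O.FI && Tag == O.Tag;
  }
};

namespace llvm {
template <> struct DenseMapInfo<FrameSlotKey> {
  // Sentinels must differ from each other and from every real key.
  static inline FrameSlotKey getEmptyKey() { return {-2, -2}; }
  static inline FrameSlotKey getTombstoneKey() { return {-3, -3}; }
  static unsigned getHashValue(const FrameSlotKey &K) {
    return hash_combine(DenseMapInfo<int>::getHashValue(K.FI),
                        DenseMapInfo<int>::getHashValue(K.Tag));
  }
  static bool isEqual(const FrameSlotKey &A, const FrameSlotKey &B) {
    return A == B;
  }
};
} // namespace llvm

// With the specialisation in scope, accumulating scores per (FI, Tag) pair is
// plain DenseMap usage, as in findFirstSlotCandidate() above:
static int bumpScore(llvm::DenseMap<FrameSlotKey, int> &Score, FrameSlotKey K,
                     int Delta) {
  return Score[K] += Delta; // value default-initialises to 0 on first access
}
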
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 029535cb98b5..71b2bb196486 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -67,7 +67,7 @@ AArch64Subtarget::initializeSubtargetDependencies(StringRef FS,
if (CPUString.empty())
CPUString = "generic";
- ParseSubtargetFeatures(CPUString, FS);
+ ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, FS);
initializeProperties();
return *this;
@@ -103,19 +103,26 @@ void AArch64Subtarget::initializeProperties() {
case CortexA76:
case CortexA77:
case CortexA78:
+ case CortexA78C:
+ case CortexR82:
case CortexX1:
PrefFunctionLogAlignment = 4;
break;
case A64FX:
CacheLineSize = 256;
- PrefFunctionLogAlignment = 5;
- PrefLoopLogAlignment = 5;
+ PrefFunctionLogAlignment = 3;
+ PrefLoopLogAlignment = 2;
+ MaxInterleaveFactor = 4;
+ PrefetchDistance = 128;
+ MinPrefetchStride = 1024;
+ MaxPrefetchIterationsAhead = 4;
break;
case AppleA7:
case AppleA10:
case AppleA11:
case AppleA12:
case AppleA13:
+ case AppleA14:
CacheLineSize = 64;
PrefetchDistance = 280;
MinPrefetchStride = 2048;
@@ -150,6 +157,8 @@ void AArch64Subtarget::initializeProperties() {
PrefFunctionLogAlignment = 3;
break;
case NeoverseN1:
+ case NeoverseN2:
+ case NeoverseV1:
PrefFunctionLogAlignment = 4;
break;
case Saphira:
@@ -200,7 +209,7 @@ void AArch64Subtarget::initializeProperties() {
AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const TargetMachine &TM, bool LittleEndian)
- : AArch64GenSubtargetInfo(TT, CPU, FS),
+ : AArch64GenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
IsLittle(LittleEndian),
@@ -366,3 +375,8 @@ unsigned AArch64Subtarget::getMinSVEVectorSizeInBits() const {
return (SVEVectorBitsMin / 128) * 128;
return (std::min(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
}
+
+bool AArch64Subtarget::useSVEForFixedLengthVectors() const {
+ // Prefer NEON unless larger SVE registers are available.
+ return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
+}
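
The new useSVEForFixedLengthVectors() hook only fires when the guaranteed SVE register width beats NEON's 128 bits. getMinSVEVectorSizeInBits() (partially visible above) rounds the user-supplied minimum down to a multiple of 128, so the policy reduces to the sketch below; it deliberately ignores the max/min clamping in the real function.

// Simplified sketch of the fixed-length-SVE policy added above.
static bool useSVEForFixedLengthVectorsSketch(bool HasSVE,
                                              unsigned SVEBitsMin) {
  unsigned MinSVESize = (SVEBitsMin / 128) * 128; // e.g. 384 -> 384, 300 -> 256
  // Prefer NEON unless SVE registers are guaranteed wider than 128 bits.
  return HasSVE && MinSVESize >= 256;
}
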
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64Subtarget.h b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64Subtarget.h
index b111f0016948..8fe2f125982f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -45,6 +45,7 @@ public:
AppleA11,
AppleA12,
AppleA13,
+ AppleA14,
Carmel,
CortexA35,
CortexA53,
@@ -57,20 +58,24 @@ public:
CortexA76,
CortexA77,
CortexA78,
+ CortexA78C,
+ CortexR82,
CortexX1,
ExynosM3,
Falkor,
Kryo,
NeoverseE1,
NeoverseN1,
+ NeoverseN2,
+ NeoverseV1,
Saphira,
ThunderX2T99,
ThunderX,
ThunderXT81,
ThunderXT83,
ThunderXT88,
- TSV110,
- ThunderX3T110
+ ThunderX3T110,
+ TSV110
};
protected:
@@ -83,6 +88,10 @@ protected:
bool HasV8_4aOps = false;
bool HasV8_5aOps = false;
bool HasV8_6aOps = false;
+ bool HasV8_7aOps = false;
+
+ bool HasV8_0rOps = false;
+ bool HasCONTEXTIDREL2 = false;
bool HasFPARMv8 = false;
bool HasNEON = false;
@@ -118,14 +127,13 @@ protected:
bool HasAES = false;
// ARMv8.3 extensions
- bool HasPA = false;
+ bool HasPAuth = false;
bool HasJS = false;
bool HasCCIDX = false;
bool HasComplxNum = false;
// ARMv8.4 extensions
bool HasNV = false;
- bool HasRASv8_4 = false;
bool HasMPAM = false;
bool HasDIT = false;
bool HasTRACEV8_4 = false;
@@ -133,7 +141,7 @@ protected:
bool HasSEL2 = false;
bool HasPMU = false;
bool HasTLB_RMI = false;
- bool HasFMI = false;
+ bool HasFlagM = false;
bool HasRCPC_IMMO = false;
bool HasLSLFast = false;
@@ -162,6 +170,12 @@ protected:
bool HasFineGrainedTraps = false;
bool HasEnhancedCounterVirtualization = false;
+ // Armv8.7-A Extensions
+ bool HasXS = false;
+ bool HasWFxT = false;
+ bool HasHCX = false;
+ bool HasLS64 = false;
+
// Arm SVE2 extensions
bool HasSVE2 = false;
bool HasSVE2AES = false;
@@ -172,6 +186,9 @@ protected:
// Future architecture extensions.
bool HasETE = false;
bool HasTRBE = false;
+ bool HasBRBE = false;
+ bool HasPAUTH = false;
+ bool HasSPE_EEF = false;
// HasZeroCycleRegMove - Has zero-cycle register mov instructions.
bool HasZeroCycleRegMove = false;
@@ -191,6 +208,7 @@ protected:
// Enable 64-bit vectorization in SLP.
unsigned MinVectorRegisterBitWidth = 64;
+ bool OutlineAtomics = false;
bool UseAA = false;
bool PredictableSelectIsExpensive = false;
bool BalanceFPOps = false;
@@ -203,6 +221,7 @@ protected:
bool UseAlternateSExtLoadCVTF32Pattern = false;
bool HasArithmeticBccFusion = false;
bool HasArithmeticCbzFusion = false;
+ bool HasCmpBccFusion = false;
bool HasFuseAddress = false;
bool HasFuseAES = false;
bool HasFuseArithmeticLogic = false;
@@ -306,6 +325,7 @@ public:
bool hasV8_3aOps() const { return HasV8_3aOps; }
bool hasV8_4aOps() const { return HasV8_4aOps; }
bool hasV8_5aOps() const { return HasV8_5aOps; }
+ bool hasV8_0rOps() const { return HasV8_0rOps; }
bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }
@@ -343,6 +363,7 @@ public:
bool hasSHA3() const { return HasSHA3; }
bool hasSHA2() const { return HasSHA2; }
bool hasAES() const { return HasAES; }
+ bool hasCONTEXTIDREL2() const { return HasCONTEXTIDREL2; }
bool balanceFPOps() const { return BalanceFPOps; }
bool predictableSelectIsExpensive() const {
return PredictableSelectIsExpensive;
@@ -357,6 +378,7 @@ public:
}
bool hasArithmeticBccFusion() const { return HasArithmeticBccFusion; }
bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; }
+ bool hasCmpBccFusion() const { return HasCmpBccFusion; }
bool hasFuseAddress() const { return HasFuseAddress; }
bool hasFuseAES() const { return HasFuseAES; }
bool hasFuseArithmeticLogic() const { return HasFuseArithmeticLogic; }
@@ -432,6 +454,7 @@ public:
bool hasRandGen() const { return HasRandGen; }
bool hasMTE() const { return HasMTE; }
bool hasTME() const { return HasTME; }
+ bool hasPAUTH() const { return HasPAUTH; }
// Arm SVE2 extensions
bool hasSVE2AES() const { return HasSVE2AES; }
bool hasSVE2SM4() const { return HasSVE2SM4; }
@@ -461,10 +484,15 @@ public:
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
- bool isTargetILP32() const { return TargetTriple.isArch32Bit(); }
+ bool isTargetILP32() const {
+ return TargetTriple.isArch32Bit() ||
+ TargetTriple.getEnvironment() == Triple::GNUILP32;
+ }
bool useAA() const override { return UseAA; }
+ bool outlineAtomics() const { return OutlineAtomics; }
+
bool hasVH() const { return HasVH; }
bool hasPAN() const { return HasPAN; }
bool hasLOR() const { return HasLOR; }
@@ -473,22 +501,25 @@ public:
bool hasPAN_RWV() const { return HasPAN_RWV; }
bool hasCCPP() const { return HasCCPP; }
- bool hasPA() const { return HasPA; }
+ bool hasPAuth() const { return HasPAuth; }
bool hasJS() const { return HasJS; }
bool hasCCIDX() const { return HasCCIDX; }
bool hasComplxNum() const { return HasComplxNum; }
bool hasNV() const { return HasNV; }
- bool hasRASv8_4() const { return HasRASv8_4; }
bool hasMPAM() const { return HasMPAM; }
bool hasDIT() const { return HasDIT; }
bool hasTRACEV8_4() const { return HasTRACEV8_4; }
bool hasAM() const { return HasAM; }
bool hasAMVS() const { return HasAMVS; }
+ bool hasXS() const { return HasXS; }
+ bool hasWFxT() const { return HasWFxT; }
+ bool hasHCX() const { return HasHCX; }
+ bool hasLS64() const { return HasLS64; }
bool hasSEL2() const { return HasSEL2; }
bool hasPMU() const { return HasPMU; }
bool hasTLB_RMI() const { return HasTLB_RMI; }
- bool hasFMI() const { return HasFMI; }
+ bool hasFlagM() const { return HasFlagM; }
bool hasRCPC_IMMO() const { return HasRCPC_IMMO; }
bool addrSinkUsingGEPs() const override {
@@ -511,7 +542,7 @@ public:
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
/// ClassifyGlobalReference - Find the target operand flags that describe
/// how a global value should be referenced for the current subtarget.
@@ -550,6 +581,7 @@ public:
// implied by the architecture.
unsigned getMaxSVEVectorSizeInBits() const;
unsigned getMinSVEVectorSizeInBits() const;
+ bool useSVEForFixedLengthVectors() const;
};
} // End llvm namespace
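
The isTargetILP32() change above widens ILP32 detection from 32-bit arch triples alone to any triple whose environment is GNUILP32. A standalone restatement of that test, assuming the usual triple spellings (e.g. aarch64-linux-gnu_ilp32 for the GNU ILP32 environment):

#include "llvm/ADT/Triple.h"

// Restates the widened isTargetILP32() predicate from the hunk above.
static bool isILP32Target(const llvm::Triple &TT) {
  return TT.isArch32Bit() || TT.getEnvironment() == llvm::Triple::GNUILP32;
}
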
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SystemOperands.td
index ceceabc6ff4e..01ac52bd875a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -32,6 +32,11 @@ def HasPAN_RWV : Predicate<"Subtarget->hasPAN_RWV()">,
AssemblerPredicate<(all_of FeaturePAN_RWV),
"ARM v8.2 PAN AT S1E1R and AT S1E1W Variation">;
+def HasCONTEXTIDREL2
+ : Predicate<"Subtarget->hasCONTEXTIDREL2()">,
+ AssemblerPredicate<(all_of FeatureCONTEXTIDREL2),
+ "Target contains CONTEXTIDR_EL2 RW operand">;
+
//===----------------------------------------------------------------------===//
// AT (address translate) instruction options.
//===----------------------------------------------------------------------===//
@@ -93,6 +98,21 @@ def : DB<"ld", 0xd>;
def : DB<"st", 0xe>;
def : DB<"sy", 0xf>;
+class DBnXS<string name, bits<4> encoding, bits<5> immValue> : SearchableTable {
+ let SearchableFields = ["Name", "Encoding", "ImmValue"];
+ let EnumValueField = "Encoding";
+
+ string Name = name;
+ bits<4> Encoding = encoding;
+ bits<5> ImmValue = immValue;
+ code Requires = [{ {AArch64::FeatureXS} }];
+}
+
+def : DBnXS<"oshnxs", 0x3, 0x10>;
+def : DBnXS<"nshnxs", 0x7, 0x14>;
+def : DBnXS<"ishnxs", 0xb, 0x18>;
+def : DBnXS<"synxs", 0xf, 0x1c>;
+
//===----------------------------------------------------------------------===//
// DC (data cache maintenance) instruction options.
//===----------------------------------------------------------------------===//
@@ -384,11 +404,8 @@ def : BTI<"jc", 0b11>;
// TLBI (translation lookaside buffer invalidate) instruction options.
//===----------------------------------------------------------------------===//
-class TLBI<string name, bits<3> op1, bits<4> crn, bits<4> crm,
- bits<3> op2, bit needsreg = 1> : SearchableTable {
- let SearchableFields = ["Name", "Encoding"];
- let EnumValueField = "Encoding";
-
+class TLBIEntry<string name, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2, bit needsreg> {
string Name = name;
bits<14> Encoding;
let Encoding{13-11} = op1;
@@ -396,95 +413,122 @@ class TLBI<string name, bits<3> op1, bits<4> crn, bits<4> crm,
let Encoding{6-3} = crm;
let Encoding{2-0} = op2;
bit NeedsReg = needsreg;
- code Requires = [{ {} }];
+ list<string> Requires = [];
+ list<string> ExtraRequires = [];
+ code RequiresStr = [{ { }] # !interleave(Requires # ExtraRequires, [{, }]) # [{ } }];
}
-def : TLBI<"IPAS2E1IS", 0b100, 0b1000, 0b0000, 0b001>;
-def : TLBI<"IPAS2LE1IS", 0b100, 0b1000, 0b0000, 0b101>;
-def : TLBI<"VMALLE1IS", 0b000, 0b1000, 0b0011, 0b000, 0>;
-def : TLBI<"ALLE2IS", 0b100, 0b1000, 0b0011, 0b000, 0>;
-def : TLBI<"ALLE3IS", 0b110, 0b1000, 0b0011, 0b000, 0>;
-def : TLBI<"VAE1IS", 0b000, 0b1000, 0b0011, 0b001>;
-def : TLBI<"VAE2IS", 0b100, 0b1000, 0b0011, 0b001>;
-def : TLBI<"VAE3IS", 0b110, 0b1000, 0b0011, 0b001>;
-def : TLBI<"ASIDE1IS", 0b000, 0b1000, 0b0011, 0b010>;
-def : TLBI<"VAAE1IS", 0b000, 0b1000, 0b0011, 0b011>;
-def : TLBI<"ALLE1IS", 0b100, 0b1000, 0b0011, 0b100, 0>;
-def : TLBI<"VALE1IS", 0b000, 0b1000, 0b0011, 0b101>;
-def : TLBI<"VALE2IS", 0b100, 0b1000, 0b0011, 0b101>;
-def : TLBI<"VALE3IS", 0b110, 0b1000, 0b0011, 0b101>;
-def : TLBI<"VMALLS12E1IS", 0b100, 0b1000, 0b0011, 0b110, 0>;
-def : TLBI<"VAALE1IS", 0b000, 0b1000, 0b0011, 0b111>;
-def : TLBI<"IPAS2E1", 0b100, 0b1000, 0b0100, 0b001>;
-def : TLBI<"IPAS2LE1", 0b100, 0b1000, 0b0100, 0b101>;
-def : TLBI<"VMALLE1", 0b000, 0b1000, 0b0111, 0b000, 0>;
-def : TLBI<"ALLE2", 0b100, 0b1000, 0b0111, 0b000, 0>;
-def : TLBI<"ALLE3", 0b110, 0b1000, 0b0111, 0b000, 0>;
-def : TLBI<"VAE1", 0b000, 0b1000, 0b0111, 0b001>;
-def : TLBI<"VAE2", 0b100, 0b1000, 0b0111, 0b001>;
-def : TLBI<"VAE3", 0b110, 0b1000, 0b0111, 0b001>;
-def : TLBI<"ASIDE1", 0b000, 0b1000, 0b0111, 0b010>;
-def : TLBI<"VAAE1", 0b000, 0b1000, 0b0111, 0b011>;
-def : TLBI<"ALLE1", 0b100, 0b1000, 0b0111, 0b100, 0>;
-def : TLBI<"VALE1", 0b000, 0b1000, 0b0111, 0b101>;
-def : TLBI<"VALE2", 0b100, 0b1000, 0b0111, 0b101>;
-def : TLBI<"VALE3", 0b110, 0b1000, 0b0111, 0b101>;
-def : TLBI<"VMALLS12E1", 0b100, 0b1000, 0b0111, 0b110, 0>;
-def : TLBI<"VAALE1", 0b000, 0b1000, 0b0111, 0b111>;
+def TLBITable : GenericTable {
+ let FilterClass = "TLBIEntry";
+ let CppTypeName = "TLBI";
+ let Fields = ["Name", "Encoding", "NeedsReg", "RequiresStr"];
+}
+
+def lookupTLBIByName : SearchIndex {
+ let Table = TLBITable;
+ let Key = ["Name"];
+}
+
+def lookupTLBIByEncoding : SearchIndex {
+ let Table = TLBITable;
+ let Key = ["Encoding"];
+}
+
+multiclass TLBI<string name, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2, bit needsreg = 1> {
+ def : TLBIEntry<name, op1, crn, crm, op2, needsreg>;
+ def : TLBIEntry<!strconcat(name, "nXS"), op1, crn, crm, op2, needsreg> {
+ let Encoding{7} = 1;
+ let ExtraRequires = ["AArch64::FeatureXS"];
+ }
+}
+
+defm : TLBI<"IPAS2E1IS", 0b100, 0b1000, 0b0000, 0b001>;
+defm : TLBI<"IPAS2LE1IS", 0b100, 0b1000, 0b0000, 0b101>;
+defm : TLBI<"VMALLE1IS", 0b000, 0b1000, 0b0011, 0b000, 0>;
+defm : TLBI<"ALLE2IS", 0b100, 0b1000, 0b0011, 0b000, 0>;
+defm : TLBI<"ALLE3IS", 0b110, 0b1000, 0b0011, 0b000, 0>;
+defm : TLBI<"VAE1IS", 0b000, 0b1000, 0b0011, 0b001>;
+defm : TLBI<"VAE2IS", 0b100, 0b1000, 0b0011, 0b001>;
+defm : TLBI<"VAE3IS", 0b110, 0b1000, 0b0011, 0b001>;
+defm : TLBI<"ASIDE1IS", 0b000, 0b1000, 0b0011, 0b010>;
+defm : TLBI<"VAAE1IS", 0b000, 0b1000, 0b0011, 0b011>;
+defm : TLBI<"ALLE1IS", 0b100, 0b1000, 0b0011, 0b100, 0>;
+defm : TLBI<"VALE1IS", 0b000, 0b1000, 0b0011, 0b101>;
+defm : TLBI<"VALE2IS", 0b100, 0b1000, 0b0011, 0b101>;
+defm : TLBI<"VALE3IS", 0b110, 0b1000, 0b0011, 0b101>;
+defm : TLBI<"VMALLS12E1IS", 0b100, 0b1000, 0b0011, 0b110, 0>;
+defm : TLBI<"VAALE1IS", 0b000, 0b1000, 0b0011, 0b111>;
+defm : TLBI<"IPAS2E1", 0b100, 0b1000, 0b0100, 0b001>;
+defm : TLBI<"IPAS2LE1", 0b100, 0b1000, 0b0100, 0b101>;
+defm : TLBI<"VMALLE1", 0b000, 0b1000, 0b0111, 0b000, 0>;
+defm : TLBI<"ALLE2", 0b100, 0b1000, 0b0111, 0b000, 0>;
+defm : TLBI<"ALLE3", 0b110, 0b1000, 0b0111, 0b000, 0>;
+defm : TLBI<"VAE1", 0b000, 0b1000, 0b0111, 0b001>;
+defm : TLBI<"VAE2", 0b100, 0b1000, 0b0111, 0b001>;
+defm : TLBI<"VAE3", 0b110, 0b1000, 0b0111, 0b001>;
+defm : TLBI<"ASIDE1", 0b000, 0b1000, 0b0111, 0b010>;
+defm : TLBI<"VAAE1", 0b000, 0b1000, 0b0111, 0b011>;
+defm : TLBI<"ALLE1", 0b100, 0b1000, 0b0111, 0b100, 0>;
+defm : TLBI<"VALE1", 0b000, 0b1000, 0b0111, 0b101>;
+defm : TLBI<"VALE2", 0b100, 0b1000, 0b0111, 0b101>;
+defm : TLBI<"VALE3", 0b110, 0b1000, 0b0111, 0b101>;
+defm : TLBI<"VMALLS12E1", 0b100, 0b1000, 0b0111, 0b110, 0>;
+defm : TLBI<"VAALE1", 0b000, 0b1000, 0b0111, 0b111>;
// Armv8.4-A Translation Lookaside Buffer Instructions (TLBI)
-let Requires = [{ {AArch64::FeatureTLB_RMI} }] in {
+let Requires = ["AArch64::FeatureTLB_RMI"] in {
// Armv8.4-A Outer Sharable TLB Maintenance instructions:
// op1 CRn CRm op2
-def : TLBI<"VMALLE1OS", 0b000, 0b1000, 0b0001, 0b000, 0>;
-def : TLBI<"VAE1OS", 0b000, 0b1000, 0b0001, 0b001>;
-def : TLBI<"ASIDE1OS", 0b000, 0b1000, 0b0001, 0b010>;
-def : TLBI<"VAAE1OS", 0b000, 0b1000, 0b0001, 0b011>;
-def : TLBI<"VALE1OS", 0b000, 0b1000, 0b0001, 0b101>;
-def : TLBI<"VAALE1OS", 0b000, 0b1000, 0b0001, 0b111>;
-def : TLBI<"IPAS2E1OS", 0b100, 0b1000, 0b0100, 0b000>;
-def : TLBI<"IPAS2LE1OS", 0b100, 0b1000, 0b0100, 0b100>;
-def : TLBI<"VAE2OS", 0b100, 0b1000, 0b0001, 0b001>;
-def : TLBI<"VALE2OS", 0b100, 0b1000, 0b0001, 0b101>;
-def : TLBI<"VMALLS12E1OS", 0b100, 0b1000, 0b0001, 0b110, 0>;
-def : TLBI<"VAE3OS", 0b110, 0b1000, 0b0001, 0b001>;
-def : TLBI<"VALE3OS", 0b110, 0b1000, 0b0001, 0b101>;
-def : TLBI<"ALLE2OS", 0b100, 0b1000, 0b0001, 0b000, 0>;
-def : TLBI<"ALLE1OS", 0b100, 0b1000, 0b0001, 0b100, 0>;
-def : TLBI<"ALLE3OS", 0b110, 0b1000, 0b0001, 0b000, 0>;
+defm : TLBI<"VMALLE1OS", 0b000, 0b1000, 0b0001, 0b000, 0>;
+defm : TLBI<"VAE1OS", 0b000, 0b1000, 0b0001, 0b001>;
+defm : TLBI<"ASIDE1OS", 0b000, 0b1000, 0b0001, 0b010>;
+defm : TLBI<"VAAE1OS", 0b000, 0b1000, 0b0001, 0b011>;
+defm : TLBI<"VALE1OS", 0b000, 0b1000, 0b0001, 0b101>;
+defm : TLBI<"VAALE1OS", 0b000, 0b1000, 0b0001, 0b111>;
+defm : TLBI<"IPAS2E1OS", 0b100, 0b1000, 0b0100, 0b000>;
+defm : TLBI<"IPAS2LE1OS", 0b100, 0b1000, 0b0100, 0b100>;
+defm : TLBI<"VAE2OS", 0b100, 0b1000, 0b0001, 0b001>;
+defm : TLBI<"VALE2OS", 0b100, 0b1000, 0b0001, 0b101>;
+defm : TLBI<"VMALLS12E1OS", 0b100, 0b1000, 0b0001, 0b110, 0>;
+defm : TLBI<"VAE3OS", 0b110, 0b1000, 0b0001, 0b001>;
+defm : TLBI<"VALE3OS", 0b110, 0b1000, 0b0001, 0b101>;
+defm : TLBI<"ALLE2OS", 0b100, 0b1000, 0b0001, 0b000, 0>;
+defm : TLBI<"ALLE1OS", 0b100, 0b1000, 0b0001, 0b100, 0>;
+defm : TLBI<"ALLE3OS", 0b110, 0b1000, 0b0001, 0b000, 0>;
// Armv8.4-A TLB Range Maintenance instructions:
// op1 CRn CRm op2
-def : TLBI<"RVAE1", 0b000, 0b1000, 0b0110, 0b001>;
-def : TLBI<"RVAAE1", 0b000, 0b1000, 0b0110, 0b011>;
-def : TLBI<"RVALE1", 0b000, 0b1000, 0b0110, 0b101>;
-def : TLBI<"RVAALE1", 0b000, 0b1000, 0b0110, 0b111>;
-def : TLBI<"RVAE1IS", 0b000, 0b1000, 0b0010, 0b001>;
-def : TLBI<"RVAAE1IS", 0b000, 0b1000, 0b0010, 0b011>;
-def : TLBI<"RVALE1IS", 0b000, 0b1000, 0b0010, 0b101>;
-def : TLBI<"RVAALE1IS", 0b000, 0b1000, 0b0010, 0b111>;
-def : TLBI<"RVAE1OS", 0b000, 0b1000, 0b0101, 0b001>;
-def : TLBI<"RVAAE1OS", 0b000, 0b1000, 0b0101, 0b011>;
-def : TLBI<"RVALE1OS", 0b000, 0b1000, 0b0101, 0b101>;
-def : TLBI<"RVAALE1OS", 0b000, 0b1000, 0b0101, 0b111>;
-def : TLBI<"RIPAS2E1IS", 0b100, 0b1000, 0b0000, 0b010>;
-def : TLBI<"RIPAS2LE1IS", 0b100, 0b1000, 0b0000, 0b110>;
-def : TLBI<"RIPAS2E1", 0b100, 0b1000, 0b0100, 0b010>;
-def : TLBI<"RIPAS2LE1", 0b100, 0b1000, 0b0100, 0b110>;
-def : TLBI<"RIPAS2E1OS", 0b100, 0b1000, 0b0100, 0b011>;
-def : TLBI<"RIPAS2LE1OS", 0b100, 0b1000, 0b0100, 0b111>;
-def : TLBI<"RVAE2", 0b100, 0b1000, 0b0110, 0b001>;
-def : TLBI<"RVALE2", 0b100, 0b1000, 0b0110, 0b101>;
-def : TLBI<"RVAE2IS", 0b100, 0b1000, 0b0010, 0b001>;
-def : TLBI<"RVALE2IS", 0b100, 0b1000, 0b0010, 0b101>;
-def : TLBI<"RVAE2OS", 0b100, 0b1000, 0b0101, 0b001>;
-def : TLBI<"RVALE2OS", 0b100, 0b1000, 0b0101, 0b101>;
-def : TLBI<"RVAE3", 0b110, 0b1000, 0b0110, 0b001>;
-def : TLBI<"RVALE3", 0b110, 0b1000, 0b0110, 0b101>;
-def : TLBI<"RVAE3IS", 0b110, 0b1000, 0b0010, 0b001>;
-def : TLBI<"RVALE3IS", 0b110, 0b1000, 0b0010, 0b101>;
-def : TLBI<"RVAE3OS", 0b110, 0b1000, 0b0101, 0b001>;
-def : TLBI<"RVALE3OS", 0b110, 0b1000, 0b0101, 0b101>;
+defm : TLBI<"RVAE1", 0b000, 0b1000, 0b0110, 0b001>;
+defm : TLBI<"RVAAE1", 0b000, 0b1000, 0b0110, 0b011>;
+defm : TLBI<"RVALE1", 0b000, 0b1000, 0b0110, 0b101>;
+defm : TLBI<"RVAALE1", 0b000, 0b1000, 0b0110, 0b111>;
+defm : TLBI<"RVAE1IS", 0b000, 0b1000, 0b0010, 0b001>;
+defm : TLBI<"RVAAE1IS", 0b000, 0b1000, 0b0010, 0b011>;
+defm : TLBI<"RVALE1IS", 0b000, 0b1000, 0b0010, 0b101>;
+defm : TLBI<"RVAALE1IS", 0b000, 0b1000, 0b0010, 0b111>;
+defm : TLBI<"RVAE1OS", 0b000, 0b1000, 0b0101, 0b001>;
+defm : TLBI<"RVAAE1OS", 0b000, 0b1000, 0b0101, 0b011>;
+defm : TLBI<"RVALE1OS", 0b000, 0b1000, 0b0101, 0b101>;
+defm : TLBI<"RVAALE1OS", 0b000, 0b1000, 0b0101, 0b111>;
+defm : TLBI<"RIPAS2E1IS", 0b100, 0b1000, 0b0000, 0b010>;
+defm : TLBI<"RIPAS2LE1IS", 0b100, 0b1000, 0b0000, 0b110>;
+defm : TLBI<"RIPAS2E1", 0b100, 0b1000, 0b0100, 0b010>;
+defm : TLBI<"RIPAS2LE1", 0b100, 0b1000, 0b0100, 0b110>;
+defm : TLBI<"RIPAS2E1OS", 0b100, 0b1000, 0b0100, 0b011>;
+defm : TLBI<"RIPAS2LE1OS", 0b100, 0b1000, 0b0100, 0b111>;
+defm : TLBI<"RVAE2", 0b100, 0b1000, 0b0110, 0b001>;
+defm : TLBI<"RVALE2", 0b100, 0b1000, 0b0110, 0b101>;
+defm : TLBI<"RVAE2IS", 0b100, 0b1000, 0b0010, 0b001>;
+defm : TLBI<"RVALE2IS", 0b100, 0b1000, 0b0010, 0b101>;
+defm : TLBI<"RVAE2OS", 0b100, 0b1000, 0b0101, 0b001>;
+defm : TLBI<"RVALE2OS", 0b100, 0b1000, 0b0101, 0b101>;
+defm : TLBI<"RVAE3", 0b110, 0b1000, 0b0110, 0b001>;
+defm : TLBI<"RVALE3", 0b110, 0b1000, 0b0110, 0b101>;
+defm : TLBI<"RVAE3IS", 0b110, 0b1000, 0b0010, 0b001>;
+defm : TLBI<"RVALE3IS", 0b110, 0b1000, 0b0010, 0b101>;
+defm : TLBI<"RVAE3OS", 0b110, 0b1000, 0b0101, 0b001>;
+defm : TLBI<"RVALE3OS", 0b110, 0b1000, 0b0101, 0b101>;
} //FeatureTLB_RMI
// Armv8.5-A Prediction Restriction by Context instruction options:
@@ -599,6 +643,7 @@ def : ROSysReg<"ID_AA64AFR0_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b100>;
def : ROSysReg<"ID_AA64AFR1_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b101>;
def : ROSysReg<"ID_AA64ISAR0_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b000>;
def : ROSysReg<"ID_AA64ISAR1_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b001>;
+def : ROSysReg<"ID_AA64ISAR2_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b010>;
def : ROSysReg<"ID_AA64MMFR0_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b000>;
def : ROSysReg<"ID_AA64MMFR1_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b001>;
def : ROSysReg<"ID_AA64MMFR2_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b010>;
@@ -814,6 +859,9 @@ def : RWSysReg<"ACTLR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b001>;
def : RWSysReg<"ACTLR_EL2", 0b11, 0b100, 0b0001, 0b0000, 0b001>;
def : RWSysReg<"ACTLR_EL3", 0b11, 0b110, 0b0001, 0b0000, 0b001>;
def : RWSysReg<"HCR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b000>;
+def : RWSysReg<"HCRX_EL2", 0b11, 0b100, 0b0001, 0b0010, 0b010> {
+ let Requires = [{ {AArch64::FeatureHCX} }];
+}
def : RWSysReg<"SCR_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b000>;
def : RWSysReg<"MDCR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b001>;
def : RWSysReg<"SDER32_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b001>;
@@ -1220,7 +1268,6 @@ def : RWSysReg<"LORC_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b011>;
// Op0 Op1 CRn CRm Op2
let Requires = [{ {AArch64::FeatureVH} }] in {
def : RWSysReg<"TTBR1_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b001>;
-def : RWSysReg<"CONTEXTIDR_EL2", 0b11, 0b100, 0b1101, 0b0000, 0b001>;
def : RWSysReg<"CNTHV_TVAL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b000>;
def : RWSysReg<"CNTHV_CVAL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b010>;
def : RWSysReg<"CNTHV_CTL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b001>;
@@ -1246,6 +1293,9 @@ def : RWSysReg<"CNTV_CTL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b001>;
def : RWSysReg<"CNTV_CVAL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b010>;
def : RWSysReg<"SPSR_EL12", 0b11, 0b101, 0b0100, 0b0000, 0b000>;
def : RWSysReg<"ELR_EL12", 0b11, 0b101, 0b0100, 0b0000, 0b001>;
+let Requires = [{ {AArch64::FeatureCONTEXTIDREL2} }] in {
+ def : RWSysReg<"CONTEXTIDR_EL2", 0b11, 0b100, 0b1101, 0b0000, 0b001>;
+}
}
// v8.2a registers
// Op0 Op1 CRn CRm Op2
@@ -1286,7 +1336,7 @@ def : RWSysReg<"VSESR_EL2", 0b11, 0b100, 0b0101, 0b0010, 0b011>;
// v8.3a "Pointer authentication extension" registers
// Op0 Op1 CRn CRm Op2
-let Requires = [{ {AArch64::FeaturePA} }] in {
+let Requires = [{ {AArch64::FeaturePAuth} }] in {
def : RWSysReg<"APIAKeyLo_EL1", 0b11, 0b000, 0b0010, 0b0001, 0b000>;
def : RWSysReg<"APIAKeyHi_EL1", 0b11, 0b000, 0b0010, 0b0001, 0b001>;
def : RWSysReg<"APIBKeyLo_EL1", 0b11, 0b000, 0b0010, 0b0001, 0b010>;
@@ -1328,13 +1378,11 @@ def : RWSysReg<"PMMIR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b110>;
// v8.4a RAS registers
// Op0 Op1 CRn CRm Op2
-let Requires = [{ {AArch64::FeatureRASv8_4} }] in {
def : RWSysReg<"ERXPFGCTL_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b101>;
def : RWSysReg<"ERXPFGCDN_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b110>;
def : RWSysReg<"ERXMISC2_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b010>;
def : RWSysReg<"ERXMISC3_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b011>;
def : ROSysReg<"ERXPFGF_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b100>;
-} // FeatureRASv8_4
// v8.4a MPAM registers
// Op0 Op1 CRn CRm Op2
@@ -1522,6 +1570,33 @@ def : RWSysReg<"CNTPCTSS_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b101>;
def : RWSysReg<"CNTVCTSS_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b110>;
}
+// v8.7a LD64B/ST64B Accelerator Extension system register
+let Requires = [{ {AArch64::FeatureLS64} }] in
+def : RWSysReg<"ACCDATA_EL1", 0b11, 0b000, 0b1101, 0b0000, 0b101>;
+
+// Branch Record Buffer system registers
+let Requires = [{ {AArch64::FeatureBRBE} }] in {
+def : RWSysReg<"BRBCR_EL1", 0b10, 0b001, 0b1001, 0b0000, 0b000>;
+def : RWSysReg<"BRBCR_EL12", 0b10, 0b101, 0b1001, 0b0000, 0b000>;
+def : RWSysReg<"BRBCR_EL2", 0b10, 0b100, 0b1001, 0b0000, 0b000>;
+def : RWSysReg<"BRBFCR_EL1", 0b10, 0b001, 0b1001, 0b0000, 0b001>;
+def : ROSysReg<"BRBIDR0_EL1", 0b10, 0b001, 0b1001, 0b0010, 0b000>;
+def : RWSysReg<"BRBINFINJ_EL1", 0b10, 0b001, 0b1001, 0b0001, 0b000>;
+def : RWSysReg<"BRBSRCINJ_EL1", 0b10, 0b001, 0b1001, 0b0001, 0b001>;
+def : RWSysReg<"BRBTGTINJ_EL1", 0b10, 0b001, 0b1001, 0b0001, 0b010>;
+def : RWSysReg<"BRBTS_EL1", 0b10, 0b001, 0b1001, 0b0000, 0b010>;
+foreach n = 0-31 in {
+ defvar nb = !cast<bits<5>>(n);
+ def : ROSysReg<"BRBINF"#n#"_EL1", 0b10, 0b001, 0b1000, nb{3-0}, {nb{4},0b00}>;
+ def : ROSysReg<"BRBSRC"#n#"_EL1", 0b10, 0b001, 0b1000, nb{3-0}, {nb{4},0b01}>;
+ def : ROSysReg<"BRBTGT"#n#"_EL1", 0b10, 0b001, 0b1000, nb{3-0}, {nb{4},0b10}>;
+}
+}
+
+// Statistical Profiling Extension system register
+let Requires = [{ {AArch64::FeatureSPE_EEF} }] in
+def : RWSysReg<"PMSNEVFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b001>;
+
// Cyclone specific system registers
// Op0 Op1 CRn CRm Op2
let Requires = [{ {AArch64::ProcAppleA7} }] in
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index a63b9a97ada5..bec1758a931b 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -148,10 +148,10 @@ static cl::opt<int> EnableGlobalISelAtO(
cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"),
cl::init(0));
-static cl::opt<bool> EnableSVEIntrinsicOpts(
- "aarch64-sve-intrinsic-opts", cl::Hidden,
- cl::desc("Enable SVE intrinsic opts"),
- cl::init(true));
+static cl::opt<bool>
+ EnableSVEIntrinsicOpts("aarch64-enable-sve-intrinsic-opts", cl::Hidden,
+ cl::desc("Enable SVE intrinsic opts"),
+ cl::init(true));
static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix",
cl::init(true), cl::Hidden);
@@ -184,6 +184,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
initializeAArch64SIMDInstrOptPass(*PR);
initializeAArch64PreLegalizerCombinerPass(*PR);
initializeAArch64PostLegalizerCombinerPass(*PR);
+ initializeAArch64PostLegalizerLoweringPass(*PR);
+ initializeAArch64PostSelectOptimizePass(*PR);
initializeAArch64PromoteConstantPass(*PR);
initializeAArch64RedundantCopyEliminationPass(*PR);
initializeAArch64StorePairSuppressPass(*PR);
@@ -213,8 +215,6 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
static std::string computeDataLayout(const Triple &TT,
const MCTargetOptions &Options,
bool LittleEndian) {
- if (Options.getABIName() == "ilp32")
- return "e-m:e-p:32:32-i8:8-i16:16-i64:64-S128";
if (TT.isOSBinFormatMachO()) {
if (TT.getArch() == Triple::aarch64_32)
return "e-m:o-p:32:32-i64:64-i128:128-n32:64-S128";
@@ -222,9 +222,16 @@ static std::string computeDataLayout(const Triple &TT,
}
if (TT.isOSBinFormatCOFF())
return "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128";
- if (LittleEndian)
- return "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128";
- return "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128";
+ std::string Endian = LittleEndian ? "e" : "E";
+ std::string Ptr32 = TT.getEnvironment() == Triple::GNUILP32 ? "-p:32:32" : "";
+ return Endian + "-m:e" + Ptr32 +
+ "-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128";
+}
+
+static StringRef computeDefaultCPU(const Triple &TT, StringRef CPU) {
+ if (CPU.empty() && TT.isArm64e())
+ return "apple-a12";
+ return CPU;
}
static Reloc::Model getEffectiveRelocModel(const Triple &TT,
@@ -274,7 +281,8 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
bool LittleEndian)
: LLVMTargetMachine(T,
computeDataLayout(TT, Options.MCOptions, LittleEndian),
- TT, CPU, FS, Options, getEffectiveRelocModel(TT, RM),
+ TT, computeDefaultCPU(TT, CPU), FS, Options,
+ getEffectiveRelocModel(TT, RM),
getEffectiveAArch64CodeModel(TT, CM, JIT), OL),
TLOF(createTLOF(getTargetTriple())), isLittle(LittleEndian) {
initAsmInfo();
@@ -309,6 +317,7 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
// MachO/CodeModel::Large, which GlobalISel does not support.
if (getOptLevel() <= EnableGlobalISelAtO &&
TT.getArch() != Triple::aarch64_32 &&
+ TT.getEnvironment() != Triple::GNUILP32 &&
!(getCodeModel() == CodeModel::Large && TT.isOSBinFormatMachO())) {
setGlobalISel(true);
setGlobalISelAbort(GlobalISelAbortMode::Disable);
@@ -331,12 +340,10 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
Attribute FSAttr = F.getFnAttribute("target-features");
- std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
- ? CPUAttr.getValueAsString().str()
- : TargetCPU;
- std::string FS = !FSAttr.hasAttribute(Attribute::None)
- ? FSAttr.getValueAsString().str()
- : TargetFS;
+ std::string CPU =
+ CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU;
+ std::string FS =
+ FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS;
auto &I = SubtargetMap[CPU + FS];
if (!I) {
@@ -453,7 +460,12 @@ void AArch64PassConfig::addIRPasses() {
// determine whether it succeeded. We can exploit existing control-flow in
// ldrex/strex loops to simplify this, but it needs tidying up.
if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
- addPass(createCFGSimplificationPass(1, true, true, false, true));
+ addPass(createCFGSimplificationPass(SimplifyCFGOptions()
+ .forwardSwitchCondToPhi(true)
+ .convertSwitchToLookupTable(true)
+ .needCanonicalLoops(false)
+ .hoistCommonInsts(true)
+ .sinkCommonInsts(true)));
// Run LoopDataPrefetch
//
@@ -541,13 +553,13 @@ bool AArch64PassConfig::addInstSelector() {
}
bool AArch64PassConfig::addIRTranslator() {
- addPass(new IRTranslator());
+ addPass(new IRTranslator(getOptLevel()));
return false;
}
void AArch64PassConfig::addPreLegalizeMachineIR() {
bool IsOptNone = getOptLevel() == CodeGenOpt::None;
- addPass(createAArch64PreLegalizeCombiner(IsOptNone));
+ addPass(createAArch64PreLegalizerCombiner(IsOptNone));
}
bool AArch64PassConfig::addLegalizeMachineIR() {
@@ -556,11 +568,10 @@ bool AArch64PassConfig::addLegalizeMachineIR() {
}
void AArch64PassConfig::addPreRegBankSelect() {
- // For now we don't add this to the pipeline for -O0. We could do in future
- // if we split the combines into separate O0/opt groupings.
bool IsOptNone = getOptLevel() == CodeGenOpt::None;
if (!IsOptNone)
- addPass(createAArch64PostLegalizeCombiner(IsOptNone));
+ addPass(createAArch64PostLegalizerCombiner(IsOptNone));
+ addPass(createAArch64PostLegalizerLowering());
}
bool AArch64PassConfig::addRegBankSelect() {
@@ -574,6 +585,8 @@ void AArch64PassConfig::addPreGlobalInstructionSelect() {
bool AArch64PassConfig::addGlobalInstructionSelect() {
addPass(new InstructionSelect());
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createAArch64PostSelectOptimize());
return false;
}
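
The computeDataLayout() change above makes the ILP32 handling explicit: instead of a dedicated ABI-name branch, a 32-bit pointer component is spliced into the generic ELF string only for a GNUILP32 environment. A small reconstruction of that ELF branch with the resulting strings spelled out (the gnu_ilp32 triple spelling is an assumption):

#include <string>

// Reconstruction of the ELF branch of computeDataLayout() shown above.
static std::string elfDataLayout(bool LittleEndian, bool IsGnuIlp32) {
  std::string Endian = LittleEndian ? "e" : "E";
  std::string Ptr32 = IsGnuIlp32 ? "-p:32:32" : "";
  return Endian + "-m:e" + Ptr32 +
         "-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128";
}

// elfDataLayout(true, false):
//   "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
// elfDataLayout(true, true), e.g. aarch64-linux-gnu_ilp32:
//   "e-m:e-p:32:32-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
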
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetMachine.h
index 7738a4229391..25e626134317 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetMachine.h
@@ -57,6 +57,12 @@ public:
SMDiagnostic &Error,
SMRange &SourceRange) const override;
+ /// Returns true if a cast between SrcAS and DestAS is a noop.
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
+ // Addrspacecasts are always noops.
+ return true;
+ }
+
private:
bool isLittle;
};
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index cf6de797727b..7fda6b8fb602 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#include "AArch64ExpandImm.h"
#include "AArch64TargetTransformInfo.h"
+#include "AArch64ExpandImm.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -16,9 +16,11 @@
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
using namespace llvm;
+using namespace llvm::PatternMatch;
#define DEBUG_TYPE "aarch64tti"
@@ -84,7 +86,8 @@ int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
int AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind) {
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -192,6 +195,10 @@ int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
return TTI::TCC_Free;
break;
+ case Intrinsic::experimental_gc_statepoint:
+ if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+ return TTI::TCC_Free;
+ break;
}
return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
@@ -205,14 +212,43 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
return TTI::PSK_Software;
}
+unsigned
+AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
+ auto *RetTy = ICA.getReturnType();
+ switch (ICA.getID()) {
+ case Intrinsic::umin:
+ case Intrinsic::umax: {
+ auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
+ // umin(x,y) -> sub(x,usubsat(x,y))
+ // umax(x,y) -> add(x,usubsat(y,x))
+ if (LT.second == MVT::v2i64)
+ return LT.first * 2;
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::smin:
+ case Intrinsic::smax: {
+ static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
+ MVT::v8i16, MVT::v2i32, MVT::v4i32};
+ auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
+ if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
+ return LT.first;
+ break;
+ }
+ default:
+ break;
+ }
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+}
+
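
The umin/umax cases above rely on the rewrites umin(x,y) -> sub(x,usubsat(x,y)) and umax(x,y) -> add(x,usubsat(y,x)), which is why a legalized v2i64 result is charged two instructions. A standalone C++ sketch of the identity (usubsat here is a plain helper standing in for unsigned saturating subtraction, the scalar analogue of UQSUB; it is not an LLVM API):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Unsigned saturating subtraction: max(a - b, 0) without wrap-around.
static uint64_t usubsat(uint64_t a, uint64_t b) { return a > b ? a - b : 0; }

int main() {
  for (uint64_t x : {0ULL, 7ULL, 42ULL})
    for (uint64_t y : {0ULL, 9ULL, 42ULL}) {
      assert(std::min(x, y) == x - usubsat(x, y)); // umin(x,y) = sub(x, usubsat(x,y))
      assert(std::max(x, y) == x + usubsat(y, x)); // umax(x,y) = add(x, usubsat(y,x))
    }
  return 0;
}
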
bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
ArrayRef<const Value *> Args) {
// A helper that returns a vector type from the given type. The number of
// elements in type Ty determines the vector width.
auto toVectorTy = [&](Type *ArgTy) {
- return FixedVectorType::get(ArgTy->getScalarType(),
- cast<FixedVectorType>(DstTy)->getNumElements());
+ return VectorType::get(ArgTy->getScalarType(),
+ cast<VectorType>(DstTy)->getElementCount());
};
// Exit early if DstTy is not a vector type whose elements are at least
@@ -261,8 +297,8 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
return false;
// Get the total number of vector elements in the legalized types.
- unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements();
- unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements();
+ unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorMinNumElements();
+ unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
// Return true if the legalized types have the same number of vector elements
// and the destination element type size is twice that of the source type.
@@ -270,6 +306,7 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
}
int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::CastContextHint CCH,
TTI::TargetCostKind CostKind,
const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -306,7 +343,8 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
EVT DstTy = TLI->getValueType(DL, Dst);
if (!SrcTy.isSimple() || !DstTy.isSimple())
- return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
+ return AdjustCost(
+ BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
static const TypeConversionCostTblEntry
ConversionTbl[] = {
@@ -410,7 +448,8 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
SrcTy.getSimpleVT()))
return AdjustCost(Entry->Cost);
- return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
+ return AdjustCost(
+ BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}
int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
@@ -442,12 +481,14 @@ int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
// we may get the extension for free. If not, get the default cost for the
// extend.
if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
- return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind);
+ return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
+ CostKind);
// The destination type should be larger than the element type. If not, get
// the default cost for the extend.
- if (DstVT.getSizeInBits() < SrcVT.getSizeInBits())
- return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind);
+ if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
+ return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
+ CostKind);
switch (Opcode) {
default:
@@ -466,7 +507,8 @@ int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
}
// If we are unable to perform the extend for free, get the default cost.
- return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind);
+ return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
+ CostKind);
}
unsigned AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
@@ -602,8 +644,20 @@ int AArch64TTIImpl::getArithmeticInstrCost(
}
return Cost;
- case ISD::ADD:
case ISD::MUL:
+ if (LT.second != MVT::v2i64)
+ return (Cost + 1) * LT.first;
+ // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive
+ // as elements are extracted from the vectors and the muls scalarized.
+ // As getScalarizationOverhead is a bit too pessimistic, we estimate the
+ // cost for an i64 vector directly here, which is:
+ // - four i64 extracts,
+ // - two i64 inserts, and
+ // - two muls.
+ // So, for a v2i64 with LT.first = 1 the cost is 8, and for a v4i64 with
+ // LT.first = 2 the cost is 16.
+ return LT.first * 8;
+ case ISD::ADD:
case ISD::XOR:
case ISD::OR:
case ISD::AND:
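
The scalarized v2i64 multiply estimate above is pure arithmetic: four extracts, two inserts and two scalar multiplies per legalized part. A tiny illustrative check of that bookkeeping (the helper name is made up for the sketch):

#include <cassert>

// Each legalized v2i64 part pays 4 extracts + 2 inserts + 2 muls = 8.
static int v2i64MulCost(int NumLegalizedParts) { return NumLegalizedParts * 8; }

int main() {
  assert(v2i64MulCost(1) == 8);  // v2i64, LT.first == 1
  assert(v2i64MulCost(2) == 16); // v4i64 splits into two v2i64, LT.first == 2
  return 0;
}
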
@@ -642,19 +696,40 @@ int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
}
int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy,
+ Type *CondTy, CmpInst::Predicate VecPred,
TTI::TargetCostKind CostKind,
const Instruction *I) {
// TODO: Handle other cost kinds.
if (CostKind != TTI::TCK_RecipThroughput)
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
+ I);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
// We don't lower some vector selects well that are wider than the register
// width.
- if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
+ if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
// We would need this many instructions to hide the scalarization happening.
const int AmortizationCost = 20;
+
+ // If VecPred is not set, check if we can get a predicate from the context
+ // instruction, if its type matches the requested ValTy.
+ if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
+ CmpInst::Predicate CurrentPred;
+ if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
+ m_Value())))
+ VecPred = CurrentPred;
+ }
+ // Check if we have a compare/select chain that can be lowered using CMxx &
+ // BFI pair.
+ if (CmpInst::isIntPredicate(VecPred)) {
+ static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
+ MVT::v8i16, MVT::v2i32, MVT::v4i32,
+ MVT::v2i64};
+ auto LT = TLI->getTypeLegalizationCost(DL, ValTy);
+ if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
+ return LT.first;
+ }
+
static const TypeConversionCostTblEntry
VectorSelectTbl[] = {
{ ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
@@ -674,7 +749,9 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
return Entry->Cost;
}
}
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
+ // The base case handles scalable vectors fine for now, since it treats the
+ // cost as 1 * legalization cost.
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}
AArch64TTIImpl::TTI::MemCmpExpansionOptions
@@ -695,6 +772,30 @@ AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
return Options;
}
+unsigned AArch64TTIImpl::getGatherScatterOpCost(
+ unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
+ Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
+
+ if (!isa<ScalableVectorType>(DataTy))
+ return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
+ Alignment, CostKind, I);
+ auto *VT = cast<VectorType>(DataTy);
+ auto LT = TLI->getTypeLegalizationCost(DL, DataTy);
+ ElementCount LegalVF = LT.second.getVectorElementCount();
+ Optional<unsigned> MaxNumVScale = getMaxVScale();
+ assert(MaxNumVScale && "Expected valid max vscale value");
+
+ unsigned MemOpCost =
+ getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);
+ unsigned MaxNumElementsPerGather =
+ MaxNumVScale.getValue() * LegalVF.getKnownMinValue();
+ return LT.first * MaxNumElementsPerGather * MemOpCost;
+}
+
+bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
+ return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
+}
+
int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
MaybeAlign Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind,
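
The scalable gather/scatter costing above multiplies the per-element memory-op cost by the largest possible element count, i.e. the maximum vscale times the known minimum element count of the legalized type, and then by the number of legalized parts. A standalone sketch of that formula with illustrative numbers (the 16 assumes a 2048-bit SVE ceiling over the 128-bit granule):

#include <cassert>

static unsigned gatherCostSketch(unsigned LegalizedParts, unsigned MaxVScale,
                                 unsigned KnownMinElts, unsigned ScalarMemOpCost) {
  unsigned MaxElementsPerGather = MaxVScale * KnownMinElts;
  return LegalizedParts * MaxElementsPerGather * ScalarMemOpCost;
}

int main() {
  // e.g. <vscale x 4 x i32> with vscale capped at 16 and a unit-cost scalar
  // load: 1 * (16 * 4) * 1 = 64.
  assert(gatherCostSketch(1, 16, 4, 1) == 64);
  return 0;
}
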
@@ -722,7 +823,7 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
return LT.first * 2 * AmortizationCost;
}
- if (Ty->isVectorTy() &&
+ if (useNeonVector(Ty) &&
cast<VectorType>(Ty)->getElementType()->isIntegerTy(8)) {
unsigned ProfitableNumElements;
if (Opcode == Instruction::Store)
@@ -997,11 +1098,70 @@ bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
return false;
}
+int AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
+ bool IsPairwise, bool IsUnsigned,
+ TTI::TargetCostKind CostKind) {
+ if (!isa<ScalableVectorType>(Ty))
+ return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned,
+ CostKind);
+ assert((isa<ScalableVectorType>(Ty) && isa<ScalableVectorType>(CondTy)) &&
+ "Both vector needs to be scalable");
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ int LegalizationCost = 0;
+ if (LT.first > 1) {
+ Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
+ unsigned CmpOpcode =
+ Ty->isFPOrFPVectorTy() ? Instruction::FCmp : Instruction::ICmp;
+ LegalizationCost =
+ getCmpSelInstrCost(CmpOpcode, LegalVTy, LegalVTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind) +
+ getCmpSelInstrCost(Instruction::Select, LegalVTy, LegalVTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ LegalizationCost *= LT.first - 1;
+ }
+
+ return LegalizationCost + /*Cost of horizontal reduction*/ 2;
+}
+
+int AArch64TTIImpl::getArithmeticReductionCostSVE(
+ unsigned Opcode, VectorType *ValTy, bool IsPairwise,
+ TTI::TargetCostKind CostKind) {
+ assert(!IsPairwise && "Cannot be pairwise to continue");
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+ int LegalizationCost = 0;
+ if (LT.first > 1) {
+ Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
+ LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
+ LegalizationCost *= LT.first - 1;
+ }
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+ // Add the final reduction cost for the legal horizontal reduction
+ switch (ISD) {
+ case ISD::ADD:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::FADD:
+ return LegalizationCost + 2;
+ default:
+ // TODO: Return an invalid cost for the cases not supported by SVE once
+ // InstructionCost is used.
+ return 16;
+ }
+}
+
int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode,
VectorType *ValTy,
bool IsPairwiseForm,
TTI::TargetCostKind CostKind) {
+ if (isa<ScalableVectorType>(ValTy))
+ return getArithmeticReductionCostSVE(Opcode, ValTy, IsPairwiseForm,
+ CostKind);
if (IsPairwiseForm)
return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
CostKind);
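
The SVE reduction costs above follow one pattern: when the type splits into LT.first legal parts, combining the parts costs LT.first - 1 compare+select (or arithmetic) steps, and the final in-register horizontal reduction is charged a flat 2. A standalone sketch of the min/max variant under those assumptions:

#include <cassert>

static int minMaxReductionCostSketch(int NumParts, int CmpCost, int SelCost) {
  // Combine NumParts legal vectors pairwise, then do one horizontal reduction.
  int LegalizationCost = (NumParts > 1) ? (CmpCost + SelCost) * (NumParts - 1) : 0;
  return LegalizationCost + /*horizontal reduction*/ 2;
}

int main() {
  assert(minMaxReductionCostSketch(1, 1, 1) == 2); // already legal: just the reduction
  assert(minMaxReductionCostSketch(4, 1, 1) == 8); // 3 combine steps + reduction
  return 0;
}
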
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 1f029689a60e..7c9360ada92e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -74,7 +74,8 @@ public:
int getIntImmCost(int64_t Val);
int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind);
int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty, TTI::TargetCostKind CostKind);
+ Type *Ty, TTI::TargetCostKind CostKind,
+ Instruction *Inst = nullptr);
int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
Type *Ty, TTI::TargetCostKind CostKind);
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
@@ -96,6 +97,9 @@ public:
return 31;
}
+ unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind);
+
unsigned getRegisterBitWidth(bool Vector) const {
if (Vector) {
if (ST->hasSVE())
@@ -111,10 +115,21 @@ public:
return ST->getMinVectorRegisterBitWidth();
}
+ Optional<unsigned> getMaxVScale() const {
+ if (ST->hasSVE())
+ return AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock;
+ return BaseT::getMaxVScale();
+ }
+
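
getMaxVScale above caps vscale at the architectural SVE maximum divided by the 128-bit SVE granule. Assuming the usual values of those constants (2048 and 128, not restated in this header), that yields 16:

#include <cassert>

int main() {
  const unsigned SVEBitsPerBlock = 128;      // assumed value of AArch64::SVEBitsPerBlock
  const unsigned SVEMaxBitsPerVector = 2048; // assumed value of AArch64::SVEMaxBitsPerVector
  assert(SVEMaxBitsPerVector / SVEBitsPerBlock == 16); // maximum vscale
  return 0;
}
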
unsigned getMaxInterleaveFactor(unsigned VF);
+ unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+ const Value *Ptr, bool VariableMask,
+ Align Alignment, TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
- TTI::TargetCostKind CostKind,
+ TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
@@ -124,6 +139,14 @@ public:
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+ int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
+ bool IsPairwise, bool IsUnsigned,
+ TTI::TargetCostKind CostKind);
+
+ int getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy,
+ bool IsPairwiseForm,
+ TTI::TargetCostKind CostKind);
+
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
@@ -137,11 +160,13 @@ public:
int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ CmpInst::Predicate VecPred,
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const;
+ bool useNeonVector(const Type *Ty) const;
int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
unsigned AddressSpace,
@@ -166,6 +191,9 @@ public:
return false;
Type *Ty = cast<ScalableVectorType>(DataType)->getElementType();
+ if (Ty->isPointerTy())
+ return true;
+
if (Ty->isBFloatTy() || Ty->isHalfTy() ||
Ty->isFloatTy() || Ty->isDoubleTy())
return true;
@@ -213,28 +241,14 @@ public:
shouldConsiderAddressTypePromotion(const Instruction &I,
bool &AllowPromotionWithoutCommonHeader);
- bool shouldExpandReduction(const IntrinsicInst *II) const {
- switch (II->getIntrinsicID()) {
- case Intrinsic::experimental_vector_reduce_v2_fadd:
- case Intrinsic::experimental_vector_reduce_v2_fmul:
- // We don't have legalization support for ordered FP reductions.
- return !II->getFastMathFlags().allowReassoc();
-
- case Intrinsic::experimental_vector_reduce_fmax:
- case Intrinsic::experimental_vector_reduce_fmin:
- // Lowering asserts that there are no NaNs.
- return !II->getFastMathFlags().noNaNs();
-
- default:
- // Don't expand anything else, let legalization deal with it.
- return false;
- }
- }
+ bool shouldExpandReduction(const IntrinsicInst *II) const { return false; }
unsigned getGISelRematGlobalCost() const {
return 2;
}
+ bool supportsScalableVectors() const { return ST->hasSVE(); }
+
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index e72ae0e62cb7..96c50ff3f8d0 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "MCTargetDesc/AArch64InstPrinter.h"
#include "MCTargetDesc/AArch64MCExpr.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "MCTargetDesc/AArch64TargetStreamer.h"
@@ -158,8 +159,13 @@ private:
bool parseSymbolicImmVal(const MCExpr *&ImmVal);
bool parseNeonVectorList(OperandVector &Operands);
bool parseOptionalMulOperand(OperandVector &Operands);
+ bool parseKeywordOperand(OperandVector &Operands);
bool parseOperand(OperandVector &Operands, bool isCondCode,
bool invertCondCode);
+ bool parseImmExpr(int64_t &Out);
+ bool parseComma();
+ bool parseRegisterInRange(unsigned &Out, unsigned Base, unsigned First,
+ unsigned Last);
bool showMatchError(SMLoc Loc, unsigned ErrCode, uint64_t ErrorInfo,
OperandVector &Operands);
@@ -181,6 +187,31 @@ private:
bool parseDirectiveVariantPCS(SMLoc L);
+ bool parseDirectiveSEHAllocStack(SMLoc L);
+ bool parseDirectiveSEHPrologEnd(SMLoc L);
+ bool parseDirectiveSEHSaveR19R20X(SMLoc L);
+ bool parseDirectiveSEHSaveFPLR(SMLoc L);
+ bool parseDirectiveSEHSaveFPLRX(SMLoc L);
+ bool parseDirectiveSEHSaveReg(SMLoc L);
+ bool parseDirectiveSEHSaveRegX(SMLoc L);
+ bool parseDirectiveSEHSaveRegP(SMLoc L);
+ bool parseDirectiveSEHSaveRegPX(SMLoc L);
+ bool parseDirectiveSEHSaveLRPair(SMLoc L);
+ bool parseDirectiveSEHSaveFReg(SMLoc L);
+ bool parseDirectiveSEHSaveFRegX(SMLoc L);
+ bool parseDirectiveSEHSaveFRegP(SMLoc L);
+ bool parseDirectiveSEHSaveFRegPX(SMLoc L);
+ bool parseDirectiveSEHSetFP(SMLoc L);
+ bool parseDirectiveSEHAddFP(SMLoc L);
+ bool parseDirectiveSEHNop(SMLoc L);
+ bool parseDirectiveSEHSaveNext(SMLoc L);
+ bool parseDirectiveSEHEpilogStart(SMLoc L);
+ bool parseDirectiveSEHEpilogEnd(SMLoc L);
+ bool parseDirectiveSEHTrapFrame(SMLoc L);
+ bool parseDirectiveSEHMachineFrame(SMLoc L);
+ bool parseDirectiveSEHContext(SMLoc L);
+ bool parseDirectiveSEHClearUnwoundToCall(SMLoc L);
+
bool validateInstruction(MCInst &Inst, SMLoc &IDLoc,
SmallVectorImpl<SMLoc> &Loc);
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
@@ -200,6 +231,7 @@ private:
RegKind MatchKind);
OperandMatchResultTy tryParseOptionalShiftExtend(OperandVector &Operands);
OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands);
+ OperandMatchResultTy tryParseBarriernXSOperand(OperandVector &Operands);
OperandMatchResultTy tryParseMRSSystemRegister(OperandVector &Operands);
OperandMatchResultTy tryParseSysReg(OperandVector &Operands);
OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands);
@@ -226,6 +258,7 @@ private:
OperandMatchResultTy tryParseVectorList(OperandVector &Operands,
bool ExpectMatch = false);
OperandMatchResultTy tryParseSVEPattern(OperandVector &Operands);
+ OperandMatchResultTy tryParseGPR64x8(OperandVector &Operands);
public:
enum AArch64MatchResultTy {
@@ -238,7 +271,7 @@ public:
AArch64AsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
const MCInstrInfo &MII, const MCTargetOptions &Options)
: MCTargetAsmParser(Options, STI, MII) {
- IsILP32 = Options.getABIName() == "ilp32";
+ IsILP32 = STI.getTargetTriple().getEnvironment() == Triple::GNUILP32;
MCAsmParserExtension::Initialize(Parser);
MCStreamer &S = getParser().getStreamer();
if (S.getTargetStreamer() == nullptr)
@@ -371,6 +404,7 @@ private:
const char *Data;
unsigned Length;
unsigned Val; // Not the enum since not all values have names.
+ bool HasnXSModifier;
};
struct SysRegOp {
@@ -540,6 +574,11 @@ public:
return StringRef(Barrier.Data, Barrier.Length);
}
+ bool getBarriernXSModifier() const {
+ assert(Kind == k_Barrier && "Invalid access!");
+ return Barrier.HasnXSModifier;
+ }
+
unsigned getReg() const override {
assert(Kind == k_Register && "Invalid access!");
return Reg.RegNum;
@@ -711,7 +750,8 @@ public:
ELFRefKind == AArch64MCExpr::VK_GOTTPREL_LO12_NC ||
ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12 ||
ELFRefKind == AArch64MCExpr::VK_SECREL_LO12 ||
- ELFRefKind == AArch64MCExpr::VK_SECREL_HI12) {
+ ELFRefKind == AArch64MCExpr::VK_SECREL_HI12 ||
+ ELFRefKind == AArch64MCExpr::VK_GOT_PAGE_LO15) {
// Note that we don't range-check the addend. It's adjusted modulo page
// size when converted, so there is no "out of range" condition when using
// @pageoff.
@@ -857,7 +897,8 @@ public:
if (!isShiftedImm() && (!isImm() || !isa<MCConstantExpr>(getImm())))
return DiagnosticPredicateTy::NoMatch;
- bool IsByte = std::is_same<int8_t, std::make_signed_t<T>>::value;
+ bool IsByte = std::is_same<int8_t, std::make_signed_t<T>>::value ||
+ std::is_same<int8_t, T>::value;
if (auto ShiftedImm = getShiftedVal<8>())
if (!(IsByte && ShiftedImm->second) &&
AArch64_AM::isSVECpyImm<T>(uint64_t(ShiftedImm->first)
@@ -874,7 +915,8 @@ public:
if (!isShiftedImm() && (!isImm() || !isa<MCConstantExpr>(getImm())))
return DiagnosticPredicateTy::NoMatch;
- bool IsByte = std::is_same<int8_t, std::make_signed_t<T>>::value;
+ bool IsByte = std::is_same<int8_t, std::make_signed_t<T>>::value ||
+ std::is_same<int8_t, T>::value;
if (auto ShiftedImm = getShiftedVal<8>())
if (!(IsByte && ShiftedImm->second) &&
AArch64_AM::isSVEAddSubImm<T>(ShiftedImm->first
@@ -999,7 +1041,12 @@ public:
AArch64_AM::getFP64Imm(getFPImm().bitcastToAPInt()) != -1;
}
- bool isBarrier() const { return Kind == k_Barrier; }
+ bool isBarrier() const {
+ return Kind == k_Barrier && !getBarriernXSModifier();
+ }
+ bool isBarriernXS() const {
+ return Kind == k_Barrier && getBarriernXSModifier();
+ }
bool isSysReg() const { return Kind == k_SysReg; }
bool isMRSSystemRegister() const {
@@ -1126,6 +1173,12 @@ public:
AArch64MCRegisterClasses[AArch64::GPR32RegClassID].contains(Reg.RegNum);
}
+ bool isGPR64x8() const {
+ return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
+ AArch64MCRegisterClasses[AArch64::GPR64x8ClassRegClassID].contains(
+ Reg.RegNum);
+ }
+
bool isWSeqPair() const {
return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID].contains(
@@ -1689,6 +1742,11 @@ public:
Inst.addOperand(MCOperand::createImm(getBarrier()));
}
+ void addBarriernXSOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createImm(getBarrier()));
+ }
+
void addMRSSystemRegisterOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
@@ -1924,11 +1982,13 @@ public:
static std::unique_ptr<AArch64Operand> CreateBarrier(unsigned Val,
StringRef Str,
SMLoc S,
- MCContext &Ctx) {
+ MCContext &Ctx,
+ bool HasnXSModifier) {
auto Op = std::make_unique<AArch64Operand>(k_Barrier, Ctx);
Op->Barrier.Val = Val;
Op->Barrier.Data = Str.data();
Op->Barrier.Length = Str.size();
+ Op->Barrier.HasnXSModifier = HasnXSModifier;
Op->StartLoc = S;
Op->EndLoc = S;
return Op;
@@ -2073,14 +2133,14 @@ void AArch64Operand::print(raw_ostream &OS) const {
case k_PSBHint:
OS << getPSBHintName();
break;
+ case k_BTIHint:
+ OS << getBTIHintName();
+ break;
case k_Register:
OS << "<register " << getReg() << ">";
if (!getShiftExtendAmount() && !hasShiftExtendAmount())
break;
LLVM_FALLTHROUGH;
- case k_BTIHint:
- OS << getBTIHintName();
- break;
case k_ShiftExtend:
OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #"
<< getShiftExtendAmount();
@@ -2510,6 +2570,7 @@ AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE &&
ELFRefKind != AArch64MCExpr::VK_ABS_PAGE_NC &&
ELFRefKind != AArch64MCExpr::VK_GOT_PAGE &&
+ ELFRefKind != AArch64MCExpr::VK_GOT_PAGE_LO15 &&
ELFRefKind != AArch64MCExpr::VK_GOTTPREL_PAGE &&
ELFRefKind != AArch64MCExpr::VK_TLSDESC_PAGE) {
// The operand must be an @page or @gotpage qualified symbolref.
@@ -2843,6 +2904,7 @@ static const struct Extension {
{"predres", {AArch64::FeaturePredRes}},
{"ccdp", {AArch64::FeatureCacheDeepPersist}},
{"mte", {AArch64::FeatureMTE}},
+ {"memtag", {AArch64::FeatureMTE}},
{"tlb-rmi", {AArch64::FeatureTLB_RMI}},
{"pan-rwv", {AArch64::FeaturePAN_RWV}},
{"ccpp", {AArch64::FeatureCCPP}},
@@ -2853,6 +2915,10 @@ static const struct Extension {
{"sve2-sm4", {AArch64::FeatureSVE2SM4}},
{"sve2-sha3", {AArch64::FeatureSVE2SHA3}},
{"sve2-bitperm", {AArch64::FeatureSVE2BitPerm}},
+ {"ls64", {AArch64::FeatureLS64}},
+ {"xs", {AArch64::FeatureXS}},
+ {"pauth", {AArch64::FeaturePAuth}},
+ {"flagm", {AArch64::FeatureFlagM}},
// FIXME: Unsupported extensions
{"pan", {}},
{"lor", {}},
@@ -2873,15 +2939,16 @@ static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) {
Str += "ARMv8.5a";
else if (FBS[AArch64::HasV8_6aOps])
Str += "ARMv8.6a";
+ else if (FBS[AArch64::HasV8_7aOps])
+ Str += "ARMv8.7a";
else {
- auto ext = std::find_if(std::begin(ExtensionMap),
- std::end(ExtensionMap),
- [&](const Extension& e)
+ SmallVector<std::string, 2> ExtMatches;
+ for (const auto& Ext : ExtensionMap) {
// Use & in case multiple features are enabled
- { return (FBS & e.Features) != FeatureBitset(); }
- );
-
- Str += ext != std::end(ExtensionMap) ? ext->Name : "(unknown)";
+ if ((FBS & Ext.Features) != FeatureBitset())
+ ExtMatches.push_back(Ext.Name);
+ }
+ Str += !ExtMatches.empty() ? llvm::join(ExtMatches, ", ") : "(unknown)";
}
}
@@ -2926,7 +2993,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
if (!IC)
return TokError("invalid operand for IC instruction");
else if (!IC->haveFeatures(getSTI().getFeatureBits())) {
- std::string Str("IC " + std::string(IC->Name) + " requires ");
+ std::string Str("IC " + std::string(IC->Name) + " requires: ");
setRequiredFeatureString(IC->getRequiredFeatures(), Str);
return TokError(Str.c_str());
}
@@ -2936,7 +3003,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
if (!DC)
return TokError("invalid operand for DC instruction");
else if (!DC->haveFeatures(getSTI().getFeatureBits())) {
- std::string Str("DC " + std::string(DC->Name) + " requires ");
+ std::string Str("DC " + std::string(DC->Name) + " requires: ");
setRequiredFeatureString(DC->getRequiredFeatures(), Str);
return TokError(Str.c_str());
}
@@ -2946,7 +3013,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
if (!AT)
return TokError("invalid operand for AT instruction");
else if (!AT->haveFeatures(getSTI().getFeatureBits())) {
- std::string Str("AT " + std::string(AT->Name) + " requires ");
+ std::string Str("AT " + std::string(AT->Name) + " requires: ");
setRequiredFeatureString(AT->getRequiredFeatures(), Str);
return TokError(Str.c_str());
}
@@ -2956,7 +3023,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
if (!TLBI)
return TokError("invalid operand for TLBI instruction");
else if (!TLBI->haveFeatures(getSTI().getFeatureBits())) {
- std::string Str("TLBI " + std::string(TLBI->Name) + " requires ");
+ std::string Str("TLBI " + std::string(TLBI->Name) + " requires: ");
setRequiredFeatureString(TLBI->getRequiredFeatures(), Str);
return TokError(Str.c_str());
}
@@ -2967,7 +3034,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
return TokError("invalid operand for prediction restriction instruction");
else if (!PRCTX->haveFeatures(getSTI().getFeatureBits())) {
std::string Str(
- Mnemonic.upper() + std::string(PRCTX->Name) + " requires ");
+ Mnemonic.upper() + std::string(PRCTX->Name) + " requires: ");
setRequiredFeatureString(PRCTX->getRequiredFeatures(), Str);
return TokError(Str.c_str());
}
@@ -3011,11 +3078,11 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
if (Mnemonic == "tsb" && Tok.isNot(AsmToken::Identifier)) {
TokError("'csync' operand expected");
return MatchOperand_ParseFail;
- // Can be either a #imm style literal or an option name
} else if (parseOptionalToken(AsmToken::Hash) || Tok.is(AsmToken::Integer)) {
// Immediate operand.
const MCExpr *ImmVal;
SMLoc ExprLoc = getLoc();
+ AsmToken IntTok = Tok;
if (getParser().parseExpression(ImmVal))
return MatchOperand_ParseFail;
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
@@ -3023,13 +3090,22 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
Error(ExprLoc, "immediate value expected for barrier operand");
return MatchOperand_ParseFail;
}
- if (MCE->getValue() < 0 || MCE->getValue() > 15) {
+ int64_t Value = MCE->getValue();
+ if (Mnemonic == "dsb" && Value > 15) {
+ // No match here, but the operand may still be matched by the nXS variant.
+ // Deliberately do not unlex the optional '#': it is not needed to
+ // characterize an integer immediate.
+ Parser.getLexer().UnLex(IntTok);
+ return MatchOperand_NoMatch;
+ }
+ if (Value < 0 || Value > 15) {
Error(ExprLoc, "barrier operand out of range");
return MatchOperand_ParseFail;
}
- auto DB = AArch64DB::lookupDBByEncoding(MCE->getValue());
- Operands.push_back(AArch64Operand::CreateBarrier(
- MCE->getValue(), DB ? DB->Name : "", ExprLoc, getContext()));
+ auto DB = AArch64DB::lookupDBByEncoding(Value);
+ Operands.push_back(AArch64Operand::CreateBarrier(Value, DB ? DB->Name : "",
+ ExprLoc, getContext(),
+ false /*hasnXSModifier*/));
return MatchOperand_Success;
}
@@ -3038,9 +3114,10 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- auto TSB = AArch64TSB::lookupTSBByName(Tok.getString());
+ StringRef Operand = Tok.getString();
+ auto TSB = AArch64TSB::lookupTSBByName(Operand);
+ auto DB = AArch64DB::lookupDBByName(Operand);
// The only valid named option for ISB is 'sy'
- auto DB = AArch64DB::lookupDBByName(Tok.getString());
if (Mnemonic == "isb" && (!DB || DB->Encoding != AArch64DB::sy)) {
TokError("'sy' or #imm operand expected");
return MatchOperand_ParseFail;
@@ -3049,12 +3126,73 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
TokError("'csync' operand expected");
return MatchOperand_ParseFail;
} else if (!DB && !TSB) {
+ if (Mnemonic == "dsb") {
+ // No match here, but the operand may still be matched by the nXS variant.
+ return MatchOperand_NoMatch;
+ }
TokError("invalid barrier option name");
return MatchOperand_ParseFail;
}
Operands.push_back(AArch64Operand::CreateBarrier(
- DB ? DB->Encoding : TSB->Encoding, Tok.getString(), getLoc(), getContext()));
+ DB ? DB->Encoding : TSB->Encoding, Tok.getString(), getLoc(),
+ getContext(), false /*hasnXSModifier*/));
+ Parser.Lex(); // Consume the option
+
+ return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+AArch64AsmParser::tryParseBarriernXSOperand(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ const AsmToken &Tok = Parser.getTok();
+
+ assert(Mnemonic == "dsb" && "Instruction does not accept nXS operands");
+ if (Mnemonic != "dsb")
+ return MatchOperand_ParseFail;
+
+ if (parseOptionalToken(AsmToken::Hash) || Tok.is(AsmToken::Integer)) {
+ // Immediate operand.
+ const MCExpr *ImmVal;
+ SMLoc ExprLoc = getLoc();
+ if (getParser().parseExpression(ImmVal))
+ return MatchOperand_ParseFail;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+ if (!MCE) {
+ Error(ExprLoc, "immediate value expected for barrier operand");
+ return MatchOperand_ParseFail;
+ }
+ int64_t Value = MCE->getValue();
+ // v8.7-A DSB in the nXS variant accepts only the following immediate
+ // values: 16, 20, 24, 28.
+ if (Value != 16 && Value != 20 && Value != 24 && Value != 28) {
+ Error(ExprLoc, "barrier operand out of range");
+ return MatchOperand_ParseFail;
+ }
+ auto DB = AArch64DBnXS::lookupDBnXSByImmValue(Value);
+ Operands.push_back(AArch64Operand::CreateBarrier(DB->Encoding, DB->Name,
+ ExprLoc, getContext(),
+ true /*hasnXSModifier*/));
+ return MatchOperand_Success;
+ }
+
+ if (Tok.isNot(AsmToken::Identifier)) {
+ TokError("invalid operand for instruction");
+ return MatchOperand_ParseFail;
+ }
+
+ StringRef Operand = Tok.getString();
+ auto DB = AArch64DBnXS::lookupDBnXSByName(Operand);
+
+ if (!DB) {
+ TokError("invalid barrier option name");
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(
+ AArch64Operand::CreateBarrier(DB->Encoding, Tok.getString(), getLoc(),
+ getContext(), true /*hasnXSModifier*/));
Parser.Lex(); // Consume the option
return MatchOperand_Success;
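
Taken together with the plain-DSB path earlier in this parser, the immediate handling works out as: 0-15 select the ordinary DSB encodings, while 16, 20, 24 and 28 are only meaningful for the v8.7-A DSB nXS form, which is why the plain parser returns NoMatch for them instead of an error. A small sketch of that classification (the enum and helper are illustrative only, not part of the parser):

#include <cassert>

enum class BarrierForm { Plain, nXS, Invalid };

static BarrierForm classifyDsbImm(long long Value) {
  if (Value >= 0 && Value <= 15)
    return BarrierForm::Plain; // ordinary DSB immediate
  if (Value == 16 || Value == 20 || Value == 24 || Value == 28)
    return BarrierForm::nXS;   // only valid with the nXS modifier
  return BarrierForm::Invalid;
}

int main() {
  assert(classifyDsbImm(15) == BarrierForm::Plain);
  assert(classifyDsbImm(20) == BarrierForm::nXS);
  assert(classifyDsbImm(17) == BarrierForm::Invalid);
  return 0;
}
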
@@ -3300,6 +3438,7 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
.Case("tprel_lo12_nc", AArch64MCExpr::VK_TPREL_LO12_NC)
.Case("tlsdesc_lo12", AArch64MCExpr::VK_TLSDESC_LO12)
.Case("got", AArch64MCExpr::VK_GOT_PAGE)
+ .Case("gotpage_lo15", AArch64MCExpr::VK_GOT_PAGE_LO15)
.Case("got_lo12", AArch64MCExpr::VK_GOT_LO12)
.Case("gottprel", AArch64MCExpr::VK_GOTTPREL_PAGE)
.Case("gottprel_lo12", AArch64MCExpr::VK_GOTTPREL_LO12_NC)
@@ -3568,6 +3707,17 @@ bool AArch64AsmParser::parseOptionalMulOperand(OperandVector &Operands) {
return Error(getLoc(), "expected 'vl' or '#<imm>'");
}
+bool AArch64AsmParser::parseKeywordOperand(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ auto Tok = Parser.getTok();
+ if (Tok.isNot(AsmToken::Identifier))
+ return true;
+ Operands.push_back(AArch64Operand::CreateToken(Tok.getString(), false,
+ Tok.getLoc(), getContext()));
+ Parser.Lex();
+ return false;
+}
+
/// parseOperand - Parse an arm instruction operand. For now this parses the
/// operand regardless of the mnemonic.
bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
@@ -3632,6 +3782,11 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
if (GotShift != MatchOperand_NoMatch)
return GotShift;
+ // If this is a two-word mnemonic, parse its special keyword
+ // operand as an identifier.
+ if (Mnemonic == "brb")
+ return parseKeywordOperand(Operands);
+
// This was not a register so parse other operands that start with an
// identifier (like labels) as expressions and create them as immediates.
const MCExpr *IdVal;
@@ -3740,6 +3895,66 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
}
}
+bool AArch64AsmParser::parseImmExpr(int64_t &Out) {
+ const MCExpr *Expr = nullptr;
+ SMLoc L = getLoc();
+ if (check(getParser().parseExpression(Expr), L, "expected expression"))
+ return true;
+ const MCConstantExpr *Value = dyn_cast_or_null<MCConstantExpr>(Expr);
+ if (check(!Value, L, "expected constant expression"))
+ return true;
+ Out = Value->getValue();
+ return false;
+}
+
+bool AArch64AsmParser::parseComma() {
+ if (check(getParser().getTok().isNot(AsmToken::Comma), getLoc(),
+ "expected comma"))
+ return true;
+ // Eat the comma
+ getParser().Lex();
+ return false;
+}
+
+bool AArch64AsmParser::parseRegisterInRange(unsigned &Out, unsigned Base,
+ unsigned First, unsigned Last) {
+ unsigned Reg;
+ SMLoc Start, End;
+ if (check(ParseRegister(Reg, Start, End), getLoc(), "expected register"))
+ return true;
+
+ // Special handling for FP and LR; they aren't linearly after x28 in
+ // the registers enum.
+ unsigned RangeEnd = Last;
+ if (Base == AArch64::X0) {
+ if (Last == AArch64::FP) {
+ RangeEnd = AArch64::X28;
+ if (Reg == AArch64::FP) {
+ Out = 29;
+ return false;
+ }
+ }
+ if (Last == AArch64::LR) {
+ RangeEnd = AArch64::X28;
+ if (Reg == AArch64::FP) {
+ Out = 29;
+ return false;
+ } else if (Reg == AArch64::LR) {
+ Out = 30;
+ return false;
+ }
+ }
+ }
+
+ if (check(Reg < First || Reg > RangeEnd, Start,
+ Twine("expected register in range ") +
+ AArch64InstPrinter::getRegisterName(First) + " to " +
+ AArch64InstPrinter::getRegisterName(Last)))
+ return true;
+ Out = Reg - Base;
+ return false;
+}
+
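
The mapping parseRegisterInRange produces for the SEH directives is simply the register's index relative to the base, with fp and lr pinned to 29 and 30 because they do not sit right after x28 in the register enum. A rough stand-in that works on register names instead of the real register lookup (purely illustrative):

#include <cassert>
#include <string>

static int sehRegNumberSketch(const std::string &Name) {
  if (Name == "fp")
    return 29;
  if (Name == "lr")
    return 30;
  if (Name.size() > 1 && Name[0] == 'x')
    return std::stoi(Name.substr(1)); // x0..x28 map to their own index
  return -1;
}

int main() {
  assert(sehRegNumberSketch("x19") == 19);
  assert(sehRegNumberSketch("fp") == 29);
  assert(sehRegNumberSketch("lr") == 30);
  return 0;
}
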
bool AArch64AsmParser::regsEqual(const MCParsedAsmOperand &Op1,
const MCParsedAsmOperand &Op2) const {
auto &AOp1 = static_cast<const AArch64Operand&>(Op1);
@@ -5058,6 +5273,7 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
const MCObjectFileInfo::Environment Format =
getContext().getObjectFileInfo()->getObjectFileType();
bool IsMachO = Format == MCObjectFileInfo::IsMachO;
+ bool IsCOFF = Format == MCObjectFileInfo::IsCOFF;
auto IDVal = DirectiveID.getIdentifier().lower();
SMLoc Loc = DirectiveID.getLoc();
@@ -5086,6 +5302,57 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
parseDirectiveLOH(IDVal, Loc);
else
return true;
+ } else if (IsCOFF) {
+ if (IDVal == ".seh_stackalloc")
+ parseDirectiveSEHAllocStack(Loc);
+ else if (IDVal == ".seh_endprologue")
+ parseDirectiveSEHPrologEnd(Loc);
+ else if (IDVal == ".seh_save_r19r20_x")
+ parseDirectiveSEHSaveR19R20X(Loc);
+ else if (IDVal == ".seh_save_fplr")
+ parseDirectiveSEHSaveFPLR(Loc);
+ else if (IDVal == ".seh_save_fplr_x")
+ parseDirectiveSEHSaveFPLRX(Loc);
+ else if (IDVal == ".seh_save_reg")
+ parseDirectiveSEHSaveReg(Loc);
+ else if (IDVal == ".seh_save_reg_x")
+ parseDirectiveSEHSaveRegX(Loc);
+ else if (IDVal == ".seh_save_regp")
+ parseDirectiveSEHSaveRegP(Loc);
+ else if (IDVal == ".seh_save_regp_x")
+ parseDirectiveSEHSaveRegPX(Loc);
+ else if (IDVal == ".seh_save_lrpair")
+ parseDirectiveSEHSaveLRPair(Loc);
+ else if (IDVal == ".seh_save_freg")
+ parseDirectiveSEHSaveFReg(Loc);
+ else if (IDVal == ".seh_save_freg_x")
+ parseDirectiveSEHSaveFRegX(Loc);
+ else if (IDVal == ".seh_save_fregp")
+ parseDirectiveSEHSaveFRegP(Loc);
+ else if (IDVal == ".seh_save_fregp_x")
+ parseDirectiveSEHSaveFRegPX(Loc);
+ else if (IDVal == ".seh_set_fp")
+ parseDirectiveSEHSetFP(Loc);
+ else if (IDVal == ".seh_add_fp")
+ parseDirectiveSEHAddFP(Loc);
+ else if (IDVal == ".seh_nop")
+ parseDirectiveSEHNop(Loc);
+ else if (IDVal == ".seh_save_next")
+ parseDirectiveSEHSaveNext(Loc);
+ else if (IDVal == ".seh_startepilogue")
+ parseDirectiveSEHEpilogStart(Loc);
+ else if (IDVal == ".seh_endepilogue")
+ parseDirectiveSEHEpilogEnd(Loc);
+ else if (IDVal == ".seh_trap_frame")
+ parseDirectiveSEHTrapFrame(Loc);
+ else if (IDVal == ".seh_pushframe")
+ parseDirectiveSEHMachineFrame(Loc);
+ else if (IDVal == ".seh_context")
+ parseDirectiveSEHContext(Loc);
+ else if (IDVal == ".seh_clear_unwound_to_call")
+ parseDirectiveSEHClearUnwoundToCall(Loc);
+ else
+ return true;
} else
return true;
return false;
@@ -5093,12 +5360,8 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
static void ExpandCryptoAEK(AArch64::ArchKind ArchKind,
SmallVector<StringRef, 4> &RequestedExtensions) {
- const bool NoCrypto =
- (std::find(RequestedExtensions.begin(), RequestedExtensions.end(),
- "nocrypto") != std::end(RequestedExtensions));
- const bool Crypto =
- (std::find(RequestedExtensions.begin(), RequestedExtensions.end(),
- "crypto") != std::end(RequestedExtensions));
+ const bool NoCrypto = llvm::is_contained(RequestedExtensions, "nocrypto");
+ const bool Crypto = llvm::is_contained(RequestedExtensions, "crypto");
if (!NoCrypto && Crypto) {
switch (ArchKind) {
@@ -5114,6 +5377,8 @@ static void ExpandCryptoAEK(AArch64::ArchKind ArchKind,
case AArch64::ArchKind::ARMV8_4A:
case AArch64::ArchKind::ARMV8_5A:
case AArch64::ArchKind::ARMV8_6A:
+ case AArch64::ArchKind::ARMV8_7A:
+ case AArch64::ArchKind::ARMV8R:
RequestedExtensions.push_back("sm4");
RequestedExtensions.push_back("sha3");
RequestedExtensions.push_back("sha2");
@@ -5134,6 +5399,7 @@ static void ExpandCryptoAEK(AArch64::ArchKind ArchKind,
case AArch64::ArchKind::ARMV8_4A:
case AArch64::ArchKind::ARMV8_5A:
case AArch64::ArchKind::ARMV8_6A:
+ case AArch64::ArchKind::ARMV8_7A:
RequestedExtensions.push_back("nosm4");
RequestedExtensions.push_back("nosha3");
RequestedExtensions.push_back("nosha2");
@@ -5167,7 +5433,8 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) {
MCSubtargetInfo &STI = copySTI();
std::vector<std::string> ArchFeatures(AArch64Features.begin(), AArch64Features.end());
- STI.setDefaultFeatures("generic", join(ArchFeatures.begin(), ArchFeatures.end(), ","));
+ STI.setDefaultFeatures("generic", /*TuneCPU*/ "generic",
+ join(ArchFeatures.begin(), ArchFeatures.end(), ","));
SmallVector<StringRef, 4> RequestedExtensions;
if (!ExtensionString.empty())
@@ -5269,7 +5536,7 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
}
MCSubtargetInfo &STI = copySTI();
- STI.setDefaultFeatures(CPU, "");
+ STI.setDefaultFeatures(CPU, /*TuneCPU*/ CPU, "");
CurLoc = incrementLoc(CurLoc, CPU.size());
ExpandCryptoAEK(llvm::AArch64::getCPUArchKind(CPU), RequestedExtensions);
@@ -5537,6 +5804,238 @@ bool AArch64AsmParser::parseDirectiveVariantPCS(SMLoc L) {
return false;
}
+/// parseDirectiveSEHAllocStack
+/// ::= .seh_stackalloc
+bool AArch64AsmParser::parseDirectiveSEHAllocStack(SMLoc L) {
+ int64_t Size;
+ if (parseImmExpr(Size))
+ return true;
+ getTargetStreamer().EmitARM64WinCFIAllocStack(Size);
+ return false;
+}
+
+/// parseDirectiveSEHPrologEnd
+/// ::= .seh_endprologue
+bool AArch64AsmParser::parseDirectiveSEHPrologEnd(SMLoc L) {
+ getTargetStreamer().EmitARM64WinCFIPrologEnd();
+ return false;
+}
+
+/// parseDirectiveSEHSaveR19R20X
+/// ::= .seh_save_r19r20_x
+bool AArch64AsmParser::parseDirectiveSEHSaveR19R20X(SMLoc L) {
+ int64_t Offset;
+ if (parseImmExpr(Offset))
+ return true;
+ getTargetStreamer().EmitARM64WinCFISaveR19R20X(Offset);
+ return false;
+}
+
+/// parseDirectiveSEHSaveFPLR
+/// ::= .seh_save_fplr
+bool AArch64AsmParser::parseDirectiveSEHSaveFPLR(SMLoc L) {
+ int64_t Offset;
+ if (parseImmExpr(Offset))
+ return true;
+ getTargetStreamer().EmitARM64WinCFISaveFPLR(Offset);
+ return false;
+}
+
+/// parseDirectiveSEHSaveFPLRX
+/// ::= .seh_save_fplr_x
+bool AArch64AsmParser::parseDirectiveSEHSaveFPLRX(SMLoc L) {
+ int64_t Offset;
+ if (parseImmExpr(Offset))
+ return true;
+ getTargetStreamer().EmitARM64WinCFISaveFPLRX(Offset);
+ return false;
+}
+
+/// parseDirectiveSEHSaveReg
+/// ::= .seh_save_reg
+bool AArch64AsmParser::parseDirectiveSEHSaveReg(SMLoc L) {
+ unsigned Reg;
+ int64_t Offset;
+ if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::LR) ||
+ parseComma() || parseImmExpr(Offset))
+ return true;
+ getTargetStreamer().EmitARM64WinCFISaveReg(Reg, Offset);
+ return false;
+}
+
+/// parseDirectiveSEHSaveRegX
+/// ::= .seh_save_reg_x
+bool AArch64AsmParser::parseDirectiveSEHSaveRegX(SMLoc L) {
+ unsigned Reg;
+ int64_t Offset;
+ if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::LR) ||
+ parseComma() || parseImmExpr(Offset))
+ return true;
+ getTargetStreamer().EmitARM64WinCFISaveRegX(Reg, Offset);
+ return false;
+}
+
+/// parseDirectiveSEHSaveRegP
+/// ::= .seh_save_regp
+bool AArch64AsmParser::parseDirectiveSEHSaveRegP(SMLoc L) {
+ unsigned Reg;
+ int64_t Offset;
+ if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::FP) ||
+ parseComma() || parseImmExpr(Offset))
+ return true;
+ getTargetStreamer().EmitARM64WinCFISaveRegP(Reg, Offset);
+ return false;
+}
+
+/// parseDirectiveSEHSaveRegPX
+/// ::= .seh_save_regp_x
+bool AArch64AsmParser::parseDirectiveSEHSaveRegPX(SMLoc L) {
+ unsigned Reg;
+ int64_t Offset;
+ if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::FP) ||
+ parseComma() || parseImmExpr(Offset))
+ return true;
+ getTargetStreamer().EmitARM64WinCFISaveRegPX(Reg, Offset);
+ return false;
+}
+
+/// parseDirectiveSEHSaveLRPair
+/// ::= .seh_save_lrpair
+bool AArch64AsmParser::parseDirectiveSEHSaveLRPair(SMLoc L) {
+ unsigned Reg;
+ int64_t Offset;
+ L = getLoc();
+ if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::LR) ||
+ parseComma() || parseImmExpr(Offset))
+ return true;
+ if (check(((Reg - 19) % 2 != 0), L,
+ "expected register with even offset from x19"))
+ return true;
+ getTargetStreamer().EmitARM64WinCFISaveLRPair(Reg, Offset);
+ return false;
+}
+
+/// parseDirectiveSEHSaveFReg
+/// ::= .seh_save_freg
+bool AArch64AsmParser::parseDirectiveSEHSaveFReg(SMLoc L) {
+ unsigned Reg;
+ int64_t Offset;
+ if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D15) ||
+ parseComma() || parseImmExpr(Offset))
+ return true;
+ getTargetStreamer().EmitARM64WinCFISaveFReg(Reg, Offset);
+ return false;
+}
+
+/// parseDirectiveSEHSaveFRegX
+/// ::= .seh_save_freg_x
+bool AArch64AsmParser::parseDirectiveSEHSaveFRegX(SMLoc L) {
+ unsigned Reg;
+ int64_t Offset;
+ if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D15) ||
+ parseComma() || parseImmExpr(Offset))
+ return true;
+ getTargetStreamer().EmitARM64WinCFISaveFRegX(Reg, Offset);
+ return false;
+}
+
+/// parseDirectiveSEHSaveFRegP
+/// ::= .seh_save_fregp
+bool AArch64AsmParser::parseDirectiveSEHSaveFRegP(SMLoc L) {
+ unsigned Reg;
+ int64_t Offset;
+ if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D14) ||
+ parseComma() || parseImmExpr(Offset))
+ return true;
+ getTargetStreamer().EmitARM64WinCFISaveFRegP(Reg, Offset);
+ return false;
+}
+
+/// parseDirectiveSEHSaveFRegPX
+/// ::= .seh_save_fregp_x
+bool AArch64AsmParser::parseDirectiveSEHSaveFRegPX(SMLoc L) {
+ unsigned Reg;
+ int64_t Offset;
+ if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D14) ||
+ parseComma() || parseImmExpr(Offset))
+ return true;
+ getTargetStreamer().EmitARM64WinCFISaveFRegPX(Reg, Offset);
+ return false;
+}
+
+/// parseDirectiveSEHSetFP
+/// ::= .seh_set_fp
+bool AArch64AsmParser::parseDirectiveSEHSetFP(SMLoc L) {
+ getTargetStreamer().EmitARM64WinCFISetFP();
+ return false;
+}
+
+/// parseDirectiveSEHAddFP
+/// ::= .seh_add_fp
+bool AArch64AsmParser::parseDirectiveSEHAddFP(SMLoc L) {
+ int64_t Size;
+ if (parseImmExpr(Size))
+ return true;
+ getTargetStreamer().EmitARM64WinCFIAddFP(Size);
+ return false;
+}
+
+/// parseDirectiveSEHNop
+/// ::= .seh_nop
+bool AArch64AsmParser::parseDirectiveSEHNop(SMLoc L) {
+ getTargetStreamer().EmitARM64WinCFINop();
+ return false;
+}
+
+/// parseDirectiveSEHSaveNext
+/// ::= .seh_save_next
+bool AArch64AsmParser::parseDirectiveSEHSaveNext(SMLoc L) {
+ getTargetStreamer().EmitARM64WinCFISaveNext();
+ return false;
+}
+
+/// parseDirectiveSEHEpilogStart
+/// ::= .seh_startepilogue
+bool AArch64AsmParser::parseDirectiveSEHEpilogStart(SMLoc L) {
+ getTargetStreamer().EmitARM64WinCFIEpilogStart();
+ return false;
+}
+
+/// parseDirectiveSEHEpilogEnd
+/// ::= .seh_endepilogue
+bool AArch64AsmParser::parseDirectiveSEHEpilogEnd(SMLoc L) {
+ getTargetStreamer().EmitARM64WinCFIEpilogEnd();
+ return false;
+}
+
+/// parseDirectiveSEHTrapFrame
+/// ::= .seh_trap_frame
+bool AArch64AsmParser::parseDirectiveSEHTrapFrame(SMLoc L) {
+ getTargetStreamer().EmitARM64WinCFITrapFrame();
+ return false;
+}
+
+/// parseDirectiveSEHMachineFrame
+/// ::= .seh_pushframe
+bool AArch64AsmParser::parseDirectiveSEHMachineFrame(SMLoc L) {
+ getTargetStreamer().EmitARM64WinCFIMachineFrame();
+ return false;
+}
+
+/// parseDirectiveSEHContext
+/// ::= .seh_context
+bool AArch64AsmParser::parseDirectiveSEHContext(SMLoc L) {
+ getTargetStreamer().EmitARM64WinCFIContext();
+ return false;
+}
+
+/// parseDirectiveSEHClearUnwoundToCall
+/// ::= .seh_clear_unwound_to_call
+bool AArch64AsmParser::parseDirectiveSEHClearUnwoundToCall(SMLoc L) {
+ getTargetStreamer().EmitARM64WinCFIClearUnwoundToCall();
+ return false;
+}
+
bool
AArch64AsmParser::classifySymbolRef(const MCExpr *Expr,
AArch64MCExpr::VariantKind &ELFRefKind,
@@ -5824,3 +6323,26 @@ AArch64AsmParser::tryParseSVEPattern(OperandVector &Operands) {
return MatchOperand_Success;
}
+
+OperandMatchResultTy
+AArch64AsmParser::tryParseGPR64x8(OperandVector &Operands) {
+ SMLoc SS = getLoc();
+
+ unsigned XReg;
+ if (tryParseScalarRegister(XReg) != MatchOperand_Success)
+ return MatchOperand_NoMatch;
+
+ MCContext &ctx = getContext();
+ const MCRegisterInfo *RI = ctx.getRegisterInfo();
+ int X8Reg = RI->getMatchingSuperReg(
+ XReg, AArch64::x8sub_0,
+ &AArch64MCRegisterClasses[AArch64::GPR64x8ClassRegClassID]);
+ if (!X8Reg) {
+ Error(SS, "expected an even-numbered x-register in the range [x0,x22]");
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(
+ AArch64Operand::CreateReg(X8Reg, RegKind::Scalar, SS, getLoc(), ctx));
+ return MatchOperand_Success;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 1ff4abb34054..dca76f8457fe 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -62,6 +62,10 @@ static DecodeStatus DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeGPR64x8ClassRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst,
unsigned RegNo, uint64_t Address,
const void *Decoder);
@@ -267,8 +271,16 @@ DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
uint32_t Insn =
(Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0);
- // Calling the auto-generated decoder function.
- return decodeInstruction(DecoderTable32, MI, Insn, Address, this, STI);
+ const uint8_t *Tables[] = {DecoderTable32, DecoderTableFallback32};
+
+ for (auto Table : Tables) {
+ DecodeStatus Result =
+ decodeInstruction(Table, MI, Insn, Address, this, STI);
+ if (Result != MCDisassembler::Fail)
+ return Result;
+ }
+
+ return MCDisassembler::Fail;
}
static MCSymbolizer *
@@ -449,6 +461,35 @@ static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo,
return Success;
}
+static const unsigned GPR64x8DecoderTable[] = {
+ AArch64::X0_X1_X2_X3_X4_X5_X6_X7,
+ AArch64::X2_X3_X4_X5_X6_X7_X8_X9,
+ AArch64::X4_X5_X6_X7_X8_X9_X10_X11,
+ AArch64::X6_X7_X8_X9_X10_X11_X12_X13,
+ AArch64::X8_X9_X10_X11_X12_X13_X14_X15,
+ AArch64::X10_X11_X12_X13_X14_X15_X16_X17,
+ AArch64::X12_X13_X14_X15_X16_X17_X18_X19,
+ AArch64::X14_X15_X16_X17_X18_X19_X20_X21,
+ AArch64::X16_X17_X18_X19_X20_X21_X22_X23,
+ AArch64::X18_X19_X20_X21_X22_X23_X24_X25,
+ AArch64::X20_X21_X22_X23_X24_X25_X26_X27,
+ AArch64::X22_X23_X24_X25_X26_X27_X28_FP,
+};
+
+static DecodeStatus DecodeGPR64x8ClassRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 22)
+ return Fail;
+ if (RegNo & 1)
+ return Fail;
+
+ unsigned Register = GPR64x8DecoderTable[RegNo >> 1];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return Success;
+}
+
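
The decoder above accepts only even register numbers up to x22 as the start of an 8-register tuple and indexes the table by RegNo / 2. A standalone check of that rule (the helper is illustrative, returning -1 where the decoder returns Fail):

#include <cassert>

static int gpr64x8TableIndex(unsigned RegNo) {
  if (RegNo > 22 || (RegNo & 1))
    return -1;      // decode failure: odd or out-of-range start register
  return RegNo >> 1; // index into GPR64x8DecoderTable
}

int main() {
  assert(gpr64x8TableIndex(0) == 0);   // X0_X1_..._X7
  assert(gpr64x8TableIndex(22) == 11); // X22_..._X28_FP
  assert(gpr64x8TableIndex(23) == -1); // odd start register: fail
  assert(gpr64x8TableIndex(24) == -1); // out of range: fail
  return 0;
}
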
static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
const void *Decoder) {
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 4832ae8f415f..0f8b1d6584b1 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -52,10 +52,10 @@ AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI)
: CallLowering(&TLI) {}
namespace {
-struct IncomingArgHandler : public CallLowering::ValueHandler {
+struct IncomingArgHandler : public CallLowering::IncomingValueHandler {
IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
CCAssignFn *AssignFn)
- : ValueHandler(MIRBuilder, MRI, AssignFn), StackUsed(0) {}
+ : IncomingValueHandler(MIRBuilder, MRI, AssignFn), StackUsed(0) {}
Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
@@ -101,9 +101,7 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
/// How the physical register gets marked varies between formal
/// parameters (it's a basic-block live-in), and a call instruction
/// (it's an implicit-def of the BL).
- virtual void markPhysRegUsed(unsigned PhysReg) = 0;
-
- bool isIncomingArgumentHandler() const override { return true; }
+ virtual void markPhysRegUsed(MCRegister PhysReg) = 0;
uint64_t StackUsed;
};
@@ -113,7 +111,7 @@ struct FormalArgHandler : public IncomingArgHandler {
CCAssignFn *AssignFn)
: IncomingArgHandler(MIRBuilder, MRI, AssignFn) {}
- void markPhysRegUsed(unsigned PhysReg) override {
+ void markPhysRegUsed(MCRegister PhysReg) override {
MIRBuilder.getMRI()->addLiveIn(PhysReg);
MIRBuilder.getMBB().addLiveIn(PhysReg);
}
@@ -124,24 +122,22 @@ struct CallReturnHandler : public IncomingArgHandler {
MachineInstrBuilder MIB, CCAssignFn *AssignFn)
: IncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
- void markPhysRegUsed(unsigned PhysReg) override {
+ void markPhysRegUsed(MCRegister PhysReg) override {
MIB.addDef(PhysReg, RegState::Implicit);
}
MachineInstrBuilder MIB;
};
-struct OutgoingArgHandler : public CallLowering::ValueHandler {
+struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler {
OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
MachineInstrBuilder MIB, CCAssignFn *AssignFn,
CCAssignFn *AssignFnVarArg, bool IsTailCall = false,
int FPDiff = 0)
- : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
+ : OutgoingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
AssignFnVarArg(AssignFnVarArg), IsTailCall(IsTailCall), FPDiff(FPDiff),
StackSize(0), SPReg(0) {}
- bool isIncomingArgumentHandler() const override { return false; }
-
Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
MachineFunction &MF = MIRBuilder.getMF();
@@ -191,6 +187,8 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
if (!Arg.IsFixed)
MaxSize = 0;
+ assert(Arg.Regs.size() == 1);
+
Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
? extendRegister(Arg.Regs[0], VA, MaxSize)
: Arg.Regs[0];
@@ -276,6 +274,7 @@ void AArch64CallLowering::splitToValueTypes(
bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
const Value *Val,
ArrayRef<Register> VRegs,
+ FunctionLoweringInfo &FLI,
Register SwiftErrorVReg) const {
auto MIB = MIRBuilder.buildInstrNoInsert(AArch64::RET_ReallyLR);
assert(((Val && !VRegs.empty()) || (!Val && VRegs.empty())) &&
@@ -421,7 +420,7 @@ static void handleMustTailForwardedRegisters(MachineIRBuilder &MIRBuilder,
// Conservatively forward X8, since it might be used for an aggregate
// return.
if (!CCInfo.isAllocated(AArch64::X8)) {
- unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
+ Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
}
@@ -442,7 +441,7 @@ bool AArch64CallLowering::fallBackToDAGISel(const Function &F) const {
bool AArch64CallLowering::lowerFormalArguments(
MachineIRBuilder &MIRBuilder, const Function &F,
- ArrayRef<ArrayRef<Register>> VRegs) const {
+ ArrayRef<ArrayRef<Register>> VRegs, FunctionLoweringInfo &FLI) const {
MachineFunction &MF = MIRBuilder.getMF();
MachineBasicBlock &MBB = MIRBuilder.getMBB();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -624,64 +623,25 @@ bool AArch64CallLowering::areCalleeOutgoingArgsTailCallable(
const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC);
MachineRegisterInfo &MRI = MF.getRegInfo();
- for (unsigned i = 0; i < OutLocs.size(); ++i) {
- auto &ArgLoc = OutLocs[i];
- // If it's not a register, it's fine.
- if (!ArgLoc.isRegLoc()) {
- if (Info.IsVarArg) {
- // Be conservative and disallow variadic memory operands to match SDAG's
- // behaviour.
- // FIXME: If the caller's calling convention is C, then we can
- // potentially use its argument area. However, for cases like fastcc,
- // we can't do anything.
- LLVM_DEBUG(
- dbgs()
- << "... Cannot tail call vararg function with stack arguments\n");
- return false;
- }
- continue;
- }
-
- Register Reg = ArgLoc.getLocReg();
-
- // Only look at callee-saved registers.
- if (MachineOperand::clobbersPhysReg(CallerPreservedMask, Reg))
- continue;
-
- LLVM_DEBUG(
- dbgs()
- << "... Call has an argument passed in a callee-saved register.\n");
-
- // Check if it was copied from.
- ArgInfo &OutInfo = OutArgs[i];
-
- if (OutInfo.Regs.size() > 1) {
- LLVM_DEBUG(
- dbgs() << "... Cannot handle arguments in multiple registers.\n");
- return false;
- }
+ if (Info.IsVarArg) {
+ // Be conservative and disallow variadic memory operands to match SDAG's
+ // behaviour.
+ // FIXME: If the caller's calling convention is C, then we can
+ // potentially use its argument area. However, for cases like fastcc,
+ // we can't do anything.
+ for (unsigned i = 0; i < OutLocs.size(); ++i) {
+ auto &ArgLoc = OutLocs[i];
+ if (ArgLoc.isRegLoc())
+ continue;
- // Check if we copy the register, walking through copies from virtual
- // registers. Note that getDefIgnoringCopies does not ignore copies from
- // physical registers.
- MachineInstr *RegDef = getDefIgnoringCopies(OutInfo.Regs[0], MRI);
- if (!RegDef || RegDef->getOpcode() != TargetOpcode::COPY) {
LLVM_DEBUG(
dbgs()
- << "... Parameter was not copied into a VReg, cannot tail call.\n");
- return false;
- }
-
- // Got a copy. Verify that it's the same as the register we want.
- Register CopyRHS = RegDef->getOperand(1).getReg();
- if (CopyRHS != Reg) {
- LLVM_DEBUG(dbgs() << "... Callee-saved register was not copied into "
- "VReg, cannot tail call.\n");
+ << "... Cannot tail call vararg function with stack arguments\n");
return false;
}
}
- return true;
+ return parametersInCSRMatch(MRI, CallerPreservedMask, OutLocs, OutArgs);
}
bool AArch64CallLowering::isEligibleForTailCallOptimization(
@@ -796,7 +756,7 @@ static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
// When BTI is enabled, we need to use TCRETURNriBTI to make sure that we use
// x16 or x17.
- if (CallerF.getFunction().hasFnAttribute("branch-target-enforcement"))
+ if (CallerF.getInfo<AArch64FunctionInfo>()->branchTargetEnforcement())
return AArch64::TCRETURNriBTI;
return AArch64::TCRETURNri;
@@ -816,7 +776,7 @@ bool AArch64CallLowering::lowerTailCall(
// TODO: Right now, regbankselect doesn't know how to handle the rtcGPR64
// register class. Until we can do that, we should fall back here.
- if (F.hasFnAttribute("branch-target-enforcement")) {
+ if (MF.getInfo<AArch64FunctionInfo>()->branchTargetEnforcement()) {
LLVM_DEBUG(
dbgs() << "Cannot lower indirect tail calls with BTI enabled yet.\n");
return false;
@@ -934,10 +894,9 @@ bool AArch64CallLowering::lowerTailCall(
// If Callee is a reg, since it is used by a target specific instruction,
// it must have a register class matching the constraint of that instruction.
if (Info.Callee.isReg())
- MIB->getOperand(0).setReg(constrainOperandRegClass(
- MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
- *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee,
- 0));
+ constrainOperandRegClass(MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
+ *MF.getSubtarget().getRegBankInfo(), *MIB,
+ MIB->getDesc(), Info.Callee, 0);
MF.getFrameInfo().setHasTailCall();
Info.LoweredTailCall = true;
@@ -1019,10 +978,9 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// instruction, it must have a register class matching the
// constraint of that instruction.
if (Info.Callee.isReg())
- MIB->getOperand(0).setReg(constrainOperandRegClass(
- MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
- *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee,
- 0));
+ constrainOperandRegClass(MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
+ *MF.getSubtarget().getRegBankInfo(), *MIB,
+ MIB->getDesc(), Info.Callee, 0);
// Finally we can copy the returned value back into its virtual-register. In
// symmetry with the arguments, the physical register must be an
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h
index 640a86253059..1f45c9ebc048 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h
@@ -34,13 +34,14 @@ public:
AArch64CallLowering(const AArch64TargetLowering &TLI);
bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
- ArrayRef<Register> VRegs,
+ ArrayRef<Register> VRegs, FunctionLoweringInfo &FLI,
Register SwiftErrorVReg) const override;
bool fallBackToDAGISel(const Function &F) const override;
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
- ArrayRef<ArrayRef<Register>> VRegs) const override;
+ ArrayRef<ArrayRef<Register>> VRegs,
+ FunctionLoweringInfo &FLI) const override;
bool lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const override;
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.h b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.h
new file mode 100644
index 000000000000..bed1136c7a67
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.h
@@ -0,0 +1,29 @@
+//===- AArch64GlobalISelUtils.h ----------------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file APIs for AArch64-specific helper functions used in the GlobalISel
+/// pipeline.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_GISEL_AARCH64GLOBALISELUTILS_H
+#define LLVM_LIB_TARGET_AARCH64_GISEL_AARCH64GLOBALISELUTILS_H
+
+#include <cstdint>
+
+namespace llvm {
+namespace AArch64GISelUtils {
+
+/// \returns true if \p C is a legal immediate operand for an arithmetic
+/// instruction.
+constexpr bool isLegalArithImmed(const uint64_t C) {
+ return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
+}
+
+} // namespace AArch64GISelUtils
+} // namespace llvm
+
+#endif
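
// --- Illustrative aside (editor's sketch, not part of this patch) ----------
// A standalone sanity check of the immediate test the new header introduces:
// isLegalArithImmed accepts any 12-bit value, optionally shifted left by 12,
// which matches the AArch64 ADD/SUB immediate encoding. The local copy below
// only mirrors the helper above so the asserts can be compiled on their own.
#include <cstdint>

constexpr bool isLegalArithImmedSketch(const uint64_t C) {
  return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
}

static_assert(isLegalArithImmedSketch(0xFFF), "any 12-bit immediate is legal");
static_assert(isLegalArithImmedSketch(0xABC000), "12-bit value shifted by 12");
static_assert(!isLegalArithImmedSketch(0x1234), "needs 13 bits unshifted");
static_assert(!isLegalArithImmedSketch(0x1000000), "shifted part exceeds 12 bits");
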
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 7733fe7f7b24..5259f4f5a4d0 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -18,6 +18,7 @@
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
@@ -33,14 +34,18 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "aarch64-isel"
using namespace llvm;
+using namespace MIPatternMatch;
namespace {
@@ -98,15 +103,23 @@ private:
bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
MachineRegisterInfo &MRI) const;
- bool tryOptAndIntoCompareBranch(MachineInstr *LHS,
- int64_t CmpConstant,
- const CmpInst::Predicate &Pred,
+ ///@{
+ /// Helper functions for selectCompareBranch.
+ bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
+ MachineIRBuilder &MIB) const;
+ bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
+ MachineIRBuilder &MIB) const;
+ bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
+ MachineIRBuilder &MIB) const;
+ bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
MachineBasicBlock *DstMBB,
MachineIRBuilder &MIB) const;
+ ///@}
+
bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
MachineRegisterInfo &MRI) const;
- bool selectVectorASHR(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI) const;
// Helper to generate an equivalent of scalar_to_vector into a new register,
@@ -147,6 +160,7 @@ private:
bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI) const;
unsigned emitConstantPoolEntry(const Constant *CPVal,
MachineFunction &MF) const;
@@ -159,20 +173,72 @@ private:
MachineIRBuilder &MIRBuilder) const;
// Emit an integer compare between LHS and RHS, which checks for Predicate.
- //
- // This returns the produced compare instruction, and the predicate which
- // was ultimately used in the compare. The predicate may differ from what
- // is passed in \p Predicate due to optimization.
- std::pair<MachineInstr *, CmpInst::Predicate>
- emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
- MachineOperand &Predicate,
- MachineIRBuilder &MIRBuilder) const;
- MachineInstr *emitADD(Register DefReg, MachineOperand &LHS, MachineOperand &RHS,
+ MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
+ MachineOperand &Predicate,
+ MachineIRBuilder &MIRBuilder) const;
+
+ /// Emit a floating point comparison between \p LHS and \p RHS.
+ /// \p Pred, if given, is the intended predicate to use.
+ MachineInstr *emitFPCompare(Register LHS, Register RHS,
+ MachineIRBuilder &MIRBuilder,
+ Optional<CmpInst::Predicate> = None) const;
+
+ MachineInstr *emitInstr(unsigned Opcode,
+ std::initializer_list<llvm::DstOp> DstOps,
+ std::initializer_list<llvm::SrcOp> SrcOps,
+ MachineIRBuilder &MIRBuilder,
+ const ComplexRendererFns &RenderFns = None) const;
+ /// Helper function to emit an add or sub instruction.
+ ///
+ /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants above
+ /// in a specific order.
+ ///
+ /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
+ ///
+ /// \code
+ /// const std::array<std::array<unsigned, 2>, 4> Table {
+ /// {{AArch64::ADDXri, AArch64::ADDWri},
+ /// {AArch64::ADDXrs, AArch64::ADDWrs},
+ /// {AArch64::ADDXrr, AArch64::ADDWrr},
+ /// {AArch64::SUBXri, AArch64::SUBWri},
+ /// {AArch64::ADDXrx, AArch64::ADDWrx}}};
+ /// \endcode
+ ///
+ /// Each row in the table corresponds to a different addressing mode. Each
+ /// column corresponds to a different register size.
+ ///
+ /// \attention Rows must be structured as follows:
+ /// - Row 0: The ri opcode variants
+ /// - Row 1: The rs opcode variants
+ /// - Row 2: The rr opcode variants
+ /// - Row 3: The ri opcode variants for negative immediates
+ /// - Row 4: The rx opcode variants
+ ///
+ /// \attention Columns must be structured as follows:
+ /// - Column 0: The 64-bit opcode variants
+ /// - Column 1: The 32-bit opcode variants
+ ///
+ /// \p Dst is the destination register of the binop to emit.
+ /// \p LHS is the left-hand operand of the binop to emit.
+ /// \p RHS is the right-hand operand of the binop to emit.
+ MachineInstr *emitAddSub(
+ const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
+ Register Dst, MachineOperand &LHS, MachineOperand &RHS,
+ MachineIRBuilder &MIRBuilder) const;
+ MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
+ MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const;
+ MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
+ MachineIRBuilder &MIRBuilder) const;
+ MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
+ MachineIRBuilder &MIRBuilder) const;
MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const;
- MachineInstr *emitTST(const Register &LHS, const Register &RHS,
+ MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const;
+ MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
+ AArch64CC::CondCode CC,
+ MachineIRBuilder &MIRBuilder) const;
MachineInstr *emitExtractVectorElt(Optional<Register> DstReg,
const RegisterBank &DstRB, LLT ScalarTy,
Register VecReg, unsigned LaneIdx,
@@ -184,9 +250,24 @@ private:
MachineInstr *emitFMovForFConstant(MachineInstr &MI,
MachineRegisterInfo &MRI) const;
- /// Emit a CSet for a compare.
+ /// Emit a CSet for an integer compare.
+ ///
+ /// \p DefReg is expected to be a 32-bit scalar register.
MachineInstr *emitCSetForICMP(Register DefReg, unsigned Pred,
MachineIRBuilder &MIRBuilder) const;
+ /// Emit a CSet for a FP compare.
+ ///
+ /// \p Dst is expected to be a 32-bit scalar register.
+ MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
+ MachineIRBuilder &MIRBuilder) const;
+
+ /// Emit the overflow op for \p Opcode.
+ ///
+ /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
+ /// G_USUBO, etc.
+ std::pair<MachineInstr *, AArch64CC::CondCode>
+ emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
+ MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;
/// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
/// \p IsNegative is true if the test should be "not zero".
@@ -195,6 +276,11 @@ private:
MachineBasicBlock *DstMBB,
MachineIRBuilder &MIB) const;
+ /// Emit a CB(N)Z instruction which branches to \p DestMBB.
+ MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
+ MachineBasicBlock *DestMBB,
+ MachineIRBuilder &MIB) const;
+
// Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
// We use these manually instead of using the importer since it doesn't
// support SDNodeXForm.
@@ -316,13 +402,6 @@ private:
MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
MachineOperand &Predicate,
MachineIRBuilder &MIRBuilder) const;
- MachineInstr *tryOptArithImmedIntegerCompare(MachineOperand &LHS,
- MachineOperand &RHS,
- CmpInst::Predicate &Predicate,
- MachineIRBuilder &MIB) const;
- MachineInstr *tryOptArithShiftedCompare(MachineOperand &LHS,
- MachineOperand &RHS,
- MachineIRBuilder &MIB) const;
/// Return true if \p MI is a load or store of \p NumBytes bytes.
bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;
@@ -498,7 +577,7 @@ static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
getConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
if (!ValAndVReg)
return None;
- Immed = ValAndVReg->Value;
+ Immed = ValAndVReg->Value.getSExtValue();
} else
return None;
return Immed;
@@ -786,6 +865,7 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
#ifndef NDEBUG
ValidCopy = KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI);
assert(ValidCopy && "Invalid copy.");
+ (void)KnownValid;
#endif
return ValidCopy;
};
@@ -932,44 +1012,173 @@ static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
return GenericOpc;
}
-static unsigned selectSelectOpc(MachineInstr &I, MachineRegisterInfo &MRI,
- const RegisterBankInfo &RBI) {
- const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
- bool IsFP = (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
- AArch64::GPRRegBankID);
- LLT Ty = MRI.getType(I.getOperand(0).getReg());
- if (Ty == LLT::scalar(32))
- return IsFP ? AArch64::FCSELSrrr : AArch64::CSELWr;
- else if (Ty == LLT::scalar(64) || Ty == LLT::pointer(0, 64))
- return IsFP ? AArch64::FCSELDrrr : AArch64::CSELXr;
- return 0;
-}
+MachineInstr *
+AArch64InstructionSelector::emitSelect(Register Dst, Register True,
+ Register False, AArch64CC::CondCode CC,
+ MachineIRBuilder &MIB) const {
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+ assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
+ RBI.getRegBank(True, MRI, TRI)->getID() &&
+ "Expected both select operands to have the same regbank?");
+ LLT Ty = MRI.getType(True);
+ if (Ty.isVector())
+ return nullptr;
+ const unsigned Size = Ty.getSizeInBits();
+ assert((Size == 32 || Size == 64) &&
+ "Expected 32 bit or 64 bit select only?");
+ const bool Is32Bit = Size == 32;
+ if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
+ unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
+ auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
+ constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI);
+ return &*FCSel;
+ }
+
+ // By default, we'll try and emit a CSEL.
+ unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
+ bool Optimized = false;
+ auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
+ &Optimized](Register &Reg, Register &OtherReg,
+ bool Invert) {
+ if (Optimized)
+ return false;
-/// Helper function to select the opcode for a G_FCMP.
-static unsigned selectFCMPOpc(MachineInstr &I, MachineRegisterInfo &MRI) {
- // If this is a compare against +0.0, then we don't have to explicitly
- // materialize a constant.
- const ConstantFP *FPImm = getConstantFPVRegVal(I.getOperand(3).getReg(), MRI);
- bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());
- unsigned OpSize = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
- if (OpSize != 32 && OpSize != 64)
- return 0;
- unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr},
- {AArch64::FCMPSri, AArch64::FCMPDri}};
- return CmpOpcTbl[ShouldUseImm][OpSize == 64];
-}
+ // Attempt to fold:
+ //
+ // %sub = G_SUB 0, %x
+ // %select = G_SELECT cc, %reg, %sub
+ //
+ // Into:
+ // %select = CSNEG %reg, %x, cc
+ Register MatchReg;
+ if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) {
+ Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
+ Reg = MatchReg;
+ if (Invert) {
+ CC = AArch64CC::getInvertedCondCode(CC);
+ std::swap(Reg, OtherReg);
+ }
+ return true;
+ }
+
+ // Attempt to fold:
+ //
+ // %xor = G_XOR %x, -1
+ // %select = G_SELECT cc, %reg, %xor
+ //
+ // Into:
+ // %select = CSINV %reg, %x, cc
+ if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) {
+ Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
+ Reg = MatchReg;
+ if (Invert) {
+ CC = AArch64CC::getInvertedCondCode(CC);
+ std::swap(Reg, OtherReg);
+ }
+ return true;
+ }
+
+ // Attempt to fold:
+ //
+ // %add = G_ADD %x, 1
+ // %select = G_SELECT cc, %reg, %add
+ //
+ // Into:
+ // %select = CSINC %reg, %x, cc
+ if (mi_match(Reg, MRI, m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)))) {
+ Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
+ Reg = MatchReg;
+ if (Invert) {
+ CC = AArch64CC::getInvertedCondCode(CC);
+ std::swap(Reg, OtherReg);
+ }
+ return true;
+ }
-/// Returns true if \p P is an unsigned integer comparison predicate.
-static bool isUnsignedICMPPred(const CmpInst::Predicate P) {
- switch (P) {
- default:
return false;
- case CmpInst::ICMP_UGT:
- case CmpInst::ICMP_UGE:
- case CmpInst::ICMP_ULT:
- case CmpInst::ICMP_ULE:
- return true;
- }
+ };
+
+ // Helper lambda which tries to use CSINC/CSINV for the instruction when its
+ // true/false values are constants.
+ // FIXME: All of these patterns already exist in tablegen. We should be
+ // able to import these.
+ auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
+ &Optimized]() {
+ if (Optimized)
+ return false;
+ auto TrueCst = getConstantVRegValWithLookThrough(True, MRI);
+ auto FalseCst = getConstantVRegValWithLookThrough(False, MRI);
+ if (!TrueCst && !FalseCst)
+ return false;
+
+ Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
+ if (TrueCst && FalseCst) {
+ int64_t T = TrueCst->Value.getSExtValue();
+ int64_t F = FalseCst->Value.getSExtValue();
+
+ if (T == 0 && F == 1) {
+ // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
+ Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
+ True = ZReg;
+ False = ZReg;
+ return true;
+ }
+
+ if (T == 0 && F == -1) {
+ // G_SELECT cc, 0, -1 -> CSINV zreg, zreg, cc
+ Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
+ True = ZReg;
+ False = ZReg;
+ return true;
+ }
+ }
+
+ if (TrueCst) {
+ int64_t T = TrueCst->Value.getSExtValue();
+ if (T == 1) {
+ // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
+ Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
+ True = False;
+ False = ZReg;
+ CC = AArch64CC::getInvertedCondCode(CC);
+ return true;
+ }
+
+ if (T == -1) {
+ // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
+ Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
+ True = False;
+ False = ZReg;
+ CC = AArch64CC::getInvertedCondCode(CC);
+ return true;
+ }
+ }
+
+ if (FalseCst) {
+ int64_t F = FalseCst->Value.getSExtValue();
+ if (F == 1) {
+ // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
+ Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
+ False = ZReg;
+ return true;
+ }
+
+ if (F == -1) {
+ // G_SELECT cc, t, -1 -> CSINV t, zreg, cc
+ Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
+ False = ZReg;
+ return true;
+ }
+ }
+ return false;
+ };
+
+ Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
+ Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
+ Optimized |= TryOptSelectCst();
+ auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
+ constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
+ return &*SelectInst;
}
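
// --- Illustrative aside (editor's sketch, not part of this patch) ----------
// The folds in emitSelect above rely on the scalar identities of the CSEL
// family; csel/csneg/csinv/csinc below are plain C++ stand-ins for the
// instruction semantics, not LLVM APIs.
#include <cassert>
#include <cstdint>

static int64_t csel (bool cc, int64_t t, int64_t f) { return cc ? t : f; }
static int64_t csneg(bool cc, int64_t t, int64_t f) { return cc ? t : -f; }
static int64_t csinv(bool cc, int64_t t, int64_t f) { return cc ? t : ~f; }
static int64_t csinc(bool cc, int64_t t, int64_t f) { return cc ? t : f + 1; }

int main() {
  for (bool cc : {false, true}) {
    const int64_t x = 7, y = 3;
    assert(csel(cc, x, 0 - y) == csneg(cc, x, y));  // G_SUB 0, y  -> CSNEG
    assert(csel(cc, x, y ^ -1) == csinv(cc, x, y)); // G_XOR y, -1 -> CSINV
    assert(csel(cc, x, y + 1) == csinc(cc, x, y));  // G_ADD y, 1  -> CSINC
    assert(csel(cc, 0, 1) == csinc(cc, 0, 0));      // 0/1 constants  -> CSINC zr, zr
    assert(csel(cc, 0, -1) == csinv(cc, 0, 0));     // 0/-1 constants -> CSINV zr, zr
  }
}
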
static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
@@ -1099,7 +1308,7 @@ static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI);
}
if (VRegAndVal)
- C = VRegAndVal->Value;
+ C = VRegAndVal->Value.getSExtValue();
break;
}
case TargetOpcode::G_ASHR:
@@ -1109,7 +1318,7 @@ static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
auto VRegAndVal =
getConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
if (VRegAndVal)
- C = VRegAndVal->Value;
+ C = VRegAndVal->Value.getSExtValue();
break;
}
}
@@ -1211,8 +1420,9 @@ MachineInstr *AArch64InstructionSelector::emitTestBit(
}
bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
- MachineInstr *AndInst, int64_t CmpConstant, const CmpInst::Predicate &Pred,
- MachineBasicBlock *DstMBB, MachineIRBuilder &MIB) const {
+ MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB,
+ MachineIRBuilder &MIB) const {
+ assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?");
// Given something like this:
//
// %x = ...Something...
@@ -1230,65 +1440,96 @@ bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
//
// TBNZ %x %bb.3
//
- if (!AndInst || AndInst->getOpcode() != TargetOpcode::G_AND)
- return false;
-
- // Need to be comparing against 0 to fold.
- if (CmpConstant != 0)
- return false;
-
- MachineRegisterInfo &MRI = *MIB.getMRI();
-
- // Only support EQ and NE. If we have LT, then it *is* possible to fold, but
- // we don't want to do this. When we have an AND and LT, we need a TST/ANDS,
- // so folding would be redundant.
- if (Pred != CmpInst::Predicate::ICMP_EQ &&
- Pred != CmpInst::Predicate::ICMP_NE)
- return false;
// Check if the AND has a constant on its RHS which we can use as a mask.
// If it's a power of 2, then it's the same as checking a specific bit.
// (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set)
- auto MaybeBit =
- getConstantVRegValWithLookThrough(AndInst->getOperand(2).getReg(), MRI);
- if (!MaybeBit || !isPowerOf2_64(MaybeBit->Value))
+ auto MaybeBit = getConstantVRegValWithLookThrough(
+ AndInst.getOperand(2).getReg(), *MIB.getMRI());
+ if (!MaybeBit)
+ return false;
+
+ int32_t Bit = MaybeBit->Value.exactLogBase2();
+ if (Bit < 0)
return false;
- uint64_t Bit = Log2_64(static_cast<uint64_t>(MaybeBit->Value));
- Register TestReg = AndInst->getOperand(1).getReg();
- bool Invert = Pred == CmpInst::Predicate::ICMP_NE;
+ Register TestReg = AndInst.getOperand(1).getReg();
// Emit a TB(N)Z.
emitTestBit(TestReg, Bit, Invert, DstMBB, MIB);
return true;
}
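
// --- Illustrative aside (editor's sketch, not part of this patch) ----------
// tryOptAndIntoCompareBranch only fires when the AND mask is a power of two,
// because "(x & (1 << b)) ==/!= 0" is exactly the single-bit test that TB(N)Z
// performs. exactLog2 below imitates APInt::exactLogBase2 with a compiler
// builtin; it is not the LLVM implementation.
#include <cassert>
#include <cstdint>

static int exactLog2(uint64_t v) {
  return (v != 0 && (v & (v - 1)) == 0) ? __builtin_ctzll(v) : -1;
}

int main() {
  assert(exactLog2(8) == 3);    // and x, 8; brcond (== 0)  ->  TBZ x, #3
  assert(exactLog2(6) == -1);   // 6 is not a power of two: no fold
  const uint64_t x = 0b1010;
  assert(((x & 8) != 0) == (((x >> 3) & 1) != 0)); // the bit-test equivalence
}
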
-bool AArch64InstructionSelector::selectCompareBranch(
- MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
+MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg,
+ bool IsNegative,
+ MachineBasicBlock *DestMBB,
+ MachineIRBuilder &MIB) const {
+ assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!");
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+ assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() ==
+ AArch64::GPRRegBankID &&
+ "Expected GPRs only?");
+ auto Ty = MRI.getType(CompareReg);
+ unsigned Width = Ty.getSizeInBits();
+ assert(!Ty.isVector() && "Expected scalar only?");
+ assert(Width <= 64 && "Expected width to be at most 64?");
+ static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX},
+ {AArch64::CBNZW, AArch64::CBNZX}};
+ unsigned Opc = OpcTable[IsNegative][Width == 64];
+ auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB);
+ constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI);
+ return &*BranchMI;
+}
- const Register CondReg = I.getOperand(0).getReg();
+bool AArch64InstructionSelector::selectCompareBranchFedByFCmp(
+ MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const {
+ assert(FCmp.getOpcode() == TargetOpcode::G_FCMP);
+ assert(I.getOpcode() == TargetOpcode::G_BRCOND);
+ // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
+ // totally clean. Some of them require two branches to implement.
+ auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate();
+ emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB,
+ Pred);
+ AArch64CC::CondCode CC1, CC2;
+ changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2);
MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
- MachineInstr *CCMI = MRI.getVRegDef(CondReg);
- if (CCMI->getOpcode() == TargetOpcode::G_TRUNC)
- CCMI = MRI.getVRegDef(CCMI->getOperand(1).getReg());
- if (CCMI->getOpcode() != TargetOpcode::G_ICMP)
+ MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB);
+ if (CC2 != AArch64CC::AL)
+ MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB);
+ I.eraseFromParent();
+ return true;
+}
+
+bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
+ MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
+ assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
+ assert(I.getOpcode() == TargetOpcode::G_BRCOND);
+ // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z.
+ //
+ // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
+ // instructions will not be produced, as they are conditional branch
+ // instructions that do not set flags.
+ if (!ProduceNonFlagSettingCondBr)
return false;
- Register LHS = CCMI->getOperand(2).getReg();
- Register RHS = CCMI->getOperand(3).getReg();
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+ MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
+ auto Pred =
+ static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate());
+ Register LHS = ICmp.getOperand(2).getReg();
+ Register RHS = ICmp.getOperand(3).getReg();
+
+ // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that.
auto VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
- MachineIRBuilder MIB(I);
- CmpInst::Predicate Pred =
- (CmpInst::Predicate)CCMI->getOperand(1).getPredicate();
- MachineInstr *LHSMI = getDefIgnoringCopies(LHS, MRI);
+ MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
// When we can emit a TB(N)Z, prefer that.
//
// Handle non-commutative condition codes first.
// Note that we don't want to do this when we have a G_AND because it can
// become a tst. The tst will make the test bit in the TB(N)Z redundant.
- if (VRegAndVal && LHSMI->getOpcode() != TargetOpcode::G_AND) {
- int64_t C = VRegAndVal->Value;
+ if (VRegAndVal && !AndInst) {
+ int64_t C = VRegAndVal->Value.getSExtValue();
// When we have a greater-than comparison, we can just test if the msb is
// zero.
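
// --- Illustrative aside (editor's sketch, not part of this patch) ----------
// The "test if the msb is zero" remark above is the usual sign-bit identity:
// for signed x, "x > -1" and "x < 0" depend only on the most significant bit,
// so the selector can use TBZ/TBNZ on bit 63 (or bit 31 for 32-bit values).
#include <cassert>
#include <cstdint>

int main() {
  const int64_t Vals[] = {INT64_MIN, -2, -1, 0, 1, INT64_MAX};
  for (int64_t x : Vals) {
    const bool MsbSet = (static_cast<uint64_t>(x) >> 63) & 1;
    assert((x > -1) == !MsbSet); // icmp sgt x, -1  ->  TBZ x, #63
    assert((x < 0) == MsbSet);   // icmp slt x, 0   ->  TBNZ x, #63
  }
}
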
@@ -1309,54 +1550,97 @@ bool AArch64InstructionSelector::selectCompareBranch(
}
}
- if (!VRegAndVal) {
- std::swap(RHS, LHS);
- VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
- LHSMI = getDefIgnoringCopies(LHS, MRI);
+ // Attempt to handle commutative condition codes. Right now, that's only
+ // eq/ne.
+ if (ICmpInst::isEquality(Pred)) {
+ if (!VRegAndVal) {
+ std::swap(RHS, LHS);
+ VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
+ AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
+ }
+
+ if (VRegAndVal && VRegAndVal->Value == 0) {
+ // If there's a G_AND feeding into this branch, try to fold it away by
+ // emitting a TB(N)Z instead.
+ //
+ // Note: If we have LT, then it *is* possible to fold, but it wouldn't be
+ // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding
+ // would be redundant.
+ if (AndInst &&
+ tryOptAndIntoCompareBranch(
+ *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) {
+ I.eraseFromParent();
+ return true;
+ }
+
+ // Otherwise, try to emit a CB(N)Z instead.
+ auto LHSTy = MRI.getType(LHS);
+ if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) {
+ emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB);
+ I.eraseFromParent();
+ return true;
+ }
+ }
}
- if (!VRegAndVal || VRegAndVal->Value != 0) {
- // If we can't select a CBZ then emit a cmp + Bcc.
- MachineInstr *Cmp;
- std::tie(Cmp, Pred) = emitIntegerCompare(
- CCMI->getOperand(2), CCMI->getOperand(3), CCMI->getOperand(1), MIB);
- if (!Cmp)
- return false;
- const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(Pred);
- MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
- I.eraseFromParent();
+ return false;
+}
+
+bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
+ MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
+ assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
+ assert(I.getOpcode() == TargetOpcode::G_BRCOND);
+ if (tryOptCompareBranchFedByICmp(I, ICmp, MIB))
return true;
+
+ // Couldn't optimize. Emit a compare + a Bcc.
+ MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
+ auto PredOp = ICmp.getOperand(1);
+ emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB);
+ const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
+ static_cast<CmpInst::Predicate>(PredOp.getPredicate()));
+ MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
+ I.eraseFromParent();
+ return true;
+}
+
+bool AArch64InstructionSelector::selectCompareBranch(
+ MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
+ Register CondReg = I.getOperand(0).getReg();
+ MachineInstr *CCMI = MRI.getVRegDef(CondReg);
+ if (CCMI->getOpcode() == TargetOpcode::G_TRUNC) {
+ CondReg = CCMI->getOperand(1).getReg();
+ CCMI = MRI.getVRegDef(CondReg);
}
- // Try to emit a TB(N)Z for an eq or ne condition.
- if (tryOptAndIntoCompareBranch(LHSMI, VRegAndVal->Value, Pred, DestMBB,
- MIB)) {
+ // Try to select the G_BRCOND using whatever is feeding the condition if
+ // possible.
+ MachineIRBuilder MIB(I);
+ unsigned CCMIOpc = CCMI->getOpcode();
+ if (CCMIOpc == TargetOpcode::G_FCMP)
+ return selectCompareBranchFedByFCmp(I, *CCMI, MIB);
+ if (CCMIOpc == TargetOpcode::G_ICMP)
+ return selectCompareBranchFedByICmp(I, *CCMI, MIB);
+
+ // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
+ // instructions will not be produced, as they are conditional branch
+ // instructions that do not set flags.
+ if (ProduceNonFlagSettingCondBr) {
+ emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true,
+ I.getOperand(1).getMBB(), MIB);
I.eraseFromParent();
return true;
}
- const RegisterBank &RB = *RBI.getRegBank(LHS, MRI, TRI);
- if (RB.getID() != AArch64::GPRRegBankID)
- return false;
- if (Pred != CmpInst::ICMP_NE && Pred != CmpInst::ICMP_EQ)
- return false;
-
- const unsigned CmpWidth = MRI.getType(LHS).getSizeInBits();
- unsigned CBOpc = 0;
- if (CmpWidth <= 32)
- CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZW : AArch64::CBNZW);
- else if (CmpWidth == 64)
- CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZX : AArch64::CBNZX);
- else
- return false;
-
- BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(CBOpc))
- .addUse(LHS)
- .addMBB(DestMBB)
- .constrainAllUses(TII, TRI, RBI);
-
+ // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead.
+ auto TstMI =
+ MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1);
+ constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
+ auto Bcc = MIB.buildInstr(AArch64::Bcc)
+ .addImm(AArch64CC::EQ)
+ .addMBB(I.getOperand(1).getMBB());
I.eraseFromParent();
- return true;
+ return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI);
}
/// Returns the element immediate value of a vector shift operand if found.
@@ -1377,8 +1661,8 @@ static Optional<int64_t> getVectorShiftImm(Register Reg,
return None;
if (Idx == 1)
- ImmVal = VRegAndVal->Value;
- if (ImmVal != VRegAndVal->Value)
+ ImmVal = VRegAndVal->Value.getSExtValue();
+ if (ImmVal != VRegAndVal->Value.getSExtValue())
return None;
}
@@ -1441,6 +1725,14 @@ bool AArch64InstructionSelector::selectVectorSHL(
Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
} else if (Ty == LLT::vector(2, 32)) {
Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
+ } else if (Ty == LLT::vector(4, 16)) {
+ Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
+ } else if (Ty == LLT::vector(8, 16)) {
+ Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
+ } else if (Ty == LLT::vector(16, 8)) {
+ Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
+ } else if (Ty == LLT::vector(8, 8)) {
+ Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
} else {
LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
return false;
@@ -1457,9 +1749,10 @@ bool AArch64InstructionSelector::selectVectorSHL(
return true;
}
-bool AArch64InstructionSelector::selectVectorASHR(
+bool AArch64InstructionSelector::selectVectorAshrLshr(
MachineInstr &I, MachineRegisterInfo &MRI) const {
- assert(I.getOpcode() == TargetOpcode::G_ASHR);
+ assert(I.getOpcode() == TargetOpcode::G_ASHR ||
+ I.getOpcode() == TargetOpcode::G_LSHR);
Register DstReg = I.getOperand(0).getReg();
const LLT Ty = MRI.getType(DstReg);
Register Src1Reg = I.getOperand(1).getReg();
@@ -1468,25 +1761,40 @@ bool AArch64InstructionSelector::selectVectorASHR(
if (!Ty.isVector())
return false;
+ bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR;
+
+ // We expect the immediate case to be lowered in the post-legalizer combiner
+ // to AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents.
+
// There is not a shift right register instruction, but the shift left
// register instruction takes a signed value, where negative numbers specify a
// right shift.
unsigned Opc = 0;
unsigned NegOpc = 0;
- const TargetRegisterClass *RC = nullptr;
+ const TargetRegisterClass *RC =
+ getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID), RBI);
if (Ty == LLT::vector(2, 64)) {
- Opc = AArch64::SSHLv2i64;
+ Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
NegOpc = AArch64::NEGv2i64;
- RC = &AArch64::FPR128RegClass;
} else if (Ty == LLT::vector(4, 32)) {
- Opc = AArch64::SSHLv4i32;
+ Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
NegOpc = AArch64::NEGv4i32;
- RC = &AArch64::FPR128RegClass;
} else if (Ty == LLT::vector(2, 32)) {
- Opc = AArch64::SSHLv2i32;
+ Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
NegOpc = AArch64::NEGv2i32;
- RC = &AArch64::FPR64RegClass;
+ } else if (Ty == LLT::vector(4, 16)) {
+ Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
+ NegOpc = AArch64::NEGv4i16;
+ } else if (Ty == LLT::vector(8, 16)) {
+ Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
+ NegOpc = AArch64::NEGv8i16;
+ } else if (Ty == LLT::vector(16, 8)) {
+ Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
+ NegOpc = AArch64::NEGv16i8;
+ } else if (Ty == LLT::vector(8, 8)) {
+ Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
+ NegOpc = AArch64::NEGv8i8;
} else {
LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type");
return false;
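
// --- Illustrative aside (editor's sketch, not part of this patch) ----------
// Plain-C++ sketch of the trick described above: the SSHL/USHL register forms
// shift left by a *signed* per-lane amount, so a vector right shift is emitted
// as a NEG of the shift amounts followed by SSHL (arithmetic) or USHL
// (logical). These helpers model one lane and ignore out-of-range amounts.
#include <cassert>
#include <cstdint>

static int32_t sshl(int32_t v, int8_t amt) {   // SSHL lane semantics (sketch)
  return amt >= 0 ? v << amt : v >> -amt;      // negative amount: arithmetic >>
}
static uint32_t ushl(uint32_t v, int8_t amt) { // USHL lane semantics (sketch)
  return amt >= 0 ? v << amt : v >> -amt;      // negative amount: logical >>
}

int main() {
  assert(sshl(-64, -3) == -8); // G_ASHR x, 3  ->  NEG amt; SSHL x, amt
  assert(ushl(64u, -3) == 8u); // G_LSHR x, 3  ->  NEG amt; USHL x, amt
}
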
@@ -1569,7 +1877,6 @@ void AArch64InstructionSelector::materializeLargeCMVal(
AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
- return;
}
bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
@@ -1624,6 +1931,40 @@ bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
MRI.setType(DstReg, LLT::scalar(64));
return true;
}
+ case AArch64::G_DUP: {
+ // Convert the type from p0 to s64 to help selection.
+ LLT DstTy = MRI.getType(I.getOperand(0).getReg());
+ if (!DstTy.getElementType().isPointer())
+ return false;
+ MachineIRBuilder MIB(I);
+ auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg());
+ MRI.setType(I.getOperand(0).getReg(),
+ DstTy.changeElementType(LLT::scalar(64)));
+ MRI.setRegBank(NewSrc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
+ I.getOperand(1).setReg(NewSrc.getReg(0));
+ return true;
+ }
+ case TargetOpcode::G_UITOFP:
+ case TargetOpcode::G_SITOFP: {
+ // If both source and destination regbanks are FPR, then convert the opcode
+ // to G_SITOF so that the importer can select it to an fpr variant.
+ // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank
+ // copy.
+ Register SrcReg = I.getOperand(1).getReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+ LLT DstTy = MRI.getType(I.getOperand(0).getReg());
+ if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits())
+ return false;
+
+ if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) {
+ if (I.getOpcode() == TargetOpcode::G_SITOFP)
+ I.setDesc(TII.get(AArch64::G_SITOF));
+ else
+ I.setDesc(TII.get(AArch64::G_UITOF));
+ return true;
+ }
+ return false;
+ }
default:
return false;
}
@@ -1664,6 +2005,14 @@ bool AArch64InstructionSelector::convertPtrAddToAdd(
LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd");
return false;
}
+
+ // Also take the opportunity here to try to do some optimization.
+ // Try to convert this into a G_SUB if the offset is a 0-x negate idiom.
+ Register NegatedReg;
+ if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg))))
+ return true;
+ I.getOperand(2).setReg(NegatedReg);
+ I.setDesc(TII.get(TargetOpcode::G_SUB));
return true;
}
@@ -1753,6 +2102,17 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const {
MachineRegisterInfo &MRI = MF.getRegInfo();
switch (I.getOpcode()) {
+ case TargetOpcode::G_BR: {
+ // If the branch jumps to the fallthrough block, don't bother emitting it.
+ // Only do this at -O0, where it gives a nice code size improvement; when
+ // optimizations are enabled we want to leave this choice to
+ // MachineBlockPlacement.
+ bool EnableOpt = MF.getTarget().getOptLevel() != CodeGenOpt::None;
+ if (EnableOpt || !MBB.isLayoutSuccessor(I.getOperand(0).getMBB()))
+ return false;
+ I.eraseFromParent();
+ return true;
+ }
case TargetOpcode::G_SHL:
return earlySelectSHL(I, MRI);
case TargetOpcode::G_CONSTANT: {
@@ -1872,48 +2232,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
MachineIRBuilder MIB(I);
switch (Opcode) {
- case TargetOpcode::G_BRCOND: {
- if (Ty.getSizeInBits() > 32) {
- // We shouldn't need this on AArch64, but it would be implemented as an
- // EXTRACT_SUBREG followed by a TBNZW because TBNZX has no encoding if the
- // bit being tested is < 32.
- LLVM_DEBUG(dbgs() << "G_BRCOND has type: " << Ty
- << ", expected at most 32-bits");
- return false;
- }
-
- const Register CondReg = I.getOperand(0).getReg();
- MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
-
- // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
- // instructions will not be produced, as they are conditional branch
- // instructions that do not set flags.
- if (ProduceNonFlagSettingCondBr && selectCompareBranch(I, MF, MRI))
- return true;
-
- if (ProduceNonFlagSettingCondBr) {
- auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::TBNZW))
- .addUse(CondReg)
- .addImm(/*bit offset=*/0)
- .addMBB(DestMBB);
-
- I.eraseFromParent();
- return constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI);
- } else {
- auto CMP = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ANDSWri))
- .addDef(AArch64::WZR)
- .addUse(CondReg)
- .addImm(1);
- constrainSelectedInstRegOperands(*CMP.getInstr(), TII, TRI, RBI);
- auto Bcc =
- BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::Bcc))
- .addImm(AArch64CC::EQ)
- .addMBB(DestMBB);
-
- I.eraseFromParent();
- return constrainSelectedInstRegOperands(*Bcc.getInstr(), TII, TRI, RBI);
- }
- }
+ case TargetOpcode::G_BRCOND:
+ return selectCompareBranch(I, MF, MRI);
case TargetOpcode::G_BRINDIRECT: {
I.setDesc(TII.get(AArch64::BR));
@@ -1993,6 +2313,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
const LLT s16 = LLT::scalar(16);
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
+ const LLT s128 = LLT::scalar(128);
const LLT p0 = LLT::pointer(0, 64);
const Register DefReg = I.getOperand(0).getReg();
@@ -2002,10 +2323,10 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
// FIXME: Redundant check, but even less readable when factored out.
if (isFP) {
- if (Ty != s32 && Ty != s64) {
+ if (Ty != s32 && Ty != s64 && Ty != s128) {
LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
<< " constant, expected: " << s32 << " or " << s64
- << '\n');
+ << " or " << s128 << '\n');
return false;
}
@@ -2018,7 +2339,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
// The case when we have 0.0 is covered by tablegen. Reject it here so we
// can be sure tablegen works correctly and isn't rescued by this code.
- if (I.getOperand(1).getFPImm()->getValueAPF().isExactlyValue(0.0))
+ // 0.0 is not covered by tablegen for FP128, so that case is handled here.
+ if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0))
return false;
} else {
// s32 and s64 are covered by tablegen.
@@ -2045,15 +2368,17 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
// Either emit a FMOV, or emit a copy to emit a normal mov.
const TargetRegisterClass &GPRRC =
DefSize == 32 ? AArch64::GPR32RegClass : AArch64::GPR64RegClass;
- const TargetRegisterClass &FPRRC =
- DefSize == 32 ? AArch64::FPR32RegClass : AArch64::FPR64RegClass;
+ const TargetRegisterClass &FPRRC =
+ DefSize == 32 ? AArch64::FPR32RegClass
+ : (DefSize == 64 ? AArch64::FPR64RegClass
+ : AArch64::FPR128RegClass);
// Can we use a FMOV instruction to represent the immediate?
if (emitFMovForFConstant(I, MRI))
return true;
// For 64b values, emit a constant pool load instead.
- if (DefSize == 64) {
+ if (DefSize == 64 || DefSize == 128) {
auto *FPImm = I.getOperand(1).getFPImm();
MachineIRBuilder MIB(I);
auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB);
@@ -2246,21 +2571,22 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
}
auto &MemOp = **I.memoperands_begin();
+ uint64_t MemSizeInBytes = MemOp.getSize();
if (MemOp.isAtomic()) {
// For now we just support s8 acquire loads to be able to compile stack
// protector code.
if (MemOp.getOrdering() == AtomicOrdering::Acquire &&
- MemOp.getSize() == 1) {
+ MemSizeInBytes == 1) {
I.setDesc(TII.get(AArch64::LDARB));
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
LLVM_DEBUG(dbgs() << "Atomic load/store not fully supported yet\n");
return false;
}
- unsigned MemSizeInBits = MemOp.getSize() * 8;
+ unsigned MemSizeInBits = MemSizeInBytes * 8;
- const Register PtrReg = I.getOperand(1).getReg();
#ifndef NDEBUG
+ const Register PtrReg = I.getOperand(1).getReg();
const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
// Sanity-check the pointer register.
assert(PtrRB.getID() == AArch64::GPRRegBankID &&
@@ -2272,68 +2598,78 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
const Register ValReg = I.getOperand(0).getReg();
const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
- const unsigned NewOpc =
- selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
- if (NewOpc == I.getOpcode())
- return false;
-
- I.setDesc(TII.get(NewOpc));
-
- uint64_t Offset = 0;
- auto *PtrMI = MRI.getVRegDef(PtrReg);
-
- // Try to fold a GEP into our unsigned immediate addressing mode.
- if (PtrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
- if (auto COff = getConstantVRegVal(PtrMI->getOperand(2).getReg(), MRI)) {
- int64_t Imm = *COff;
- const unsigned Size = MemSizeInBits / 8;
- const unsigned Scale = Log2_32(Size);
- if ((Imm & (Size - 1)) == 0 && Imm >= 0 && Imm < (0x1000 << Scale)) {
- Register Ptr2Reg = PtrMI->getOperand(1).getReg();
- I.getOperand(1).setReg(Ptr2Reg);
- PtrMI = MRI.getVRegDef(Ptr2Reg);
- Offset = Imm / Size;
- }
+ // Helper lambda for partially selecting I. Either returns the original
+ // instruction with an updated opcode, or a new instruction.
+ auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
+ bool IsStore = I.getOpcode() == TargetOpcode::G_STORE;
+ const unsigned NewOpc =
+ selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
+ if (NewOpc == I.getOpcode())
+ return nullptr;
+ // Check if we can fold anything into the addressing mode.
+ auto AddrModeFns =
+ selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes);
+ if (!AddrModeFns) {
+ // Can't fold anything. Use the original instruction.
+ I.setDesc(TII.get(NewOpc));
+ I.addOperand(MachineOperand::CreateImm(0));
+ return &I;
}
- }
- // If we haven't folded anything into our addressing mode yet, try to fold
- // a frame index into the base+offset.
- if (!Offset && PtrMI->getOpcode() == TargetOpcode::G_FRAME_INDEX)
- I.getOperand(1).ChangeToFrameIndex(PtrMI->getOperand(1).getIndex());
+ // Folded something. Create a new instruction and return it.
+ auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags());
+ IsStore ? NewInst.addUse(ValReg) : NewInst.addDef(ValReg);
+ NewInst.cloneMemRefs(I);
+ for (auto &Fn : *AddrModeFns)
+ Fn(NewInst);
+ I.eraseFromParent();
+ return &*NewInst;
+ };
- I.addOperand(MachineOperand::CreateImm(Offset));
+ MachineInstr *LoadStore = SelectLoadStoreAddressingMode();
+ if (!LoadStore)
+ return false;
// If we're storing a 0, use WZR/XZR.
- if (auto CVal = getConstantVRegVal(ValReg, MRI)) {
- if (*CVal == 0 && Opcode == TargetOpcode::G_STORE) {
- if (I.getOpcode() == AArch64::STRWui)
- I.getOperand(0).setReg(AArch64::WZR);
- else if (I.getOpcode() == AArch64::STRXui)
- I.getOperand(0).setReg(AArch64::XZR);
+ if (Opcode == TargetOpcode::G_STORE) {
+ auto CVal = getConstantVRegValWithLookThrough(
+ LoadStore->getOperand(0).getReg(), MRI, /*LookThroughInstrs = */ true,
+ /*HandleFConstants = */ false);
+ if (CVal && CVal->Value == 0) {
+ switch (LoadStore->getOpcode()) {
+ case AArch64::STRWui:
+ case AArch64::STRHHui:
+ case AArch64::STRBBui:
+ LoadStore->getOperand(0).setReg(AArch64::WZR);
+ break;
+ case AArch64::STRXui:
+ LoadStore->getOperand(0).setReg(AArch64::XZR);
+ break;
+ }
}
}
if (IsZExtLoad) {
- // The zextload from a smaller type to i32 should be handled by the importer.
- if (MRI.getType(ValReg).getSizeInBits() != 64)
+ // The zextload from a smaller type to i32 should be handled by the
+ // importer.
+ if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64)
return false;
// If we have a ZEXTLOAD then change the load's type to be a narrower reg
- //and zero_extend with SUBREG_TO_REG.
+ // and zero_extend with SUBREG_TO_REG.
Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
- Register DstReg = I.getOperand(0).getReg();
- I.getOperand(0).setReg(LdReg);
+ Register DstReg = LoadStore->getOperand(0).getReg();
+ LoadStore->getOperand(0).setReg(LdReg);
- MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
+ MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator()));
MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {})
.addImm(0)
.addUse(LdReg)
.addImm(AArch64::sub_32);
- constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass,
MRI);
}
- return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
}
case TargetOpcode::G_SMULH:
@@ -2364,22 +2700,21 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
// operands to use appropriate classes.
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
- case TargetOpcode::G_FADD:
- case TargetOpcode::G_FSUB:
- case TargetOpcode::G_FMUL:
- case TargetOpcode::G_FDIV:
-
+ case TargetOpcode::G_LSHR:
case TargetOpcode::G_ASHR:
if (MRI.getType(I.getOperand(0).getReg()).isVector())
- return selectVectorASHR(I, MRI);
+ return selectVectorAshrLshr(I, MRI);
LLVM_FALLTHROUGH;
case TargetOpcode::G_SHL:
if (Opcode == TargetOpcode::G_SHL &&
MRI.getType(I.getOperand(0).getReg()).isVector())
return selectVectorSHL(I, MRI);
LLVM_FALLTHROUGH;
- case TargetOpcode::G_OR:
- case TargetOpcode::G_LSHR: {
+ case TargetOpcode::G_FADD:
+ case TargetOpcode::G_FSUB:
+ case TargetOpcode::G_FMUL:
+ case TargetOpcode::G_FDIV:
+ case TargetOpcode::G_OR: {
// Reject the various things we don't support yet.
if (unsupportedBinOp(I, RBI, MRI, TRI))
return false;
@@ -2408,37 +2743,24 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
I.eraseFromParent();
return true;
}
- case TargetOpcode::G_UADDO: {
- // TODO: Support other types.
- unsigned OpSize = Ty.getSizeInBits();
- if (OpSize != 32 && OpSize != 64) {
- LLVM_DEBUG(
- dbgs()
- << "G_UADDO currently only supported for 32 and 64 b types.\n");
- return false;
- }
-
- // TODO: Support vectors.
- if (Ty.isVector()) {
- LLVM_DEBUG(dbgs() << "G_UADDO currently only supported for scalars.\n");
- return false;
- }
-
- // Add and set the set condition flag.
- unsigned AddsOpc = OpSize == 32 ? AArch64::ADDSWrr : AArch64::ADDSXrr;
+ case TargetOpcode::G_SADDO:
+ case TargetOpcode::G_UADDO:
+ case TargetOpcode::G_SSUBO:
+ case TargetOpcode::G_USUBO: {
+ // Emit the operation and get the correct condition code.
MachineIRBuilder MIRBuilder(I);
- auto AddsMI = MIRBuilder.buildInstr(AddsOpc, {I.getOperand(0)},
- {I.getOperand(2), I.getOperand(3)});
- constrainSelectedInstRegOperands(*AddsMI, TII, TRI, RBI);
+ auto OpAndCC = emitOverflowOp(Opcode, I.getOperand(0).getReg(),
+ I.getOperand(2), I.getOperand(3), MIRBuilder);
// Now, put the overflow result in the register given by the first operand
- // to the G_UADDO. CSINC increments the result when the predicate is false,
- // so to get the increment when it's true, we need to use the inverse. In
- // this case, we want to increment when carry is set.
+ // to the overflow op. CSINC increments the result when the predicate is
+ // false, so to get the increment when it's true, we need to use the
+ // inverse. In this case, we want to increment when carry is set.
+ Register ZReg = AArch64::WZR;
auto CsetMI = MIRBuilder
.buildInstr(AArch64::CSINCWr, {I.getOperand(1).getReg()},
- {Register(AArch64::WZR), Register(AArch64::WZR)})
- .addImm(getInvertedCondCode(AArch64CC::HS));
+ {ZReg, ZReg})
+ .addImm(getInvertedCondCode(OpAndCC.second));
constrainSelectedInstRegOperands(*CsetMI, TII, TRI, RBI);
I.eraseFromParent();
return true;
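
// --- Illustrative aside (editor's sketch, not part of this patch) ----------
// Why the inverted condition code above: CSINC Wd, WZR, WZR, cc computes
// (cc ? 0 : 0 + 1), so passing the *inverse* of the overflow condition yields
// 1 exactly when the overflow condition itself holds. csinc_zr_zr is a local
// stand-in, not an LLVM API.
#include <cassert>

static int csinc_zr_zr(bool cc) { return cc ? 0 : 0 + 1; }

int main() {
  bool Overflowed = true;                // e.g. carry set (HS) for G_UADDO
  assert(csinc_zr_zr(!Overflowed) == 1); // inverted cc -> result reads 1
  Overflowed = false;
  assert(csinc_zr_zr(!Overflowed) == 0); // no overflow  -> result reads 0
}
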
@@ -2446,7 +2768,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_PTRMASK: {
Register MaskReg = I.getOperand(2).getReg();
- Optional<int64_t> MaskVal = getConstantVRegVal(MaskReg, MRI);
+ Optional<int64_t> MaskVal = getConstantVRegSExtVal(MaskReg, MRI);
// TODO: Implement arbitrary cases
if (!MaskVal || !isShiftedMask_64(*MaskVal))
return false;
@@ -2737,22 +3059,15 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
if (tryOptSelect(I))
return true;
- Register CSelOpc = selectSelectOpc(I, MRI, RBI);
- MachineInstr &TstMI =
- *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ANDSWri))
- .addDef(AArch64::WZR)
- .addUse(CondReg)
- .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
-
- MachineInstr &CSelMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CSelOpc))
- .addDef(I.getOperand(0).getReg())
- .addUse(TReg)
- .addUse(FReg)
- .addImm(AArch64CC::NE);
-
- constrainSelectedInstRegOperands(TstMI, TII, TRI, RBI);
- constrainSelectedInstRegOperands(CSelMI, TII, TRI, RBI);
-
+ // Make sure to use an unused vreg instead of wzr, so that the peephole
+ // optimizations will be able to optimize these.
+ MachineIRBuilder MIB(I);
+ Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+ auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg})
+ .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+ constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
+ if (!emitSelect(I.getOperand(0).getReg(), TReg, FReg, AArch64CC::NE, MIB))
+ return false;
I.eraseFromParent();
return true;
}
@@ -2767,76 +3082,22 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
}
MachineIRBuilder MIRBuilder(I);
- MachineInstr *Cmp;
- CmpInst::Predicate Pred;
- std::tie(Cmp, Pred) = emitIntegerCompare(I.getOperand(2), I.getOperand(3),
- I.getOperand(1), MIRBuilder);
- if (!Cmp)
- return false;
+ auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
+ emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1),
+ MIRBuilder);
emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIRBuilder);
I.eraseFromParent();
return true;
}
case TargetOpcode::G_FCMP: {
- if (Ty != LLT::scalar(32)) {
- LLVM_DEBUG(dbgs() << "G_FCMP result has type: " << Ty
- << ", expected: " << LLT::scalar(32) << '\n');
- return false;
- }
-
- unsigned CmpOpc = selectFCMPOpc(I, MRI);
- if (!CmpOpc)
+ MachineIRBuilder MIRBuilder(I);
+ CmpInst::Predicate Pred =
+ static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
+ if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(),
+ MIRBuilder, Pred) ||
+ !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIRBuilder))
return false;
-
- // FIXME: regbank
-
- AArch64CC::CondCode CC1, CC2;
- changeFCMPPredToAArch64CC(
- (CmpInst::Predicate)I.getOperand(1).getPredicate(), CC1, CC2);
-
- // Partially build the compare. Decide if we need to add a use for the
- // third operand based off whether or not we're comparing against 0.0.
- auto CmpMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc))
- .addUse(I.getOperand(2).getReg());
-
- // If we don't have an immediate compare, then we need to add a use of the
- // register which wasn't used for the immediate.
- // Note that the immediate will always be the last operand.
- if (CmpOpc != AArch64::FCMPSri && CmpOpc != AArch64::FCMPDri)
- CmpMI = CmpMI.addUse(I.getOperand(3).getReg());
-
- const Register DefReg = I.getOperand(0).getReg();
- Register Def1Reg = DefReg;
- if (CC2 != AArch64CC::AL)
- Def1Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
-
- MachineInstr &CSetMI =
- *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr))
- .addDef(Def1Reg)
- .addUse(AArch64::WZR)
- .addUse(AArch64::WZR)
- .addImm(getInvertedCondCode(CC1));
-
- if (CC2 != AArch64CC::AL) {
- Register Def2Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
- MachineInstr &CSet2MI =
- *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr))
- .addDef(Def2Reg)
- .addUse(AArch64::WZR)
- .addUse(AArch64::WZR)
- .addImm(getInvertedCondCode(CC2));
- MachineInstr &OrMI =
- *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ORRWrr))
- .addDef(DefReg)
- .addUse(Def1Reg)
- .addUse(Def2Reg);
- constrainSelectedInstRegOperands(OrMI, TII, TRI, RBI);
- constrainSelectedInstRegOperands(CSet2MI, TII, TRI, RBI);
- }
- constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
- constrainSelectedInstRegOperands(CSetMI, TII, TRI, RBI);
-
I.eraseFromParent();
return true;
}
@@ -2875,6 +3136,24 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
}
}
+ case AArch64::G_DUP: {
+ // When the scalar of G_DUP is an s8/s16 GPR, it can't be selected by the
+ // imported patterns, so do it manually here. Avoiding the s8/s16 GPR in the
+ // first place is difficult: adding an anyextend at regbank selection could
+ // pessimize the FPR case. Manual selection is the most robust solution for
+ // now.
+ Register SrcReg = I.getOperand(1).getReg();
+ if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::GPRRegBankID)
+ return false; // We expect the fpr regbank case to be imported.
+ LLT SrcTy = MRI.getType(SrcReg);
+ if (SrcTy.getSizeInBits() == 16)
+ I.setDesc(TII.get(AArch64::DUPv8i16gpr));
+ else if (SrcTy.getSizeInBits() == 8)
+ I.setDesc(TII.get(AArch64::DUPv16i8gpr));
+ else
+ return false;
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
case TargetOpcode::G_INTRINSIC_TRUNC:
return selectIntrinsicTrunc(I, MRI);
case TargetOpcode::G_INTRINSIC_ROUND:
@@ -2895,8 +3174,49 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
return selectConcatVectors(I, MRI);
case TargetOpcode::G_JUMP_TABLE:
return selectJumpTable(I, MRI);
+ case TargetOpcode::G_VECREDUCE_FADD:
+ case TargetOpcode::G_VECREDUCE_ADD:
+ return selectReduction(I, MRI);
+ }
+
+ return false;
+}
+
+bool AArch64InstructionSelector::selectReduction(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ Register VecReg = I.getOperand(1).getReg();
+ LLT VecTy = MRI.getType(VecReg);
+ if (I.getOpcode() == TargetOpcode::G_VECREDUCE_ADD) {
+ unsigned Opc = 0;
+ if (VecTy == LLT::vector(16, 8))
+ Opc = AArch64::ADDVv16i8v;
+ else if (VecTy == LLT::vector(8, 16))
+ Opc = AArch64::ADDVv8i16v;
+ else if (VecTy == LLT::vector(4, 32))
+ Opc = AArch64::ADDVv4i32v;
+ else if (VecTy == LLT::vector(2, 64))
+ Opc = AArch64::ADDPv2i64p;
+ else {
+ LLVM_DEBUG(dbgs() << "Unhandled type for add reduction");
+ return false;
+ }
+ I.setDesc(TII.get(Opc));
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
+ if (I.getOpcode() == TargetOpcode::G_VECREDUCE_FADD) {
+ unsigned Opc = 0;
+ if (VecTy == LLT::vector(2, 32))
+ Opc = AArch64::FADDPv2i32p;
+ else if (VecTy == LLT::vector(2, 64))
+ Opc = AArch64::FADDPv2i64p;
+ else {
+ LLVM_DEBUG(dbgs() << "Unhandled type for fadd reduction");
+ return false;
+ }
+ I.setDesc(TII.get(Opc));
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
return false;
}
@@ -2910,6 +3230,8 @@ bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
+
+ MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr);
auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
{TargetReg, ScratchReg}, {JTAddr, Index})
.addJumpTableIndex(JTI);
@@ -2946,17 +3268,20 @@ bool AArch64InstructionSelector::selectTLSGlobalValue(
const GlobalValue &GV = *I.getOperand(1).getGlobal();
MachineIRBuilder MIB(I);
- MIB.buildInstr(AArch64::LOADgot, {AArch64::X0}, {})
- .addGlobalAddress(&GV, 0, AArch64II::MO_TLS);
+ auto LoadGOT =
+ MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {})
+ .addGlobalAddress(&GV, 0, AArch64II::MO_TLS);
auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass},
- {Register(AArch64::X0)})
+ {LoadGOT.getReg(0)})
.addImm(0);
+ MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0));
// TLS calls preserve all registers except those that absolutely must be
// trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
// silly).
MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load})
+ .addUse(AArch64::X0, RegState::Implicit)
.addDef(AArch64::X0, RegState::Implicit)
.addRegMask(TRI.getTLSCallPreservedMask());
@@ -3442,7 +3767,7 @@ bool AArch64InstructionSelector::selectExtractElt(
(void)WideTy;
assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
"source register size too small!");
- assert(NarrowTy.isScalar() && "cannot extract vector into vector!");
+ assert(!NarrowTy.isVector() && "cannot extract vector into vector!");
// Need the lane index to determine the correct copy opcode.
MachineOperand &LaneIdxOp = I.getOperand(2);
@@ -3457,7 +3782,7 @@ bool AArch64InstructionSelector::selectExtractElt(
auto VRegAndVal = getConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI);
if (!VRegAndVal)
return false;
- unsigned LaneIdx = VRegAndVal->Value;
+ unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
MachineIRBuilder MIRBuilder(I);
@@ -3680,7 +4005,10 @@ static std::pair<unsigned, unsigned>
getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
unsigned Opc, SubregIdx;
if (RB.getID() == AArch64::GPRRegBankID) {
- if (EltSize == 32) {
+ if (EltSize == 16) {
+ Opc = AArch64::INSvi16gpr;
+ SubregIdx = AArch64::ssub;
+ } else if (EltSize == 32) {
Opc = AArch64::INSvi32gpr;
SubregIdx = AArch64::ssub;
} else if (EltSize == 64) {
@@ -3709,135 +4037,223 @@ getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
return std::make_pair(Opc, SubregIdx);
}
+MachineInstr *AArch64InstructionSelector::emitInstr(
+ unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
+ std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder,
+ const ComplexRendererFns &RenderFns) const {
+ assert(Opcode && "Expected an opcode?");
+ assert(!isPreISelGenericOpcode(Opcode) &&
+ "Function should only be used to produce selected instructions!");
+ auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps);
+ if (RenderFns)
+ for (auto &Fn : *RenderFns)
+ Fn(MI);
+ constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
+ return &*MI;
+}
+
+MachineInstr *AArch64InstructionSelector::emitAddSub(
+ const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
+ Register Dst, MachineOperand &LHS, MachineOperand &RHS,
+ MachineIRBuilder &MIRBuilder) const {
+ MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
+ assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
+ auto Ty = MRI.getType(LHS.getReg());
+ assert(!Ty.isVector() && "Expected a scalar or pointer?");
+ unsigned Size = Ty.getSizeInBits();
+ assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only");
+ bool Is32Bit = Size == 32;
+
+ // INSTRri form with positive arithmetic immediate.
+ if (auto Fns = selectArithImmed(RHS))
+ return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS},
+ MIRBuilder, Fns);
+
+ // INSTRri form with negative arithmetic immediate.
+ if (auto Fns = selectNegArithImmed(RHS))
+ return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS},
+ MIRBuilder, Fns);
+
+ // INSTRrx form.
+ if (auto Fns = selectArithExtendedRegister(RHS))
+ return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS},
+ MIRBuilder, Fns);
+
+ // INSTRrs form.
+ if (auto Fns = selectShiftedRegister(RHS))
+ return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS},
+ MIRBuilder, Fns);
+ return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS},
+ MIRBuilder);
+}
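A note on the 5-entry opcode tables that the emitADD/emitADDS/emitSUBS wrappers below feed into emitAddSub: each row is one addressing form, indexed as [row][Is32Bit], and the forms are tried in a fixed order. The enum below is purely illustrative and does not exist in the patch:

    // Meaning of AddrModeAndSizeToOpcode[row][Is32Bit]; index 0 of each row is
    // the 64-bit opcode, index 1 the 32-bit one.
    enum AddSubRow {
      RI = 0,     // positive 12-bit arithmetic immediate (ADDXri / ADDWri)
      RS = 1,     // shifted register form (ADDXrs / ADDWrs)
      RR = 2,     // plain register-register fallback (ADDXrr / ADDWrr)
      NegRI = 3,  // negated immediate, emitted as the inverse op (SUBXri...)
      RX = 4      // extended register form (ADDXrx / ADDWrx)
    };
    // emitAddSub tries RI, then NegRI, then RX, then RS, and finally RR.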
+
MachineInstr *
AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const {
- assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
- MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
- static const unsigned OpcTable[2][2]{{AArch64::ADDXrr, AArch64::ADDXri},
- {AArch64::ADDWrr, AArch64::ADDWri}};
- bool Is32Bit = MRI.getType(LHS.getReg()).getSizeInBits() == 32;
- auto ImmFns = selectArithImmed(RHS);
- unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()];
- auto AddMI = MIRBuilder.buildInstr(Opc, {DefReg}, {LHS});
-
- // If we matched a valid constant immediate, add those operands.
- if (ImmFns) {
- for (auto &RenderFn : *ImmFns)
- RenderFn(AddMI);
- } else {
- AddMI.addUse(RHS.getReg());
- }
+ const std::array<std::array<unsigned, 2>, 5> OpcTable{
+ {{AArch64::ADDXri, AArch64::ADDWri},
+ {AArch64::ADDXrs, AArch64::ADDWrs},
+ {AArch64::ADDXrr, AArch64::ADDWrr},
+ {AArch64::SUBXri, AArch64::SUBWri},
+ {AArch64::ADDXrx, AArch64::ADDWrx}}};
+ return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder);
+}
+
+MachineInstr *
+AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS,
+ MachineOperand &RHS,
+ MachineIRBuilder &MIRBuilder) const {
+ const std::array<std::array<unsigned, 2>, 5> OpcTable{
+ {{AArch64::ADDSXri, AArch64::ADDSWri},
+ {AArch64::ADDSXrs, AArch64::ADDSWrs},
+ {AArch64::ADDSXrr, AArch64::ADDSWrr},
+ {AArch64::SUBSXri, AArch64::SUBSWri},
+ {AArch64::ADDSXrx, AArch64::ADDSWrx}}};
+ return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
+}
- constrainSelectedInstRegOperands(*AddMI, TII, TRI, RBI);
- return &*AddMI;
+MachineInstr *
+AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS,
+ MachineOperand &RHS,
+ MachineIRBuilder &MIRBuilder) const {
+ const std::array<std::array<unsigned, 2>, 5> OpcTable{
+ {{AArch64::SUBSXri, AArch64::SUBSWri},
+ {AArch64::SUBSXrs, AArch64::SUBSWrs},
+ {AArch64::SUBSXrr, AArch64::SUBSWrr},
+ {AArch64::ADDSXri, AArch64::ADDSWri},
+ {AArch64::SUBSXrx, AArch64::SUBSWrx}}};
+ return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
}
MachineInstr *
AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const {
- assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
- static const unsigned OpcTable[2][2]{{AArch64::ADDSXrr, AArch64::ADDSXri},
- {AArch64::ADDSWrr, AArch64::ADDSWri}};
bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32);
- auto ImmFns = selectArithImmed(RHS);
- unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()];
- Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
-
- auto CmpMI = MIRBuilder.buildInstr(Opc, {ZReg}, {LHS});
-
- // If we matched a valid constant immediate, add those operands.
- if (ImmFns) {
- for (auto &RenderFn : *ImmFns)
- RenderFn(CmpMI);
- } else {
- CmpMI.addUse(RHS.getReg());
- }
-
- constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
- return &*CmpMI;
+ auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
+ return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder);
}
MachineInstr *
-AArch64InstructionSelector::emitTST(const Register &LHS, const Register &RHS,
+AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const {
+ assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
- unsigned RegSize = MRI.getType(LHS).getSizeInBits();
+ LLT Ty = MRI.getType(LHS.getReg());
+ unsigned RegSize = Ty.getSizeInBits();
bool Is32Bit = (RegSize == 32);
- static const unsigned OpcTable[2][2]{{AArch64::ANDSXrr, AArch64::ANDSXri},
- {AArch64::ANDSWrr, AArch64::ANDSWri}};
- Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
-
- // We might be able to fold in an immediate into the TST. We need to make sure
- // it's a logical immediate though, since ANDS requires that.
- auto ValAndVReg = getConstantVRegValWithLookThrough(RHS, MRI);
- bool IsImmForm = ValAndVReg.hasValue() &&
- AArch64_AM::isLogicalImmediate(ValAndVReg->Value, RegSize);
- unsigned Opc = OpcTable[Is32Bit][IsImmForm];
- auto TstMI = MIRBuilder.buildInstr(Opc, {ZReg}, {LHS});
-
- if (IsImmForm)
- TstMI.addImm(
- AArch64_AM::encodeLogicalImmediate(ValAndVReg->Value, RegSize));
- else
- TstMI.addUse(RHS);
+ const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri},
+ {AArch64::ANDSXrs, AArch64::ANDSWrs},
+ {AArch64::ANDSXrr, AArch64::ANDSWrr}};
+ // ANDS needs a logical immediate for its immediate form. Check if we can
+ // fold one in.
+ if (auto ValAndVReg = getConstantVRegValWithLookThrough(RHS.getReg(), MRI)) {
+ int64_t Imm = ValAndVReg->Value.getSExtValue();
+
+ if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) {
+ auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS});
+ TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize));
+ constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
+ return &*TstMI;
+ }
+ }
- constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
- return &*TstMI;
+ if (auto Fns = selectLogicalShiftedRegister(RHS))
+ return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns);
+ return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder);
}
-std::pair<MachineInstr *, CmpInst::Predicate>
-AArch64InstructionSelector::emitIntegerCompare(
+MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
MachineIRBuilder &MIRBuilder) const {
assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
assert(Predicate.isPredicate() && "Expected predicate?");
MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
+ LLT CmpTy = MRI.getType(LHS.getReg());
+ assert(!CmpTy.isVector() && "Expected scalar or pointer");
+ unsigned Size = CmpTy.getSizeInBits();
+ (void)Size;
+ assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?");
+ // Fold the compare into a cmn or tst if possible.
+ if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder))
+ return FoldCmp;
+ auto Dst = MRI.cloneVirtualRegister(LHS.getReg());
+ return emitSUBS(Dst, LHS, RHS, MIRBuilder);
+}
- CmpInst::Predicate P = (CmpInst::Predicate)Predicate.getPredicate();
-
- // Fold the compare if possible.
- MachineInstr *FoldCmp =
- tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder);
- if (FoldCmp)
- return {FoldCmp, P};
+MachineInstr *AArch64InstructionSelector::emitCSetForFCmp(
+ Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const {
+ MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+#ifndef NDEBUG
+ LLT Ty = MRI.getType(Dst);
+ assert(!Ty.isVector() && Ty.getSizeInBits() == 32 &&
+ "Expected a 32-bit scalar register?");
+#endif
+ const Register ZeroReg = AArch64::WZR;
+ auto EmitCSet = [&](Register CsetDst, AArch64CC::CondCode CC) {
+ auto CSet =
+ MIRBuilder.buildInstr(AArch64::CSINCWr, {CsetDst}, {ZeroReg, ZeroReg})
+ .addImm(getInvertedCondCode(CC));
+ constrainSelectedInstRegOperands(*CSet, TII, TRI, RBI);
+ return &*CSet;
+ };
- // Can't fold into a CMN. Just emit a normal compare.
- unsigned CmpOpc = 0;
- Register ZReg;
+ AArch64CC::CondCode CC1, CC2;
+ changeFCMPPredToAArch64CC(Pred, CC1, CC2);
+ if (CC2 == AArch64CC::AL)
+ return EmitCSet(Dst, CC1);
+
+ const TargetRegisterClass *RC = &AArch64::GPR32RegClass;
+ Register Def1Reg = MRI.createVirtualRegister(RC);
+ Register Def2Reg = MRI.createVirtualRegister(RC);
+ EmitCSet(Def1Reg, CC1);
+ EmitCSet(Def2Reg, CC2);
+ auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg});
+ constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI);
+ return &*OrMI;
+}
- LLT CmpTy = MRI.getType(LHS.getReg());
- assert((CmpTy.isScalar() || CmpTy.isPointer()) &&
- "Expected scalar or pointer");
- if (CmpTy == LLT::scalar(32)) {
- CmpOpc = AArch64::SUBSWrr;
- ZReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
- } else if (CmpTy == LLT::scalar(64) || CmpTy.isPointer()) {
- CmpOpc = AArch64::SUBSXrr;
- ZReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
- } else {
- return {nullptr, CmpInst::Predicate::BAD_ICMP_PREDICATE};
- }
+MachineInstr *
+AArch64InstructionSelector::emitFPCompare(Register LHS, Register RHS,
+ MachineIRBuilder &MIRBuilder,
+ Optional<CmpInst::Predicate> Pred) const {
+ MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+ LLT Ty = MRI.getType(LHS);
+ if (Ty.isVector())
+ return nullptr;
+ unsigned OpSize = Ty.getSizeInBits();
+ if (OpSize != 32 && OpSize != 64)
+ return nullptr;
- // Try to match immediate forms.
- MachineInstr *ImmedCmp =
- tryOptArithImmedIntegerCompare(LHS, RHS, P, MIRBuilder);
- if (ImmedCmp)
- return {ImmedCmp, P};
+ // If this is a compare against +0.0, then we don't have
+ // to explicitly materialize a constant.
+ const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI);
+ bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());
- // If we don't have an immediate, we may have a shift which can be folded
- // into the compare.
- MachineInstr *ShiftedCmp = tryOptArithShiftedCompare(LHS, RHS, MIRBuilder);
- if (ShiftedCmp)
- return {ShiftedCmp, P};
+ auto IsEqualityPred = [](CmpInst::Predicate P) {
+ return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE ||
+ P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE;
+ };
+ if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) {
+ // Try commutating the operands.
+ const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI);
+ if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) {
+ ShouldUseImm = true;
+ std::swap(LHS, RHS);
+ }
+ }
+ unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr},
+ {AArch64::FCMPSri, AArch64::FCMPDri}};
+ unsigned CmpOpc = CmpOpcTbl[ShouldUseImm][OpSize == 64];
- auto CmpMI =
- MIRBuilder.buildInstr(CmpOpc, {ZReg}, {LHS.getReg(), RHS.getReg()});
- // Make sure that we can constrain the compare that we emitted.
+ // Partially build the compare. Decide if we need to add a use for the
+ // third operand based off whether or not we're comparing against 0.0.
+ auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS);
+ if (!ShouldUseImm)
+ CmpMI.addUse(RHS);
constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
- return {&*CmpMI, P};
+ return &*CmpMI;
}
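To restate the table lookup in emitFPCompare: compares against +0.0 (possibly after commuting an equality compare) use the single-operand FCMP-with-zero form, everything else uses the register-register form, and S vs. D is chosen by operand size. A small sketch of that decision only (the helper is hypothetical; the mnemonics are from the table above):

    #include <string>

    std::string fcmpOpcodeFor(unsigned OpSizeInBits, bool RHSIsPositiveZero) {
      if (OpSizeInBits != 32 && OpSizeInBits != 64)
        return "";                             // vector/odd sizes are rejected
      const bool Is64 = OpSizeInBits == 64;
      if (RHSIsPositiveZero)
        return Is64 ? "FCMPDri" : "FCMPSri";   // compare against #0.0
      return Is64 ? "FCMPDrr" : "FCMPSrr";     // two-register compare
    }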
MachineInstr *AArch64InstructionSelector::emitVectorConcat(
@@ -3947,11 +4363,28 @@ AArch64InstructionSelector::emitCSetForICMP(Register DefReg, unsigned Pred,
return &*I;
}
+std::pair<MachineInstr *, AArch64CC::CondCode>
+AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
+ MachineOperand &LHS,
+ MachineOperand &RHS,
+ MachineIRBuilder &MIRBuilder) const {
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+ case TargetOpcode::G_SADDO:
+ return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
+ case TargetOpcode::G_UADDO:
+ return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
+ case TargetOpcode::G_SSUBO:
+ return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
+ case TargetOpcode::G_USUBO:
+ return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
+ }
+}
+
bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const {
MachineIRBuilder MIB(I);
MachineRegisterInfo &MRI = *MIB.getMRI();
- const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
-
// We want to recognize this pattern:
//
// $z = G_FCMP pred, $x, $y
@@ -4008,27 +4441,17 @@ bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const {
AArch64CC::CondCode CondCode;
if (CondOpc == TargetOpcode::G_ICMP) {
- MachineInstr *Cmp;
- CmpInst::Predicate Pred;
-
- std::tie(Cmp, Pred) =
- emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
- CondDef->getOperand(1), MIB);
-
- if (!Cmp) {
- LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
- return false;
- }
-
- // Have to collect the CondCode after emitIntegerCompare, since it can
- // update the predicate.
+ auto Pred =
+ static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
CondCode = changeICMPPredToAArch64CC(Pred);
+ emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
+ CondDef->getOperand(1), MIB);
} else {
// Get the condition code for the select.
+ auto Pred =
+ static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
AArch64CC::CondCode CondCode2;
- changeFCMPPredToAArch64CC(
- (CmpInst::Predicate)CondDef->getOperand(1).getPredicate(), CondCode,
- CondCode2);
+ changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2);
// changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two
// instructions to emit the comparison.
@@ -4037,25 +4460,16 @@ bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const {
if (CondCode2 != AArch64CC::AL)
return false;
- // Make sure we'll be able to select the compare.
- unsigned CmpOpc = selectFCMPOpc(*CondDef, MRI);
- if (!CmpOpc)
+ if (!emitFPCompare(CondDef->getOperand(2).getReg(),
+ CondDef->getOperand(3).getReg(), MIB)) {
+ LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
return false;
-
- // Emit a new compare.
- auto Cmp = MIB.buildInstr(CmpOpc, {}, {CondDef->getOperand(2).getReg()});
- if (CmpOpc != AArch64::FCMPSri && CmpOpc != AArch64::FCMPDri)
- Cmp.addUse(CondDef->getOperand(3).getReg());
- constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
+ }
}
// Emit the select.
- unsigned CSelOpc = selectSelectOpc(I, MRI, RBI);
- auto CSel =
- MIB.buildInstr(CSelOpc, {I.getOperand(0).getReg()},
- {I.getOperand(2).getReg(), I.getOperand(3).getReg()})
- .addImm(CondCode);
- constrainSelectedInstRegOperands(*CSel, TII, TRI, RBI);
+ emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(),
+ I.getOperand(3).getReg(), CondCode, MIB);
I.eraseFromParent();
return true;
}
@@ -4138,162 +4552,20 @@ MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
// Produce this if the compare is signed:
//
// tst x, y
- if (!isUnsignedICMPPred(P) && LHSDef &&
+ if (!CmpInst::isUnsigned(P) && LHSDef &&
LHSDef->getOpcode() == TargetOpcode::G_AND) {
// Make sure that the RHS is 0.
auto ValAndVReg = getConstantVRegValWithLookThrough(RHS.getReg(), MRI);
if (!ValAndVReg || ValAndVReg->Value != 0)
return nullptr;
- return emitTST(LHSDef->getOperand(1).getReg(),
- LHSDef->getOperand(2).getReg(), MIRBuilder);
+ return emitTST(LHSDef->getOperand(1),
+ LHSDef->getOperand(2), MIRBuilder);
}
return nullptr;
}
-MachineInstr *AArch64InstructionSelector::tryOptArithImmedIntegerCompare(
- MachineOperand &LHS, MachineOperand &RHS, CmpInst::Predicate &P,
- MachineIRBuilder &MIB) const {
- // Attempt to select the immediate form of an integer compare.
- MachineRegisterInfo &MRI = *MIB.getMRI();
- auto Ty = MRI.getType(LHS.getReg());
- assert(!Ty.isVector() && "Expected scalar or pointer only?");
- unsigned Size = Ty.getSizeInBits();
- assert((Size == 32 || Size == 64) &&
- "Expected 32 bit or 64 bit compare only?");
-
- // Check if this is a case we can already handle.
- InstructionSelector::ComplexRendererFns ImmFns;
- ImmFns = selectArithImmed(RHS);
-
- if (!ImmFns) {
- // We didn't get a rendering function, but we may still have a constant.
- auto MaybeImmed = getImmedFromMO(RHS);
- if (!MaybeImmed)
- return nullptr;
-
- // We have a constant, but it doesn't fit. Try adjusting it by one and
- // updating the predicate if possible.
- uint64_t C = *MaybeImmed;
- CmpInst::Predicate NewP;
- switch (P) {
- default:
- return nullptr;
- case CmpInst::ICMP_SLT:
- case CmpInst::ICMP_SGE:
- // Check for
- //
- // x slt c => x sle c - 1
- // x sge c => x sgt c - 1
- //
- // When c is not the smallest possible negative number.
- if ((Size == 64 && static_cast<int64_t>(C) == INT64_MIN) ||
- (Size == 32 && static_cast<int32_t>(C) == INT32_MIN))
- return nullptr;
- NewP = (P == CmpInst::ICMP_SLT) ? CmpInst::ICMP_SLE : CmpInst::ICMP_SGT;
- C -= 1;
- break;
- case CmpInst::ICMP_ULT:
- case CmpInst::ICMP_UGE:
- // Check for
- //
- // x ult c => x ule c - 1
- // x uge c => x ugt c - 1
- //
- // When c is not zero.
- if (C == 0)
- return nullptr;
- NewP = (P == CmpInst::ICMP_ULT) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT;
- C -= 1;
- break;
- case CmpInst::ICMP_SLE:
- case CmpInst::ICMP_SGT:
- // Check for
- //
- // x sle c => x slt c + 1
- // x sgt c => s sge c + 1
- //
- // When c is not the largest possible signed integer.
- if ((Size == 32 && static_cast<int32_t>(C) == INT32_MAX) ||
- (Size == 64 && static_cast<int64_t>(C) == INT64_MAX))
- return nullptr;
- NewP = (P == CmpInst::ICMP_SLE) ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGE;
- C += 1;
- break;
- case CmpInst::ICMP_ULE:
- case CmpInst::ICMP_UGT:
- // Check for
- //
- // x ule c => x ult c + 1
- // x ugt c => s uge c + 1
- //
- // When c is not the largest possible unsigned integer.
- if ((Size == 32 && static_cast<uint32_t>(C) == UINT32_MAX) ||
- (Size == 64 && C == UINT64_MAX))
- return nullptr;
- NewP = (P == CmpInst::ICMP_ULE) ? CmpInst::ICMP_ULT : CmpInst::ICMP_UGE;
- C += 1;
- break;
- }
-
- // Check if the new constant is valid.
- if (Size == 32)
- C = static_cast<uint32_t>(C);
- ImmFns = select12BitValueWithLeftShift(C);
- if (!ImmFns)
- return nullptr;
- P = NewP;
- }
-
- // At this point, we know we can select an immediate form. Go ahead and do
- // that.
- Register ZReg;
- unsigned Opc;
- if (Size == 32) {
- ZReg = AArch64::WZR;
- Opc = AArch64::SUBSWri;
- } else {
- ZReg = AArch64::XZR;
- Opc = AArch64::SUBSXri;
- }
-
- auto CmpMI = MIB.buildInstr(Opc, {ZReg}, {LHS.getReg()});
- for (auto &RenderFn : *ImmFns)
- RenderFn(CmpMI);
- constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
- return &*CmpMI;
-}
-
-MachineInstr *AArch64InstructionSelector::tryOptArithShiftedCompare(
- MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIB) const {
- // We are looking for the following pattern:
- //
- // shift = G_SHL/ASHR/LHSR y, c
- // ...
- // cmp = G_ICMP pred, something, shift
- //
- // Since we will select the G_ICMP to a SUBS, we can potentially fold the
- // shift into the subtract.
- static const unsigned OpcTable[2] = {AArch64::SUBSWrs, AArch64::SUBSXrs};
- static const Register ZRegTable[2] = {AArch64::WZR, AArch64::XZR};
- auto ImmFns = selectShiftedRegister(RHS);
- if (!ImmFns)
- return nullptr;
- MachineRegisterInfo &MRI = *MIB.getMRI();
- auto Ty = MRI.getType(LHS.getReg());
- assert(!Ty.isVector() && "Expected scalar or pointer only?");
- unsigned Size = Ty.getSizeInBits();
- bool Idx = (Size == 64);
- Register ZReg = ZRegTable[Idx];
- unsigned Opc = OpcTable[Idx];
- auto CmpMI = MIB.buildInstr(Opc, {ZReg}, {LHS.getReg()});
- for (auto &RenderFn : *ImmFns)
- RenderFn(CmpMI);
- constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
- return &*CmpMI;
-}
-
bool AArch64InstructionSelector::selectShuffleVector(
MachineInstr &I, MachineRegisterInfo &MRI) const {
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
@@ -4436,7 +4708,7 @@ bool AArch64InstructionSelector::selectInsertElt(
auto VRegAndVal = getConstantVRegValWithLookThrough(IdxReg, MRI);
if (!VRegAndVal)
return false;
- unsigned LaneIdx = VRegAndVal->Value;
+ unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
// Perform the lane insert.
Register SrcReg = I.getOperand(1).getReg();
@@ -4493,8 +4765,9 @@ bool AArch64InstructionSelector::selectInsertElt(
bool AArch64InstructionSelector::tryOptConstantBuildVec(
MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) const {
assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
- assert(DstTy.getSizeInBits() <= 128 && "Unexpected build_vec type!");
- if (DstTy.getSizeInBits() < 32)
+ unsigned DstSize = DstTy.getSizeInBits();
+ assert(DstSize <= 128 && "Unexpected build_vec type!");
+ if (DstSize < 32)
return false;
// Check if we're building a constant vector, in which case we want to
// generate a constant pool load instead of a vector insert sequence.
@@ -4515,6 +4788,24 @@ bool AArch64InstructionSelector::tryOptConstantBuildVec(
}
Constant *CV = ConstantVector::get(Csts);
MachineIRBuilder MIB(I);
+ if (CV->isNullValue()) {
+ // Until the importer can support immAllZerosV in pattern leaf nodes,
+ // select a zero move manually here.
+ Register DstReg = I.getOperand(0).getReg();
+ if (DstSize == 128) {
+ auto Mov = MIB.buildInstr(AArch64::MOVIv2d_ns, {DstReg}, {}).addImm(0);
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
+ } else if (DstSize == 64) {
+ auto Mov =
+ MIB.buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {})
+ .addImm(0);
+ MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
+ .addReg(Mov.getReg(0), 0, AArch64::dsub);
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(DstReg, AArch64::FPR64RegClass, MRI);
+ }
+ }
auto *CPLoad = emitLoadFromConstantPool(CV, MIB);
if (!CPLoad) {
LLVM_DEBUG(dbgs() << "Could not generate cp load for build_vector");
@@ -4634,10 +4925,12 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(1);
break;
case Intrinsic::debugtrap:
- if (!STI.isTargetWindows())
- return false;
MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000);
break;
+ case Intrinsic::ubsantrap:
+ MIRBuilder.buildInstr(AArch64::BRK, {}, {})
+ .addImm(I.getOperand(1).getImm() | ('U' << 8));
+ break;
}
I.eraseFromParent();
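The ubsantrap lowering above folds the check kind into the BRK immediate together with a 'U' marker byte; since 'U' is 0x55, the immediate is 0x5500 | kind. A tiny standalone check of that arithmetic (the helper name is illustrative):

    #include <cstdint>

    constexpr uint16_t ubsanTrapBrkImm(uint8_t Kind) {
      return static_cast<uint16_t>(('U' << 8) | Kind);
    }
    static_assert(ubsanTrapBrkImm(0) == 0x5500, "'U' is 0x55");
    static_assert(ubsanTrapBrkImm(0x2A) == 0x552A, "kind stays in the low byte");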
@@ -4703,22 +4996,22 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
- if (MFReturnAddr) {
- MIRBuilder.buildCopy({DstReg}, MFReturnAddr);
- I.eraseFromParent();
- return true;
+ if (!MFReturnAddr) {
+ // Insert the copy from LR/X30 into the entry block, before it can be
+ // clobbered by anything.
+ MFI.setReturnAddressIsTaken(true);
+ MFReturnAddr = getFunctionLiveInPhysReg(MF, TII, AArch64::LR,
+ AArch64::GPR64RegClass);
}
- MFI.setReturnAddressIsTaken(true);
- MF.addLiveIn(AArch64::LR, &AArch64::GPR64spRegClass);
- // Insert the copy from LR/X30 into the entry block, before it can be
- // clobbered by anything.
- MachineBasicBlock &EntryBlock = *MF.begin();
- if (!EntryBlock.isLiveIn(AArch64::LR))
- EntryBlock.addLiveIn(AArch64::LR);
- MachineIRBuilder EntryBuilder(MF);
- EntryBuilder.setInstr(*EntryBlock.begin());
- EntryBuilder.buildCopy({DstReg}, {Register(AArch64::LR)});
- MFReturnAddr = DstReg;
+
+ if (STI.hasPAuth()) {
+ MIRBuilder.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr});
+ } else {
+ MIRBuilder.buildCopy({Register(AArch64::LR)}, {MFReturnAddr});
+ MIRBuilder.buildInstr(AArch64::XPACLRI);
+ MIRBuilder.buildCopy({DstReg}, {Register(AArch64::LR)});
+ }
+
I.eraseFromParent();
return true;
}
@@ -4738,7 +5031,16 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
MIRBuilder.buildCopy({DstReg}, {FrameAddr});
else {
MFI.setReturnAddressIsTaken(true);
- MIRBuilder.buildInstr(AArch64::LDRXui, {DstReg}, {FrameAddr}).addImm(1);
+
+ if (STI.hasPAuth()) {
+ Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ MIRBuilder.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1);
+ MIRBuilder.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg});
+ } else {
+      MIRBuilder.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)},
+                            {FrameAddr}).addImm(1);
+ MIRBuilder.buildInstr(AArch64::XPACLRI);
+ MIRBuilder.buildCopy({DstReg}, {Register(AArch64::LR)});
+ }
}
I.eraseFromParent();
@@ -4946,7 +5248,7 @@ AArch64InstructionSelector::selectExtendedSHL(
// The value must fit into 3 bits, and must be positive. Make sure that is
// true.
- int64_t ImmVal = ValAndVReg->Value;
+ int64_t ImmVal = ValAndVReg->Value.getSExtValue();
// Since we're going to pull this into a shift, the constant value must be
// a power of 2. If we got a multiply, then we need to check this.
@@ -5086,12 +5388,60 @@ InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
unsigned SizeInBytes) const {
MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
-
- // If we have a constant offset, then we probably don't want to match a
- // register offset.
- if (isBaseWithConstantOffset(Root, MRI))
+ if (!Root.isReg())
+ return None;
+ MachineInstr *PtrAdd =
+ getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
+ if (!PtrAdd)
return None;
+  // Check for immediates which cannot be encoded in the [base + imm]
+ // addressing mode, and can't be encoded in an add/sub. If this happens, we'll
+ // end up with code like:
+ //
+ // mov x0, wide
+  //   add x1, base, x0
+ // ldr x2, [x1, x0]
+ //
+ // In this situation, we can use the [base, xreg] addressing mode to save an
+ // add/sub:
+ //
+ // mov x0, wide
+ // ldr x2, [base, x0]
+ auto ValAndVReg =
+ getConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI);
+ if (ValAndVReg) {
+ unsigned Scale = Log2_32(SizeInBytes);
+ int64_t ImmOff = ValAndVReg->Value.getSExtValue();
+
+    // Skip immediates that can be selected in the load/store addressing
+ // mode.
+ if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
+ ImmOff < (0x1000 << Scale))
+ return None;
+
+ // Helper lambda to decide whether or not it is preferable to emit an add.
+ auto isPreferredADD = [](int64_t ImmOff) {
+ // Constants in [0x0, 0xfff] can be encoded in an add.
+ if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
+ return true;
+
+ // Can it be encoded in an add lsl #12?
+ if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
+ return false;
+
+ // It can be encoded in an add lsl #12, but we may not want to. If it is
+ // possible to select this as a single movz, then prefer that. A single
+ // movz is faster than an add with a shift.
+ return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
+ (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
+ };
+
+ // If the immediate can be encoded in a single add/sub, then bail out.
+ if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
+ return None;
+ }
+
// Try to fold shifts into the addressing mode.
auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
if (AddrModeFns)
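The screening above can be restated as a predicate: keep a constant offset out of the [base, xreg] mode when it already fits the scaled unsigned 12-bit load/store immediate, or when a single ADD/SUB (optionally with lsl #12, unless a lone movz would do) can materialize it. The function below is a sketch under that reading, not the selector itself:

    #include <cstdint>

    bool offsetBetterHandledElsewhere(int64_t ImmOff, unsigned SizeInBytes,
                                      unsigned Scale /* Log2(SizeInBytes) */) {
      // Fits the scaled unsigned 12-bit [base + imm] form.
      if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
          ImmOff < (int64_t(0x1000) << Scale))
        return true;
      auto isPreferredADD = [](int64_t Off) {
        if ((Off & 0xfffffffffffff000LL) == 0)  // plain 12-bit add/sub
          return true;
        if ((Off & 0xffffffffff000fffLL) != 0)  // doesn't fit "lsl #12" either
          return false;
        // Fits add/sub lsl #12, but prefer a single movz when one suffices.
        return (Off & 0xffffffffff00ffffLL) != 0 &&
               (Off & 0xffffffffffff0fffLL) != 0;
      };
      // A single add/sub of the offset (or its negation) is still cheaper.
      return isPreferredADD(ImmOff) || isPreferredADD(-ImmOff);
    }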
@@ -5521,7 +5871,8 @@ void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
"Expected G_CONSTANT");
- Optional<int64_t> CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), MRI);
+ Optional<int64_t> CstVal =
+ getConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
assert(CstVal && "Expected constant value");
MIB.addImm(CstVal.getValue());
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 4ffde2a7e3c4..5a6c904e3f5d 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -14,6 +14,7 @@
#include "AArch64LegalizerInfo.h"
#include "AArch64Subtarget.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -22,6 +23,8 @@
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
+#include <initializer_list>
+#include "llvm/Support/MathExtras.h"
#define DEBUG_TYPE "aarch64-legalinfo"
@@ -53,6 +56,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
const LLT v2s64 = LLT::vector(2, 64);
const LLT v2p0 = LLT::vector(2, p0);
+ std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
+ v16s8, v8s16, v4s32,
+ v2s64, v2p0,
+ /* End 128bit types */
+ /* Begin 64bit types */
+ v8s8, v4s16, v2s32};
+
const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();
// FIXME: support subtargets which have neon/fp-armv8 disabled.
@@ -61,25 +71,31 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
return;
}
+ // Some instructions only support s16 if the subtarget has full 16-bit FP
+ // support.
+ const bool HasFP16 = ST.hasFullFP16();
+ const LLT &MinFPScalar = HasFP16 ? s16 : s32;
+
getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
- .legalFor({p0, s1, s8, s16, s32, s64, v2s32, v4s32, v2s64})
- .clampScalar(0, s1, s64)
- .widenScalarToNextPow2(0, 8)
- .fewerElementsIf(
- [=](const LegalityQuery &Query) {
- return Query.Types[0].isVector() &&
- (Query.Types[0].getElementType() != s64 ||
- Query.Types[0].getNumElements() != 2);
- },
- [=](const LegalityQuery &Query) {
- LLT EltTy = Query.Types[0].getElementType();
- if (EltTy == s64)
- return std::make_pair(0, LLT::vector(2, 64));
- return std::make_pair(0, EltTy);
- });
-
- getActionDefinitionsBuilder(G_PHI)
- .legalFor({p0, s16, s32, s64, v2s32, v4s32, v2s64})
+ .legalFor({p0, s1, s8, s16, s32, s64})
+ .legalFor(PackedVectorAllTypeList)
+ .clampScalar(0, s1, s64)
+ .widenScalarToNextPow2(0, 8)
+ .fewerElementsIf(
+ [=](const LegalityQuery &Query) {
+ return Query.Types[0].isVector() &&
+ (Query.Types[0].getElementType() != s64 ||
+ Query.Types[0].getNumElements() != 2);
+ },
+ [=](const LegalityQuery &Query) {
+ LLT EltTy = Query.Types[0].getElementType();
+ if (EltTy == s64)
+ return std::make_pair(0, LLT::vector(2, 64));
+ return std::make_pair(0, EltTy);
+ });
+
+ getActionDefinitionsBuilder(G_PHI).legalFor({p0, s16, s32, s64})
+ .legalFor(PackedVectorAllTypeList)
.clampScalar(0, s16, s64)
.widenScalarToNextPow2(0);
@@ -89,26 +105,38 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.widenScalarToNextPow2(0);
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
- .legalFor({s32, s64, v2s32, v4s32, v2s64, v8s16, v16s8})
+ .legalFor({s32, s64, v2s32, v4s32, v4s16, v8s16, v16s8, v8s8})
+ .scalarizeIf(
+ [=](const LegalityQuery &Query) {
+ return Query.Opcode == G_MUL && Query.Types[0] == v2s64;
+ },
+ 0)
+ .legalFor({v2s64})
.clampScalar(0, s32, s64)
.widenScalarToNextPow2(0)
.clampNumElements(0, v2s32, v4s32)
.clampNumElements(0, v2s64, v2s64)
.moreElementsToNextPow2(0);
- getActionDefinitionsBuilder(G_SHL)
+ getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
.customIf([=](const LegalityQuery &Query) {
const auto &SrcTy = Query.Types[0];
const auto &AmtTy = Query.Types[1];
return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
AmtTy.getSizeInBits() == 32;
})
- .legalFor({{s32, s32},
- {s64, s64},
- {s32, s64},
- {v2s32, v2s32},
- {v4s32, v4s32},
- {v2s64, v2s64}})
+ .legalFor({
+ {s32, s32},
+ {s32, s64},
+ {s64, s64},
+ {v8s8, v8s8},
+ {v16s8, v16s8},
+ {v4s16, v4s16},
+ {v8s16, v8s16},
+ {v2s32, v2s32},
+ {v4s32, v4s32},
+ {v2s64, v2s64},
+ })
.clampScalar(1, s32, s64)
.clampScalar(0, s32, s64)
.widenScalarToNextPow2(0)
@@ -130,43 +158,28 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.widenScalarToNextPow2(0)
.scalarize(0);
- getActionDefinitionsBuilder({G_LSHR, G_ASHR})
- .customIf([=](const LegalityQuery &Query) {
- const auto &SrcTy = Query.Types[0];
- const auto &AmtTy = Query.Types[1];
- return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
- AmtTy.getSizeInBits() == 32;
- })
- .legalFor({{s32, s32},
- {s32, s64},
- {s64, s64},
- {v2s32, v2s32},
- {v4s32, v4s32},
- {v2s64, v2s64}})
- .clampScalar(1, s32, s64)
- .clampScalar(0, s32, s64)
- .minScalarSameAs(1, 0);
-
getActionDefinitionsBuilder({G_SREM, G_UREM})
.lowerFor({s1, s8, s16, s32, s64});
- getActionDefinitionsBuilder({G_SMULO, G_UMULO})
- .lowerFor({{s64, s1}});
+ getActionDefinitionsBuilder({G_SMULO, G_UMULO}).lowerFor({{s64, s1}});
getActionDefinitionsBuilder({G_SMULH, G_UMULH}).legalFor({s32, s64});
- getActionDefinitionsBuilder({G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO})
+ getActionDefinitionsBuilder(
+ {G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
.legalFor({{s32, s1}, {s64, s1}})
.minScalar(0, s32);
getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG})
- .legalFor({s32, s64, v2s64, v4s32, v2s32});
+ .legalFor({s32, s64, v2s64, v4s32, v2s32})
+ .clampNumElements(0, v2s32, v4s32)
+ .clampNumElements(0, v2s64, v2s64);
getActionDefinitionsBuilder(G_FREM).libcallFor({s32, s64});
getActionDefinitionsBuilder({G_FCEIL, G_FABS, G_FSQRT, G_FFLOOR, G_FRINT,
G_FMA, G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND,
- G_FNEARBYINT})
+ G_FNEARBYINT, G_INTRINSIC_LRINT})
// If we don't have full FP16 support, then scalarize the elements of
// vectors containing fp16 types.
.fewerElementsIf(
@@ -272,8 +285,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
{v4s32, p0, 128, 8},
{v2s64, p0, 128, 8}})
// These extends are also legal
- .legalForTypesWithMemDesc({{s32, p0, 8, 8},
- {s32, p0, 16, 8}})
+ .legalForTypesWithMemDesc({{s32, p0, 8, 8}, {s32, p0, 16, 8}})
.clampScalar(0, s8, s64)
.lowerIfMemSizeNotPow2()
// Lower any any-extending loads left into G_ANYEXT and G_LOAD
@@ -295,6 +307,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
{p0, p0, 64, 8},
{s128, p0, 128, 8},
{v16s8, p0, 128, 8},
+ {v8s8, p0, 64, 8},
{v4s16, p0, 64, 8},
{v8s16, p0, 128, 8},
{v2s32, p0, 64, 8},
@@ -312,14 +325,19 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
// Constants
getActionDefinitionsBuilder(G_CONSTANT)
- .legalFor({p0, s8, s16, s32, s64})
+ .legalFor({p0, s8, s16, s32, s64})
.clampScalar(0, s8, s64)
.widenScalarToNextPow2(0);
getActionDefinitionsBuilder(G_FCONSTANT)
- .legalFor({s32, s64})
- .clampScalar(0, s32, s64);
+ .legalIf([=](const LegalityQuery &Query) {
+ const auto &Ty = Query.Types[0];
+ if (HasFP16 && Ty == s16)
+ return true;
+ return Ty == s32 || Ty == s64 || Ty == s128;
+ })
+ .clampScalar(0, MinFPScalar, s128);
- getActionDefinitionsBuilder(G_ICMP)
+ getActionDefinitionsBuilder({G_ICMP, G_FCMP})
.legalFor({{s32, s32},
{s32, s64},
{s32, p0},
@@ -347,13 +365,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.minScalarOrEltIf(
[=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0,
s64)
- .widenScalarOrEltToNextPow2(1);
-
- getActionDefinitionsBuilder(G_FCMP)
- .legalFor({{s32, s32}, {s32, s64}})
- .clampScalar(0, s32, s32)
- .clampScalar(1, s32, s64)
- .widenScalarToNextPow2(1);
+ .widenScalarOrEltToNextPow2(1)
+ .clampNumElements(0, v2s32, v4s32);
// Extensions
auto ExtLegalFunc = [=](const LegalityQuery &Query) {
@@ -361,7 +374,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
if (DstSize == 128 && !Query.Types[0].isVector())
return false; // Extending to a scalar s128 needs narrowing.
-
+
// Make sure that we have something that will fit in a register, and
// make sure it's a power of 2.
if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize))
@@ -386,17 +399,28 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.legalIf(ExtLegalFunc)
.clampScalar(0, s64, s64); // Just for s128, others are handled above.
- getActionDefinitionsBuilder(G_TRUNC).alwaysLegal();
+ getActionDefinitionsBuilder(G_TRUNC)
+ .minScalarOrEltIf(
+ [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
+ 0, s8)
+ .customIf([=](const LegalityQuery &Query) {
+ LLT DstTy = Query.Types[0];
+ LLT SrcTy = Query.Types[1];
+ return DstTy == v8s8 && SrcTy.getSizeInBits() > 128;
+ })
+ .alwaysLegal();
- getActionDefinitionsBuilder(G_SEXT_INREG)
- .legalFor({s32, s64})
- .lower();
+ getActionDefinitionsBuilder(G_SEXT_INREG).legalFor({s32, s64}).lower();
// FP conversions
- getActionDefinitionsBuilder(G_FPTRUNC).legalFor(
- {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}});
- getActionDefinitionsBuilder(G_FPEXT).legalFor(
- {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}});
+ getActionDefinitionsBuilder(G_FPTRUNC)
+ .legalFor(
+ {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
+ .clampMaxNumElements(0, s32, 2);
+ getActionDefinitionsBuilder(G_FPEXT)
+ .legalFor(
+ {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}})
+ .clampMaxNumElements(0, s64, 2);
// Conversions
getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
@@ -409,7 +433,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
.legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
.clampScalar(1, s32, s64)
- .widenScalarToNextPow2(1)
+ .minScalarSameAs(1, 0)
.clampScalar(0, s32, s64)
.widenScalarToNextPow2(0);
@@ -417,14 +441,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder(G_BRCOND).legalFor({s1, s8, s16, s32});
getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});
- // Select
- // FIXME: We can probably do a bit better than just scalarizing vector
- // selects.
getActionDefinitionsBuilder(G_SELECT)
.legalFor({{s32, s1}, {s64, s1}, {p0, s1}})
.clampScalar(0, s32, s64)
.widenScalarToNextPow2(0)
- .scalarize(0);
+ .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
+ .lowerIf(isVector(0));
// Pointer-handling
getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
@@ -554,8 +576,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
return BigTy.getSizeInBits() % LitTy.getSizeInBits() == 0;
})
// Any vectors left are the wrong size. Scalarize them.
- .scalarize(0)
- .scalarize(1);
+ .scalarize(0)
+ .scalarize(1);
}
getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
@@ -567,18 +589,40 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.legalIf([=](const LegalityQuery &Query) {
const LLT &VecTy = Query.Types[1];
return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 ||
- VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32;
- });
+ VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 ||
+ VecTy == v16s8 || VecTy == v2s32 || VecTy == v2p0;
+ })
+ .minScalarOrEltIf(
+ [=](const LegalityQuery &Query) {
+            // We want to promote <M x s1> to <M x s64> if that wouldn't
+ // cause the total vec size to be > 128b.
+ return Query.Types[1].getNumElements() <= 2;
+ },
+ 0, s64)
+ .minScalarOrEltIf(
+ [=](const LegalityQuery &Query) {
+ return Query.Types[1].getNumElements() <= 4;
+ },
+ 0, s32)
+ .minScalarOrEltIf(
+ [=](const LegalityQuery &Query) {
+ return Query.Types[1].getNumElements() <= 8;
+ },
+ 0, s16)
+ .minScalarOrEltIf(
+ [=](const LegalityQuery &Query) {
+ return Query.Types[1].getNumElements() <= 16;
+ },
+ 0, s8)
+ .minScalarOrElt(0, s8); // Worst case, we need at least s8.
getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
- .legalIf([=](const LegalityQuery &Query) {
- const LLT &VecTy = Query.Types[0];
- // TODO: Support s8 and s16
- return VecTy == v2s32 || VecTy == v4s32 || VecTy == v2s64;
- });
+ .legalIf(typeInSet(0, {v8s16, v2s32, v4s32, v2s64}));
getActionDefinitionsBuilder(G_BUILD_VECTOR)
- .legalFor({{v4s16, s16},
+ .legalFor({{v8s8, s8},
+ {v16s8, s8},
+ {v4s16, s16},
{v8s16, s16},
{v2s32, s32},
{v4s32, s32},
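Returning to the G_EXTRACT_VECTOR_ELT promotion ladder a few lines up: the four minScalarOrEltIf clauses plus the final minScalarOrElt(0, s8) together enforce a minimum result-element width of roughly 128 / NumElts bits, so the promoted source vector still fits a 128-bit register. A sketch of that intent, assuming a power-of-two element count (the helper is illustrative only):

    // Minimum destination element width enforced by the rules above.
    unsigned minExtractEltBits(unsigned NumElts) {
      if (NumElts <= 2)
        return 64;
      if (NumElts <= 4)
        return 32;
      if (NumElts <= 8)
        return 16;
      return 8;  // worst case, we need at least s8
    }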
@@ -594,8 +638,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
})
.minScalarSameAs(1, 0);
- getActionDefinitionsBuilder(G_CTLZ).legalForCartesianProduct(
- {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
+ getActionDefinitionsBuilder(G_CTLZ)
+ .legalForCartesianProduct(
+ {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
.scalarize(1);
getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
@@ -606,7 +651,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
// to be the same size as the dest.
if (DstTy != SrcTy)
return false;
- for (auto &Ty : {v2s32, v4s32, v2s64}) {
+ for (auto &Ty : {v2s32, v4s32, v2s64, v2p0, v16s8, v8s16}) {
if (DstTy == Ty)
return true;
}
@@ -623,8 +668,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder(G_CONCAT_VECTORS)
.legalFor({{v4s32, v2s32}, {v8s16, v4s16}});
- getActionDefinitionsBuilder(G_JUMP_TABLE)
- .legalFor({{p0}, {s64}});
+ getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({{p0}, {s64}});
getActionDefinitionsBuilder(G_BRJT).legalIf([=](const LegalityQuery &Query) {
return Query.Types[0] == p0 && Query.Types[1] == s64;
@@ -632,6 +676,20 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower();
+ getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall();
+
+ getActionDefinitionsBuilder(G_ABS).lowerIf(
+ [=](const LegalityQuery &Query) { return Query.Types[0].isScalar(); });
+
+ getActionDefinitionsBuilder(G_VECREDUCE_FADD)
+ // We only have FADDP to do reduction-like operations. Lower the rest.
+ .legalFor({{s32, v2s32}, {s64, v2s64}})
+ .lower();
+
+ getActionDefinitionsBuilder(G_VECREDUCE_ADD)
+ .legalFor({{s8, v16s8}, {s16, v8s16}, {s32, v4s32}, {s64, v2s64}})
+ .lower();
+
computeTables();
verify(*ST.getInstrInfo());
}
@@ -656,15 +714,63 @@ bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
case TargetOpcode::G_GLOBAL_VALUE:
return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
+ case TargetOpcode::G_TRUNC:
+ return legalizeVectorTrunc(MI, Helper);
}
llvm_unreachable("expected switch to return");
}
-bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder,
- GISelChangeObserver &Observer) const {
+static void extractParts(Register Reg, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder, LLT Ty, int NumParts,
+ SmallVectorImpl<Register> &VRegs) {
+ for (int I = 0; I < NumParts; ++I)
+ VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
+ MIRBuilder.buildUnmerge(VRegs, Reg);
+}
+
+bool AArch64LegalizerInfo::legalizeVectorTrunc(
+ MachineInstr &MI, LegalizerHelper &Helper) const {
+ MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
+ MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+  // Similar to how operand splitting is done in SelectionDAG, we can handle
+ // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
+ // %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
+ // %lo16(<4 x s16>) = G_TRUNC %inlo
+ // %hi16(<4 x s16>) = G_TRUNC %inhi
+ // %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
+ // %res(<8 x s8>) = G_TRUNC %in16
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ LLT SrcTy = MRI.getType(SrcReg);
+ assert(isPowerOf2_32(DstTy.getSizeInBits()) &&
+ isPowerOf2_32(SrcTy.getSizeInBits()));
+
+ // Split input type.
+ LLT SplitSrcTy = SrcTy.changeNumElements(SrcTy.getNumElements() / 2);
+ // First, split the source into two smaller vectors.
+ SmallVector<Register, 2> SplitSrcs;
+ extractParts(SrcReg, MRI, MIRBuilder, SplitSrcTy, 2, SplitSrcs);
+
+ // Truncate the splits into intermediate narrower elements.
+ LLT InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2);
+ for (unsigned I = 0; I < SplitSrcs.size(); ++I)
+ SplitSrcs[I] = MIRBuilder.buildTrunc(InterTy, SplitSrcs[I]).getReg(0);
+
+ auto Concat = MIRBuilder.buildConcatVectors(
+ DstTy.changeElementSize(DstTy.getScalarSizeInBits() * 2), SplitSrcs);
+
+ Helper.Observer.changingInstr(MI);
+ MI.getOperand(1).setReg(Concat.getReg(0));
+ Helper.Observer.changedInstr(MI);
+ return true;
+}
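The type progression the comment in legalizeVectorTrunc describes for v8s8 = G_TRUNC v8s32 can be checked with plain bookkeeping over (elements, element bits) pairs; the snippet below is only that arithmetic, not the legalizer:

    #include <cassert>
    #include <utility>

    using VecShape = std::pair<unsigned, unsigned>; // (NumElts, EltBits)

    int main() {
      VecShape Src{8, 32}, Dst{8, 8};
      VecShape Half{Src.first / 2, Src.second};       // unmerge: 2 x v4s32
      assert(Half == VecShape(4, 32));
      VecShape Inter{Half.first, Dst.second * 2};     // trunc halves: v4s16
      assert(Inter == VecShape(4, 16));
      VecShape Concat{Inter.first * 2, Inter.second}; // concat: v8s16
      assert(Concat == VecShape(8, 16));              // final G_TRUNC -> v8s8
      return 0;
    }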
+
+bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
+ GISelChangeObserver &Observer) const {
assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
// We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
// G_ADD_LOW instructions.
@@ -686,6 +792,27 @@ bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(MachineInstr &MI,
// Set the regclass on the dest reg too.
MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
+ // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
+ // by creating a MOVK that sets bits 48-63 of the register to (global address
+ // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
+  // prevent an incorrect tag being generated during relocation when the
+ // global appears before the code section. Without the offset, a global at
+ // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
+ // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
+ // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
+ // instead of `0xf`.
+ // This assumes that we're in the small code model so we can assume a binary
+ // size of <= 4GB, which makes the untagged PC relative offset positive. The
+ // binary must also be loaded into address range [0, 2^48). Both of these
+ // properties need to be ensured at runtime when using tagged addresses.
+ if (OpFlags & AArch64II::MO_TAGGED) {
+ ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
+ .addGlobalAddress(GV, 0x100000000,
+ AArch64II::MO_PREL | AArch64II::MO_G3)
+ .addImm(48);
+ MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
+ }
+
MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
.addGlobalAddress(GV, 0,
OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
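The MO_TAGGED arithmetic in the comment above is easy to verify numerically: without the 2^32 bias, the borrow from the untagged part of the subtraction leaks into the tag byte; with the bias it does not. A standalone check using exactly the addresses from the comment:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t Global = 0x0f00000000001000ULL; // tag 0xf, address 0x1000
      const uint64_t PC = 0x2000;                    // referencing code
      const uint64_t Bias = 0x100000000ULL;          // 2^32, added by the MOVK

      assert(((Global - PC) >> 56) == 0x0e);         // tag corrupted to 0xe
      assert((((Global + Bias - PC) >> 48) & 0xffff) == 0x0f00); // MOVK value
      assert(((Global + Bias - PC) >> 56) == 0x0f);  // tag preserved as 0xf
      return 0;
    }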
@@ -693,21 +820,8 @@ bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(MachineInstr &MI,
return true;
}
-bool AArch64LegalizerInfo::legalizeIntrinsic(
- LegalizerHelper &Helper, MachineInstr &MI) const {
- MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
- switch (MI.getIntrinsicID()) {
- case Intrinsic::memcpy:
- case Intrinsic::memset:
- case Intrinsic::memmove:
- if (createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI) ==
- LegalizerHelper::UnableToLegalize)
- return false;
- MI.eraseFromParent();
- return true;
- default:
- break;
- }
+bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
return true;
}
@@ -724,11 +838,13 @@ bool AArch64LegalizerInfo::legalizeShlAshrLshr(
if (!VRegAndVal)
return true;
// Check the shift amount is in range for an immediate form.
- int64_t Amount = VRegAndVal->Value;
+ int64_t Amount = VRegAndVal->Value.getSExtValue();
if (Amount > 31)
return true; // This will have to remain a register variant.
auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount);
+ Observer.changingInstr(MI);
MI.getOperand(2).setReg(ExtCst.getReg(0));
+ Observer.changedInstr(MI);
return true;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
index 1cb24559c1ab..8217e37c8512 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -15,6 +15,7 @@
#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINELEGALIZER_H
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
namespace llvm {
@@ -45,6 +46,7 @@ private:
bool legalizeSmallCMGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &MIRBuilder,
GISelChangeObserver &Observer) const;
+ bool legalizeVectorTrunc(MachineInstr &MI, LegalizerHelper &Helper) const;
const AArch64Subtarget *ST;
};
} // End llvm namespace.
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
index baa8515baf3e..fdd04cb77fad 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -1,17 +1,22 @@
- //=== lib/CodeGen/GlobalISel/AArch64PostLegalizerCombiner.cpp -------------===//
+//=== AArch64PostLegalizerCombiner.cpp --------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// This performs post-legalization combines on generic MachineInstrs.
-//
-// Any combine that this pass performs must preserve instruction legality.
-// Combines unconcerned with legality should be handled by the
-// PreLegalizerCombiner instead.
-//
+///
+/// \file
+/// Post-legalization combines on generic MachineInstrs.
+///
+/// The combines here must preserve instruction legality.
+///
+/// Lowering combines (e.g. pseudo matching) should be handled by
+/// AArch64PostLegalizerLowering.
+///
+/// Combines which don't rely on instruction legality should go in the
+/// AArch64PreLegalizerCombiner.
+///
//===----------------------------------------------------------------------===//
#include "AArch64TargetMachine.h"
@@ -19,373 +24,215 @@
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
-#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#define DEBUG_TYPE "aarch64-postlegalizer-combiner"
using namespace llvm;
-using namespace MIPatternMatch;
-
-/// Represents a pseudo instruction which replaces a G_SHUFFLE_VECTOR.
-///
-/// Used for matching target-supported shuffles before codegen.
-struct ShuffleVectorPseudo {
- unsigned Opc; ///< Opcode for the instruction. (E.g. G_ZIP1)
- Register Dst; ///< Destination register.
- SmallVector<SrcOp, 2> SrcOps; ///< Source registers.
- ShuffleVectorPseudo(unsigned Opc, Register Dst,
- std::initializer_list<SrcOp> SrcOps)
- : Opc(Opc), Dst(Dst), SrcOps(SrcOps){};
- ShuffleVectorPseudo() {}
-};
-
-/// \returns The splat index of a G_SHUFFLE_VECTOR \p MI when \p MI is a splat.
-/// If \p MI is not a splat, returns None.
-static Optional<int> getSplatIndex(MachineInstr &MI) {
- assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
- "Only G_SHUFFLE_VECTOR can have a splat index!");
- ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
- auto FirstDefinedIdx = find_if(Mask, [](int Elt) { return Elt >= 0; });
-
- // If all elements are undefined, this shuffle can be considered a splat.
- // Return 0 for better potential for callers to simplify.
- if (FirstDefinedIdx == Mask.end())
- return 0;
-
- // Make sure all remaining elements are either undef or the same
- // as the first non-undef value.
- int SplatValue = *FirstDefinedIdx;
- if (any_of(make_range(std::next(FirstDefinedIdx), Mask.end()),
- [&SplatValue](int Elt) { return Elt >= 0 && Elt != SplatValue; }))
- return None;
-
- return SplatValue;
-}
-
-/// Check if a vector shuffle corresponds to a REV instruction with the
-/// specified blocksize.
-static bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts,
- unsigned BlockSize) {
- assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
- "Only possible block sizes for REV are: 16, 32, 64");
- assert(EltSize != 64 && "EltSize cannot be 64 for REV mask.");
-
- unsigned BlockElts = M[0] + 1;
- // If the first shuffle index is UNDEF, be optimistic.
- if (M[0] < 0)
- BlockElts = BlockSize / EltSize;
-
- if (BlockSize <= EltSize || BlockSize != BlockElts * EltSize)
+/// This combine tries to do what performExtractVectorEltCombine does in SDAG.
+/// Rewrite for pairwise fadd pattern
+/// (s32 (g_extract_vector_elt
+///        (g_fadd (vXs32 Other)
+///                (g_vector_shuffle (vXs32 Other) undef <1,X,...> )) 0))
+/// ->
+/// (s32 (g_fadd (g_extract_vector_elt (vXs32 Other) 0)
+///              (g_extract_vector_elt (vXs32 Other) 1)))
+bool matchExtractVecEltPairwiseAdd(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ std::tuple<unsigned, LLT, Register> &MatchInfo) {
+ Register Src1 = MI.getOperand(1).getReg();
+ Register Src2 = MI.getOperand(2).getReg();
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+
+ auto Cst = getConstantVRegValWithLookThrough(Src2, MRI);
+ if (!Cst || Cst->Value != 0)
return false;
+ // SDAG also checks for FullFP16, but this looks to be beneficial anyway.
- for (unsigned i = 0; i < NumElts; ++i) {
- // Ignore undef indices.
- if (M[i] < 0)
- continue;
- if (static_cast<unsigned>(M[i]) !=
- (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
- return false;
- }
-
- return true;
-}
-
-/// Determines if \p M is a shuffle vector mask for a TRN of \p NumElts.
-/// Whether or not G_TRN1 or G_TRN2 should be used is stored in \p WhichResult.
-static bool isTRNMask(ArrayRef<int> M, unsigned NumElts,
- unsigned &WhichResult) {
- if (NumElts % 2 != 0)
+ // Now check for an fadd operation. TODO: expand this for integer add?
+ auto *FAddMI = getOpcodeDef(TargetOpcode::G_FADD, Src1, MRI);
+ if (!FAddMI)
return false;
- WhichResult = (M[0] == 0 ? 0 : 1);
- for (unsigned i = 0; i < NumElts; i += 2) {
- if ((M[i] >= 0 && static_cast<unsigned>(M[i]) != i + WhichResult) ||
- (M[i + 1] >= 0 &&
- static_cast<unsigned>(M[i + 1]) != i + NumElts + WhichResult))
- return false;
- }
- return true;
-}
-
-/// Check if a G_EXT instruction can handle a shuffle mask \p M when the vector
-/// sources of the shuffle are different.
-static Optional<std::pair<bool, uint64_t>> getExtMask(ArrayRef<int> M,
- unsigned NumElts) {
- // Look for the first non-undef element.
- auto FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
- if (FirstRealElt == M.end())
- return None;
-
- // Use APInt to handle overflow when calculating expected element.
- unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
- APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
-
- // The following shuffle indices must be the successive elements after the
- // first real element.
- if (any_of(
- make_range(std::next(FirstRealElt), M.end()),
- [&ExpectedElt](int Elt) { return Elt != ExpectedElt++ && Elt >= 0; }))
- return None;
-
- // The index of an EXT is the first element if it is not UNDEF.
- // Watch out for the beginning UNDEFs. The EXT index should be the expected
- // value of the first element. E.g.
- // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
- // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
- // ExpectedElt is the last mask index plus 1.
- uint64_t Imm = ExpectedElt.getZExtValue();
- bool ReverseExt = false;
-
- // There are two difference cases requiring to reverse input vectors.
- // For example, for vector <4 x i32> we have the following cases,
- // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
- // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
- // For both cases, we finally use mask <5, 6, 7, 0>, which requires
- // to reverse two input vectors.
- if (Imm < NumElts)
- ReverseExt = true;
- else
- Imm -= NumElts;
- return std::make_pair(ReverseExt, Imm);
-}
-
-/// Determines if \p M is a shuffle vector mask for a UZP of \p NumElts.
-/// Whether or not G_UZP1 or G_UZP2 should be used is stored in \p WhichResult.
-static bool isUZPMask(ArrayRef<int> M, unsigned NumElts,
- unsigned &WhichResult) {
- WhichResult = (M[0] == 0 ? 0 : 1);
- for (unsigned i = 0; i != NumElts; ++i) {
- // Skip undef indices.
- if (M[i] < 0)
- continue;
- if (static_cast<unsigned>(M[i]) != 2 * i + WhichResult)
- return false;
- }
- return true;
-}
-/// \return true if \p M is a zip mask for a shuffle vector of \p NumElts.
-/// Whether or not G_ZIP1 or G_ZIP2 should be used is stored in \p WhichResult.
-static bool isZipMask(ArrayRef<int> M, unsigned NumElts,
- unsigned &WhichResult) {
- if (NumElts % 2 != 0)
+ // If we add support for integer add, must restrict these types to just s64.
+ unsigned DstSize = DstTy.getSizeInBits();
+ if (DstSize != 16 && DstSize != 32 && DstSize != 64)
return false;
- // 0 means use ZIP1, 1 means use ZIP2.
- WhichResult = (M[0] == 0 ? 0 : 1);
- unsigned Idx = WhichResult * NumElts / 2;
- for (unsigned i = 0; i != NumElts; i += 2) {
- if ((M[i] >= 0 && static_cast<unsigned>(M[i]) != Idx) ||
- (M[i + 1] >= 0 && static_cast<unsigned>(M[i + 1]) != Idx + NumElts))
- return false;
- Idx += 1;
+ Register Src1Op1 = FAddMI->getOperand(1).getReg();
+ Register Src1Op2 = FAddMI->getOperand(2).getReg();
+ MachineInstr *Shuffle =
+ getOpcodeDef(TargetOpcode::G_SHUFFLE_VECTOR, Src1Op2, MRI);
+ MachineInstr *Other = MRI.getVRegDef(Src1Op1);
+ if (!Shuffle) {
+ Shuffle = getOpcodeDef(TargetOpcode::G_SHUFFLE_VECTOR, Src1Op1, MRI);
+ Other = MRI.getVRegDef(Src1Op2);
}
- return true;
-}
-
-/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with a
-/// G_REV instruction. Returns the appropriate G_REV opcode in \p Opc.
-static bool matchREV(MachineInstr &MI, MachineRegisterInfo &MRI,
- ShuffleVectorPseudo &MatchInfo) {
- assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
- ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
- Register Dst = MI.getOperand(0).getReg();
- Register Src = MI.getOperand(1).getReg();
- LLT Ty = MRI.getType(Dst);
- unsigned EltSize = Ty.getScalarSizeInBits();
-
- // Element size for a rev cannot be 64.
- if (EltSize == 64)
- return false;
- unsigned NumElts = Ty.getNumElements();
-
- // Try to produce G_REV64
- if (isREVMask(ShuffleMask, EltSize, NumElts, 64)) {
- MatchInfo = ShuffleVectorPseudo(AArch64::G_REV64, Dst, {Src});
+ // We're looking for a shuffle that moves the second element to index 0.
+ if (Shuffle && Shuffle->getOperand(3).getShuffleMask()[0] == 1 &&
+ Other == MRI.getVRegDef(Shuffle->getOperand(1).getReg())) {
+ std::get<0>(MatchInfo) = TargetOpcode::G_FADD;
+ std::get<1>(MatchInfo) = DstTy;
+ std::get<2>(MatchInfo) = Other->getOperand(0).getReg();
return true;
}
-
- // TODO: Produce G_REV32 and G_REV16 once we have proper legalization support.
- // This should be identical to above, but with a constant 32 and constant
- // 16.
return false;
}
-/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with
-/// a G_TRN1 or G_TRN2 instruction.
-static bool matchTRN(MachineInstr &MI, MachineRegisterInfo &MRI,
- ShuffleVectorPseudo &MatchInfo) {
- assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
- unsigned WhichResult;
- ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
- Register Dst = MI.getOperand(0).getReg();
- unsigned NumElts = MRI.getType(Dst).getNumElements();
- if (!isTRNMask(ShuffleMask, NumElts, WhichResult))
- return false;
- unsigned Opc = (WhichResult == 0) ? AArch64::G_TRN1 : AArch64::G_TRN2;
- Register V1 = MI.getOperand(1).getReg();
- Register V2 = MI.getOperand(2).getReg();
- MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2});
+bool applyExtractVecEltPairwiseAdd(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
+ std::tuple<unsigned, LLT, Register> &MatchInfo) {
+ unsigned Opc = std::get<0>(MatchInfo);
+ assert(Opc == TargetOpcode::G_FADD && "Unexpected opcode!");
+ // We want to generate two extracts of elements 0 and 1, and add them.
+ LLT Ty = std::get<1>(MatchInfo);
+ Register Src = std::get<2>(MatchInfo);
+ LLT s64 = LLT::scalar(64);
+ B.setInstrAndDebugLoc(MI);
+ auto Elt0 = B.buildExtractVectorElement(Ty, Src, B.buildConstant(s64, 0));
+ auto Elt1 = B.buildExtractVectorElement(Ty, Src, B.buildConstant(s64, 1));
+ B.buildInstr(Opc, {MI.getOperand(0).getReg()}, {Elt0, Elt1});
+ MI.eraseFromParent();
return true;
}
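
Concretely, for a two-lane vector the combine replaces a lane-0 extract of
(G_FADD v, shuffle(v, undef, <1, ...>)) with a scalar G_FADD of lanes 0 and 1.
A minimal standalone sketch with plain floats (the array contents are
illustrative, not taken from the code above):

#include <cassert>

int main() {
  float v[2] = {1.5f, 2.25f};
  // The <1, ...> shuffle moves lane 1 into lane 0 of the shuffled vector.
  float shuffled0 = v[1];
  float faddLane0 = v[0] + shuffled0; // lane 0 of the vector G_FADD
  float rewritten = v[0] + v[1];      // two extracts plus a scalar G_FADD
  assert(faddLane0 == rewritten);
  return 0;
}
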
-/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with
-/// a G_UZP1 or G_UZP2 instruction.
-///
-/// \param [in] MI - The shuffle vector instruction.
-/// \param [out] MatchInfo - Either G_UZP1 or G_UZP2 on success.
-static bool matchUZP(MachineInstr &MI, MachineRegisterInfo &MRI,
- ShuffleVectorPseudo &MatchInfo) {
- assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
- unsigned WhichResult;
- ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
- Register Dst = MI.getOperand(0).getReg();
- unsigned NumElts = MRI.getType(Dst).getNumElements();
- if (!isUZPMask(ShuffleMask, NumElts, WhichResult))
- return false;
- unsigned Opc = (WhichResult == 0) ? AArch64::G_UZP1 : AArch64::G_UZP2;
- Register V1 = MI.getOperand(1).getReg();
- Register V2 = MI.getOperand(2).getReg();
- MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2});
- return true;
+static bool isSignExtended(Register R, MachineRegisterInfo &MRI) {
+ // TODO: check if extended build vector as well.
+ unsigned Opc = MRI.getVRegDef(R)->getOpcode();
+ return Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG;
}
-static bool matchZip(MachineInstr &MI, MachineRegisterInfo &MRI,
- ShuffleVectorPseudo &MatchInfo) {
- assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
- unsigned WhichResult;
- ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
- Register Dst = MI.getOperand(0).getReg();
- unsigned NumElts = MRI.getType(Dst).getNumElements();
- if (!isZipMask(ShuffleMask, NumElts, WhichResult))
- return false;
- unsigned Opc = (WhichResult == 0) ? AArch64::G_ZIP1 : AArch64::G_ZIP2;
- Register V1 = MI.getOperand(1).getReg();
- Register V2 = MI.getOperand(2).getReg();
- MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2});
- return true;
+static bool isZeroExtended(Register R, MachineRegisterInfo &MRI) {
+ // TODO: check if extended build vector as well.
+ return MRI.getVRegDef(R)->getOpcode() == TargetOpcode::G_ZEXT;
}
-/// Helper function for matchDup.
-static bool matchDupFromInsertVectorElt(int Lane, MachineInstr &MI,
- MachineRegisterInfo &MRI,
- ShuffleVectorPseudo &MatchInfo) {
- if (Lane != 0)
- return false;
-
- // Try to match a vector splat operation into a dup instruction.
- // We're looking for this pattern:
- //
- // %scalar:gpr(s64) = COPY $x0
- // %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF
- // %cst0:gpr(s32) = G_CONSTANT i32 0
- // %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32)
- // %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32)
- // %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef, %zerovec(<2 x s32>)
- //
- // ...into:
- // %splat = G_DUP %scalar
-
- // Begin matching the insert.
- auto *InsMI = getOpcodeDef(TargetOpcode::G_INSERT_VECTOR_ELT,
- MI.getOperand(1).getReg(), MRI);
- if (!InsMI)
- return false;
- // Match the undef vector operand.
- if (!getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, InsMI->getOperand(1).getReg(),
- MRI))
- return false;
-
- // Match the index constant 0.
- int64_t Index = 0;
- if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ICst(Index)) || Index)
- return false;
-
- MatchInfo = ShuffleVectorPseudo(AArch64::G_DUP, MI.getOperand(0).getReg(),
- {InsMI->getOperand(2).getReg()});
- return true;
-}
+bool matchAArch64MulConstCombine(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ std::function<void(MachineIRBuilder &B, Register DstReg)> &ApplyFn) {
+ assert(MI.getOpcode() == TargetOpcode::G_MUL);
+ Register LHS = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+ Register Dst = MI.getOperand(0).getReg();
+ const LLT Ty = MRI.getType(LHS);
-/// Helper function for matchDup.
-static bool matchDupFromBuildVector(int Lane, MachineInstr &MI,
- MachineRegisterInfo &MRI,
- ShuffleVectorPseudo &MatchInfo) {
- assert(Lane >= 0 && "Expected positive lane?");
- // Test if the LHS is a BUILD_VECTOR. If it is, then we can just reference the
- // lane's definition directly.
- auto *BuildVecMI = getOpcodeDef(TargetOpcode::G_BUILD_VECTOR,
- MI.getOperand(1).getReg(), MRI);
- if (!BuildVecMI)
+ // The below optimizations require a constant RHS.
+ auto Const = getConstantVRegValWithLookThrough(RHS, MRI);
+ if (!Const)
return false;
- Register Reg = BuildVecMI->getOperand(Lane + 1).getReg();
- MatchInfo =
- ShuffleVectorPseudo(AArch64::G_DUP, MI.getOperand(0).getReg(), {Reg});
- return true;
-}
-static bool matchDup(MachineInstr &MI, MachineRegisterInfo &MRI,
- ShuffleVectorPseudo &MatchInfo) {
- assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
- auto MaybeLane = getSplatIndex(MI);
- if (!MaybeLane)
- return false;
- int Lane = *MaybeLane;
- // If this is undef splat, generate it via "just" vdup, if possible.
- if (Lane < 0)
- Lane = 0;
- if (matchDupFromInsertVectorElt(Lane, MI, MRI, MatchInfo))
- return true;
- if (matchDupFromBuildVector(Lane, MI, MRI, MatchInfo))
- return true;
- return false;
-}
+ const APInt ConstValue = Const->Value.sextOrSelf(Ty.getSizeInBits());
+ // The following code is ported from AArch64ISelLowering.
+ // Multiplication of a power of two plus/minus one can be done more
+  // cheaply as a shift+add/sub. For now, this is true unilaterally. If
+ // future CPUs have a cheaper MADD instruction, this may need to be
+ // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
+ // 64-bit is 5 cycles, so this is always a win.
+ // More aggressively, some multiplications N0 * C can be lowered to
+ // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
+ // e.g. 6=3*2=(2+1)*2.
+ // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
+  // which equals (1+2)*16-(1+2).
+ // TrailingZeroes is used to test if the mul can be lowered to
+ // shift+add+shift.
+ unsigned TrailingZeroes = ConstValue.countTrailingZeros();
+ if (TrailingZeroes) {
+ // Conservatively do not lower to shift+add+shift if the mul might be
+ // folded into smul or umul.
+ if (MRI.hasOneNonDBGUse(LHS) &&
+ (isSignExtended(LHS, MRI) || isZeroExtended(LHS, MRI)))
+ return false;
+ // Conservatively do not lower to shift+add+shift if the mul might be
+ // folded into madd or msub.
+ if (MRI.hasOneNonDBGUse(Dst)) {
+ MachineInstr &UseMI = *MRI.use_instr_begin(Dst);
+ if (UseMI.getOpcode() == TargetOpcode::G_ADD ||
+ UseMI.getOpcode() == TargetOpcode::G_SUB)
+ return false;
+ }
+ }
+ // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
+ // and shift+add+shift.
+ APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
+
+ unsigned ShiftAmt, AddSubOpc;
+ // Is the shifted value the LHS operand of the add/sub?
+ bool ShiftValUseIsLHS = true;
+ // Do we need to negate the result?
+ bool NegateResult = false;
+
+ if (ConstValue.isNonNegative()) {
+ // (mul x, 2^N + 1) => (add (shl x, N), x)
+ // (mul x, 2^N - 1) => (sub (shl x, N), x)
+ // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
+ APInt SCVMinus1 = ShiftedConstValue - 1;
+ APInt CVPlus1 = ConstValue + 1;
+ if (SCVMinus1.isPowerOf2()) {
+ ShiftAmt = SCVMinus1.logBase2();
+ AddSubOpc = TargetOpcode::G_ADD;
+ } else if (CVPlus1.isPowerOf2()) {
+ ShiftAmt = CVPlus1.logBase2();
+ AddSubOpc = TargetOpcode::G_SUB;
+ } else
+ return false;
+ } else {
+ // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
+ // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
+ APInt CVNegPlus1 = -ConstValue + 1;
+ APInt CVNegMinus1 = -ConstValue - 1;
+ if (CVNegPlus1.isPowerOf2()) {
+ ShiftAmt = CVNegPlus1.logBase2();
+ AddSubOpc = TargetOpcode::G_SUB;
+ ShiftValUseIsLHS = false;
+ } else if (CVNegMinus1.isPowerOf2()) {
+ ShiftAmt = CVNegMinus1.logBase2();
+ AddSubOpc = TargetOpcode::G_ADD;
+ NegateResult = true;
+ } else
+ return false;
+ }
-static bool matchEXT(MachineInstr &MI, MachineRegisterInfo &MRI,
- ShuffleVectorPseudo &MatchInfo) {
- assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
- Register Dst = MI.getOperand(0).getReg();
- auto ExtInfo = getExtMask(MI.getOperand(3).getShuffleMask(),
- MRI.getType(Dst).getNumElements());
- if (!ExtInfo)
+ if (NegateResult && TrailingZeroes)
return false;
- bool ReverseExt;
- uint64_t Imm;
- std::tie(ReverseExt, Imm) = *ExtInfo;
- Register V1 = MI.getOperand(1).getReg();
- Register V2 = MI.getOperand(2).getReg();
- if (ReverseExt)
- std::swap(V1, V2);
- uint64_t ExtFactor = MRI.getType(V1).getScalarSizeInBits() / 8;
- Imm *= ExtFactor;
- MatchInfo = ShuffleVectorPseudo(AArch64::G_EXT, Dst, {V1, V2, Imm});
- return true;
-}
-/// Replace a G_SHUFFLE_VECTOR instruction with a pseudo.
-/// \p Opc is the opcode to use. \p MI is the G_SHUFFLE_VECTOR.
-static bool applyShuffleVectorPseudo(MachineInstr &MI,
- ShuffleVectorPseudo &MatchInfo) {
- MachineIRBuilder MIRBuilder(MI);
- MIRBuilder.buildInstr(MatchInfo.Opc, {MatchInfo.Dst}, MatchInfo.SrcOps);
- MI.eraseFromParent();
+ ApplyFn = [=](MachineIRBuilder &B, Register DstReg) {
+ auto Shift = B.buildConstant(LLT::scalar(64), ShiftAmt);
+ auto ShiftedVal = B.buildShl(Ty, LHS, Shift);
+
+ Register AddSubLHS = ShiftValUseIsLHS ? ShiftedVal.getReg(0) : LHS;
+ Register AddSubRHS = ShiftValUseIsLHS ? LHS : ShiftedVal.getReg(0);
+ auto Res = B.buildInstr(AddSubOpc, {Ty}, {AddSubLHS, AddSubRHS});
+ assert(!(NegateResult && TrailingZeroes) &&
+ "NegateResult and TrailingZeroes cannot both be true for now.");
+ // Negate the result.
+ if (NegateResult) {
+ B.buildSub(DstReg, B.buildConstant(Ty, 0), Res);
+ return;
+ }
+ // Shift the result.
+ if (TrailingZeroes) {
+ B.buildShl(DstReg, Res, B.buildConstant(LLT::scalar(64), TrailingZeroes));
+ return;
+ }
+ B.buildCopy(DstReg, Res.getReg(0));
+ };
return true;
}
-/// Replace a G_SHUFFLE_VECTOR instruction with G_EXT.
-/// Special-cased because the constant operand must be emitted as a G_CONSTANT
-/// for the imported tablegen patterns to work.
-static bool applyEXT(MachineInstr &MI, ShuffleVectorPseudo &MatchInfo) {
- MachineIRBuilder MIRBuilder(MI);
- // Tablegen patterns expect an i32 G_CONSTANT as the final op.
- auto Cst =
- MIRBuilder.buildConstant(LLT::scalar(32), MatchInfo.SrcOps[2].getImm());
- MIRBuilder.buildInstr(MatchInfo.Opc, {MatchInfo.Dst},
- {MatchInfo.SrcOps[0], MatchInfo.SrcOps[1], Cst});
+bool applyAArch64MulConstCombine(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
+ std::function<void(MachineIRBuilder &B, Register DstReg)> &ApplyFn) {
+ B.setInstrAndDebugLoc(MI);
+ ApplyFn(B, MI.getOperand(0).getReg());
MI.eraseFromParent();
return true;
}
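
As a concrete illustration of the decompositions described in the comments
above, the following standalone sketch mirrors each (mul x, C) rule with plain
integers instead of APInt/MIR operations; the value of x is arbitrary:

#include <cassert>
#include <cstdint>

int main() {
  int64_t x = 11;
  // C = 3  = 2^1 + 1:         (shl x, 1) + x
  assert(((x << 1) + x) == x * 3);
  // C = 7  = 2^3 - 1:         (shl x, 3) - x
  assert(((x << 3) - x) == x * 7);
  // C = 6  = (2^1 + 1) * 2^1: shl ((shl x, 1) + x), 1
  assert((((x << 1) + x) << 1) == x * 6);
  // C = -3 = -(2^2 - 1):      x - (shl x, 2)
  assert((x - (x << 2)) == x * -3);
  // C = -9 = -(2^3 + 1):      -((shl x, 3) + x)
  assert(-((x << 3) + x) == x * -9);
  return 0;
}
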
@@ -501,7 +348,7 @@ INITIALIZE_PASS_END(AArch64PostLegalizerCombiner, DEBUG_TYPE,
false)
namespace llvm {
-FunctionPass *createAArch64PostLegalizeCombiner(bool IsOptNone) {
+FunctionPass *createAArch64PostLegalizerCombiner(bool IsOptNone) {
return new AArch64PostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
new file mode 100644
index 000000000000..a06ff4b5417a
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -0,0 +1,704 @@
+//=== AArch64PostLegalizerLowering.cpp --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Post-legalization lowering for instructions.
+///
+/// This is used to offload pattern matching from the selector.
+///
+/// For example, this combiner will notice that a G_SHUFFLE_VECTOR is actually
+/// a G_ZIP, G_UZP, etc.
+///
+/// General optimization combines should be handled by either the
+/// AArch64PostLegalizerCombiner or the AArch64PreLegalizerCombiner.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AArch64TargetMachine.h"
+#include "AArch64GlobalISelUtils.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/CodeGen/GlobalISel/Combiner.h"
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "aarch64-postlegalizer-lowering"
+
+using namespace llvm;
+using namespace MIPatternMatch;
+using namespace AArch64GISelUtils;
+
+/// Represents a pseudo instruction which replaces a G_SHUFFLE_VECTOR.
+///
+/// Used for matching target-supported shuffles before codegen.
+struct ShuffleVectorPseudo {
+ unsigned Opc; ///< Opcode for the instruction. (E.g. G_ZIP1)
+ Register Dst; ///< Destination register.
+ SmallVector<SrcOp, 2> SrcOps; ///< Source registers.
+ ShuffleVectorPseudo(unsigned Opc, Register Dst,
+ std::initializer_list<SrcOp> SrcOps)
+ : Opc(Opc), Dst(Dst), SrcOps(SrcOps){};
+ ShuffleVectorPseudo() {}
+};
+
+/// Check if a vector shuffle corresponds to a REV instruction with the
+/// specified blocksize.
+static bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts,
+ unsigned BlockSize) {
+ assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
+ "Only possible block sizes for REV are: 16, 32, 64");
+ assert(EltSize != 64 && "EltSize cannot be 64 for REV mask.");
+
+ unsigned BlockElts = M[0] + 1;
+
+ // If the first shuffle index is UNDEF, be optimistic.
+ if (M[0] < 0)
+ BlockElts = BlockSize / EltSize;
+
+ if (BlockSize <= EltSize || BlockSize != BlockElts * EltSize)
+ return false;
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ // Ignore undef indices.
+ if (M[i] < 0)
+ continue;
+ if (static_cast<unsigned>(M[i]) !=
+ (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
+ return false;
+ }
+
+ return true;
+}
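
For example, with BlockSize = 64 and EltSize = 16, BlockElts is 4 and the check
accepts masks that reverse each group of four halfwords. A standalone sketch of
the same index formula over a plain std::vector (the Sketch helper is
illustrative only):

#include <cassert>
#include <vector>

// Simplified restatement of the per-index check above; BlockElts is taken
// directly from BlockSize / EltSize and undef lanes are skipped.
static bool isREVMaskSketch(const std::vector<int> &M, unsigned EltSize,
                            unsigned BlockSize) {
  unsigned BlockElts = BlockSize / EltSize;
  for (unsigned i = 0; i < M.size(); ++i) {
    if (M[i] < 0)
      continue;
    if (static_cast<unsigned>(M[i]) !=
        (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
      return false;
  }
  return true;
}

int main() {
  // A REV64 mask for <8 x s16>: each group of four lanes is reversed.
  assert(isREVMaskSketch({3, 2, 1, 0, 7, 6, 5, 4}, 16, 64));
  // The identity mask is not a REV mask.
  assert(!isREVMaskSketch({0, 1, 2, 3, 4, 5, 6, 7}, 16, 64));
  return 0;
}
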
+
+/// Determines if \p M is a shuffle vector mask for a TRN of \p NumElts.
+/// Whether or not G_TRN1 or G_TRN2 should be used is stored in \p WhichResult.
+static bool isTRNMask(ArrayRef<int> M, unsigned NumElts,
+ unsigned &WhichResult) {
+ if (NumElts % 2 != 0)
+ return false;
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ for (unsigned i = 0; i < NumElts; i += 2) {
+ if ((M[i] >= 0 && static_cast<unsigned>(M[i]) != i + WhichResult) ||
+ (M[i + 1] >= 0 &&
+ static_cast<unsigned>(M[i + 1]) != i + NumElts + WhichResult))
+ return false;
+ }
+ return true;
+}
+
+/// Check if a G_EXT instruction can handle a shuffle mask \p M when the vector
+/// sources of the shuffle are different.
+static Optional<std::pair<bool, uint64_t>> getExtMask(ArrayRef<int> M,
+ unsigned NumElts) {
+ // Look for the first non-undef element.
+ auto FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
+ if (FirstRealElt == M.end())
+ return None;
+
+ // Use APInt to handle overflow when calculating expected element.
+ unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
+ APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
+
+ // The following shuffle indices must be the successive elements after the
+ // first real element.
+ if (any_of(
+ make_range(std::next(FirstRealElt), M.end()),
+ [&ExpectedElt](int Elt) { return Elt != ExpectedElt++ && Elt >= 0; }))
+ return None;
+
+ // The index of an EXT is the first element if it is not UNDEF.
+ // Watch out for the beginning UNDEFs. The EXT index should be the expected
+ // value of the first element. E.g.
+ // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
+ // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
+ // ExpectedElt is the last mask index plus 1.
+ uint64_t Imm = ExpectedElt.getZExtValue();
+ bool ReverseExt = false;
+
+  // There are two different cases requiring us to reverse the input vectors.
+ // For example, for vector <4 x i32> we have the following cases,
+ // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
+ // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
+ // For both cases, we finally use mask <5, 6, 7, 0>, which requires
+ // to reverse two input vectors.
+ if (Imm < NumElts)
+ ReverseExt = true;
+ else
+ Imm -= NumElts;
+ return std::make_pair(ReverseExt, Imm);
+}
+
+/// Determines if \p M is a shuffle vector mask for a UZP of \p NumElts.
+/// Whether or not G_UZP1 or G_UZP2 should be used is stored in \p WhichResult.
+static bool isUZPMask(ArrayRef<int> M, unsigned NumElts,
+ unsigned &WhichResult) {
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ // Skip undef indices.
+ if (M[i] < 0)
+ continue;
+ if (static_cast<unsigned>(M[i]) != 2 * i + WhichResult)
+ return false;
+ }
+ return true;
+}
+
+/// \return true if \p M is a zip mask for a shuffle vector of \p NumElts.
+/// Whether or not G_ZIP1 or G_ZIP2 should be used is stored in \p WhichResult.
+static bool isZipMask(ArrayRef<int> M, unsigned NumElts,
+ unsigned &WhichResult) {
+ if (NumElts % 2 != 0)
+ return false;
+
+ // 0 means use ZIP1, 1 means use ZIP2.
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ unsigned Idx = WhichResult * NumElts / 2;
+ for (unsigned i = 0; i != NumElts; i += 2) {
+ if ((M[i] >= 0 && static_cast<unsigned>(M[i]) != Idx) ||
+ (M[i + 1] >= 0 && static_cast<unsigned>(M[i + 1]) != Idx + NumElts))
+ return false;
+ Idx += 1;
+ }
+ return true;
+}
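
For a four-element shuffle, the canonical masks these predicates accept are
<0,4,2,6> / <1,5,3,7> for G_TRN1/G_TRN2, <0,2,4,6> / <1,3,5,7> for
G_UZP1/G_UZP2, and <0,4,1,5> / <2,6,3,7> for G_ZIP1/G_ZIP2. A standalone sketch
checking the UZP rule with plain integers (the Sketch helper is illustrative
only):

#include <cassert>
#include <vector>

// Plain-integer restatement of isUZPMask: lane i must read element
// 2 * i + WhichResult, with undef lanes skipped.
static bool isUZPMaskSketch(const std::vector<int> &M, unsigned &WhichResult) {
  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned i = 0; i != M.size(); ++i) {
    if (M[i] < 0)
      continue;
    if (static_cast<unsigned>(M[i]) != 2 * i + WhichResult)
      return false;
  }
  return true;
}

int main() {
  unsigned Which;
  assert(isUZPMaskSketch({0, 2, 4, 6}, Which) && Which == 0); // G_UZP1
  assert(isUZPMaskSketch({1, 3, 5, 7}, Which) && Which == 1); // G_UZP2
  assert(!isUZPMaskSketch({0, 4, 1, 5}, Which));              // a ZIP1 mask
  return 0;
}
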
+
+/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with a
+/// G_REV instruction. Returns the appropriate G_REV opcode in \p Opc.
+static bool matchREV(MachineInstr &MI, MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ LLT Ty = MRI.getType(Dst);
+ unsigned EltSize = Ty.getScalarSizeInBits();
+
+ // Element size for a rev cannot be 64.
+ if (EltSize == 64)
+ return false;
+
+ unsigned NumElts = Ty.getNumElements();
+
+ // Try to produce G_REV64
+ if (isREVMask(ShuffleMask, EltSize, NumElts, 64)) {
+ MatchInfo = ShuffleVectorPseudo(AArch64::G_REV64, Dst, {Src});
+ return true;
+ }
+
+ // TODO: Produce G_REV32 and G_REV16 once we have proper legalization support.
+ // This should be identical to above, but with a constant 32 and constant
+ // 16.
+ return false;
+}
+
+/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with
+/// a G_TRN1 or G_TRN2 instruction.
+static bool matchTRN(MachineInstr &MI, MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ unsigned WhichResult;
+ ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
+ Register Dst = MI.getOperand(0).getReg();
+ unsigned NumElts = MRI.getType(Dst).getNumElements();
+ if (!isTRNMask(ShuffleMask, NumElts, WhichResult))
+ return false;
+ unsigned Opc = (WhichResult == 0) ? AArch64::G_TRN1 : AArch64::G_TRN2;
+ Register V1 = MI.getOperand(1).getReg();
+ Register V2 = MI.getOperand(2).getReg();
+ MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2});
+ return true;
+}
+
+/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with
+/// a G_UZP1 or G_UZP2 instruction.
+///
+/// \param [in] MI - The shuffle vector instruction.
+/// \param [out] MatchInfo - Either G_UZP1 or G_UZP2 on success.
+static bool matchUZP(MachineInstr &MI, MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ unsigned WhichResult;
+ ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
+ Register Dst = MI.getOperand(0).getReg();
+ unsigned NumElts = MRI.getType(Dst).getNumElements();
+ if (!isUZPMask(ShuffleMask, NumElts, WhichResult))
+ return false;
+ unsigned Opc = (WhichResult == 0) ? AArch64::G_UZP1 : AArch64::G_UZP2;
+ Register V1 = MI.getOperand(1).getReg();
+ Register V2 = MI.getOperand(2).getReg();
+ MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2});
+ return true;
+}
+
+static bool matchZip(MachineInstr &MI, MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ unsigned WhichResult;
+ ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
+ Register Dst = MI.getOperand(0).getReg();
+ unsigned NumElts = MRI.getType(Dst).getNumElements();
+ if (!isZipMask(ShuffleMask, NumElts, WhichResult))
+ return false;
+ unsigned Opc = (WhichResult == 0) ? AArch64::G_ZIP1 : AArch64::G_ZIP2;
+ Register V1 = MI.getOperand(1).getReg();
+ Register V2 = MI.getOperand(2).getReg();
+ MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2});
+ return true;
+}
+
+/// Helper function for matchDup.
+static bool matchDupFromInsertVectorElt(int Lane, MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ if (Lane != 0)
+ return false;
+
+ // Try to match a vector splat operation into a dup instruction.
+ // We're looking for this pattern:
+ //
+ // %scalar:gpr(s64) = COPY $x0
+ // %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF
+ // %cst0:gpr(s32) = G_CONSTANT i32 0
+ // %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32)
+ // %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32)
+ // %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef, %zerovec(<2 x s32>)
+ //
+ // ...into:
+ // %splat = G_DUP %scalar
+
+ // Begin matching the insert.
+ auto *InsMI = getOpcodeDef(TargetOpcode::G_INSERT_VECTOR_ELT,
+ MI.getOperand(1).getReg(), MRI);
+ if (!InsMI)
+ return false;
+ // Match the undef vector operand.
+ if (!getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, InsMI->getOperand(1).getReg(),
+ MRI))
+ return false;
+
+ // Match the index constant 0.
+ if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ZeroInt()))
+ return false;
+
+ MatchInfo = ShuffleVectorPseudo(AArch64::G_DUP, MI.getOperand(0).getReg(),
+ {InsMI->getOperand(2).getReg()});
+ return true;
+}
+
+/// Helper function for matchDup.
+static bool matchDupFromBuildVector(int Lane, MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ assert(Lane >= 0 && "Expected positive lane?");
+ // Test if the LHS is a BUILD_VECTOR. If it is, then we can just reference the
+ // lane's definition directly.
+ auto *BuildVecMI = getOpcodeDef(TargetOpcode::G_BUILD_VECTOR,
+ MI.getOperand(1).getReg(), MRI);
+ if (!BuildVecMI)
+ return false;
+ Register Reg = BuildVecMI->getOperand(Lane + 1).getReg();
+ MatchInfo =
+ ShuffleVectorPseudo(AArch64::G_DUP, MI.getOperand(0).getReg(), {Reg});
+ return true;
+}
+
+static bool matchDup(MachineInstr &MI, MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ auto MaybeLane = getSplatIndex(MI);
+ if (!MaybeLane)
+ return false;
+ int Lane = *MaybeLane;
+  // If this is an undef splat, generate it via "just" vdup, if possible.
+ if (Lane < 0)
+ Lane = 0;
+ if (matchDupFromInsertVectorElt(Lane, MI, MRI, MatchInfo))
+ return true;
+ if (matchDupFromBuildVector(Lane, MI, MRI, MatchInfo))
+ return true;
+ return false;
+}
+
+static bool matchEXT(MachineInstr &MI, MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ Register Dst = MI.getOperand(0).getReg();
+ auto ExtInfo = getExtMask(MI.getOperand(3).getShuffleMask(),
+ MRI.getType(Dst).getNumElements());
+ if (!ExtInfo)
+ return false;
+ bool ReverseExt;
+ uint64_t Imm;
+ std::tie(ReverseExt, Imm) = *ExtInfo;
+ Register V1 = MI.getOperand(1).getReg();
+ Register V2 = MI.getOperand(2).getReg();
+ if (ReverseExt)
+ std::swap(V1, V2);
+ uint64_t ExtFactor = MRI.getType(V1).getScalarSizeInBits() / 8;
+ Imm *= ExtFactor;
+ MatchInfo = ShuffleVectorPseudo(AArch64::G_EXT, Dst, {V1, V2, Imm});
+ return true;
+}
+
+/// Replace a G_SHUFFLE_VECTOR instruction with a pseudo.
+/// \p Opc is the opcode to use. \p MI is the G_SHUFFLE_VECTOR.
+static bool applyShuffleVectorPseudo(MachineInstr &MI,
+ ShuffleVectorPseudo &MatchInfo) {
+ MachineIRBuilder MIRBuilder(MI);
+ MIRBuilder.buildInstr(MatchInfo.Opc, {MatchInfo.Dst}, MatchInfo.SrcOps);
+ MI.eraseFromParent();
+ return true;
+}
+
+/// Replace a G_SHUFFLE_VECTOR instruction with G_EXT.
+/// Special-cased because the constant operand must be emitted as a G_CONSTANT
+/// for the imported tablegen patterns to work.
+static bool applyEXT(MachineInstr &MI, ShuffleVectorPseudo &MatchInfo) {
+ MachineIRBuilder MIRBuilder(MI);
+ // Tablegen patterns expect an i32 G_CONSTANT as the final op.
+ auto Cst =
+ MIRBuilder.buildConstant(LLT::scalar(32), MatchInfo.SrcOps[2].getImm());
+ MIRBuilder.buildInstr(MatchInfo.Opc, {MatchInfo.Dst},
+ {MatchInfo.SrcOps[0], MatchInfo.SrcOps[1], Cst});
+ MI.eraseFromParent();
+ return true;
+}
+
+/// isVShiftRImm - Check if this is a valid vector for the immediate
+/// operand of a vector shift right operation. The value must be in the range:
+/// 1 <= Value <= ElementBits for a right shift.
+static bool isVShiftRImm(Register Reg, MachineRegisterInfo &MRI, LLT Ty,
+ int64_t &Cnt) {
+ assert(Ty.isVector() && "vector shift count is not a vector type");
+ MachineInstr *MI = MRI.getVRegDef(Reg);
+ auto Cst = getBuildVectorConstantSplat(*MI, MRI);
+ if (!Cst)
+ return false;
+ Cnt = *Cst;
+ int64_t ElementBits = Ty.getScalarSizeInBits();
+ return Cnt >= 1 && Cnt <= ElementBits;
+}
+
+/// Match a vector G_ASHR or G_LSHR with a valid immediate shift.
+static bool matchVAshrLshrImm(MachineInstr &MI, MachineRegisterInfo &MRI,
+ int64_t &Imm) {
+ assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
+ MI.getOpcode() == TargetOpcode::G_LSHR);
+ LLT Ty = MRI.getType(MI.getOperand(1).getReg());
+ if (!Ty.isVector())
+ return false;
+ return isVShiftRImm(MI.getOperand(2).getReg(), MRI, Ty, Imm);
+}
+
+static bool applyVAshrLshrImm(MachineInstr &MI, MachineRegisterInfo &MRI,
+ int64_t &Imm) {
+ unsigned Opc = MI.getOpcode();
+ assert(Opc == TargetOpcode::G_ASHR || Opc == TargetOpcode::G_LSHR);
+ unsigned NewOpc =
+ Opc == TargetOpcode::G_ASHR ? AArch64::G_VASHR : AArch64::G_VLSHR;
+ MachineIRBuilder MIB(MI);
+ auto ImmDef = MIB.buildConstant(LLT::scalar(32), Imm);
+ MIB.buildInstr(NewOpc, {MI.getOperand(0)}, {MI.getOperand(1), ImmDef});
+ MI.eraseFromParent();
+ return true;
+}
+
+/// Determine if it is possible to modify the \p RHS and predicate \p P of a
+/// G_ICMP instruction such that the right-hand side is an arithmetic immediate.
+///
+/// \returns A pair containing the updated immediate and predicate which may
+/// be used to optimize the instruction.
+///
+/// \note This assumes that the comparison has been legalized.
+Optional<std::pair<uint64_t, CmpInst::Predicate>>
+tryAdjustICmpImmAndPred(Register RHS, CmpInst::Predicate P,
+ const MachineRegisterInfo &MRI) {
+ const auto &Ty = MRI.getType(RHS);
+ if (Ty.isVector())
+ return None;
+ unsigned Size = Ty.getSizeInBits();
+ assert((Size == 32 || Size == 64) && "Expected 32 or 64 bit compare only?");
+
+ // If the RHS is not a constant, or the RHS is already a valid arithmetic
+ // immediate, then there is nothing to change.
+ auto ValAndVReg = getConstantVRegValWithLookThrough(RHS, MRI);
+ if (!ValAndVReg)
+ return None;
+ uint64_t C = ValAndVReg->Value.getZExtValue();
+ if (isLegalArithImmed(C))
+ return None;
+
+ // We have a non-arithmetic immediate. Check if adjusting the immediate and
+ // adjusting the predicate will result in a legal arithmetic immediate.
+ switch (P) {
+ default:
+ return None;
+ case CmpInst::ICMP_SLT:
+ case CmpInst::ICMP_SGE:
+ // Check for
+ //
+ // x slt c => x sle c - 1
+ // x sge c => x sgt c - 1
+ //
+ // When c is not the smallest possible negative number.
+ if ((Size == 64 && static_cast<int64_t>(C) == INT64_MIN) ||
+ (Size == 32 && static_cast<int32_t>(C) == INT32_MIN))
+ return None;
+ P = (P == CmpInst::ICMP_SLT) ? CmpInst::ICMP_SLE : CmpInst::ICMP_SGT;
+ C -= 1;
+ break;
+ case CmpInst::ICMP_ULT:
+ case CmpInst::ICMP_UGE:
+ // Check for
+ //
+ // x ult c => x ule c - 1
+ // x uge c => x ugt c - 1
+ //
+ // When c is not zero.
+ if (C == 0)
+ return None;
+ P = (P == CmpInst::ICMP_ULT) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT;
+ C -= 1;
+ break;
+ case CmpInst::ICMP_SLE:
+ case CmpInst::ICMP_SGT:
+ // Check for
+ //
+ // x sle c => x slt c + 1
+  //     x sgt c => x sge c + 1
+ //
+ // When c is not the largest possible signed integer.
+ if ((Size == 32 && static_cast<int32_t>(C) == INT32_MAX) ||
+ (Size == 64 && static_cast<int64_t>(C) == INT64_MAX))
+ return None;
+ P = (P == CmpInst::ICMP_SLE) ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGE;
+ C += 1;
+ break;
+ case CmpInst::ICMP_ULE:
+ case CmpInst::ICMP_UGT:
+ // Check for
+ //
+ // x ule c => x ult c + 1
+  //     x ugt c => x uge c + 1
+ //
+ // When c is not the largest possible unsigned integer.
+ if ((Size == 32 && static_cast<uint32_t>(C) == UINT32_MAX) ||
+ (Size == 64 && C == UINT64_MAX))
+ return None;
+ P = (P == CmpInst::ICMP_ULE) ? CmpInst::ICMP_ULT : CmpInst::ICMP_UGE;
+ C += 1;
+ break;
+ }
+
+ // Check if the new constant is valid, and return the updated constant and
+ // predicate if it is.
+ if (Size == 32)
+ C = static_cast<uint32_t>(C);
+ if (!isLegalArithImmed(C))
+ return None;
+ return {{C, P}};
+}
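
For example, 4097 (0x1001) is not encodable as an AArch64 arithmetic immediate,
while 4096 (1 << 12) is, so "x ult 4097" becomes "x ule 4096". A standalone
sketch; the encoding check below is an assumed restatement of isLegalArithImmed
(defined elsewhere) as a 12-bit value optionally shifted left by 12:

#include <cassert>
#include <cstdint>

// Assumed sketch of the arithmetic-immediate rule: a 12-bit value, optionally
// shifted left by 12 bits.
static bool isLegalArithImmedSketch(uint64_t C) {
  return (C >> 12) == 0 || ((C & 0xfffULL) == 0 && (C >> 24) == 0);
}

int main() {
  assert(!isLegalArithImmedSketch(4097)); // 0x1001: not encodable
  assert(isLegalArithImmedSketch(4096));  // 0x1000 = 1 << 12: encodable
  // ult -> ule with C - 1 preserves the comparison for every x.
  for (uint64_t x = 4090; x <= 4105; ++x)
    assert((x < 4097) == (x <= 4096));
  return 0;
}
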
+
+/// Determine whether or not it is possible to update the RHS and predicate of
+/// a G_ICMP instruction such that the RHS will be selected as an arithmetic
+/// immediate.
+///
+/// \p MI - The G_ICMP instruction
+/// \p MatchInfo - The new RHS immediate and predicate on success
+///
+/// See tryAdjustICmpImmAndPred for valid transformations.
+bool matchAdjustICmpImmAndPred(
+ MachineInstr &MI, const MachineRegisterInfo &MRI,
+ std::pair<uint64_t, CmpInst::Predicate> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_ICMP);
+ Register RHS = MI.getOperand(3).getReg();
+ auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
+ if (auto MaybeNewImmAndPred = tryAdjustICmpImmAndPred(RHS, Pred, MRI)) {
+ MatchInfo = *MaybeNewImmAndPred;
+ return true;
+ }
+ return false;
+}
+
+bool applyAdjustICmpImmAndPred(
+ MachineInstr &MI, std::pair<uint64_t, CmpInst::Predicate> &MatchInfo,
+ MachineIRBuilder &MIB, GISelChangeObserver &Observer) {
+ MIB.setInstrAndDebugLoc(MI);
+ MachineOperand &RHS = MI.getOperand(3);
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+ auto Cst = MIB.buildConstant(MRI.cloneVirtualRegister(RHS.getReg()),
+ MatchInfo.first);
+ Observer.changingInstr(MI);
+ RHS.setReg(Cst->getOperand(0).getReg());
+ MI.getOperand(1).setPredicate(MatchInfo.second);
+ Observer.changedInstr(MI);
+ return true;
+}
+
+bool matchDupLane(MachineInstr &MI, MachineRegisterInfo &MRI,
+ std::pair<unsigned, int> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ Register Src1Reg = MI.getOperand(1).getReg();
+ const LLT SrcTy = MRI.getType(Src1Reg);
+ const LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+
+ auto LaneIdx = getSplatIndex(MI);
+ if (!LaneIdx)
+ return false;
+
+ // The lane idx should be within the first source vector.
+ if (*LaneIdx >= SrcTy.getNumElements())
+ return false;
+
+ if (DstTy != SrcTy)
+ return false;
+
+ LLT ScalarTy = SrcTy.getElementType();
+ unsigned ScalarSize = ScalarTy.getSizeInBits();
+
+ unsigned Opc = 0;
+ switch (SrcTy.getNumElements()) {
+ case 2:
+ if (ScalarSize == 64)
+ Opc = AArch64::G_DUPLANE64;
+ break;
+ case 4:
+ if (ScalarSize == 32)
+ Opc = AArch64::G_DUPLANE32;
+ break;
+ case 8:
+ if (ScalarSize == 16)
+ Opc = AArch64::G_DUPLANE16;
+ break;
+ case 16:
+ if (ScalarSize == 8)
+ Opc = AArch64::G_DUPLANE8;
+ break;
+ default:
+ break;
+ }
+ if (!Opc)
+ return false;
+
+ MatchInfo.first = Opc;
+ MatchInfo.second = *LaneIdx;
+ return true;
+}
+
+bool applyDupLane(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, std::pair<unsigned, int> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ B.setInstrAndDebugLoc(MI);
+ auto Lane = B.buildConstant(LLT::scalar(64), MatchInfo.second);
+ B.buildInstr(MatchInfo.first, {MI.getOperand(0).getReg()},
+ {MI.getOperand(1).getReg(), Lane});
+ MI.eraseFromParent();
+ return true;
+}
+
+#define AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_DEPS
+#include "AArch64GenPostLegalizeGILowering.inc"
+#undef AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_DEPS
+
+namespace {
+#define AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_H
+#include "AArch64GenPostLegalizeGILowering.inc"
+#undef AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_H
+
+class AArch64PostLegalizerLoweringInfo : public CombinerInfo {
+public:
+ AArch64GenPostLegalizerLoweringHelperRuleConfig GeneratedRuleCfg;
+
+ AArch64PostLegalizerLoweringInfo(bool OptSize, bool MinSize)
+ : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
+ /*LegalizerInfo*/ nullptr, /*OptEnabled = */ true, OptSize,
+ MinSize) {
+ if (!GeneratedRuleCfg.parseCommandLineOption())
+ report_fatal_error("Invalid rule identifier");
+ }
+
+ virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
+ MachineIRBuilder &B) const override;
+};
+
+bool AArch64PostLegalizerLoweringInfo::combine(GISelChangeObserver &Observer,
+ MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ CombinerHelper Helper(Observer, B);
+ AArch64GenPostLegalizerLoweringHelper Generated(GeneratedRuleCfg);
+ return Generated.tryCombineAll(Observer, MI, B, Helper);
+}
+
+#define AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_CPP
+#include "AArch64GenPostLegalizeGILowering.inc"
+#undef AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_CPP
+
+class AArch64PostLegalizerLowering : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AArch64PostLegalizerLowering();
+
+ StringRef getPassName() const override {
+ return "AArch64PostLegalizerLowering";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+} // end anonymous namespace
+
+void AArch64PostLegalizerLowering::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesCFG();
+ getSelectionDAGFallbackAnalysisUsage(AU);
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+AArch64PostLegalizerLowering::AArch64PostLegalizerLowering()
+ : MachineFunctionPass(ID) {
+ initializeAArch64PostLegalizerLoweringPass(*PassRegistry::getPassRegistry());
+}
+
+bool AArch64PostLegalizerLowering::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+ assert(MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::Legalized) &&
+ "Expected a legalized function?");
+ auto *TPC = &getAnalysis<TargetPassConfig>();
+ const Function &F = MF.getFunction();
+ AArch64PostLegalizerLoweringInfo PCInfo(F.hasOptSize(), F.hasMinSize());
+ Combiner C(PCInfo, TPC);
+ return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
+}
+
+char AArch64PostLegalizerLowering::ID = 0;
+INITIALIZE_PASS_BEGIN(AArch64PostLegalizerLowering, DEBUG_TYPE,
+ "Lower AArch64 MachineInstrs after legalization", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AArch64PostLegalizerLowering, DEBUG_TYPE,
+ "Lower AArch64 MachineInstrs after legalization", false,
+ false)
+
+namespace llvm {
+FunctionPass *createAArch64PostLegalizerLowering() {
+ return new AArch64PostLegalizerLowering();
+}
+} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp
new file mode 100644
index 000000000000..2f882ecb1fd4
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp
@@ -0,0 +1,187 @@
+//=== AArch64PostSelectOptimize.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass does post-instruction-selection optimizations in the GlobalISel
+// pipeline, before the rest of codegen runs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64TargetMachine.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "aarch64-post-select-optimize"
+
+using namespace llvm;
+
+namespace {
+class AArch64PostSelectOptimize : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AArch64PostSelectOptimize();
+
+ StringRef getPassName() const override {
+ return "AArch64 Post Select Optimizer";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+ bool optimizeNZCVDefs(MachineBasicBlock &MBB);
+};
+} // end anonymous namespace
+
+void AArch64PostSelectOptimize::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesCFG();
+ getSelectionDAGFallbackAnalysisUsage(AU);
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+AArch64PostSelectOptimize::AArch64PostSelectOptimize()
+ : MachineFunctionPass(ID) {
+ initializeAArch64PostSelectOptimizePass(*PassRegistry::getPassRegistry());
+}
+
+unsigned getNonFlagSettingVariant(unsigned Opc) {
+ switch (Opc) {
+ default:
+ return 0;
+ case AArch64::SUBSXrr:
+ return AArch64::SUBXrr;
+ case AArch64::SUBSWrr:
+ return AArch64::SUBWrr;
+ case AArch64::SUBSXrs:
+ return AArch64::SUBXrs;
+ case AArch64::SUBSXri:
+ return AArch64::SUBXri;
+ case AArch64::SUBSWri:
+ return AArch64::SUBWri;
+ }
+}
+
+bool AArch64PostSelectOptimize::optimizeNZCVDefs(MachineBasicBlock &MBB) {
+ // Consider the following code:
+ // FCMPSrr %0, %1, implicit-def $nzcv
+ // %sel1:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
+ // %sub:gpr32 = SUBSWrr %_, %_, implicit-def $nzcv
+ // FCMPSrr %0, %1, implicit-def $nzcv
+ // %sel2:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
+ // This kind of code where we have 2 FCMPs each feeding a CSEL can happen
+ // when we have a single IR fcmp being used by two selects. During selection,
+ // to ensure that there can be no clobbering of nzcv between the fcmp and the
+ // csel, we have to generate an fcmp immediately before each csel is
+ // selected.
+ // However, often we can essentially CSE these together later in MachineCSE.
+ // This doesn't work though if there are unrelated flag-setting instructions
+ // in between the two FCMPs. In this case, the SUBS defines NZCV
+ // but it doesn't have any users, being overwritten by the second FCMP.
+ //
+  // Our solution here is to try to convert flag-setting operations between
+  // an interval of identical FCMPs, so that CSE will be able to eliminate one.
+ bool Changed = false;
+ const auto *TII = MBB.getParent()->getSubtarget().getInstrInfo();
+
+ // The first step is to find the first and last FCMPs. If we have found
+ // at least two, then set the limit of the bottom-up walk to the first FCMP
+ // found since we're only interested in dealing with instructions between
+ // them.
+ MachineInstr *FirstCmp = nullptr, *LastCmp = nullptr;
+ for (auto &MI : instructionsWithoutDebug(MBB.begin(), MBB.end())) {
+ if (MI.getOpcode() == AArch64::FCMPSrr ||
+ MI.getOpcode() == AArch64::FCMPDrr) {
+ if (!FirstCmp)
+ FirstCmp = &MI;
+ else
+ LastCmp = &MI;
+ }
+ }
+
+ // In addition to converting flag-setting ops in fcmp ranges into non-flag
+ // setting ops, across the whole basic block we also detect when nzcv
+ // implicit-defs are dead, and mark them as dead. Peephole optimizations need
+ // this information later.
+
+ LiveRegUnits LRU(*MBB.getParent()->getSubtarget().getRegisterInfo());
+ LRU.addLiveOuts(MBB);
+ bool NZCVDead = LRU.available(AArch64::NZCV);
+ bool InsideCmpRange = false;
+ for (auto &II : instructionsWithoutDebug(MBB.rbegin(), MBB.rend())) {
+ LRU.stepBackward(II);
+
+ if (LastCmp) { // There's a range present in this block.
+ // If we're inside an fcmp range, look for begin instruction.
+ if (InsideCmpRange && &II == FirstCmp)
+ InsideCmpRange = false;
+ else if (&II == LastCmp)
+ InsideCmpRange = true;
+ }
+
+ // Did this instruction define NZCV?
+ bool NZCVDeadAtCurrInstr = LRU.available(AArch64::NZCV);
+ if (NZCVDead && NZCVDeadAtCurrInstr && II.definesRegister(AArch64::NZCV)) {
+ // If we have a def and NZCV is dead, then we may convert this op.
+ unsigned NewOpc = getNonFlagSettingVariant(II.getOpcode());
+ int DeadNZCVIdx = II.findRegisterDefOperandIdx(AArch64::NZCV);
+ if (DeadNZCVIdx != -1) {
+ // If we're inside an fcmp range, then convert flag setting ops.
+ if (InsideCmpRange && NewOpc) {
+ LLVM_DEBUG(dbgs() << "Post-select optimizer: converting flag-setting "
+ "op in fcmp range: "
+ << II);
+ II.setDesc(TII->get(NewOpc));
+ II.RemoveOperand(DeadNZCVIdx);
+ Changed |= true;
+ } else {
+ // Otherwise, we just set the nzcv imp-def operand to be dead, so the
+ // peephole optimizations can optimize them further.
+ II.getOperand(DeadNZCVIdx).setIsDead();
+ }
+ }
+ }
+
+ NZCVDead = NZCVDeadAtCurrInstr;
+ }
+ return Changed;
+}
+
+bool AArch64PostSelectOptimize::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+ assert(MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::Selected) &&
+ "Expected a selected MF");
+
+ bool Changed = false;
+ for (auto &BB : MF)
+ Changed |= optimizeNZCVDefs(BB);
+ return true;
+}
+
+char AArch64PostSelectOptimize::ID = 0;
+INITIALIZE_PASS_BEGIN(AArch64PostSelectOptimize, DEBUG_TYPE,
+ "Optimize AArch64 selected instructions",
+ false, false)
+INITIALIZE_PASS_END(AArch64PostSelectOptimize, DEBUG_TYPE,
+ "Optimize AArch64 selected instructions", false,
+ false)
+
+namespace llvm {
+FunctionPass *createAArch64PostSelectOptimize() {
+ return new AArch64PostSelectOptimize();
+}
+} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
index 9a1f200d5222..5f9b64e274b3 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -96,24 +96,6 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
CombinerHelper Helper(Observer, B, KB, MDT);
AArch64GenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper);
- switch (MI.getOpcode()) {
- case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
- switch (MI.getIntrinsicID()) {
- case Intrinsic::memcpy:
- case Intrinsic::memmove:
- case Intrinsic::memset: {
- // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other
- // heuristics decide.
- unsigned MaxLen = EnableOpt ? 0 : 32;
- // Try to inline memcpy type calls if optimizations are enabled.
- return (!EnableMinSize) ? Helper.tryCombineMemCpyFamily(MI, MaxLen)
- : false;
- }
- default:
- break;
- }
- }
-
if (Generated.tryCombineAll(Observer, MI, B))
return true;
@@ -122,6 +104,15 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
return Helper.tryCombineConcatVectors(MI);
case TargetOpcode::G_SHUFFLE_VECTOR:
return Helper.tryCombineShuffleVector(MI);
+ case TargetOpcode::G_MEMCPY:
+ case TargetOpcode::G_MEMMOVE:
+ case TargetOpcode::G_MEMSET: {
+ // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other
+ // heuristics decide.
+ unsigned MaxLen = EnableOpt ? 0 : 32;
+ // Try to inline memcpy type calls if optimizations are enabled.
+ return !EnableMinSize ? Helper.tryCombineMemCpyFamily(MI, MaxLen) : false;
+ }
}
return false;
@@ -197,7 +188,7 @@ INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE,
namespace llvm {
-FunctionPass *createAArch64PreLegalizeCombiner(bool IsOptNone) {
+FunctionPass *createAArch64PreLegalizerCombiner(bool IsOptNone) {
return new AArch64PreLegalizerCombiner(IsOptNone);
}
} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 93213f5977e5..c76c43389b37 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -13,6 +13,7 @@
#include "AArch64RegisterBankInfo.h"
#include "AArch64InstrInfo.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
@@ -465,9 +466,10 @@ AArch64RegisterBankInfo::getSameKindOfOperandsMapping(
getValueMapping(RBIdx, Size), NumOperands);
}
-bool AArch64RegisterBankInfo::hasFPConstraints(
- const MachineInstr &MI, const MachineRegisterInfo &MRI,
- const TargetRegisterInfo &TRI) const {
+bool AArch64RegisterBankInfo::hasFPConstraints(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ unsigned Depth) const {
unsigned Op = MI.getOpcode();
// Do we have an explicit floating point instruction?
@@ -479,14 +481,30 @@ bool AArch64RegisterBankInfo::hasFPConstraints(
if (Op != TargetOpcode::COPY && !MI.isPHI())
return false;
- // MI is copy-like. Return true if it outputs an FPR.
- return getRegBank(MI.getOperand(0).getReg(), MRI, TRI) ==
- &AArch64::FPRRegBank;
+ // Check if we already know the register bank.
+ auto *RB = getRegBank(MI.getOperand(0).getReg(), MRI, TRI);
+ if (RB == &AArch64::FPRRegBank)
+ return true;
+ if (RB == &AArch64::GPRRegBank)
+ return false;
+
+ // We don't know anything.
+ //
+ // If we have a phi, we may be able to infer that it will be assigned a FPR
+ // based off of its inputs.
+ if (!MI.isPHI() || Depth > MaxFPRSearchDepth)
+ return false;
+
+ return any_of(MI.explicit_uses(), [&](const MachineOperand &Op) {
+ return Op.isReg() &&
+ onlyDefinesFP(*MRI.getVRegDef(Op.getReg()), MRI, TRI, Depth + 1);
+ });
}
bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI,
const MachineRegisterInfo &MRI,
- const TargetRegisterInfo &TRI) const {
+ const TargetRegisterInfo &TRI,
+ unsigned Depth) const {
switch (MI.getOpcode()) {
case TargetOpcode::G_FPTOSI:
case TargetOpcode::G_FPTOUI:
@@ -495,12 +513,13 @@ bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI,
default:
break;
}
- return hasFPConstraints(MI, MRI, TRI);
+ return hasFPConstraints(MI, MRI, TRI, Depth);
}
-bool AArch64RegisterBankInfo::onlyDefinesFP(
- const MachineInstr &MI, const MachineRegisterInfo &MRI,
- const TargetRegisterInfo &TRI) const {
+bool AArch64RegisterBankInfo::onlyDefinesFP(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ unsigned Depth) const {
switch (MI.getOpcode()) {
case AArch64::G_DUP:
case TargetOpcode::G_SITOFP:
@@ -511,7 +530,7 @@ bool AArch64RegisterBankInfo::onlyDefinesFP(
default:
break;
}
- return hasFPConstraints(MI, MRI, TRI);
+ return hasFPConstraints(MI, MRI, TRI, Depth);
}
const RegisterBankInfo::InstructionMapping &
@@ -661,11 +680,18 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case TargetOpcode::G_SITOFP:
- case TargetOpcode::G_UITOFP:
+ case TargetOpcode::G_UITOFP: {
if (MRI.getType(MI.getOperand(0).getReg()).isVector())
break;
- OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR};
+ // Integer to FP conversions don't necessarily happen between GPR -> FPR
+ // regbanks. They can also be done within an FPR register.
+ Register SrcReg = MI.getOperand(1).getReg();
+ if (getRegBank(SrcReg, MRI, TRI) == &AArch64::FPRRegBank)
+ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
+ else
+ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR};
break;
+ }
case TargetOpcode::G_FPTOSI:
case TargetOpcode::G_FPTOUI:
if (MRI.getType(MI.getOperand(0).getReg()).isVector())
@@ -703,7 +729,8 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// assume this was a floating point load in the IR.
// If it was not, we would have had a bitcast before
// reaching that instruction.
- if (onlyUsesFP(UseMI, MRI, TRI)) {
+ // Int->FP conversion operations are also captured in onlyDefinesFP().
+ if (onlyUsesFP(UseMI, MRI, TRI) || onlyDefinesFP(UseMI, MRI, TRI)) {
OpRegBankIdx[0] = PMI_FirstFPR;
break;
}
@@ -826,7 +853,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
break;
}
- case TargetOpcode::G_BUILD_VECTOR:
+ case TargetOpcode::G_BUILD_VECTOR: {
// If the first source operand belongs to a FPR register bank, then make
// sure that we preserve that.
if (OpRegBankIdx[1] != PMI_FirstGPR)
@@ -837,10 +864,17 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// Get the instruction that defined the source operand reg, and check if
// it's a floating point operation. Or, if it's a type like s16 which
- // doesn't have a exact size gpr register class.
+  // doesn't have an exact size gpr register class. The exception is if the
+ // build_vector has all constant operands, which may be better to leave as
+ // gpr without copies, so it can be matched in imported patterns.
MachineInstr *DefMI = MRI.getVRegDef(VReg);
unsigned DefOpc = DefMI->getOpcode();
const LLT SrcTy = MRI.getType(VReg);
+ if (all_of(MI.operands(), [&](const MachineOperand &Op) {
+ return Op.isDef() || MRI.getVRegDef(Op.getReg())->getOpcode() ==
+ TargetOpcode::G_CONSTANT;
+ }))
+ break;
if (isPreISelGenericFloatingPointOpcode(DefOpc) ||
SrcTy.getSizeInBits() < 32) {
// Have a floating point op.
@@ -851,6 +885,30 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
break;
}
+ case TargetOpcode::G_VECREDUCE_FADD:
+ case TargetOpcode::G_VECREDUCE_FMUL:
+ case TargetOpcode::G_VECREDUCE_FMAX:
+ case TargetOpcode::G_VECREDUCE_FMIN:
+ case TargetOpcode::G_VECREDUCE_ADD:
+ case TargetOpcode::G_VECREDUCE_MUL:
+ case TargetOpcode::G_VECREDUCE_AND:
+ case TargetOpcode::G_VECREDUCE_OR:
+ case TargetOpcode::G_VECREDUCE_XOR:
+ case TargetOpcode::G_VECREDUCE_SMAX:
+ case TargetOpcode::G_VECREDUCE_SMIN:
+ case TargetOpcode::G_VECREDUCE_UMAX:
+ case TargetOpcode::G_VECREDUCE_UMIN:
+    // Reductions produce a scalar value from a vector; the scalar should be
+    // on the FPR bank.
+ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
+ break;
+ case TargetOpcode::G_VECREDUCE_SEQ_FADD:
+ case TargetOpcode::G_VECREDUCE_SEQ_FMUL:
+ // These reductions also take a scalar accumulator input.
+ // Assign them FPR for now.
+ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR, PMI_FirstFPR};
+ break;
+ }
// Finally construct the computed mapping.
SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands);
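
The G_SITOFP/G_UITOFP hunk above picks the source bank from where the value already lives, so a conversion whose input was produced on FPR no longer forces a GPR to FPR copy. A rough, self-contained sketch of that decision follows; the Bank enum and the function name are made-up stand-ins for the real RegisterBank/PartialMappingIdx machinery, only the decision logic mirrors the hunk.

#include <cassert>
#include <utility>

enum class Bank { GPR, FPR };

// Destination of a scalar int-to-fp conversion is always FPR; the source keeps
// its current bank, so no cross-bank copy is introduced when it is already FPR.
inline std::pair<Bank, Bank> mapIntToFPConversion(Bank SrcBank) {
  if (SrcBank == Bank::FPR)
    return {Bank::FPR, Bank::FPR};
  return {Bank::FPR, Bank::GPR};
}

int main() {
  assert(mapIntToFPConversion(Bank::FPR).second == Bank::FPR); // stays on FPR
  assert(mapIntToFPConversion(Bank::GPR).second == Bank::GPR); // conversion moves it
  return 0;
}
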
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
index e956fca1aa10..019017bc3ec4 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
@@ -114,17 +114,20 @@ class AArch64RegisterBankInfo final : public AArch64GenRegisterBankInfo {
const InstructionMapping &
getSameKindOfOperandsMapping(const MachineInstr &MI) const;
- /// Returns true if the output of \p MI must be stored on a FPR register.
+ /// Maximum recursion depth for hasFPConstraints.
+ const unsigned MaxFPRSearchDepth = 2;
+
+ /// \returns true if \p MI only uses and defines FPRs.
bool hasFPConstraints(const MachineInstr &MI, const MachineRegisterInfo &MRI,
- const TargetRegisterInfo &TRI) const;
+ const TargetRegisterInfo &TRI, unsigned Depth = 0) const;
- /// Returns true if the source registers of \p MI must all be FPRs.
+ /// \returns true if \p MI only uses FPRs.
bool onlyUsesFP(const MachineInstr &MI, const MachineRegisterInfo &MRI,
- const TargetRegisterInfo &TRI) const;
+ const TargetRegisterInfo &TRI, unsigned Depth = 0) const;
- /// Returns true if the destination register of \p MI must be a FPR.
+ /// \returns true if \p MI only defines FPRs.
bool onlyDefinesFP(const MachineInstr &MI, const MachineRegisterInfo &MRI,
- const TargetRegisterInfo &TRI) const;
+ const TargetRegisterInfo &TRI, unsigned Depth = 0) const;
public:
AArch64RegisterBankInfo(const TargetRegisterInfo &TRI);
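
The new Depth parameters and MaxFPRSearchDepth above bound how far these helpers chase a value's def chain. The body of hasFPConstraints is not shown in this hunk, so the following is only a toy, self-contained model of the depth guard; Node and isFPOnly are made-up stand-ins for the MachineInstr queries.

#include <vector>

struct Node {
  bool isFPOnly = false;            // stand-in for "opcode is a floating-point op"
  std::vector<const Node *> inputs; // stand-in for the defs feeding this instr
};

constexpr unsigned MaxFPRSearchDepth = 2;

// True if N is an FP-only op, or if every input (followed at most
// MaxFPRSearchDepth hops deep) is. Capping the depth keeps the query cheap,
// since regbank selection asks it for many instructions.
bool hasFPConstraints(const Node &N, unsigned Depth = 0) {
  if (N.isFPOnly)
    return true;
  if (Depth >= MaxFPRSearchDepth || N.inputs.empty())
    return false;
  for (const Node *In : N.inputs)
    if (!In || !hasFPConstraints(*In, Depth + 1))
      return false;
  return true;
}

int main() {
  Node FpOp{true, {}};            // e.g. a floating-point op
  Node Hop1{false, {&FpOp}};
  Node Hop2{false, {&Hop1}};
  Node Hop3{false, {&Hop2}};
  // Hop2 reaches the FP op within the two-hop budget; Hop3 does not.
  return (hasFPConstraints(Hop2) && !hasFPConstraints(Hop3)) ? 0 : 1;
}
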
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/select-saddo.mir b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/select-saddo.mir
new file mode 100644
index 000000000000..6f05bd7ac838
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/select-saddo.mir
@@ -0,0 +1,158 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -verify-machineinstrs -mtriple aarch64-unknown-unknown -global-isel -run-pass=instruction-select %s -o - | FileCheck %s
+
+...
+---
+name: saddo_s32
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $w0, $w1, $x2
+
+ ; CHECK-LABEL: name: saddo_s32
+ ; CHECK: liveins: $w0, $w1, $x2
+ ; CHECK: %reg0:gpr32 = COPY $w0
+ ; CHECK: %reg1:gpr32 = COPY $w1
+ ; CHECK: %saddo:gpr32 = ADDSWrr %reg0, %reg1, implicit-def $nzcv
+ ; CHECK: [[CSINCWr:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
+ ; CHECK: $w0 = COPY %saddo
+ ; CHECK: RET_ReallyLR implicit $w0
+ %reg0:gpr(s32) = COPY $w0
+ %reg1:gpr(s32) = COPY $w1
+ %saddo:gpr(s32), %4:gpr(s1) = G_SADDO %reg0, %reg1
+ $w0 = COPY %saddo(s32)
+ RET_ReallyLR implicit $w0
+
+...
+---
+name: saddo_s64
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $x0, $x1, $x2
+
+ ; CHECK-LABEL: name: saddo_s64
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK: %reg0:gpr64 = COPY $x0
+ ; CHECK: %reg1:gpr64 = COPY $x1
+ ; CHECK: %saddo:gpr64 = ADDSXrr %reg0, %reg1, implicit-def $nzcv
+ ; CHECK: [[CSINCWr:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
+ ; CHECK: $x0 = COPY %saddo
+ ; CHECK: RET_ReallyLR implicit $x0
+ %reg0:gpr(s64) = COPY $x0
+ %reg1:gpr(s64) = COPY $x1
+ %saddo:gpr(s64), %4:gpr(s1) = G_SADDO %reg0, %reg1
+ $x0 = COPY %saddo(s64)
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: saddo_s32_imm
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $w0, $w1, $x2
+ ; Check that we get ADDSWri when we can fold in a constant.
+ ;
+ ; CHECK-LABEL: name: saddo_s32_imm
+ ; CHECK: liveins: $w0, $w1, $x2
+ ; CHECK: %copy:gpr32sp = COPY $w0
+ ; CHECK: %saddo:gpr32 = ADDSWri %copy, 16, 0, implicit-def $nzcv
+ ; CHECK: %overflow:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
+ ; CHECK: $w0 = COPY %saddo
+ ; CHECK: RET_ReallyLR implicit $w0
+ %copy:gpr(s32) = COPY $w0
+ %constant:gpr(s32) = G_CONSTANT i32 16
+ %saddo:gpr(s32), %overflow:gpr(s1) = G_SADDO %copy, %constant
+ $w0 = COPY %saddo(s32)
+ RET_ReallyLR implicit $w0
+
+...
+---
+name: saddo_s32_shifted
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $w0, $w1, $x2
+ ; Check that we get ADDSWrs when we can fold in a shift.
+ ;
+ ; CHECK-LABEL: name: saddo_s32_shifted
+ ; CHECK: liveins: $w0, $w1, $x2
+ ; CHECK: %reg0:gpr32 = COPY $w0
+ ; CHECK: %reg1:gpr32 = COPY $w1
+ ; CHECK: %add:gpr32 = ADDSWrs %reg0, %reg1, 16, implicit-def $nzcv
+ ; CHECK: %overflow:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
+ ; CHECK: $w0 = COPY %add
+ ; CHECK: RET_ReallyLR implicit $w0
+ %reg0:gpr(s32) = COPY $w0
+ %reg1:gpr(s32) = COPY $w1
+ %constant:gpr(s32) = G_CONSTANT i32 16
+ %shift:gpr(s32) = G_SHL %reg1(s32), %constant(s32)
+ %add:gpr(s32), %overflow:gpr(s1) = G_SADDO %reg0, %shift
+ $w0 = COPY %add(s32)
+ RET_ReallyLR implicit $w0
+
+...
+---
+name: saddo_s32_neg_imm
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $w0, $w1, $x2
+ ; Check that we get SUBSWri when we can fold in a negative constant.
+ ;
+ ; CHECK-LABEL: name: saddo_s32_neg_imm
+ ; CHECK: liveins: $w0, $w1, $x2
+ ; CHECK: %copy:gpr32sp = COPY $w0
+ ; CHECK: %add:gpr32 = SUBSWri %copy, 16, 0, implicit-def $nzcv
+ ; CHECK: %overflow:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
+ ; CHECK: $w0 = COPY %add
+ ; CHECK: RET_ReallyLR implicit $w0
+ %copy:gpr(s32) = COPY $w0
+ %constant:gpr(s32) = G_CONSTANT i32 -16
+ %add:gpr(s32), %overflow:gpr(s1) = G_SADDO %copy, %constant
+ $w0 = COPY %add(s32)
+ RET_ReallyLR implicit $w0
+
+...
+---
+name: saddo_arith_extended
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $w0, $x0
+ ; Check that we get ADDSXrx.
+ ; CHECK-LABEL: name: saddo_arith_extended
+ ; CHECK: liveins: $w0, $x0
+ ; CHECK: %reg0:gpr64sp = COPY $x0
+ ; CHECK: %reg1:gpr32 = COPY $w0
+ ; CHECK: %add:gpr64 = ADDSXrx %reg0, %reg1, 18, implicit-def $nzcv
+ ; CHECK: %flags:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
+ ; CHECK: $x0 = COPY %add
+ ; CHECK: RET_ReallyLR implicit $x0
+ %reg0:gpr(s64) = COPY $x0
+ %reg1:gpr(s32) = COPY $w0
+ %ext:gpr(s64) = G_ZEXT %reg1(s32)
+ %cst:gpr(s64) = G_CONSTANT i64 2
+ %shift:gpr(s64) = G_SHL %ext, %cst(s64)
+ %add:gpr(s64), %flags:gpr(s1) = G_SADDO %reg0, %shift
+ $x0 = COPY %add(s64)
+ RET_ReallyLR implicit $x0
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/select-ssubo.mir b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/select-ssubo.mir
new file mode 100644
index 000000000000..f6b1794645f7
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/select-ssubo.mir
@@ -0,0 +1,158 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -verify-machineinstrs -mtriple aarch64-unknown-unknown -global-isel -run-pass=instruction-select %s -o - | FileCheck %s
+
+...
+---
+name: ssubo_s32
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $w0, $w1, $x2
+
+ ; CHECK-LABEL: name: ssubo_s32
+ ; CHECK: liveins: $w0, $w1, $x2
+ ; CHECK: %reg0:gpr32 = COPY $w0
+ ; CHECK: %reg1:gpr32 = COPY $w1
+ ; CHECK: %ssubo:gpr32 = SUBSWrr %reg0, %reg1, implicit-def $nzcv
+ ; CHECK: [[CSINCWr:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
+ ; CHECK: $w0 = COPY %ssubo
+ ; CHECK: RET_ReallyLR implicit $w0
+ %reg0:gpr(s32) = COPY $w0
+ %reg1:gpr(s32) = COPY $w1
+ %ssubo:gpr(s32), %4:gpr(s1) = G_SSUBO %reg0, %reg1
+ $w0 = COPY %ssubo(s32)
+ RET_ReallyLR implicit $w0
+
+...
+---
+name: ssubo_s64
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $x0, $x1, $x2
+
+ ; CHECK-LABEL: name: ssubo_s64
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK: %reg0:gpr64 = COPY $x0
+ ; CHECK: %reg1:gpr64 = COPY $x1
+ ; CHECK: %ssubo:gpr64 = SUBSXrr %reg0, %reg1, implicit-def $nzcv
+ ; CHECK: [[CSINCWr:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
+ ; CHECK: $x0 = COPY %ssubo
+ ; CHECK: RET_ReallyLR implicit $x0
+ %reg0:gpr(s64) = COPY $x0
+ %reg1:gpr(s64) = COPY $x1
+ %ssubo:gpr(s64), %4:gpr(s1) = G_SSUBO %reg0, %reg1
+ $x0 = COPY %ssubo(s64)
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: ssubo_s32_imm
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $w0, $w1, $x2
+ ; Check that we get SUBSWri when we can fold in a constant.
+ ;
+ ; CHECK-LABEL: name: ssubo_s32_imm
+ ; CHECK: liveins: $w0, $w1, $x2
+ ; CHECK: %copy:gpr32sp = COPY $w0
+ ; CHECK: %ssubo:gpr32 = SUBSWri %copy, 16, 0, implicit-def $nzcv
+ ; CHECK: %overflow:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
+ ; CHECK: $w0 = COPY %ssubo
+ ; CHECK: RET_ReallyLR implicit $w0
+ %copy:gpr(s32) = COPY $w0
+ %constant:gpr(s32) = G_CONSTANT i32 16
+ %ssubo:gpr(s32), %overflow:gpr(s1) = G_SSUBO %copy, %constant
+ $w0 = COPY %ssubo(s32)
+ RET_ReallyLR implicit $w0
+
+...
+---
+name: ssubo_s32_shifted
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $w0, $w1, $x2
+ ; Check that we get SUBSWrs when we can fold in a shift.
+ ;
+ ; CHECK-LABEL: name: ssubo_s32_shifted
+ ; CHECK: liveins: $w0, $w1, $x2
+ ; CHECK: %reg0:gpr32 = COPY $w0
+ ; CHECK: %reg1:gpr32 = COPY $w1
+ ; CHECK: %sub:gpr32 = SUBSWrs %reg0, %reg1, 16, implicit-def $nzcv
+ ; CHECK: %overflow:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
+ ; CHECK: $w0 = COPY %sub
+ ; CHECK: RET_ReallyLR implicit $w0
+ %reg0:gpr(s32) = COPY $w0
+ %reg1:gpr(s32) = COPY $w1
+ %constant:gpr(s32) = G_CONSTANT i32 16
+ %shift:gpr(s32) = G_SHL %reg1(s32), %constant(s32)
+ %sub:gpr(s32), %overflow:gpr(s1) = G_SSUBO %reg0, %shift
+ $w0 = COPY %sub(s32)
+ RET_ReallyLR implicit $w0
+
+...
+---
+name: ssubo_s32_neg_imm
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $w0, $w1, $x2
+ ; Check that we get ADDSWri when we can fold in a negative constant.
+ ;
+ ; CHECK-LABEL: name: ssubo_s32_neg_imm
+ ; CHECK: liveins: $w0, $w1, $x2
+ ; CHECK: %copy:gpr32sp = COPY $w0
+ ; CHECK: %sub:gpr32 = ADDSWri %copy, 16, 0, implicit-def $nzcv
+ ; CHECK: %overflow:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
+ ; CHECK: $w0 = COPY %sub
+ ; CHECK: RET_ReallyLR implicit $w0
+ %copy:gpr(s32) = COPY $w0
+ %constant:gpr(s32) = G_CONSTANT i32 -16
+ %sub:gpr(s32), %overflow:gpr(s1) = G_SSUBO %copy, %constant
+ $w0 = COPY %sub(s32)
+ RET_ReallyLR implicit $w0
+
+...
+---
+name: ssubo_arith_extended
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.1.entry:
+ liveins: $w0, $x0
+ ; Check that we get SUBSXrx.
+ ; CHECK-LABEL: name: ssubo_arith_extended
+ ; CHECK: liveins: $w0, $x0
+ ; CHECK: %reg0:gpr64sp = COPY $x0
+ ; CHECK: %reg1:gpr32 = COPY $w0
+ ; CHECK: %sub:gpr64 = SUBSXrx %reg0, %reg1, 18, implicit-def $nzcv
+ ; CHECK: %flags:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
+ ; CHECK: $x0 = COPY %sub
+ ; CHECK: RET_ReallyLR implicit $x0
+ %reg0:gpr(s64) = COPY $x0
+ %reg1:gpr(s32) = COPY $w0
+ %ext:gpr(s64) = G_ZEXT %reg1(s32)
+ %cst:gpr(s64) = G_CONSTANT i64 2
+ %shift:gpr(s64) = G_SHL %ext, %cst(s64)
+ %sub:gpr(s64), %flags:gpr(s1) = G_SSUBO %reg0, %shift
+ $x0 = COPY %sub(s64)
+ RET_ReallyLR implicit $x0
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index 9814f7625853..2cbe8315bc7e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -763,7 +763,8 @@ static inline bool isSVECpyImm(int64_t Imm) {
bool IsImm8 = int8_t(Imm) == Imm;
bool IsImm16 = int16_t(Imm & ~0xff) == Imm;
- if (std::is_same<int8_t, std::make_signed_t<T>>::value)
+ if (std::is_same<int8_t, std::make_signed_t<T>>::value ||
+ std::is_same<int8_t, T>::value)
return IsImm8 || uint8_t(Imm) == Imm;
if (std::is_same<int16_t, std::make_signed_t<T>>::value)
@@ -775,7 +776,8 @@ static inline bool isSVECpyImm(int64_t Imm) {
/// Returns true if Imm is valid for ADD/SUB.
template <typename T>
static inline bool isSVEAddSubImm(int64_t Imm) {
- bool IsInt8t = std::is_same<int8_t, std::make_signed_t<T>>::value;
+ bool IsInt8t = std::is_same<int8_t, std::make_signed_t<T>>::value ||
+ std::is_same<int8_t, T>::value;
return uint8_t(Imm) == Imm || (!IsInt8t && uint16_t(Imm & ~0xff) == Imm);
}
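
The template above accepts an SVE ADD/SUB immediate when it fits in an unsigned byte, or, for element types wider than 8 bits, when it is an unsigned 16-bit value with the low byte clear (the "#imm, lsl #8" encoding). A self-contained restatement of that predicate, with the extra std::is_same<int8_t, T> check this diff adds, and a few worked values:

#include <cassert>
#include <cstdint>
#include <type_traits>

template <typename T>
bool isSVEAddSubImmSketch(int64_t Imm) {
  // Treat plain int8_t/uint8_t element types as 8-bit as well, which is what
  // the added std::is_same<int8_t, T> term above is for.
  bool IsInt8t = std::is_same<int8_t, std::make_signed_t<T>>::value ||
                 std::is_same<int8_t, T>::value;
  return uint8_t(Imm) == Imm || (!IsInt8t && uint16_t(Imm & ~0xff) == Imm);
}

int main() {
  assert(isSVEAddSubImmSketch<int32_t>(255));      // fits in a byte
  assert(isSVEAddSubImmSketch<int32_t>(0x1200));   // byte value shifted left by 8
  assert(!isSVEAddSubImmSketch<int8_t>(0x1200));   // no shifted form for byte elements
  assert(!isSVEAddSubImmSketch<int32_t>(0x1234));  // low byte not clear
  return 0;
}
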
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 9f7dfdf62482..75a9f2f5c80e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -88,8 +88,6 @@ public:
uint64_t Value, bool IsResolved,
const MCSubtargetInfo *STI) const override;
- bool mayNeedRelaxation(const MCInst &Inst,
- const MCSubtargetInfo &STI) const override;
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override;
@@ -156,19 +154,6 @@ static unsigned AdrImmBits(unsigned Value) {
return (hi19 << 5) | (lo2 << 29);
}
-static bool valueFitsIntoFixupKind(unsigned Kind, uint64_t Value) {
- unsigned NumBits;
- switch(Kind) {
- case FK_Data_1: NumBits = 8; break;
- case FK_Data_2: NumBits = 16; break;
- case FK_Data_4: NumBits = 32; break;
- case FK_Data_8: NumBits = 64; break;
- default: return true;
- }
- return isUIntN(NumBits, Value) ||
- isIntN(NumBits, static_cast<int64_t>(Value));
-}
-
static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
uint64_t Value, MCContext &Ctx,
const Triple &TheTriple, bool IsResolved) {
@@ -343,9 +328,6 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
case FK_Data_2:
case FK_Data_4:
case FK_Data_8:
- if (!valueFitsIntoFixupKind(Fixup.getTargetKind(), Value))
- Ctx.reportError(Fixup.getLoc(), "fixup value too large for data type!");
- LLVM_FALLTHROUGH;
case FK_SecRel_2:
case FK_SecRel_4:
return Value;
@@ -463,11 +445,6 @@ void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
}
}
-bool AArch64AsmBackend::mayNeedRelaxation(const MCInst &Inst,
- const MCSubtargetInfo &STI) const {
- return false;
-}
-
bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
uint64_t Value,
const MCRelaxableFragment *DF,
@@ -781,7 +758,7 @@ MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T,
assert(TheTriple.isOSBinFormatELF() && "Invalid target");
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
- bool IsILP32 = Options.getABIName() == "ilp32";
+ bool IsILP32 = STI.getTargetTriple().getEnvironment() == Triple::GNUILP32;
return new ELFAArch64AsmBackend(T, TheTriple, OSABI, /*IsLittleEndian=*/true,
IsILP32);
}
@@ -794,7 +771,7 @@ MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T,
assert(TheTriple.isOSBinFormatELF() &&
"Big endian is only supported for ELF targets!");
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
- bool IsILP32 = Options.getABIName() == "ilp32";
+ bool IsILP32 = STI.getTargetTriple().getEnvironment() == Triple::GNUILP32;
return new ELFAArch64AsmBackend(T, TheTriple, OSABI, /*IsLittleEndian=*/false,
IsILP32);
}
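
Both backend constructors above now derive ILP32 from the triple's environment component instead of the "-target-abi" string, so the result no longer depends on how the caller filled in MCTargetOptions. A small sketch of the same check; it needs an LLVM tree to build, and the wrapper name is made up for illustration:

#include "llvm/ADT/Triple.h"

// True for ILP32 AArch64 ELF triples such as aarch64-linux-gnu_ilp32, whose
// environment component parses to Triple::GNUILP32.
inline bool isAArch64ILP32(const llvm::Triple &TT) {
  return TT.getEnvironment() == llvm::Triple::GNUILP32;
}
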
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index e5637dcab941..fcf67bd2f740 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -43,7 +43,7 @@ protected:
} // end anonymous namespace
AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32)
- : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64,
+ : MCELFObjectTargetWriter(/*Is64Bit*/ !IsILP32, OSABI, ELF::EM_AARCH64,
/*HasRelocationAddend*/ true),
IsILP32(IsILP32) {}
@@ -322,7 +322,11 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
return R_CLS(LDST64_ABS_LO12_NC);
if (SymLoc == AArch64MCExpr::VK_GOT && IsNC) {
+ AArch64MCExpr::VariantKind AddressLoc =
+ AArch64MCExpr::getAddressFrag(RefKind);
if (!IsILP32) {
+ if (AddressLoc == AArch64MCExpr::VK_LO15)
+ return ELF::R_AARCH64_LD64_GOTPAGE_LO15;
return ELF::R_AARCH64_LD64_GOT_LO12_NC;
} else {
Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store "
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 6dfda8217628..ec97e1c8b76a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -51,6 +51,61 @@ class AArch64TargetAsmStreamer : public AArch64TargetStreamer {
OS << "\t.variant_pcs " << Symbol->getName() << "\n";
}
+ void EmitARM64WinCFIAllocStack(unsigned Size) override {
+ OS << "\t.seh_stackalloc " << Size << "\n";
+ }
+ void EmitARM64WinCFISaveR19R20X(int Offset) override {
+ OS << "\t.seh_save_r19r20_x " << Offset << "\n";
+ }
+ void EmitARM64WinCFISaveFPLR(int Offset) override {
+ OS << "\t.seh_save_fplr " << Offset << "\n";
+ }
+ void EmitARM64WinCFISaveFPLRX(int Offset) override {
+ OS << "\t.seh_save_fplr_x " << Offset << "\n";
+ }
+ void EmitARM64WinCFISaveReg(unsigned Reg, int Offset) override {
+ OS << "\t.seh_save_reg x" << Reg << ", " << Offset << "\n";
+ }
+ void EmitARM64WinCFISaveRegX(unsigned Reg, int Offset) override {
+ OS << "\t.seh_save_reg_x x" << Reg << ", " << Offset << "\n";
+ }
+ void EmitARM64WinCFISaveRegP(unsigned Reg, int Offset) override {
+ OS << "\t.seh_save_regp x" << Reg << ", " << Offset << "\n";
+ }
+ void EmitARM64WinCFISaveRegPX(unsigned Reg, int Offset) override {
+ OS << "\t.seh_save_regp_x x" << Reg << ", " << Offset << "\n";
+ }
+ void EmitARM64WinCFISaveLRPair(unsigned Reg, int Offset) override {
+ OS << "\t.seh_save_lrpair x" << Reg << ", " << Offset << "\n";
+ }
+ void EmitARM64WinCFISaveFReg(unsigned Reg, int Offset) override {
+ OS << "\t.seh_save_freg d" << Reg << ", " << Offset << "\n";
+ }
+ void EmitARM64WinCFISaveFRegX(unsigned Reg, int Offset) override {
+ OS << "\t.seh_save_freg_x d" << Reg << ", " << Offset << "\n";
+ }
+ void EmitARM64WinCFISaveFRegP(unsigned Reg, int Offset) override {
+ OS << "\t.seh_save_fregp d" << Reg << ", " << Offset << "\n";
+ }
+ void EmitARM64WinCFISaveFRegPX(unsigned Reg, int Offset) override {
+ OS << "\t.seh_save_fregp_x d" << Reg << ", " << Offset << "\n";
+ }
+ void EmitARM64WinCFISetFP() override { OS << "\t.seh_set_fp\n"; }
+ void EmitARM64WinCFIAddFP(unsigned Size) override {
+ OS << "\t.seh_add_fp " << Size << "\n";
+ }
+ void EmitARM64WinCFINop() override { OS << "\t.seh_nop\n"; }
+ void EmitARM64WinCFISaveNext() override { OS << "\t.seh_save_next\n"; }
+ void EmitARM64WinCFIPrologEnd() override { OS << "\t.seh_endprologue\n"; }
+ void EmitARM64WinCFIEpilogStart() override { OS << "\t.seh_startepilogue\n"; }
+ void EmitARM64WinCFIEpilogEnd() override { OS << "\t.seh_endepilogue\n"; }
+ void EmitARM64WinCFITrapFrame() override { OS << "\t.seh_trap_frame\n"; }
+ void EmitARM64WinCFIMachineFrame() override { OS << "\t.seh_pushframe\n"; }
+ void EmitARM64WinCFIContext() override { OS << "\t.seh_context\n"; }
+ void EmitARM64WinCFIClearUnwoundToCall() override {
+ OS << "\t.seh_clear_unwound_to_call\n";
+ }
+
public:
AArch64TargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
};
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index 38474d31460d..340120d2b9e8 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -849,7 +849,7 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI,
}
break;
}
- } else if (CnVal == 8) {
+ } else if (CnVal == 8 || CnVal == 9) {
// TLBI aliases
const AArch64TLBI::TLBI *TLBI = AArch64TLBI::lookupTLBIByEncoding(Encoding);
if (!TLBI || !TLBI->haveFeatures(STI.getFeatureBits()))
@@ -1377,7 +1377,8 @@ void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, uint64_t Address,
}
}
-void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum,
+void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, uint64_t Address,
+ unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNum);
@@ -1385,7 +1386,11 @@ void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum,
// If the label has already been resolved to an immediate offset (say, when
// we're running the disassembler), just print the immediate.
if (Op.isImm()) {
- O << "#" << formatImm(Op.getImm() * (1 << 12));
+ const int64_t Offset = Op.getImm() * 4096;
+ if (PrintBranchImmAsAddress)
+ O << formatHex((Address & -4096) + Offset);
+ else
+ O << "#" << Offset;
return;
}
@@ -1416,6 +1421,22 @@ void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo,
O << "#" << Val;
}
+void AArch64InstPrinter::printBarriernXSOption(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned Val = MI->getOperand(OpNo).getImm();
+ assert(MI->getOpcode() == AArch64::DSBnXS);
+
+ StringRef Name;
+ auto DB = AArch64DBnXS::lookupDBnXSByEncoding(Val);
+ Name = DB ? DB->Name : "";
+
+ if (!Name.empty())
+ O << Name;
+ else
+ O << "#" << Val;
+}
+
void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -1623,3 +1644,10 @@ void AArch64InstPrinter::printGPR64as32(const MCInst *MI, unsigned OpNum,
unsigned Reg = MI->getOperand(OpNum).getReg();
O << getRegisterName(getWRegFromXReg(Reg));
}
+
+void AArch64InstPrinter::printGPR64x8(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned Reg = MI->getOperand(OpNum).getReg();
+ O << getRegisterName(MRI.getSubReg(Reg, AArch64::x8sub_0));
+}
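
With PrintBranchImmAsAddress, the ADRP operand above is printed as an absolute page address: the immediate counts 4 KiB pages relative to the page containing the instruction. A self-contained sketch of that computation with a worked example (the helper name is made up; the arithmetic matches the hunk's "(Address & -4096) + Offset"):

#include <cassert>
#include <cstdint>

// Target page address = (instruction address rounded down to 4 KiB) + imm * 4096.
uint64_t adrpTarget(uint64_t InstAddress, int64_t PageImm) {
  return (InstAddress & ~uint64_t(0xfff)) + uint64_t(PageImm) * 4096;
}

int main() {
  // An ADRP at 0x400A24 with immediate 3 resolves to page 0x403000.
  assert(adrpTarget(0x400A24, 3) == 0x403000);
  // A negative immediate addresses an earlier page.
  assert(adrpTarget(0x400A24, -1) == 0x3FF000);
  return 0;
}
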
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h
index 6da5f0e81c80..4be885e667d8 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h
@@ -30,6 +30,7 @@ public:
void printRegName(raw_ostream &OS, unsigned RegNo) const override;
// Autogenerated by tblgen.
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
virtual void printInstruction(const MCInst *MI, uint64_t Address,
const MCSubtargetInfo &STI, raw_ostream &O);
virtual bool printAliasInstr(const MCInst *MI, uint64_t Address,
@@ -155,10 +156,12 @@ protected:
void printVectorIndex(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printAdrpLabel(const MCInst *MI, unsigned OpNum,
+ void printAdrpLabel(const MCInst *MI, uint64_t Address, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
void printBarrierOption(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printBarriernXSOption(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printMSRSystemRegister(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
void printMRSSystemRegister(const MCInst *MI, unsigned OpNum,
@@ -187,6 +190,8 @@ protected:
const MCSubtargetInfo &STI, raw_ostream &O);
void printGPR64as32(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printGPR64x8(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
template <int Width>
void printZPRasFPR(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
@@ -203,6 +208,7 @@ public:
void printInst(const MCInst *MI, uint64_t Address, StringRef Annot,
const MCSubtargetInfo &STI, raw_ostream &O) override;
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
void printInstruction(const MCInst *MI, uint64_t Address,
const MCSubtargetInfo &STI, raw_ostream &O) override;
bool printAliasInstr(const MCInst *MI, uint64_t Address,
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 9a63e26dec19..37c924d879b1 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -73,7 +73,7 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
// targeting ELF.
AssemblerDialect = AsmWriterVariant == Default ? Generic : AsmWriterVariant;
- CodePointerSize = 8;
+ CodePointerSize = T.getEnvironment() == Triple::GNUILP32 ? 4 : 8;
// ".comm align is in bytes but .align is pow-2."
AlignmentIsInBytes = false;
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index 548e399e05a3..844bd6bbada9 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -70,6 +70,7 @@ StringRef AArch64MCExpr::getVariantKindName() const {
case VK_ABS_PAGE_NC: return ":pg_hi21_nc:";
case VK_GOT: return ":got:";
case VK_GOT_PAGE: return ":got:";
+ case VK_GOT_PAGE_LO15: return ":gotpage_lo15:";
case VK_GOT_LO12: return ":got_lo12:";
case VK_GOTTPREL: return ":gottprel:";
case VK_GOTTPREL_PAGE: return ":gottprel:";
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index a82ff2e91426..d3e834a140b2 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -46,6 +46,7 @@ public:
VK_G1 = 0x050,
VK_G2 = 0x060,
VK_G3 = 0x070,
+ VK_LO15 = 0x080,
VK_AddressFragBits = 0x0f0,
// Whether the final relocation is a checked one (where a linker should
@@ -82,6 +83,7 @@ public:
VK_PREL_G0_NC = VK_PREL | VK_G0 | VK_NC,
VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC,
VK_GOT_PAGE = VK_GOT | VK_PAGE,
+ VK_GOT_PAGE_LO15 = VK_GOT | VK_LO15 | VK_NC,
VK_DTPREL_G2 = VK_DTPREL | VK_G2,
VK_DTPREL_G1 = VK_DTPREL | VK_G1,
VK_DTPREL_G1_NC = VK_DTPREL | VK_G1 | VK_NC,
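
The variant kind is a bitfield, and the new VK_LO15 value sits inside the 0x0f0 address-fragment field, so a composite kind such as VK_GOT_PAGE_LO15 can be decomposed by masking; that is what lets the AArch64ELFObjectWriter hunk earlier pick R_AARCH64_LD64_GOTPAGE_LO15. The body of AArch64MCExpr::getAddressFrag is not shown in this diff, so the masking below is an inference from the enum layout; VK_LO15 and VK_AddressFragBits are copied from the hunk above, while the VK_GOT and VK_NC values are assumed placeholders for illustration.

#include <cassert>
#include <cstdint>

constexpr uint32_t VK_GOT             = 0x004;  // assumed value, for illustration
constexpr uint32_t VK_NC              = 0x100;  // assumed value, for illustration
constexpr uint32_t VK_LO15            = 0x080;
constexpr uint32_t VK_AddressFragBits = 0x0f0;

// Stand-in for AArch64MCExpr::getAddressFrag: keep only the address-fragment bits.
constexpr uint32_t getAddressFrag(uint32_t Kind) { return Kind & VK_AddressFragBits; }

int main() {
  constexpr uint32_t VK_GOT_PAGE_LO15 = VK_GOT | VK_LO15 | VK_NC;
  assert(getAddressFrag(VK_GOT_PAGE_LO15) == VK_LO15);
  return 0;
}
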
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 209bff3a2311..3c2df1621e11 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -50,10 +50,14 @@ static MCInstrInfo *createAArch64MCInstrInfo() {
static MCSubtargetInfo *
createAArch64MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
- if (CPU.empty())
+ if (CPU.empty()) {
CPU = "generic";
- return createAArch64MCSubtargetInfoImpl(TT, CPU, FS);
+ if (TT.isArm64e())
+ CPU = "apple-a12";
+ }
+
+ return createAArch64MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS);
}
void AArch64_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) {
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index b0f414bd27ed..012661edbbfd 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -373,7 +373,11 @@ void AArch64MachObjectWriter::recordRelocation(
Type == MachO::ARM64_RELOC_PAGE21 ||
Type == MachO::ARM64_RELOC_PAGEOFF12) &&
Value) {
- assert((Value & 0xff000000) == 0 && "Added relocation out of range!");
+ if (!isInt<24>(Value)) {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "addend too big for relocation");
+ return;
+ }
MachO::any_relocation_info MRE;
MRE.r_word0 = FixupOffset;
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
index 48ed68f49263..f32a8f15b8a5 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -11,12 +11,23 @@
//===----------------------------------------------------------------------===//
#include "AArch64TargetStreamer.h"
+#include "AArch64MCAsmInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/ConstantPools.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/CommandLine.h"
using namespace llvm;
+static cl::opt<bool> MarkBTIProperty(
+ "aarch64-mark-bti-property", cl::Hidden,
+ cl::desc("Add .note.gnu.property with BTI to assembly files"),
+ cl::init(false));
+
//
// AArch64TargetStreamer Implementation
//
@@ -37,8 +48,50 @@ void AArch64TargetStreamer::emitCurrentConstantPool() {
ConstantPools->emitForCurrentSection(Streamer);
}
-// finish() - write out any non-empty assembler constant pools.
-void AArch64TargetStreamer::finish() { ConstantPools->emitAll(Streamer); }
+// finish() - write out any non-empty assembler constant pools and, if needed,
+// the .note.gnu.property section.
+void AArch64TargetStreamer::finish() {
+ ConstantPools->emitAll(Streamer);
+
+ if (MarkBTIProperty)
+ emitNoteSection(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI);
+}
+
+void AArch64TargetStreamer::emitNoteSection(unsigned Flags) {
+ if (Flags == 0)
+ return;
+
+ MCStreamer &OutStreamer = getStreamer();
+ MCContext &Context = OutStreamer.getContext();
+ // Emit a .note.gnu.property section with the flags.
+ MCSectionELF *Nt = Context.getELFSection(".note.gnu.property", ELF::SHT_NOTE,
+ ELF::SHF_ALLOC);
+ if (Nt->isRegistered()) {
+ SMLoc Loc;
+ Context.reportWarning(
+ Loc,
+ "The .note.gnu.property is not emitted because it is already present.");
+ return;
+ }
+ MCSection *Cur = OutStreamer.getCurrentSectionOnly();
+ OutStreamer.SwitchSection(Nt);
+
+ // Emit the note header.
+ OutStreamer.emitValueToAlignment(Align(8).value());
+ OutStreamer.emitIntValue(4, 4); // data size for "GNU\0"
+ OutStreamer.emitIntValue(4 * 4, 4); // Elf_Prop size
+ OutStreamer.emitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4);
+ OutStreamer.emitBytes(StringRef("GNU", 4)); // note name
+
+ // Emit the PAC/BTI properties.
+ OutStreamer.emitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND, 4);
+ OutStreamer.emitIntValue(4, 4); // data size
+ OutStreamer.emitIntValue(Flags, 4); // data
+ OutStreamer.emitIntValue(0, 4); // pad
+
+ OutStreamer.endSection(Nt);
+ OutStreamer.SwitchSection(Cur);
+}
void AArch64TargetStreamer::emitInst(uint32_t Inst) {
char Buffer[4];
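
The emitNoteSection added above writes an ELF note whose payload is a single GNU property; after the 8-byte alignment, the note itself is 32 bytes. A self-contained sketch of that layout, with the numeric values as defined by the GNU property ABI; the struct and field names are descriptive stand-ins, not LLVM types, and every field is a 4-byte little-endian word on AArch64 ELF.

#include <cstdint>

struct GnuPropertyBtiNote {
  // Elf64_Nhdr
  uint32_t namesz = 4;              // strlen("GNU") + NUL
  uint32_t descsz = 16;             // one Elf_Prop: type + datasz + data + pad
  uint32_t type = 5;                // NT_GNU_PROPERTY_TYPE_0
  char     name[4] = {'G', 'N', 'U', '\0'};
  // Elf_Prop
  uint32_t pr_type = 0xc0000000;    // GNU_PROPERTY_AARCH64_FEATURE_1_AND
  uint32_t pr_datasz = 4;
  uint32_t pr_data = 0x1;           // e.g. GNU_PROPERTY_AARCH64_FEATURE_1_BTI
  uint32_t pr_padding = 0;          // pad the descriptor to an 8-byte boundary
};

static_assert(sizeof(GnuPropertyBtiNote) == 32,
              "matches the 32 bytes emitted by emitNoteSection above");
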
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
index 1af978a806d1..73dc1e5d4d2a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
@@ -33,6 +33,9 @@ public:
/// Emit contents of constant pool for the current section.
void emitCurrentConstantPool();
+ /// Callback used to implement the .note.gnu.property section.
+ void emitNoteSection(unsigned Flags);
+
/// Callback used to implement the .inst directive.
virtual void emitInst(uint32_t Inst);
@@ -40,12 +43,14 @@ public:
virtual void emitDirectiveVariantPCS(MCSymbol *Symbol) {};
virtual void EmitARM64WinCFIAllocStack(unsigned Size) {}
+ virtual void EmitARM64WinCFISaveR19R20X(int Offset) {}
virtual void EmitARM64WinCFISaveFPLR(int Offset) {}
virtual void EmitARM64WinCFISaveFPLRX(int Offset) {}
virtual void EmitARM64WinCFISaveReg(unsigned Reg, int Offset) {}
virtual void EmitARM64WinCFISaveRegX(unsigned Reg, int Offset) {}
virtual void EmitARM64WinCFISaveRegP(unsigned Reg, int Offset) {}
virtual void EmitARM64WinCFISaveRegPX(unsigned Reg, int Offset) {}
+ virtual void EmitARM64WinCFISaveLRPair(unsigned Reg, int Offset) {}
virtual void EmitARM64WinCFISaveFReg(unsigned Reg, int Offset) {}
virtual void EmitARM64WinCFISaveFRegX(unsigned Reg, int Offset) {}
virtual void EmitARM64WinCFISaveFRegP(unsigned Reg, int Offset) {}
@@ -53,9 +58,14 @@ public:
virtual void EmitARM64WinCFISetFP() {}
virtual void EmitARM64WinCFIAddFP(unsigned Size) {}
virtual void EmitARM64WinCFINop() {}
+ virtual void EmitARM64WinCFISaveNext() {}
virtual void EmitARM64WinCFIPrologEnd() {}
virtual void EmitARM64WinCFIEpilogStart() {}
virtual void EmitARM64WinCFIEpilogEnd() {}
+ virtual void EmitARM64WinCFITrapFrame() {}
+ virtual void EmitARM64WinCFIMachineFrame() {}
+ virtual void EmitARM64WinCFIContext() {}
+ virtual void EmitARM64WinCFIClearUnwoundToCall() {}
private:
std::unique_ptr<AssemblerConstantPools> ConstantPools;
@@ -86,12 +96,14 @@ public:
// The unwind codes on ARM64 Windows are documented at
// https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
void EmitARM64WinCFIAllocStack(unsigned Size) override;
+ void EmitARM64WinCFISaveR19R20X(int Offset) override;
void EmitARM64WinCFISaveFPLR(int Offset) override;
void EmitARM64WinCFISaveFPLRX(int Offset) override;
void EmitARM64WinCFISaveReg(unsigned Reg, int Offset) override;
void EmitARM64WinCFISaveRegX(unsigned Reg, int Offset) override;
void EmitARM64WinCFISaveRegP(unsigned Reg, int Offset) override;
void EmitARM64WinCFISaveRegPX(unsigned Reg, int Offset) override;
+ void EmitARM64WinCFISaveLRPair(unsigned Reg, int Offset) override;
void EmitARM64WinCFISaveFReg(unsigned Reg, int Offset) override;
void EmitARM64WinCFISaveFRegX(unsigned Reg, int Offset) override;
void EmitARM64WinCFISaveFRegP(unsigned Reg, int Offset) override;
@@ -99,9 +111,15 @@ public:
void EmitARM64WinCFISetFP() override;
void EmitARM64WinCFIAddFP(unsigned Size) override;
void EmitARM64WinCFINop() override;
+ void EmitARM64WinCFISaveNext() override;
void EmitARM64WinCFIPrologEnd() override;
void EmitARM64WinCFIEpilogStart() override;
void EmitARM64WinCFIEpilogEnd() override;
+ void EmitARM64WinCFITrapFrame() override;
+ void EmitARM64WinCFIMachineFrame() override;
+ void EmitARM64WinCFIContext() override;
+ void EmitARM64WinCFIClearUnwoundToCall() override;
+
private:
void EmitARM64WinUnwindCode(unsigned UnwindCode, int Reg, int Offset);
};
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
index 03fbab5142a2..1c50706a26f9 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
@@ -28,6 +28,7 @@ public:
void EmitWinEHHandlerData(SMLoc Loc) override;
void EmitWindowsUnwindTables() override;
+ void EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) override;
void finishImpl() override;
};
@@ -36,7 +37,12 @@ void AArch64WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) {
// We have to emit the unwind info now, because this directive
// actually switches to the .xdata section!
- EHStreamer.EmitUnwindInfo(*this, getCurrentWinFrameInfo());
+ EHStreamer.EmitUnwindInfo(*this, getCurrentWinFrameInfo(),
+ /* HandlerData = */ true);
+}
+
+void AArch64WinCOFFStreamer::EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) {
+ EHStreamer.EmitUnwindInfo(*this, Frame, /* HandlerData = */ false);
}
void AArch64WinCOFFStreamer::EmitWindowsUnwindTables() {
@@ -85,6 +91,10 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIAllocStack(unsigned Size) {
EmitARM64WinUnwindCode(Op, -1, Size);
}
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveR19R20X(int Offset) {
+ EmitARM64WinUnwindCode(Win64EH::UOP_SaveR19R20X, -1, Offset);
+}
+
void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveFPLR(int Offset) {
EmitARM64WinUnwindCode(Win64EH::UOP_SaveFPLR, -1, Offset);
}
@@ -115,6 +125,11 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveRegPX(unsigned Reg,
EmitARM64WinUnwindCode(Win64EH::UOP_SaveRegPX, Reg, Offset);
}
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveLRPair(unsigned Reg,
+ int Offset) {
+ EmitARM64WinUnwindCode(Win64EH::UOP_SaveLRPair, Reg, Offset);
+}
+
void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveFReg(unsigned Reg,
int Offset) {
assert(Offset >= 0 && Offset <= 504 &&
@@ -150,6 +165,10 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinCFINop() {
EmitARM64WinUnwindCode(Win64EH::UOP_Nop, -1, 0);
}
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveNext() {
+ EmitARM64WinUnwindCode(Win64EH::UOP_SaveNext, -1, 0);
+}
+
// The functions below handle opcodes that can end up in either a prolog or
// an epilog, but not both.
void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIPrologEnd() {
@@ -188,6 +207,22 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIEpilogEnd() {
CurrentEpilog = nullptr;
}
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFITrapFrame() {
+ EmitARM64WinUnwindCode(Win64EH::UOP_TrapFrame, -1, 0);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIMachineFrame() {
+ EmitARM64WinUnwindCode(Win64EH::UOP_PushMachFrame, -1, 0);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIContext() {
+ EmitARM64WinUnwindCode(Win64EH::UOP_Context, -1, 0);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIClearUnwoundToCall() {
+ EmitARM64WinUnwindCode(Win64EH::UOP_ClearUnwoundToCall, -1, 0);
+}
+
MCWinCOFFStreamer *createAArch64WinCOFFStreamer(
MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/SVEInstrFormats.td b/contrib/llvm-project/llvm/lib/Target/AArch64/SVEInstrFormats.td
index e86f2a6ebde4..4eecf72862a8 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -206,10 +206,20 @@ def SVELogicalImm64Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i64>",
def SVE8BitLslImm : ComplexPattern<i32, 2, "SelectSVE8BitLslImm", [imm]>;
-def SVEArithUImmPat : ComplexPattern<i32, 1, "SelectSVEArithImm", []>;
+def SVEArithUImm8Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i8>", []>;
+def SVEArithUImm16Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i16>", []>;
+def SVEArithUImm32Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i32>", []>;
+def SVEArithUImm64Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i64>", []>;
def SVEArithSImmPat : ComplexPattern<i32, 1, "SelectSVESignedArithImm", []>;
-def SVEShiftImm64 : ComplexPattern<i32, 1, "SelectSVEShiftImm64<0, 64>", []>;
+def SVEShiftImmL8 : ComplexPattern<i32, 1, "SelectSVEShiftImm<0, 7>", []>;
+def SVEShiftImmL16 : ComplexPattern<i32, 1, "SelectSVEShiftImm<0, 15>", []>;
+def SVEShiftImmL32 : ComplexPattern<i32, 1, "SelectSVEShiftImm<0, 31>", []>;
+def SVEShiftImmL64 : ComplexPattern<i32, 1, "SelectSVEShiftImm<0, 63>", []>;
+def SVEShiftImmR8 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 8, true>", []>;
+def SVEShiftImmR16 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 16, true>", []>;
+def SVEShiftImmR32 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 32, true>", []>;
+def SVEShiftImmR64 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 64, true>", []>;
class SVEExactFPImm<string Suffix, string ValA, string ValB> : AsmOperandClass {
let Name = "SVEExactFPImmOperand" # Suffix;
@@ -270,6 +280,8 @@ class sve_int_ptrue<bits<2> sz8_64, bits<3> opc, string asm, PPRRegOp pprty,
let Inst{3-0} = Pd;
let Defs = !if(!eq (opc{0}, 1), [NZCV], []);
+ let ElementSize = pprty.ElementSize;
+ let isReMaterializable = 1;
}
multiclass sve_int_ptrue<bits<3> opc, string asm, SDPatternOperator op> {
@@ -305,6 +317,18 @@ class SVE_1_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
: Pat<(vtd (op vt1:$Op1)),
(inst $Op1)>;
+class SVE_1_Op_Passthru_Pat<ValueType vtd, SDPatternOperator op, ValueType pg,
+ ValueType vts, Instruction inst>
+: Pat<(vtd (op pg:$Op1, vts:$Op2, vtd:$Op3)),
+ (inst $Op3, $Op1, $Op2)>;
+
+// Used to match FP_ROUND_MERGE_PASSTHRU, which has an additional flag for the
+// type of rounding. This is matched by timm0_1 in the pattern below and ignored.
+class SVE_1_Op_Passthru_Round_Pat<ValueType vtd, SDPatternOperator op, ValueType pg,
+ ValueType vts, Instruction inst>
+: Pat<(vtd (op pg:$Op1, vts:$Op2, (i64 timm0_1), vtd:$Op3)),
+ (inst $Op3, $Op1, $Op2)>;
+
class SVE_1_Op_Imm_OptLsl_Reverse_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
ValueType it, ComplexPattern cpx, Instruction inst>
: Pat<(vt (op (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))), (vt zprty:$Op1))),
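
The SVE_1_Op_Passthru_Pat class added above maps an operation of the form op(pg, src, passthru) onto a merging predicated instruction whose tied first operand is the passthru value ($Op3). As a rough, element-by-element model of that merging semantics in plain C++ (not TableGen, and not actual SVE codegen):

#include <array>
#include <cassert>
#include <cstddef>

// Merging predication: active lanes get op(Src[i]), inactive lanes keep the
// passthru value, which is why the instruction's tied operand is the passthru.
template <typename T, std::size_t N, typename Op>
std::array<T, N> mergePassthru(const std::array<bool, N> &Pg,
                               const std::array<T, N> &Src,
                               const std::array<T, N> &Passthru, Op op) {
  std::array<T, N> R{};
  for (std::size_t i = 0; i < N; ++i)
    R[i] = Pg[i] ? op(Src[i]) : Passthru[i];
  return R;
}

int main() {
  std::array<bool, 4> pg{true, false, true, false};
  std::array<float, 4> src{1.f, 2.f, 3.f, 4.f}, pass{9.f, 9.f, 9.f, 9.f};
  auto r = mergePassthru(pg, src, pass, [](float x) { return -x; });
  assert(r[0] == -1.f && r[1] == 9.f && r[2] == -3.f && r[3] == 9.f);
  return 0;
}
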
@@ -315,16 +339,6 @@ class SVE_1_Op_Imm_OptLsl_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty
: Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))))),
(inst $Op1, i32:$imm, i32:$shift)>;
-class SVE_1_Op_Imm_Arith_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
- ValueType it, ComplexPattern cpx, Instruction inst>
- : Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))),
- (inst $Op1, i32:$imm)>;
-
-class SVE_1_Op_Imm_Shift_Pred_Pat<ValueType vt, ValueType pt, SDPatternOperator op,
- ZPRRegOp zprty, Operand ImmTy, Instruction inst>
- : Pat<(vt (op (pt (AArch64ptrue 31)), (vt zprty:$Op1), (vt (AArch64dup (ImmTy:$imm))))),
- (inst $Op1, ImmTy:$imm)>;
-
class SVE_1_Op_Imm_Arith_Pred_Pat<ValueType vt, ValueType pt, SDPatternOperator op,
ZPRRegOp zprty, ValueType it, ComplexPattern cpx, Instruction inst>
: Pat<(vt (op (pt (AArch64ptrue 31)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))),
@@ -340,10 +354,11 @@ class SVE_2_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
: Pat<(vtd (op vt1:$Op1, vt2:$Op2)),
(inst $Op1, $Op2)>;
-class SVE_2_Op_Pat_Reduce_To_Neon<ValueType vtd, SDPatternOperator op, ValueType vt1,
- ValueType vt2, Instruction inst, SubRegIndex sub>
-: Pat<(vtd (op vt1:$Op1, vt2:$Op2)),
- (INSERT_SUBREG (vtd (IMPLICIT_DEF)), (inst $Op1, $Op2), sub)>;
+class SVE_2_Op_Pred_All_Active<ValueType vtd, SDPatternOperator op,
+ ValueType pt, ValueType vt1, ValueType vt2,
+ Instruction inst>
+: Pat<(vtd (op (pt (AArch64ptrue 31)), vt1:$Op1, vt2:$Op2)),
+ (inst $Op1, $Op2)>;
class SVE_3_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, ValueType vt3, Instruction inst>
@@ -403,6 +418,23 @@ class SVE_2_Op_AllActive_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
: Pat<(vtd (op vt1:$Op1, vt2:$Op2)),
(inst (ptrue 31), $Op1, $Op2)>;
+class SVE_InReg_Extend<ValueType vt, SDPatternOperator op, ValueType pt,
+ ValueType inreg_vt, Instruction inst>
+: Pat<(vt (op pt:$Pg, vt:$Src, inreg_vt, vt:$PassThru)),
+ (inst $PassThru, $Pg, $Src)>;
+
+class SVE_Shift_DupImm_Pred_Pat<ValueType vt, SDPatternOperator op,
+ ValueType pt, ValueType it,
+ ComplexPattern cast, Instruction inst>
+: Pat<(vt (op pt:$Pg, vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))),
+ (inst $Pg, $Rn, i32:$imm)>;
+
+class SVE_Shift_DupImm_All_Active_Pat<ValueType vt, SDPatternOperator op,
+ ValueType pt, ValueType it,
+ ComplexPattern cast, Instruction inst>
+: Pat<(vt (op (pt (AArch64ptrue 31)), vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))),
+ (inst $Rn, i32:$imm)>;
+
//
// Pseudo -> Instruction mappings
//
@@ -479,6 +511,8 @@ class sve_int_pfalse<bits<6> opc, string asm>
let Inst{9} = opc{0};
let Inst{8-4} = 0b00000;
let Inst{3-0} = Pd;
+
+ let isReMaterializable = 1;
}
class sve_int_ptest<bits<6> opc, string asm>
@@ -499,6 +533,7 @@ class sve_int_ptest<bits<6> opc, string asm>
let Inst{4-0} = 0b00000;
let Defs = [NZCV];
+ let isCompare = 1;
}
class sve_int_pfirst_next<bits<2> sz8_64, bits<5> opc, string asm,
@@ -979,8 +1014,8 @@ multiclass sve_int_perm_dup_i<string asm> {
(!cast<Instruction>(NAME # _Q) ZPR128:$Zd, FPR128asZPR:$Qn, 0), 2>;
}
-class sve_int_perm_tbl<bits<2> sz8_64, bits<2> opc, string asm,
- ZPRRegOp zprty, RegisterOperand VecList>
+class sve_int_perm_tbl<bits<2> sz8_64, bits<2> opc, string asm, ZPRRegOp zprty,
+ RegisterOperand VecList>
: I<(outs zprty:$Zd), (ins VecList:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
@@ -1022,6 +1057,8 @@ multiclass sve_int_perm_tbl<string asm, SDPatternOperator op> {
def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;
+
+ def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8i16, !cast<Instruction>(NAME # _H)>;
}
multiclass sve2_int_perm_tbl<string asm, SDPatternOperator op> {
@@ -1064,6 +1101,11 @@ multiclass sve2_int_perm_tbl<string asm, SDPatternOperator op> {
(nxv2f64 (!cast<Instruction>(NAME # _D) (REG_SEQUENCE ZPR2, nxv2f64:$Op1, zsub0,
nxv2f64:$Op2, zsub1),
nxv2i64:$Op3))>;
+
+ def : Pat<(nxv8bf16 (op nxv8bf16:$Op1, nxv8bf16:$Op2, nxv8i16:$Op3)),
+ (nxv8bf16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8bf16:$Op1, zsub0,
+ nxv8bf16:$Op2, zsub1),
+ nxv8i16:$Op3))>;
}
class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty>
@@ -1099,6 +1141,8 @@ multiclass sve2_int_perm_tbx<string asm, SDPatternOperator op> {
def : SVE_3_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;
+
+ def : SVE_3_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, nxv8i16, !cast<Instruction>(NAME # _H)>;
}
class sve_int_perm_reverse_z<bits<2> sz8_64, string asm, ZPRRegOp zprty>
@@ -1129,6 +1173,8 @@ multiclass sve_int_perm_reverse_z<string asm, SDPatternOperator op> {
def : SVE_1_Op_Pat<nxv8f16, op, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Pat<nxv4f32, op, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Pat<nxv2f64, op, nxv2f64, !cast<Instruction>(NAME # _D)>;
+
+ def : SVE_1_Op_Pat<nxv8bf16, op, nxv8bf16, !cast<Instruction>(NAME # _H)>;
}
class sve_int_perm_reverse_p<bits<2> sz8_64, string asm, PPRRegOp pprty>
@@ -1241,6 +1287,8 @@ multiclass sve_int_perm_insrv<string asm, SDPatternOperator op> {
def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, f16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, f32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, f64, !cast<Instruction>(NAME # _D)>;
+
+ def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, bf16, !cast<Instruction>(NAME # _H)>;
}
//===----------------------------------------------------------------------===//
@@ -1327,6 +1375,8 @@ multiclass sve_int_sel_vvv<string asm, SDPatternOperator op> {
def : SVE_3_Op_Pat<nxv2f32, op, nxv2i1, nxv2f32, nxv2f32, !cast<Instruction>(NAME # _D)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME # _H)>;
+
def : InstAlias<"mov $Zd, $Pg/m, $Zn",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, ZPR8:$Zn, ZPR8:$Zd), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Zn",
@@ -1389,7 +1439,6 @@ multiclass sve_int_pred_log<bits<4> opc, string asm, SDPatternOperator op,
!cast<Instruction>(NAME), PTRUE_D>;
}
-
//===----------------------------------------------------------------------===//
// SVE Logical Mask Immediate Group
//===----------------------------------------------------------------------===//
@@ -1642,7 +1691,6 @@ multiclass sve_fp_ftmad<string asm, SDPatternOperator op> {
(!cast<Instruction>(NAME # _D) ZPR64:$Zn, ZPR64:$Zm, imm32_0_7:$imm)>;
}
-
//===----------------------------------------------------------------------===//
// SVE Floating Point Arithmetic - Unpredicated Group
//===----------------------------------------------------------------------===//
@@ -1665,7 +1713,8 @@ class sve_fp_3op_u_zd<bits<2> sz, bits<3> opc, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zd;
}
-multiclass sve_fp_3op_u_zd<bits<3> opc, string asm, SDPatternOperator op> {
+multiclass sve_fp_3op_u_zd<bits<3> opc, string asm, SDPatternOperator op,
+ SDPatternOperator predicated_op = null_frag> {
def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>;
def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>;
def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>;
@@ -1674,6 +1723,9 @@ multiclass sve_fp_3op_u_zd<bits<3> opc, string asm, SDPatternOperator op> {
def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_2_Op_Pred_All_Active<nxv8f16, predicated_op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pred_All_Active<nxv4f32, predicated_op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pred_All_Active<nxv2f64, predicated_op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_fp_3op_u_zd_ftsmul<bits<3> opc, string asm, SDPatternOperator op> {
@@ -2065,7 +2117,8 @@ class sve2_fp_pairwise_pred<bits<2> sz, bits<3> opc, string asm,
let ElementSize = zprty.ElementSize;
}
-multiclass sve2_fp_pairwise_pred<bits<3> opc, string asm, SDPatternOperator op> {
+multiclass sve2_fp_pairwise_pred<bits<3> opc, string asm,
+ SDPatternOperator op> {
def _H : sve2_fp_pairwise_pred<0b01, opc, asm, ZPR16>;
def _S : sve2_fp_pairwise_pred<0b10, opc, asm, ZPR32>;
def _D : sve2_fp_pairwise_pred<0b11, opc, asm, ZPR64>;
@@ -2217,7 +2270,11 @@ multiclass sve_int_perm_bin_perm_zz<bits<3> opc, string asm,
def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4f16, op, nxv4f16, nxv4f16, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2f16, op, nxv2f16, nxv2f16, !cast<Instruction>(NAME # _D)>;
+ def : SVE_2_Op_Pat<nxv2f32, op, nxv2f32, nxv2f32, !cast<Instruction>(NAME # _D)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
+
+ def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME # _H)>;
}
//===----------------------------------------------------------------------===//
@@ -2225,7 +2282,7 @@ multiclass sve_int_perm_bin_perm_zz<bits<3> opc, string asm,
//===----------------------------------------------------------------------===//
class sve_fp_2op_p_zd<bits<7> opc, string asm, RegisterOperand i_zprtype,
- RegisterOperand o_zprtype, ElementSizeEnum size>
+ RegisterOperand o_zprtype, ElementSizeEnum Sz>
: I<(outs o_zprtype:$Zd), (ins i_zprtype:$_Zd, PPR3bAny:$Pg, i_zprtype:$Zn),
asm, "\t$Zd, $Pg/m, $Zn",
"",
@@ -2244,17 +2301,51 @@ class sve_fp_2op_p_zd<bits<7> opc, string asm, RegisterOperand i_zprtype,
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = DestructiveOther;
- let ElementSize = size;
+ let ElementSize = Sz;
}
multiclass sve_fp_2op_p_zd<bits<7> opc, string asm,
RegisterOperand i_zprtype,
RegisterOperand o_zprtype,
- SDPatternOperator op, ValueType vt1,
+ SDPatternOperator int_op,
+ SDPatternOperator ir_op, ValueType vt1,
ValueType vt2, ValueType vt3, ElementSizeEnum Sz> {
def NAME : sve_fp_2op_p_zd<opc, asm, i_zprtype, o_zprtype, Sz>;
- def : SVE_3_Op_Pat<vt1, op, vt1, vt2, vt3, !cast<Instruction>(NAME)>;
+ // convert vt1 to a packed type for the intrinsic patterns
+ defvar packedvt1 = !cond(!eq(!cast<string>(vt1), "nxv2f16"): nxv8f16,
+ !eq(!cast<string>(vt1), "nxv4f16"): nxv8f16,
+ !eq(!cast<string>(vt1), "nxv2f32"): nxv4f32,
+ 1 : vt1);
+
+ // convert vt3 to a packed type for the intrinsic patterns
+ defvar packedvt3 = !cond(!eq(!cast<string>(vt3), "nxv2f16"): nxv8f16,
+ !eq(!cast<string>(vt3), "nxv4f16"): nxv8f16,
+ !eq(!cast<string>(vt3), "nxv2f32"): nxv4f32,
+ 1 : vt3);
+
+ def : SVE_3_Op_Pat<packedvt1, int_op, packedvt1, vt2, packedvt3, !cast<Instruction>(NAME)>;
+
+ def : SVE_1_Op_Passthru_Pat<vt1, ir_op, vt2, vt3, !cast<Instruction>(NAME)>;
+}
+
+multiclass sve_fp_2op_p_zdr<bits<7> opc, string asm,
+ RegisterOperand i_zprtype,
+ RegisterOperand o_zprtype,
+ SDPatternOperator int_op,
+ SDPatternOperator ir_op, ValueType vt1,
+ ValueType vt2, ValueType vt3, ElementSizeEnum Sz> {
+ def NAME : sve_fp_2op_p_zd<opc, asm, i_zprtype, o_zprtype, Sz>;
+
+ // convert vt1 to a packed type for the intrinsic patterns
+ defvar packedvt1 = !cond(!eq(!cast<string>(vt1), "nxv2f16"): nxv8f16,
+ !eq(!cast<string>(vt1), "nxv4f16"): nxv8f16,
+ !eq(!cast<string>(vt1), "nxv2f32"): nxv4f32,
+ 1 : vt1);
+
+ def : SVE_3_Op_Pat<packedvt1, int_op, packedvt1, vt2, vt3, !cast<Instruction>(NAME)>;
+
+ def : SVE_1_Op_Passthru_Round_Pat<vt1, ir_op, vt2, vt3, !cast<Instruction>(NAME)>;
}
multiclass sve_fp_2op_p_zd_HSD<bits<5> opc, string asm, SDPatternOperator op> {
@@ -2262,9 +2353,12 @@ multiclass sve_fp_2op_p_zd_HSD<bits<5> opc, string asm, SDPatternOperator op> {
def _S : sve_fp_2op_p_zd<{ 0b10, opc }, asm, ZPR32, ZPR32, ElementSizeS>;
def _D : sve_fp_2op_p_zd<{ 0b11, opc }, asm, ZPR64, ZPR64, ElementSizeD>;
- def : SVE_3_Op_Pat<nxv8f16, op, nxv8f16, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
- def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
- def : SVE_3_Op_Pat<nxv2f64, op, nxv2f64, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_1_Op_Passthru_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Passthru_Pat<nxv4f16, op, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Passthru_Pat<nxv2f16, op, nxv2i1, nxv2f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Passthru_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Passthru_Pat<nxv2f32, op, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Passthru_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve2_fp_flogb<string asm, SDPatternOperator op> {
@@ -2372,11 +2466,19 @@ multiclass sve_int_bin_pred_arit_0<bits<3> opc, string asm, string Ps,
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve_int_bin_pred_arit_1<bits<3> opc, string asm, SDPatternOperator op> {
- def _B : sve_int_bin_pred_arit_log<0b00, 0b01, opc, asm, ZPR8>;
- def _H : sve_int_bin_pred_arit_log<0b01, 0b01, opc, asm, ZPR16>;
- def _S : sve_int_bin_pred_arit_log<0b10, 0b01, opc, asm, ZPR32>;
- def _D : sve_int_bin_pred_arit_log<0b11, 0b01, opc, asm, ZPR64>;
+multiclass sve_int_bin_pred_arit_1<bits<3> opc, string asm, string Ps,
+ SDPatternOperator op,
+ DestructiveInstTypeEnum flags> {
+ let DestructiveInstType = flags in {
+ def _B : sve_int_bin_pred_arit_log<0b00, 0b01, opc, asm, ZPR8>,
+ SVEPseudo2Instr<Ps # _B, 1>;
+ def _H : sve_int_bin_pred_arit_log<0b01, 0b01, opc, asm, ZPR16>,
+ SVEPseudo2Instr<Ps # _H, 1>;
+ def _S : sve_int_bin_pred_arit_log<0b10, 0b01, opc, asm, ZPR32>,
+ SVEPseudo2Instr<Ps # _S, 1>;
+ def _D : sve_int_bin_pred_arit_log<0b11, 0b01, opc, asm, ZPR64>,
+ SVEPseudo2Instr<Ps # _D, 1>;
+ }
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
@@ -2384,11 +2486,19 @@ multiclass sve_int_bin_pred_arit_1<bits<3> opc, string asm, SDPatternOperator op
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve_int_bin_pred_arit_2<bits<3> opc, string asm, SDPatternOperator op> {
- def _B : sve_int_bin_pred_arit_log<0b00, 0b10, opc, asm, ZPR8>;
- def _H : sve_int_bin_pred_arit_log<0b01, 0b10, opc, asm, ZPR16>;
- def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>;
- def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>;
+multiclass sve_int_bin_pred_arit_2<bits<3> opc, string asm, string Ps,
+ SDPatternOperator op,
+ DestructiveInstTypeEnum flags> {
+ let DestructiveInstType = flags in {
+ def _B : sve_int_bin_pred_arit_log<0b00, 0b10, opc, asm, ZPR8>,
+ SVEPseudo2Instr<Ps # _B, 1>;
+ def _H : sve_int_bin_pred_arit_log<0b01, 0b10, opc, asm, ZPR16>,
+ SVEPseudo2Instr<Ps # _H, 1>;
+ def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>,
+ SVEPseudo2Instr<Ps # _S, 1>;
+ def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>,
+ SVEPseudo2Instr<Ps # _D, 1>;
+ }
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
@@ -2478,7 +2588,8 @@ class sve_int_mlas_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm,
let ElementSize = zprty.ElementSize;
}
-multiclass sve_int_mlas_vvv_pred<bits<1> opc, string asm, SDPatternOperator op> {
+multiclass sve_int_mlas_vvv_pred<bits<1> opc, string asm, SDPatternOperator op,
+ SDPatternOperator outerop, SDPatternOperator mulop> {
def _B : sve_int_mlas_vvv_pred<0b00, opc, asm, ZPR8>;
def _H : sve_int_mlas_vvv_pred<0b01, opc, asm, ZPR16>;
def _S : sve_int_mlas_vvv_pred<0b10, opc, asm, ZPR32>;
@@ -2488,6 +2599,15 @@ multiclass sve_int_mlas_vvv_pred<bits<1> opc, string asm, SDPatternOperator op>
def : SVE_4_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_4_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_4_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
+
+ def : Pat<(outerop nxv16i8:$Op1, (mulop nxv16i1:$pred, nxv16i8:$Op2, nxv16i8:$Op3)),
+ (!cast<Instruction>(NAME # _B) $pred, $Op1, $Op2, $Op3)>;
+ def : Pat<(outerop nxv8i16:$Op1, (mulop nxv8i1:$pred, nxv8i16:$Op2, nxv8i16:$Op3)),
+ (!cast<Instruction>(NAME # _H) $pred, $Op1, $Op2, $Op3)>;
+ def : Pat<(outerop nxv4i32:$Op1, (mulop nxv4i1:$pred, nxv4i32:$Op2, nxv4i32:$Op3)),
+ (!cast<Instruction>(NAME # _S) $pred, $Op1, $Op2, $Op3)>;
+ def : Pat<(outerop nxv2i64:$Op1, (mulop nxv2i1:$pred, nxv2i64:$Op2, nxv2i64:$Op3)),
+ (!cast<Instruction>(NAME # _D) $pred, $Op1, $Op2, $Op3)>;
}
//===----------------------------------------------------------------------===//
@@ -2591,7 +2711,8 @@ multiclass sve2_int_mla_by_indexed_elem<bits<2> opc, bit S, string asm,
// SVE2 Integer Multiply-Add Long - Indexed Group
//===----------------------------------------------------------------------===//
-multiclass sve2_int_mla_long_by_indexed_elem<bits<4> opc, string asm, SDPatternOperator op> {
+multiclass sve2_int_mla_long_by_indexed_elem<bits<4> opc, string asm,
+ SDPatternOperator op> {
def _S : sve2_int_mla_by_indexed_elem<0b10, { opc{3}, 0b0, opc{2-1}, ?, opc{0} },
asm, ZPR32, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
@@ -2841,7 +2962,8 @@ class sve2_int_mul<bits<2> sz, bits<3> opc, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zd;
}
-multiclass sve2_int_mul<bits<3> opc, string asm, SDPatternOperator op> {
+multiclass sve2_int_mul<bits<3> opc, string asm, SDPatternOperator op,
+ SDPatternOperator op_pred = null_frag> {
def _B : sve2_int_mul<0b00, opc, asm, ZPR8>;
def _H : sve2_int_mul<0b01, opc, asm, ZPR16>;
def _S : sve2_int_mul<0b10, opc, asm, ZPR32>;
@@ -2851,6 +2973,11 @@ multiclass sve2_int_mul<bits<3> opc, string asm, SDPatternOperator op> {
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
+
+ def : SVE_2_Op_Pred_All_Active<nxv16i8, op_pred, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_2_Op_Pred_All_Active<nxv8i16, op_pred, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pred_All_Active<nxv4i32, op_pred, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pred_All_Active<nxv2i64, op_pred, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve2_int_mul_single<bits<3> opc, string asm, SDPatternOperator op> {
@@ -3404,7 +3531,8 @@ multiclass sve2_int_absdiff_accum_long<bits<2> opc, string asm,
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm, SDPatternOperator op> {
+multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm,
+ SDPatternOperator op> {
def _S : sve2_int_absdiff_accum<{ opc{1}, 0b0 }, { 0b010, opc{0} }, asm,
ZPR32, ZPR32>;
def _D : sve2_int_absdiff_accum<{ opc{1}, 0b1 }, { 0b010, opc{0} }, asm,
@@ -3448,7 +3576,7 @@ multiclass sve2_int_bin_shift_imm_right_narrow_bottom<bits<3> opc, string asm,
let Inst{19} = imm{3};
}
def _S : sve2_int_bin_shift_imm_narrow_bottom<{1,?,?}, opc, asm, ZPR32, ZPR64,
- vecshiftR32> {
+ tvecshiftR32> {
let Inst{20-19} = imm{4-3};
}
def : SVE_2_Op_Imm_Pat<nxv16i8, op, nxv8i16, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
@@ -3488,7 +3616,7 @@ multiclass sve2_int_bin_shift_imm_right_narrow_top<bits<3> opc, string asm,
let Inst{19} = imm{3};
}
def _S : sve2_int_bin_shift_imm_narrow_top<{1,?,?}, opc, asm, ZPR32, ZPR64,
- vecshiftR32> {
+ tvecshiftR32> {
let Inst{20-19} = imm{4-3};
}
def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv8i16, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
@@ -3649,10 +3777,10 @@ multiclass sve_int_un_pred_arit_0<bits<3> opc, string asm,
def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
- def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
- def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
- def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
- def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_1_Op_Passthru_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_1_Op_Passthru_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_un_pred_arit_0_h<bits<3> opc, string asm,
@@ -3661,9 +3789,9 @@ multiclass sve_int_un_pred_arit_0_h<bits<3> opc, string asm,
def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
- def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
- def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
- def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_InReg_Extend<nxv8i16, op, nxv8i1, nxv8i8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_InReg_Extend<nxv4i32, op, nxv4i1, nxv4i8, !cast<Instruction>(NAME # _S)>;
+ def : SVE_InReg_Extend<nxv2i64, op, nxv2i1, nxv2i8, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_un_pred_arit_0_w<bits<3> opc, string asm,
@@ -3671,15 +3799,15 @@ multiclass sve_int_un_pred_arit_0_w<bits<3> opc, string asm,
def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
- def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
- def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_InReg_Extend<nxv4i32, op, nxv4i1, nxv4i16, !cast<Instruction>(NAME # _S)>;
+ def : SVE_InReg_Extend<nxv2i64, op, nxv2i1, nxv2i16, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_un_pred_arit_0_d<bits<3> opc, string asm,
SDPatternOperator op> {
def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
- def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_InReg_Extend<nxv2i64, op, nxv2i1, nxv2i32, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_un_pred_arit_1<bits<3> opc, string asm,
@@ -3689,25 +3817,23 @@ multiclass sve_int_un_pred_arit_1<bits<3> opc, string asm,
def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>;
- def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
- def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
- def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
- def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
-
- def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
- def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
- def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_1_Op_Passthru_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_1_Op_Passthru_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve_int_un_pred_arit_1_fp<bits<3> opc, string asm,
- SDPatternOperator op> {
+multiclass sve_int_un_pred_arit_1_fp<bits<3> opc, string asm, SDPatternOperator op> {
def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>;
def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>;
- def : SVE_3_Op_Pat<nxv8f16, op, nxv8f16, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
- def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
- def : SVE_3_Op_Pat<nxv2f64, op, nxv2f64, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_1_Op_Passthru_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Passthru_Pat<nxv4f16, op, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Passthru_Pat<nxv2f16, op, nxv2i1, nxv2f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Passthru_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Passthru_Pat<nxv2f32, op, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Passthru_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -3876,10 +4002,10 @@ multiclass sve_int_arith_imm1_unsigned<bits<2> opc, string asm, SDPatternOperato
def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, imm0_255>;
def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, imm0_255>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _B)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _H)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _S)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithUImmPat, !cast<Instruction>(NAME # _D)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithUImm8Pat, !cast<Instruction>(NAME # _B)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithUImm16Pat, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithUImm32Pat, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithUImm64Pat, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_arith_imm2<string asm, SDPatternOperator op> {
@@ -3888,10 +4014,10 @@ multiclass sve_int_arith_imm2<string asm, SDPatternOperator op> {
def _S : sve_int_arith_imm<0b10, 0b110000, asm, ZPR32, simm8>;
def _D : sve_int_arith_imm<0b11, 0b110000, asm, ZPR64, simm8>;
- def : SVE_1_Op_Imm_Arith_Pat<nxv16i8, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
- def : SVE_1_Op_Imm_Arith_Pat<nxv8i16, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
- def : SVE_1_Op_Imm_Arith_Pat<nxv4i32, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
- def : SVE_1_Op_Imm_Arith_Pat<nxv2i64, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -4004,6 +4130,7 @@ multiclass sve2_int_rotate_right_imm<string asm, SDPatternOperator op> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
+
def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
@@ -4162,6 +4289,8 @@ class sve_int_cmp<bit cmp_1, bits<2> sz8_64, bits<3> opc, string asm,
let Inst{3-0} = Pd;
let Defs = [NZCV];
+ let ElementSize = pprty.ElementSize;
+ let isPTestLike = 1;
}
multiclass SVE_SETCC_Pat<CondCode cc, CondCode invcc, ValueType predvt,
@@ -4234,6 +4363,7 @@ class sve_int_scmp_vi<bits<2> sz8_64, bits<3> opc, string asm, PPRRegOp pprty,
let Defs = [NZCV];
let ElementSize = pprty.ElementSize;
+ let isPTestLike = 1;
}
multiclass SVE_SETCC_Imm_Pat<CondCode cc, CondCode commuted_cc,
@@ -4293,6 +4423,8 @@ class sve_int_ucmp_vi<bits<2> sz8_64, bits<2> opc, string asm, PPRRegOp pprty,
let Inst{3-0} = Pd;
let Defs = [NZCV];
+ let ElementSize = pprty.ElementSize;
+ let isPTestLike = 1;
}
multiclass sve_int_ucmp_vi<bits<2> opc, string asm, CondCode cc,
@@ -4337,8 +4469,7 @@ class sve_int_cterm<bit sz, bit opc, string asm, RegisterClass rt>
}
class sve_int_while_rr<bits<2> sz8_64, bits<4> opc, string asm,
- RegisterClass gprty, PPRRegOp pprty,
- ValueType vt, SDPatternOperator op>
+ RegisterClass gprty, PPRRegOp pprty>
: I<(outs pprty:$Pd), (ins gprty:$Rn, gprty:$Rm),
asm, "\t$Pd, $Rn, $Rm",
"", []>, Sched<[]> {
@@ -4356,30 +4487,32 @@ class sve_int_while_rr<bits<2> sz8_64, bits<4> opc, string asm,
let Inst{3-0} = Pd;
let Defs = [NZCV];
+ let ElementSize = pprty.ElementSize;
+ let isWhile = 1;
}
multiclass sve_int_while4_rr<bits<3> opc, string asm, SDPatternOperator op> {
- def _B : sve_int_while_rr<0b00, { 0, opc }, asm, GPR32, PPR8, nxv16i1, op>;
- def _H : sve_int_while_rr<0b01, { 0, opc }, asm, GPR32, PPR16, nxv8i1, op>;
- def _S : sve_int_while_rr<0b10, { 0, opc }, asm, GPR32, PPR32, nxv4i1, op>;
- def _D : sve_int_while_rr<0b11, { 0, opc }, asm, GPR32, PPR64, nxv2i1, op>;
+ def _B : sve_int_while_rr<0b00, { 0, opc }, asm, GPR32, PPR8>;
+ def _H : sve_int_while_rr<0b01, { 0, opc }, asm, GPR32, PPR16>;
+ def _S : sve_int_while_rr<0b10, { 0, opc }, asm, GPR32, PPR32>;
+ def _D : sve_int_while_rr<0b11, { 0, opc }, asm, GPR32, PPR64>;
def : SVE_2_Op_Pat<nxv16i1, op, i32, i32, !cast<Instruction>(NAME # _B)>;
- def : SVE_2_Op_Pat<nxv8i1, op, i32, i32, !cast<Instruction>(NAME # _H)>;
- def : SVE_2_Op_Pat<nxv4i1, op, i32, i32, !cast<Instruction>(NAME # _S)>;
- def : SVE_2_Op_Pat<nxv2i1, op, i32, i32, !cast<Instruction>(NAME # _D)>;
+ def : SVE_2_Op_Pat<nxv8i1, op, i32, i32, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i1, op, i32, i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i1, op, i32, i32, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_while8_rr<bits<3> opc, string asm, SDPatternOperator op> {
- def _B : sve_int_while_rr<0b00, { 1, opc }, asm, GPR64, PPR8, nxv16i1, op>;
- def _H : sve_int_while_rr<0b01, { 1, opc }, asm, GPR64, PPR16, nxv8i1, op>;
- def _S : sve_int_while_rr<0b10, { 1, opc }, asm, GPR64, PPR32, nxv4i1, op>;
- def _D : sve_int_while_rr<0b11, { 1, opc }, asm, GPR64, PPR64, nxv2i1, op>;
+ def _B : sve_int_while_rr<0b00, { 1, opc }, asm, GPR64, PPR8>;
+ def _H : sve_int_while_rr<0b01, { 1, opc }, asm, GPR64, PPR16>;
+ def _S : sve_int_while_rr<0b10, { 1, opc }, asm, GPR64, PPR32>;
+ def _D : sve_int_while_rr<0b11, { 1, opc }, asm, GPR64, PPR64>;
def : SVE_2_Op_Pat<nxv16i1, op, i64, i64, !cast<Instruction>(NAME # _B)>;
- def : SVE_2_Op_Pat<nxv8i1, op, i64, i64, !cast<Instruction>(NAME # _H)>;
- def : SVE_2_Op_Pat<nxv4i1, op, i64, i64, !cast<Instruction>(NAME # _S)>;
- def : SVE_2_Op_Pat<nxv2i1, op, i64, i64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_2_Op_Pat<nxv8i1, op, i64, i64, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i1, op, i64, i64, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i1, op, i64, i64, !cast<Instruction>(NAME # _D)>;
}
class sve2_int_while_rr<bits<2> sz8_64, bits<1> rw, string asm,
@@ -4400,6 +4533,8 @@ class sve2_int_while_rr<bits<2> sz8_64, bits<1> rw, string asm,
let Inst{3-0} = Pd;
let Defs = [NZCV];
+ let ElementSize = pprty.ElementSize;
+ let isWhile = 1;
}
multiclass sve2_int_while_rr<bits<1> rw, string asm, string op> {
@@ -4412,7 +4547,6 @@ multiclass sve2_int_while_rr<bits<1> rw, string asm, string op> {
def : SVE_2_Op_Pat<nxv8i1, !cast<SDPatternOperator>(op # _h), i64, i64, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i1, !cast<SDPatternOperator>(op # _s), i64, i64, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i1, !cast<SDPatternOperator>(op # _d), i64, i64, !cast<Instruction>(NAME # _D)>;
-
}
//===----------------------------------------------------------------------===//
@@ -4443,12 +4577,14 @@ multiclass sve_fp_fast_red<bits<3> opc, string asm, SDPatternOperator op> {
def _S : sve_fp_fast_red<0b10, opc, asm, ZPR32, FPR32asZPR>;
def _D : sve_fp_fast_red<0b11, opc, asm, ZPR64, FPR64asZPR>;
+ def : SVE_2_Op_Pat<nxv2f16, op, nxv2i1, nxv2f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4f16, op, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv2f32, op, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
-
//===----------------------------------------------------------------------===//
// SVE Floating Point Accumulating Reduction Group
//===----------------------------------------------------------------------===//
@@ -4480,7 +4616,10 @@ multiclass sve_fp_2op_p_vd<bits<3> opc, string asm, SDPatternOperator op> {
def _S : sve_fp_2op_p_vd<0b10, opc, asm, ZPR32, FPR32asZPR>;
def _D : sve_fp_2op_p_vd<0b11, opc, asm, ZPR64, FPR64asZPR>;
+ def : SVE_3_Op_Pat<nxv2f16, op, nxv2i1, nxv2f16, nxv2f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4f16, op, nxv4i1, nxv4f16, nxv4f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv2f32, op, nxv2i1, nxv2f32, nxv2f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
@@ -4701,10 +4840,11 @@ multiclass sve_int_index_rr<string asm, SDPatternOperator op> {
def : SVE_2_Op_Pat<nxv4i32, op, i32, i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, i64, i64, !cast<Instruction>(NAME # _D)>;
}
-//
+
//===----------------------------------------------------------------------===//
// SVE Bitwise Shift - Predicated Group
//===----------------------------------------------------------------------===//
+
class sve_int_bin_pred_shift_imm<bits<4> tsz8_64, bits<4> opc, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, immtype:$imm),
@@ -4729,38 +4869,19 @@ class sve_int_bin_pred_shift_imm<bits<4> tsz8_64, bits<4> opc, string asm,
let ElementSize = zprty.ElementSize;
}
-multiclass sve_int_bin_pred_shift_imm_left<bits<4> opc, string asm, string psName=""> {
- def _B : SVEPseudo2Instr<psName # _B, 1>,
+multiclass sve_int_bin_pred_shift_imm_left<bits<4> opc, string asm, string Ps,
+ SDPatternOperator op = null_frag> {
+ def _B : SVEPseudo2Instr<Ps # _B, 1>,
sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
- def _H : SVEPseudo2Instr<psName # _H, 1>,
- sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
- let Inst{8} = imm{3};
- }
- def _S : SVEPseudo2Instr<psName # _S, 1>,
- sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
- let Inst{9-8} = imm{4-3};
- }
- def _D : SVEPseudo2Instr<psName # _D, 1>,
- sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
- let Inst{22} = imm{5};
- let Inst{9-8} = imm{4-3};
- }
-}
-
-multiclass sve2_int_bin_pred_shift_imm_left<bits<4> opc, string asm,
- string psName,
- SDPatternOperator op> {
-
- def _B : SVEPseudo2Instr<psName # _B, 1>, sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
- def _H : SVEPseudo2Instr<psName # _H, 1>,
+ def _H : SVEPseudo2Instr<Ps # _H, 1>,
sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
let Inst{8} = imm{3};
}
- def _S : SVEPseudo2Instr<psName # _S, 1>,
+ def _S : SVEPseudo2Instr<Ps # _S, 1>,
sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
let Inst{9-8} = imm{4-3};
}
- def _D : SVEPseudo2Instr<psName # _D, 1>,
+ def _D : SVEPseudo2Instr<Ps # _D, 1>,
sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
let Inst{22} = imm{5};
let Inst{9-8} = imm{4-3};
@@ -4772,6 +4893,16 @@ multiclass sve2_int_bin_pred_shift_imm_left<bits<4> opc, string asm,
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i1, nxv2i64, i32, tvecshiftL64, !cast<Instruction>(NAME # _D)>;
}
+// As above but shift amount takes the form of a "vector immediate".
+multiclass sve_int_bin_pred_shift_imm_left_dup<bits<4> opc, string asm,
+ string Ps, SDPatternOperator op>
+: sve_int_bin_pred_shift_imm_left<opc, asm, Ps, null_frag> {
+ def : SVE_Shift_DupImm_Pred_Pat<nxv16i8, op, nxv16i1, i32, SVEShiftImmL8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_Shift_DupImm_Pred_Pat<nxv8i16, op, nxv8i1, i32, SVEShiftImmL16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_Shift_DupImm_Pred_Pat<nxv4i32, op, nxv4i1, i32, SVEShiftImmL32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_Shift_DupImm_Pred_Pat<nxv2i64, op, nxv2i1, i64, SVEShiftImmL64, !cast<Instruction>(NAME # _D)>;
+}
+
multiclass sve_int_bin_pred_shift_imm_left_zeroing_bhsd<SDPatternOperator op> {
def _ZERO_B : PredTwoOpImmPseudo<NAME # _B, ZPR8, tvecshiftL8, FalseLanesZero>;
def _ZERO_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, tvecshiftL16, FalseLanesZero>;
@@ -4808,6 +4939,16 @@ multiclass sve_int_bin_pred_shift_imm_right<bits<4> opc, string asm, string Ps,
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i1, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>;
}
+// As above but shift amount takes the form of a "vector immediate".
+multiclass sve_int_bin_pred_shift_imm_right_dup<bits<4> opc, string asm,
+ string Ps, SDPatternOperator op>
+: sve_int_bin_pred_shift_imm_right<opc, asm, Ps, null_frag> {
+ def : SVE_Shift_DupImm_Pred_Pat<nxv16i8, op, nxv16i1, i32, SVEShiftImmR8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_Shift_DupImm_Pred_Pat<nxv8i16, op, nxv8i1, i32, SVEShiftImmR16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_Shift_DupImm_Pred_Pat<nxv4i32, op, nxv4i1, i32, SVEShiftImmR32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_Shift_DupImm_Pred_Pat<nxv2i64, op, nxv2i1, i64, SVEShiftImmR64, !cast<Instruction>(NAME # _D)>;
+}
+
multiclass sve_int_bin_pred_shift_imm_right_zeroing_bhsd<SDPatternOperator op = null_frag> {
def _ZERO_B : PredTwoOpImmPseudo<NAME # _B, ZPR8, vecshiftR8, FalseLanesZero>;
def _ZERO_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, vecshiftR16, FalseLanesZero>;
@@ -4948,10 +5089,10 @@ multiclass sve_int_bin_cons_shift_imm_left<bits<2> opc, string asm,
let Inst{20-19} = imm{4-3};
}
- def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, vecshiftL8, !cast<Instruction>(NAME # _B)>;
- def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, vecshiftL16, !cast<Instruction>(NAME # _H)>;
- def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, vecshiftL32, !cast<Instruction>(NAME # _S)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEShiftImm64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_Shift_DupImm_All_Active_Pat<nxv16i8, op, nxv16i1, i32, SVEShiftImmL8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_Shift_DupImm_All_Active_Pat<nxv8i16, op, nxv8i1, i32, SVEShiftImmL16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_Shift_DupImm_All_Active_Pat<nxv4i32, op, nxv4i1, i32, SVEShiftImmL32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_Shift_DupImm_All_Active_Pat<nxv2i64, op, nxv2i1, i64, SVEShiftImmL64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm,
@@ -4968,11 +5109,12 @@ multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm,
let Inst{20-19} = imm{4-3};
}
- def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, vecshiftR8, !cast<Instruction>(NAME # _B)>;
- def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, vecshiftR16, !cast<Instruction>(NAME # _H)>;
- def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, vecshiftR32, !cast<Instruction>(NAME # _S)>;
- def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEShiftImm64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_Shift_DupImm_All_Active_Pat<nxv16i8, op, nxv16i1, i32, SVEShiftImmR8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_Shift_DupImm_All_Active_Pat<nxv8i16, op, nxv8i1, i32, SVEShiftImmR16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_Shift_DupImm_All_Active_Pat<nxv4i32, op, nxv4i1, i32, SVEShiftImmR32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_Shift_DupImm_All_Active_Pat<nxv2i64, op, nxv2i1, i64, SVEShiftImmR64, !cast<Instruction>(NAME # _D)>;
}
+
//===----------------------------------------------------------------------===//
// SVE Memory - Store Group
//===----------------------------------------------------------------------===//
@@ -5481,8 +5623,7 @@ class sve_int_perm_bin_perm_pp<bits<3> opc, bits<2> sz8_64, string asm,
PPRRegOp pprty>
: I<(outs pprty:$Pd), (ins pprty:$Pn, pprty:$Pm),
asm, "\t$Pd, $Pn, $Pm",
- "",
- []>, Sched<[]> {
+ "", []>, Sched<[]> {
bits<4> Pd;
bits<4> Pm;
bits<4> Pn;
@@ -5548,7 +5689,7 @@ class sve_int_rdffr_pred<bit s, string asm>
let Inst{4} = 0;
let Inst{3-0} = Pd;
- let Defs = !if(!eq (s, 1), [NZCV], []);
+ let Defs = !if(s, [NZCV], []);
let Uses = [FFR];
}
@@ -5675,9 +5816,11 @@ multiclass sve_int_perm_clast_vz<bit ab, string asm, SDPatternOperator op> {
def _S : sve_int_perm_clast_vz<0b10, ab, asm, ZPR32, FPR32>;
def _D : sve_int_perm_clast_vz<0b11, ab, asm, ZPR64, FPR64>;
- def : SVE_3_Op_Pat<f16, op, nxv8i1, f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
- def : SVE_3_Op_Pat<f32, op, nxv4i1, f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
- def : SVE_3_Op_Pat<f64, op, nxv2i1, f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_3_Op_Pat<f16, op, nxv8i1, f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<f32, op, nxv4i1, f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<f64, op, nxv2i1, f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
+
+ def : SVE_3_Op_Pat<bf16, op, nxv8i1, bf16, nxv8bf16, !cast<Instruction>(NAME # _H)>;
}
class sve_int_perm_clast_zz<bits<2> sz8_64, bit ab, string asm,
@@ -5717,6 +5860,8 @@ multiclass sve_int_perm_clast_zz<bit ab, string asm, SDPatternOperator op> {
def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
+
+ def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME # _H)>;
}
class sve_int_perm_last_r<bits<2> sz8_64, bit ab, string asm,
@@ -5779,6 +5924,8 @@ multiclass sve_int_perm_last_v<bit ab, string asm, SDPatternOperator op> {
def : SVE_2_Op_Pat<f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<f32, op, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
+
+ def : SVE_2_Op_Pat<bf16, op, nxv8i1, nxv8bf16, !cast<Instruction>(NAME # _H)>;
}
class sve_int_perm_splice<bits<2> sz8_64, string asm, ZPRRegOp zprty>
@@ -5815,6 +5962,8 @@ multiclass sve_int_perm_splice<string asm, SDPatternOperator op> {
def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
+
+ def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME # _H)>;
}
class sve2_int_perm_splice_cons<bits<2> sz8_64, string asm,
@@ -5870,26 +6019,20 @@ multiclass sve_int_perm_rev_rbit<string asm, SDPatternOperator op> {
def _S : sve_int_perm_rev<0b10, 0b11, asm, ZPR32>;
def _D : sve_int_perm_rev<0b11, 0b11, asm, ZPR64>;
- def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
- def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
- def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
- def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_1_Op_Passthru_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_1_Op_Passthru_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve_int_perm_rev_revb<string asm,
- SDPatternOperator int_op,
- SDPatternOperator ir_op> {
+multiclass sve_int_perm_rev_revb<string asm, SDPatternOperator op> {
def _H : sve_int_perm_rev<0b01, 0b00, asm, ZPR16>;
def _S : sve_int_perm_rev<0b10, 0b00, asm, ZPR32>;
def _D : sve_int_perm_rev<0b11, 0b00, asm, ZPR64>;
- def : SVE_3_Op_Pat<nxv8i16, int_op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
- def : SVE_3_Op_Pat<nxv4i32, int_op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
- def : SVE_3_Op_Pat<nxv2i64, int_op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
-
- def : SVE_1_Op_AllActive_Pat<nxv8i16, ir_op, nxv8i16, !cast<Instruction>(NAME # _H), PTRUE_H>;
- def : SVE_1_Op_AllActive_Pat<nxv4i32, ir_op, nxv4i32, !cast<Instruction>(NAME # _S), PTRUE_S>;
- def : SVE_1_Op_AllActive_Pat<nxv2i64, ir_op, nxv2i64, !cast<Instruction>(NAME # _D), PTRUE_D>;
+ def : SVE_1_Op_Passthru_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_perm_rev_revh<string asm, SDPatternOperator op> {
@@ -5988,7 +6131,6 @@ multiclass sve_int_perm_cpy_v<string asm, SDPatternOperator op> {
def : InstAlias<"mov $Zd, $Pg/m, $Vn",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, FPR64:$Vn), 1>;
-
def : Pat<(nxv8f16 (op nxv8i1:$pg, f16:$splat, nxv8f16:$passthru)),
(!cast<Instruction>(NAME # _H) $passthru, $pg, $splat)>;
def : Pat<(nxv2f32 (op nxv2i1:$pg, f32:$splat, nxv2f32:$passthru)),
@@ -5997,6 +6139,9 @@ multiclass sve_int_perm_cpy_v<string asm, SDPatternOperator op> {
(!cast<Instruction>(NAME # _S) $passthru, $pg, $splat)>;
def : Pat<(nxv2f64 (op nxv2i1:$pg, f64:$splat, nxv2f64:$passthru)),
(!cast<Instruction>(NAME # _D) $passthru, $pg, $splat)>;
+
+ def : Pat<(nxv8bf16 (op nxv8i1:$pg, bf16:$splat, nxv8bf16:$passthru)),
+ (!cast<Instruction>(NAME # _H) $passthru, $pg, $splat)>;
}
class sve_int_perm_compact<bit sz, string asm, ZPRRegOp zprty>
@@ -6025,7 +6170,6 @@ multiclass sve_int_perm_compact<string asm, SDPatternOperator op> {
def : SVE_2_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
-
//===----------------------------------------------------------------------===//
// SVE Memory - Contiguous Load Group
//===----------------------------------------------------------------------===//
@@ -6050,8 +6194,8 @@ class sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
let Inst{4-0} = Zt;
let mayLoad = 1;
- let Uses = !if(!eq(nf, 1), [FFR], []);
- let Defs = !if(!eq(nf, 1), [FFR], []);
+ let Uses = !if(nf, [FFR], []);
+ let Defs = !if(nf, [FFR], []);
}
multiclass sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
@@ -6253,8 +6397,8 @@ class sve_mem_cld_ss_base<bits<4> dtype, bit ff, dag iops, string asm,
let Inst{4-0} = Zt;
let mayLoad = 1;
- let Uses = !if(!eq(ff, 1), [FFR], []);
- let Defs = !if(!eq(ff, 1), [FFR], []);
+ let Uses = !if(ff, [FFR], []);
+ let Defs = !if(ff, [FFR], []);
}
multiclass sve_mem_cld_ss<bits<4> dtype, string asm, RegisterOperand listty,
@@ -6937,7 +7081,6 @@ multiclass sve_mem_64b_prfm_sv_lsl_scaled<bits<2> msz, string asm,
}
-
class sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5),
asm, "\t$prfop, $Pg, [$Zn, $imm5]",
@@ -7021,7 +7164,6 @@ multiclass sve_int_bin_cons_misc_0_a_64_lsl<bits<2> opc, string asm> {
def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR64, ZPR64ExtLSL64>;
}
-
//===----------------------------------------------------------------------===//
// SVE Integer Misc - Unpredicated Group
//===----------------------------------------------------------------------===//
@@ -7085,8 +7227,8 @@ multiclass sve_int_bin_cons_misc_0_c_fexpa<string asm, SDPatternOperator op> {
//===----------------------------------------------------------------------===//
class sve_int_reduce<bits<2> sz8_32, bits<2> fmt, bits<3> opc, string asm,
- ZPRRegOp zprty, RegisterClass regtype>
-: I<(outs regtype:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
+ ZPRRegOp zprty, FPRasZPROperand dstOpType>
+: I<(outs dstOpType:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Vd, $Pg, $Zn",
"",
[]>, Sched<[]> {
@@ -7104,51 +7246,54 @@ class sve_int_reduce<bits<2> sz8_32, bits<2> fmt, bits<3> opc, string asm,
let Inst{4-0} = Vd;
}
-multiclass sve_int_reduce_0_saddv<bits<3> opc, string asm, SDPatternOperator op> {
- def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64>;
- def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64>;
- def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64>;
+multiclass sve_int_reduce_0_saddv<bits<3> opc, string asm,
+ SDPatternOperator op> {
+ def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64asZPR>;
+ def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64asZPR>;
+ def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64asZPR>;
- def : SVE_2_Op_Pat<i64, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
- def : SVE_2_Op_Pat<i64, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
- def : SVE_2_Op_Pat<i64, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
}
-multiclass sve_int_reduce_0_uaddv<bits<3> opc, string asm, SDPatternOperator op, SDPatternOperator opSaddv> {
- def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64>;
- def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64>;
- def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64>;
- def _D : sve_int_reduce<0b11, 0b00, opc, asm, ZPR64, FPR64>;
+multiclass sve_int_reduce_0_uaddv<bits<3> opc, string asm,
+ SDPatternOperator op> {
+ def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64asZPR>;
+ def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64asZPR>;
+ def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64asZPR>;
+ def _D : sve_int_reduce<0b11, 0b00, opc, asm, ZPR64, FPR64asZPR>;
- def : SVE_2_Op_Pat<i64, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
- def : SVE_2_Op_Pat<i64, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
- def : SVE_2_Op_Pat<i64, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
- def : SVE_2_Op_Pat<i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
- def : SVE_2_Op_Pat<i64, opSaddv, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve_int_reduce_1<bits<3> opc, string asm, SDPatternOperator op> {
- def _B : sve_int_reduce<0b00, 0b01, opc, asm, ZPR8, FPR8>;
- def _H : sve_int_reduce<0b01, 0b01, opc, asm, ZPR16, FPR16>;
- def _S : sve_int_reduce<0b10, 0b01, opc, asm, ZPR32, FPR32>;
- def _D : sve_int_reduce<0b11, 0b01, opc, asm, ZPR64, FPR64>;
+multiclass sve_int_reduce_1<bits<3> opc, string asm,
+ SDPatternOperator op> {
+ def _B : sve_int_reduce<0b00, 0b01, opc, asm, ZPR8, FPR8asZPR>;
+ def _H : sve_int_reduce<0b01, 0b01, opc, asm, ZPR16, FPR16asZPR>;
+ def _S : sve_int_reduce<0b10, 0b01, opc, asm, ZPR32, FPR32asZPR>;
+ def _D : sve_int_reduce<0b11, 0b01, opc, asm, ZPR64, FPR64asZPR>;
- def : SVE_2_Op_Pat_Reduce_To_Neon<v16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B), bsub>;
- def : SVE_2_Op_Pat_Reduce_To_Neon<v8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H), hsub>;
- def : SVE_2_Op_Pat_Reduce_To_Neon<v4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S), ssub>;
- def : SVE_2_Op_Pat_Reduce_To_Neon<v2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D), dsub>;
+ def : SVE_2_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_2_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve_int_reduce_2<bits<3> opc, string asm, SDPatternOperator op> {
- def _B : sve_int_reduce<0b00, 0b11, opc, asm, ZPR8, FPR8>;
- def _H : sve_int_reduce<0b01, 0b11, opc, asm, ZPR16, FPR16>;
- def _S : sve_int_reduce<0b10, 0b11, opc, asm, ZPR32, FPR32>;
- def _D : sve_int_reduce<0b11, 0b11, opc, asm, ZPR64, FPR64>;
+multiclass sve_int_reduce_2<bits<3> opc, string asm,
+ SDPatternOperator op> {
+ def _B : sve_int_reduce<0b00, 0b11, opc, asm, ZPR8, FPR8asZPR>;
+ def _H : sve_int_reduce<0b01, 0b11, opc, asm, ZPR16, FPR16asZPR>;
+ def _S : sve_int_reduce<0b10, 0b11, opc, asm, ZPR32, FPR32asZPR>;
+ def _D : sve_int_reduce<0b11, 0b11, opc, asm, ZPR64, FPR64asZPR>;
- def : SVE_2_Op_Pat_Reduce_To_Neon<v16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B), bsub>;
- def : SVE_2_Op_Pat_Reduce_To_Neon<v8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H), hsub>;
- def : SVE_2_Op_Pat_Reduce_To_Neon<v4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S), ssub>;
- def : SVE_2_Op_Pat_Reduce_To_Neon<v2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D), dsub>;
+ def : SVE_2_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_2_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
class sve_int_movprfx_pred<bits<2> sz8_32, bits<3> opc, string asm,
@@ -7253,7 +7398,7 @@ class sve_int_brkn<bit S, string asm>
let Inst{3-0} = Pdm;
let Constraints = "$Pdm = $_Pdm";
- let Defs = !if(!eq (S, 0b1), [NZCV], []);
+ let Defs = !if(S, [NZCV], []);
}
multiclass sve_int_brkn<bits<1> opc, string asm, SDPatternOperator op> {
@@ -7755,8 +7900,8 @@ multiclass sve_mem_ldor_ss<bits<2> sz, string asm, RegisterOperand listty,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
- def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), (AddrCP GPR64sp:$base, gprty:$offset))),
- (!cast<Instruction>(NAME) PPR3bAny:$gp, GPR64sp:$base, gprty:$offset)>;
+ def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), (AddrCP GPR64sp:$base, gprty:$offset))),
+ (!cast<Instruction>(NAME) PPR3bAny:$gp, GPR64sp:$base, gprty:$offset)>;
}
//===----------------------------------------------------------------------===//
@@ -7790,6 +7935,7 @@ multiclass sve_int_perm_bin_perm_128_zz<bits<2> opc, bit P, string asm, SDPatter
def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, !cast<Instruction>(NAME)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, !cast<Instruction>(NAME)>;
+ def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
}
/// Addressing modes
@@ -7808,7 +7954,10 @@ multiclass sve_fp_bin_pred_hfd<SDPatternOperator op> {
def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Pseudo>(NAME # _UNDEF_H)>;
+ def : SVE_3_Op_Pat<nxv4f16, op, nxv4i1, nxv4f16, nxv4f16, !cast<Pseudo>(NAME # _UNDEF_H)>;
+ def : SVE_3_Op_Pat<nxv2f16, op, nxv2i1, nxv2f16, nxv2f16, !cast<Pseudo>(NAME # _UNDEF_H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Pseudo>(NAME # _UNDEF_S)>;
+ def : SVE_3_Op_Pat<nxv2f32, op, nxv2i1, nxv2f32, nxv2f32, !cast<Pseudo>(NAME # _UNDEF_S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Pseudo>(NAME # _UNDEF_D)>;
}
@@ -7833,3 +7982,19 @@ multiclass sve_int_bin_pred_sd<SDPatternOperator op> {
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _UNDEF_D)>;
}
+
+// Predicated pseudo integer two operand instructions. Second operand is an
+// immediate specified by imm_[bhsd].
+multiclass sve_int_shift_pred_bhsd<SDPatternOperator op,
+ ComplexPattern imm_b, ComplexPattern imm_h,
+ ComplexPattern imm_s, ComplexPattern imm_d> {
+ def _UNDEF_B : PredTwoOpImmPseudo<NAME # _B, ZPR8, Operand<i32>, FalseLanesUndef>;
+ def _UNDEF_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, Operand<i32>, FalseLanesUndef>;
+ def _UNDEF_S : PredTwoOpImmPseudo<NAME # _S, ZPR32, Operand<i32>, FalseLanesUndef>;
+ def _UNDEF_D : PredTwoOpImmPseudo<NAME # _D, ZPR64, Operand<i32>, FalseLanesUndef>;
+
+ def : SVE_Shift_DupImm_Pred_Pat<nxv16i8, op, nxv16i1, i32, imm_b, !cast<Instruction>(NAME # _UNDEF_B)>;
+ def : SVE_Shift_DupImm_Pred_Pat<nxv8i16, op, nxv8i1, i32, imm_h, !cast<Instruction>(NAME # _UNDEF_H)>;
+ def : SVE_Shift_DupImm_Pred_Pat<nxv4i32, op, nxv4i1, i32, imm_s, !cast<Instruction>(NAME # _UNDEF_S)>;
+ def : SVE_Shift_DupImm_Pred_Pat<nxv2i64, op, nxv2i1, i64, imm_d, !cast<Instruction>(NAME # _UNDEF_D)>;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
index 0245dd1d611a..9911f33371c6 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -37,7 +37,7 @@
using namespace llvm;
using namespace llvm::PatternMatch;
-#define DEBUG_TYPE "sve-intrinsic-opts"
+#define DEBUG_TYPE "aarch64-sve-intrinsic-opts"
namespace llvm {
void initializeSVEIntrinsicOptsPass(PassRegistry &);
@@ -177,22 +177,50 @@ bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) {
if (isa<PHINode>(I->getArgOperand(0)))
return processPhiNode(I);
- // If we have a reinterpret intrinsic I of type A which is converting from
- // another reinterpret Y of type B, and the source type of Y is A, then we can
- // elide away both reinterprets if there are no other users of Y.
- auto *Y = isReinterpretToSVBool(I->getArgOperand(0));
- if (!Y)
- return false;
+ SmallVector<Instruction *, 32> CandidatesForRemoval;
+ Value *Cursor = I->getOperand(0), *EarliestReplacement = nullptr;
+
+ const auto *IVTy = cast<VectorType>(I->getType());
+
+ // Walk the chain of conversions.
+ while (Cursor) {
+ // If the type of the cursor has fewer lanes than the final result, zeroing
+ // must take place, which breaks the equivalence chain.
+ const auto *CursorVTy = cast<VectorType>(Cursor->getType());
+ if (CursorVTy->getElementCount().getKnownMinValue() <
+ IVTy->getElementCount().getKnownMinValue())
+ break;
+
+ // If the cursor has the same type as I, it is a viable replacement.
+ if (Cursor->getType() == IVTy)
+ EarliestReplacement = Cursor;
+
+ auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
- Value *SourceVal = Y->getArgOperand(0);
- if (I->getType() != SourceVal->getType())
+ // If this is not an SVE conversion intrinsic, this is the end of the chain.
+ if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
+ Intrinsic::aarch64_sve_convert_to_svbool ||
+ IntrinsicCursor->getIntrinsicID() ==
+ Intrinsic::aarch64_sve_convert_from_svbool))
+ break;
+
+ CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
+ Cursor = IntrinsicCursor->getOperand(0);
+ }
+
+ // If no viable replacement in the conversion chain was found, there is
+ // nothing to do.
+ if (!EarliestReplacement)
return false;
- I->replaceAllUsesWith(SourceVal);
+ I->replaceAllUsesWith(EarliestReplacement);
I->eraseFromParent();
- if (Y->use_empty())
- Y->eraseFromParent();
+ while (!CandidatesForRemoval.empty()) {
+ Instruction *Candidate = CandidatesForRemoval.pop_back_val();
+ if (Candidate->use_empty())
+ Candidate->eraseFromParent();
+ }
return true;
}
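
The rewritten optimizeConvertFromSVBool above replaces the old one-step elision with a walk over the whole chain of aarch64_sve_convert_to_svbool / aarch64_sve_convert_from_svbool calls: it remembers the earliest value whose type already matches the final result, stops as soon as a narrower predicate type would force lane zeroing, and afterwards deletes any intermediate conversions left without users. A minimal self-contained sketch of that walk, using a hypothetical Node type instead of LLVM's IntrinsicInst:

    // Simplified model of the chain walk (hypothetical Node type, not the IR
    // classes the patch operates on).
    #include <iostream>
    #include <string>

    struct Node {
      std::string Type;   // e.g. "nxv16i1" (svbool) or "nxv4i1"
      unsigned MinLanes;  // known-minimum element count of Type
      bool IsConversion;  // is this a convert_to/from_svbool?
      const Node *Src;    // operand 0; nullptr ends the chain
    };

    // Returns the earliest node that can replace I, or nullptr if none exists.
    const Node *findEarliestReplacement(const Node &I) {
      const Node *Earliest = nullptr;
      for (const Node *Cursor = I.Src; Cursor; Cursor = Cursor->Src) {
        // A narrower type in the chain zeroes the extra lanes and breaks the
        // equivalence, so stop before it.
        if (Cursor->MinLanes < I.MinLanes)
          break;
        if (Cursor->Type == I.Type)
          Earliest = Cursor;   // viable; keep walking for an even earlier one
        if (!Cursor->IsConversion)
          break;               // end of the conversion chain
      }
      return Earliest;
    }

    int main() {
      // %a:nxv4i1 -> to_svbool:nxv16i1 -> from_svbool:nxv4i1 (the instruction I)
      Node A{"nxv4i1", 4, false, nullptr};
      Node To{"nxv16i1", 16, true, &A};
      Node I{"nxv4i1", 4, true, &To};
      std::cout << (findEarliestReplacement(I) == &A) << "\n";  // prints 1
    }

With both conversions elided, uses of I refer to %a directly, which matches what the patch does with replaceAllUsesWith before erasing the now-dead intermediates.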
@@ -248,10 +276,8 @@ bool SVEIntrinsicOpts::runOnModule(Module &M) {
case Intrinsic::aarch64_sve_ptest_any:
case Intrinsic::aarch64_sve_ptest_first:
case Intrinsic::aarch64_sve_ptest_last:
- for (auto I = F.user_begin(), E = F.user_end(); I != E;) {
- auto *Inst = dyn_cast<Instruction>(*I++);
- Functions.insert(Inst->getFunction());
- }
+ for (User *U : F.users())
+ Functions.insert(cast<Instruction>(U)->getFunction());
break;
default:
break;
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index c27fc7a112ec..ac59d73fd9fd 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -26,6 +26,13 @@ namespace llvm {
namespace llvm {
+ namespace AArch64DBnXS {
+#define GET_DBNXS_IMPL
+#include "AArch64GenSystemOperands.inc"
+ }
+}
+
+namespace llvm {
namespace AArch64DB {
#define GET_DB_IMPL
#include "AArch64GenSystemOperands.inc"
@@ -158,7 +165,7 @@ std::string AArch64SysReg::genericRegisterString(uint32_t Bits) {
namespace llvm {
namespace AArch64TLBI {
-#define GET_TLBI_IMPL
+#define GET_TLBITable_IMPL
#include "AArch64GenSystemOperands.inc"
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/contrib/llvm-project/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 4e289fbe2325..1b13c94389cb 100644
--- a/contrib/llvm-project/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -338,6 +338,14 @@ struct SysAliasReg : SysAlias {
: SysAlias(N, E, F), NeedsReg(R) {}
};
+struct SysAliasImm : SysAlias {
+ uint16_t ImmValue;
+ constexpr SysAliasImm(const char *N, uint16_t E, uint16_t I)
+ : SysAlias(N, E), ImmValue(I) {}
+ constexpr SysAliasImm(const char *N, uint16_t E, uint16_t I, FeatureBitset F)
+ : SysAlias(N, E, F), ImmValue(I) {}
+};
+
namespace AArch64AT{
struct AT : SysAlias {
using SysAlias::SysAlias;
@@ -354,6 +362,14 @@ namespace AArch64DB {
#include "AArch64GenSystemOperands.inc"
}
+namespace AArch64DBnXS {
+ struct DBnXS : SysAliasImm {
+ using SysAliasImm::SysAliasImm;
+ };
+ #define GET_DBNXS_DECL
+ #include "AArch64GenSystemOperands.inc"
+}
+
namespace AArch64DC {
struct DC : SysAlias {
using SysAlias::SysAlias;
@@ -552,7 +568,7 @@ namespace AArch64TLBI {
struct TLBI : SysAliasReg {
using SysAliasReg::SysAliasReg;
};
- #define GET_TLBI_DECL
+ #define GET_TLBITable_DECL
#include "AArch64GenSystemOperands.inc"
}
@@ -606,7 +622,7 @@ namespace AArch64II {
MO_HI12 = 7,
/// MO_COFFSTUB - On a symbol operand "FOO", this indicates that the
- /// reference is actually to the ".refptrp.FOO" symbol. This is used for
+ /// reference is actually to the ".refptr.FOO" symbol. This is used for
/// stub symbols on windows.
MO_COFFSTUB = 0x8,
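
The new SysAliasImm added above parallels SysAliasReg: alongside the mnemonic and encoding it stores the immediate the alias stands for, which is what the DSB nXS barrier forms pulled in under GET_DBNXS_DECL / GET_DBNXS_IMPL need. A self-contained sketch of how such a table is typically consulted; the entries, values and lookup helper below are made up for illustration and are not the TableGen-generated AArch64GenSystemOperands.inc contents:

    #include <algorithm>
    #include <array>
    #include <cstdint>
    #include <cstring>
    #include <iostream>

    // Stand-in for the SysAliasImm shape from AArch64BaseInfo.h (illustrative).
    struct SysAliasImm {
      const char *Name;
      uint16_t Encoding;
      uint16_t ImmValue;  // immediate the named alias expands to
    };

    // Hypothetical table; the real one is emitted by TableGen.
    constexpr std::array<SysAliasImm, 2> DBnXSValues{{
        {"oshnxs", 0x3, 16},
        {"nshnxs", 0x7, 20},
    }};

    const SysAliasImm *lookupDBnXSByName(const char *Name) {
      auto It = std::find_if(DBnXSValues.begin(), DBnXSValues.end(),
                             [&](const SysAliasImm &E) {
                               return std::strcmp(E.Name, Name) == 0;
                             });
      return It == DBnXSValues.end() ? nullptr : &*It;
    }

    int main() {
      if (const SysAliasImm *E = lookupDBnXSByName("oshnxs"))
        std::cout << E->Name << " -> encoding " << E->Encoding
                  << ", imm " << E->ImmValue << "\n";
    }

The assembler parser and printer rely on this kind of name/encoding lookup, with the extra ImmValue telling them which #imm a given nXS mnemonic encodes.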
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h
index 88c79665be60..677c49331cd5 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -10,8 +10,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
-#include "llvm/IR/IntrinsicsR600.h" // TODO: Sink this.
-#include "llvm/IR/IntrinsicsAMDGPU.h" // TODO: Sink this.
+#include "llvm/IR/PassManager.h"
#include "llvm/Support/CodeGen.h"
namespace llvm {
@@ -52,7 +51,6 @@ FunctionPass *createSIAnnotateControlFlowPass();
FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSIPeepholeSDWAPass();
FunctionPass *createSILowerI1CopiesPass();
-FunctionPass *createSIFixupVectorISelPass();
FunctionPass *createSIAddIMGInitPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass();
@@ -69,12 +67,25 @@ FunctionPass *createSIPostRABundlerPass();
FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetMachine *);
FunctionPass *createAMDGPUUseNativeCallsPass();
FunctionPass *createAMDGPUCodeGenPreparePass();
+FunctionPass *createAMDGPULateCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);
ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);
FunctionPass *createAMDGPURewriteOutArgumentsPass();
FunctionPass *createSIModeRegisterPass();
+struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
+ AMDGPUSimplifyLibCallsPass(TargetMachine &TM) : TM(TM) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+ TargetMachine &TM;
+};
+
+struct AMDGPUUseNativeCallsPass : PassInfoMixin<AMDGPUUseNativeCallsPass> {
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
void initializeAMDGPUDAGToDAGISelPass(PassRegistry&);
void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&);
@@ -106,12 +117,35 @@ ModulePass *createAMDGPULowerKernelAttributesPass();
void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
extern char &AMDGPULowerKernelAttributesID;
+struct AMDGPULowerKernelAttributesPass
+ : PassInfoMixin<AMDGPULowerKernelAttributesPass> {
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
void initializeAMDGPUPropagateAttributesEarlyPass(PassRegistry &);
extern char &AMDGPUPropagateAttributesEarlyID;
+struct AMDGPUPropagateAttributesEarlyPass
+ : PassInfoMixin<AMDGPUPropagateAttributesEarlyPass> {
+ AMDGPUPropagateAttributesEarlyPass(TargetMachine &TM) : TM(TM) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+ TargetMachine &TM;
+};
+
void initializeAMDGPUPropagateAttributesLatePass(PassRegistry &);
extern char &AMDGPUPropagateAttributesLateID;
+struct AMDGPUPropagateAttributesLatePass
+ : PassInfoMixin<AMDGPUPropagateAttributesLatePass> {
+ AMDGPUPropagateAttributesLatePass(TargetMachine &TM) : TM(TM) {}
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
+private:
+ TargetMachine &TM;
+};
+
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
extern char &AMDGPURewriteOutArgumentsID;
@@ -148,9 +182,6 @@ extern char &SIFixSGPRCopiesID;
void initializeSIFixVGPRCopiesPass(PassRegistry &);
extern char &SIFixVGPRCopiesID;
-void initializeSIFixupVectorISelPass(PassRegistry &);
-extern char &SIFixupVectorISelID;
-
void initializeSILowerI1CopiesPass(PassRegistry &);
extern char &SILowerI1CopiesID;
@@ -202,11 +233,37 @@ FunctionPass *createAMDGPUPromoteAllocaToVector();
void initializeAMDGPUPromoteAllocaToVectorPass(PassRegistry&);
extern char &AMDGPUPromoteAllocaToVectorID;
+struct AMDGPUPromoteAllocaPass : PassInfoMixin<AMDGPUPromoteAllocaPass> {
+ AMDGPUPromoteAllocaPass(TargetMachine &TM) : TM(TM) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+ TargetMachine &TM;
+};
+
+struct AMDGPUPromoteAllocaToVectorPass
+ : PassInfoMixin<AMDGPUPromoteAllocaToVectorPass> {
+ AMDGPUPromoteAllocaToVectorPass(TargetMachine &TM) : TM(TM) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+ TargetMachine &TM;
+};
+
Pass *createAMDGPUStructurizeCFGPass();
FunctionPass *createAMDGPUISelDag(
TargetMachine *TM = nullptr,
CodeGenOpt::Level OptLevel = CodeGenOpt::Default);
ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt = true);
+
+struct AMDGPUAlwaysInlinePass : PassInfoMixin<AMDGPUAlwaysInlinePass> {
+ AMDGPUAlwaysInlinePass(bool GlobalOpt = true) : GlobalOpt(GlobalOpt) {}
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
+private:
+ bool GlobalOpt;
+};
+
ModulePass *createR600OpenCLImageTypeLoweringPass();
FunctionPass *createAMDGPUAnnotateUniformValues();
@@ -214,10 +271,19 @@ ModulePass *createAMDGPUPrintfRuntimeBinding();
void initializeAMDGPUPrintfRuntimeBindingPass(PassRegistry&);
extern char &AMDGPUPrintfRuntimeBindingID;
+struct AMDGPUPrintfRuntimeBindingPass
+ : PassInfoMixin<AMDGPUPrintfRuntimeBindingPass> {
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
ModulePass* createAMDGPUUnifyMetadataPass();
void initializeAMDGPUUnifyMetadataPass(PassRegistry&);
extern char &AMDGPUUnifyMetadataID;
+struct AMDGPUUnifyMetadataPass : PassInfoMixin<AMDGPUUnifyMetadataPass> {
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
void initializeSIOptimizeExecMaskingPreRAPass(PassRegistry&);
extern char &SIOptimizeExecMaskingPreRAID;
@@ -227,6 +293,9 @@ extern char &AMDGPUAnnotateUniformValuesPassID;
void initializeAMDGPUCodeGenPreparePass(PassRegistry&);
extern char &AMDGPUCodeGenPrepareID;
+void initializeAMDGPULateCodeGenPreparePass(PassRegistry &);
+extern char &AMDGPULateCodeGenPrepareID;
+
void initializeSIAnnotateControlFlowPass(PassRegistry&);
extern char &SIAnnotateControlFlowPassID;
@@ -258,9 +327,6 @@ void initializeAMDGPUExternalAAWrapperPass(PassRegistry&);
void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &);
-Pass *createAMDGPUFunctionInliningPass();
-void initializeAMDGPUInlinerPass(PassRegistry&);
-
ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass();
void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &);
extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;
@@ -281,8 +347,6 @@ enum TargetIndex {
};
}
-} // End namespace llvm
-
/// OpenCL uses address spaces to differentiate between
/// various memory regions on the hardware. On the CPU
/// all of the address spaces point to the same memory,
@@ -339,4 +403,17 @@ namespace AMDGPUAS {
};
}
+namespace AMDGPU {
+
+// FIXME: Missing constant_32bit
+inline bool isFlatGlobalAddrSpace(unsigned AS) {
+ return AS == AMDGPUAS::GLOBAL_ADDRESS ||
+ AS == AMDGPUAS::FLAT_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS ||
+ AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
+}
+}
+
+} // End namespace llvm
+
#endif
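
Most of the structs introduced in AMDGPU.h above (AMDGPUSimplifyLibCallsPass, AMDGPUUseNativeCallsPass, AMDGPUPromoteAllocaPass, AMDGPUAlwaysInlinePass, and so on) are new-pass-manager counterparts of the legacy createXXXPass() factories: PassInfoMixin types whose run() returns PreservedAnalyses. A minimal sketch of driving two of them through the new pass manager; the wiring is illustrative only (in-tree they are registered via the target's pass-builder hooks), and TM is assumed to be an already-constructed AMDGPU TargetMachine:

    // Illustrative only: run two of the new-PM wrapper passes over one function.
    #include "AMDGPU.h"                       // target-internal header from this diff
    #include "llvm/IR/PassManager.h"
    #include "llvm/Passes/PassBuilder.h"
    #include "llvm/Target/TargetMachine.h"

    using namespace llvm;

    static void runAMDGPUIRPasses(Function &F, TargetMachine &TM) {
      FunctionAnalysisManager FAM;
      PassBuilder PB;
      PB.registerFunctionAnalyses(FAM);       // make the standard analyses available

      FunctionPassManager FPM;
      FPM.addPass(AMDGPUUseNativeCallsPass());     // stateless, default-constructed
      FPM.addPass(AMDGPUSimplifyLibCallsPass(TM)); // constructor takes the TargetMachine
      FPM.run(F, FAM);
    }

The same shape applies to the module-level additions (AMDGPUAlwaysInlinePass, AMDGPUPropagateAttributesLatePass, AMDGPUUnifyMetadataPass): they go into a ModulePassManager and run against a ModuleAnalysisManager instead.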
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td
index e32f0fcc4771..c352c0097c5c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -90,7 +90,7 @@ def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts",
def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access",
"UnalignedBufferAccess",
"true",
- "Support unaligned global loads and stores"
+ "Hardware supports unaligned global loads and stores"
>;
def FeatureTrapHandler: SubtargetFeature<"trap-handler",
@@ -105,6 +105,12 @@ def FeatureUnalignedScratchAccess : SubtargetFeature<"unaligned-scratch-access",
"Support unaligned scratch loads and stores"
>;
+def FeatureUnalignedDSAccess : SubtargetFeature<"unaligned-ds-access",
+ "UnalignedDSAccess",
+ "true",
+ "Hardware supports unaligned local and region loads and stores"
+>;
+
def FeatureApertureRegs : SubtargetFeature<"aperture-regs",
"HasApertureRegs",
"true",
@@ -123,10 +129,10 @@ def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts",
"Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions"
>;
-def FeatureDoesNotSupportXNACK : SubtargetFeature<"no-xnack-support",
- "DoesNotSupportXNACK",
+def FeatureSupportsXNACK : SubtargetFeature<"xnack-support",
+ "SupportsXNACK",
"true",
- "Hardware does not support XNACK"
+ "Hardware supports XNACK"
>;
// XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support
@@ -157,7 +163,7 @@ def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug",
"LDSMisalignedBug",
"true",
- "Some GFX10 bug with misaligned multi-dword LDS access in WGP mode"
+ "Some GFX10 bug with multi-dword LDS and flat access that is not naturally aligned in WGP mode"
>;
def FeatureMFMAInlineLiteralBug : SubtargetFeature<"mfma-inline-literal-bug",
@@ -220,6 +226,18 @@ def FeatureOffset3fBug : SubtargetFeature<"offset-3f-bug",
"Branch offset of 3f hardware bug"
>;
+def FeatureImageStoreD16Bug : SubtargetFeature<"image-store-d16-bug",
+ "HasImageStoreD16Bug",
+ "true",
+ "Image Store D16 hardware bug"
+>;
+
+def FeatureImageGather4D16Bug : SubtargetFeature<"image-gather4-d16-bug",
+ "HasImageGather4D16Bug",
+ "true",
+ "Image Gather4 D16 hardware bug"
+>;
+
class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
"ldsbankcount"#Value,
"LDSBankCount",
@@ -473,16 +491,16 @@ def FeatureAtomicFaddInsts : SubtargetFeature<"atomic-fadd-insts",
[FeatureFlatGlobalInsts]
>;
-def FeatureDoesNotSupportSRAMECC : SubtargetFeature<"no-sram-ecc-support",
- "DoesNotSupportSRAMECC",
+def FeatureSupportsSRAMECC : SubtargetFeature<"sramecc-support",
+ "SupportsSRAMECC",
"true",
- "Hardware does not support SRAM ECC"
+ "Hardware supports SRAMECC"
>;
-def FeatureSRAMECC : SubtargetFeature<"sram-ecc",
+def FeatureSRAMECC : SubtargetFeature<"sramecc",
"EnableSRAMECC",
"true",
- "Enable SRAM ECC"
+ "Enable SRAMECC"
>;
def FeatureNoSdstCMPX : SubtargetFeature<"no-sdst-cmpx",
@@ -626,19 +644,21 @@ def FeatureAutoWaitcntBeforeBarrier : SubtargetFeature <
"Hardware automatically inserts waitcnt before barrier"
>;
-def FeatureCodeObjectV3 : SubtargetFeature <
- "code-object-v3",
- "CodeObjectV3",
- "true",
- "Generate code object version 3"
->;
-
def FeatureTrigReducedRange : SubtargetFeature<"trig-reduced-range",
"HasTrigReducedRange",
"true",
"Requires use of fract on arguments to trig instructions"
>;
+// Alignment enforcement is controlled by a configuration register:
+// SH_MEM_CONFIG.alignment_mode
+def FeatureUnalignedAccessMode : SubtargetFeature<"unaligned-access-mode",
+ "UnalignedAccessMode",
+ "true",
+ "Enable unaligned global, local and region loads and stores if the hardware"
+ " supports it"
+>;
+
// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
"FeatureDisable","true",
@@ -655,8 +675,7 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
[FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128,
FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
- FeatureTrigReducedRange, FeatureDoesNotSupportSRAMECC,
- FeatureDoesNotSupportXNACK]
+ FeatureTrigReducedRange]
>;
def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
@@ -665,7 +684,7 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
FeatureWavefrontSize64, FeatureFlatAddressSpace,
FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange,
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
- FeatureDsSrc2Insts, FeatureDoesNotSupportSRAMECC]
+ FeatureDsSrc2Insts, FeatureUnalignedBufferAccess]
>;
def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
@@ -678,8 +697,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP,
FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts,
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
- FeatureDsSrc2Insts, FeatureDoesNotSupportSRAMECC, FeatureFastDenormalF32
- ]
+ FeatureDsSrc2Insts, FeatureFastDenormalF32, FeatureUnalignedBufferAccess]
>;
def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
@@ -695,8 +713,8 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts,
FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts,
- FeatureFastDenormalF32
- ]
+ FeatureFastDenormalF32, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
+ FeatureSupportsXNACK]
>;
def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
@@ -712,8 +730,9 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts,
FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking,
FeatureVOP3Literal, FeatureDPP8,
- FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureDoesNotSupportSRAMECC,
- FeatureGFX10A16, FeatureFastDenormalF32, FeatureG16
+ FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
+ FeatureGFX10A16, FeatureFastDenormalF32, FeatureG16,
+ FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess
]
>;
@@ -724,102 +743,92 @@ class FeatureSet<list<SubtargetFeature> Features_> {
def FeatureISAVersion6_0_0 : FeatureSet<[FeatureSouthernIslands,
FeatureFastFMAF32,
HalfRate64Ops,
- FeatureLDSBankCount32,
- FeatureDoesNotSupportXNACK,
- FeatureCodeObjectV3]>;
+ FeatureLDSBankCount32]>;
def FeatureISAVersion6_0_1 : FeatureSet<
[FeatureSouthernIslands,
- FeatureLDSBankCount32,
- FeatureDoesNotSupportXNACK,
- FeatureCodeObjectV3]>;
+ FeatureLDSBankCount32]>;
+
+def FeatureISAVersion6_0_2 : FeatureSet<
+ [FeatureSouthernIslands,
+ FeatureLDSBankCount32]>;
def FeatureISAVersion7_0_0 : FeatureSet<
[FeatureSeaIslands,
- FeatureLDSBankCount32,
- FeatureDoesNotSupportXNACK,
- FeatureCodeObjectV3]>;
+ FeatureLDSBankCount32]>;
def FeatureISAVersion7_0_1 : FeatureSet<
[FeatureSeaIslands,
HalfRate64Ops,
FeatureLDSBankCount32,
- FeatureFastFMAF32,
- FeatureDoesNotSupportXNACK,
- FeatureCodeObjectV3]>;
+ FeatureFastFMAF32]>;
def FeatureISAVersion7_0_2 : FeatureSet<
[FeatureSeaIslands,
FeatureLDSBankCount16,
- FeatureFastFMAF32,
- FeatureDoesNotSupportXNACK,
- FeatureCodeObjectV3]>;
+ FeatureFastFMAF32]>;
def FeatureISAVersion7_0_3 : FeatureSet<
[FeatureSeaIslands,
- FeatureLDSBankCount16,
- FeatureDoesNotSupportXNACK,
- FeatureCodeObjectV3]>;
+ FeatureLDSBankCount16]>;
def FeatureISAVersion7_0_4 : FeatureSet<
[FeatureSeaIslands,
- FeatureLDSBankCount32,
- FeatureDoesNotSupportXNACK,
- FeatureCodeObjectV3]>;
+ FeatureLDSBankCount32]>;
+
+def FeatureISAVersion7_0_5 : FeatureSet<
+ [FeatureSeaIslands,
+ FeatureLDSBankCount16]>;
def FeatureISAVersion8_0_1 : FeatureSet<
[FeatureVolcanicIslands,
FeatureFastFMAF32,
HalfRate64Ops,
FeatureLDSBankCount32,
- FeatureXNACK,
- FeatureUnpackedD16VMem,
- FeatureCodeObjectV3]>;
+ FeatureSupportsXNACK,
+ FeatureUnpackedD16VMem]>;
def FeatureISAVersion8_0_2 : FeatureSet<
[FeatureVolcanicIslands,
FeatureLDSBankCount32,
FeatureSGPRInitBug,
- FeatureUnpackedD16VMem,
- FeatureDoesNotSupportXNACK,
- FeatureCodeObjectV3]>;
+ FeatureUnpackedD16VMem]>;
def FeatureISAVersion8_0_3 : FeatureSet<
[FeatureVolcanicIslands,
FeatureLDSBankCount32,
- FeatureUnpackedD16VMem,
- FeatureDoesNotSupportXNACK,
- FeatureCodeObjectV3]>;
+ FeatureUnpackedD16VMem]>;
+
+def FeatureISAVersion8_0_5 : FeatureSet<
+ [FeatureVolcanicIslands,
+ FeatureLDSBankCount32,
+ FeatureSGPRInitBug,
+ FeatureUnpackedD16VMem]>;
def FeatureISAVersion8_1_0 : FeatureSet<
[FeatureVolcanicIslands,
FeatureLDSBankCount16,
- FeatureXNACK,
- FeatureCodeObjectV3]>;
+ FeatureSupportsXNACK,
+ FeatureImageStoreD16Bug,
+ FeatureImageGather4D16Bug]>;
def FeatureISAVersion9_0_0 : FeatureSet<
[FeatureGFX9,
FeatureMadMixInsts,
FeatureLDSBankCount32,
- FeatureCodeObjectV3,
- FeatureDoesNotSupportXNACK,
- FeatureDoesNotSupportSRAMECC]>;
+ FeatureImageGather4D16Bug]>;
def FeatureISAVersion9_0_2 : FeatureSet<
[FeatureGFX9,
FeatureMadMixInsts,
FeatureLDSBankCount32,
- FeatureXNACK,
- FeatureDoesNotSupportSRAMECC,
- FeatureCodeObjectV3]>;
+ FeatureImageGather4D16Bug]>;
def FeatureISAVersion9_0_4 : FeatureSet<
[FeatureGFX9,
FeatureLDSBankCount32,
FeatureFmaMixInsts,
- FeatureDoesNotSupportXNACK,
- FeatureDoesNotSupportSRAMECC,
- FeatureCodeObjectV3]>;
+ FeatureImageGather4D16Bug]>;
def FeatureISAVersion9_0_6 : FeatureSet<
[FeatureGFX9,
@@ -829,8 +838,8 @@ def FeatureISAVersion9_0_6 : FeatureSet<
FeatureDLInsts,
FeatureDot1Insts,
FeatureDot2Insts,
- FeatureDoesNotSupportXNACK,
- FeatureCodeObjectV3]>;
+ FeatureSupportsSRAMECC,
+ FeatureImageGather4D16Bug]>;
def FeatureISAVersion9_0_8 : FeatureSet<
[FeatureGFX9,
@@ -847,16 +856,22 @@ def FeatureISAVersion9_0_8 : FeatureSet<
FeatureMAIInsts,
FeaturePkFmacF16Inst,
FeatureAtomicFaddInsts,
- FeatureSRAMECC,
+ FeatureSupportsSRAMECC,
FeatureMFMAInlineLiteralBug,
- FeatureCodeObjectV3]>;
+ FeatureImageGather4D16Bug]>;
def FeatureISAVersion9_0_9 : FeatureSet<
[FeatureGFX9,
FeatureMadMixInsts,
FeatureLDSBankCount32,
+ FeatureImageGather4D16Bug]>;
+
+def FeatureISAVersion9_0_C : FeatureSet<
+ [FeatureGFX9,
+ FeatureMadMixInsts,
+ FeatureLDSBankCount32,
FeatureXNACK,
- FeatureCodeObjectV3]>;
+ FeatureImageGather4D16Bug]>;
// TODO: Organize more features into groups.
def FeatureGroup {
@@ -889,8 +904,7 @@ def FeatureISAVersion10_1_0 : FeatureSet<
FeatureMadMacF32Insts,
FeatureDsSrc2Insts,
FeatureLdsMisalignedBug,
- FeatureDoesNotSupportXNACK,
- FeatureCodeObjectV3])>;
+ FeatureSupportsXNACK])>;
def FeatureISAVersion10_1_1 : FeatureSet<
!listconcat(FeatureGroup.GFX10_1_Bugs,
@@ -910,8 +924,8 @@ def FeatureISAVersion10_1_1 : FeatureSet<
FeatureSMemTimeInst,
FeatureMadMacF32Insts,
FeatureDsSrc2Insts,
- FeatureDoesNotSupportXNACK,
- FeatureCodeObjectV3])>;
+ FeatureLdsMisalignedBug,
+ FeatureSupportsXNACK])>;
def FeatureISAVersion10_1_2 : FeatureSet<
!listconcat(FeatureGroup.GFX10_1_Bugs,
@@ -932,8 +946,7 @@ def FeatureISAVersion10_1_2 : FeatureSet<
FeatureMadMacF32Insts,
FeatureDsSrc2Insts,
FeatureLdsMisalignedBug,
- FeatureDoesNotSupportXNACK,
- FeatureCodeObjectV3])>;
+ FeatureSupportsXNACK])>;
def FeatureISAVersion10_3_0 : FeatureSet<
[FeatureGFX10,
@@ -946,9 +959,7 @@ def FeatureISAVersion10_3_0 : FeatureSet<
FeatureDot5Insts,
FeatureDot6Insts,
FeatureNSAEncoding,
- FeatureWavefrontSize32,
- FeatureDoesNotSupportXNACK,
- FeatureCodeObjectV3]>;
+ FeatureWavefrontSize32]>;
//===----------------------------------------------------------------------===//
@@ -1095,6 +1106,11 @@ def isGFX10Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10">,
AssemblerPredicate<(all_of FeatureGFX10Insts)>;
+def isGFX10Before1030 :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX10 &&"
+ "!Subtarget->hasGFX10_3Insts()">,
+ AssemblerPredicate<(all_of FeatureGFX10Insts,(not FeatureGFX10_3Insts))>;
+
def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
AssemblerPredicate<(all_of FeatureFlatAddressSpace)>;
@@ -1107,6 +1123,9 @@ def HasScalarFlatScratchInsts : Predicate<"Subtarget->hasScalarFlatScratchInsts(
def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">,
AssemblerPredicate<(all_of FeatureGFX9Insts)>;
+def HasFlatScratchSTMode : Predicate<"Subtarget->hasFlatScratchSTMode()">,
+ AssemblerPredicate<(any_of FeatureGFX10_3Insts)>;
+
def HasGFX10_BEncoding : Predicate<"Subtarget->hasGFX10_BEncoding()">,
AssemblerPredicate<(all_of FeatureGFX10_BEncoding)>;
@@ -1225,6 +1244,9 @@ def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">,
def HasMAIInsts : Predicate<"Subtarget->hasMAIInsts()">,
AssemblerPredicate<(all_of FeatureMAIInsts)>;
+def HasSMemRealTime : Predicate<"Subtarget->hasSMemRealTime()">,
+ AssemblerPredicate<(all_of FeatureSMemRealTime)>;
+
def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">,
AssemblerPredicate<(all_of FeatureSMemTimeInst)>;
@@ -1236,12 +1258,12 @@ def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">,
def HasMadMacF32Insts : Predicate<"Subtarget->hasMadMacF32Insts()">,
AssemblerPredicate<(all_of FeatureMadMacF32Insts)>;
+def HasFmaLegacy32 : Predicate<"Subtarget->hasGFX10_3Insts()">,
+ AssemblerPredicate<(any_of FeatureGFX10_3Insts)>;
+
def HasAtomicFaddInsts : Predicate<"Subtarget->hasAtomicFaddInsts()">,
AssemblerPredicate<(all_of FeatureAtomicFaddInsts)>;
-def HasNoMadMacF32Insts : Predicate<"!Subtarget->hasMadMacF32Insts()">,
- AssemblerPredicate<(all_of (not FeatureMadMacF32Insts))>;
-
def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">,
AssemblerPredicate<(all_of FeatureDsSrc2Insts)>;
@@ -1251,6 +1273,13 @@ def HasOffset3fBug : Predicate<"!Subtarget->hasOffset3fBug()">,
def EnableLateCFGStructurize : Predicate<
"EnableLateStructurizeCFG">;
+def EnableFlatScratch : Predicate<"Subtarget->enableFlatScratch()">;
+
+def DisableFlatScratch : Predicate<"!Subtarget->enableFlatScratch()">;
+
+def HasUnalignedAccessMode : Predicate<"Subtarget->hasUnalignedAccessMode()">,
+ AssemblerPredicate<(all_of FeatureUnalignedAccessMode)>;
+
// Include AMDGPU TD files
include "SISchedule.td"
include "GCNProcessors.td"
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
index bb2aba044974..0ed89e9ca8d6 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -10,27 +10,15 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUAliasAnalysis.h"
-#include "AMDGPU.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include <cassert>
+#include "llvm/IR/Instructions.h"
using namespace llvm;
#define DEBUG_TYPE "amdgpu-aa"
+AnalysisKey AMDGPUAA::Key;
+
// Register this pass...
char AMDGPUAAWrapperPass::ID = 0;
char AMDGPUExternalAAWrapper::ID = 0;
@@ -85,6 +73,44 @@ AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA,
if (Result == NoAlias)
return Result;
+ // In general, FLAT (generic) pointers could be aliased to LOCAL or PRIVATE
+ // pointers. However, as LOCAL or PRIVATE pointers point to local objects, in
+ // certain cases, it's still viable to check whether a FLAT pointer won't
+ // alias to a LOCAL or PRIVATE pointer.
+ MemoryLocation A = LocA;
+ MemoryLocation B = LocB;
+ // Canonicalize the location order to simplify the following alias check.
+ if (asA != AMDGPUAS::FLAT_ADDRESS) {
+ std::swap(asA, asB);
+ std::swap(A, B);
+ }
+ if (asA == AMDGPUAS::FLAT_ADDRESS &&
+ (asB == AMDGPUAS::LOCAL_ADDRESS || asB == AMDGPUAS::PRIVATE_ADDRESS)) {
+ const auto *ObjA =
+ getUnderlyingObject(A.Ptr->stripPointerCastsAndInvariantGroups());
+ if (const LoadInst *LI = dyn_cast<LoadInst>(ObjA)) {
+ // If a generic pointer is loaded from the constant address space, it
+      // could only be a GLOBAL or CONSTANT one as that address space is solely
+ // prepared on the host side, where only GLOBAL or CONSTANT variables are
+ // visible. Note that this even holds for regular functions.
+ if (LI->getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
+ return NoAlias;
+ } else if (const Argument *Arg = dyn_cast<Argument>(ObjA)) {
+ const Function *F = Arg->getParent();
+ switch (F->getCallingConv()) {
+ case CallingConv::AMDGPU_KERNEL:
+ // In the kernel function, kernel arguments won't alias to (local)
+ // variables in shared or private address space.
+ return NoAlias;
+ default:
+ // TODO: In the regular function, if that local variable in the
+      // location B is not captured, that argument pointer won't alias it
+      // either.
+ break;
+ }
+ }
+ }
+
// Forward the query to the next alias analysis.
return AAResultBase::alias(LocA, LocB, AAQI);
}
@@ -96,7 +122,7 @@ bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
return true;
- const Value *Base = GetUnderlyingObject(Loc.Ptr, DL);
+ const Value *Base = getUnderlyingObject(Loc.Ptr);
AS = Base->getType()->getPointerAddressSpace();
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
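
The disambiguation added to alias() above can be summarized as a standalone predicate over the underlying object of the FLAT pointer. This is a condensed restatement of the same rule, not the real interface; it assumes AMDGPU.h (for AMDGPUAS) and the relevant llvm/IR headers are available, and the helper name is made up.

    static bool flatPtrCannotBeLocalOrPrivate(const llvm::Value *Obj) {
      using namespace llvm;
      if (const auto *LI = dyn_cast<LoadInst>(Obj))
        // A generic pointer loaded from the constant address space was set up
        // on the host, so it can only point at GLOBAL or CONSTANT memory.
        return LI->getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
      if (const auto *Arg = dyn_cast<Argument>(Obj))
        // Kernel arguments never alias shared (LOCAL) or private allocations.
        return Arg->getParent()->getCallingConv() == CallingConv::AMDGPU_KERNEL;
      return false;
    }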
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
index fd8889ea5c0d..44de40d4aa7f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
@@ -13,13 +13,7 @@
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUALIASANALYSIS_H
#include "AMDGPU.h"
-#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include <algorithm>
-#include <memory>
namespace llvm {
@@ -34,15 +28,17 @@ class AMDGPUAAResult : public AAResultBase<AMDGPUAAResult> {
const DataLayout &DL;
public:
- explicit AMDGPUAAResult(const DataLayout &DL, Triple T) : AAResultBase(),
- DL(DL) {}
+ explicit AMDGPUAAResult(const DataLayout &DL) : AAResultBase(), DL(DL) {}
AMDGPUAAResult(AMDGPUAAResult &&Arg)
: AAResultBase(std::move(Arg)), DL(Arg.DL) {}
/// Handle invalidation events from the new pass manager.
///
/// By definition, this result is stateless and so remains valid.
- bool invalidate(Function &, const PreservedAnalyses &) { return false; }
+ bool invalidate(Function &, const PreservedAnalyses &,
+ FunctionAnalysisManager::Invalidator &Inv) {
+ return false;
+ }
AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB,
AAQueryInfo &AAQI);
@@ -54,14 +50,13 @@ public:
class AMDGPUAA : public AnalysisInfoMixin<AMDGPUAA> {
friend AnalysisInfoMixin<AMDGPUAA>;
- static char PassID;
+ static AnalysisKey Key;
public:
using Result = AMDGPUAAResult;
AMDGPUAAResult run(Function &F, AnalysisManager<Function> &AM) {
- return AMDGPUAAResult(F.getParent()->getDataLayout(),
- Triple(F.getParent()->getTargetTriple()));
+ return AMDGPUAAResult(F.getParent()->getDataLayout());
}
};
@@ -80,8 +75,7 @@ public:
const AMDGPUAAResult &getResult() const { return *Result; }
bool doInitialization(Module &M) override {
- Result.reset(new AMDGPUAAResult(M.getDataLayout(),
- Triple(M.getTargetTriple())));
+ Result.reset(new AMDGPUAAResult(M.getDataLayout()));
return false;
}
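
With a static AnalysisKey and an invalidate() overload taking the new-pass-manager Invalidator, AMDGPUAA can be registered directly with a FunctionAnalysisManager and folded into an AAManager. A hedged sketch under those assumptions; in-tree registration is expected to happen through the target's PassBuilder callbacks rather than by hand.

    static void registerAMDGPUAA(llvm::FunctionAnalysisManager &FAM) {
      // Make the AMDGPU result available as a function analysis.
      FAM.registerPass([] { return llvm::AMDGPUAA(); });
      // Build an AAManager that consults it; a full pipeline would also
      // register BasicAA and the other default alias analyses here.
      llvm::AAManager AA;
      AA.registerFunctionAnalysis<llvm::AMDGPUAA>();
      FAM.registerPass([&] { return std::move(AA); });
    }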
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index 22947544ac07..51af25050950 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -15,9 +15,9 @@
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Module.h"
-#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
using namespace llvm;
@@ -32,8 +32,6 @@ static cl::opt<bool> StressCalls(
class AMDGPUAlwaysInline : public ModulePass {
bool GlobalOpt;
- void recursivelyVisitUsers(GlobalValue &GV,
- SmallPtrSetImpl<Function *> &FuncsToAlwaysInline);
public:
static char ID;
@@ -53,16 +51,13 @@ INITIALIZE_PASS(AMDGPUAlwaysInline, "amdgpu-always-inline",
char AMDGPUAlwaysInline::ID = 0;
-void AMDGPUAlwaysInline::recursivelyVisitUsers(
- GlobalValue &GV,
- SmallPtrSetImpl<Function *> &FuncsToAlwaysInline) {
- SmallVector<User *, 16> Stack;
+static void
+recursivelyVisitUsers(GlobalValue &GV,
+ SmallPtrSetImpl<Function *> &FuncsToAlwaysInline) {
+ SmallVector<User *, 16> Stack(GV.users());
SmallPtrSet<const Value *, 8> Visited;
- for (User *U : GV.users())
- Stack.push_back(U);
-
while (!Stack.empty()) {
User *U = Stack.pop_back_val();
if (!Visited.insert(U).second)
@@ -86,12 +81,11 @@ void AMDGPUAlwaysInline::recursivelyVisitUsers(
continue;
}
- for (User *UU : U->users())
- Stack.push_back(UU);
+ append_range(Stack, U->users());
}
}
-bool AMDGPUAlwaysInline::runOnModule(Module &M) {
+static bool alwaysInlineImpl(Module &M, bool GlobalOpt) {
std::vector<GlobalAlias*> AliasesToRemove;
SmallPtrSet<Function *, 8> FuncsToAlwaysInline;
@@ -157,7 +151,16 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
return !FuncsToAlwaysInline.empty() || !FuncsToNoInline.empty();
}
+bool AMDGPUAlwaysInline::runOnModule(Module &M) {
+ return alwaysInlineImpl(M, GlobalOpt);
+}
+
ModulePass *llvm::createAMDGPUAlwaysInlinePass(bool GlobalOpt) {
return new AMDGPUAlwaysInline(GlobalOpt);
}
+PreservedAnalyses AMDGPUAlwaysInlinePass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ alwaysInlineImpl(M, GlobalOpt);
+ return PreservedAnalyses::all();
+}
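
Splitting the logic into alwaysInlineImpl() lets the legacy pass and the new AMDGPUAlwaysInlinePass share one implementation. A hedged sketch of how the new-pass-manager class could be added to a module pipeline; addAMDGPUAlwaysInline is a hypothetical helper, and real wiring is expected to go through PassBuilder callbacks.

    static void addAMDGPUAlwaysInline(llvm::ModulePassManager &MPM,
                                      bool GlobalOpt = true) {
      // GlobalOpt defaults to true, mirroring createAMDGPUAlwaysInlinePass().
      MPM.addPass(llvm::AMDGPUAlwaysInlinePass(GlobalOpt));
    }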
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index 625074569cfa..a4e72f787230 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -12,27 +12,12 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
+#include "GCNSubtarget.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "amdgpu-annotate-kernel-features"
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index 45f515c5115e..c2a4d67ea98e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -18,11 +18,8 @@
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "amdgpu-annotate-uniform"
@@ -108,9 +105,11 @@ bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
for (auto &BB : Checklist) {
BasicBlock::iterator StartIt = (!L && (BB == Load->getParent())) ?
BasicBlock::iterator(Load) : BB->end();
- auto Q = MDR->getPointerDependencyFrom(MemoryLocation(Ptr), true,
- StartIt, BB, Load);
- if (Q.isClobber() || Q.isUnknown())
+ auto Q = MDR->getPointerDependencyFrom(
+ MemoryLocation::getBeforeOrAfter(Ptr), true, StartIt, BB, Load);
+ if (Q.isClobber() || Q.isUnknown() ||
+ // Store defines the load and thus clobbers it.
+ (Q.isDef() && Q.getInst()->mayWriteToMemory()))
return true;
}
return false;
@@ -140,10 +139,11 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
}
bool NotClobbered = false;
+ bool GlobalLoad = isGlobalLoad(I);
if (PtrI)
- NotClobbered = !isClobberedInFunction(&I);
+ NotClobbered = GlobalLoad && !isClobberedInFunction(&I);
else if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) {
- if (isGlobalLoad(I) && !isClobberedInFunction(&I)) {
+ if (GlobalLoad && !isClobberedInFunction(&I)) {
NotClobbered = true;
// Lookup for the existing GEP
if (noClobberClones.count(Ptr)) {
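
The updated clobber scan now also treats a memory-dependence Def result as a clobber when the defining instruction writes memory, and only global loads are considered for the no-clobber annotation. A condensed form of the new predicate, assuming the MemoryDependenceAnalysis header is included; the helper name is illustrative.

    static bool blocksUniformLoad(const llvm::MemDepResult &Q) {
      return Q.isClobber() || Q.isUnknown() ||
             // A store that defines the queried location clobbers the load too.
             (Q.isDef() && Q.getInst()->mayWriteToMemory());
    }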
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index d078fc147a36..fb273a1650ae 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -6,11 +6,13 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPU.h"
#include "AMDGPUArgumentUsageInfo.h"
+#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/Function.h"
#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index 576e6cfe929e..139ac3bab14c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -9,14 +9,13 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
-#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/Pass.h"
-#include "llvm/Support/LowLevelTypeImpl.h"
namespace llvm {
class Function;
+class LLT;
class raw_ostream;
class TargetRegisterClass;
class TargetRegisterInfo;
@@ -27,7 +26,7 @@ private:
friend class AMDGPUArgumentUsageInfo;
union {
- Register Reg;
+ MCRegister Reg;
unsigned StackOffset;
};
@@ -69,7 +68,7 @@ public:
return !IsStack;
}
- Register getRegister() const {
+ MCRegister getRegister() const {
assert(!IsStack);
return Reg;
}
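
ArgDescriptor now hands back an MCRegister because a preloaded argument always lives in a physical register, whereas llvm::Register also admits virtual registers. A small hedged illustration, with physRegOf as a made-up helper and the usual headers assumed.

    static llvm::MCRegister physRegOf(const llvm::ArgDescriptor &AD) {
      assert(AD.isRegister() && "stack-passed arguments carry an offset instead");
      // MCRegister can only name a physical register, matching what the
      // descriptor stores for register-passed arguments.
      return AD.getRegister();
    }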
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index eef8fe2fc3b7..c655e5ec87b7 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -17,37 +17,27 @@
#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUHSAMetadataStreamer.h"
+#include "AMDKernelCodeT.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "R600AsmPrinter.h"
-#include "R600Defines.h"
-#include "R600MachineFunctionInfo.h"
-#include "R600RegisterInfo.h"
-#include "SIDefines.h"
-#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/Support/AMDGPUMetadata.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
using namespace llvm::AMDGPU;
-using namespace llvm::AMDGPU::HSAMD;
// We need to tell the runtime some amount ahead of time if we don't know the
// true stack size. Assume a smaller number if this is only due to dynamic /
@@ -108,10 +98,13 @@ extern "C" void LLVM_EXTERNAL_VISIBILITY LLVMInitializeAMDGPUAsmPrinter() {
AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer)
: AsmPrinter(TM, std::move(Streamer)) {
- if (IsaInfo::hasCodeObjectV3(getGlobalSTI()))
- HSAMetadataStream.reset(new MetadataStreamerV3());
- else
- HSAMetadataStream.reset(new MetadataStreamerV2());
+ if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
+ if (isHsaAbiVersion2(getGlobalSTI())) {
+ HSAMetadataStream.reset(new HSAMD::MetadataStreamerV2());
+ } else {
+ HSAMetadataStream.reset(new HSAMD::MetadataStreamerV3());
+ }
+ }
}
StringRef AMDGPUAsmPrinter::getPassName() const {
@@ -129,7 +122,7 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
}
void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
- if (IsaInfo::hasCodeObjectV3(getGlobalSTI())) {
+ if (isHsaAbiVersion3(getGlobalSTI())) {
std::string ExpectedTarget;
raw_string_ostream ExpectedTargetOS(ExpectedTarget);
IsaInfo::streamIsaVersion(getGlobalSTI(), ExpectedTargetOS);
@@ -147,7 +140,7 @@ void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
getTargetStreamer()->getPALMetadata()->readFromIR(M);
- if (IsaInfo::hasCodeObjectV3(getGlobalSTI()))
+ if (isHsaAbiVersion3(getGlobalSTI()))
return;
// HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2.
@@ -165,7 +158,8 @@ void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
if (!getTargetStreamer())
return;
- if (!IsaInfo::hasCodeObjectV3(getGlobalSTI())) {
+ if (TM.getTargetTriple().getOS() != Triple::AMDHSA ||
+ isHsaAbiVersion2(getGlobalSTI())) {
// Emit ISA Version (NT_AMD_AMDGPU_ISA).
std::string ISAVersionString;
raw_string_ostream ISAVersionStream(ISAVersionString);
@@ -203,7 +197,7 @@ void AMDGPUAsmPrinter::emitFunctionBodyStart() {
const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
const Function &F = MF->getFunction();
- if (!STM.hasCodeObjectV3() && STM.isAmdHsaOrMesa(F) &&
+ if ((STM.isMesaKernel(F) || isHsaAbiVersion2(getGlobalSTI())) &&
(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
amd_kernel_code_t KernelCode;
@@ -220,8 +214,8 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
if (!MFI.isEntryFunction())
return;
- if (!IsaInfo::hasCodeObjectV3(getGlobalSTI()) ||
- TM.getTargetTriple().getOS() != Triple::AMDHSA)
+ if (TM.getTargetTriple().getOS() != Triple::AMDHSA ||
+ isHsaAbiVersion2(getGlobalSTI()))
return;
auto &Streamer = getTargetStreamer()->getStreamer();
@@ -256,8 +250,8 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
}
void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
- if (IsaInfo::hasCodeObjectV3(getGlobalSTI()) &&
- TM.getTargetTriple().getOS() == Triple::AMDHSA) {
+ if (TM.getTargetTriple().getOS() == Triple::AMDHSA &&
+ isHsaAbiVersion3(getGlobalSTI())) {
AsmPrinter::emitFunctionEntryLabel();
return;
}
@@ -334,7 +328,7 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) {
// causing stale data in caches. Arguably this should be done by the linker,
// which is why this isn't done for Mesa.
const MCSubtargetInfo &STI = *getGlobalSTI();
- if (AMDGPU::isGFX10(STI) &&
+ if (AMDGPU::isGFX10Plus(STI) &&
(STI.getTargetTriple().getOS() == Triple::AMDHSA ||
STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
@@ -410,12 +404,12 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
assert(isUInt<32>(PI.ScratchSize));
- assert(isUInt<32>(PI.ComputePGMRSrc1));
+ assert(isUInt<32>(PI.getComputePGMRSrc1()));
assert(isUInt<32>(PI.ComputePGMRSrc2));
KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
- KernelDescriptor.compute_pgm_rsrc1 = PI.ComputePGMRSrc1;
+ KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1();
KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2;
KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
@@ -442,7 +436,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutStreamer->SwitchSection(ConfigSection);
}
- if (MFI->isEntryFunction()) {
+ if (MFI->isModuleEntryFunction()) {
getSIProgramInfo(CurrentProgramInfo, MF);
} else {
auto I = CallGraphResourceInfo.insert(
@@ -452,9 +446,12 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
Info = analyzeResourceUsage(MF);
}
- if (STM.isAmdPalOS())
- EmitPALMetadata(MF, CurrentProgramInfo);
- else if (!STM.isAmdHsaOS()) {
+ if (STM.isAmdPalOS()) {
+ if (MFI->isEntryFunction())
+ EmitPALMetadata(MF, CurrentProgramInfo);
+ else if (MFI->isModuleEntryFunction())
+ emitPALFunctionMetadata(MF);
+ } else if (!STM.isAmdHsaOS()) {
EmitProgramInfoSI(MF, CurrentProgramInfo);
}
@@ -532,6 +529,9 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
" WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
+ Twine(G_00B84C_SCRATCH_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC2:USER_SGPR: " +
Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
OutStreamer->emitRawComment(
@@ -741,7 +741,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
llvm_unreachable("src_pops_exiting_wave_id should not be used");
case AMDGPU::NoRegister:
- assert(MI.isDebugInstr());
+ assert(MI.isDebugInstr() && "Instruction uses invalid noreg register");
continue;
case AMDGPU::VCC:
@@ -915,7 +915,22 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
= TII->getNamedOperand(MI, AMDGPU::OpName::callee);
const Function *Callee = getCalleeFunction(*CalleeOp);
- if (!Callee || Callee->isDeclaration()) {
+ DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
+ CallGraphResourceInfo.end();
+ bool IsExternal = !Callee || Callee->isDeclaration();
+ if (!IsExternal)
+ I = CallGraphResourceInfo.find(Callee);
+
+ if (IsExternal || I == CallGraphResourceInfo.end()) {
+ // Avoid crashing on undefined behavior with an illegal call to a
+ // kernel. If a callsite's calling convention doesn't match the
+ // function's, it's undefined behavior. If the callsite calling
+ // convention does match, that would have errored earlier.
+ // FIXME: The verifier shouldn't allow this.
+ if (!IsExternal &&
+ AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
+ report_fatal_error("invalid call to entry function");
+
// If this is a call to an external function, we can't do much. Make
// conservative guesses.
@@ -936,19 +951,6 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
// We force CodeGen to run in SCC order, so the callee's register
// usage etc. should be the cumulative usage of all callees.
- auto I = CallGraphResourceInfo.find(Callee);
- if (I == CallGraphResourceInfo.end()) {
- // Avoid crashing on undefined behavior with an illegal call to a
- // kernel. If a callsite's calling convention doesn't match the
- // function's, it's undefined behavior. If the callsite calling
- // convention does match, that would have errored earlier.
- // FIXME: The verifier shouldn't allow this.
- if (AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
- report_fatal_error("invalid call to entry function");
-
- llvm_unreachable("callee should have been handled before caller");
- }
-
MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
@@ -989,7 +991,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.FlatUsed = Info.UsesFlatScratch;
ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
- if (!isUInt<32>(ProgInfo.ScratchSize)) {
+ const uint64_t MaxScratchPerWorkitem =
+ GCNSubtarget::MaxWaveScratchSize / STM.getWavefrontSize();
+ if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
ProgInfo.ScratchSize, DS_Error);
MF.getFunction().getContext().diagnose(DiagStackSize);
@@ -1023,18 +1027,26 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// Account for extra SGPRs and VGPRs reserved for debugger use.
ProgInfo.NumSGPR += ExtraSGPRs;
+ const Function &F = MF.getFunction();
+
// Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
// dispatch registers are function args.
unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
- for (auto &Arg : MF.getFunction().args()) {
- unsigned NumRegs = (Arg.getType()->getPrimitiveSizeInBits() + 31) / 32;
- if (Arg.hasAttribute(Attribute::InReg))
- WaveDispatchNumSGPR += NumRegs;
- else
- WaveDispatchNumVGPR += NumRegs;
+
+ if (isShader(F.getCallingConv())) {
+ // FIXME: We should be using the number of registers determined during
+ // calling convention lowering to legalize the types.
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ for (auto &Arg : F.args()) {
+ unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
+ if (Arg.hasAttribute(Attribute::InReg))
+ WaveDispatchNumSGPR += NumRegs;
+ else
+ WaveDispatchNumVGPR += NumRegs;
+ }
+ ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
+ ProgInfo.NumVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
}
- ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
- ProgInfo.NumVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
// Adjust number of registers used to meet default/requested minimum/maximum
// number of waves per execution unit request.
@@ -1129,18 +1141,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.MemOrdered = 1;
}
- ProgInfo.ComputePGMRSrc1 =
- S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
- S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
- S_00B848_PRIORITY(ProgInfo.Priority) |
- S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
- S_00B848_PRIV(ProgInfo.Priv) |
- S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
- S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
- S_00B848_IEEE_MODE(ProgInfo.IEEEMode) |
- S_00B848_WGP_MODE(ProgInfo.WgpMode) |
- S_00B848_MEM_ORDERED(ProgInfo.MemOrdered);
-
// 0 = X, 1 = XY, 2 = XYZ
unsigned TIDIGCompCnt = 0;
if (MFI->hasWorkItemIDZ())
@@ -1189,7 +1189,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);
- OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc1);
+ OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1());
OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2);
@@ -1238,12 +1238,10 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
MD->setEntryPoint(CC, MF.getFunction().getName());
MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
- if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
- MD->setRsrc1(CC, CurrentProgramInfo.ComputePGMRSrc1);
+ MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
+ if (AMDGPU::isCompute(CC)) {
MD->setRsrc2(CC, CurrentProgramInfo.ComputePGMRSrc2);
} else {
- MD->setRsrc1(CC, S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
- S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks));
if (CurrentProgramInfo.ScratchBlocks > 0)
MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
}
@@ -1260,6 +1258,16 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
MD->setWave32(MF.getFunction().getCallingConv());
}
+void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
+ auto *MD = getTargetStreamer()->getPALMetadata();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ MD->setFunctionScratchSize(MF, MFI.getStackSize());
+ // Set compute registers
+ MD->setRsrc1(CallingConv::AMDGPU_CS,
+ CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
+ MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.ComputePGMRSrc2);
+}
+
// This is supposed to be log2(Size)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
switch (Size) {
@@ -1287,7 +1295,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
AMDGPU::initDefaultAMDKernelCodeT(Out, &STM);
Out.compute_pgm_resource_registers =
- CurrentProgramInfo.ComputePGMRSrc1 |
+ CurrentProgramInfo.getComputePGMRSrc1() |
(CurrentProgramInfo.ComputePGMRSrc2 << 32);
Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
@@ -1296,7 +1304,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
AMD_HSA_BITS_SET(Out.code_properties,
AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
- getElementByteSizeValue(STM.getMaxPrivateElementSize()));
+ getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
if (MFI->hasPrivateSegmentBuffer()) {
Out.code_properties |=
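
The RSRC1 packing that used to be open-coded in getSIProgramInfo() (the block deleted above) is now obtained through SIProgramInfo::getComputePGMRSrc1(). A sketch of what that accessor presumably computes, reconstructed from the deleted expression; the in-tree implementation may differ in detail.

    static uint64_t computePGMRSrc1(const SIProgramInfo &PI) {
      return S_00B848_VGPRS(PI.VGPRBlocks) |
             S_00B848_SGPRS(PI.SGPRBlocks) |
             S_00B848_PRIORITY(PI.Priority) |
             S_00B848_FLOAT_MODE(PI.FloatMode) |
             S_00B848_PRIV(PI.Priv) |
             S_00B848_DX10_CLAMP(PI.DX10Clamp) |
             S_00B848_DEBUG_MODE(PI.DebugMode) |
             S_00B848_IEEE_MODE(PI.IEEEMode) |
             S_00B848_WGP_MODE(PI.WgpMode) |
             S_00B848_MEM_ORDERED(PI.MemOrdered);
    }

Routing every consumer (kernel descriptor, PAL metadata, amd_kernel_code_t) through one accessor keeps the RSRC1 encoding consistent across them.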
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 54e8338ab4b0..9e1e26d65d8c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -14,19 +14,10 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
-#include "AMDGPU.h"
-#include "AMDKernelCodeT.h"
-#include "AMDGPUHSAMetadataStreamer.h"
#include "SIProgramInfo.h"
-#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/Support/AMDHSAKernelDescriptor.h"
-#include <cstddef>
-#include <cstdint>
-#include <limits>
-#include <memory>
-#include <string>
-#include <vector>
+
+struct amd_kernel_code_t;
namespace llvm {
@@ -36,6 +27,16 @@ class MCCodeEmitter;
class MCOperand;
class GCNSubtarget;
+namespace AMDGPU {
+namespace HSAMD {
+class MetadataStreamer;
+}
+} // namespace AMDGPU
+
+namespace amdhsa {
+struct kernel_descriptor_t;
+}
+
class AMDGPUAsmPrinter final : public AsmPrinter {
private:
// Track resource usage for callee functions.
@@ -78,6 +79,7 @@ private:
const SIProgramInfo &KernelInfo);
void EmitPALMetadata(const MachineFunction &MF,
const SIProgramInfo &KernelInfo);
+ void emitPALFunctionMetadata(const MachineFunction &MF);
void emitCommonFunctionComments(uint32_t NumVGPR,
Optional<uint32_t> NumAGPR,
uint32_t TotalNumVGPR,
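
The trimmed includes work because this header only declares functions that take or return these types, which does not require their complete definitions. A minimal model of the same idiom, with fillKernelCode as a made-up name:

    struct amd_kernel_code_t;                         // defined in AMDKernelCodeT.h
    namespace amdhsa { struct kernel_descriptor_t; }  // AMDHSAKernelDescriptor.h

    void fillKernelCode(amd_kernel_code_t &Out);      // no complete type needed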
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index c9d25d4250d5..aae2a54c198b 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -14,13 +14,14 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIDefines.h"
+#include "GCNSubtarget.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#define DEBUG_TYPE "amdgpu-atomic-optimizer"
@@ -404,6 +405,11 @@ static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
}
}
+static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
+ const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
+ return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
+}
+
void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
AtomicRMWInst::BinOp Op,
unsigned ValIdx,
@@ -523,7 +529,7 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
// old value times the number of active lanes.
Value *const Ctpop = B.CreateIntCast(
B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
- NewV = B.CreateMul(V, Ctpop);
+ NewV = buildMul(B, V, Ctpop);
break;
}
@@ -543,7 +549,7 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
// old value times the parity of the number of active lanes.
Value *const Ctpop = B.CreateIntCast(
B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
- NewV = B.CreateMul(V, B.CreateAnd(Ctpop, 1));
+ NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
break;
}
}
@@ -622,7 +628,7 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
llvm_unreachable("Unhandled atomic op");
case AtomicRMWInst::Add:
case AtomicRMWInst::Sub:
- LaneOffset = B.CreateMul(V, Mbcnt);
+ LaneOffset = buildMul(B, V, Mbcnt);
break;
case AtomicRMWInst::And:
case AtomicRMWInst::Or:
@@ -633,7 +639,7 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
LaneOffset = B.CreateSelect(Cond, Identity, V);
break;
case AtomicRMWInst::Xor:
- LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1));
+ LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
break;
}
}
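
buildMul() above folds the common multiply-by-constant-one so the reduced atomic sequence does not emit a useless multiply. A conceptual scalar model of the Add/Sub rewrite, assuming a wave-uniform operand V and N active lanes; this is not compiler code, just the arithmetic the transform relies on.

    static unsigned waveAtomicAddFinal(unsigned OldValue, unsigned V, unsigned N) {
      // One lane performs a single atomic add of V * N instead of N serialized
      // read-modify-write operations. When V is the constant 1 (a plain
      // counter increment), V * N is just N, the popcount of the active-lane
      // mask, and buildMul emits no multiply at all.
      return OldValue + V * N;
    }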
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 949dcea3aa18..852a05b3c181 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -14,31 +14,45 @@
#include "AMDGPUCallLowering.h"
#include "AMDGPU.h"
-#include "AMDGPUISelLowering.h"
-#include "AMDGPUSubtarget.h"
+#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
-#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/Analysis.h"
-#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Support/LowLevelTypeImpl.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+
+#define DEBUG_TYPE "amdgpu-call-lowering"
using namespace llvm;
namespace {
-struct OutgoingValueHandler : public CallLowering::ValueHandler {
- OutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
- MachineInstrBuilder MIB, CCAssignFn *AssignFn)
- : ValueHandler(B, MRI, AssignFn), MIB(MIB) {}
+struct AMDGPUValueHandler : public CallLowering::ValueHandler {
+ AMDGPUValueHandler(bool IsIncoming, MachineIRBuilder &B,
+ MachineRegisterInfo &MRI, CCAssignFn *AssignFn)
+ : ValueHandler(IsIncoming, B, MRI, AssignFn) {}
- MachineInstrBuilder MIB;
+ /// Wrapper around extendRegister to ensure we extend to a full 32-bit
+ /// register.
+ Register extendRegisterMin32(Register ValVReg, CCValAssign &VA) {
+ if (VA.getLocVT().getSizeInBits() < 32) {
+ // 16-bit types are reported as legal for 32-bit registers. We need to
+ // extend and do a 32-bit copy to avoid the verifier complaining about it.
+ return MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
+ }
+
+ return extendRegister(ValVReg, VA);
+ }
+};
+
+struct AMDGPUOutgoingValueHandler : public AMDGPUValueHandler {
+ AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ MachineInstrBuilder MIB, CCAssignFn *AssignFn)
+ : AMDGPUValueHandler(false, B, MRI, AssignFn), MIB(MIB) {}
- bool isIncomingArgumentHandler() const override { return false; }
+ MachineInstrBuilder MIB;
Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
@@ -52,13 +66,7 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
- Register ExtReg;
- if (VA.getLocVT().getSizeInBits() < 32) {
- // 16-bit types are reported as legal for 32-bit registers. We need to
- // extend and do a 32-bit copy to avoid the verifier complaining about it.
- ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
- } else
- ExtReg = extendRegister(ValVReg, VA);
+ Register ExtReg = extendRegisterMin32(ValVReg, VA);
// If this is a scalar return, insert a readfirstlane just in case the value
// ends up in a VGPR.
@@ -85,12 +93,12 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
}
};
-struct IncomingArgHandler : public CallLowering::ValueHandler {
+struct AMDGPUIncomingArgHandler : public AMDGPUValueHandler {
uint64_t StackUsed = 0;
- IncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
- CCAssignFn *AssignFn)
- : ValueHandler(B, MRI, AssignFn) {}
+ AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ CCAssignFn *AssignFn)
+ : AMDGPUValueHandler(true, B, MRI, AssignFn) {}
Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
@@ -148,21 +156,107 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
/// parameters (it's a basic-block live-in), and a call instruction
/// (it's an implicit-def of the BL).
virtual void markPhysRegUsed(unsigned PhysReg) = 0;
-
- // FIXME: What is the point of this being a callback?
- bool isIncomingArgumentHandler() const override { return true; }
};
-struct FormalArgHandler : public IncomingArgHandler {
+struct FormalArgHandler : public AMDGPUIncomingArgHandler {
FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
CCAssignFn *AssignFn)
- : IncomingArgHandler(B, MRI, AssignFn) {}
+ : AMDGPUIncomingArgHandler(B, MRI, AssignFn) {}
void markPhysRegUsed(unsigned PhysReg) override {
MIRBuilder.getMBB().addLiveIn(PhysReg);
}
};
+struct CallReturnHandler : public AMDGPUIncomingArgHandler {
+ CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ MachineInstrBuilder MIB, CCAssignFn *AssignFn)
+ : AMDGPUIncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+
+ void markPhysRegUsed(unsigned PhysReg) override {
+ MIB.addDef(PhysReg, RegState::Implicit);
+ }
+
+ MachineInstrBuilder MIB;
+};
+
+struct AMDGPUOutgoingArgHandler : public AMDGPUValueHandler {
+ MachineInstrBuilder MIB;
+ CCAssignFn *AssignFnVarArg;
+
+ /// For tail calls, the byte offset of the call's argument area from the
+ /// callee's. Unused elsewhere.
+ int FPDiff;
+
+ // Cache the SP register vreg if we need it more than once in this call site.
+ Register SPReg;
+
+ bool IsTailCall;
+
+ AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI, MachineInstrBuilder MIB,
+ CCAssignFn *AssignFn, CCAssignFn *AssignFnVarArg,
+ bool IsTailCall = false, int FPDiff = 0)
+ : AMDGPUValueHandler(false, MIRBuilder, MRI, AssignFn), MIB(MIB),
+ AssignFnVarArg(AssignFnVarArg), FPDiff(FPDiff), IsTailCall(IsTailCall) {
+ }
+
+ Register getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) override {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
+ const LLT S32 = LLT::scalar(32);
+
+ if (IsTailCall) {
+ llvm_unreachable("implement me");
+ }
+
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ if (!SPReg)
+ SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0);
+
+ auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);
+
+ auto AddrReg = MIRBuilder.buildPtrAdd(PtrTy, SPReg, OffsetReg);
+ MPO = MachinePointerInfo::getStack(MF, Offset);
+ return AddrReg.getReg(0);
+ }
+
+ void assignValueToReg(Register ValVReg, Register PhysReg,
+ CCValAssign &VA) override {
+ MIB.addUse(PhysReg, RegState::Implicit);
+ Register ExtReg = extendRegisterMin32(ValVReg, VA);
+ MIRBuilder.buildCopy(PhysReg, ExtReg);
+ }
+
+ void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
+ MachinePointerInfo &MPO, CCValAssign &VA) override {
+ MachineFunction &MF = MIRBuilder.getMF();
+ uint64_t LocMemOffset = VA.getLocMemOffset();
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
+
+ auto MMO = MF.getMachineMemOperand(
+ MPO, MachineMemOperand::MOStore, Size,
+ commonAlignment(ST.getStackAlignment(), LocMemOffset));
+ MIRBuilder.buildStore(ValVReg, Addr, *MMO);
+ }
+
+ void assignValueToAddress(const CallLowering::ArgInfo &Arg, Register Addr,
+ uint64_t MemSize, MachinePointerInfo &MPO,
+ CCValAssign &VA) override {
+ Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
+ ? extendRegister(Arg.Regs[0], VA)
+ : Arg.Regs[0];
+
+ // If we extended the value type we might need to adjust the MMO's
+ // Size. This happens if ComputeValueVTs widened a small type value to a
+ // legal register type (e.g. s8->s16)
+ const LLT RegTy = MRI.getType(ValVReg);
+ MemSize = std::min(MemSize, (uint64_t)RegTy.getSizeInBytes());
+ assignValueToAddress(ValVReg, Addr, MemSize, MPO, VA);
+ }
+};
}
AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
@@ -183,48 +277,64 @@ static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
}
}
-void AMDGPUCallLowering::splitToValueTypes(
- MachineIRBuilder &B,
- const ArgInfo &OrigArg, unsigned OrigArgIdx,
- SmallVectorImpl<ArgInfo> &SplitArgs,
- const DataLayout &DL, CallingConv::ID CallConv,
- SplitArgTy PerformArgSplit) const {
+// FIXME: This should move to generic code.
+void AMDGPUCallLowering::splitToValueTypes(MachineIRBuilder &B,
+ const ArgInfo &OrigArg,
+ SmallVectorImpl<ArgInfo> &SplitArgs,
+ const DataLayout &DL,
+ CallingConv::ID CallConv) const {
const SITargetLowering &TLI = *getTLI<SITargetLowering>();
LLVMContext &Ctx = OrigArg.Ty->getContext();
- if (OrigArg.Ty->isVoidTy())
- return;
-
SmallVector<EVT, 4> SplitVTs;
ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);
assert(OrigArg.Regs.size() == SplitVTs.size());
- int SplitIdx = 0;
- for (EVT VT : SplitVTs) {
- Register Reg = OrigArg.Regs[SplitIdx];
- Type *Ty = VT.getTypeForEVT(Ctx);
- LLT LLTy = getLLTForType(*Ty, DL);
+ if (SplitVTs.size() == 0)
+ return;
- if (OrigArgIdx == AttributeList::ReturnIndex && VT.isScalarInteger()) {
- unsigned ExtendOp = TargetOpcode::G_ANYEXT;
- if (OrigArg.Flags[0].isSExt()) {
- assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
- ExtendOp = TargetOpcode::G_SEXT;
- } else if (OrigArg.Flags[0].isZExt()) {
- assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
- ExtendOp = TargetOpcode::G_ZEXT;
- }
+ if (SplitVTs.size() == 1) {
+ // No splitting to do, but we want to replace the original type (e.g. [1 x
+ // double] -> double).
+ SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
+ OrigArg.Flags[0], OrigArg.IsFixed);
+ return;
+ }
- EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
- extOpcodeToISDExtOpcode(ExtendOp));
- if (ExtVT != VT) {
- VT = ExtVT;
- Ty = ExtVT.getTypeForEVT(Ctx);
- LLTy = getLLTForType(*Ty, DL);
- Reg = B.buildInstr(ExtendOp, {LLTy}, {Reg}).getReg(0);
- }
- }
+ // Create one ArgInfo for each virtual register in the original ArgInfo.
+ assert(OrigArg.Regs.size() == SplitVTs.size() && "Regs / types mismatch");
+
+ bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters(
+ OrigArg.Ty, CallConv, false);
+ for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) {
+ Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx);
+ SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags[0],
+ OrigArg.IsFixed);
+ if (NeedsRegBlock)
+ SplitArgs.back().Flags[0].setInConsecutiveRegs();
+ }
+
+ SplitArgs.back().Flags[0].setInConsecutiveRegsLast();
+}
+
+void AMDGPUCallLowering::processSplitArgs(
+ MachineIRBuilder &B, const ArgInfo &OrigArg,
+ const SmallVectorImpl<ArgInfo> &SplitArg,
+ SmallVectorImpl<ArgInfo> &SplitArgs, const DataLayout &DL,
+ CallingConv::ID CallConv, bool IsOutgoing,
+ SplitArgTy PerformArgSplit) const {
+ LLVMContext &Ctx = OrigArg.Ty->getContext();
+ const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+
+ // FIXME: This is mostly nasty pre-processing before handleAssignments. Most
+ // of this should be performed by handleAssignments.
+
+ for (int SplitIdx = 0, e = SplitArg.size(); SplitIdx != e; ++SplitIdx) {
+ const ArgInfo &CurSplitArg = SplitArg[SplitIdx];
+ Register Reg = OrigArg.Regs[SplitIdx];
+ EVT VT = EVT::getEVT(CurSplitArg.Ty);
+ LLT LLTy = getLLTForType(*CurSplitArg.Ty, DL);
unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
MVT RegVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
@@ -232,9 +342,8 @@ void AMDGPUCallLowering::splitToValueTypes(
if (NumParts == 1) {
// No splitting to do, but we want to replace the original type (e.g. [1 x
// double] -> double).
- SplitArgs.emplace_back(Reg, Ty, OrigArg.Flags, OrigArg.IsFixed);
-
- ++SplitIdx;
+ SplitArgs.emplace_back(Reg, CurSplitArg.Ty, OrigArg.Flags,
+ OrigArg.IsFixed);
continue;
}
@@ -252,21 +361,9 @@ void AMDGPUCallLowering::splitToValueTypes(
}
PerformArgSplit(SplitRegs, Reg, LLTy, PartLLT, SplitIdx);
-
- ++SplitIdx;
}
}
-// Get the appropriate type to make \p OrigTy \p Factor times bigger.
-static LLT getMultipleType(LLT OrigTy, int Factor) {
- if (OrigTy.isVector()) {
- return LLT::vector(OrigTy.getNumElements() * Factor,
- OrigTy.getElementType());
- }
-
- return LLT::scalar(OrigTy.getSizeInBits() * Factor);
-}
-
// TODO: Move to generic code
static void unpackRegsToOrigType(MachineIRBuilder &B,
ArrayRef<Register> DstRegs,
@@ -276,34 +373,67 @@ static void unpackRegsToOrigType(MachineIRBuilder &B,
LLT PartTy) {
assert(DstRegs.size() > 1 && "Nothing to unpack");
- const unsigned SrcSize = SrcTy.getSizeInBits();
const unsigned PartSize = PartTy.getSizeInBits();
if (SrcTy.isVector() && !PartTy.isVector() &&
PartSize > SrcTy.getElementType().getSizeInBits()) {
// Vector was scalarized, and the elements extended.
- auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(),
- SrcReg);
+ auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg);
for (int i = 0, e = DstRegs.size(); i != e; ++i)
B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
return;
}
- if (SrcSize % PartSize == 0) {
+ LLT GCDTy = getGCDType(SrcTy, PartTy);
+ if (GCDTy == PartTy) {
+ // If this is already evenly divisible, we can create a simple unmerge.
B.buildUnmerge(DstRegs, SrcReg);
return;
}
- const int NumRoundedParts = (SrcSize + PartSize - 1) / PartSize;
+ MachineRegisterInfo &MRI = *B.getMRI();
+ LLT DstTy = MRI.getType(DstRegs[0]);
+ LLT LCMTy = getLCMType(SrcTy, PartTy);
+
+ const unsigned LCMSize = LCMTy.getSizeInBits();
+ const unsigned DstSize = DstTy.getSizeInBits();
+ const unsigned SrcSize = SrcTy.getSizeInBits();
+
+ Register UnmergeSrc = SrcReg;
+ if (LCMSize != SrcSize) {
+ // Widen to the common type.
+ Register Undef = B.buildUndef(SrcTy).getReg(0);
+ SmallVector<Register, 8> MergeParts(1, SrcReg);
+ for (unsigned Size = SrcSize; Size != LCMSize; Size += SrcSize)
+ MergeParts.push_back(Undef);
- LLT BigTy = getMultipleType(PartTy, NumRoundedParts);
- auto ImpDef = B.buildUndef(BigTy);
+ UnmergeSrc = B.buildMerge(LCMTy, MergeParts).getReg(0);
+ }
- auto Big = B.buildInsert(BigTy, ImpDef.getReg(0), SrcReg, 0).getReg(0);
+ // Unmerge to the original registers and pad with dead defs.
+ SmallVector<Register, 8> UnmergeResults(DstRegs.begin(), DstRegs.end());
+ for (unsigned Size = DstSize * DstRegs.size(); Size != LCMSize;
+ Size += DstSize) {
+ UnmergeResults.push_back(MRI.createGenericVirtualRegister(DstTy));
+ }
- int64_t Offset = 0;
- for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize)
- B.buildExtract(DstRegs[i], Big, Offset);
+ B.buildUnmerge(UnmergeResults, UnmergeSrc);
+}
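When the source size is not a multiple of the part size, the code above widens to the least common multiple with undef, unmerges, and pads the results with dead defs. A minimal numeric sketch of that arithmetic, assuming plain scalar bit sizes instead of LLTs:

#include <cstdio>
#include <numeric>

int main() {
  const unsigned SrcSize = 48, PartSize = 32;           // e.g. an s48 value split into s32 pieces
  const unsigned LCMSize = std::lcm(SrcSize, PartSize); // 96
  const unsigned SrcCopies = LCMSize / SrcSize;         // 2: the real value plus one undef
  const unsigned UnmergeParts = LCMSize / PartSize;     // 3 results of the unmerge
  const unsigned LiveParts = (SrcSize + PartSize - 1) / PartSize; // 2 real destinations
  std::printf("merge %u x s%u -> s%u, unmerge into %u x s%u (%u live, %u dead)\n",
              SrcCopies, SrcSize, LCMSize, UnmergeParts, PartSize, LiveParts,
              UnmergeParts - LiveParts);
  return 0;
}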
+
+bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
+ CallingConv::ID CallConv,
+ SmallVectorImpl<BaseArgInfo> &Outs,
+ bool IsVarArg) const {
+ // For shaders. Vector types should be explicitly handled by CC.
+ if (AMDGPU::isEntryFunctionCC(CallConv))
+ return true;
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+ CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs,
+ MF.getFunction().getContext());
+
+ return checkReturn(CCInfo, Outs, TLI.CCAssignFnForReturn(CallConv, IsVarArg));
}
/// Lower the return value for the already existing \p Ret. This assumes that
@@ -318,31 +448,77 @@ bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
const auto &F = MF.getFunction();
const DataLayout &DL = MF.getDataLayout();
MachineRegisterInfo *MRI = B.getMRI();
+ LLVMContext &Ctx = F.getContext();
CallingConv::ID CC = F.getCallingConv();
const SITargetLowering &TLI = *getTLI<SITargetLowering>();
- ArgInfo OrigRetInfo(VRegs, Val->getType());
- setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
- SmallVector<ArgInfo, 4> SplitRetInfos;
+ SmallVector<EVT, 8> SplitEVTs;
+ ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs);
+ assert(VRegs.size() == SplitEVTs.size() &&
+ "For each split Type there should be exactly one VReg.");
+
+ // We pre-process the return value decomposed into EVTs.
+ SmallVector<ArgInfo, 8> PreSplitRetInfos;
+
+ // Further processing is applied to split the arguments from PreSplitRetInfos
+ // into 32-bit pieces in SplitRetInfos before passing off to
+ // handleAssignments.
+ SmallVector<ArgInfo, 8> SplitRetInfos;
- splitToValueTypes(
- B, OrigRetInfo, AttributeList::ReturnIndex, SplitRetInfos, DL, CC,
- [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
- int VTSplitIdx) {
- unpackRegsToOrigType(B, Regs, SrcReg,
- SplitRetInfos[VTSplitIdx],
- LLTy, PartLLT);
- });
+ for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
+ EVT VT = SplitEVTs[i];
+ Register Reg = VRegs[i];
+ ArgInfo RetInfo(Reg, VT.getTypeForEVT(Ctx));
+ setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
+
+ if (VT.isScalarInteger()) {
+ unsigned ExtendOp = TargetOpcode::G_ANYEXT;
+ if (RetInfo.Flags[0].isSExt()) {
+ assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
+ ExtendOp = TargetOpcode::G_SEXT;
+ } else if (RetInfo.Flags[0].isZExt()) {
+ assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
+ ExtendOp = TargetOpcode::G_ZEXT;
+ }
+
+ EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
+ extOpcodeToISDExtOpcode(ExtendOp));
+ if (ExtVT != VT) {
+ RetInfo.Ty = ExtVT.getTypeForEVT(Ctx);
+ LLT ExtTy = getLLTForType(*RetInfo.Ty, DL);
+ Reg = B.buildInstr(ExtendOp, {ExtTy}, {Reg}).getReg(0);
+ }
+ }
+
+ if (Reg != RetInfo.Regs[0]) {
+ RetInfo.Regs[0] = Reg;
+ // Reset the arg flags after modifying Reg.
+ setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
+ }
+
+ splitToValueTypes(B, RetInfo, PreSplitRetInfos, DL, CC);
+
+ // FIXME: This splitting should mostly be done by handleAssignments
+ processSplitArgs(B, RetInfo,
+ PreSplitRetInfos, SplitRetInfos, DL, CC, true,
+ [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy,
+ LLT PartLLT, int VTSplitIdx) {
+ unpackRegsToOrigType(B, Regs, SrcReg,
+ PreSplitRetInfos[VTSplitIdx], LLTy,
+ PartLLT);
+ });
+ PreSplitRetInfos.clear();
+ }
CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
- OutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn);
+ AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn);
return handleAssignments(B, SplitRetInfos, RetHandler);
}
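The return path above picks the extension opcode from the signext/zeroext attributes before re-splitting, and only inserts an extend when the extended type differs from the original. A minimal sketch of that choice, with a hypothetical enum standing in for the TargetOpcode constants and an assumed 32-bit minimum width in place of getTypeForExtReturn:

#include <algorithm>
#include <cstdio>

enum class ExtOp { AnyExt, SExt, ZExt };

struct ExtendPlan {
  ExtOp Op;             // which G_*EXT to emit
  unsigned WidthInBits; // width after extension; nothing is emitted if unchanged
};

static ExtendPlan planReturnExtension(unsigned WidthInBits, bool IsSExt, bool IsZExt) {
  ExtOp Op = ExtOp::AnyExt;     // default when neither attribute is present
  if (IsSExt)
    Op = ExtOp::SExt;
  else if (IsZExt)
    Op = ExtOp::ZExt;
  // Assumed stand-in for getTypeForExtReturn: widen sub-32-bit integers to 32.
  return {Op, std::max(WidthInBits, 32u)};
}

int main() {
  ExtendPlan P = planReturnExtension(16, /*IsSExt=*/true, /*IsZExt=*/false);
  std::printf("opcode=%d width=%u\n", (int)P.Op, P.WidthInBits);
}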
-bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
- const Value *Val,
- ArrayRef<Register> VRegs) const {
+bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
+ ArrayRef<Register> VRegs,
+ FunctionLoweringInfo &FLI) const {
MachineFunction &MF = B.getMF();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -353,8 +529,8 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
const bool IsShader = AMDGPU::isShader(CC);
- const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) ||
- AMDGPU::isKernel(CC);
+ const bool IsWaveEnd =
+ (IsShader && MFI->returnsVoid()) || AMDGPU::isKernel(CC);
if (IsWaveEnd) {
B.buildInstr(AMDGPU::S_ENDPGM)
.addImm(0);
@@ -373,7 +549,9 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
Ret.addUse(ReturnAddrVReg);
}
- if (!lowerReturnVal(B, Val, VRegs, Ret))
+ if (!FLI.CanLowerReturn)
+ insertSRetStores(B, Val->getType(), VRegs, FLI.DemoteRegister);
+ else if (!lowerReturnVal(B, Val, VRegs, Ret))
return false;
if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
@@ -389,24 +567,19 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
return true;
}
-Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &B,
- Type *ParamTy,
- uint64_t Offset) const {
-
+void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
+ Type *ParamTy,
+ uint64_t Offset) const {
MachineFunction &MF = B.getMF();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MachineRegisterInfo &MRI = MF.getRegInfo();
- const Function &F = MF.getFunction();
- const DataLayout &DL = F.getParent()->getDataLayout();
- PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
- LLT PtrType = getLLTForType(*PtrTy, DL);
Register KernArgSegmentPtr =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);
- return B.buildPtrAdd(PtrType, KernArgSegmentVReg, OffsetReg).getReg(0);
+ B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
}
void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
@@ -417,7 +590,10 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
const DataLayout &DL = F.getParent()->getDataLayout();
MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
- Register PtrReg = lowerParameterPtr(B, ParamTy, Offset);
+
+ LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+ Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
+ lowerParameterPtr(PtrReg, B, ParamTy, Offset);
MachineMemOperand *MMO = MF.getMachineMemOperand(
PtrInfo,
@@ -504,12 +680,15 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
// TODO: Align down to dword alignment and extract bits for extending loads.
for (auto &Arg : F.args()) {
- Type *ArgTy = Arg.getType();
+ const bool IsByRef = Arg.hasByRefAttr();
+ Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
if (AllocSize == 0)
continue;
- Align ABIAlign = DL.getABITypeAlign(ArgTy);
+ MaybeAlign ABIAlign = IsByRef ? Arg.getParamAlign() : None;
+ if (!ABIAlign)
+ ABIAlign = DL.getABITypeAlign(ArgTy);
uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
@@ -519,16 +698,34 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
continue;
}
- ArrayRef<Register> OrigArgRegs = VRegs[i];
- Register ArgReg =
- OrigArgRegs.size() == 1
- ? OrigArgRegs[0]
- : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
-
Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);
- lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
- if (OrigArgRegs.size() > 1)
- unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
+
+ if (IsByRef) {
+ unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();
+
+ assert(VRegs[i].size() == 1 &&
+ "expected only one register for byval pointers");
+ if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
+ lowerParameterPtr(VRegs[i][0], B, ArgTy, ArgOffset);
+ } else {
+ const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+ Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
+ lowerParameterPtr(PtrReg, B, ArgTy, ArgOffset);
+
+ B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
+ }
+ } else {
+ ArrayRef<Register> OrigArgRegs = VRegs[i];
+ Register ArgReg =
+ OrigArgRegs.size() == 1
+ ? OrigArgRegs[0]
+ : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
+
+ lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
+ if (OrigArgRegs.size() > 1)
+ unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
+ }
+
++i;
}
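Explicit kernel arguments in the loop above are laid out by a running offset that is aligned before each argument and then bumped by its allocation size. A minimal sketch of that layout, assuming precomputed (size, alignment) pairs in place of DataLayout queries and a fixed base offset:

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  const uint64_t BaseOffset = 0;   // start of the explicit arguments in the kernarg segment
  uint64_t ExplicitArgOffset = 0;
  // e.g. i32, double, <2 x i16>
  const std::vector<std::pair<uint64_t, uint64_t>> Args = {{4, 4}, {8, 8}, {4, 4}};
  for (auto [AllocSize, ABIAlign] : Args) {
    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
    std::printf("arg at offset %llu (size %llu, align %llu)\n",
                (unsigned long long)ArgOffset, (unsigned long long)AllocSize,
                (unsigned long long)ABIAlign);
  }
}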
@@ -649,8 +846,8 @@ static void packSplitRegsToOrigType(MachineIRBuilder &B,
}
bool AMDGPUCallLowering::lowerFormalArguments(
- MachineIRBuilder &B, const Function &F,
- ArrayRef<ArrayRef<Register>> VRegs) const {
+ MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs,
+ FunctionLoweringInfo &FLI) const {
CallingConv::ID CC = F.getCallingConv();
// The infrastructure for normal calling convention lowering is essentially
@@ -659,7 +856,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (CC == CallingConv::AMDGPU_KERNEL)
return lowerFormalArgumentsKernel(B, F, VRegs);
- const bool IsShader = AMDGPU::isShader(CC);
+ const bool IsGraphics = AMDGPU::isGraphics(CC);
const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);
MachineFunction &MF = B.getMF();
@@ -688,11 +885,16 @@ bool AMDGPUCallLowering::lowerFormalArguments(
CCInfo.AllocateReg(ImplicitBufferPtrReg);
}
-
+ SmallVector<ArgInfo, 8> SplitArg;
SmallVector<ArgInfo, 32> SplitArgs;
unsigned Idx = 0;
unsigned PSInputNum = 0;
+ // Insert the hidden sret parameter if the return value won't fit in the
+ // return registers.
+ if (!FLI.CanLowerReturn)
+ insertSRetIncomingArgument(F, SplitArgs, FLI.DemoteRegister, MRI, DL);
+
for (auto &Arg : F.args()) {
if (DL.getTypeStoreSize(Arg.getType()) == 0)
continue;
@@ -700,7 +902,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
const bool InReg = Arg.hasAttribute(Attribute::InReg);
// SGPR arguments to functions not implemented.
- if (!IsShader && InReg)
+ if (!IsGraphics && InReg)
return false;
if (Arg.hasAttribute(Attribute::SwiftSelf) ||
@@ -733,16 +935,18 @@ bool AMDGPUCallLowering::lowerFormalArguments(
const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
setArgFlags(OrigArg, OrigArgIdx, DL, F);
- splitToValueTypes(
- B, OrigArg, OrigArgIdx, SplitArgs, DL, CC,
- // FIXME: We should probably be passing multiple registers to
- // handleAssignments to do this
- [&](ArrayRef<Register> Regs, Register DstReg,
- LLT LLTy, LLT PartLLT, int VTSplitIdx) {
- assert(DstReg == VRegs[Idx][VTSplitIdx]);
- packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
- LLTy, PartLLT);
- });
+ SplitArg.clear();
+ splitToValueTypes(B, OrigArg, SplitArg, DL, CC);
+
+ processSplitArgs(B, OrigArg, SplitArg, SplitArgs, DL, CC, false,
+ // FIXME: We should probably be passing multiple registers
+ // to handleAssignments to do this
+ [&](ArrayRef<Register> Regs, Register DstReg, LLT LLTy,
+ LLT PartLLT, int VTSplitIdx) {
+ assert(DstReg == VRegs[Idx][VTSplitIdx]);
+ packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
+ LLTy, PartLLT);
+ });
++Idx;
}
@@ -811,9 +1015,10 @@ bool AMDGPUCallLowering::lowerFormalArguments(
// Start adding system SGPRs.
if (IsEntryFunc) {
- TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
+ TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
} else {
- CCInfo.AllocateReg(Info->getScratchRSrcReg());
+ if (!Subtarget.enableFlatScratch())
+ CCInfo.AllocateReg(Info->getScratchRSrcReg());
TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
@@ -822,3 +1027,368 @@ bool AMDGPUCallLowering::lowerFormalArguments(
return true;
}
+
+bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
+ CCState &CCInfo,
+ SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
+ CallLoweringInfo &Info) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+
+ const AMDGPUFunctionArgInfo *CalleeArgInfo
+ = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
+
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();
+
+
+ // TODO: Unify with private memory register handling. This is complicated by
+ // the fact that at least in kernels, the input argument is not necessarily
+ // in the same location as the input.
+ AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
+ AMDGPUFunctionArgInfo::DISPATCH_PTR,
+ AMDGPUFunctionArgInfo::QUEUE_PTR,
+ AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
+ AMDGPUFunctionArgInfo::DISPATCH_ID,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
+ };
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const AMDGPULegalizerInfo *LI
+ = static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());
+
+ for (auto InputID : InputRegs) {
+ const ArgDescriptor *OutgoingArg;
+ const TargetRegisterClass *ArgRC;
+ LLT ArgTy;
+
+ std::tie(OutgoingArg, ArgRC, ArgTy) =
+ CalleeArgInfo->getPreloadedValue(InputID);
+ if (!OutgoingArg)
+ continue;
+
+ const ArgDescriptor *IncomingArg;
+ const TargetRegisterClass *IncomingArgRC;
+ std::tie(IncomingArg, IncomingArgRC, ArgTy) =
+ CallerArgInfo.getPreloadedValue(InputID);
+ assert(IncomingArgRC == ArgRC);
+
+ Register InputReg = MRI.createGenericVirtualRegister(ArgTy);
+
+ if (IncomingArg) {
+ LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy);
+ } else {
+ assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
+ LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
+ }
+
+ if (OutgoingArg->isRegister()) {
+ ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
+ if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
+ report_fatal_error("failed to allocate implicit input argument");
+ } else {
+ LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
+ return false;
+ }
+ }
+
+ // Pack the workitem IDs into a single register, or pass them through as-is
+ // if they are already packed.
+ const ArgDescriptor *OutgoingArg;
+ const TargetRegisterClass *ArgRC;
+ LLT ArgTy;
+
+ std::tie(OutgoingArg, ArgRC, ArgTy) =
+ CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
+ if (!OutgoingArg)
+ std::tie(OutgoingArg, ArgRC, ArgTy) =
+ CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
+ if (!OutgoingArg)
+ std::tie(OutgoingArg, ArgRC, ArgTy) =
+ CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
+ if (!OutgoingArg)
+ return false;
+
+ auto WorkitemIDX =
+ CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
+ auto WorkitemIDY =
+ CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
+ auto WorkitemIDZ =
+ CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
+
+ const ArgDescriptor *IncomingArgX = std::get<0>(WorkitemIDX);
+ const ArgDescriptor *IncomingArgY = std::get<0>(WorkitemIDY);
+ const ArgDescriptor *IncomingArgZ = std::get<0>(WorkitemIDZ);
+ const LLT S32 = LLT::scalar(32);
+
+ // If the incoming IDs are not packed, we need to pack them.
+ // FIXME: Should consider known workgroup size to eliminate known 0 cases.
+ Register InputReg;
+ if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX) {
+ InputReg = MRI.createGenericVirtualRegister(S32);
+ LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX,
+ std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX));
+ }
+
+ if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
+ Register Y = MRI.createGenericVirtualRegister(S32);
+ LI->loadInputValue(Y, MIRBuilder, IncomingArgY, std::get<1>(WorkitemIDY),
+ std::get<2>(WorkitemIDY));
+
+ Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10)).getReg(0);
+ InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y;
+ }
+
+ if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
+ Register Z = MRI.createGenericVirtualRegister(S32);
+ LI->loadInputValue(Z, MIRBuilder, IncomingArgZ, std::get<1>(WorkitemIDZ),
+ std::get<2>(WorkitemIDZ));
+
+ Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20)).getReg(0);
+ InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
+ }
+
+ if (!InputReg) {
+ InputReg = MRI.createGenericVirtualRegister(S32);
+
+ // The workitem IDs are already packed; any of the present incoming arguments
+ // will carry all of the required fields.
+ ArgDescriptor IncomingArg = ArgDescriptor::createArg(
+ IncomingArgX ? *IncomingArgX :
+ IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
+ LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg,
+ &AMDGPU::VGPR_32RegClass, S32);
+ }
+
+ if (OutgoingArg->isRegister()) {
+ ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
+ if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
+ report_fatal_error("failed to allocate implicit input argument");
+ } else {
+ LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
+ return false;
+ }
+
+ return true;
+}
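The three workitem IDs share one 32-bit VGPR: X sits in the low 10 bits, while Y and Z are shifted up by 10 and 20 bits and ORed in. A minimal sketch of just the packing arithmetic (the real code emits G_SHL/G_OR and skips components the callee does not use):

#include <cstdint>
#include <cstdio>

static uint32_t packWorkitemIDs(uint32_t X, uint32_t Y, uint32_t Z) {
  // X -> bits [9:0], Y -> bits [19:10], Z -> bits [29:20].
  return X | (Y << 10) | (Z << 20);
}

int main() {
  uint32_t Packed = packWorkitemIDs(5, 3, 1);
  std::printf("packed = 0x%08x (x=%u y=%u z=%u)\n", Packed, Packed & 0x3ff,
              (Packed >> 10) & 0x3ff, (Packed >> 20) & 0x3ff);
}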
+
+/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for
+/// CC.
+static std::pair<CCAssignFn *, CCAssignFn *>
+getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
+ return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
+}
+
+static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
+ bool IsTailCall) {
+ return AMDGPU::SI_CALL;
+}
+
+// Add operands to call instruction to track the callee.
+static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
+ MachineIRBuilder &MIRBuilder,
+ AMDGPUCallLowering::CallLoweringInfo &Info) {
+ if (Info.Callee.isReg()) {
+ CallInst.addReg(Info.Callee.getReg());
+ CallInst.addImm(0);
+ } else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
+ // The call lowering lightly assumed we can directly encode a call target in
+ // the instruction, which is not the case. Materialize the address here.
+ const GlobalValue *GV = Info.Callee.getGlobal();
+ auto Ptr = MIRBuilder.buildGlobalValue(
+ LLT::pointer(GV->getAddressSpace(), 64), GV);
+ CallInst.addReg(Ptr.getReg(0));
+ CallInst.add(Info.Callee);
+ } else
+ return false;
+
+ return true;
+}
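A register callee is used directly, while a direct call to a global first materializes the address because the call instruction cannot encode it. A minimal sketch of that decision, with a hypothetical descriptor standing in for the MachineOperand:

#include <cstdio>

enum class CalleeKind { Register, Global, Other };

// Mirrors the shape of addCallTargetOperands: a register callee is used as-is,
// a global with offset 0 has its address materialized first, anything else
// is rejected.
static bool selectCallTargetForm(CalleeKind Kind, long Offset, const char *&Form) {
  if (Kind == CalleeKind::Register) {
    Form = "callee register + imm 0";
    return true;
  }
  if (Kind == CalleeKind::Global && Offset == 0) {
    Form = "materialized G_GLOBAL_VALUE pointer + original global operand";
    return true;
  }
  return false;
}

int main() {
  const char *Form = nullptr;
  if (selectCallTargetForm(CalleeKind::Global, 0, Form))
    std::printf("%s\n", Form);
}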
+
+bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info) const {
+ if (Info.IsVarArg) {
+ LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
+ return false;
+ }
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ const Function &F = MF.getFunction();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ CallingConv::ID CallConv = F.getCallingConv();
+
+ if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
+ CallConv != CallingConv::AMDGPU_Gfx) {
+ LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
+ return false;
+ }
+
+ if (AMDGPU::isShader(CallConv)) {
+ LLVM_DEBUG(dbgs() << "Unhandled call from graphics shader\n");
+ return false;
+ }
+
+ SmallVector<ArgInfo, 8> OutArgs;
+
+ SmallVector<ArgInfo, 8> SplitArg;
+ for (auto &OrigArg : Info.OrigArgs) {
+ splitToValueTypes(MIRBuilder, OrigArg, SplitArg, DL, Info.CallConv);
+
+ processSplitArgs(
+ MIRBuilder, OrigArg, SplitArg, OutArgs, DL, Info.CallConv, true,
+ // FIXME: We should probably be passing multiple registers to
+ // handleAssignments to do this
+ [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
+ int VTSplitIdx) {
+ unpackRegsToOrigType(MIRBuilder, Regs, SrcReg, OrigArg, LLTy, PartLLT);
+ });
+
+ SplitArg.clear();
+ }
+
+ // If we can lower as a tail call, do that instead.
+ bool CanTailCallOpt = false;
+
+ // We must emit a tail call if we have musttail.
+ if (Info.IsMustTailCall && !CanTailCallOpt) {
+ LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
+ return false;
+ }
+
+ // Find out which ABI gets to decide where things go.
+ CCAssignFn *AssignFnFixed;
+ CCAssignFn *AssignFnVarArg;
+ std::tie(AssignFnFixed, AssignFnVarArg) =
+ getAssignFnsForCC(Info.CallConv, TLI);
+
+ MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
+ .addImm(0)
+ .addImm(0);
+
+ // Create a temporarily-floating call instruction so we can add the implicit
+ // uses of arg registers.
+ unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);
+
+ auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
+ MIB.addDef(TRI->getReturnAddressReg(MF));
+
+ if (!addCallTargetOperands(MIB, MIRBuilder, Info))
+ return false;
+
+ // Tell the call which registers are clobbered.
+ const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
+ MIB.addRegMask(Mask);
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
+
+ // We could pass MIB and directly add the implicit uses to the call
+ // now. However, as an aesthetic choice, place implicit argument operands
+ // after the ordinary user argument registers.
+ SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
+
+ if (AMDGPUTargetMachine::EnableFixedFunctionABI) {
+ // With a fixed ABI, allocate fixed registers before user arguments.
+ if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
+ return false;
+ }
+
+ // Do the actual argument marshalling.
+ SmallVector<Register, 8> PhysRegs;
+ AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
+ AssignFnVarArg, false);
+ if (!handleAssignments(CCInfo, ArgLocs, MIRBuilder, OutArgs, Handler))
+ return false;
+
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ if (!ST.enableFlatScratch()) {
+ // Insert copies for the SRD. In the HSA case, this should be an identity
+ // copy.
+ auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
+ MFI->getScratchRSrcReg());
+ MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
+ MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
+ }
+
+ for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
+ MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
+ MIB.addReg(ArgReg.first, RegState::Implicit);
+ }
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+ // If the callee is a register, it is used by a target-specific instruction
+ // and must therefore have a register class matching that instruction's
+ // operand constraint.
+
+ // FIXME: We should define regbankselectable call instructions to handle
+ // divergent call targets.
+ if (MIB->getOperand(1).isReg()) {
+ MIB->getOperand(1).setReg(constrainOperandRegClass(
+ MF, *TRI, MRI, *ST.getInstrInfo(),
+ *ST.getRegBankInfo(), *MIB, MIB->getDesc(), MIB->getOperand(1),
+ 1));
+ }
+
+ auto OrigInsertPt = MIRBuilder.getInsertPt();
+
+ // Now we can add the actual call instruction to the correct position.
+ MIRBuilder.insertInstr(MIB);
+
+ // Insert this now to give us an anchor point for managing the insert point.
+ MachineInstrBuilder CallSeqEnd =
+ MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN);
+
+ SmallVector<ArgInfo, 8> InArgs;
+ if (!Info.CanLowerReturn) {
+ insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs,
+ Info.DemoteRegister, Info.DemoteStackIndex);
+ } else if (!Info.OrigRet.Ty->isVoidTy()) {
+ SmallVector<ArgInfo, 8> PreSplitRetInfos;
+
+ splitToValueTypes(
+ MIRBuilder, Info.OrigRet, PreSplitRetInfos/*InArgs*/, DL, Info.CallConv);
+
+ processSplitArgs(MIRBuilder, Info.OrigRet,
+ PreSplitRetInfos, InArgs/*SplitRetInfos*/, DL, Info.CallConv, false,
+ [&](ArrayRef<Register> Regs, Register DstReg,
+ LLT LLTy, LLT PartLLT, int VTSplitIdx) {
+ assert(DstReg == Info.OrigRet.Regs[VTSplitIdx]);
+ packSplitRegsToOrigType(MIRBuilder, Info.OrigRet.Regs[VTSplitIdx],
+ Regs, LLTy, PartLLT);
+ });
+ }
+
+ // Make sure the raw argument copies are inserted before the marshalling to
+ // the original types.
+ MIRBuilder.setInsertPt(MIRBuilder.getMBB(), CallSeqEnd);
+
+ // Finally, we can copy the returned value back into its virtual register. In
+ // symmetry with the arguments, the physical register must be an
+ // implicit-define of the call instruction.
+ if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) {
+ CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv,
+ Info.IsVarArg);
+ CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
+ if (!handleAssignments(MIRBuilder, InArgs, Handler))
+ return false;
+ }
+
+ uint64_t CalleePopBytes = NumBytes;
+ CallSeqEnd.addImm(0)
+ .addImm(CalleePopBytes);
+
+ // Restore the insert point to after the call sequence.
+ MIRBuilder.setInsertPt(MIRBuilder.getMBB(), OrigInsertPt);
+ return true;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index 446619d1502e..1312388e4a38 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -14,7 +14,6 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H
-#include "AMDGPU.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
namespace llvm {
@@ -22,9 +21,9 @@ namespace llvm {
class AMDGPUTargetLowering;
class MachineInstrBuilder;
-class AMDGPUCallLowering: public CallLowering {
- Register lowerParameterPtr(MachineIRBuilder &B, Type *ParamTy,
- uint64_t Offset) const;
+class AMDGPUCallLowering final : public CallLowering {
+ void lowerParameterPtr(Register DstReg, MachineIRBuilder &B, Type *ParamTy,
+ uint64_t Offset) const;
void lowerParameter(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset,
Align Alignment, Register DstReg) const;
@@ -32,13 +31,20 @@ class AMDGPUCallLowering: public CallLowering {
/// A function of this type is used to perform value split action.
using SplitArgTy = std::function<void(ArrayRef<Register>, Register, LLT, LLT, int)>;
- void splitToValueTypes(MachineIRBuilder &B,
- const ArgInfo &OrigArgInfo,
- unsigned OrigArgIdx,
+ void splitToValueTypes(MachineIRBuilder &B, const ArgInfo &OrigArgInfo,
SmallVectorImpl<ArgInfo> &SplitArgs,
- const DataLayout &DL,
- CallingConv::ID CallConv,
- SplitArgTy SplitArg) const;
+ const DataLayout &DL, CallingConv::ID CallConv) const;
+
+ void processSplitArgs(MachineIRBuilder &B, const ArgInfo &OrigArgInfo,
+ const SmallVectorImpl<ArgInfo> &SplitArg,
+ SmallVectorImpl<ArgInfo> &SplitArgs,
+ const DataLayout &DL, CallingConv::ID CallConv,
+ bool IsOutgoing,
+ SplitArgTy PerformArgSplit) const;
+
+ bool canLowerReturn(MachineFunction &MF, CallingConv::ID CallConv,
+ SmallVectorImpl<BaseArgInfo> &Outs,
+ bool IsVarArg) const override;
bool lowerReturnVal(MachineIRBuilder &B, const Value *Val,
ArrayRef<Register> VRegs, MachineInstrBuilder &Ret) const;
@@ -47,13 +53,24 @@ public:
AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
bool lowerReturn(MachineIRBuilder &B, const Value *Val,
- ArrayRef<Register> VRegs) const override;
+ ArrayRef<Register> VRegs,
+ FunctionLoweringInfo &FLI) const override;
bool lowerFormalArgumentsKernel(MachineIRBuilder &B, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const;
bool lowerFormalArguments(MachineIRBuilder &B, const Function &F,
- ArrayRef<ArrayRef<Register>> VRegs) const override;
+ ArrayRef<ArrayRef<Register>> VRegs,
+ FunctionLoweringInfo &FLI) const override;
+
+ bool passSpecialInputs(MachineIRBuilder &MIRBuilder,
+ CCState &CCInfo,
+ SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
+ CallLoweringInfo &Info) const;
+
+ bool lowerCall(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info) const override;
+
static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
};
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 7c83b6dcb44b..250c42776297 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -16,7 +16,75 @@ class CCIfExtend<CCAction A>
: CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;
// Calling convention for SI
-def CC_SI : CallingConv<[
+def CC_SI_Gfx : CallingConv<[
+ // 0-3 are reserved for the stack buffer descriptor
+ // 30-31 are reserved for the return address
+ // 32 is reserved for the stack pointer
+ CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+ SGPR4, SGPR5, SGPR6, SGPR7,
+ SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
+ SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
+ SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29,
+ ]>>>,
+
+ CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+ VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+ VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
+ ]>>>,
+
+ CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
+ CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
+ CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>,
+ CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
+ CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>,
+ CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
+ CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
+]>;
+
+def RetCC_SI_Gfx : CallingConv<[
+ // 0-3 are reserved for the stack buffer descriptor
+ // 32 is reserved for the stack pointer
+ CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+ SGPR4, SGPR5, SGPR6, SGPR7,
+ SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
+ SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
+ SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
+ SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39,
+ SGPR40, SGPR41, SGPR42, SGPR43
+ ]>>>,
+
+ CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+ VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+ VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31,
+ VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39,
+ VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47,
+ VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55,
+ VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63,
+ VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71,
+ VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79,
+ VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87,
+ VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95,
+ VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103,
+ VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111,
+ VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119,
+ VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127,
+ VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135
+ ]>>>,
+
+ CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
+ CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
+ CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>,
+ CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
+ CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>,
+ CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
+ CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
+]>;
+
+def CC_SI_SHADER : CallingConv<[
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
@@ -125,11 +193,13 @@ def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
(add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs_32_105)
>;
+def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>;
+
// Calling convention for leaf functions
def CC_AMDGPU_Func : CallingConv<[
CCIfByVal<CCPassByVal<4, 4>>,
CCIfType<[i1], CCPromoteToType<i32>>,
- CCIfType<[i1, i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
+ CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
@@ -159,7 +229,7 @@ def CC_AMDGPU : CallingConv<[
CCIf<"static_cast<const GCNSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() >= "
"AMDGPUSubtarget::SOUTHERN_ISLANDS",
- CCDelegateTo<CC_SI>>,
+ CCDelegateTo<CC_SI_SHADER>>,
CCIf<"static_cast<const GCNSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() >= "
"AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C",
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index a79549301740..2556996df97f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -13,40 +13,19 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
-#include "llvm/ADT/FloatingPointMode.h"
-#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
-#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
-#include <cassert>
-#include <iterator>
#define DEBUG_TYPE "amdgpu-codegenprepare"
@@ -60,6 +39,12 @@ static cl::opt<bool> WidenLoads(
cl::ReallyHidden,
cl::init(false));
+static cl::opt<bool> Widen16BitOps(
+ "amdgpu-codegenprepare-widen-16-bit-ops",
+ cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
+ cl::ReallyHidden,
+ cl::init(true));
+
static cl::opt<bool> UseMul24Intrin(
"amdgpu-codegenprepare-mul24",
cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
@@ -269,6 +254,9 @@ bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
}
bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
+ if (!Widen16BitOps)
+ return false;
+
const IntegerType *IntTy = dyn_cast<IntegerType>(T);
if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
return true;
@@ -751,6 +739,11 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
Type *Ty = FDiv.getType()->getScalarType();
+ // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
+ // expansion around them in codegen.
+ if (Ty->isDoubleTy())
+ return false;
+
// No intrinsic for fdiv16 if target does not support f16.
if (Ty->isHalfTy() && !ST->has16BitInsts())
return false;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index faaf9168d0dd..a8399176bb4a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -11,22 +11,22 @@ include "llvm/Target/GlobalISel/Combine.td"
// TODO: This really belongs after legalization after scalarization.
// TODO: GICombineRules should accept subtarget predicates
-def fmin_fmax_legacy_matchdata : GIDefMatchData<"FMinFMaxLegacyInfo">;
+def fmin_fmax_legacy_matchdata : GIDefMatchData<"AMDGPUPostLegalizerCombinerHelper::FMinFMaxLegacyInfo">;
def fcmp_select_to_fmin_fmax_legacy : GICombineRule<
(defs root:$select, fmin_fmax_legacy_matchdata:$matchinfo),
(match (wip_match_opcode G_SELECT):$select,
- [{ return matchFMinFMaxLegacy(*${select}, MRI, *MF, ${matchinfo}); }]),
- (apply [{ applySelectFCmpToFMinToFMaxLegacy(*${select}, ${matchinfo}); }])>;
+ [{ return PostLegalizerHelper.matchFMinFMaxLegacy(*${select}, ${matchinfo}); }]),
+ (apply [{ PostLegalizerHelper.applySelectFCmpToFMinToFMaxLegacy(*${select}, ${matchinfo}); }])>;
def uchar_to_float : GICombineRule<
(defs root:$itofp),
(match (wip_match_opcode G_UITOFP, G_SITOFP):$itofp,
- [{ return matchUCharToFloat(*${itofp}, MRI, *MF, Helper); }]),
- (apply [{ applyUCharToFloat(*${itofp}); }])>;
+ [{ return PostLegalizerHelper.matchUCharToFloat(*${itofp}); }]),
+ (apply [{ PostLegalizerHelper.applyUCharToFloat(*${itofp}); }])>;
-def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">;
+def cvt_f32_ubyteN_matchdata : GIDefMatchData<"AMDGPUPostLegalizerCombinerHelper::CvtF32UByteMatchInfo">;
def cvt_f32_ubyteN : GICombineRule<
(defs root:$cvt_f32_ubyteN, cvt_f32_ubyteN_matchdata:$matchinfo),
@@ -34,33 +34,25 @@ def cvt_f32_ubyteN : GICombineRule<
G_AMDGPU_CVT_F32_UBYTE1,
G_AMDGPU_CVT_F32_UBYTE2,
G_AMDGPU_CVT_F32_UBYTE3):$cvt_f32_ubyteN,
- [{ return matchCvtF32UByteN(*${cvt_f32_ubyteN}, MRI, *MF, ${matchinfo}); }]),
- (apply [{ applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
+ [{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]),
+ (apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
// Combines which should only apply on SI/VI
def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
- "AMDGPUGenPreLegalizerCombinerHelper", [all_combines,
- elide_br_by_inverting_cond]> {
+ "AMDGPUGenPreLegalizerCombinerHelper", [all_combines]> {
let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
}
-
-// FIXME: combines_for_extload can introduce illegal extloads which
-// aren't re-legalized.
-// FIXME: Is there a way to remove a single item from all_combines?
-def all_combines_minus_extload : GICombineGroup<[trivial_combines,
- ptr_add_immed_chain, combine_indexed_load_store, undef_combines,
- identity_combines]
->;
-
def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
"AMDGPUGenPostLegalizerCombinerHelper",
- [all_combines_minus_extload, gfx6gfx7_combines,
+ [all_combines, gfx6gfx7_combines,
uchar_to_float, cvt_f32_ubyteN]> {
let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
+ let StateClass = "AMDGPUPostLegalizerCombinerHelperState";
+ let AdditionalArguments = [];
}
def AMDGPURegBankCombinerHelper : GICombinerHelper<
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp
index 25c82ed61fc2..bed0707f3aa7 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp
@@ -12,9 +12,9 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUExportClustering.h"
-#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
using namespace llvm;
@@ -27,15 +27,13 @@ public:
};
static bool isExport(const SUnit &SU) {
- const MachineInstr *MI = SU.getInstr();
- return MI->getOpcode() == AMDGPU::EXP ||
- MI->getOpcode() == AMDGPU::EXP_DONE;
+ return SIInstrInfo::isEXP(*SU.getInstr());
}
static bool isPositionExport(const SIInstrInfo *TII, SUnit *SU) {
const MachineInstr *MI = SU->getInstr();
- int Imm = TII->getNamedOperand(*MI, AMDGPU::OpName::tgt)->getImm();
- return Imm >= 12 && Imm <= 15;
+ unsigned Imm = TII->getNamedOperand(*MI, AMDGPU::OpName::tgt)->getImm();
+ return Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST;
}
static void sortChain(const SIInstrInfo *TII, SmallVector<SUnit *, 8> &Chain,
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h
index 58491d0671e4..041d6deef243 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h
@@ -6,7 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include <memory>
namespace llvm {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 3f12addbcc79..bba03736d01a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -51,6 +51,11 @@ def gi_vop3opselmods :
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
GIComplexPatternEquiv<VOP3OpSelMods>;
+// FIXME: Why do we have both VOP3OpSel and VOP3OpSelMods?
+def gi_vop3opsel :
+ GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
+ GIComplexPatternEquiv<VOP3OpSel>;
+
def gi_smrd_imm :
GIComplexOperandMatcher<s64, "selectSmrdImm">,
GIComplexPatternEquiv<SMRDImm>;
@@ -63,19 +68,15 @@ def gi_smrd_sgpr :
GIComplexOperandMatcher<s64, "selectSmrdSgpr">,
GIComplexPatternEquiv<SMRDSgpr>;
-// FIXME: Why are the atomic versions separated?
def gi_flat_offset :
GIComplexOperandMatcher<s64, "selectFlatOffset">,
GIComplexPatternEquiv<FLATOffset>;
def gi_flat_offset_signed :
GIComplexOperandMatcher<s64, "selectFlatOffsetSigned">,
GIComplexPatternEquiv<FLATOffsetSigned>;
-def gi_flat_atomic :
- GIComplexOperandMatcher<s64, "selectFlatOffset">,
- GIComplexPatternEquiv<FLATAtomic>;
-def gi_flat_signed_atomic :
- GIComplexOperandMatcher<s64, "selectFlatOffsetSigned">,
- GIComplexPatternEquiv<FLATSignedAtomic>;
+def gi_global_saddr :
+ GIComplexOperandMatcher<s64, "selectGlobalSAddr">,
+ GIComplexPatternEquiv<GlobalSAddr>;
def gi_mubuf_scratch_offset :
GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">,
@@ -84,6 +85,14 @@ def gi_mubuf_scratch_offen :
GIComplexOperandMatcher<s32, "selectMUBUFScratchOffen">,
GIComplexPatternEquiv<MUBUFScratchOffen>;
+def gi_flat_scratch_offset :
+ GIComplexOperandMatcher<s32, "selectFlatOffsetSigned">,
+ GIComplexPatternEquiv<ScratchOffset>;
+
+def gi_flat_scratch_saddr :
+ GIComplexOperandMatcher<s32, "selectScratchSAddr">,
+ GIComplexPatternEquiv<ScratchSAddr>;
+
def gi_ds_1addr_1offset :
GIComplexOperandMatcher<s32, "selectDS1Addr1Offset">,
GIComplexPatternEquiv<DS1Addr1Offset>;
@@ -92,6 +101,10 @@ def gi_ds_64bit_4byte_aligned :
GIComplexOperandMatcher<s64, "selectDS64Bit4ByteAligned">,
GIComplexPatternEquiv<DS64Bit4ByteAligned>;
+def gi_ds_128bit_8byte_aligned :
+ GIComplexOperandMatcher<s64, "selectDS128Bit8ByteAligned">,
+ GIComplexPatternEquiv<DS128Bit8ByteAligned>;
+
def gi_mubuf_addr64 :
GIComplexOperandMatcher<s64, "selectMUBUFAddr64">,
GIComplexPatternEquiv<MUBUFAddr64>;
@@ -133,6 +146,9 @@ def : GINodeEquiv<G_LOAD, AMDGPUatomic_ld_glue> {
bit CheckMMOIsAtomic = 1;
}
+def : GINodeEquiv<G_STORE, AMDGPUatomic_st_glue> {
+ bit CheckMMOIsAtomic = 1;
+}
def : GINodeEquiv<G_ATOMIC_CMPXCHG, atomic_cmp_swap_glue>;
@@ -181,6 +197,11 @@ def : GINodeEquiv<G_AMDGPU_ATOMIC_INC, SIatomic_inc>;
def : GINodeEquiv<G_AMDGPU_ATOMIC_DEC, SIatomic_dec>;
def : GINodeEquiv<G_AMDGPU_ATOMIC_INC, atomic_inc_glue>;
def : GINodeEquiv<G_AMDGPU_ATOMIC_DEC, atomic_dec_glue>;
+def : GINodeEquiv<G_AMDGPU_ATOMIC_FMIN, SIatomic_fmin>;
+def : GINodeEquiv<G_AMDGPU_ATOMIC_FMAX, SIatomic_fmax>;
+def : GINodeEquiv<G_AMDGPU_ATOMIC_FMIN, atomic_load_fmin_glue>;
+def : GINodeEquiv<G_AMDGPU_ATOMIC_FMAX, atomic_load_fmax_glue>;
+
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SWAP, SIbuffer_atomic_swap>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_ADD, SIbuffer_atomic_add>;
@@ -194,6 +215,7 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_OR, SIbuffer_atomic_or>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD, SIbuffer_atomic_fadd>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>;
@@ -312,3 +334,6 @@ def gi_extract_dlc : GICustomOperandRenderer<"renderExtractDLC">,
def gi_extract_swz : GICustomOperandRenderer<"renderExtractSWZ">,
GISDNodeXFormEquiv<extract_swz>;
+
+def gi_frameindex_to_targetframeindex : GICustomOperandRenderer<"renderFrameIndex">,
+ GISDNodeXFormEquiv<frameindex_to_targetframeindex>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
index 600b351f9ea1..bfeee37feb4b 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
@@ -37,8 +37,9 @@ enum PartialMappingIdx {
PM_AGPR32 = 31,
PM_AGPR64 = 32,
PM_AGPR128 = 33,
- PM_AGPR512 = 34,
- PM_AGPR1024 = 35
+ PM_AGPR256 = 34,
+ PM_AGPR512 = 35,
+ PM_AGPR1024 = 36
};
const RegisterBankInfo::PartialMapping PartMappings[] {
@@ -69,6 +70,7 @@ const RegisterBankInfo::PartialMapping PartMappings[] {
{0, 32, AGPRRegBank}, // AGPR begin
{0, 64, AGPRRegBank},
{0, 128, AGPRRegBank},
+ {0, 256, AGPRRegBank},
{0, 512, AGPRRegBank},
{0, 1024, AGPRRegBank}
};
@@ -115,9 +117,9 @@ const RegisterBankInfo::ValueMapping ValMappings[] {
{&PartMappings[20], 1}, // 32
{&PartMappings[21], 1}, // 64
{&PartMappings[22], 1}, // 128
- {nullptr, 0},
- {&PartMappings[23], 1}, // 512
- {&PartMappings[24], 1} // 1024
+ {&PartMappings[23], 1}, // 256
+ {&PartMappings[24], 1}, // 512
+ {&PartMappings[25], 1} // 1024
};
const RegisterBankInfo::PartialMapping SGPROnly64BreakDown[] {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
index 989937a597fb..b3bafc5b2720 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -13,11 +13,11 @@
using namespace llvm;
using namespace MIPatternMatch;
-std::tuple<Register, unsigned, MachineInstr *>
+std::pair<Register, unsigned>
AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
if (!Def)
- return std::make_tuple(Reg, 0, nullptr);
+ return std::make_pair(Reg, 0);
if (Def->getOpcode() == TargetOpcode::G_CONSTANT) {
unsigned Offset;
@@ -27,21 +27,21 @@ AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
else
Offset = Op.getCImm()->getZExtValue();
- return std::make_tuple(Register(), Offset, Def);
+ return std::make_pair(Register(), Offset);
}
int64_t Offset;
if (Def->getOpcode() == TargetOpcode::G_ADD) {
// TODO: Handle G_OR used for add case
if (mi_match(Def->getOperand(2).getReg(), MRI, m_ICst(Offset)))
- return std::make_tuple(Def->getOperand(1).getReg(), Offset, Def);
+ return std::make_pair(Def->getOperand(1).getReg(), Offset);
// FIXME: matcher should ignore copies
if (mi_match(Def->getOperand(2).getReg(), MRI, m_Copy(m_ICst(Offset))))
- return std::make_tuple(Def->getOperand(1).getReg(), Offset, Def);
+ return std::make_pair(Def->getOperand(1).getReg(), Offset);
}
- return std::make_tuple(Reg, 0, Def);
+ return std::make_pair(Reg, 0);
}
bool AMDGPU::isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
index 766750758efc..404e0fcd1166 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -9,53 +9,21 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
-#include "AMDGPUInstrInfo.h"
#include "llvm/CodeGen/Register.h"
-#include <tuple>
+#include <utility>
namespace llvm {
-class MachineInstr;
class MachineRegisterInfo;
namespace AMDGPU {
-/// Returns Base register, constant offset, and offset def point.
-std::tuple<Register, unsigned, MachineInstr *>
+/// Returns base register and constant offset.
+std::pair<Register, unsigned>
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg);
bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask);
-/// Return number of address arguments, and the number of gradients for an image
-/// intrinsic.
-inline std::pair<int, int>
-getImageNumVAddr(const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
- const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode) {
- const AMDGPU::MIMGDimInfo *DimInfo
- = AMDGPU::getMIMGDimInfo(ImageDimIntr->Dim);
-
- int NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
- int NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
- int NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
- int NumVAddr = BaseOpcode->NumExtraArgs + NumGradients + NumCoords + NumLCM;
- return {NumVAddr, NumGradients};
-}
-
-/// Return index of dmask in an gMIR image intrinsic
-inline int getDMaskIdx(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
- int NumDefs) {
- assert(!BaseOpcode->Atomic);
- return NumDefs + 1 + (BaseOpcode->Store ? 1 : 0);
-}
-
-/// Return first address operand index in a gMIR image intrinsic.
-inline int getImageVAddrIdxBegin(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
- int NumDefs) {
- if (BaseOpcode->Atomic)
- return NumDefs + 1 + (BaseOpcode->AtomicX2 ? 2 : 1);
- return getDMaskIdx(BaseOpcode, NumDefs) + 1;
-}
-
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index c6f6a3b84e36..39f9092ce77c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -14,15 +14,27 @@
#include "AMDGPUHSAMetadataStreamer.h"
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "SIMachineFunctionInfo.h"
#include "SIProgramInfo.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/IR/Constants.h"
#include "llvm/IR/Module.h"
-#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+static std::pair<Type *, Align> getArgumentTypeAlign(const Argument &Arg,
+ const DataLayout &DL) {
+ Type *Ty = Arg.getType();
+ MaybeAlign ArgAlign;
+ if (Arg.hasByRefAttr()) {
+ Ty = Arg.getParamByRefType();
+ ArgAlign = Arg.getParamAlign();
+ }
+
+ if (!ArgAlign)
+ ArgAlign = DL.getABITypeAlign(Ty);
+
+ return std::make_pair(Ty, *ArgAlign);
+}
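For byref kernel arguments the metadata describes the pointee type, and an explicit align attribute overrides the ABI alignment of that type. A minimal sketch of that selection, assuming precomputed alignments instead of DataLayout lookups:

#include <cstdio>
#include <optional>
#include <utility>

struct ArgModel {
  const char *ValueTy;                // Arg.getType()
  const char *ByRefPointeeTy;         // Arg.getParamByRefType(), null if not byref
  std::optional<unsigned> ParamAlign; // explicit align attribute, if any
  unsigned ABIAlignOfTy;              // ABI alignment of the chosen type
};

static std::pair<const char *, unsigned> argumentTypeAlign(const ArgModel &Arg) {
  const char *Ty = Arg.ValueTy;
  std::optional<unsigned> Align;
  if (Arg.ByRefPointeeTy) {
    // byref: describe the pointee, honoring an explicit align attribute.
    Ty = Arg.ByRefPointeeTy;
    Align = Arg.ParamAlign;
  }
  if (!Align)
    Align = Arg.ABIAlignOfTy;
  return {Ty, *Align};
}

int main() {
  auto [Ty, Align] = argumentTypeAlign({"ptr addrspace(4)", "[8 x i32]", std::nullopt, 4});
  std::printf("%s align %u\n", Ty, Align);
}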
namespace llvm {
@@ -47,7 +59,7 @@ void MetadataStreamerV2::verify(StringRef HSAMetadataString) const {
errs() << "AMDGPU HSA Metadata Parser Test: ";
HSAMD::Metadata FromHSAMetadataString;
- if (fromString(std::string(HSAMetadataString), FromHSAMetadataString)) {
+ if (fromString(HSAMetadataString, FromHSAMetadataString)) {
errs() << "FAIL\n";
return;
}
@@ -311,23 +323,28 @@ void MetadataStreamerV2::emitKernelArg(const Argument &Arg) {
if (Node && ArgNo < Node->getNumOperands())
TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
- Type *Ty = Arg.getType();
const DataLayout &DL = Func->getParent()->getDataLayout();
MaybeAlign PointeeAlign;
- if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
+ if (auto PtrTy = dyn_cast<PointerType>(Arg.getType())) {
if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ // FIXME: Should report this for all address spaces
PointeeAlign = DL.getValueOrABITypeAlignment(Arg.getParamAlign(),
PtrTy->getElementType());
}
}
- emitKernelArg(DL, Ty, getValueKind(Arg.getType(), TypeQual, BaseTypeName),
- PointeeAlign, Name, TypeName, BaseTypeName, AccQual, TypeQual);
+ Type *ArgTy;
+ Align ArgAlign;
+ std::tie(ArgTy, ArgAlign) = getArgumentTypeAlign(Arg, DL);
+
+ emitKernelArg(DL, ArgTy, ArgAlign,
+ getValueKind(ArgTy, TypeQual, BaseTypeName), PointeeAlign, Name,
+ TypeName, BaseTypeName, AccQual, TypeQual);
}
void MetadataStreamerV2::emitKernelArg(const DataLayout &DL, Type *Ty,
- ValueKind ValueKind,
+ Align Alignment, ValueKind ValueKind,
MaybeAlign PointeeAlign, StringRef Name,
StringRef TypeName,
StringRef BaseTypeName,
@@ -338,7 +355,7 @@ void MetadataStreamerV2::emitKernelArg(const DataLayout &DL, Type *Ty,
Arg.mName = std::string(Name);
Arg.mTypeName = std::string(TypeName);
Arg.mSize = DL.getTypeAllocSize(Ty);
- Arg.mAlign = DL.getABITypeAlign(Ty).value();
+ Arg.mAlign = Alignment.value();
Arg.mValueKind = ValueKind;
Arg.mPointeeAlign = PointeeAlign ? PointeeAlign->value() : 0;
@@ -374,11 +391,11 @@ void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func) {
auto Int64Ty = Type::getInt64Ty(Func.getContext());
if (HiddenArgNumBytes >= 8)
- emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetX);
+ emitKernelArg(DL, Int64Ty, Align(8), ValueKind::HiddenGlobalOffsetX);
if (HiddenArgNumBytes >= 16)
- emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY);
+ emitKernelArg(DL, Int64Ty, Align(8), ValueKind::HiddenGlobalOffsetY);
if (HiddenArgNumBytes >= 24)
- emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ);
+ emitKernelArg(DL, Int64Ty, Align(8), ValueKind::HiddenGlobalOffsetZ);
auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
AMDGPUAS::GLOBAL_ADDRESS);
@@ -387,31 +404,31 @@ void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func) {
// "none" argument.
if (HiddenArgNumBytes >= 32) {
if (Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
- emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
+ emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenPrintfBuffer);
else if (Func.getParent()->getFunction("__ockl_hostcall_internal")) {
// The printf runtime binding pass should have ensured that hostcall and
// printf are not used in the same module.
assert(!Func.getParent()->getNamedMetadata("llvm.printf.fmts"));
- emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenHostcallBuffer);
+ emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenHostcallBuffer);
} else
- emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
+ emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone);
}
// Emit "default queue" and "completion action" arguments if enqueue kernel is
// used, otherwise emit dummy "none" arguments.
if (HiddenArgNumBytes >= 48) {
if (Func.hasFnAttribute("calls-enqueue-kernel")) {
- emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue);
- emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction);
+ emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenDefaultQueue);
+ emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenCompletionAction);
} else {
- emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
- emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
+ emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone);
+ emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone);
}
}
// Emit the pointer argument for multi-grid object.
if (HiddenArgNumBytes >= 56)
- emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenMultiGridSyncArg);
+ emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenMultiGridSyncArg);
}
bool MetadataStreamerV2::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
@@ -699,10 +716,12 @@ void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset,
if (Node && ArgNo < Node->getNumOperands())
TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
- Type *Ty = Arg.getType();
const DataLayout &DL = Func->getParent()->getDataLayout();
MaybeAlign PointeeAlign;
+ Type *Ty = Arg.hasByRefAttr() ? Arg.getParamByRefType() : Arg.getType();
+
+  // FIXME: Need to distinguish in-memory alignment from pointer alignment.
if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
PointeeAlign = DL.getValueOrABITypeAlignment(Arg.getParamAlign(),
@@ -710,19 +729,21 @@ void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset,
}
}
- emitKernelArg(Func->getParent()->getDataLayout(), Arg.getType(),
- getValueKind(Arg.getType(), TypeQual, BaseTypeName), Offset,
- Args, PointeeAlign, Name, TypeName, BaseTypeName, AccQual,
- TypeQual);
+ // There's no distinction between byval aggregates and raw aggregates.
+ Type *ArgTy;
+ Align ArgAlign;
+ std::tie(ArgTy, ArgAlign) = getArgumentTypeAlign(Arg, DL);
+
+ emitKernelArg(DL, ArgTy, ArgAlign,
+ getValueKind(ArgTy, TypeQual, BaseTypeName), Offset, Args,
+ PointeeAlign, Name, TypeName, BaseTypeName, AccQual, TypeQual);
}
-void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty,
- StringRef ValueKind, unsigned &Offset,
- msgpack::ArrayDocNode Args,
- MaybeAlign PointeeAlign, StringRef Name,
- StringRef TypeName,
- StringRef BaseTypeName,
- StringRef AccQual, StringRef TypeQual) {
+void MetadataStreamerV3::emitKernelArg(
+ const DataLayout &DL, Type *Ty, Align Alignment, StringRef ValueKind,
+ unsigned &Offset, msgpack::ArrayDocNode Args, MaybeAlign PointeeAlign,
+ StringRef Name, StringRef TypeName, StringRef BaseTypeName,
+ StringRef AccQual, StringRef TypeQual) {
auto Arg = Args.getDocument()->getMapNode();
if (!Name.empty())
@@ -730,7 +751,6 @@ void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty,
if (!TypeName.empty())
Arg[".type_name"] = Arg.getDocument()->getNode(TypeName, /*Copy=*/true);
auto Size = DL.getTypeAllocSize(Ty);
- Align Alignment = DL.getABITypeAlign(Ty);
Arg[".size"] = Arg.getDocument()->getNode(Size);
Offset = alignTo(Offset, Alignment);
Arg[".offset"] = Arg.getDocument()->getNode(Offset);
@@ -777,11 +797,14 @@ void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func,
auto Int64Ty = Type::getInt64Ty(Func.getContext());
if (HiddenArgNumBytes >= 8)
- emitKernelArg(DL, Int64Ty, "hidden_global_offset_x", Offset, Args);
+ emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset,
+ Args);
if (HiddenArgNumBytes >= 16)
- emitKernelArg(DL, Int64Ty, "hidden_global_offset_y", Offset, Args);
+ emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset,
+ Args);
if (HiddenArgNumBytes >= 24)
- emitKernelArg(DL, Int64Ty, "hidden_global_offset_z", Offset, Args);
+ emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset,
+ Args);
auto Int8PtrTy =
Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
@@ -790,31 +813,36 @@ void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func,
// "none" argument.
if (HiddenArgNumBytes >= 32) {
if (Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
- emitKernelArg(DL, Int8PtrTy, "hidden_printf_buffer", Offset, Args);
+ emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
+ Args);
else if (Func.getParent()->getFunction("__ockl_hostcall_internal")) {
// The printf runtime binding pass should have ensured that hostcall and
// printf are not used in the same module.
assert(!Func.getParent()->getNamedMetadata("llvm.printf.fmts"));
- emitKernelArg(DL, Int8PtrTy, "hidden_hostcall_buffer", Offset, Args);
+ emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset,
+ Args);
} else
- emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args);
+ emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
}
// Emit "default queue" and "completion action" arguments if enqueue kernel is
// used, otherwise emit dummy "none" arguments.
if (HiddenArgNumBytes >= 48) {
if (Func.hasFnAttribute("calls-enqueue-kernel")) {
- emitKernelArg(DL, Int8PtrTy, "hidden_default_queue", Offset, Args);
- emitKernelArg(DL, Int8PtrTy, "hidden_completion_action", Offset, Args);
+ emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset,
+ Args);
+ emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset,
+ Args);
} else {
- emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args);
- emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args);
+ emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
+ emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
}
}
// Emit the pointer argument for multi-grid object.
if (HiddenArgNumBytes >= 56)
- emitKernelArg(DL, Int8PtrTy, "hidden_multigrid_sync_arg", Offset, Args);
+ emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset,
+ Args);
}
msgpack::MapDocNode
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 9534fffd228d..1c6db14b85cd 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -15,9 +15,6 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H
-#include "AMDGPU.h"
-#include "AMDKernelCodeT.h"
-#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/MsgPackDocument.h"
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/Alignment.h"
@@ -87,11 +84,12 @@ private:
void emitKernelArg(const Argument &Arg, unsigned &Offset,
msgpack::ArrayDocNode Args);
- void emitKernelArg(const DataLayout &DL, Type *Ty, StringRef ValueKind,
- unsigned &Offset, msgpack::ArrayDocNode Args,
- MaybeAlign PointeeAlign = None, StringRef Name = "",
- StringRef TypeName = "", StringRef BaseTypeName = "",
- StringRef AccQual = "", StringRef TypeQual = "");
+ void emitKernelArg(const DataLayout &DL, Type *Ty, Align Alignment,
+ StringRef ValueKind, unsigned &Offset,
+ msgpack::ArrayDocNode Args, MaybeAlign PointeeAlign = None,
+ StringRef Name = "", StringRef TypeName = "",
+ StringRef BaseTypeName = "", StringRef AccQual = "",
+ StringRef TypeQual = "");
void emitHiddenKernelArgs(const Function &Func, unsigned &Offset,
msgpack::ArrayDocNode Args);
@@ -156,10 +154,11 @@ private:
void emitKernelArg(const Argument &Arg);
- void emitKernelArg(const DataLayout &DL, Type *Ty, ValueKind ValueKind,
- MaybeAlign PointeeAlign = None, StringRef Name = "",
- StringRef TypeName = "", StringRef BaseTypeName = "",
- StringRef AccQual = "", StringRef TypeQual = "");
+ void emitKernelArg(const DataLayout &DL, Type *Ty, Align Alignment,
+ ValueKind ValueKind, MaybeAlign PointeeAlign = None,
+ StringRef Name = "", StringRef TypeName = "",
+ StringRef BaseTypeName = "", StringRef AccQual = "",
+ StringRef TypeQual = "");
void emitHiddenKernelArgs(const Function &Func);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index aaf448346b53..340f4ac6f57a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -12,48 +12,21 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUArgumentUsageInfo.h"
-#include "AMDGPUISelLowering.h" // For AMDGPUISD
-#include "AMDGPUInstrInfo.h"
-#include "AMDGPUPerfHintAnalysis.h"
-#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIDefines.h"
-#include "SIISelLowering.h"
-#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
-#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
-#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
+
#ifdef EXPENSIVE_CHECKS
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#endif
-#include "llvm/IR/Instruction.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CodeGen.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MachineValueType.h"
-#include "llvm/Support/MathExtras.h"
-#include <cassert>
-#include <cstdint>
-#include <new>
-#include <vector>
#define DEBUG_TYPE "isel"
@@ -191,6 +164,9 @@ private:
bool isUniformLoad(const SDNode *N) const;
bool isUniformBr(const SDNode *N) const;
+ bool isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
+ SDValue &RHS) const;
+
MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const;
@@ -200,11 +176,16 @@ private:
const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
- bool isDSOffsetLegal(SDValue Base, unsigned Offset,
- unsigned OffsetBits) const;
+ bool isDSOffsetLegal(SDValue Base, unsigned Offset) const;
+ bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
+ unsigned Size) const;
bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
SDValue &Offset1) const;
+ bool SelectDS128Bit8ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
+ SDValue &Offset1) const;
+ bool SelectDSReadWrite2(SDValue Ptr, SDValue &Base, SDValue &Offset0,
+ SDValue &Offset1, unsigned Size) const;
bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
SDValue &SOffset, SDValue &Offset, SDValue &Offen,
SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
@@ -233,11 +214,11 @@ private:
template <bool IsSigned>
bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
- SDValue &Offset, SDValue &SLC) const;
- bool SelectFlatAtomic(SDNode *N, SDValue Addr, SDValue &VAddr,
- SDValue &Offset, SDValue &SLC) const;
- bool SelectFlatAtomicSigned(SDNode *N, SDValue Addr, SDValue &VAddr,
- SDValue &Offset, SDValue &SLC) const;
+ SDValue &Offset) const;
+ bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
+ SDValue &VOffset, SDValue &Offset) const;
+ bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
+ SDValue &Offset) const;
bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
bool &Imm) const;
@@ -252,11 +233,15 @@ private:
bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods) const;
+ bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods,
+ bool AllowAbs = true) const;
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3BMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
SDValue &Clamp, SDValue &Omod) const;
+ bool SelectVOP3BMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
+ SDValue &Clamp, SDValue &Omod) const;
bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
SDValue &Clamp, SDValue &Omod) const;
@@ -519,8 +504,8 @@ bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
return true;
// TODO: Move into isKnownNeverNaN
- if (N->getFlags().isDefined())
- return N->getFlags().hasNoNaNs();
+ if (N->getFlags().hasNoNaNs())
+ return true;
return CurDAG->isKnownNeverNaN(N);
}
@@ -557,8 +542,8 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
unsigned OpNo) const {
if (!N->isMachineOpcode()) {
if (N->getOpcode() == ISD::CopyToReg) {
- unsigned Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
- if (Register::isVirtualRegister(Reg)) {
+ Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
+ if (Reg.isVirtual()) {
MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
return MRI.getRegClass(Reg);
}
@@ -716,8 +701,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
(Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
Opc == ISD::ATOMIC_LOAD_FADD ||
Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
- Opc == AMDGPUISD::ATOMIC_LOAD_FMAX ||
- Opc == AMDGPUISD::ATOMIC_LOAD_CSUB)) {
+ Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) {
N = glueCopyToM0LDSInit(N);
SelectCode(N);
return;
@@ -920,6 +904,53 @@ bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
Term->getMetadata("structurizecfg.uniform");
}
+static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
+ SDValue &N0, SDValue &N1) {
+ if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
+ Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+    // As we split 64-bit `or` earlier, it's a complicated pattern to match, i.e.
+ // (i64 (bitcast (v2i32 (build_vector
+ // (or (extract_vector_elt V, 0), OFFSET),
+ // (extract_vector_elt V, 1)))))
+ SDValue Lo = Addr.getOperand(0).getOperand(0);
+ if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
+ SDValue BaseLo = Lo.getOperand(0);
+ SDValue BaseHi = Addr.getOperand(0).getOperand(1);
+ // Check that split base (Lo and Hi) are extracted from the same one.
+ if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
+ // Lo is statically extracted from index 0.
+ isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
+ BaseLo.getConstantOperandVal(1) == 0 &&
+          // Hi is statically extracted from index 1.
+ isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
+ BaseHi.getConstantOperandVal(1) == 1) {
+ N0 = BaseLo.getOperand(0).getOperand(0);
+ N1 = Lo.getOperand(1);
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
+ SDValue &RHS) const {
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ LHS = Addr.getOperand(0);
+ RHS = Addr.getOperand(1);
+ return true;
+ }
+
+ if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
+ assert(LHS && RHS && isa<ConstantSDNode>(RHS));
+ return true;
+ }
+
+ return false;
+}
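A small self-contained check of the arithmetic fact this folding relies on (illustrative values only, not part of the diff): an `or` with a constant behaves like an `add` when the constant's bits don't overlap the base's bits.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Base = 0x1000, Imm = 0x24;
  // No overlapping bits, so OR and ADD compute the same address. This is the
  // property that lets the split 64-bit `or` above be treated as base+offset.
  assert((Base & Imm) == 0);
  assert((Base | Imm) == Base + Imm);
  return 0;
}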
+
StringRef AMDGPUDAGToDAGISel::getPassName() const {
return "AMDGPU DAG->DAG Pattern Instruction Selection";
}
@@ -994,7 +1025,7 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
static const unsigned OpcMap[2][2][2] = {
{{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
- {AMDGPU::V_SUB_I32_e32, AMDGPU::V_ADD_I32_e32}},
+ {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
{{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
{AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
@@ -1073,7 +1104,7 @@ void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
}
if (IsVALU) {
- unsigned Opc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+ unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
CurDAG->SelectNodeTo(
N, Opc, N->getVTList(),
@@ -1099,7 +1130,7 @@ void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
Ops[8] = N->getOperand(0);
Ops[9] = N->getOperand(4);
- CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32, N->getVTList(), Ops);
+ CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32_e64, N->getVTList(), Ops);
}
void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
@@ -1124,9 +1155,14 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
assert(VT == MVT::f32 || VT == MVT::f64);
unsigned Opc
- = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
+ = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
- SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) };
+ // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
+ // omod
+ SDValue Ops[8];
+ SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
+ SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
+ SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
@@ -1135,7 +1171,7 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
SDLoc SL(N);
bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
- unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32 : AMDGPU::V_MAD_U64_U32;
+ unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
@@ -1143,13 +1179,11 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
-bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset,
- unsigned OffsetBits) const {
- if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
- (OffsetBits == 8 && !isUInt<8>(Offset)))
+bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
+ if (!isUInt<16>(Offset))
return false;
- if (Subtarget->hasUsableDSOffset() ||
+ if (!Base || Subtarget->hasUsableDSOffset() ||
Subtarget->unsafeDSOffsetFoldingEnabled())
return true;
@@ -1165,7 +1199,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
- if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) {
+ if (isDSOffsetLegal(N0, C1->getSExtValue())) {
// (add n0, c0)
Base = N0;
Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
@@ -1175,7 +1209,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
// sub C, x -> add (sub 0, x), C
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
int64_t ByteOffset = C->getSExtValue();
- if (isUInt<16>(ByteOffset)) {
+ if (isDSOffsetLegal(SDValue(), ByteOffset)) {
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
// XXX - This is kind of hacky. Create a dummy sub node so we can check
@@ -1184,13 +1218,13 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
Zero, Addr.getOperand(1));
- if (isDSOffsetLegal(Sub, ByteOffset, 16)) {
+ if (isDSOffsetLegal(Sub, ByteOffset)) {
SmallVector<SDValue, 3> Opnds;
Opnds.push_back(Zero);
Opnds.push_back(Addr.getOperand(1));
// FIXME: Select to VOP3 version for with-carry.
- unsigned SubOp = AMDGPU::V_SUB_I32_e32;
+ unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
if (Subtarget->hasAddNoCarry()) {
SubOp = AMDGPU::V_SUB_U32_e64;
Opnds.push_back(
@@ -1214,7 +1248,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
SDLoc DL(Addr);
- if (isUInt<16>(CAddr->getZExtValue())) {
+ if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
DL, MVT::i32, Zero);
@@ -1230,75 +1264,104 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
return true;
}
+bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
+ unsigned Offset1,
+ unsigned Size) const {
+ if (Offset0 % Size != 0 || Offset1 % Size != 0)
+ return false;
+ if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
+ return false;
+
+ if (!Base || Subtarget->hasUsableDSOffset() ||
+ Subtarget->unsafeDSOffsetFoldingEnabled())
+ return true;
+
+  // On Southern Islands, instructions with a negative base value and an
+  // offset don't seem to work.
+ return CurDAG->SignBitIsZero(Base);
+}
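A hedged standalone sketch of the new pair-offset rule (the helper below is hypothetical and ignores the base-pointer sign check): both offsets must be multiples of the element size, and the scaled values must fit the 8-bit offset0/offset1 fields of a ds_read2/ds_write2-style instruction.

#include <cstdint>

static bool isOffsetPairEncodable(unsigned Offset0, unsigned Offset1,
                                  unsigned Size) {
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;
  // Each scaled offset must fit in an 8-bit field.
  return (Offset0 / Size) <= 255 && (Offset1 / Size) <= 255;
}

// e.g. isOffsetPairEncodable(1016, 1020, 4) is true (scaled 254 and 255),
// while isOffsetPairEncodable(1024, 1028, 4) is false (256 overflows).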
+
// TODO: If offset is too big, put low 16-bit into offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
SDValue &Offset0,
SDValue &Offset1) const {
+ return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
+}
+
+bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
+ SDValue &Offset0,
+ SDValue &Offset1) const {
+ return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
+}
+
+bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
+ SDValue &Offset0, SDValue &Offset1,
+ unsigned Size) const {
SDLoc DL(Addr);
if (CurDAG->isBaseWithConstantOffset(Addr)) {
SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
- unsigned DWordOffset0 = C1->getZExtValue() / 4;
- unsigned DWordOffset1 = DWordOffset0 + 1;
+ unsigned OffsetValue0 = C1->getZExtValue();
+ unsigned OffsetValue1 = OffsetValue0 + Size;
+
// (add n0, c0)
- if (isDSOffsetLegal(N0, DWordOffset1, 8)) {
+ if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
Base = N0;
- Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
- Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
+ Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
+ Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
return true;
}
} else if (Addr.getOpcode() == ISD::SUB) {
// sub C, x -> add (sub 0, x), C
- if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
- unsigned DWordOffset0 = C->getZExtValue() / 4;
- unsigned DWordOffset1 = DWordOffset0 + 1;
+ if (const ConstantSDNode *C =
+ dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
+ unsigned OffsetValue0 = C->getZExtValue();
+ unsigned OffsetValue1 = OffsetValue0 + Size;
- if (isUInt<8>(DWordOffset0)) {
+ if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
SDLoc DL(Addr);
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
// XXX - This is kind of hacky. Create a dummy sub node so we can check
// the known bits in isDSOffsetLegal. We need to emit the selected node
// here, so this is thrown away.
- SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
- Zero, Addr.getOperand(1));
+ SDValue Sub =
+ CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
- if (isDSOffsetLegal(Sub, DWordOffset1, 8)) {
+ if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
SmallVector<SDValue, 3> Opnds;
Opnds.push_back(Zero);
Opnds.push_back(Addr.getOperand(1));
- unsigned SubOp = AMDGPU::V_SUB_I32_e32;
+ unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
if (Subtarget->hasAddNoCarry()) {
SubOp = AMDGPU::V_SUB_U32_e64;
Opnds.push_back(
CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
}
- MachineSDNode *MachineSub
- = CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
+ MachineSDNode *MachineSub = CurDAG->getMachineNode(
+ SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
Base = SDValue(MachineSub, 0);
- Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
- Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
+ Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
+ Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
return true;
}
}
}
} else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
- unsigned DWordOffset0 = CAddr->getZExtValue() / 4;
- unsigned DWordOffset1 = DWordOffset0 + 1;
- assert(4 * DWordOffset0 == CAddr->getZExtValue());
+ unsigned OffsetValue0 = CAddr->getZExtValue();
+ unsigned OffsetValue1 = OffsetValue0 + Size;
- if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) {
+ if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
- MachineSDNode *MovZero
- = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
- DL, MVT::i32, Zero);
+ MachineSDNode *MovZero =
+ CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
Base = SDValue(MovZero, 0);
- Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
- Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
+ Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
+ Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
return true;
}
}
@@ -1454,22 +1517,16 @@ static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
SDLoc DL(N);
- const MachineFunction &MF = CurDAG->getMachineFunction();
- const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
-
- if (auto FI = dyn_cast<FrameIndexSDNode>(N)) {
- SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
- FI->getValueType(0));
- // If we can resolve this to a frame index access, this will be relative to
- // either the stack or frame pointer SGPR.
- return std::make_pair(
- TFI, CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32));
- }
+ auto *FI = dyn_cast<FrameIndexSDNode>(N);
+ SDValue TFI =
+ FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
- // If we don't know this private access is a local stack object, it needs to
- // be relative to the entry point's scratch wave offset.
- return std::make_pair(N, CurDAG->getTargetConstant(0, DL, MVT::i32));
+  // We rebase the base address into an absolute stack address and hence
+  // use constant 0 for soffset. This value must be retained until frame
+  // elimination, when eliminateFrameIndex will choose the appropriate
+  // frame register if need be.
+ return std::make_pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
}
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
@@ -1628,155 +1685,245 @@ static MemSDNode* findMemSDNode(SDNode *N) {
llvm_unreachable("cannot find MemSDNode in the pattern!");
}
-static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
- SDValue &N0, SDValue &N1) {
- if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
- Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
- // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e.
- // (i64 (bitcast (v2i32 (build_vector
- // (or (extract_vector_elt V, 0), OFFSET),
- // (extract_vector_elt V, 1)))))
- SDValue Lo = Addr.getOperand(0).getOperand(0);
- if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
- SDValue BaseLo = Lo.getOperand(0);
- SDValue BaseHi = Addr.getOperand(0).getOperand(1);
- // Check that split base (Lo and Hi) are extracted from the same one.
- if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
- // Lo is statically extracted from index 0.
- isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
- BaseLo.getConstantOperandVal(1) == 0 &&
- // Hi is statically extracted from index 0.
- isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
- BaseHi.getConstantOperandVal(1) == 1) {
- N0 = BaseLo.getOperand(0).getOperand(0);
- N1 = Lo.getOperand(1);
- return true;
- }
- }
- }
- return false;
-}
-
template <bool IsSigned>
bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
SDValue Addr,
SDValue &VAddr,
- SDValue &Offset,
- SDValue &SLC) const {
+ SDValue &Offset) const {
int64_t OffsetVal = 0;
+ unsigned AS = findMemSDNode(N)->getAddressSpace();
+
if (Subtarget->hasFlatInstOffsets() &&
(!Subtarget->hasFlatSegmentOffsetBug() ||
- findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)) {
+ AS != AMDGPUAS::FLAT_ADDRESS)) {
SDValue N0, N1;
- if (CurDAG->isBaseWithConstantOffset(Addr)) {
- N0 = Addr.getOperand(0);
- N1 = Addr.getOperand(1);
- } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
- assert(N0 && N1 && isa<ConstantSDNode>(N1));
- }
- if (N0 && N1) {
+ if (isBaseWithConstantOffset64(Addr, N0, N1)) {
uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
- unsigned AS = findMemSDNode(N)->getAddressSpace();
if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
Addr = N0;
OffsetVal = COffsetVal;
} else {
// If the offset doesn't fit, put the low bits into the offset field and
// add the rest.
+ //
+ // For a FLAT instruction the hardware decides whether to access
+ // global/scratch/shared memory based on the high bits of vaddr,
+      // ignoring the offset field, so we have to ensure that when we add the
+      // remainder to vaddr it still points into the same underlying object.
+ // The easiest way to do that is to make sure that we split the offset
+ // into two pieces that are both >= 0 or both <= 0.
SDLoc DL(N);
- uint64_t ImmField;
- const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned);
- if (IsSigned) {
- ImmField = SignExtend64(COffsetVal, NumBits);
-
- // Don't use a negative offset field if the base offset is positive.
- // Since the scheduler currently relies on the offset field, doing so
- // could result in strange scheduling decisions.
-
- // TODO: Should we not do this in the opposite direction as well?
- if (static_cast<int64_t>(COffsetVal) > 0) {
- if (static_cast<int64_t>(ImmField) < 0) {
- const uint64_t OffsetMask =
- maskTrailingOnes<uint64_t>(NumBits - 1);
- ImmField = COffsetVal & OffsetMask;
- }
- }
- } else {
- // TODO: Should we do this for a negative offset?
- const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits);
- ImmField = COffsetVal & OffsetMask;
- }
+ uint64_t RemainderOffset;
- uint64_t RemainderOffset = COffsetVal - ImmField;
+ std::tie(OffsetVal, RemainderOffset)
+ = TII->splitFlatOffset(COffsetVal, AS, IsSigned);
- assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned));
- assert(RemainderOffset + ImmField == COffsetVal);
-
- OffsetVal = ImmField;
+ SDValue AddOffsetLo =
+ getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+ SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
- // TODO: Should this try to use a scalar add pseudo if the base address
- // is uniform and saddr is usable?
- SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
- SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+ if (Addr.getValueType().getSizeInBits() == 32) {
+ SmallVector<SDValue, 3> Opnds;
+ Opnds.push_back(N0);
+ Opnds.push_back(AddOffsetLo);
+ unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
+ if (Subtarget->hasAddNoCarry()) {
+ AddOp = AMDGPU::V_ADD_U32_e64;
+ Opnds.push_back(Clamp);
+ }
+ Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
+ } else {
+ // TODO: Should this try to use a scalar add pseudo if the base address
+ // is uniform and saddr is usable?
+ SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+ SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
- SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
- MVT::i32, N0, Sub0);
- SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
- MVT::i32, N0, Sub1);
+ SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, N0, Sub0);
+ SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, N0, Sub1);
- SDValue AddOffsetLo =
- getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
- SDValue AddOffsetHi =
- getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
+ SDValue AddOffsetHi =
+ getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
- SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
- SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
- SDNode *Add =
- CurDAG->getMachineNode(AMDGPU::V_ADD_I32_e64, DL, VTs,
- {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
+ SDNode *Add =
+ CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
+ {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
- SDNode *Addc = CurDAG->getMachineNode(
- AMDGPU::V_ADDC_U32_e64, DL, VTs,
- {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
+ SDNode *Addc = CurDAG->getMachineNode(
+ AMDGPU::V_ADDC_U32_e64, DL, VTs,
+ {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
- SDValue RegSequenceArgs[] = {
- CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
- SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
+ SDValue RegSequenceArgs[] = {
+ CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
+ SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
- Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
- MVT::i64, RegSequenceArgs),
- 0);
+ Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+ MVT::i64, RegSequenceArgs),
+ 0);
+ }
}
}
}
VAddr = Addr;
Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
- SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1);
return true;
}
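To make the same-sign requirement described in the comment above concrete, here is a rough standalone illustration. It only demonstrates the invariant; SIInstrInfo::splitFlatOffset may pick the pieces differently, and the 13-bit width is just an example.

#include <cassert>
#include <cstdint>
#include <utility>

// Split Off into (Imm, Rem) where Imm fits a signed NumBits field and both
// pieces share Off's sign, so Base + Rem stays inside the same object.
static std::pair<int64_t, int64_t> splitSameSign(int64_t Off, unsigned NumBits) {
  int64_t MaxMag = int64_t(1) << (NumBits - 1);
  int64_t Imm = Off >= 0 ? Off % MaxMag : -((-Off) % MaxMag);
  return {Imm, Off - Imm};
}

int main() {
  auto P = splitSameSign(-70000, 13);
  assert(P.first + P.second == -70000);
  assert(P.first <= 0 && P.second <= 0); // both pieces non-positive
  return 0;
}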
-bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode *N,
- SDValue Addr,
- SDValue &VAddr,
- SDValue &Offset,
- SDValue &SLC) const {
- return SelectFlatOffset<false>(N, Addr, VAddr, Offset, SLC);
+// If this matches zero_extend i32:x, return x
+static SDValue matchZExtFromI32(SDValue Op) {
+ if (Op.getOpcode() != ISD::ZERO_EXTEND)
+ return SDValue();
+
+ SDValue ExtSrc = Op.getOperand(0);
+ return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
+}
+
+// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
+ SDValue Addr,
+ SDValue &SAddr,
+ SDValue &VOffset,
+ SDValue &Offset) const {
+ int64_t ImmOffset = 0;
+
+ // Match the immediate offset first, which canonically is moved as low as
+ // possible.
+
+ SDValue LHS, RHS;
+ if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
+ int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
+
+ if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, true)) {
+ Addr = LHS;
+ ImmOffset = COffsetVal;
+ } else if (!LHS->isDivergent() && COffsetVal > 0) {
+ SDLoc SL(N);
+ // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset) +
+ // (large_offset & MaxOffset);
+ int64_t SplitImmOffset, RemainderOffset;
+ std::tie(SplitImmOffset, RemainderOffset)
+ = TII->splitFlatOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, true);
+
+ if (isUInt<32>(RemainderOffset)) {
+ SDNode *VMov = CurDAG->getMachineNode(
+ AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
+ CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
+ VOffset = SDValue(VMov, 0);
+ SAddr = LHS;
+ Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
+ return true;
+ }
+ }
+ }
+
+ // Match the variable offset.
+ if (Addr.getOpcode() != ISD::ADD) {
+ if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
+ isa<ConstantSDNode>(Addr))
+ return false;
+
+ // It's cheaper to materialize a single 32-bit zero for vaddr than the two
+ // moves required to copy a 64-bit SGPR to VGPR.
+ SAddr = Addr;
+ SDNode *VMov = CurDAG->getMachineNode(
+ AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
+ CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
+ VOffset = SDValue(VMov, 0);
+ Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
+ return true;
+ }
+
+ LHS = Addr.getOperand(0);
+ RHS = Addr.getOperand(1);
+
+ if (!LHS->isDivergent()) {
+ // add (i64 sgpr), (zero_extend (i32 vgpr))
+ if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
+ SAddr = LHS;
+ VOffset = ZextRHS;
+ }
+ }
+
+ if (!SAddr && !RHS->isDivergent()) {
+ // add (zero_extend (i32 vgpr)), (i64 sgpr)
+ if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
+ SAddr = RHS;
+ VOffset = ZextLHS;
+ }
+ }
+
+ if (!SAddr)
+ return false;
+
+ Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
+ return true;
}
-bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode *N,
- SDValue Addr,
- SDValue &VAddr,
- SDValue &Offset,
- SDValue &SLC) const {
- return SelectFlatOffset<true>(N, Addr, VAddr, Offset, SLC);
+// Match (32-bit SGPR base) + sext(imm offset)
+bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *N,
+ SDValue Addr,
+ SDValue &SAddr,
+ SDValue &Offset) const {
+ if (Addr->isDivergent())
+ return false;
+
+ SAddr = Addr;
+ int64_t COffsetVal = 0;
+
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
+ SAddr = Addr.getOperand(0);
+ }
+
+ if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
+ SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
+ } else if (SAddr.getOpcode() == ISD::ADD &&
+ isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
+    // Materialize this into a scalar move for the scalar address to avoid
+    // a readfirstlane.
+ auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
+ SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
+ FI->getValueType(0));
+ SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, SDLoc(SAddr),
+ MVT::i32, TFI, SAddr.getOperand(1)),
+ 0);
+ }
+
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
+
+ if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
+ int64_t RemainderOffset = COffsetVal;
+ int64_t ImmField = 0;
+ const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(*Subtarget, true);
+ // Use signed division by a power of two to truncate towards 0.
+ int64_t D = 1LL << (NumBits - 1);
+ RemainderOffset = (COffsetVal / D) * D;
+ ImmField = COffsetVal - RemainderOffset;
+
+ assert(TII->isLegalFLATOffset(ImmField, AMDGPUAS::PRIVATE_ADDRESS, true));
+ assert(RemainderOffset + ImmField == COffsetVal);
+
+ COffsetVal = ImmField;
+
+ SDLoc DL(N);
+ SDValue AddOffset =
+ getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+ SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, DL, MVT::i32,
+ SAddr, AddOffset), 0);
+ }
+
+ Offset = CurDAG->getTargetConstant(COffsetVal, SDLoc(), MVT::i16);
+
+ return true;
}
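The truncate-towards-zero split above can be checked in isolation (the 13-bit field width is illustrative; the real width comes from AMDGPU::getNumFlatOffsetBits):

#include <cassert>
#include <cstdint>

int main() {
  const unsigned NumBits = 13;
  const int64_t Off = -5000;
  // C++ integer division truncates toward zero, so ImmField keeps the sign of
  // the original offset and its magnitude stays below D.
  int64_t D = int64_t(1) << (NumBits - 1);  // 4096
  int64_t RemainderOffset = (Off / D) * D;  // -4096
  int64_t ImmField = Off - RemainderOffset; // -904
  assert(ImmField + RemainderOffset == Off && ImmField > -D && ImmField < D);
  return 0;
}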
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
@@ -2223,11 +2370,12 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN :
AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN;
SDValue CmpVal = Mem->getOperand(2);
+ SDValue GLC = CurDAG->getTargetConstant(1, SL, MVT::i1);
// XXX - Do we care about glue operands?
SDValue Ops[] = {
- CmpVal, VAddr, SRsrc, SOffset, Offset, SLC, Mem->getChain()
+ CmpVal, VAddr, SRsrc, SOffset, Offset, GLC, SLC, Mem->getChain()
};
CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
@@ -2241,8 +2389,9 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN;
SDValue CmpVal = Mem->getOperand(2);
+ SDValue GLC = CurDAG->getTargetConstant(1, SL, MVT::i1);
SDValue Ops[] = {
- CmpVal, SRsrc, SOffset, Offset, SLC, Mem->getChain()
+ CmpVal, SRsrc, SOffset, Offset, GLC, SLC, Mem->getChain()
};
CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
@@ -2284,7 +2433,7 @@ void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
SDValue PtrOffset = Ptr.getOperand(1);
const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
- if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) {
+ if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
N = glueCopyToM0(N, PtrBase);
Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
}
@@ -2379,15 +2528,11 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
SDValue Chain = N->getOperand(0);
SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
- // TODO: Can this just be removed from the instruction?
- SDValue GDS = CurDAG->getTargetConstant(1, SL, MVT::i1);
-
const unsigned Opc = gwsIntrinToOpcode(IntrID);
SmallVector<SDValue, 5> Ops;
if (HasVSrc)
Ops.push_back(N->getOperand(2));
Ops.push_back(OffsetField);
- Ops.push_back(GDS);
Ops.push_back(Chain);
SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
@@ -2511,7 +2656,8 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
}
bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
- unsigned &Mods) const {
+ unsigned &Mods,
+ bool AllowAbs) const {
Mods = 0;
Src = In;
@@ -2520,7 +2666,7 @@ bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
Src = Src.getOperand(0);
}
- if (Src.getOpcode() == ISD::FABS) {
+ if (AllowAbs && Src.getOpcode() == ISD::FABS) {
Mods |= SISrcMods::ABS;
Src = Src.getOperand(0);
}
@@ -2539,6 +2685,17 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
return false;
}
+bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ unsigned Mods;
+ if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) {
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+ }
+
+ return false;
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
SelectVOP3Mods(In, Src, SrcMods);
@@ -2563,6 +2720,16 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
return SelectVOP3Mods(In, Src, SrcMods);
}
+bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
+ SDValue &SrcMods, SDValue &Clamp,
+ SDValue &Omod) const {
+ SDLoc DL(In);
+ Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
+
+ return SelectVOP3BMods(In, Src, SrcMods);
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
SDValue &Clamp, SDValue &Omod) const {
Src = In;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 940ec6f31c69..0b4b4776ad39 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -14,25 +14,17 @@
#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
-#include "AMDGPUCallLowering.h"
-#include "AMDGPUFrameLowering.h"
-#include "AMDGPUSubtarget.h"
-#include "AMDGPUTargetMachine.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "R600MachineFunctionInfo.h"
-#include "SIInstrInfo.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUMachineFunction.h"
+#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/Analysis.h"
-#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
-#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
+
using namespace llvm;
#include "AMDGPUGenCallingConv.inc"
@@ -320,6 +312,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
+ setOperationAction(ISD::FREM, MVT::f16, Custom);
setOperationAction(ISD::FREM, MVT::f32, Custom);
setOperationAction(ISD::FREM, MVT::f64, Custom);
@@ -396,6 +389,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ROTL, MVT::i64, Expand);
setOperationAction(ISD::ROTR, MVT::i64, Expand);
+ setOperationAction(ISD::MULHU, MVT::i16, Expand);
+ setOperationAction(ISD::MULHS, MVT::i16, Expand);
+
setOperationAction(ISD::MUL, MVT::i64, Expand);
setOperationAction(ISD::MULHU, MVT::i64, Expand);
setOperationAction(ISD::MULHS, MVT::i64, Expand);
@@ -569,6 +565,17 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
}
+bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
+ if (getTargetMachine().Options.NoSignedZerosFPMath)
+ return true;
+
+ const auto Flags = Op.getNode()->getFlags();
+ if (Flags.hasNoSignedZeros())
+ return true;
+
+ return false;
+}
+
//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//
@@ -598,6 +605,7 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
case AMDGPUISD::FMIN_LEGACY:
case AMDGPUISD::FMAX_LEGACY:
case AMDGPUISD::FMED3:
+ // TODO: handle llvm.amdgcn.fma.legacy
return true;
default:
return false;
@@ -781,34 +789,27 @@ bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
return true;
}
-bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
+bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
switch (N->getOpcode()) {
- default:
- return false;
- case ISD::EntryToken:
- case ISD::TokenFactor:
+ case ISD::EntryToken:
+ case ISD::TokenFactor:
+ return true;
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ switch (IntrID) {
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_readlane:
return true;
- case ISD::INTRINSIC_WO_CHAIN:
- {
- unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
- switch (IntrID) {
- default:
- return false;
- case Intrinsic::amdgcn_readfirstlane:
- case Intrinsic::amdgcn_readlane:
- return true;
- }
}
- break;
- case ISD::LOAD:
- {
- if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
- AMDGPUAS::CONSTANT_ADDRESS_32BIT)
- return true;
- return false;
- }
- break;
+ return false;
}
+ case ISD::LOAD:
+ if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
+ AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+ return true;
+ return false;
+ }
+ return false;
}
SDValue AMDGPUTargetLowering::getNegatedExpression(
@@ -944,6 +945,8 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
case CallingConv::Fast:
case CallingConv::Cold:
return CC_AMDGPU_Func;
+ case CallingConv::AMDGPU_Gfx:
+ return CC_SI_Gfx;
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
default:
@@ -965,6 +968,8 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
case CallingConv::AMDGPU_ES:
case CallingConv::AMDGPU_LS:
return RetCC_SI_Shader;
+ case CallingConv::AMDGPU_Gfx:
+ return RetCC_SI_Gfx;
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::Cold:
@@ -1017,10 +1022,14 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
unsigned InIndex = 0;
for (const Argument &Arg : Fn.args()) {
+ const bool IsByRef = Arg.hasByRefAttr();
Type *BaseArgTy = Arg.getType();
- Align Alignment = DL.getABITypeAlign(BaseArgTy);
- MaxAlign = std::max(Alignment, MaxAlign);
- unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
+ Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
+ MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
+ if (!Alignment)
+ Alignment = DL.getABITypeAlign(MemArgTy);
+ MaxAlign = max(Alignment, MaxAlign);
+ uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
@@ -1224,7 +1233,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
switch (Op.getOpcode()) {
default:
Op->print(errs(), &DAG);
- llvm_unreachable("Custom lowering code for this"
+ llvm_unreachable("Custom lowering code for this "
"instruction is not implemented yet!");
break;
case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
@@ -1295,7 +1304,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
- if (!MFI->isEntryFunction()) {
+ if (!MFI->isModuleEntryFunction()) {
SDLoc DL(Op);
const Function &Fn = DAG.getMachineFunction().getFunction();
DiagnosticInfoUnsupported BadLDSDecl(
@@ -1539,7 +1548,7 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
Load->getChain(), BasePtr, SrcValue, LoMemVT,
BaseAlign, Load->getMemOperand()->getFlags());
- SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, Size);
+ SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
SDValue HiLoad =
DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
@@ -1564,17 +1573,25 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
return DAG.getMergeValues(Ops, SL);
}
-// Widen a vector load from vec3 to vec4.
-SDValue AMDGPUTargetLowering::WidenVectorLoad(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
+ SelectionDAG &DAG) const {
LoadSDNode *Load = cast<LoadSDNode>(Op);
EVT VT = Op.getValueType();
- assert(VT.getVectorNumElements() == 3);
SDValue BasePtr = Load->getBasePtr();
EVT MemVT = Load->getMemoryVT();
SDLoc SL(Op);
const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
unsigned BaseAlign = Load->getAlignment();
+ unsigned NumElements = MemVT.getVectorNumElements();
+
+ // Widen from vec3 to vec4 when the load is at least 8-byte aligned
+ // or 16-byte fully dereferenceable. Otherwise, split the vector load.
+ if (NumElements != 3 ||
+ (BaseAlign < 8 &&
+ !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
+ return SplitVectorLoad(Op, DAG);
+
+ assert(NumElements == 3);
EVT WideVT =
EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
@@ -2075,20 +2092,19 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
return DAG.getMergeValues(Res, DL);
}
-// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
+// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
EVT VT = Op.getValueType();
+ auto Flags = Op->getFlags();
SDValue X = Op.getOperand(0);
SDValue Y = Op.getOperand(1);
- // TODO: Should this propagate fast-math-flags?
-
- SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
- SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
- SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
-
- return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
+ SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
+ SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
+ SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
+ // TODO: For f32 use FMAD instead if !hasFastFMA32?
+ return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
}
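A host-side sketch of the shape of the rewritten expansion, using inputs where every intermediate result is exact (hypothetical helper; not a statement about the precision of the target's fdiv):

#include <cassert>
#include <cmath>

// frem(x, y) -> fma(-trunc(x / y), y, x), mirroring the DAG nodes built above.
static float fremExpanded(float X, float Y) {
  return std::fma(-std::trunc(X / Y), Y, X);
}

int main() {
  assert(fremExpanded(7.5f, 2.0f) == std::fmod(7.5f, 2.0f)); // both 1.5f
  return 0;
}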
SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
@@ -2698,14 +2714,12 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
// TODO: Factor out code common with LowerFP_TO_UINT.
EVT SrcVT = Src.getValueType();
- if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
+ if (SrcVT == MVT::f16 ||
+ (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
SDLoc DL(Op);
- SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
- SDValue FpToInt32 =
- DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
-
- return FpToInt32;
+ SDValue FpToInt32 = DAG.getNode(Op.getOpcode(), DL, MVT::i32, Src);
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, FpToInt32);
}
if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
@@ -2721,14 +2735,12 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
// TODO: Factor out code common with LowerFP_TO_SINT.
EVT SrcVT = Src.getValueType();
- if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
+ if (SrcVT == MVT::f16 ||
+ (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
SDLoc DL(Op);
- SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
- SDValue FpToInt32 =
- DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
-
- return FpToInt32;
+ SDValue FpToUInt32 = DAG.getNode(Op.getOpcode(), DL, MVT::i32, Src);
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, FpToUInt32);
}
if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
@@ -3204,7 +3216,7 @@ SDValue AMDGPUTargetLowering::performTruncateCombine(
if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
SDValue Elt0 = Vec.getOperand(0);
EVT EltVT = Elt0.getValueType();
- if (VT.getSizeInBits() <= EltVT.getSizeInBits()) {
+ if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
if (EltVT.isFloatingPoint()) {
Elt0 = DAG.getNode(ISD::BITCAST, SL,
EltVT.changeTypeToInteger(), Elt0);
@@ -3287,17 +3299,13 @@ static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
}
- // Because we want to eliminate extension instructions before the
- // operation, we need to create a single user here (i.e. not the separate
- // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
-
- unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
+ unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
+ unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
- SDValue Mul = DAG.getNode(MulOpc, SL,
- DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
+ SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
+ SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
- return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
- Mul.getValue(0), Mul.getValue(1));
+ return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
}
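A standalone sketch of why the lo/hi halves can be formed independently (assuming, as sketched here, that MULHI_U24 yields bits [47:32] of the 48-bit product):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t A = 0xFFFFFF, B = 0xFFFFFF;       // largest 24-bit operands
  uint64_t Prod = uint64_t(A) * uint64_t(B); // at most 48 bits
  uint32_t Lo = uint32_t(Prod);
  uint32_t Hi = uint32_t(Prod >> 32);
  // Rebuilding the i64 from the independent halves recovers the product,
  // which is what the BUILD_PAIR above relies on.
  assert(((uint64_t(Hi) << 32) | Lo) == Prod);
  return 0;
}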
SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
@@ -3395,29 +3403,6 @@ SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
return DAG.getZExtOrTrunc(Mulhi, DL, VT);
}
-SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
- SDNode *N, DAGCombinerInfo &DCI) const {
- SelectionDAG &DAG = DCI.DAG;
-
- // Simplify demanded bits before splitting into multiple users.
- if (SDValue V = simplifyI24(N, DCI))
- return V;
-
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
-
- bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
-
- unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
- unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
-
- SDLoc SL(N);
-
- SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
- SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
- return DAG.getMergeValues({ MulLo, MulHi }, SL);
-}
-
static bool isNegativeOne(SDValue Val) {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
return C->isAllOnesValue();
@@ -3730,6 +3715,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
}
case ISD::FMA:
case ISD::FMAD: {
+ // TODO: handle llvm.amdgcn.fma.legacy
if (!mayIgnoreSignedZero(N0))
return SDValue();
@@ -3795,8 +3781,15 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
if (Res.getOpcode() != AMDGPUISD::FMED3)
return SDValue(); // Op got folded away.
- if (!N0.hasOneUse())
- DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
+
+ if (!N0.hasOneUse()) {
+ SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
+ DAG.ReplaceAllUsesWith(N0, Neg);
+
+ for (SDNode *U : Neg->uses())
+ DCI.AddToWorklist(U);
+ }
+
return Res;
}
case ISD::FP_EXTEND:
@@ -3933,7 +3926,7 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
}
}
- if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
+ if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
break;
// Fold bitcasts of constants.
@@ -3942,14 +3935,12 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
// TODO: Generalize and move to DAGCombiner
SDValue Src = N->getOperand(0);
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
- if (Src.getValueType() == MVT::i64) {
- SDLoc SL(N);
- uint64_t CVal = C->getZExtValue();
- SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
- DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
- DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
- return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
- }
+ SDLoc SL(N);
+ uint64_t CVal = C->getZExtValue();
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+ DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+ DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
}
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
@@ -3999,9 +3990,6 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return V;
return SDValue();
}
- case AMDGPUISD::MUL_LOHI_I24:
- case AMDGPUISD::MUL_LOHI_U24:
- return performMulLoHi24Combine(N, DCI);
case ISD::SELECT:
return performSelectCombine(N, DCI);
case ISD::FNEG:
@@ -4159,9 +4147,9 @@ SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
- return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4,
+ return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant);
+ MachineMemOperand::MOInvariant);
}
SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
@@ -4173,7 +4161,7 @@ SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
- SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
+ SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
MachineMemOperand::MODereferenceable);
return Store;
}
@@ -4285,8 +4273,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(MUL_I24)
NODE_NAME_CASE(MULHI_U24)
NODE_NAME_CASE(MULHI_I24)
- NODE_NAME_CASE(MUL_LOHI_U24)
- NODE_NAME_CASE(MUL_LOHI_I24)
NODE_NAME_CASE(MAD_U24)
NODE_NAME_CASE(MAD_I24)
NODE_NAME_CASE(MAD_I64_I32)
@@ -4336,7 +4322,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(ATOMIC_DEC)
NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
- NODE_NAME_CASE(ATOMIC_LOAD_CSUB)
NODE_NAME_CASE(BUFFER_LOAD)
NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
NODE_NAME_CASE(BUFFER_LOAD_USHORT)
@@ -4365,8 +4350,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
- NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD)
- NODE_NAME_CASE(ATOMIC_PK_FADD)
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
}
@@ -4718,6 +4701,12 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
case Intrinsic::amdgcn_fdot2:
// TODO: Refine on operand
return SNaN;
+ case Intrinsic::amdgcn_fma_legacy:
+ if (SNaN)
+ return true;
+ return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
+ DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
+ DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
default:
return false;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 85f23c81db17..ce3618f83130 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -15,10 +15,8 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
-#include "AMDGPU.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -90,7 +88,6 @@ protected:
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- SDValue performMulLoHi24Combine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
SDValue RHS, DAGCombinerInfo &DCI) const;
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -125,8 +122,9 @@ protected:
/// Split a vector load into 2 loads of half the vector.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const;
- /// Widen a vector load from vec3 to vec4.
- SDValue WidenVectorLoad(SDValue Op, SelectionDAG &DAG) const;
+ /// Widen a suitably aligned v3 load. For all other cases, split the input
+ /// vector load.
+ SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const;
/// Split a vector store into 2 stores of half the vector.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
@@ -145,16 +143,7 @@ protected:
public:
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
- bool mayIgnoreSignedZero(SDValue Op) const {
- if (getTargetMachine().Options.NoSignedZerosFPMath)
- return true;
-
- const auto Flags = Op.getNode()->getFlags();
- if (Flags.isDefined())
- return Flags.hasNoSignedZeros();
-
- return false;
- }
+ bool mayIgnoreSignedZero(SDValue Op) const;
static inline SDValue stripBitcast(SDValue Val) {
return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
@@ -440,8 +429,6 @@ enum NodeType : unsigned {
MAD_I24,
MAD_U64_U32,
MAD_I64_I32,
- MUL_LOHI_I24,
- MUL_LOHI_U24,
PERM,
TEXTURE_FETCH,
R600_EXPORT,
@@ -508,7 +495,6 @@ enum NodeType : unsigned {
ATOMIC_DEC,
ATOMIC_LOAD_FMIN,
ATOMIC_LOAD_FMAX,
- ATOMIC_LOAD_CSUB,
BUFFER_LOAD,
BUFFER_LOAD_UBYTE,
BUFFER_LOAD_USHORT,
@@ -537,8 +523,6 @@ enum NodeType : unsigned {
BUFFER_ATOMIC_CMPSWAP,
BUFFER_ATOMIC_CSUB,
BUFFER_ATOMIC_FADD,
- BUFFER_ATOMIC_PK_FADD,
- ATOMIC_PK_FADD,
LAST_AMDGPU_ISD_NUMBER
};
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp
deleted file mode 100644
index 3b5d91133a2f..000000000000
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp
+++ /dev/null
@@ -1,226 +0,0 @@
-//===- AMDGPUInline.cpp - Code to perform simple function inlining --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This is AMDGPU specific replacement of the standard inliner.
-/// The main purpose is to account for the fact that calls are not only
-/// expensive on the AMDGPU, but much more expensive if a private memory
-/// pointer is passed to a function as an argument. In this situation, we are
-/// unable to eliminate private memory in the caller unless it is inlined, and
-/// we end up with slow and expensive scratch access. Thus, we boost the inline
-/// threshold for such functions here.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/Inliner.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "inline"
-
-static cl::opt<int>
-ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000),
- cl::desc("Cost of alloca argument"));
-
-// If the amount of scratch memory to eliminate exceeds our ability to allocate
-// it into registers we gain nothing by aggressively inlining functions for that
-// heuristic.
-static cl::opt<unsigned>
-ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
- cl::desc("Maximum alloca size to use for inline cost"));
-
-// Inliner constraint to achieve reasonable compilation time
-static cl::opt<size_t>
-MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
- cl::desc("Maximum BB number allowed in a function after inlining"
- " (compile time constraint)"));
-
-namespace {
-
-class AMDGPUInliner : public LegacyInlinerBase {
-
-public:
- AMDGPUInliner() : LegacyInlinerBase(ID) {
- initializeAMDGPUInlinerPass(*PassRegistry::getPassRegistry());
- Params = getInlineParams();
- }
-
- static char ID; // Pass identification, replacement for typeid
-
- unsigned getInlineThreshold(CallBase &CB) const;
-
- InlineCost getInlineCost(CallBase &CB) override;
-
- bool runOnSCC(CallGraphSCC &SCC) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override;
-
-private:
- TargetTransformInfoWrapperPass *TTIWP;
-
- InlineParams Params;
-};
-
-} // end anonymous namespace
-
-char AMDGPUInliner::ID = 0;
-INITIALIZE_PASS_BEGIN(AMDGPUInliner, "amdgpu-inline",
- "AMDGPU Function Integration/Inlining", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(AMDGPUInliner, "amdgpu-inline",
- "AMDGPU Function Integration/Inlining", false, false)
-
-Pass *llvm::createAMDGPUFunctionInliningPass() { return new AMDGPUInliner(); }
-
-bool AMDGPUInliner::runOnSCC(CallGraphSCC &SCC) {
- TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
- return LegacyInlinerBase::runOnSCC(SCC);
-}
-
-void AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<TargetTransformInfoWrapperPass>();
- LegacyInlinerBase::getAnalysisUsage(AU);
-}
-
-unsigned AMDGPUInliner::getInlineThreshold(CallBase &CB) const {
- int Thres = Params.DefaultThreshold;
-
- Function *Caller = CB.getCaller();
- // Listen to the inlinehint attribute when it would increase the threshold
- // and the caller does not need to minimize its size.
- Function *Callee = CB.getCalledFunction();
- bool InlineHint = Callee && !Callee->isDeclaration() &&
- Callee->hasFnAttribute(Attribute::InlineHint);
- if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres
- && !Caller->hasFnAttribute(Attribute::MinSize))
- Thres = Params.HintThreshold.getValue() *
- TTIWP->getTTI(*Callee).getInliningThresholdMultiplier();
-
- const DataLayout &DL = Caller->getParent()->getDataLayout();
- if (!Callee)
- return (unsigned)Thres;
-
- // If we have a pointer to private array passed into a function
- // it will not be optimized out, leaving scratch usage.
- // Increase the inline threshold to allow inlining in this case.
- uint64_t AllocaSize = 0;
- SmallPtrSet<const AllocaInst *, 8> AIVisited;
- for (Value *PtrArg : CB.args()) {
- PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
- if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
- Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
- continue;
-
- PtrArg = GetUnderlyingObject(PtrArg, DL);
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
- if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
- continue;
- AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
- // If the amount of stack memory is excessive we will not be able
- // to get rid of the scratch anyway, bail out.
- if (AllocaSize > ArgAllocaCutoff) {
- AllocaSize = 0;
- break;
- }
- }
- }
- if (AllocaSize)
- Thres += ArgAllocaCost;
-
- return (unsigned)Thres;
-}
-
-// Check if call is just a wrapper around another call.
-// In this case we only have call and ret instructions.
-static bool isWrapperOnlyCall(CallBase &CB) {
- Function *Callee = CB.getCalledFunction();
- if (!Callee || Callee->size() != 1)
- return false;
- const BasicBlock &BB = Callee->getEntryBlock();
- if (const Instruction *I = BB.getFirstNonPHI()) {
- if (!isa<CallInst>(I)) {
- return false;
- }
- if (isa<ReturnInst>(*std::next(I->getIterator()))) {
- LLVM_DEBUG(dbgs() << " Wrapper only call detected: "
- << Callee->getName() << '\n');
- return true;
- }
- }
- return false;
-}
-
-InlineCost AMDGPUInliner::getInlineCost(CallBase &CB) {
- Function *Callee = CB.getCalledFunction();
- Function *Caller = CB.getCaller();
-
- if (!Callee || Callee->isDeclaration())
- return llvm::InlineCost::getNever("undefined callee");
-
- if (CB.isNoInline())
- return llvm::InlineCost::getNever("noinline");
-
- TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
- if (!TTI.areInlineCompatible(Caller, Callee))
- return llvm::InlineCost::getNever("incompatible");
-
- if (CB.hasFnAttr(Attribute::AlwaysInline)) {
- auto IsViable = isInlineViable(*Callee);
- if (IsViable.isSuccess())
- return llvm::InlineCost::getAlways("alwaysinline viable");
- return llvm::InlineCost::getNever(IsViable.getFailureReason());
- }
-
- if (isWrapperOnlyCall(CB))
- return llvm::InlineCost::getAlways("wrapper-only call");
-
- InlineParams LocalParams = Params;
- LocalParams.DefaultThreshold = (int)getInlineThreshold(CB);
- bool RemarksEnabled = false;
- const auto &BBs = Caller->getBasicBlockList();
- if (!BBs.empty()) {
- auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front());
- if (DI.isEnabled())
- RemarksEnabled = true;
- }
-
- OptimizationRemarkEmitter ORE(Caller);
- auto GetAssumptionCache = [this](Function &F) -> AssumptionCache & {
- return ACT->getAssumptionCache(F);
- };
-
- auto IC = llvm::getInlineCost(CB, Callee, LocalParams, TTI,
- GetAssumptionCache, GetTLI, nullptr, PSI,
- RemarksEnabled ? &ORE : nullptr);
-
- if (IC && !IC.isAlways() && !Callee->hasFnAttribute(Attribute::InlineHint)) {
- // Single BB does not increase total BB amount, thus subtract 1
- size_t Size = Caller->size() + Callee->size() - 1;
- if (MaxBB && Size > MaxBB)
- return llvm::InlineCost::getNever("max number of bb exceeded");
- }
- return IC;
-}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
new file mode 100644
index 000000000000..06aa0055e4bb
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -0,0 +1,1075 @@
+//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// This file implements the AMDGPU-specific InstCombine hooks of the target's
+// TargetTransformInfo. It uses the target's detailed information to fold and
+// simplify calls to AMDGPU intrinsics, while letting the target-independent
+// InstCombine transforms handle the rest.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUTargetTransformInfo.h"
+#include "GCNSubtarget.h"
+#include "R600Subtarget.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "AMDGPUtti"
+
+namespace {
+
+struct AMDGPUImageDMaskIntrinsic {
+ unsigned Intr;
+};
+
+#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
+#include "InstCombineTables.inc"
+
+} // end anonymous namespace
+
+// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
+//
+// A single NaN input is folded to minnum, so we rely on that folding for
+// handling NaNs.
+static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
+ const APFloat &Src2) {
+ APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
+
+ APFloat::cmpResult Cmp0 = Max3.compare(Src0);
+ assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
+ if (Cmp0 == APFloat::cmpEqual)
+ return maxnum(Src1, Src2);
+
+ APFloat::cmpResult Cmp1 = Max3.compare(Src1);
+ assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
+ if (Cmp1 == APFloat::cmpEqual)
+ return maxnum(Src0, Src2);
+
+ return maxnum(Src0, Src1);
+}
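As a cross-check on the folding above, the same median-of-three selection can be written on plain floats. This is only an illustration under the stated no-NaN assumption, not the APFloat code, and med3Ref is a made-up name:

#include <algorithm>

// Median of three finite floats; NaN inputs are assumed to have been folded
// to minnum before this point, as the comment above fmed3AMDGCN says.
static float med3Ref(float A, float B, float C) {
  return std::max(std::min(A, B), std::min(std::max(A, B), C));
}
// e.g. med3Ref(1.0f, 5.0f, 3.0f) == 3.0f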
+
+// Check if a value can be converted to a 16-bit value without losing
+// precision.
+static bool canSafelyConvertTo16Bit(Value &V) {
+ Type *VTy = V.getType();
+ if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
+ // The value is already 16-bit, so we don't want to convert to 16-bit again!
+ return false;
+ }
+ if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
+ // We need to check that if we cast the index down to a half, we do not lose
+ // precision.
+ APFloat FloatValue(ConstFloat->getValueAPF());
+ bool LosesInfo = true;
+ FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
+ return !LosesInfo;
+ }
+ Value *CastSrc;
+ if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
+ match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
+ match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
+ Type *CastSrcTy = CastSrc->getType();
+ if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
+ return true;
+ }
+
+ return false;
+}
+
+// Convert a value to 16-bit.
+static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
+ Type *VTy = V.getType();
+ if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
+ return cast<Instruction>(&V)->getOperand(0);
+ if (VTy->isIntegerTy())
+ return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
+ if (VTy->isFloatingPointTy())
+ return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
+
+ llvm_unreachable("Should never be called!");
+}
+
+static Optional<Instruction *>
+simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
+ const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
+ IntrinsicInst &II, InstCombiner &IC) {
+ if (!ST->hasA16() && !ST->hasG16())
+ return None;
+
+ bool FloatCoord = false;
+ // true means derivatives can be converted to 16 bit, coordinates not
+ bool OnlyDerivatives = false;
+
+ for (unsigned OperandIndex = ImageDimIntr->GradientStart;
+ OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
+ Value *Coord = II.getOperand(OperandIndex);
+ // If the values are not derived from 16-bit values, we cannot optimize.
+ if (!canSafelyConvertTo16Bit(*Coord)) {
+ if (OperandIndex < ImageDimIntr->CoordStart ||
+ ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
+ return None;
+ }
+ // All gradients can be converted, so convert only them
+ OnlyDerivatives = true;
+ break;
+ }
+
+ assert(OperandIndex == ImageDimIntr->GradientStart ||
+ FloatCoord == Coord->getType()->isFloatingPointTy());
+ FloatCoord = Coord->getType()->isFloatingPointTy();
+ }
+
+ if (OnlyDerivatives) {
+ if (!ST->hasG16())
+ return None;
+ } else {
+ if (!ST->hasA16())
+ OnlyDerivatives = true; // Only supports G16
+ }
+
+ Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
+ : Type::getInt16Ty(II.getContext());
+
+ SmallVector<Type *, 4> ArgTys;
+ if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
+ return None;
+
+ ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
+ if (!OnlyDerivatives)
+ ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
+ Function *I =
+ Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);
+
+ SmallVector<Value *, 8> Args(II.arg_operands());
+
+ unsigned EndIndex =
+ OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
+ for (unsigned OperandIndex = ImageDimIntr->GradientStart;
+ OperandIndex < EndIndex; OperandIndex++) {
+ Args[OperandIndex] =
+ convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
+ }
+
+ CallInst *NewCall = IC.Builder.CreateCall(I, Args);
+ NewCall->takeName(&II);
+ NewCall->copyMetadata(II);
+ if (isa<FPMathOperator>(NewCall))
+ NewCall->copyFastMathFlags(&II);
+ return IC.replaceInstUsesWith(II, NewCall);
+}
+
+bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
+ InstCombiner &IC) const {
+ // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
+ // infinity, gives +0.0. If we can prove we don't have one of the special
+ // cases then we can use a normal multiply instead.
+ // TODO: Create and use isKnownFiniteNonZero instead of just matching
+ // constants here.
+ if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
+ match(Op1, PatternMatch::m_FiniteNonZero())) {
+ // One operand is not zero or infinity or NaN.
+ return true;
+ }
+ auto *TLI = &IC.getTargetLibraryInfo();
+ if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
+ isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
+ // Neither operand is infinity or NaN.
+ return true;
+ }
+ return false;
+}
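For reference, the "legacy" multiply semantics this helper reasons about can be modelled on host floats as below (a sketch with an invented name, not the hardware-exact behaviour). When neither operand can be zero, infinity or NaN, it degenerates to an ordinary multiply, which is exactly the situation canSimplifyLegacyMulToMul tries to prove:

// +/-0.0 times anything, including NaN and infinity, yields +0.0; all other
// inputs behave like an ordinary IEEE multiply.
static float fmulLegacyRef(float A, float B) {
  if (A == 0.0f || B == 0.0f) // true for both +0.0 and -0.0
    return 0.0f;
  return A * B;
}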
+
+Optional<Instruction *>
+GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
+ Intrinsic::ID IID = II.getIntrinsicID();
+ switch (IID) {
+ case Intrinsic::amdgcn_rcp: {
+ Value *Src = II.getArgOperand(0);
+
+ // TODO: Move to ConstantFolding/InstSimplify?
+ if (isa<UndefValue>(Src)) {
+ Type *Ty = II.getType();
+ auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
+ return IC.replaceInstUsesWith(II, QNaN);
+ }
+
+ if (II.isStrictFP())
+ break;
+
+ if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
+ const APFloat &ArgVal = C->getValueAPF();
+ APFloat Val(ArgVal.getSemantics(), 1);
+ Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
+
+ // This is more precise than the instruction may give.
+ //
+ // TODO: The instruction always flushes denormal results (except for f16),
+ // should this also?
+ return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_rsq: {
+ Value *Src = II.getArgOperand(0);
+
+ // TODO: Move to ConstantFolding/InstSimplify?
+ if (isa<UndefValue>(Src)) {
+ Type *Ty = II.getType();
+ auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
+ return IC.replaceInstUsesWith(II, QNaN);
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_frexp_mant:
+ case Intrinsic::amdgcn_frexp_exp: {
+ Value *Src = II.getArgOperand(0);
+ if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
+ int Exp;
+ APFloat Significand =
+ frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
+
+ if (IID == Intrinsic::amdgcn_frexp_mant) {
+ return IC.replaceInstUsesWith(
+ II, ConstantFP::get(II.getContext(), Significand));
+ }
+
+ // Match instruction special case behavior.
+ if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
+ Exp = 0;
+
+ return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
+ }
+
+ if (isa<UndefValue>(Src)) {
+ return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_class: {
+ enum {
+ S_NAN = 1 << 0, // Signaling NaN
+ Q_NAN = 1 << 1, // Quiet NaN
+ N_INFINITY = 1 << 2, // Negative infinity
+ N_NORMAL = 1 << 3, // Negative normal
+ N_SUBNORMAL = 1 << 4, // Negative subnormal
+ N_ZERO = 1 << 5, // Negative zero
+ P_ZERO = 1 << 6, // Positive zero
+ P_SUBNORMAL = 1 << 7, // Positive subnormal
+ P_NORMAL = 1 << 8, // Positive normal
+ P_INFINITY = 1 << 9 // Positive infinity
+ };
+
+ const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
+ N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
+ P_NORMAL | P_INFINITY;
+
+ Value *Src0 = II.getArgOperand(0);
+ Value *Src1 = II.getArgOperand(1);
+ const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
+ if (!CMask) {
+ if (isa<UndefValue>(Src0)) {
+ return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
+ }
+
+ if (isa<UndefValue>(Src1)) {
+ return IC.replaceInstUsesWith(II,
+ ConstantInt::get(II.getType(), false));
+ }
+ break;
+ }
+
+ uint32_t Mask = CMask->getZExtValue();
+
+ // If all tests are made, it doesn't matter what the value is.
+ if ((Mask & FullMask) == FullMask) {
+ return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
+ }
+
+ if ((Mask & FullMask) == 0) {
+ return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
+ }
+
+ if (Mask == (S_NAN | Q_NAN)) {
+ // Equivalent of isnan. Replace with standard fcmp.
+ Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
+ FCmp->takeName(&II);
+ return IC.replaceInstUsesWith(II, FCmp);
+ }
+
+ if (Mask == (N_ZERO | P_ZERO)) {
+ // Equivalent of == 0.
+ Value *FCmp =
+ IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));
+
+ FCmp->takeName(&II);
+ return IC.replaceInstUsesWith(II, FCmp);
+ }
+
+ // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
+ if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
+ isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
+ return IC.replaceOperand(
+ II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
+ }
+
+ const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
+ if (!CVal) {
+ if (isa<UndefValue>(Src0)) {
+ return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
+ }
+
+ // Clamp mask to used bits
+ if ((Mask & FullMask) != Mask) {
+ CallInst *NewCall = IC.Builder.CreateCall(
+ II.getCalledFunction(),
+ {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});
+
+ NewCall->takeName(&II);
+ return IC.replaceInstUsesWith(II, NewCall);
+ }
+
+ break;
+ }
+
+ const APFloat &Val = CVal->getValueAPF();
+
+ bool Result =
+ ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
+ ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
+ ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
+ ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
+ ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
+ ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
+ ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
+ ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
+ ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
+ ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());
+
+ return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
+ }
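The constant evaluation at the end of this case can be mirrored on host floats. The sketch below redefines the same mask bits locally and deliberately does not distinguish quiet from signaling NaNs, so it is illustrative only:

#include <cmath>
#include <cstdint>

enum : uint32_t {
  S_NAN = 1, Q_NAN = 2, N_INFINITY = 4, N_NORMAL = 8, N_SUBNORMAL = 16,
  N_ZERO = 32, P_ZERO = 64, P_SUBNORMAL = 128, P_NORMAL = 256, P_INFINITY = 512
};

static bool fpClassRef(float V, uint32_t Mask) {
  const bool Neg = std::signbit(V);
  if (std::isnan(V))
    return Mask & (S_NAN | Q_NAN); // quiet vs. signaling not modelled here
  if (std::isinf(V))
    return Mask & (Neg ? N_INFINITY : P_INFINITY);
  if (V == 0.0f)
    return Mask & (Neg ? N_ZERO : P_ZERO);
  if (std::fpclassify(V) == FP_SUBNORMAL)
    return Mask & (Neg ? N_SUBNORMAL : P_SUBNORMAL);
  return Mask & (Neg ? N_NORMAL : P_NORMAL);
}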
+ case Intrinsic::amdgcn_cvt_pkrtz: {
+ Value *Src0 = II.getArgOperand(0);
+ Value *Src1 = II.getArgOperand(1);
+ if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
+ if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
+ const fltSemantics &HalfSem =
+ II.getType()->getScalarType()->getFltSemantics();
+ bool LosesInfo;
+ APFloat Val0 = C0->getValueAPF();
+ APFloat Val1 = C1->getValueAPF();
+ Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
+ Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
+
+ Constant *Folded =
+ ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
+ ConstantFP::get(II.getContext(), Val1)});
+ return IC.replaceInstUsesWith(II, Folded);
+ }
+ }
+
+ if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
+ return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_cvt_pknorm_i16:
+ case Intrinsic::amdgcn_cvt_pknorm_u16:
+ case Intrinsic::amdgcn_cvt_pk_i16:
+ case Intrinsic::amdgcn_cvt_pk_u16: {
+ Value *Src0 = II.getArgOperand(0);
+ Value *Src1 = II.getArgOperand(1);
+
+ if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
+ return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_ubfe:
+ case Intrinsic::amdgcn_sbfe: {
+ // Decompose simple cases into standard shifts.
+ Value *Src = II.getArgOperand(0);
+ if (isa<UndefValue>(Src)) {
+ return IC.replaceInstUsesWith(II, Src);
+ }
+
+ unsigned Width;
+ Type *Ty = II.getType();
+ unsigned IntSize = Ty->getIntegerBitWidth();
+
+ ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
+ if (CWidth) {
+ Width = CWidth->getZExtValue();
+ if ((Width & (IntSize - 1)) == 0) {
+ return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
+ }
+
+ // Hardware ignores high bits, so remove those.
+ if (Width >= IntSize) {
+ return IC.replaceOperand(
+ II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
+ }
+ }
+
+ unsigned Offset;
+ ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
+ if (COffset) {
+ Offset = COffset->getZExtValue();
+ if (Offset >= IntSize) {
+ return IC.replaceOperand(
+ II, 1,
+ ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
+ }
+ }
+
+ bool Signed = IID == Intrinsic::amdgcn_sbfe;
+
+ if (!CWidth || !COffset)
+ break;
+
+ // The case of Width == 0 is handled above, which makes this transformation
+ // safe. If Width == 0, the ashr and lshr instructions would produce poison
+ // values, since the shift amount would be equal to the bit size.
+ assert(Width != 0);
+
+ // TODO: This allows folding to undef when the hardware has specific
+ // behavior?
+ if (Offset + Width < IntSize) {
+ Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
+ Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
+ : IC.Builder.CreateLShr(Shl, IntSize - Width);
+ RightShift->takeName(&II);
+ return IC.replaceInstUsesWith(II, RightShift);
+ }
+
+ Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
+ : IC.Builder.CreateLShr(Src, Offset);
+
+ RightShift->takeName(&II);
+ return IC.replaceInstUsesWith(II, RightShift);
+ }
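The shift decomposition used above for the in-range case has a simple scalar counterpart, sketched here for the unsigned 32-bit form under the same preconditions (Width != 0 and Offset + Width < 32); the signed variant would use an arithmetic right shift instead. ubfe32 is an invented name:

#include <cstdint>

// ubfe(x, offset, width): shift left so the top bit of the field lands in
// bit 31, then shift logically right so the field starts at bit 0.
static uint32_t ubfe32(uint32_t X, unsigned Offset, unsigned Width) {
  uint32_t Shl = X << (32 - Offset - Width);
  return Shl >> (32 - Width);
}
// e.g. ubfe32(0xABCD1234, 8, 8) == 0x12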
+ case Intrinsic::amdgcn_exp:
+ case Intrinsic::amdgcn_exp_compr: {
+ ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
+ unsigned EnBits = En->getZExtValue();
+ if (EnBits == 0xf)
+ break; // All inputs enabled.
+
+ bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
+ bool Changed = false;
+ for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
+ if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
+ (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
+ Value *Src = II.getArgOperand(I + 2);
+ if (!isa<UndefValue>(Src)) {
+ IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
+ Changed = true;
+ }
+ }
+ }
+
+ if (Changed) {
+ return &II;
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_fmed3: {
+ // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
+ // for the shader.
+
+ Value *Src0 = II.getArgOperand(0);
+ Value *Src1 = II.getArgOperand(1);
+ Value *Src2 = II.getArgOperand(2);
+
+ // Checking for NaN before canonicalization provides better fidelity when
+ // mapping other operations onto fmed3 since the order of operands is
+ // unchanged.
+ CallInst *NewCall = nullptr;
+ if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
+ NewCall = IC.Builder.CreateMinNum(Src1, Src2);
+ } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
+ NewCall = IC.Builder.CreateMinNum(Src0, Src2);
+ } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
+ NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
+ }
+
+ if (NewCall) {
+ NewCall->copyFastMathFlags(&II);
+ NewCall->takeName(&II);
+ return IC.replaceInstUsesWith(II, NewCall);
+ }
+
+ bool Swap = false;
+ // Canonicalize constants to RHS operands.
+ //
+ // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
+ if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
+ std::swap(Src0, Src1);
+ Swap = true;
+ }
+
+ if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
+ std::swap(Src1, Src2);
+ Swap = true;
+ }
+
+ if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
+ std::swap(Src0, Src1);
+ Swap = true;
+ }
+
+ if (Swap) {
+ II.setArgOperand(0, Src0);
+ II.setArgOperand(1, Src1);
+ II.setArgOperand(2, Src2);
+ return &II;
+ }
+
+ if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
+ if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
+ if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
+ APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
+ C2->getValueAPF());
+ return IC.replaceInstUsesWith(
+ II, ConstantFP::get(IC.Builder.getContext(), Result));
+ }
+ }
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_icmp:
+ case Intrinsic::amdgcn_fcmp: {
+ const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
+ // Guard against invalid arguments.
+ int64_t CCVal = CC->getZExtValue();
+ bool IsInteger = IID == Intrinsic::amdgcn_icmp;
+ if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
+ CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
+ (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
+ CCVal > CmpInst::LAST_FCMP_PREDICATE)))
+ break;
+
+ Value *Src0 = II.getArgOperand(0);
+ Value *Src1 = II.getArgOperand(1);
+
+ if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
+ if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
+ Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
+ if (CCmp->isNullValue()) {
+ return IC.replaceInstUsesWith(
+ II, ConstantExpr::getSExt(CCmp, II.getType()));
+ }
+
+ // The result of V_ICMP/V_FCMP assembly instructions (which this
+ // intrinsic exposes) is one bit per thread, masked with the EXEC
+ // register (which contains the bitmask of live threads). So a
+ // comparison that always returns true is the same as a read of the
+ // EXEC register.
+ Function *NewF = Intrinsic::getDeclaration(
+ II.getModule(), Intrinsic::read_register, II.getType());
+ Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
+ MDNode *MD = MDNode::get(II.getContext(), MDArgs);
+ Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
+ CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
+ NewCall->addAttribute(AttributeList::FunctionIndex,
+ Attribute::Convergent);
+ NewCall->takeName(&II);
+ return IC.replaceInstUsesWith(II, NewCall);
+ }
+
+ // Canonicalize constants to RHS.
+ CmpInst::Predicate SwapPred =
+ CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
+ II.setArgOperand(0, Src1);
+ II.setArgOperand(1, Src0);
+ II.setArgOperand(
+ 2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
+ return &II;
+ }
+
+ if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
+ break;
+
+ // Canonicalize compare eq with true value to compare != 0
+ // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
+ // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
+ // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
+ // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
+ Value *ExtSrc;
+ if (CCVal == CmpInst::ICMP_EQ &&
+ ((match(Src1, PatternMatch::m_One()) &&
+ match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
+ (match(Src1, PatternMatch::m_AllOnes()) &&
+ match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
+ ExtSrc->getType()->isIntegerTy(1)) {
+ IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
+ IC.replaceOperand(II, 2,
+ ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
+ return &II;
+ }
+
+ CmpInst::Predicate SrcPred;
+ Value *SrcLHS;
+ Value *SrcRHS;
+
+ // Fold compare eq/ne with 0 from a compare result as the predicate to the
+ // intrinsic. The typical use is a wave vote function in the library, which
+ // will be fed from a user code condition compared with 0. Fold in the
+ // redundant compare.
+
+ // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
+ // -> llvm.amdgcn.[if]cmp(a, b, pred)
+ //
+ // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
+ // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
+ if (match(Src1, PatternMatch::m_Zero()) &&
+ match(Src0, PatternMatch::m_ZExtOrSExt(
+ m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
+ PatternMatch::m_Value(SrcRHS))))) {
+ if (CCVal == CmpInst::ICMP_EQ)
+ SrcPred = CmpInst::getInversePredicate(SrcPred);
+
+ Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
+ ? Intrinsic::amdgcn_fcmp
+ : Intrinsic::amdgcn_icmp;
+
+ Type *Ty = SrcLHS->getType();
+ if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
+ // Promote to next legal integer type.
+ unsigned Width = CmpType->getBitWidth();
+ unsigned NewWidth = Width;
+
+ // Don't do anything for i1 comparisons.
+ if (Width == 1)
+ break;
+
+ if (Width <= 16)
+ NewWidth = 16;
+ else if (Width <= 32)
+ NewWidth = 32;
+ else if (Width <= 64)
+ NewWidth = 64;
+ else if (Width > 64)
+ break; // Can't handle this.
+
+ if (Width != NewWidth) {
+ IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
+ if (CmpInst::isSigned(SrcPred)) {
+ SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
+ SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
+ } else {
+ SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
+ SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
+ }
+ }
+ } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
+ break;
+
+ Function *NewF = Intrinsic::getDeclaration(
+ II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
+ Value *Args[] = {SrcLHS, SrcRHS,
+ ConstantInt::get(CC->getType(), SrcPred)};
+ CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
+ NewCall->takeName(&II);
+ return IC.replaceInstUsesWith(II, NewCall);
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_ballot: {
+ if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
+ if (Src->isZero()) {
+ // amdgcn.ballot(i1 0) is zero.
+ return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
+ }
+
+ if (Src->isOne()) {
+ // amdgcn.ballot(i1 1) is exec.
+ const char *RegName = "exec";
+ if (II.getType()->isIntegerTy(32))
+ RegName = "exec_lo";
+ else if (!II.getType()->isIntegerTy(64))
+ break;
+
+ Function *NewF = Intrinsic::getDeclaration(
+ II.getModule(), Intrinsic::read_register, II.getType());
+ Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
+ MDNode *MD = MDNode::get(II.getContext(), MDArgs);
+ Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
+ CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
+ NewCall->addAttribute(AttributeList::FunctionIndex,
+ Attribute::Convergent);
+ NewCall->takeName(&II);
+ return IC.replaceInstUsesWith(II, NewCall);
+ }
+ }
+ break;
+ }
+ case Intrinsic::amdgcn_wqm_vote: {
+ // wqm_vote is identity when the argument is constant.
+ if (!isa<Constant>(II.getArgOperand(0)))
+ break;
+
+ return IC.replaceInstUsesWith(II, II.getArgOperand(0));
+ }
+ case Intrinsic::amdgcn_kill: {
+ const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
+ if (!C || !C->getZExtValue())
+ break;
+
+ // amdgcn.kill(i1 1) is a no-op
+ return IC.eraseInstFromFunction(II);
+ }
+ case Intrinsic::amdgcn_update_dpp: {
+ Value *Old = II.getArgOperand(0);
+
+ auto *BC = cast<ConstantInt>(II.getArgOperand(5));
+ auto *RM = cast<ConstantInt>(II.getArgOperand(3));
+ auto *BM = cast<ConstantInt>(II.getArgOperand(4));
+ if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
+ BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
+ break;
+
+ // If bound_ctrl = 1 and row mask = bank mask = 0xf, we can omit the old value.
+ return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
+ }
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16: {
+ // Discard vdst_in if it's not going to be read.
+ Value *VDstIn = II.getArgOperand(0);
+ if (isa<UndefValue>(VDstIn))
+ break;
+
+ ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
+ ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
+ if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
+ break;
+
+ return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
+ }
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_readlane: {
+ // A constant value is trivially uniform.
+ if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
+ return IC.replaceInstUsesWith(II, C);
+ }
+
+ // The rest of these may not be safe if the exec may not be the same between
+ // the def and use.
+ Value *Src = II.getArgOperand(0);
+ Instruction *SrcInst = dyn_cast<Instruction>(Src);
+ if (SrcInst && SrcInst->getParent() != II.getParent())
+ break;
+
+ // readfirstlane (readfirstlane x) -> readfirstlane x
+ // readlane (readfirstlane x), y -> readfirstlane x
+ if (match(Src,
+ PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
+ return IC.replaceInstUsesWith(II, Src);
+ }
+
+ if (IID == Intrinsic::amdgcn_readfirstlane) {
+ // readfirstlane (readlane x, y) -> readlane x, y
+ if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
+ return IC.replaceInstUsesWith(II, Src);
+ }
+ } else {
+ // readlane (readlane x, y), y -> readlane x, y
+ if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
+ PatternMatch::m_Value(),
+ PatternMatch::m_Specific(II.getArgOperand(1))))) {
+ return IC.replaceInstUsesWith(II, Src);
+ }
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_ldexp: {
+ // FIXME: This doesn't introduce new instructions and belongs in
+ // InstructionSimplify.
+ Type *Ty = II.getType();
+ Value *Op0 = II.getArgOperand(0);
+ Value *Op1 = II.getArgOperand(1);
+
+ // Folding undef to qnan is safe regardless of the FP mode.
+ if (isa<UndefValue>(Op0)) {
+ auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
+ return IC.replaceInstUsesWith(II, QNaN);
+ }
+
+ const APFloat *C = nullptr;
+ match(Op0, PatternMatch::m_APFloat(C));
+
+ // FIXME: Should flush denorms depending on FP mode, but that's ignored
+ // everywhere else.
+ //
+ // These cases should be safe, even with strictfp.
+ // ldexp(0.0, x) -> 0.0
+ // ldexp(-0.0, x) -> -0.0
+ // ldexp(inf, x) -> inf
+ // ldexp(-inf, x) -> -inf
+ if (C && (C->isZero() || C->isInfinity())) {
+ return IC.replaceInstUsesWith(II, Op0);
+ }
+
+ // With strictfp, be more careful about possibly needing to flush denormals
+ // or not, and snan behavior depends on ieee_mode.
+ if (II.isStrictFP())
+ break;
+
+ if (C && C->isNaN()) {
+ // FIXME: We just need to make the nan quiet here, but that's unavailable
+ // on APFloat, only IEEEfloat
+ auto *Quieted =
+ ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
+ return IC.replaceInstUsesWith(II, Quieted);
+ }
+
+ // ldexp(x, 0) -> x
+ // ldexp(x, undef) -> x
+ if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
+ return IC.replaceInstUsesWith(II, Op0);
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_fmul_legacy: {
+ Value *Op0 = II.getArgOperand(0);
+ Value *Op1 = II.getArgOperand(1);
+
+ // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
+ // infinity, gives +0.0.
+ // TODO: Move to InstSimplify?
+ if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
+ match(Op1, PatternMatch::m_AnyZeroFP()))
+ return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));
+
+ // If we can prove we don't have one of the special cases then we can use a
+ // normal fmul instruction instead.
+ if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
+ auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
+ FMul->takeName(&II);
+ return IC.replaceInstUsesWith(II, FMul);
+ }
+ break;
+ }
+ case Intrinsic::amdgcn_fma_legacy: {
+ Value *Op0 = II.getArgOperand(0);
+ Value *Op1 = II.getArgOperand(1);
+ Value *Op2 = II.getArgOperand(2);
+
+ // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
+ // infinity, gives +0.0.
+ // TODO: Move to InstSimplify?
+ if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
+ match(Op1, PatternMatch::m_AnyZeroFP())) {
+ // It's tempting to just return Op2 here, but that would give the wrong
+ // result if Op2 was -0.0.
+ auto *Zero = ConstantFP::getNullValue(II.getType());
+ auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
+ FAdd->takeName(&II);
+ return IC.replaceInstUsesWith(II, FAdd);
+ }
+
+ // If we can prove we don't have one of the special cases then we can use a
+ // normal fma instead.
+ if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
+ II.setCalledOperand(Intrinsic::getDeclaration(
+ II.getModule(), Intrinsic::fma, II.getType()));
+ return &II;
+ }
+ break;
+ }
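A quick host-float demonstration of why the zero case above folds to 0.0 + Op2 rather than returning Op2 directly (sketch only, with an invented helper name; the sign of zero is the whole point):

// What the transform emits when one multiplicand is +/-0.0.
static float foldZeroTimesXPlusOp2(float Op2) {
  return 0.0f + Op2;
}
// For Op2 == -0.0f the sum is +0.0f, while returning Op2 unchanged would keep
// the negative sign, which is observable e.g. through std::copysign.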
+ default: {
+ if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+ AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
+ return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
+ }
+ }
+ }
+ return None;
+}
+
+/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
+///
+/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
+/// struct returns.
+static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
+ IntrinsicInst &II,
+ APInt DemandedElts,
+ int DMaskIdx = -1) {
+
+ auto *IIVTy = cast<FixedVectorType>(II.getType());
+ unsigned VWidth = IIVTy->getNumElements();
+ if (VWidth == 1)
+ return nullptr;
+
+ IRBuilderBase::InsertPointGuard Guard(IC.Builder);
+ IC.Builder.SetInsertPoint(&II);
+
+ // Assume the arguments are unchanged and later override them, if needed.
+ SmallVector<Value *, 16> Args(II.args());
+
+ if (DMaskIdx < 0) {
+ // Buffer case.
+
+ const unsigned ActiveBits = DemandedElts.getActiveBits();
+ const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();
+
+ // Start assuming the prefix of elements is demanded, but possibly clear
+ // some other bits if there are trailing zeros (unused components at front)
+ // and update offset.
+ DemandedElts = (1 << ActiveBits) - 1;
+
+ if (UnusedComponentsAtFront > 0) {
+ static const unsigned InvalidOffsetIdx = 0xf;
+
+ unsigned OffsetIdx;
+ switch (II.getIntrinsicID()) {
+ case Intrinsic::amdgcn_raw_buffer_load:
+ OffsetIdx = 1;
+ break;
+ case Intrinsic::amdgcn_s_buffer_load:
+ // If resulting type is vec3, there is no point in trimming the
+ // load with updated offset, as the vec3 would most likely be widened to
+ // vec4 anyway during lowering.
+ if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
+ OffsetIdx = InvalidOffsetIdx;
+ else
+ OffsetIdx = 1;
+ break;
+ case Intrinsic::amdgcn_struct_buffer_load:
+ OffsetIdx = 2;
+ break;
+ default:
+ // TODO: handle tbuffer* intrinsics.
+ OffsetIdx = InvalidOffsetIdx;
+ break;
+ }
+
+ if (OffsetIdx != InvalidOffsetIdx) {
+ // Clear demanded bits and update the offset.
+ DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
+ auto *Offset = II.getArgOperand(OffsetIdx);
+ unsigned SingleComponentSizeInBits =
+ IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
+ unsigned OffsetAdd =
+ UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
+ auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
+ Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
+ }
+ }
+ } else {
+ // Image case.
+
+ ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
+ unsigned DMaskVal = DMask->getZExtValue() & 0xf;
+
+ // Mask off values that are undefined because the dmask doesn't cover them
+ DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;
+
+ unsigned NewDMaskVal = 0;
+ unsigned OrigLoadIdx = 0;
+ for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
+ const unsigned Bit = 1 << SrcIdx;
+ if (!!(DMaskVal & Bit)) {
+ if (!!DemandedElts[OrigLoadIdx])
+ NewDMaskVal |= Bit;
+ OrigLoadIdx++;
+ }
+ }
+
+ if (DMaskVal != NewDMaskVal)
+ Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
+ }
+
+ unsigned NewNumElts = DemandedElts.countPopulation();
+ if (!NewNumElts)
+ return UndefValue::get(II.getType());
+
+ if (NewNumElts >= VWidth && DemandedElts.isMask()) {
+ if (DMaskIdx >= 0)
+ II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
+ return nullptr;
+ }
+
+ // Validate function argument and return types, extracting overloaded types
+ // along the way.
+ SmallVector<Type *, 6> OverloadTys;
+ if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
+ return nullptr;
+
+ Module *M = II.getParent()->getParent()->getParent();
+ Type *EltTy = IIVTy->getElementType();
+ Type *NewTy =
+ (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
+
+ OverloadTys[0] = NewTy;
+ Function *NewIntrin =
+ Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);
+
+ CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
+ NewCall->takeName(&II);
+ NewCall->copyMetadata(II);
+
+ if (NewNumElts == 1) {
+ return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
+ NewCall,
+ DemandedElts.countTrailingZeros());
+ }
+
+ SmallVector<int, 8> EltMask;
+ unsigned NewLoadIdx = 0;
+ for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
+ if (!!DemandedElts[OrigLoadIdx])
+ EltMask.push_back(NewLoadIdx++);
+ else
+ EltMask.push_back(NewNumElts);
+ }
+
+ Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
+
+ return Shuffle;
+}
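The dmask-trimming step in the image branch above can be isolated into a small standalone helper; this is a sketch of the same loop on plain integers (trimDMask is a hypothetical function, not part of the patch):

#include <cstdint>

// Keep a dmask bit only if the corresponding returned element is still
// demanded; DemandedElts has bit i set when element i of the result is used.
static unsigned trimDMask(unsigned DMaskVal, uint32_t DemandedElts) {
  unsigned NewDMask = 0, OrigLoadIdx = 0;
  for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
    unsigned Bit = 1u << SrcIdx;
    if (DMaskVal & Bit) {
      if (DemandedElts & (1u << OrigLoadIdx))
        NewDMask |= Bit;
      ++OrigLoadIdx;
    }
  }
  return NewDMask;
}
// e.g. trimDMask(0b1011, /*DemandedElts=*/0b001) == 0b0001: only the first
// returned component is used, so only the lowest enabled dmask bit survives.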
+
+Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
+ InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
+ APInt &UndefElts2, APInt &UndefElts3,
+ std::function<void(Instruction *, unsigned, APInt, APInt &)>
+ SimplifyAndSetOp) const {
+ switch (II.getIntrinsicID()) {
+ case Intrinsic::amdgcn_buffer_load:
+ case Intrinsic::amdgcn_buffer_load_format:
+ case Intrinsic::amdgcn_raw_buffer_load:
+ case Intrinsic::amdgcn_raw_buffer_load_format:
+ case Intrinsic::amdgcn_raw_tbuffer_load:
+ case Intrinsic::amdgcn_s_buffer_load:
+ case Intrinsic::amdgcn_struct_buffer_load:
+ case Intrinsic::amdgcn_struct_buffer_load_format:
+ case Intrinsic::amdgcn_struct_tbuffer_load:
+ case Intrinsic::amdgcn_tbuffer_load:
+ return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
+ default: {
+ if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
+ return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
+ }
+ break;
+ }
+ }
+ return None;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index 6c13bc8599db..f2d62956e25b 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -13,11 +13,11 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUInstrInfo.h"
-#include "AMDGPUTargetMachine.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "AMDGPU.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Value.h"
using namespace llvm;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index 61b78acad3f4..8e7a6a7029c6 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -15,9 +15,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H
-#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
namespace llvm {
@@ -25,6 +23,7 @@ class GCNSubtarget;
class MachineFunction;
class MachineInstr;
class MachineInstrBuilder;
+class MachineMemOperand;
class AMDGPUInstrInfo {
public:
@@ -52,6 +51,28 @@ struct ImageDimIntrinsicInfo {
unsigned Intr;
unsigned BaseOpcode;
MIMGDim Dim;
+
+ uint8_t NumGradients;
+ uint8_t NumDmask;
+ uint8_t NumData;
+ uint8_t NumVAddrs;
+ uint8_t NumArgs;
+
+ uint8_t DMaskIndex;
+ uint8_t VAddrStart;
+ uint8_t GradientStart;
+ uint8_t CoordStart;
+ uint8_t LodIndex;
+ uint8_t MipIndex;
+ uint8_t VAddrEnd;
+ uint8_t RsrcIndex;
+ uint8_t SampIndex;
+ uint8_t UnormIndex;
+ uint8_t TexFailCtrlIndex;
+ uint8_t CachePolicyIndex;
+
+ uint8_t GradientTyArg;
+ uint8_t CoordTyArg;
};
const ImageDimIntrinsicInfo *getImageDimIntrinsicInfo(unsigned Intr);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 2025c0fa5d21..bd577a6fb8c5 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -12,27 +12,17 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUInstructionSelector.h"
-#include "AMDGPUInstrInfo.h"
+#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
+#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
-#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
-#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
-#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
-#include "llvm/CodeGen/GlobalISel/Utils.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Type.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/IR/DiagnosticInfo.h"
#define DEBUG_TYPE "amdgpu-isel"
@@ -72,13 +62,15 @@ const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
CodeGenCoverage &CoverageInfo) {
MRI = &MF.getRegInfo();
+ Subtarget = &MF.getSubtarget<GCNSubtarget>();
InstructionSelector::setupMF(MF, KB, CoverageInfo);
}
bool AMDGPUInstructionSelector::isVCC(Register Reg,
const MachineRegisterInfo &MRI) const {
- if (Register::isPhysicalRegister(Reg))
- return Reg == TRI.getVCC();
+ // The verifier is oblivious to s1 being a valid value for wavesize registers.
+ if (Reg.isPhysical())
+ return false;
auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
const TargetRegisterClass *RC =
@@ -170,24 +162,11 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
return false;
- // Don't constrain the source register to a class so the def instruction
- // handles it (unless it's undef).
- //
- // FIXME: This is a hack. When selecting the def, we need to know
- // specifically that the result is VCCRegBank, and not just an SGPR
- // with size 1. An SReg_32 with size 1 is ambiguous with wave32.
- if (Src.isUndef()) {
- const TargetRegisterClass *SrcRC =
- TRI.getConstrainedRegClassForOperand(Src, *MRI);
- if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
- return false;
- }
-
return true;
}
for (const MachineOperand &MO : I.operands()) {
- if (Register::isPhysicalRegister(MO.getReg()))
+ if (MO.getReg().isPhysical())
continue;
const TargetRegisterClass *RC =
@@ -286,50 +265,24 @@ static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
}
bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
- MachineOperand &Dst = I.getOperand(0);
- MachineOperand &Src0 = I.getOperand(1);
- MachineOperand &Src1 = I.getOperand(2);
- Register DstReg = Dst.getReg();
+ Register DstReg = I.getOperand(0).getReg();
unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
- if (DstRB->getID() == AMDGPU::VCCRegBankID) {
- const TargetRegisterClass *RC = TRI.getBoolRC();
- unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(),
- RC == &AMDGPU::SReg_64RegClass);
- I.setDesc(TII.get(InstOpc));
- // Dead implicit-def of scc
- I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
- true, // isImp
- false, // isKill
- true)); // isDead
-
- // FIXME: Hack to avoid turning the register bank into a register class.
- // The selector for G_ICMP relies on seeing the register bank for the result
- // is VCC. In wave32 if we constrain the registers to SReg_32 here, it will
- // be ambiguous whether it's a scalar or vector bool.
- if (Src0.isUndef() && !MRI->getRegClassOrNull(Src0.getReg()))
- MRI->setRegClass(Src0.getReg(), RC);
- if (Src1.isUndef() && !MRI->getRegClassOrNull(Src1.getReg()))
- MRI->setRegClass(Src1.getReg(), RC);
-
- return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
- }
-
- // TODO: Should this allow an SCC bank result, and produce a copy from SCC for
- // the result?
- if (DstRB->getID() == AMDGPU::SGPRRegBankID) {
- unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32);
- I.setDesc(TII.get(InstOpc));
- // Dead implicit-def of scc
- I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
- true, // isImp
- false, // isKill
- true)); // isDead
- return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
- }
+ if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
+ DstRB->getID() != AMDGPU::VCCRegBankID)
+ return false;
- return false;
+ bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
+ STI.isWave64());
+ I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
+
+ // Dead implicit-def of scc
+ I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
+ true, // isImp
+ false, // isKill
+ true)); // isDead
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
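
A standalone restatement of the width choice above, with illustrative names (not part of the patch): a VCC-bank boolean is really a per-lane execution mask, so its width follows the wave size rather than the nominal s1 type.

    static bool useB64LogicalOpcode(unsigned SizeInBits, bool IsVCCBank,
                                    bool IsWave64) {
      // Plain scalars pick the 32- vs 64-bit opcode by their own size; lane
      // masks pick it by wave size (a wave64 mask occupies 64 bits).
      return SizeInBits > 32 || (IsVCCBank && IsWave64);
    }
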
bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
@@ -365,7 +318,7 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
- const unsigned Opc = Sub ? AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64;
+ const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
MachineInstr *Add
@@ -403,7 +356,7 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
} else {
const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
Register CarryReg = MRI->createVirtualRegister(CarryRC);
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo)
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
.addDef(CarryReg)
.add(Lo1)
.add(Lo2)
@@ -446,10 +399,8 @@ bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
I.getOpcode() == AMDGPU::G_USUBE;
if (isVCC(Dst1Reg, *MRI)) {
- // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
- // carry out despite the _i32 name. These were renamed in VI to _U32.
- // FIXME: We should probably rename the opcodes here.
- unsigned NoCarryOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+ unsigned NoCarryOpc =
+ IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
@@ -597,8 +548,6 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
return false;
- const unsigned SrcFlags = getUndefRegState(Src.isUndef());
-
// Note we could have mixed SGPR and VGPR destination banks for an SGPR
// source, and this relies on the fact that the same subregister indices are
// used for both.
@@ -606,7 +555,12 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
for (int I = 0, E = NumDst; I != E; ++I) {
MachineOperand &Dst = MI.getOperand(I);
BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
- .addReg(SrcReg, SrcFlags, SubRegs[I]);
+ .addReg(SrcReg, 0, SubRegs[I]);
+
+ // Make sure the subregister index is valid for the source register.
+ SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
+ if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
+ return false;
const TargetRegisterClass *DstRC =
TRI.getConstrainedRegClassForOperand(Dst, *MRI);
@@ -618,11 +572,6 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
return true;
}
-static bool isZero(Register Reg, const MachineRegisterInfo &MRI) {
- int64_t Val;
- return mi_match(Reg, MRI, m_ICst(Val)) && Val == 0;
-}
-
bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
MachineInstr &MI) const {
if (selectImpl(MI, *CoverageInfo))
@@ -647,6 +596,24 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock *BB = MI.getParent();
+ auto ConstSrc1 =
+ getConstantVRegValWithLookThrough(Src1, *MRI, true, true, true);
+ if (ConstSrc1) {
+ auto ConstSrc0 =
+ getConstantVRegValWithLookThrough(Src0, *MRI, true, true, true);
+ if (ConstSrc0) {
+ const int64_t K0 = ConstSrc0->Value.getSExtValue();
+ const int64_t K1 = ConstSrc1->Value.getSExtValue();
+ uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
+ uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
+
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
+ .addImm(Lo16 | (Hi16 << 16));
+ MI.eraseFromParent();
+ return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
+ }
+ }
+
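
The constant-fold case above simply materializes the packed v2i16 value as one 32-bit immediate for S_MOV_B32. A minimal standalone sketch of that packing (function name is illustrative, not part of the patch):

    #include <cstdint>

    static uint32_t packV2I16(int64_t K0, int64_t K1) {
      // Element 0 lands in bits [15:0], element 1 in bits [31:16], matching
      // the Lo16 | (Hi16 << 16) computation above.
      uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
      return Lo16 | (Hi16 << 16);
    }
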
// TODO: This should probably be a combine somewhere
// (build_vector_trunc $src0, undef -> copy $src0
MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
@@ -659,7 +626,6 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
Register ShiftSrc0;
Register ShiftSrc1;
- int64_t ShiftAmt;
// With multiple uses of the shift, this will duplicate the shift and
// increase register pressure.
@@ -671,14 +637,11 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
// (build_vector_trunc $src0, $src1)
// => (S_PACK_LL_B32_B16 $src0, $src1)
- // FIXME: This is an inconvenient way to check a specific value
bool Shift0 = mi_match(
- Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) &&
- ShiftAmt == 16;
+ Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
bool Shift1 = mi_match(
- Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) &&
- ShiftAmt == 16;
+ Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
if (Shift0 && Shift1) {
@@ -688,7 +651,7 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
} else if (Shift1) {
Opc = AMDGPU::S_PACK_LH_B32_B16;
MI.getOperand(2).setReg(ShiftSrc1);
- } else if (Shift0 && isZero(Src1, *MRI)) {
+ } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
// build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
.addReg(ShiftSrc0)
@@ -738,6 +701,10 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
if (Offset % 32 != 0 || InsSize % 32 != 0)
return false;
+ // Currently not handled by getSubRegFromChannel.
+ if (InsSize > 128)
+ return false;
+
unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
if (SubReg == AMDGPU::NoSubRegister)
return false;
@@ -821,6 +788,63 @@ bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
return true;
}
+// Writelane is special in that it can use an SGPR and M0 (which would normally
+// count as using the constant bus twice - but in this case it is allowed since
+// the lane selector doesn't count as a use of the constant bus). However, it is
+// still required to abide by the one-SGPR rule. Fix this up if we might have
+// multiple SGPRs.
+bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
+ // With a constant bus limit of at least 2, there's no issue.
+ if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
+ return selectImpl(MI, *CoverageInfo);
+
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ Register VDst = MI.getOperand(0).getReg();
+ Register Val = MI.getOperand(2).getReg();
+ Register LaneSelect = MI.getOperand(3).getReg();
+ Register VDstIn = MI.getOperand(4).getReg();
+
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
+
+ Optional<ValueAndVReg> ConstSelect =
+ getConstantVRegValWithLookThrough(LaneSelect, *MRI, true, true);
+ if (ConstSelect) {
+ // The selector has to be an inline immediate, so we can use whatever for
+ // the other operands.
+ MIB.addReg(Val);
+ MIB.addImm(ConstSelect->Value.getSExtValue() &
+ maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
+ } else {
+ Optional<ValueAndVReg> ConstVal =
+ getConstantVRegValWithLookThrough(Val, *MRI, true, true);
+
+ // If the value written is an inline immediate, we can get away without a
+ // copy to m0.
+ if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
+ STI.hasInv2PiInlineImm())) {
+ MIB.addImm(ConstVal->Value.getSExtValue());
+ MIB.addReg(LaneSelect);
+ } else {
+ MIB.addReg(Val);
+
+ // If the lane selector was originally in a VGPR and copied with
+ // readfirstlane, there's a hazard when the VALU then reads that same
+ // SGPR. Constrain to a different SGPR to help avoid needing a nop later.
+ RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
+
+ BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(LaneSelect);
+ MIB.addReg(AMDGPU::M0);
+ }
+ }
+
+ MIB.addReg(VDstIn);
+
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
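
In the constant lane-select path above, the mask keeps only the low log2(wave size) bits so the operand stays a legal inline immediate. A standalone equivalent of that maskTrailingOnes computation (helper name is illustrative):

    #include <cstdint>

    static uint64_t clampLaneSelect(int64_t LaneSelect,
                                    unsigned WavefrontSizeLog2) {
      // Keep 5 bits on wave32, 6 bits on wave64, i.e. a lane index in
      // [0, wavesize), which is always a legal inline immediate.
      uint64_t Mask = (1ull << WavefrontSizeLog2) - 1;
      return static_cast<uint64_t>(LaneSelect) & Mask;
    }
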
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
@@ -830,12 +854,14 @@ bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
LLT Ty = MRI->getType(Dst0);
unsigned Opc;
if (Ty == LLT::scalar(32))
- Opc = AMDGPU::V_DIV_SCALE_F32;
+ Opc = AMDGPU::V_DIV_SCALE_F32_e64;
else if (Ty == LLT::scalar(64))
- Opc = AMDGPU::V_DIV_SCALE_F64;
+ Opc = AMDGPU::V_DIV_SCALE_F64_e64;
else
return false;
+ // TODO: Match source modifiers.
+
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock *MBB = MI.getParent();
@@ -847,9 +873,14 @@ bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
.addDef(Dst1)
- .addUse(Src0)
- .addUse(Denom)
- .addUse(Numer);
+ .addImm(0) // $src0_modifiers
+ .addUse(Src0) // $src0
+ .addImm(0) // $src1_modifiers
+ .addUse(Denom) // $src1
+ .addImm(0) // $src2_modifiers
+ .addUse(Numer) // $src2
+ .addImm(0) // $clamp
+ .addImm(0); // $omod
MI.eraseFromParent();
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
@@ -887,12 +918,20 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
case Intrinsic::amdgcn_wwm:
return constrainCopyLikeIntrin(I, AMDGPU::WWM);
+ case Intrinsic::amdgcn_writelane:
+ return selectWritelane(I);
case Intrinsic::amdgcn_div_scale:
return selectDivScale(I);
case Intrinsic::amdgcn_icmp:
return selectIntrinsicIcmp(I);
case Intrinsic::amdgcn_ballot:
return selectBallot(I);
+ case Intrinsic::amdgcn_reloc_constant:
+ return selectRelocConstant(I);
+ case Intrinsic::amdgcn_groupstaticsize:
+ return selectGroupStaticSize(I);
+ case Intrinsic::returnaddress:
+ return selectReturnAddress(I);
default:
return selectImpl(I, *CoverageInfo);
}
@@ -1055,7 +1094,7 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);
if (Arg.hasValue()) {
- const int64_t Value = Arg.getValue().Value;
+ const int64_t Value = Arg.getValue().Value.getSExtValue();
if (Value == 0) {
unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
@@ -1073,6 +1112,96 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
return true;
}
+bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
+ Register DstReg = I.getOperand(0).getReg();
+ const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
+ const TargetRegisterClass *DstRC =
+ TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
+ if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
+ return false;
+
+ const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
+
+ Module *M = MF->getFunction().getParent();
+ const MDNode *Metadata = I.getOperand(2).getMetadata();
+ auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
+ auto RelocSymbol = cast<GlobalVariable>(
+ M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
+
+ MachineBasicBlock *BB = I.getParent();
+ BuildMI(*BB, &I, I.getDebugLoc(),
+ TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
+ .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
+
+ I.eraseFromParent();
+ return true;
+}
+
+bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
+ Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
+
+ Register DstReg = I.getOperand(0).getReg();
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+ unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
+ AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
+
+ MachineBasicBlock *MBB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+
+ auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
+
+ if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
+ const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ MIB.addImm(MFI->getLDSSize());
+ } else {
+ Module *M = MF->getFunction().getParent();
+ const GlobalValue *GV
+ = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
+ MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
+ }
+
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
+bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
+ MachineBasicBlock *MBB = I.getParent();
+ MachineFunction &MF = *MBB->getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+
+ MachineOperand &Dst = I.getOperand(0);
+ Register DstReg = Dst.getReg();
+ unsigned Depth = I.getOperand(2).getImm();
+
+ const TargetRegisterClass *RC
+ = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
+ if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
+ !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
+ return false;
+
+ // Check for kernel and shader functions
+ if (Depth != 0 ||
+ MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
+ .addImm(0);
+ I.eraseFromParent();
+ return true;
+ }
+
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ // There is a call to @llvm.returnaddress in this function
+ MFI.setReturnAddressIsTaken(true);
+
+ // Get the return address reg and mark it as an implicit live-in
+ Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
+ Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
+ AMDGPU::SReg_64RegClass);
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
+ .addReg(LiveIn);
+ I.eraseFromParent();
+ return true;
+}
+
bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
// FIXME: Manually selecting to avoid dealing with the SReg_1 trick
// SelectionDAG uses for wave32 vs wave64.
@@ -1088,28 +1217,6 @@ bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
return true;
}
-static unsigned getDSShaderTypeValue(const MachineFunction &MF) {
- switch (MF.getFunction().getCallingConv()) {
- case CallingConv::AMDGPU_PS:
- return 1;
- case CallingConv::AMDGPU_VS:
- return 2;
- case CallingConv::AMDGPU_GS:
- return 3;
- case CallingConv::AMDGPU_HS:
- case CallingConv::AMDGPU_LS:
- case CallingConv::AMDGPU_ES:
- report_fatal_error("ds_ordered_count unsupported for this calling conv");
- case CallingConv::AMDGPU_CS:
- case CallingConv::AMDGPU_KERNEL:
- case CallingConv::C:
- case CallingConv::Fast:
- default:
- // Assume other calling conventions are various compute callable functions
- return 0;
- }
-}
-
bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
MachineInstr &MI, Intrinsic::ID IntrID) const {
MachineBasicBlock *MBB = MI.getParent();
@@ -1141,7 +1248,7 @@ bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
report_fatal_error("ds_ordered_count: bad index operand");
unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
- unsigned ShaderType = getDSShaderTypeValue(*MF);
+ unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
unsigned Offset0 = OrderedCountIndex << 2;
unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
@@ -1235,8 +1342,8 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.addImm(0);
} else {
- std::tie(BaseOffset, ImmOffset, OffsetDef)
- = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
+ std::tie(BaseOffset, ImmOffset) =
+ AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
if (Readfirstlane) {
// We have the constant offset now, so put the readfirstlane back on the
@@ -1274,7 +1381,6 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
}
MIB.addImm(ImmOffset)
- .addImm(-1) // $gds
.cloneMemRefs(MI);
MI.eraseFromParent();
@@ -1291,7 +1397,7 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
// TODO: Should this try to look through readfirstlane like GWS?
- if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
+ if (!isDSOffsetLegal(PtrBase, Offset)) {
PtrBase = MI.getOperand(2).getReg();
Offset = 0;
}
@@ -1302,12 +1408,29 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
.addReg(PtrBase);
- BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
+ if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
+ return false;
+
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
.addImm(Offset)
.addImm(IsGDS ? -1 : 0)
.cloneMemRefs(MI);
MI.eraseFromParent();
- return true;
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
+bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
+ if (TM.getOptLevel() > CodeGenOpt::None) {
+ unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
+ if (WGSize <= STI.getWavefrontSize()) {
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
+ MI.eraseFromParent();
+ return true;
+ }
+ }
+ return selectImpl(MI, *CoverageInfo);
}
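
The guard in selectSBarrier can be read in isolation: when the maximum flat workgroup size fits in a single wave there are no other waves to synchronize with, so a compiler-level wave barrier is enough. A trivial standalone restatement (names are illustrative):

    static bool barrierOnlyNeedsWaveScope(unsigned MaxFlatWorkGroupSize,
                                          unsigned WavefrontSize) {
      // Every thread of the workgroup executes in one wave, so s_barrier can
      // be relaxed to a scheduling-only wave barrier.
      return MaxFlatWorkGroupSize <= WavefrontSize;
    }
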
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
@@ -1355,36 +1478,29 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
unsigned IntrOpcode = Intr->BaseOpcode;
- const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10;
+ const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
- const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode,
- MI.getNumExplicitDefs());
- int NumVAddr, NumGradients;
- std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode);
+ const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
Register VDataIn, VDataOut;
LLT VDataTy;
int NumVDataDwords = -1;
bool IsD16 = false;
- // XXX - Can we just get the second to last argument for ctrl?
- unsigned CtrlIdx; // Index of texfailctrl argument
bool Unorm;
- if (!BaseOpcode->Sampler) {
+ if (!BaseOpcode->Sampler)
Unorm = true;
- CtrlIdx = VAddrIdx + NumVAddr + 1;
- } else {
- Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0;
- CtrlIdx = VAddrIdx + NumVAddr + 3;
- }
+ else
+ Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
bool TFE;
bool LWE;
bool IsTexFail = false;
- if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail))
+ if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
+ TFE, LWE, IsTexFail))
return false;
- const int Flags = MI.getOperand(CtrlIdx + 2).getImm();
+ const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
const bool IsA16 = (Flags & 1) != 0;
const bool IsG16 = (Flags & 2) != 0;
@@ -1415,11 +1531,19 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
NumVDataDwords = Is64Bit ? 2 : 1;
}
} else {
- const int DMaskIdx = 2; // Input/output + intrinsic ID.
-
- DMask = MI.getOperand(DMaskIdx).getImm();
+ DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
+ // One memoperand is mandatory, except for getresinfo.
+ // FIXME: Check this in verifier.
+ if (!MI.memoperands_empty()) {
+ const MachineMemOperand *MMO = *MI.memoperands_begin();
+
+ // Infer d16 from the memory size, as the register type will be mangled by
+ // unpacked subtargets, or by TFE.
+ IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
+ }
+
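
The d16 inference above divides the size of the memory access by the number of enabled dmask lanes; if a lane covers fewer than 32 bits, the data must be packed 16-bit. A standalone sketch (names are illustrative; DMaskLanes is assumed non-zero, as in the selector):

    #include <cstdint>

    static bool inferD16FromMemSize(uint64_t MemSizeInBytes, unsigned DMaskLanes) {
      // Bits of memory per enabled channel; below 32 implies d16 data.
      return (8 * MemSizeInBytes) / DMaskLanes < 32;
    }
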
if (BaseOpcode->Store) {
VDataIn = MI.getOperand(1).getReg();
VDataTy = MRI->getType(VDataIn);
@@ -1429,18 +1553,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
VDataTy = MRI->getType(VDataOut);
NumVDataDwords = DMaskLanes;
- // One memoperand is mandatory, except for getresinfo.
- // FIXME: Check this in verifier.
- if (!MI.memoperands_empty()) {
- const MachineMemOperand *MMO = *MI.memoperands_begin();
-
- // Infer d16 from the memory size, as the register type will be mangled by
- // unpacked subtargets, or by TFE.
- IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
-
- if (IsD16 && !STI.hasUnpackedD16VMem())
- NumVDataDwords = (DMaskLanes + 1) / 2;
- }
+ if (IsD16 && !STI.hasUnpackedD16VMem())
+ NumVDataDwords = (DMaskLanes + 1) / 2;
}
}
@@ -1448,7 +1562,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
if (LZMappingInfo) {
// The legalizer replaced the register with an immediate 0 if we need to
// change the opcode.
- const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
+ const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->LodIndex);
if (Lod.isImm()) {
assert(Lod.getImm() == 0);
IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
@@ -1457,7 +1571,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
// Optimize _mip away, when 'lod' is zero
if (MIPMappingInfo) {
- const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
+ const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->MipIndex);
if (Lod.isImm()) {
assert(Lod.getImm() == 0);
IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
@@ -1480,20 +1594,22 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
bool DLC = false;
if (BaseOpcode->Atomic) {
GLC = true; // TODO no-return optimization
- if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC,
- IsGFX10 ? &DLC : nullptr))
+ if (!parseCachePolicy(
+ MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), nullptr,
+ &SLC, IsGFX10Plus ? &DLC : nullptr))
return false;
} else {
- if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC,
- IsGFX10 ? &DLC : nullptr))
+ if (!parseCachePolicy(
+ MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), &GLC,
+ &SLC, IsGFX10Plus ? &DLC : nullptr))
return false;
}
int NumVAddrRegs = 0;
int NumVAddrDwords = 0;
- for (int I = 0; I < NumVAddr; ++I) {
+ for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
// Skip the $noregs and 0s inserted during legalization.
- MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I);
+ MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
if (!AddrOp.isReg())
continue; // XXX - Break?
@@ -1518,7 +1634,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
++NumVDataDwords;
int Opcode = -1;
- if (IsGFX10) {
+ if (IsGFX10Plus) {
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
UseNSA ? AMDGPU::MIMGEncGfx10NSA
: AMDGPU::MIMGEncGfx10Default,
@@ -1556,36 +1672,36 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
if (VDataIn)
MIB.addReg(VDataIn); // vdata input
- for (int i = 0; i != NumVAddrRegs; ++i) {
- MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i);
+ for (int I = 0; I != NumVAddrRegs; ++I) {
+ MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
if (SrcOp.isReg()) {
assert(SrcOp.getReg() != 0);
MIB.addReg(SrcOp.getReg());
}
}
- MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc
+ MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
if (BaseOpcode->Sampler)
- MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler
+ MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
MIB.addImm(DMask); // dmask
- if (IsGFX10)
+ if (IsGFX10Plus)
MIB.addImm(DimInfo->Encoding);
MIB.addImm(Unorm);
- if (IsGFX10)
+ if (IsGFX10Plus)
MIB.addImm(DLC);
MIB.addImm(GLC);
MIB.addImm(SLC);
MIB.addImm(IsA16 && // a16 or r128
STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
- if (IsGFX10)
+ if (IsGFX10Plus)
MIB.addImm(IsA16 ? -1 : 0);
MIB.addImm(TFE); // tfe
MIB.addImm(LWE); // lwe
- if (!IsGFX10)
+ if (!IsGFX10Plus)
MIB.addImm(DimInfo->DA ? -1 : 0);
if (BaseOpcode->HasD16)
MIB.addImm(IsD16 ? -1 : 0);
@@ -1614,6 +1730,10 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
return selectDSAppendConsume(I, true);
case Intrinsic::amdgcn_ds_consume:
return selectDSAppendConsume(I, false);
+ case Intrinsic::amdgcn_s_barrier:
+ return selectSBarrier(I);
+ case Intrinsic::amdgcn_global_atomic_fadd:
+ return selectGlobalAtomicFaddIntrinsic(I);
default: {
return selectImpl(I, *CoverageInfo);
}
@@ -1670,11 +1790,6 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
return Ret;
}
-bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
- initM0(I);
- return selectImpl(I, *CoverageInfo);
-}
-
static int sizeToSubRegIndex(unsigned Size) {
switch (Size) {
case 32:
@@ -1853,12 +1968,33 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
if (!DstTy.isScalar())
return false;
- if (I.getOpcode() == AMDGPU::G_ANYEXT)
- return selectCOPY(I);
-
// Artifact casts should never use vcc.
const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
+ // FIXME: This should probably be illegal and split earlier.
+ if (I.getOpcode() == AMDGPU::G_ANYEXT) {
+ if (DstSize <= 32)
+ return selectCOPY(I);
+
+ const TargetRegisterClass *SrcRC =
+ TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
+ const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
+ const TargetRegisterClass *DstRC =
+ TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
+
+ Register UndefReg = MRI->createVirtualRegister(SrcRC);
+ BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
+ BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
+ .addReg(SrcReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(UndefReg)
+ .addImm(AMDGPU::sub1);
+ I.eraseFromParent();
+
+ return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
+ RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
+ }
+
if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
// 64-bit should have been split up in RegBankSelect
@@ -1873,7 +2009,7 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
}
- const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32;
+ const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
MachineInstr *ExtI =
BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
.addReg(SrcReg)
@@ -1944,33 +2080,36 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineOperand &ImmOp = I.getOperand(1);
+ Register DstReg = I.getOperand(0).getReg();
+ unsigned Size = MRI->getType(DstReg).getSizeInBits();
// The AMDGPU backend only supports Imm operands and not CImm or FPImm.
if (ImmOp.isFPImm()) {
const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
ImmOp.ChangeToImmediate(Imm.getZExtValue());
} else if (ImmOp.isCImm()) {
- ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue());
+ ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
+ } else {
+ llvm_unreachable("Not supported by g_constants");
}
- Register DstReg = I.getOperand(0).getReg();
- unsigned Size;
- bool IsSgpr;
- const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg());
- if (RB) {
- IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID;
- Size = MRI->getType(DstReg).getSizeInBits();
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+ const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
+
+ unsigned Opcode;
+ if (DstRB->getID() == AMDGPU::VCCRegBankID) {
+ Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
} else {
- const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg);
- IsSgpr = TRI.isSGPRClass(RC);
- Size = TRI.getRegSizeInBits(*RC);
- }
+ Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
- if (Size != 32 && Size != 64)
- return false;
+ // We should never produce s1 values on banks other than VCC. If the user of
+ // this already constrained the register, we may incorrectly think it's VCC
+ // if it wasn't originally.
+ if (Size == 1)
+ return false;
+ }
- unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
- if (Size == 32) {
+ if (Size != 64) {
I.setDesc(TII.get(Opcode));
I.addImplicitDefUseOperands(*MF);
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
@@ -2148,6 +2287,10 @@ void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
getAddrModeInfo(*PtrMI, MRI, AddrInfo);
}
+bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
+ return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
+}
+
bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
if (!MI.hasOneMemOperand())
return false;
@@ -2179,19 +2322,20 @@ bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
}
void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
- MachineBasicBlock *BB = I.getParent();
-
const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
unsigned AS = PtrTy.getAddressSpace();
if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
STI.ldsRequiresM0Init()) {
+ MachineBasicBlock *BB = I.getParent();
+
// If DS instructions require M0 initialization, insert it before selecting.
BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.addImm(-1);
}
}
-bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const {
+bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
+ MachineInstr &I) const {
initM0(I);
return selectImpl(I, *CoverageInfo);
}
@@ -2242,6 +2386,7 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
MIB.addImm(0);
MIB.addImm(Offset);
+ MIB.addImm(1); // glc
MIB.addImm(0); // slc
MIB.cloneMemRefs(MI);
@@ -2276,8 +2421,7 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
CondPhysReg = AMDGPU::SCC;
BrOpcode = AMDGPU::S_CBRANCH_SCC1;
- // FIXME: Hack for isSCC tests
- ConstrainRC = &AMDGPU::SGPR_32RegClass;
+ ConstrainRC = &AMDGPU::SReg_32RegClass;
} else {
// FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
// We sort of know that a VCC producer based on the register bank, that ands
@@ -2301,7 +2445,7 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
return true;
}
-bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE(
+bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
MachineInstr &I) const {
Register DstReg = I.getOperand(0).getReg();
const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
@@ -2422,10 +2566,8 @@ computeIndirectRegIndex(MachineRegisterInfo &MRI,
unsigned EltSize) {
Register IdxBaseReg;
int Offset;
- MachineInstr *Unused;
- std::tie(IdxBaseReg, Offset, Unused)
- = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
+ std::tie(IdxBaseReg, Offset) = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
if (IdxBaseReg == AMDGPU::NoRegister) {
// This will happen if the index is a known constant. This should ordinarily
// be legalized out, but handle it as a register just in case.
@@ -2501,20 +2643,18 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
.addReg(IdxReg);
BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
- .addReg(SrcReg, RegState::Undef, SubReg)
+ .addReg(SrcReg, 0, SubReg)
.addReg(SrcReg, RegState::Implicit);
MI.eraseFromParent();
return true;
}
- BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
- .addReg(IdxReg)
- .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
- BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg)
- .addReg(SrcReg, RegState::Undef, SubReg)
- .addReg(SrcReg, RegState::Implicit)
- .addReg(AMDGPU::M0, RegState::Implicit);
- BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
+ const MCInstrDesc &GPRIDXDesc =
+ TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
+ BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
+ .addReg(SrcReg)
+ .addReg(IdxReg)
+ .addImm(SubReg);
MI.eraseFromParent();
return true;
@@ -2568,25 +2708,27 @@ bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
MachineBasicBlock *BB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
- if (IndexMode) {
- BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
- .addReg(IdxReg)
- .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
- } else {
+ if (!IndexMode) {
BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
.addReg(IdxReg);
- }
- const MCInstrDesc &RegWriteOp
- = TII.getIndirectRegWritePseudo(VecSize, ValSize,
- VecRB->getID() == AMDGPU::SGPRRegBankID);
- BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
- .addReg(VecReg)
- .addReg(ValReg)
- .addImm(SubReg);
+ const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
+ VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
+ BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
+ .addReg(VecReg)
+ .addReg(ValReg)
+ .addImm(SubReg);
+ MI.eraseFromParent();
+ return true;
+ }
- if (IndexMode)
- BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
+ const MCInstrDesc &GPRIDXDesc =
+ TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
+ BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
+ .addReg(VecReg)
+ .addReg(ValReg)
+ .addReg(IdxReg)
+ .addImm(SubReg);
MI.eraseFromParent();
return true;
@@ -2731,7 +2873,7 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
}
} else if (Mask[0] == 1 && Mask[1] == 0) {
if (IsVALU) {
- BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg)
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32_e64), DstReg)
.addReg(SrcVec)
.addReg(SrcVec)
.addImm(16);
@@ -2751,6 +2893,130 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
return true;
}
+bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
+ MachineInstr &MI) const {
+
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
+ Function &F = MBB->getParent()->getFunction();
+ DiagnosticInfoUnsupported
+ NoFpRet(F, "return versions of fp atomics not supported",
+ MI.getDebugLoc(), DS_Error);
+ F.getContext().diagnose(NoFpRet);
+ return false;
+ }
+
+ // FIXME: This is only needed because tablegen requires the number of dst
+ // operands in the match and replace patterns to be the same. Otherwise the
+ // patterns could be exported from the SDag path.
+ MachineOperand &VDataIn = MI.getOperand(1);
+ MachineOperand &VIndex = MI.getOperand(3);
+ MachineOperand &VOffset = MI.getOperand(4);
+ MachineOperand &SOffset = MI.getOperand(5);
+ int16_t Offset = MI.getOperand(6).getImm();
+
+ bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI);
+ bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI);
+
+ unsigned Opcode;
+ if (HasVOffset) {
+ Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN
+ : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN;
+ } else {
+ Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN
+ : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET;
+ }
+
+ if (MRI->getType(VDataIn.getReg()).isVector()) {
+ switch (Opcode) {
+ case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN:
+ Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN;
+ break;
+ case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN:
+ Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN;
+ break;
+ case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN:
+ Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN;
+ break;
+ case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET:
+ Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET;
+ break;
+ }
+ }
+
+ auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode));
+ I.add(VDataIn);
+
+ if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN ||
+ Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) {
+ Register IdxReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
+ .addReg(VIndex.getReg())
+ .addImm(AMDGPU::sub0)
+ .addReg(VOffset.getReg())
+ .addImm(AMDGPU::sub1);
+
+ I.addReg(IdxReg);
+ } else if (HasVIndex) {
+ I.add(VIndex);
+ } else if (HasVOffset) {
+ I.add(VOffset);
+ }
+
+ I.add(MI.getOperand(2)); // rsrc
+ I.add(SOffset);
+ I.addImm(Offset);
+ renderExtractSLC(I, MI, 7);
+ I.cloneMemRefs(MI);
+
+ MI.eraseFromParent();
+
+ return true;
+}
+
+bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic(
+ MachineInstr &MI) const {
+
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
+ Function &F = MBB->getParent()->getFunction();
+ DiagnosticInfoUnsupported
+ NoFpRet(F, "return versions of fp atomics not supported",
+ MI.getDebugLoc(), DS_Error);
+ F.getContext().diagnose(NoFpRet);
+ return false;
+ }
+
+ // FIXME: This is only needed because tablegen requires the number of dst
+ // operands in the match and replace patterns to be the same. Otherwise the
+ // patterns could be exported from the SDag path.
+ auto Addr = selectFlatOffsetImpl<true>(MI.getOperand(2));
+
+ Register Data = MI.getOperand(3).getReg();
+ const unsigned Opc = MRI->getType(Data).isVector() ?
+ AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32;
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
+ .addReg(Addr.first)
+ .addReg(Data)
+ .addImm(Addr.second)
+ .addImm(0) // SLC
+ .cloneMemRefs(MI);
+
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
+bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
+ MI.setDesc(TII.get(MI.getOperand(1).getImm()));
+ MI.RemoveOperand(1);
+ MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
+ return true;
+}
+
bool AMDGPUInstructionSelector::select(MachineInstr &I) {
if (I.isPHI())
return selectPHI(I);
@@ -2807,6 +3073,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return selectG_PTR_ADD(I);
case TargetOpcode::G_IMPLICIT_DEF:
return selectG_IMPLICIT_DEF(I);
+ case TargetOpcode::G_FREEZE:
+ return selectCOPY(I);
case TargetOpcode::G_INSERT:
return selectG_INSERT(I);
case TargetOpcode::G_INTRINSIC:
@@ -2818,6 +3086,7 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return true;
return selectImpl(I, *CoverageInfo);
case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_STORE:
case TargetOpcode::G_ATOMIC_CMPXCHG:
case TargetOpcode::G_ATOMICRMW_XCHG:
case TargetOpcode::G_ATOMICRMW_ADD:
@@ -2830,13 +3099,15 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_ATOMICRMW_UMIN:
case TargetOpcode::G_ATOMICRMW_UMAX:
case TargetOpcode::G_ATOMICRMW_FADD:
- return selectG_LOAD_ATOMICRMW(I);
+ case AMDGPU::G_AMDGPU_ATOMIC_INC:
+ case AMDGPU::G_AMDGPU_ATOMIC_DEC:
+ case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
+ case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
+ return selectG_LOAD_STORE_ATOMICRMW(I);
case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
case TargetOpcode::G_SELECT:
return selectG_SELECT(I);
- case TargetOpcode::G_STORE:
- return selectG_STORE(I);
case TargetOpcode::G_TRUNC:
return selectG_TRUNC(I);
case TargetOpcode::G_SEXT:
@@ -2848,9 +3119,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return selectG_SZA_EXT(I);
case TargetOpcode::G_BRCOND:
return selectG_BRCOND(I);
- case TargetOpcode::G_FRAME_INDEX:
case TargetOpcode::G_GLOBAL_VALUE:
- return selectG_FRAME_INDEX_GLOBAL_VALUE(I);
+ return selectG_GLOBAL_VALUE(I);
case TargetOpcode::G_PTRMASK:
return selectG_PTRMASK(I);
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
@@ -2859,10 +3129,6 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return selectG_INSERT_VECTOR_ELT(I);
case TargetOpcode::G_SHUFFLE_VECTOR:
return selectG_SHUFFLE_VECTOR(I);
- case AMDGPU::G_AMDGPU_ATOMIC_INC:
- case AMDGPU::G_AMDGPU_ATOMIC_DEC:
- initM0(I);
- return selectImpl(I, *CoverageInfo);
case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
const AMDGPU::ImageDimIntrinsicInfo *Intr
@@ -2870,6 +3136,10 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
assert(Intr && "not an image intrinsic with image pseudo");
return selectImageIntrinsic(I, Intr);
}
+ case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
+ return selectBVHIntrinsic(I);
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
+ return selectAMDGPU_BUFFER_ATOMIC_FADD(I);
default:
return selectImpl(I, *CoverageInfo);
}
@@ -2885,7 +3155,8 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
}
std::pair<Register, unsigned>
-AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const {
+AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
+ bool AllowAbs) const {
Register Src = Root.getReg();
Register OrigSrc = Src;
unsigned Mods = 0;
@@ -2897,7 +3168,7 @@ AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const {
MI = getDefIgnoringCopies(Src, *MRI);
}
- if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
+ if (AllowAbs && MI && MI->getOpcode() == AMDGPU::G_FABS) {
Src = MI->getOperand(1).getReg();
Mods |= SISrcMods::ABS;
}
@@ -2944,6 +3215,20 @@ AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
return {{
[=](MachineInstrBuilder &MIB) { MIB.add(Root); },
@@ -2965,6 +3250,18 @@ AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
Register Reg = Root.getReg();
const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
@@ -3019,7 +3316,7 @@ AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
- if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI))
+ if (!isKnownNeverNaN(Src, *MRI))
return None;
return {{
@@ -3112,49 +3409,234 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
}
template <bool Signed>
-InstructionSelector::ComplexRendererFns
+std::pair<Register, int>
AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
MachineInstr *MI = Root.getParent();
- InstructionSelector::ComplexRendererFns Default = {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc
- }};
+ auto Default = std::make_pair(Root.getReg(), 0);
if (!STI.hasFlatInstOffsets())
return Default;
- const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg());
- if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD)
- return Default;
-
- Optional<int64_t> Offset =
- getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI);
- if (!Offset.hasValue())
+ Register PtrBase;
+ int64_t ConstOffset;
+ std::tie(PtrBase, ConstOffset) =
+ getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
+ if (ConstOffset == 0)
return Default;
unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
- if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
+ if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, Signed))
return Default;
- Register BasePtr = OpDef->getOperand(1).getReg();
+ return std::make_pair(PtrBase, ConstOffset);
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
+ auto PtrWithOffset = selectFlatOffsetImpl<false>(Root);
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
}};
}
InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
- return selectFlatOffsetImpl<false>(Root);
+AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
+ auto PtrWithOffset = selectFlatOffsetImpl<true>(Root);
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
+ }};
}
+/// Match a zero extend from a 32-bit value to 64-bits.
+static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
+ Register ZExtSrc;
+ if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
+ return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
+
+ // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
+ const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
+ if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
+ return Register();
+
+ if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
+ return Def->getOperand(1).getReg();
+ }
+
+ return Register();
+}
+
+// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
- return selectFlatOffsetImpl<true>(Root);
+AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
+ Register Addr = Root.getReg();
+ Register PtrBase;
+ int64_t ConstOffset;
+ int64_t ImmOffset = 0;
+
+ // Match the immediate offset first, which canonically is moved as low as
+ // possible.
+ std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
+
+ if (ConstOffset != 0) {
+ if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true)) {
+ Addr = PtrBase;
+ ImmOffset = ConstOffset;
+ } else if (ConstOffset > 0) {
+ auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
+ if (!PtrBaseDef)
+ return None;
+
+ if (isSGPR(PtrBaseDef->Reg)) {
+ // Offset is too large.
+ //
+ // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset)
+ // + (large_offset & MaxOffset);
+ int64_t SplitImmOffset, RemainderOffset;
+ std::tie(SplitImmOffset, RemainderOffset)
+ = TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true);
+
+ if (isUInt<32>(RemainderOffset)) {
+ MachineInstr *MI = Root.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+ Register HighBits
+ = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
+ HighBits)
+ .addImm(RemainderOffset);
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(HighBits); }, // voffset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
+ }};
+ }
+ }
+ }
+ }
+
+ auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
+ if (!AddrDef)
+ return None;
+
+ // Match the variable offset.
+ if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) {
+ // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
+ // drop this.
+ if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
+ AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT)
+ return None;
+
+ // It's cheaper to materialize a single 32-bit zero for vaddr than the two
+ // moves required to copy a 64-bit SGPR to VGPR.
+ const Register SAddr = AddrDef->Reg;
+ if (!isSGPR(SAddr))
+ return None;
+
+ MachineInstr *MI = Root.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+ Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
+ VOffset)
+ .addImm(0);
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
+ }};
+ }
+
+ // Look through the SGPR->VGPR copy.
+ Register SAddr =
+ getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
+ if (!SAddr || !isSGPR(SAddr))
+ return None;
+
+ Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
+
+ // It's possible voffset is an SGPR here, but the copy to VGPR will be
+ // inserted later.
+ Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset);
+ if (!VOffset)
+ return None;
+
+ return {{[=](MachineInstrBuilder &MIB) { // saddr
+ MIB.addReg(SAddr);
+ },
+ [=](MachineInstrBuilder &MIB) { // voffset
+ MIB.addReg(VOffset);
+ },
+ [=](MachineInstrBuilder &MIB) { // offset
+ MIB.addImm(ImmOffset);
+ }}};
+}
+
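
The large-offset split described in the comment above keeps the low, encodable part of the constant as the instruction's immediate and moves the remainder into the 32-bit voffset VGPR. A rough standalone model, where MaxOffset stands in for whatever positive immediate limit TII.splitFlatOffset applies on the target (assumed to be of the form 2^k - 1):

    #include <cstdint>
    #include <utility>

    static std::pair<int64_t, int64_t> splitLargeGlobalOffset(int64_t Offset,
                                                              int64_t MaxOffset) {
      // saddr + Offset == saddr + voffset + imm, with Offset known to be
      // positive on this path.
      int64_t Imm = Offset & MaxOffset;          // stays in the immediate field
      int64_t VOffsetPart = Offset & ~MaxOffset; // materialized via V_MOV_B32
      return {Imm, VOffsetPart};
    }
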
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
+ Register Addr = Root.getReg();
+ Register PtrBase;
+ int64_t ConstOffset;
+ int64_t ImmOffset = 0;
+
+ // Match the immediate offset first, which canonically is moved as low as
+ // possible.
+ std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
+
+ if (ConstOffset != 0 &&
+ TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
+ Addr = PtrBase;
+ ImmOffset = ConstOffset;
+ }
+
+ auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
+ if (!AddrDef)
+ return None;
+
+ if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
+ int FI = AddrDef->MI->getOperand(1).getIndex();
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
+ }};
+ }
+
+ Register SAddr = AddrDef->Reg;
+
+ if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
+ Register LHS = AddrDef->MI->getOperand(1).getReg();
+ Register RHS = AddrDef->MI->getOperand(2).getReg();
+ auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
+ auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
+
+ if (LHSDef && RHSDef &&
+ LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
+ isSGPR(RHSDef->Reg)) {
+ int FI = LHSDef->MI->getOperand(1).getIndex();
+ MachineInstr &I = *Root.getParent();
+ MachineBasicBlock *BB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), SAddr)
+ .addFrameIndex(FI)
+ .addReg(RHSDef->Reg);
+ }
+ }
+
+ if (!isSGPR(SAddr))
+ return None;
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
+ }};
}
static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
@@ -3187,13 +3669,9 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
MIB.addReg(HighBits);
},
[=](MachineInstrBuilder &MIB) { // soffset
- const MachineMemOperand *MMO = *MI->memoperands_begin();
- const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
-
- if (isStackPtrRelative(PtrInfo))
- MIB.addReg(Info->getStackPtrOffsetReg());
- else
- MIB.addImm(0);
+ // Use constant zero for soffset and rely on eliminateFrameIndex
+ // to choose the appropriate frame register if need be.
+ MIB.addImm(0);
},
[=](MachineInstrBuilder &MIB) { // offset
MIB.addImm(Offset & 4095);
@@ -3240,15 +3718,9 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
MIB.addReg(VAddr);
},
[=](MachineInstrBuilder &MIB) { // soffset
- // If we don't know this private access is a local stack object, it
- // needs to be relative to the entry point's scratch wave offset.
- // TODO: Should split large offsets that don't fit like above.
- // TODO: Don't use scratch wave offset just because the offset
- // didn't fit.
- if (!Info->isEntryFunction() && FI.hasValue())
- MIB.addReg(Info->getStackPtrOffsetReg());
- else
- MIB.addImm(0);
+ // Use constant zero for soffset and rely on eliminateFrameIndex
+ // to choose the appropriate frame register if need be.
+ MIB.addImm(0);
},
[=](MachineInstrBuilder &MIB) { // offset
MIB.addImm(Offset);
@@ -3256,10 +3728,24 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
}
bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
- int64_t Offset,
- unsigned OffsetBits) const {
- if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
- (OffsetBits == 8 && !isUInt<8>(Offset)))
+ int64_t Offset) const {
+ if (!isUInt<16>(Offset))
+ return false;
+
+ if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
+ return true;
+
+ // On Southern Islands, instructions with a negative base value and an offset
+ // don't seem to work.
+ return KnownBits->signBitIsZero(Base);
+}
+
+bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
+ int64_t Offset1,
+ unsigned Size) const {
+ if (Offset0 % Size != 0 || Offset1 % Size != 0)
+ return false;
+ if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
return false;
if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
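
The new isDSOffset2Legal check mirrors how ds_read2/ds_write2 encode their addresses: two unsigned 8-bit offset fields counted in units of the element size (4 or 8 bytes). A standalone restatement (function name is illustrative):

    #include <cstdint>

    static bool fitsDS2Encoding(int64_t Offset0, int64_t Offset1, unsigned Size) {
      // Both offsets must be multiples of the element size and, once scaled
      // down, must each fit in an unsigned 8-bit field.
      if (Offset0 % Size != 0 || Offset1 % Size != 0)
        return false;
      int64_t Scaled0 = Offset0 / Size;
      int64_t Scaled1 = Offset1 / Size;
      return Scaled0 >= 0 && Scaled0 < 256 && Scaled1 >= 0 && Scaled1 < 256;
    }
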
@@ -3314,7 +3800,7 @@ AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const
getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
if (Offset) {
- if (isDSOffsetLegal(PtrBase, Offset, 16)) {
+ if (isDSOffsetLegal(PtrBase, Offset)) {
// (add n0, c0)
return std::make_pair(PtrBase, Offset);
}
@@ -3343,9 +3829,20 @@ AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
+ return selectDSReadWrite2(Root, 4);
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
+ return selectDSReadWrite2(Root, 8);
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
+ unsigned Size) const {
Register Reg;
unsigned Offset;
- std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root);
+ std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
@@ -3354,7 +3851,8 @@ AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const
}
std::pair<Register, unsigned>
-AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const {
+AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
+ unsigned Size) const {
const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
if (!RootDef)
return std::make_pair(Root.getReg(), 0);
@@ -3367,11 +3865,11 @@ AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) c
getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
if (Offset) {
- int64_t DWordOffset0 = Offset / 4;
- int64_t DWordOffset1 = DWordOffset0 + 1;
- if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
+ int64_t OffsetValue0 = Offset;
+ int64_t OffsetValue1 = Offset + Size;
+ if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
// (add n0, c0)
- return std::make_pair(PtrBase, DWordOffset0);
+ return std::make_pair(PtrBase, OffsetValue0 / Size);
}
} else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
// TODO
@@ -3391,7 +3889,7 @@ AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) c
std::pair<Register, int64_t>
AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
Register Root, const MachineRegisterInfo &MRI) const {
- MachineInstr *RootI = MRI.getVRegDef(Root);
+ MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
return {Root, 0};
@@ -3400,7 +3898,7 @@ AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
= getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
if (!MaybeOffset)
return {Root, 0};
- return {RootI->getOperand(1).getReg(), MaybeOffset->Value};
+ return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
}
static void addZeroImm(MachineInstrBuilder &MIB) {
@@ -3582,6 +4080,11 @@ bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
MachineOperand &Root, Register &RSrcReg, Register &SOffset,
int64_t &Offset) const {
+
+ // FIXME: Pattern should not reach here.
+ if (STI.useFlatForGlobal())
+ return false;
+
MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
if (shouldUseAddr64(AddrData))
return false;
@@ -3723,7 +4226,7 @@ AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
static Optional<uint64_t> getConstantZext32Val(Register Reg,
const MachineRegisterInfo &MRI) {
// getConstantVRegVal sexts any values, so see if that matters.
- Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI);
+ Optional<int64_t> OffsetVal = getConstantVRegSExtVal(Reg, MRI);
if (!OffsetVal || !isInt<32>(*OffsetVal))
return None;
return Lo_32(*OffsetVal);
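
A small standalone illustration of why the sign extension noted in the comment above is harmless here, assuming the lookup yields a sign-extended 64-bit value: requiring the value to fit the signed 32-bit range and then taking the low 32 bits recovers the original bit pattern.

  #include <cstdint>
  #include <cassert>

  // Lower 32 bits of a 64-bit value, mirroring Lo_32().
  static uint32_t lo32(uint64_t V) { return static_cast<uint32_t>(V); }

  static bool fitsInInt32(int64_t V) {
    return V >= INT32_MIN && V <= INT32_MAX;
  }

  int main() {
    // A 32-bit constant 0xFFFFFFFF looked up as a sign-extended s64 reads as
    // -1, but it fits the signed 32-bit range and lo32 recovers the pattern.
    int64_t SExt = -1;
    assert(fitsInInt32(SExt) && lo32(SExt) == 0xFFFFFFFFu);
    // A value that genuinely needs more than 32 bits is rejected.
    assert(!fitsInInt32(INT64_C(0x100000000)));
  }
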
@@ -3833,6 +4336,12 @@ void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
}
+void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ MIB.addFrameIndex((MI.getOperand(1).getIndex()));
+}
+
bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 1fe80958917d..d70f18098cd7 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -13,13 +13,11 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H
-#include "AMDGPU.h"
-#include "AMDGPUArgumentUsageInfo.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/Register.h"
#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
namespace {
#define GET_GLOBALISEL_PREDICATE_BITSET
@@ -37,6 +35,7 @@ struct ImageDimIntrinsicInfo;
class AMDGPUInstrInfo;
class AMDGPURegisterBankInfo;
+class AMDGPUTargetMachine;
class GCNSubtarget;
class MachineInstr;
class MachineIRBuilder;
@@ -47,9 +46,10 @@ class SIInstrInfo;
class SIMachineFunctionInfo;
class SIRegisterInfo;
-class AMDGPUInstructionSelector : public InstructionSelector {
+class AMDGPUInstructionSelector final : public InstructionSelector {
private:
MachineRegisterInfo *MRI;
+ const GCNSubtarget *Subtarget;
public:
AMDGPUInstructionSelector(const GCNSubtarget &STI,
@@ -71,6 +71,8 @@ private:
GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { }
};
+ bool isSGPR(Register Reg) const;
+
bool isInstrUniform(const MachineInstr &MI) const;
bool isVCC(Register Reg, const MachineRegisterInfo &MRI) const;
@@ -105,15 +107,20 @@ private:
bool selectG_INSERT(MachineInstr &I) const;
bool selectInterpP1F16(MachineInstr &MI) const;
+ bool selectWritelane(MachineInstr &MI) const;
bool selectDivScale(MachineInstr &MI) const;
bool selectIntrinsicIcmp(MachineInstr &MI) const;
bool selectBallot(MachineInstr &I) const;
+ bool selectRelocConstant(MachineInstr &I) const;
+ bool selectGroupStaticSize(MachineInstr &I) const;
+ bool selectReturnAddress(MachineInstr &I) const;
bool selectG_INTRINSIC(MachineInstr &I) const;
bool selectEndCfIntrinsic(MachineInstr &MI) const;
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
+ bool selectSBarrier(MachineInstr &MI) const;
bool selectImageIntrinsic(MachineInstr &MI,
const AMDGPU::ImageDimIntrinsicInfo *Intr) const;
@@ -126,19 +133,21 @@ private:
bool selectSMRD(MachineInstr &I, ArrayRef<GEPInfo> AddrInfo) const;
void initM0(MachineInstr &I) const;
- bool selectG_LOAD_ATOMICRMW(MachineInstr &I) const;
+ bool selectG_LOAD_STORE_ATOMICRMW(MachineInstr &I) const;
bool selectG_AMDGPU_ATOMIC_CMPXCHG(MachineInstr &I) const;
- bool selectG_STORE(MachineInstr &I) const;
bool selectG_SELECT(MachineInstr &I) const;
bool selectG_BRCOND(MachineInstr &I) const;
- bool selectG_FRAME_INDEX_GLOBAL_VALUE(MachineInstr &I) const;
+ bool selectG_GLOBAL_VALUE(MachineInstr &I) const;
bool selectG_PTRMASK(MachineInstr &I) const;
bool selectG_EXTRACT_VECTOR_ELT(MachineInstr &I) const;
bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const;
bool selectG_SHUFFLE_VECTOR(MachineInstr &I) const;
+ bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const;
+ bool selectGlobalAtomicFaddIntrinsic(MachineInstr &I) const;
+ bool selectBVHIntrinsic(MachineInstr &I) const;
- std::pair<Register, unsigned>
- selectVOP3ModsImpl(MachineOperand &Root) const;
+ std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root,
+ bool AllowAbs = true) const;
InstructionSelector::ComplexRendererFns
selectVCSRC(MachineOperand &Root) const;
@@ -149,9 +158,13 @@ private:
InstructionSelector::ComplexRendererFns
selectVOP3Mods0(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
+ selectVOP3BMods0(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
selectVOP3OMods(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectVOP3Mods(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectVOP3BMods(MachineOperand &Root) const;
ComplexRendererFns selectVOP3NoMods(MachineOperand &Root) const;
@@ -175,32 +188,45 @@ private:
selectSmrdSgpr(MachineOperand &Root) const;
template <bool Signed>
- InstructionSelector::ComplexRendererFns
+ std::pair<Register, int>
selectFlatOffsetImpl(MachineOperand &Root) const;
+
InstructionSelector::ComplexRendererFns
selectFlatOffset(MachineOperand &Root) const;
-
InstructionSelector::ComplexRendererFns
selectFlatOffsetSigned(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
+ selectGlobalSAddr(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectScratchSAddr(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
selectMUBUFScratchOffen(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectMUBUFScratchOffset(MachineOperand &Root) const;
- bool isDSOffsetLegal(Register Base, int64_t Offset,
- unsigned OffsetBits) const;
+ bool isDSOffsetLegal(Register Base, int64_t Offset) const;
+ bool isDSOffset2Legal(Register Base, int64_t Offset0, int64_t Offset1,
+ unsigned Size) const;
std::pair<Register, unsigned>
selectDS1Addr1OffsetImpl(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectDS1Addr1Offset(MachineOperand &Root) const;
- std::pair<Register, unsigned>
- selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectDS64Bit4ByteAligned(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectDS128Bit8ByteAligned(MachineOperand &Root) const;
+
+ std::pair<Register, unsigned> selectDSReadWrite2Impl(MachineOperand &Root,
+ unsigned size) const;
+ InstructionSelector::ComplexRendererFns
+ selectDSReadWrite2(MachineOperand &Root, unsigned size) const;
+
std::pair<Register, int64_t>
getPtrBaseWithConstantOffset(Register Root,
const MachineRegisterInfo &MRI) const;
@@ -284,6 +310,8 @@ private:
int OpIdx) const;
void renderExtractSWZ(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
+ void renderFrameIndex(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
bool isInlineImmediate16(int64_t Imm) const;
bool isInlineImmediate32(int64_t Imm) const;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 5cb7ac320d2f..8ef9c99e8b35 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -83,9 +83,8 @@ def FalsePredicate : Predicate<"false">;
// Add a predicate to the list if it does not already exist, to deduplicate it.
class PredConcat<list<Predicate> lst, Predicate pred> {
list<Predicate> ret =
- !foldl([pred], lst, acc, cur,
- !listconcat(acc, !if(!eq(!cast<string>(cur),!cast<string>(pred)),
- [], [cur])));
+ !listconcat([pred], !filter(item, lst,
+ !ne(!cast<string>(item), !cast<string>(pred))));
}
class PredicateControl {
@@ -483,19 +482,20 @@ defm atomic_load_umax : ret_noret_binary_atomic_op<atomic_load_umax>;
defm atomic_load_umin : ret_noret_binary_atomic_op<atomic_load_umin>;
defm atomic_load_xor : ret_noret_binary_atomic_op<atomic_load_xor>;
defm atomic_load_fadd : ret_noret_binary_atomic_op<atomic_load_fadd, 0>;
+let MemoryVT = v2f16 in
+defm atomic_load_fadd_v2f16 : ret_noret_binary_atomic_op<atomic_load_fadd, 0>;
defm AMDGPUatomic_cmp_swap : ret_noret_binary_atomic_op<AMDGPUatomic_cmp_swap>;
-
-def load_align8_local : PatFrag <(ops node:$ptr), (load_local node:$ptr)> {
+def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>,
+ Aligned<8> {
let IsLoad = 1;
let IsNonExtLoad = 1;
- let MinAlignment = 8;
}
-def load_align16_local : PatFrag <(ops node:$ptr), (load_local node:$ptr)> {
+def load_align16_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>,
+ Aligned<16> {
let IsLoad = 1;
let IsNonExtLoad = 1;
- let MinAlignment = 16;
}
def store_align8_local: PatFrag<(ops node:$val, node:$ptr),
@@ -596,149 +596,6 @@ class DwordAddrPat<ValueType vt, RegisterClass rc> : AMDGPUPat <
(vt rc:$addr)
>;
-// BFI_INT patterns
-
-multiclass BFIPatterns <Instruction BFI_INT,
- Instruction LoadImm32,
- RegisterClass RC64> {
- // Definition from ISA doc:
- // (y & x) | (z & ~x)
- def : AMDGPUPat <
- (or (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))),
- (BFI_INT $x, $y, $z)
- >;
-
- // 64-bit version
- def : AMDGPUPat <
- (or (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))),
- (REG_SEQUENCE RC64,
- (BFI_INT (i32 (EXTRACT_SUBREG RC64:$x, sub0)),
- (i32 (EXTRACT_SUBREG RC64:$y, sub0)),
- (i32 (EXTRACT_SUBREG RC64:$z, sub0))), sub0,
- (BFI_INT (i32 (EXTRACT_SUBREG RC64:$x, sub1)),
- (i32 (EXTRACT_SUBREG RC64:$y, sub1)),
- (i32 (EXTRACT_SUBREG RC64:$z, sub1))), sub1)
- >;
-
- // SHA-256 Ch function
- // z ^ (x & (y ^ z))
- def : AMDGPUPat <
- (xor i32:$z, (and i32:$x, (xor i32:$y, i32:$z))),
- (BFI_INT $x, $y, $z)
- >;
-
- // 64-bit version
- def : AMDGPUPat <
- (xor i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
- (REG_SEQUENCE RC64,
- (BFI_INT (i32 (EXTRACT_SUBREG RC64:$x, sub0)),
- (i32 (EXTRACT_SUBREG RC64:$y, sub0)),
- (i32 (EXTRACT_SUBREG RC64:$z, sub0))), sub0,
- (BFI_INT (i32 (EXTRACT_SUBREG RC64:$x, sub1)),
- (i32 (EXTRACT_SUBREG RC64:$y, sub1)),
- (i32 (EXTRACT_SUBREG RC64:$z, sub1))), sub1)
- >;
-
- def : AMDGPUPat <
- (fcopysign f32:$src0, f32:$src1),
- (BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0, $src1)
- >;
-
- def : AMDGPUPat <
- (f32 (fcopysign f32:$src0, f64:$src1)),
- (BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0,
- (i32 (EXTRACT_SUBREG RC64:$src1, sub1)))
- >;
-
- def : AMDGPUPat <
- (f64 (fcopysign f64:$src0, f64:$src1)),
- (REG_SEQUENCE RC64,
- (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
- (BFI_INT (LoadImm32 (i32 0x7fffffff)),
- (i32 (EXTRACT_SUBREG RC64:$src0, sub1)),
- (i32 (EXTRACT_SUBREG RC64:$src1, sub1))), sub1)
- >;
-
- def : AMDGPUPat <
- (f64 (fcopysign f64:$src0, f32:$src1)),
- (REG_SEQUENCE RC64,
- (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
- (BFI_INT (LoadImm32 (i32 0x7fffffff)),
- (i32 (EXTRACT_SUBREG RC64:$src0, sub1)),
- $src1), sub1)
- >;
-}
-
-// SHA-256 Ma patterns
-
-// ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y
-multiclass SHA256MaPattern <Instruction BFI_INT, Instruction XOR, RegisterClass RC64> {
- def : AMDGPUPat <
- (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))),
- (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y)
- >;
-
- def : AMDGPUPat <
- (or (and i64:$x, i64:$z), (and i64:$y, (or i64:$x, i64:$z))),
- (REG_SEQUENCE RC64,
- (BFI_INT (XOR (i32 (EXTRACT_SUBREG RC64:$x, sub0)),
- (i32 (EXTRACT_SUBREG RC64:$y, sub0))),
- (i32 (EXTRACT_SUBREG RC64:$z, sub0)),
- (i32 (EXTRACT_SUBREG RC64:$y, sub0))), sub0,
- (BFI_INT (XOR (i32 (EXTRACT_SUBREG RC64:$x, sub1)),
- (i32 (EXTRACT_SUBREG RC64:$y, sub1))),
- (i32 (EXTRACT_SUBREG RC64:$z, sub1)),
- (i32 (EXTRACT_SUBREG RC64:$y, sub1))), sub1)
- >;
-}
-
-// Bitfield extract patterns
-
-def IMMZeroBasedBitfieldMask : ImmLeaf <i32, [{
- return isMask_32(Imm);
-}]>;
-
-def IMMPopCount : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(countPopulation(N->getZExtValue()), SDLoc(N),
- MVT::i32);
-}]>;
-
-multiclass BFEPattern <Instruction UBFE, Instruction SBFE, Instruction MOV> {
- def : AMDGPUPat <
- (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)),
- (UBFE $src, $rshift, (MOV (i32 (IMMPopCount $mask))))
- >;
-
- // x & ((1 << y) - 1)
- def : AMDGPUPat <
- (and i32:$src, (add_oneuse (shl_oneuse 1, i32:$width), -1)),
- (UBFE $src, (MOV (i32 0)), $width)
- >;
-
- // x & ~(-1 << y)
- def : AMDGPUPat <
- (and i32:$src, (xor_oneuse (shl_oneuse -1, i32:$width), -1)),
- (UBFE $src, (MOV (i32 0)), $width)
- >;
-
- // x & (-1 >> (bitwidth - y))
- def : AMDGPUPat <
- (and i32:$src, (srl_oneuse -1, (sub 32, i32:$width))),
- (UBFE $src, (MOV (i32 0)), $width)
- >;
-
- // x << (bitwidth - y) >> (bitwidth - y)
- def : AMDGPUPat <
- (srl (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)),
- (UBFE $src, (MOV (i32 0)), $width)
- >;
-
- def : AMDGPUPat <
- (sra (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)),
- (SBFE $src, (MOV (i32 0)), $width)
- >;
-}
-
// fshr pattern
class FSHRPattern <Instruction BIT_ALIGN> : AMDGPUPat <
(fshr i32:$src0, i32:$src1, i32:$src2),
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
new file mode 100644
index 000000000000..8aea33cf289d
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -0,0 +1,195 @@
+//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
+/// selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#define DEBUG_TYPE "amdgpu-late-codegenprepare"
+
+using namespace llvm;
+
+// Scalar load widening needs to run after the load-store-vectorizer, as that
+// pass doesn't handle overlapping cases. In addition, this pass enhances the
+// widening to handle cases where scalar sub-dword loads are only naturally
+// aligned but not dword aligned.
+static cl::opt<bool>
+ WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
+ cl::desc("Widen sub-dword constant address space loads in "
+ "AMDGPULateCodeGenPrepare"),
+ cl::ReallyHidden, cl::init(true));
+
+namespace {
+
+class AMDGPULateCodeGenPrepare
+ : public FunctionPass,
+ public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
+ Module *Mod = nullptr;
+ const DataLayout *DL = nullptr;
+
+ AssumptionCache *AC = nullptr;
+ LegacyDivergenceAnalysis *DA = nullptr;
+
+public:
+ static char ID;
+
+ AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "AMDGPU IR late optimizations";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<LegacyDivergenceAnalysis>();
+ AU.setPreservesAll();
+ }
+
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+
+ bool visitInstruction(Instruction &) { return false; }
+
+ // Check if the specified value is at least DWORD aligned.
+ bool isDWORDAligned(const Value *V) const {
+ KnownBits Known = computeKnownBits(V, *DL, 0, AC);
+ return Known.countMinTrailingZeros() >= 2;
+ }
+
+ bool canWidenScalarExtLoad(LoadInst &LI) const;
+ bool visitLoadInst(LoadInst &LI);
+};
+
+} // end anonymous namespace
+
+bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
+ Mod = &M;
+ DL = &Mod->getDataLayout();
+ return false;
+}
+
+bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ DA = &getAnalysis<LegacyDivergenceAnalysis>();
+
+ bool Changed = false;
+ for (auto &BB : F)
+ for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) {
+ Instruction *I = &*BI++;
+ Changed |= visit(*I);
+ }
+
+ return Changed;
+}
+
+bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
+ unsigned AS = LI.getPointerAddressSpace();
+ // Skip non-constant address space.
+ if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
+ AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+ return false;
+ // Skip non-simple loads.
+ if (!LI.isSimple())
+ return false;
+ auto *Ty = LI.getType();
+ // Skip aggregate types.
+ if (Ty->isAggregateType())
+ return false;
+ unsigned TySize = DL->getTypeStoreSize(Ty);
+ // Only handle sub-DWORD loads.
+ if (TySize >= 4)
+ return false;
+ // That load must be at least naturally aligned.
+ if (LI.getAlign() < DL->getABITypeAlign(Ty))
+ return false;
+ // It should be uniform, i.e. a scalar load.
+ return DA->isUniform(&LI);
+}
+
+bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
+ if (!WidenLoads)
+ return false;
+
+ // Skip if the load is already at least DWORD aligned, as that case is
+ // handled in SDAG.
+ if (LI.getAlign() >= 4)
+ return false;
+
+ if (!canWidenScalarExtLoad(LI))
+ return false;
+
+ int64_t Offset = 0;
+ auto *Base =
+ GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
+ // If that base is not DWORD aligned, it's not safe to perform the following
+ // transforms.
+ if (!isDWORDAligned(Base))
+ return false;
+
+ int64_t Adjust = Offset & 0x3;
+ if (Adjust == 0) {
+ // With a zero adjust, the original alignment can simply be promoted to a
+ // better one.
+ LI.setAlignment(Align(4));
+ return true;
+ }
+
+ IRBuilder<> IRB(&LI);
+ IRB.SetCurrentDebugLocation(LI.getDebugLoc());
+
+ unsigned AS = LI.getPointerAddressSpace();
+ unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
+ auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);
+
+ PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
+ PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
+ auto *NewPtr = IRB.CreateBitCast(
+ IRB.CreateConstGEP1_64(IRB.CreateBitCast(Base, Int8PtrTy),
+ Offset - Adjust),
+ Int32PtrTy);
+ LoadInst *NewLd = IRB.CreateAlignedLoad(NewPtr, Align(4));
+ NewLd->copyMetadata(LI);
+ NewLd->setMetadata(LLVMContext::MD_range, nullptr);
+
+ unsigned ShAmt = Adjust * 8;
+ auto *NewVal = IRB.CreateBitCast(
+ IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
+ LI.replaceAllUsesWith(NewVal);
+ RecursivelyDeleteTriviallyDeadInstructions(&LI);
+
+ return true;
+}
+
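
The shift arithmetic in visitLoadInst can be checked in isolation. A minimal sketch (plain C++, little-endian byte arithmetic rather than IR) of how a naturally aligned sub-dword value is recovered from the containing dword-aligned load:

  #include <cstdint>
  #include <cstring>
  #include <cassert>

  // Emulate loading LdBytes (1 or 2) at byte offset 'Offset' from a
  // dword-aligned buffer by loading the containing aligned dword and shifting,
  // as the pass does.
  static uint32_t widenedLoad(const uint8_t *Base, int64_t Offset,
                              unsigned LdBytes) {
    int64_t Adjust = Offset & 0x3;                    // position in the dword
    uint32_t Dword;
    std::memcpy(&Dword, Base + (Offset - Adjust), 4); // aligned 4-byte load
    uint32_t Shifted = Dword >> (Adjust * 8);         // drop the leading bytes
    return LdBytes == 1 ? (Shifted & 0xff) : (Shifted & 0xffff); // trunc
  }

  int main() {
    alignas(4) uint8_t Buf[8] = {0x10, 0x32, 0x54, 0x76, 0x98, 0xba, 0xdc, 0xfe};
    assert(widenedLoad(Buf, 1, 1) == 0x32);   // i8 at offset 1
    assert(widenedLoad(Buf, 2, 2) == 0x7654); // i16 at offset 2
    assert(widenedLoad(Buf, 6, 2) == 0xfedc); // i16 at offset 6: naturally
                                              // aligned, not dword aligned
  }
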
+INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
+ "AMDGPU IR late optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
+ "AMDGPU IR late optimizations", false, false)
+
+char AMDGPULateCodeGenPrepare::ID = 0;
+
+FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
+ return new AMDGPULateCodeGenPrepare();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 2976794b49c3..9f359c232981 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -15,18 +15,15 @@
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
+#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
-#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
-#include "llvm/CodeGen/TargetOpcodes.h"
-#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/DerivedTypes.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Type.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#define DEBUG_TYPE "amdgpu-legalinfo"
@@ -60,16 +57,30 @@ static LLT getPow2ScalarType(LLT Ty) {
return LLT::scalar(Pow2Bits);
}
+/// \returns true if this is an odd-sized vector which should be widened by
+/// adding an additional element. This is mostly to handle <3 x s16> ->
+/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
- return Ty.isVector() &&
- Ty.getNumElements() % 2 != 0 &&
- Ty.getElementType().getSizeInBits() < 32 &&
+ if (!Ty.isVector())
+ return false;
+
+ const LLT EltTy = Ty.getElementType();
+ const unsigned EltSize = EltTy.getSizeInBits();
+ return Ty.getNumElements() % 2 != 0 &&
+ EltSize > 1 && EltSize < 32 &&
Ty.getSizeInBits() % 32 != 0;
};
}
+static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ return Ty.getSizeInBits() % 32 == 0;
+ };
+}
+
static LegalityPredicate isWideVec16(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
@@ -115,20 +126,32 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
};
}
+static LLT getBitcastRegisterType(const LLT Ty) {
+ const unsigned Size = Ty.getSizeInBits();
+
+ LLT CoercedTy;
+ if (Size <= 32) {
+ // <2 x s8> -> s16
+ // <4 x s8> -> s32
+ return LLT::scalar(Size);
+ }
+
+ return LLT::scalarOrVector(Size / 32, 32);
+}
+
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
- unsigned Size = Ty.getSizeInBits();
-
- LLT CoercedTy;
- if (Size <= 32) {
- // <2 x s8> -> s16
- // <4 x s8> -> s32
- CoercedTy = LLT::scalar(Size);
- } else
- CoercedTy = LLT::scalarOrVector(Size / 32, 32);
+ return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
+ };
+}
- return std::make_pair(TypeIdx, CoercedTy);
+static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ unsigned Size = Ty.getSizeInBits();
+ assert(Size % 32 == 0);
+ return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
};
}
@@ -213,7 +236,7 @@ static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
switch (AS) {
case AMDGPUAS::PRIVATE_ADDRESS:
// FIXME: Private element size.
- return 32;
+ return ST.enableFlatScratch() ? 128 : 32;
case AMDGPUAS::LOCAL_ADDRESS:
return ST.useDS128() ? 128 : 64;
case AMDGPUAS::GLOBAL_ADDRESS:
@@ -243,7 +266,7 @@ static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
unsigned RegSize = Ty.getSizeInBits();
unsigned MemSize = Query.MMODescrs[0].SizeInBits;
- unsigned Align = Query.MMODescrs[0].AlignInBits;
+ unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
unsigned AS = Query.Types[1].getAddressSpace();
// All of these need to be custom lowered to cast the pointer operand.
@@ -286,9 +309,10 @@ static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
assert(RegSize >= MemSize);
- if (Align < MemSize) {
+ if (AlignBits < MemSize) {
const SITargetLowering *TLI = ST.getTargetLowering();
- if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
+ if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
+ Align(AlignBits / 8)))
return false;
}
@@ -308,7 +332,12 @@ static bool loadStoreBitcastWorkaround(const LLT Ty) {
return false;
if (!Ty.isVector())
return true;
- unsigned EltSize = Ty.getElementType().getSizeInBits();
+
+ LLT EltTy = Ty.getElementType();
+ if (EltTy.isPointer())
+ return true;
+
+ unsigned EltSize = EltTy.getSizeInBits();
return EltSize != 32 && EltSize != 64;
}
@@ -319,6 +348,66 @@ static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
!loadStoreBitcastWorkaround(Ty);
}
+/// Return true if a load or store of the type should be lowered with a bitcast
+/// to a different type.
+static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
+ const unsigned MemSizeInBits) {
+ const unsigned Size = Ty.getSizeInBits();
+ if (Size != MemSizeInBits)
+ return Size <= 32 && Ty.isVector();
+
+ if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
+ return true;
+ return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
+ !isRegisterVectorElementType(Ty.getElementType());
+}
+
+/// Return true if we should legalize a load by widening an odd sized memory
+/// access up to the alignment. Note this case when the memory access itself
+/// changes, not the size of the result register.
+static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
+ unsigned AlignInBits, unsigned AddrSpace,
+ unsigned Opcode) {
+ // We don't want to widen cases that are naturally legal.
+ if (isPowerOf2_32(SizeInBits))
+ return false;
+
+ // If we have 96-bit memory operations, we shouldn't touch them. Note we may
+ // end up widening these for a scalar load during RegBankSelect, since there
+ // aren't 96-bit scalar loads.
+ if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
+ return false;
+
+ if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
+ return false;
+
+ // A load is known dereferenceable up to the alignment, so it's legal to
+ // widen the access up to it.
+ //
+ // TODO: Could check dereferenceable for less aligned cases.
+ unsigned RoundedSize = NextPowerOf2(SizeInBits);
+ if (AlignInBits < RoundedSize)
+ return false;
+
+ // Do not widen if it would introduce a slow unaligned load.
+ const SITargetLowering *TLI = ST.getTargetLowering();
+ bool Fast = false;
+ return TLI->allowsMisalignedMemoryAccessesImpl(
+ RoundedSize, AddrSpace, Align(AlignInBits / 8),
+ MachineMemOperand::MOLoad, &Fast) &&
+ Fast;
+}
+
+static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
+ unsigned Opcode) {
+ if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
+ return false;
+
+ return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
+ Query.MMODescrs[0].AlignInBits,
+ Query.Types[1].getAddressSpace(), Opcode);
+}
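
Apart from the subtarget queries, the decision above is a size/alignment computation. A small sketch of that core arithmetic, with the 96-bit, address-space and fast-unaligned checks assumed to pass:

  #include <cassert>

  // Round up to the next power of two (for widths > 0).
  static unsigned nextPow2(unsigned Bits) {
    unsigned P = 1;
    while (P < Bits)
      P <<= 1;
    return P;
  }

  // Core of the widening decision: only odd-sized accesses, and only when the
  // known alignment already covers the rounded-up width. Target-specific
  // checks (dwordx3 support, address-space max size, fast unaligned access)
  // are assumed to pass here.
  static bool shouldWidenLoadSketch(unsigned SizeInBits, unsigned AlignInBits) {
    if ((SizeInBits & (SizeInBits - 1)) == 0) // already a power of two
      return false;
    return AlignInBits >= nextPow2(SizeInBits);
  }

  int main() {
    assert(shouldWidenLoadSketch(48, 64));  // s48, 8-byte align: widen to 64
    assert(!shouldWidenLoadSketch(48, 32)); // not enough dereferenceable bytes
    assert(!shouldWidenLoadSketch(64, 64)); // already power-of-two sized
  }
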
+
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const GCNTargetMachine &TM)
: ST(ST_) {
@@ -329,6 +418,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
};
const LLT S1 = LLT::scalar(1);
+ const LLT S8 = LLT::scalar(8);
const LLT S16 = LLT::scalar(16);
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
@@ -337,6 +427,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT S512 = LLT::scalar(512);
const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
+ const LLT V2S8 = LLT::vector(2, 8);
const LLT V2S16 = LLT::vector(2, 16);
const LLT V4S16 = LLT::vector(4, 16);
@@ -410,48 +501,103 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
// elements for v3s16
getActionDefinitionsBuilder(G_PHI)
- .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
+ .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
.legalFor(AllS32Vectors)
.legalFor(AllS64Vectors)
.legalFor(AddrSpaces64)
.legalFor(AddrSpaces32)
- .clampScalar(0, S32, S256)
+ .legalIf(isPointer(0))
+ .clampScalar(0, S16, S256)
.widenScalarToNextPow2(0, 32)
.clampMaxNumElements(0, S32, 16)
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
- .legalIf(isPointer(0));
+ .scalarize(0);
- if (ST.hasVOP3PInsts()) {
+ if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
+ // Full set of gfx9 features.
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
.legalFor({S32, S16, V2S16})
.clampScalar(0, S16, S32)
.clampMaxNumElements(0, S16, 2)
.scalarize(0)
.widenScalarToNextPow2(0, 32);
+
+ getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
+ .legalFor({S32, S16, V2S16}) // Clamp modifier
+ .minScalarOrElt(0, S16)
+ .clampMaxNumElements(0, S16, 2)
+ .scalarize(0)
+ .widenScalarToNextPow2(0, 32)
+ .lower();
} else if (ST.has16BitInsts()) {
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
.legalFor({S32, S16})
.clampScalar(0, S16, S32)
.scalarize(0)
- .widenScalarToNextPow2(0, 32);
+ .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
+
+ // Technically the saturating operations require clamp bit support, but that
+ // support was introduced at the same time as 16-bit operations.
+ getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
+ .legalFor({S32, S16}) // Clamp modifier
+ .minScalar(0, S16)
+ .scalarize(0)
+ .widenScalarToNextPow2(0, 16)
+ .lower();
+
+ // We're just lowering this, but trying to coerce to the desired type first
+ // helps produce a better result.
+ getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
+ .minScalar(0, S16)
+ .scalarize(0)
+ .lower();
} else {
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
.legalFor({S32})
.clampScalar(0, S32, S32)
.scalarize(0);
+
+ if (ST.hasIntClamp()) {
+ getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
+ .legalFor({S32}) // Clamp modifier.
+ .scalarize(0)
+ .minScalarOrElt(0, S32)
+ .lower();
+ } else {
+ // Clamp bit support was added in VI, along with 16-bit operations.
+ getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
+ .minScalar(0, S32)
+ .scalarize(0)
+ .lower();
+ }
+
+ // FIXME: DAG expansion gets better results. The widening uses the smaller
+ // range values and goes for the min/max lowering directly.
+ getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
+ .minScalar(0, S32)
+ .scalarize(0)
+ .lower();
}
- // FIXME: Not really legal. Placeholder for custom lowering.
getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
.customFor({S32, S64})
.clampScalar(0, S32, S64)
.widenScalarToNextPow2(0, 32)
.scalarize(0);
- getActionDefinitionsBuilder({G_UMULH, G_SMULH})
- .legalFor({S32})
- .clampScalar(0, S32, S32)
- .scalarize(0);
+ auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
+ .legalFor({S32})
+ .maxScalarOrElt(0, S32);
+
+ if (ST.hasVOP3PInsts()) {
+ Mulh
+ .clampMaxNumElements(0, S8, 2)
+ .lowerFor({V2S8});
+ }
+
+ Mulh
+ .scalarize(0)
+ .lower();
// Report legal for any types we can handle anywhere. For the cases only legal
// on the SALU, RegBankSelect will be able to re-legalize.
@@ -479,9 +625,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(G_CONSTANT)
.legalFor({S1, S32, S64, S16, GlobalPtr,
LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
+ .legalIf(isPointer(0))
.clampScalar(0, S32, S64)
- .widenScalarToNextPow2(0)
- .legalIf(isPointer(0));
+ .widenScalarToNextPow2(0);
getActionDefinitionsBuilder(G_FCONSTANT)
.legalFor({S32, S64, S16})
@@ -505,8 +651,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.legalFor({{PrivatePtr, S32}});
getActionDefinitionsBuilder(G_GLOBAL_VALUE)
- .unsupportedFor({PrivatePtr})
- .custom();
+ .customIf(typeIsNot(0, PrivatePtr));
+
setAction({G_BLOCK_ADDR, CodePtr}, Legal);
auto &FPOpActions = getActionDefinitionsBuilder(
@@ -599,7 +745,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(G_FPEXT)
.legalFor({{S64, S32}, {S32, S16}})
- .lowerFor({{S64, S16}}) // FIXME: Implement
+ .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
.scalarize(0);
getActionDefinitionsBuilder(G_FSUB)
@@ -621,6 +767,15 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
FMad.scalarize(0)
.lower();
+ auto &FRem = getActionDefinitionsBuilder(G_FREM);
+ if (ST.has16BitInsts()) {
+ FRem.customFor({S16, S32, S64});
+ } else {
+ FRem.minScalar(0, S32)
+ .customFor({S32, S64});
+ }
+ FRem.scalarize(0);
+
// TODO: Do we need to clamp maximum bitwidth?
getActionDefinitionsBuilder(G_TRUNC)
.legalIf(isScalar(0))
@@ -648,12 +803,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.has16BitInsts())
IToFP.legalFor({{S16, S16}});
IToFP.clampScalar(1, S32, S64)
+ .minScalar(0, S32)
.scalarize(0)
.widenScalarToNextPow2(1);
auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
.legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
- .customFor({{S64, S64}});
+ .customFor({{S64, S64}})
+ .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
if (ST.has16BitInsts())
FPToI.legalFor({{S16, S16}});
else
@@ -663,7 +820,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0)
.lower();
- getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
+ // Lower roundeven into G_FRINT
+ getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
.scalarize(0)
.lower();
@@ -685,16 +843,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0);
}
- // FIXME: Clamp offset operand.
getActionDefinitionsBuilder(G_PTR_ADD)
- .legalIf(isPointer(0))
- .scalarize(0);
+ .legalIf(all(isPointer(0), sameSize(0, 1)))
+ .scalarize(0)
+ .scalarSameSizeAs(1, 0);
getActionDefinitionsBuilder(G_PTRMASK)
- .legalIf(typeInSet(1, {S64, S32}))
- .minScalar(1, S32)
- .maxScalarIf(sizeIs(0, 32), 1, S32)
- .maxScalarIf(sizeIs(0, 64), 1, S64)
+ .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
+ .scalarSameSizeAs(1, 0)
.scalarize(0);
auto &CmpBuilder =
@@ -746,6 +902,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
ExpOps.clampScalar(0, MinScalarFPTy, S32)
.scalarize(0);
+ getActionDefinitionsBuilder(G_FPOWI)
+ .clampScalar(0, MinScalarFPTy, S32)
+ .lower();
+
// The 64-bit versions produce 32-bit results, but only on the SALU.
getActionDefinitionsBuilder(G_CTPOP)
.legalFor({{S32, S32}, {S32, S64}})
@@ -870,10 +1030,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// Split vector extloads.
unsigned MemSize = Query.MMODescrs[0].SizeInBits;
- unsigned Align = Query.MMODescrs[0].AlignInBits;
+ unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
if (MemSize < DstTy.getSizeInBits())
- MemSize = std::max(MemSize, Align);
+ MemSize = std::max(MemSize, AlignBits);
if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
return true;
@@ -895,35 +1055,18 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return true;
}
- if (Align < MemSize) {
+ if (AlignBits < MemSize) {
const SITargetLowering *TLI = ST.getTargetLowering();
- return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
+ return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
+ Align(AlignBits / 8));
}
return false;
};
- const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
- unsigned Opc) -> bool {
- unsigned Size = Query.Types[0].getSizeInBits();
- if (isPowerOf2_32(Size))
- return false;
-
- if (Size == 96 && ST.hasDwordx3LoadStores())
- return false;
-
- unsigned AddrSpace = Query.Types[1].getAddressSpace();
- if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
- return false;
-
- unsigned Align = Query.MMODescrs[0].AlignInBits;
- unsigned RoundedSize = NextPowerOf2(Size);
- return (Align >= RoundedSize);
- };
-
- unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
- unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
- unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
+ unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
+ unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
+ unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
// TODO: Refine based on subtargets which support unaligned access or 128-bit
// LDS
@@ -981,31 +1124,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// 16-bit vector parts.
Actions.bitcastIf(
[=](const LegalityQuery &Query) -> bool {
- const LLT Ty = Query.Types[0];
- const unsigned Size = Ty.getSizeInBits();
-
- if (Size != Query.MMODescrs[0].SizeInBits)
- return Size <= 32 && Ty.isVector();
-
- if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
- return true;
- return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
- !isRegisterVectorElementType(Ty.getElementType());
+ return shouldBitcastLoadStoreType(ST, Query.Types[0],
+ Query.MMODescrs[0].SizeInBits);
}, bitcastToRegisterType(0));
+ if (!IsStore) {
+ // Widen suitably aligned loads by loading extra bytes. The standard
+ // legalization actions can't properly express widening memory operands.
+ Actions.customIf([=](const LegalityQuery &Query) -> bool {
+ return shouldWidenLoad(ST, Query, G_LOAD);
+ });
+ }
+
+ // FIXME: load/store narrowing should be moved to lower action
Actions
- .customIf(typeIs(1, Constant32Ptr))
- // Widen suitably aligned loads by loading extra elements.
- .moreElementsIf([=](const LegalityQuery &Query) {
- const LLT Ty = Query.Types[0];
- return Op == G_LOAD && Ty.isVector() &&
- shouldWidenLoadResult(Query, Op);
- }, moreElementsToNextPow2(0))
- .widenScalarIf([=](const LegalityQuery &Query) {
- const LLT Ty = Query.Types[0];
- return Op == G_LOAD && !Ty.isVector() &&
- shouldWidenLoadResult(Query, Op);
- }, widenScalarOrEltToNextPow2(0))
.narrowScalarIf(
[=](const LegalityQuery &Query) -> bool {
return !Query.Types[0].isVector() &&
@@ -1111,15 +1243,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// May need relegalization for the scalars.
return std::make_pair(0, EltTy);
})
- .minScalar(0, S32);
+ .lowerIfMemSizeNotPow2()
+ .minScalar(0, S32);
if (IsStore)
Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
- // TODO: Need a bitcast lower option?
Actions
.widenScalarToNextPow2(0)
- .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
+ .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
+ .lower();
}
auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
@@ -1147,14 +1280,15 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
G_ATOMICRMW_UMIN})
.legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
- {S64, GlobalPtr}, {S64, LocalPtr}});
+ {S64, GlobalPtr}, {S64, LocalPtr},
+ {S32, RegionPtr}, {S64, RegionPtr}});
if (ST.hasFlatAddressSpace()) {
Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
}
if (ST.hasLDSFPAtomics()) {
getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
- .legalFor({{S32, LocalPtr}});
+ .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
}
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
@@ -1207,6 +1341,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Shifts.clampScalar(1, S32, S32);
Shifts.clampScalar(0, S16, S64);
Shifts.widenScalarToNextPow2(0, 16);
+
+ getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
+ .minScalar(0, S16)
+ .scalarize(0)
+ .lower();
} else {
// Make sure we legalize the shift amount type first, as the general
// expansion for the shifted type will produce much worse code if it hasn't
@@ -1214,6 +1353,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Shifts.clampScalar(1, S32, S32);
Shifts.clampScalar(0, S32, S64);
Shifts.widenScalarToNextPow2(0, 32);
+
+ getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
+ .minScalar(0, S32)
+ .scalarize(0)
+ .lower();
}
Shifts.scalarize(0);
@@ -1227,15 +1371,38 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT EltTy = Query.Types[EltTypeIdx];
const LLT VecTy = Query.Types[VecTypeIdx];
const LLT IdxTy = Query.Types[IdxTypeIdx];
- return (EltTy.getSizeInBits() == 16 ||
- EltTy.getSizeInBits() % 32 == 0) &&
- VecTy.getSizeInBits() % 32 == 0 &&
- VecTy.getSizeInBits() <= MaxRegisterSize &&
- IdxTy.getSizeInBits() == 32;
+ const unsigned EltSize = EltTy.getSizeInBits();
+ return (EltSize == 32 || EltSize == 64) &&
+ VecTy.getSizeInBits() % 32 == 0 &&
+ VecTy.getSizeInBits() <= MaxRegisterSize &&
+ IdxTy.getSizeInBits() == 32;
+ })
+ .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
+ bitcastToVectorElement32(VecTypeIdx))
+ //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
+ .bitcastIf(
+ all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
+ [=](const LegalityQuery &Query) {
+ // For > 64-bit element types, try to turn this into a 64-bit
+ // element vector since we may be able to do better indexing
+ // if this is scalar. If not, fall back to 32.
+ const LLT EltTy = Query.Types[EltTypeIdx];
+ const LLT VecTy = Query.Types[VecTypeIdx];
+ const unsigned DstEltSize = EltTy.getSizeInBits();
+ const unsigned VecSize = VecTy.getSizeInBits();
+
+ const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
+ return std::make_pair(
+ VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
})
.clampScalar(EltTypeIdx, S32, S64)
.clampScalar(VecTypeIdx, S32, S64)
- .clampScalar(IdxTypeIdx, S32, S32);
+ .clampScalar(IdxTypeIdx, S32, S32)
+ .clampMaxNumElements(VecTypeIdx, S32, 32)
+ // TODO: Clamp elements for 64-bit vectors?
+ // It should only be necessary with variable indexes.
+ // As a last resort, lower to the stack
+ .lower();
}
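
The element-size selection in the bitcast rule above is simple modular arithmetic. A standalone sketch of just that choice (prefer 64-bit elements when the wide element type is a multiple of 64 bits, otherwise fall back to 32):

  #include <cassert>
  #include <utility>

  // Pick the element width and element count for re-tiling a vector whose
  // elements are wider than 64 bits, mirroring the comment above.
  static std::pair<unsigned, unsigned> retileWideVector(unsigned EltBits,
                                                        unsigned NumElts) {
    unsigned VecBits = EltBits * NumElts;
    unsigned TargetEltBits = (EltBits % 64 == 0) ? 64 : 32;
    return {TargetEltBits, VecBits / TargetEltBits};
  }

  int main() {
    // <2 x s128> -> <4 x s64>: 128 is a multiple of 64, so 64-bit elements.
    assert(retileWideVector(128, 2) == std::make_pair(64u, 4u));
    // <2 x s96> -> <6 x s32>: 96 is not a multiple of 64, so fall back to 32.
    assert(retileWideVector(96, 2) == std::make_pair(32u, 6u));
  }
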
getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
@@ -1306,7 +1473,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// FIXME: Clamp maximum size
getActionDefinitionsBuilder(G_CONCAT_VECTORS)
- .legalIf(isRegisterType(0));
+ .legalIf(all(isRegisterType(0), isRegisterType(1)))
+ .clampMaxNumElements(0, S32, 32)
+ .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
+ .clampMaxNumElements(0, S16, 64);
// TODO: Don't fully scalarize v2s16 pieces? Or combine out those
// pre-legalize.
@@ -1335,6 +1505,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
};
auto &Builder = getActionDefinitionsBuilder(Op)
+ .legalIf(all(isRegisterType(0), isRegisterType(1)))
.lowerFor({{S16, V2S16}})
.lowerIf([=](const LegalityQuery &Query) {
const LLT BigTy = Query.Types[BigTyIdx];
@@ -1390,19 +1561,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
}
return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
})
- .legalIf([=](const LegalityQuery &Query) {
- const LLT &BigTy = Query.Types[BigTyIdx];
- const LLT &LitTy = Query.Types[LitTyIdx];
-
- if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
- return false;
- if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
- return false;
-
- return BigTy.getSizeInBits() % 16 == 0 &&
- LitTy.getSizeInBits() % 16 == 0 &&
- BigTy.getSizeInBits() <= MaxRegisterSize;
- })
// Any vectors left are the wrong size. Scalarize them.
.scalarize(0)
.scalarize(1);
@@ -1427,12 +1585,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
SextInReg.lowerFor({{S32}, {S64}});
}
- // FIXME: Placeholder rule. Really depends on whether the clamp modifier is
- // available, and is selectively legal for s16, s32, v2s16.
- getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT, G_UADDSAT, G_USUBSAT})
- .scalarize(0)
- .clampScalar(0, S16, S32);
-
SextInReg
.scalarize(0)
.clampScalar(0, S32, S64)
@@ -1446,11 +1598,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(G_READCYCLECOUNTER)
.legalFor({S64});
+ getActionDefinitionsBuilder(G_FENCE)
+ .alwaysLegal();
+
getActionDefinitionsBuilder({
// TODO: Verify V_BFI_B32 is generated from expanded bit ops
G_FCOPYSIGN,
G_ATOMIC_CMPXCHG_WITH_SUCCESS,
+ G_ATOMICRMW_NAND,
+ G_ATOMICRMW_FSUB,
G_READ_REGISTER,
G_WRITE_REGISTER,
@@ -1474,7 +1631,6 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
MachineInstr &MI) const {
MachineIRBuilder &B = Helper.MIRBuilder;
MachineRegisterInfo &MRI = *B.getMRI();
- GISelChangeObserver &Observer = Helper.Observer;
switch (MI.getOpcode()) {
case TargetOpcode::G_ADDRSPACE_CAST:
@@ -1483,6 +1639,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
return legalizeFrint(MI, MRI, B);
case TargetOpcode::G_FCEIL:
return legalizeFceil(MI, MRI, B);
+ case TargetOpcode::G_FREM:
+ return legalizeFrem(MI, MRI, B);
case TargetOpcode::G_INTRINSIC_TRUNC:
return legalizeIntrinsicTrunc(MI, MRI, B);
case TargetOpcode::G_SITOFP:
@@ -1510,7 +1668,7 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
case TargetOpcode::G_GLOBAL_VALUE:
return legalizeGlobalValue(MI, MRI, B);
case TargetOpcode::G_LOAD:
- return legalizeLoad(MI, MRI, B, Observer);
+ return legalizeLoad(Helper, MI);
case TargetOpcode::G_FMAD:
return legalizeFMad(MI, MRI, B);
case TargetOpcode::G_FDIV:
@@ -1580,8 +1738,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
Register QueuePtr = MRI.createGenericVirtualRegister(
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
+ if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
return Register();
// Offset into amd_queue_t for group_segment_aperture_base_hi /
@@ -1623,8 +1780,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
const AMDGPUTargetMachine &TM
= static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
+ if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
return true;
}
@@ -1721,6 +1877,7 @@ bool AMDGPULegalizerInfo::legalizeFrint(
auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
+ MI.eraseFromParent();
return true;
}
@@ -1752,7 +1909,24 @@ bool AMDGPULegalizerInfo::legalizeFceil(
return true;
}
-static MachineInstrBuilder extractF64Exponent(unsigned Hi,
+bool AMDGPULegalizerInfo::legalizeFrem(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register Src0Reg = MI.getOperand(1).getReg();
+ Register Src1Reg = MI.getOperand(2).getReg();
+ auto Flags = MI.getFlags();
+ LLT Ty = MRI.getType(DstReg);
+
+ auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
+ auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
+ auto Neg = B.buildFNeg(Ty, Trunc, Flags);
+ B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
+ MI.eraseFromParent();
+ return true;
+}
+
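
The sequence built here computes the usual remainder identity rem(x, y) = x - trunc(x / y) * y, with the final subtract folded into an FMA. A quick scalar check of the identity (ignoring the rounding differences a real fdiv/fma chain can introduce):

  #include <cmath>
  #include <cassert>

  // rem(x, y) = fma(-trunc(x / y), y, x), matching the G_FDIV /
  // G_INTRINSIC_TRUNC / G_FNEG / G_FMA sequence emitted by legalizeFrem.
  static double fremLowered(double X, double Y) {
    return std::fma(-std::trunc(X / Y), Y, X);
  }

  int main() {
    assert(fremLowered(5.5, 2.0) == std::fmod(5.5, 2.0));   // 1.5
    assert(fremLowered(-7.0, 3.0) == std::fmod(-7.0, 3.0)); // -1.0: truncation,
                                                            // not floor
  }
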
+static MachineInstrBuilder extractF64Exponent(Register Hi,
MachineIRBuilder &B) {
const unsigned FractBits = 52;
const unsigned ExpBits = 11;
@@ -1762,6 +1936,7 @@ static MachineInstrBuilder extractF64Exponent(unsigned Hi,
auto Const1 = B.buildConstant(S32, ExpBits);
auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
+ .addUse(Hi)
.addUse(Const0.getReg(0))
.addUse(Const1.getReg(0));
@@ -1809,6 +1984,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
+ MI.eraseFromParent();
return true;
}
@@ -1907,10 +2083,11 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
// FIXME: Artifact combiner probably should have replaced the truncated
// constant before this, so we shouldn't need
// getConstantVRegValWithLookThrough.
- Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
- MI.getOperand(2).getReg(), MRI);
- if (!IdxVal) // Dynamic case will be selected to register indexing.
+ Optional<ValueAndVReg> MaybeIdxVal =
+ getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
+ if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
return true;
+ const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
Register Dst = MI.getOperand(0).getReg();
Register Vec = MI.getOperand(1).getReg();
@@ -1919,8 +2096,8 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
LLT EltTy = VecTy.getElementType();
assert(EltTy == MRI.getType(Dst));
- if (IdxVal->Value < VecTy.getNumElements())
- B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
+ if (IdxVal < VecTy.getNumElements())
+ B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
else
B.buildUndef(Dst);
@@ -1938,11 +2115,12 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
// FIXME: Artifact combiner probably should have replaced the truncated
// constant before this, so we shouldn't need
// getConstantVRegValWithLookThrough.
- Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
- MI.getOperand(3).getReg(), MRI);
- if (!IdxVal) // Dynamic case will be selected to register indexing.
+ Optional<ValueAndVReg> MaybeIdxVal =
+ getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
+ if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
return true;
+ int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
Register Dst = MI.getOperand(0).getReg();
Register Vec = MI.getOperand(1).getReg();
Register Ins = MI.getOperand(2).getReg();
@@ -1951,8 +2129,8 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
LLT EltTy = VecTy.getElementType();
assert(EltTy == MRI.getType(Ins));
- if (IdxVal->Value < VecTy.getNumElements())
- B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
+ if (IdxVal < VecTy.getNumElements())
+ B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
else
B.buildUndef(Dst);
@@ -2043,7 +2221,9 @@ bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
// variable, but since the encoding of $symbol starts 4 bytes after the start
// of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
// small. This requires us to add 4 to the global variable offset in order to
- // compute the correct address.
+ // compute the correct address. Similarly for the s_addc_u32 instruction, the
+ // encoding of $symbol starts 12 bytes after the start of the s_add_u32
+ // instruction.
LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
@@ -2057,7 +2237,7 @@ bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
if (GAFlags == SIInstrInfo::MO_NONE)
MIB.addImm(0);
else
- MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
+ MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
@@ -2078,7 +2258,7 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
- if (!MFI->isEntryFunction()) {
+ if (!MFI->isModuleEntryFunction()) {
const Function &Fn = MF.getFunction();
DiagnosticInfoUnsupported BadLDSDecl(
Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
@@ -2104,6 +2284,25 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
return true; // Leave in place;
}
+ if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
+ Type *Ty = GV->getValueType();
+ // HIP uses an unsized array `extern __shared__ T s[]` or similar
+ // zero-sized type in other languages to declare the dynamic shared
+ // memory which size is not known at the compile time. They will be
+ // allocated by the runtime and placed directly after the static
+ // allocated ones. They all share the same offset.
+ if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
+ // Adjust alignment for that dynamic shared memory array.
+ MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
+ LLT S32 = LLT::scalar(32);
+ auto Sz =
+ B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
+ B.buildIntToPtr(DstReg, Sz);
+ MI.eraseFromParent();
+ return true;
+ }
+ }
+
B.buildConstant(
DstReg,
MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
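
For reference, the source-level idiom the comment above describes, sketched as a HIP-style kernel (kernel name, body and launch line are illustrative only): the extern __shared__ array has no compile-time size, so its address resolves to the end of the statically allocated LDS, which is what the amdgcn.groupstaticsize intrinsic built here provides.

  #include <hip/hip_runtime.h>

  // Dynamic LDS: the array is declared unsized; the launch supplies its size.
  __global__ void scaleWithLds(float *Out, float Factor, int N) {
    extern __shared__ float DynScratch[]; // zero-sized type, dynamic LDS
    int I = threadIdx.x;
    if (I < N) {
      DynScratch[I] = Out[I] * Factor;    // lives after static LDS allocations
      Out[I] = DynScratch[I];
    }
  }

  // Host side: the third launch argument is the dynamic LDS size in bytes.
  //   hipLaunchKernelGGL(scaleWithLds, dim3(1), dim3(256),
  //                      256 * sizeof(float), 0, DevOut, 2.0f, 256);
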
@@ -2154,15 +2353,90 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
return true;
}
-bool AMDGPULegalizerInfo::legalizeLoad(
- MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B, GISelChangeObserver &Observer) const {
- LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
- auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
- Observer.changingInstr(MI);
- MI.getOperand(1).setReg(Cast.getReg(0));
- Observer.changedInstr(MI);
- return true;
+static LLT widenToNextPowerOf2(LLT Ty) {
+ if (Ty.isVector())
+ return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
+ return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
+}
+
+bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
+ MachineIRBuilder &B = Helper.MIRBuilder;
+ MachineRegisterInfo &MRI = *B.getMRI();
+ GISelChangeObserver &Observer = Helper.Observer;
+
+ Register PtrReg = MI.getOperand(1).getReg();
+ LLT PtrTy = MRI.getType(PtrReg);
+ unsigned AddrSpace = PtrTy.getAddressSpace();
+
+ if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+ LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+ auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
+ Observer.changingInstr(MI);
+ MI.getOperand(1).setReg(Cast.getReg(0));
+ Observer.changedInstr(MI);
+ return true;
+ }
+
+ Register ValReg = MI.getOperand(0).getReg();
+ LLT ValTy = MRI.getType(ValReg);
+
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+ const unsigned ValSize = ValTy.getSizeInBits();
+ const unsigned MemSize = 8 * MMO->getSize();
+ const Align MemAlign = MMO->getAlign();
+ const unsigned AlignInBits = 8 * MemAlign.value();
+
+ // Widen non-power-of-2 loads to the alignment if needed
+ if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
+ const unsigned WideMemSize = PowerOf2Ceil(MemSize);
+
+ // This was already the correct extending load result type, so just adjust
+ // the memory type.
+ if (WideMemSize == ValSize) {
+ MachineFunction &MF = B.getMF();
+
+ MachineMemOperand *WideMMO =
+ MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
+ Observer.changingInstr(MI);
+ MI.setMemRefs(MF, {WideMMO});
+ Observer.changedInstr(MI);
+ return true;
+ }
+
+ // Don't bother handling an edge case that should probably never be produced.
+ if (ValSize > WideMemSize)
+ return false;
+
+ LLT WideTy = widenToNextPowerOf2(ValTy);
+
+ Register WideLoad;
+ if (!WideTy.isVector()) {
+ WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
+ B.buildTrunc(ValReg, WideLoad).getReg(0);
+ } else {
+ // Extract the subvector.
+
+ if (isRegisterType(ValTy)) {
+ // If this a case where G_EXTRACT is legal, use it.
+ // (e.g. <3 x s32> -> <4 x s32>)
+ WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
+ B.buildExtract(ValReg, WideLoad, 0);
+ } else {
+ // For cases where the widened type isn't a nice register value, unmerge
+ // from a widened register (e.g. <3 x s16> -> <4 x s16>)
+ B.setInsertPt(B.getMBB(), ++B.getInsertPt());
+ WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
+ B.setInsertPt(B.getMBB(), MI.getIterator());
+ B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
+ }
+ }
+
+ MI.eraseFromParent();
+ return true;
+ }
+
+ return false;
}
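
A standalone sketch of the widening choice made by widenToNextPowerOf2 and used above: vectors round the element count up to a power of two and the value is recovered with an extract or unmerge, scalars round the bit width up and are recovered with a truncate (the returned strings are just labels):

  #include <cassert>
  #include <string>

  // Round up to the next power of two (for values > 0).
  static unsigned nextPow2(unsigned N) {
    unsigned P = 1;
    while (P < N)
      P <<= 1;
    return P;
  }

  static std::string widenType(bool IsVector, unsigned EltBits,
                               unsigned NumElts) {
    if (IsVector)
      return "<" + std::to_string(nextPow2(NumElts)) + " x s" +
             std::to_string(EltBits) + ">";         // recovered via extract/unmerge
    return "s" + std::to_string(nextPow2(EltBits)); // recovered via G_TRUNC
  }

  int main() {
    assert(widenType(true, 32, 3) == "<4 x s32>"); // <3 x s32> load -> <4 x s32>
    assert(widenType(true, 16, 3) == "<4 x s16>"); // <3 x s16> load -> <4 x s16>
    assert(widenType(false, 48, 1) == "s64");      // s48 load -> s64, then trunc
  }
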
bool AMDGPULegalizerInfo::legalizeFMad(
@@ -2194,8 +2468,7 @@ bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
Register CmpVal = MI.getOperand(2).getReg();
Register NewVal = MI.getOperand(3).getReg();
- assert(SITargetLowering::isFlatGlobalAddrSpace(
- MRI.getType(PtrReg).getAddressSpace()) &&
+ assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
"this should not have been custom lowered");
LLT ValTy = MRI.getType(CmpVal);
@@ -2364,23 +2637,42 @@ bool AMDGPULegalizerInfo::legalizeBuildVector(
return true;
}
+// Check that this is a G_XOR x, -1
+static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
+ if (MI.getOpcode() != TargetOpcode::G_XOR)
+ return false;
+ auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
+ return ConstVal && *ConstVal == -1;
+}
+
// Return the use branch instruction, otherwise null if the usage is invalid.
-static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineInstr *&Br,
- MachineBasicBlock *&UncondBrTarget) {
+static MachineInstr *
+verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
+ MachineBasicBlock *&UncondBrTarget, bool &Negated) {
Register CondDef = MI.getOperand(0).getReg();
if (!MRI.hasOneNonDBGUse(CondDef))
return nullptr;
MachineBasicBlock *Parent = MI.getParent();
- MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
- if (UseMI.getParent() != Parent ||
- UseMI.getOpcode() != AMDGPU::G_BRCOND)
+ MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
+
+ if (isNot(MRI, *UseMI)) {
+ Register NegatedCond = UseMI->getOperand(0).getReg();
+ if (!MRI.hasOneNonDBGUse(NegatedCond))
+ return nullptr;
+
+ // We're deleting the def of this value, so we need to remove it.
+ UseMI->eraseFromParent();
+
+ UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
+ Negated = true;
+ }
+
+ if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
return nullptr;
// Make sure the cond br is followed by a G_BR, or is the last instruction.
- MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
+ MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
if (Next == Parent->end()) {
MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
@@ -2393,84 +2685,19 @@ static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
UncondBrTarget = Br->getOperand(0).getMBB();
}
- return &UseMI;
-}
-
-Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
- MachineRegisterInfo &MRI,
- Register LiveIn,
- Register PhyReg) const {
- assert(PhyReg.isPhysical() && "Physical register expected");
-
- // Insert the live-in copy, if required, by defining destination virtual
- // register.
- // FIXME: It seems EmitLiveInCopies isn't called anywhere?
- if (!MRI.getVRegDef(LiveIn)) {
- // FIXME: Should have scoped insert pt
- MachineBasicBlock &OrigInsBB = B.getMBB();
- auto OrigInsPt = B.getInsertPt();
-
- MachineBasicBlock &EntryMBB = B.getMF().front();
- EntryMBB.addLiveIn(PhyReg);
- B.setInsertPt(EntryMBB, EntryMBB.begin());
- B.buildCopy(LiveIn, PhyReg);
-
- B.setInsertPt(OrigInsBB, OrigInsPt);
- }
-
- return LiveIn;
-}
-
-Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
- MachineRegisterInfo &MRI,
- Register PhyReg, LLT Ty,
- bool InsertLiveInCopy) const {
- assert(PhyReg.isPhysical() && "Physical register expected");
-
- // Get or create virtual live-in regester
- Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
- if (!LiveIn) {
- LiveIn = MRI.createGenericVirtualRegister(Ty);
- MRI.addLiveIn(PhyReg, LiveIn);
- }
-
- // When the actual true copy required is from virtual register to physical
- // register (to be inserted later), live-in copy insertion from physical
- // to register virtual register is not required
- if (!InsertLiveInCopy)
- return LiveIn;
-
- return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
-}
-
-const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
- MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
- const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
- const ArgDescriptor *Arg;
- const TargetRegisterClass *RC;
- LLT ArgTy;
- std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
- if (!Arg) {
- LLVM_DEBUG(dbgs() << "Required arg register missing\n");
- return nullptr;
- }
- return Arg;
+ return UseMI;
}
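
When Negated is set here (the G_XOR with -1 that isNot() matches is the generic form of a logical not on the s1 condition), the caller compensates by swapping the conditional and unconditional branch targets further down in legalizeIntrinsic: branching on the inverted condition to (taken, fallthrough) is the same as branching on the original condition with the two targets swapped. A tiny self-contained check of that equivalence (hypothetical helper names, not part of the change):

    #include <cassert>

    // Branching on the inverted condition to (TakenBB, FallthroughBB) is the
    // same as branching on the original condition with the targets swapped --
    // the equivalence behind the Negated flag and the later std::swap.
    static int branchOnNot(bool Cond, int TakenBB, int FallthroughBB) {
      return !Cond ? TakenBB : FallthroughBB;
    }
    static int branchOnSwapped(bool Cond, int TakenBB, int FallthroughBB) {
      return Cond ? FallthroughBB : TakenBB;
    }

    int main() {
      assert(branchOnNot(false, 1, 2) == branchOnSwapped(false, 1, 2));
      assert(branchOnNot(true, 1, 2) == branchOnSwapped(true, 1, 2));
    }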
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
- const ArgDescriptor *Arg) const {
- if (!Arg->isRegister() || !Arg->getRegister().isValid())
- return false; // TODO: Handle these
-
- Register SrcReg = Arg->getRegister();
- assert(SrcReg.isPhysical() && "Physical register expected");
+ const ArgDescriptor *Arg,
+ const TargetRegisterClass *ArgRC,
+ LLT ArgTy) const {
+ MCRegister SrcReg = Arg->getRegister();
+ assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
assert(DstReg.isVirtual() && "Virtual register expected");
- MachineRegisterInfo &MRI = *B.getMRI();
-
- LLT Ty = MRI.getType(DstReg);
- Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
-
+ Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
+ ArgTy);
if (Arg->isMasked()) {
// TODO: Should we try to emit this once in the entry block?
const LLT S32 = LLT::scalar(32);
@@ -2492,15 +2719,24 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
return true;
}
-bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
- MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
+bool AMDGPULegalizerInfo::loadInputValue(
+ Register DstReg, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
+ const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+ const ArgDescriptor *Arg;
+ const TargetRegisterClass *ArgRC;
+ LLT ArgTy;
+ std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
- const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
- if (!Arg)
- return false;
+ if (!Arg->isRegister() || !Arg->getRegister().isValid())
+ return false; // TODO: Handle these
+ return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
+}
- if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
+bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
+ AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
+ if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
return false;
MI.eraseFromParent();
@@ -2516,9 +2752,6 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
LLT S32 = LLT::scalar(32);
LLT S64 = LLT::scalar(64);
- if (legalizeFastUnsafeFDIV(MI, MRI, B))
- return true;
-
if (DstTy == S16)
return legalizeFDIV16(MI, MRI, B);
if (DstTy == S32)
@@ -2813,22 +3046,14 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
-
uint16_t Flags = MI.getFlags();
-
LLT ResTy = MRI.getType(Res);
- LLT S32 = LLT::scalar(32);
- LLT S64 = LLT::scalar(64);
const MachineFunction &MF = B.getMF();
- bool Unsafe =
- MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
-
- if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
- return false;
+ bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
+ MI.getFlag(MachineInstr::FmAfn);
- if (!Unsafe && ResTy == S32 &&
- MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
+ if (!AllowInaccurateRcp)
return false;
if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
@@ -2855,22 +3080,58 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
}
// x / y -> x * (1.0 / y)
- if (Unsafe) {
- auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
- .addUse(RHS)
- .setMIFlags(Flags);
- B.buildFMul(Res, LHS, RCP, Flags);
+ auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
+ .addUse(RHS)
+ .setMIFlags(Flags);
+ B.buildFMul(Res, LHS, RCP, Flags);
- MI.eraseFromParent();
- return true;
- }
+ MI.eraseFromParent();
+ return true;
+}
- return false;
+bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ Register Res = MI.getOperand(0).getReg();
+ Register X = MI.getOperand(1).getReg();
+ Register Y = MI.getOperand(2).getReg();
+ uint16_t Flags = MI.getFlags();
+ LLT ResTy = MRI.getType(Res);
+
+ const MachineFunction &MF = B.getMF();
+ bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
+ MI.getFlag(MachineInstr::FmAfn);
+
+ if (!AllowInaccurateRcp)
+ return false;
+
+ auto NegY = B.buildFNeg(ResTy, Y);
+ auto One = B.buildFConstant(ResTy, 1.0);
+
+ auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
+ .addUse(Y)
+ .setMIFlags(Flags);
+
+ auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
+ R = B.buildFMA(ResTy, Tmp0, R, R);
+
+ auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
+ R = B.buildFMA(ResTy, Tmp1, R, R);
+
+ auto Ret = B.buildFMul(ResTy, X, R);
+ auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
+
+ B.buildFMA(Res, Tmp2, R, Ret);
+ MI.eraseFromParent();
+ return true;
}
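
The FMA chain built above is the standard Newton–Raphson refinement of rcp(y) followed by one residual correction of the quotient. Reading the builder calls back as math, with r_0 = rcp(y):

\[
r_{i+1} = \mathrm{fma}(\mathrm{fma}(-y,\,r_i,\,1),\,r_i,\,r_i) = r_i\,(2 - y\,r_i), \qquad i = 0, 1,
\]
\[
q_0 = x \cdot r_2, \qquad q = \mathrm{fma}(\mathrm{fma}(-y,\,q_0,\,x),\,r_2,\,q_0) = q_0 + r_2\,(x - y\,q_0).
\]

Each refinement roughly doubles the number of correct bits in the reciprocal estimate, which is roughly why the f64 expansion uses two refinements plus a final correction while the f32/f16 fast path above gets by with a bare x * rcp(y).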
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
+ if (legalizeFastUnsafeFDIV(MI, MRI, B))
+ return true;
+
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
@@ -2933,6 +3194,9 @@ static void toggleSPDenormMode(bool Enable,
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
+ if (legalizeFastUnsafeFDIV(MI, MRI, B))
+ return true;
+
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
@@ -2999,6 +3263,9 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
+ if (legalizeFastUnsafeFDIV64(MI, MRI, B))
+ return true;
+
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
@@ -3109,35 +3376,118 @@ bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
return true;
}
-bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const {
+// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
+// FIXME: Why do we handle this one but not other removed instructions?
+//
+// Reciprocal square root. The clamp prevents infinite results, clamping
+// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
+// +-max_float.
+bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ return true;
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(2).getReg();
+ auto Flags = MI.getFlags();
+
+ LLT Ty = MRI.getType(Dst);
+
+ const fltSemantics *FltSemantics;
+ if (Ty == LLT::scalar(32))
+ FltSemantics = &APFloat::IEEEsingle();
+ else if (Ty == LLT::scalar(64))
+ FltSemantics = &APFloat::IEEEdouble();
+ else
+ return false;
+
+ auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
+ .addUse(Src)
+ .setMIFlags(Flags);
+
+ // We don't need to concern ourselves with the snan handling difference, since
+ // the rsq result is already quieted (or not); use the variant that will
+ // select directly.
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
- if (!MFI->isEntryFunction()) {
- return legalizePreloadedArgIntrin(MI, MRI, B,
- AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
+ const bool UseIEEE = MFI->getMode().IEEE;
+
+ auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
+ auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
+ B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
+
+ auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
+
+ if (UseIEEE)
+ B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
+ else
+ B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
+ MI.eraseFromParent();
+ return true;
+}
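
In other words, on targets where the dedicated rsq_clamp instruction is unavailable the intrinsic is expanded as

\[
D = \mathrm{fmaxnum}\big(\mathrm{fminnum}(\mathrm{rsq}(S_0),\ +\mathrm{max\_float}),\ -\mathrm{max\_float}\big),
\]

with the IEEE flavours of min/max chosen when the function's FP mode has IEEE set, matching the clamp-to-±max_float semantics described in the comment above.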
+
+static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
+ switch (IID) {
+ case Intrinsic::amdgcn_ds_fadd:
+ return AMDGPU::G_ATOMICRMW_FADD;
+ case Intrinsic::amdgcn_ds_fmin:
+ return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
+ case Intrinsic::amdgcn_ds_fmax:
+ return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
+ default:
+ llvm_unreachable("not a DS FP intrinsic");
}
+}
+bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
+ MachineInstr &MI,
+ Intrinsic::ID IID) const {
+ GISelChangeObserver &Observer = Helper.Observer;
+ Observer.changingInstr(MI);
+
+ MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
+
+ // The remaining operands were used to set fields in the MemOperand on
+ // construction.
+ for (int I = 6; I > 3; --I)
+ MI.RemoveOperand(I);
+
+ MI.RemoveOperand(1); // Remove the intrinsic ID.
+ Observer.changedInstr(MI);
+ return true;
+}
+
+bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
uint64_t Offset =
ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
- Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
- const ArgDescriptor *Arg;
- const TargetRegisterClass *RC;
- LLT ArgTy;
- std::tie(Arg, RC, ArgTy) =
- MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
- if (!Arg)
- return false;
-
Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
- if (!loadInputValue(KernargPtrReg, B, Arg))
+ if (!loadInputValue(KernargPtrReg, B,
+ AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
return false;
+ // FIXME: This should be nuw
B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+ if (!MFI->isEntryFunction()) {
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
+ }
+
+ Register DstReg = MI.getOperand(0).getReg();
+ if (!getImplicitArgPtr(DstReg, MRI, B))
+ return false;
+
MI.eraseFromParent();
return true;
}
@@ -3147,7 +3497,9 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
MachineIRBuilder &B,
unsigned AddrSpace) const {
Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
- auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
+ auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
+ Register Hi32 = Unmerge.getReg(1);
+
B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
MI.eraseFromParent();
return true;
@@ -3165,11 +3517,10 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
const unsigned MaxImm = 4095;
Register BaseReg;
unsigned TotalConstOffset;
- MachineInstr *OffsetDef;
const LLT S32 = LLT::scalar(32);
- std::tie(BaseReg, TotalConstOffset, OffsetDef)
- = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
+ std::tie(BaseReg, TotalConstOffset) =
+ AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
unsigned ImmOffset = TotalConstOffset;
@@ -3205,24 +3556,58 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
MachineRegisterInfo &MRI,
- Register Reg) const {
- if (!ST.hasUnpackedD16VMem())
- return Reg;
-
+ Register Reg,
+ bool ImageStore) const {
const LLT S16 = LLT::scalar(16);
const LLT S32 = LLT::scalar(32);
LLT StoreVT = MRI.getType(Reg);
assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
- auto Unmerge = B.buildUnmerge(S16, Reg);
+ if (ST.hasUnpackedD16VMem()) {
+ auto Unmerge = B.buildUnmerge(S16, Reg);
- SmallVector<Register, 4> WideRegs;
- for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
- WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
+ SmallVector<Register, 4> WideRegs;
+ for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
+ WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
- int NumElts = StoreVT.getNumElements();
+ int NumElts = StoreVT.getNumElements();
- return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
+ return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
+ }
+
+ if (ImageStore && ST.hasImageStoreD16Bug()) {
+ if (StoreVT.getNumElements() == 2) {
+ SmallVector<Register, 4> PackedRegs;
+ Reg = B.buildBitcast(S32, Reg).getReg(0);
+ PackedRegs.push_back(Reg);
+ PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
+ return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
+ }
+
+ if (StoreVT.getNumElements() == 3) {
+ SmallVector<Register, 4> PackedRegs;
+ auto Unmerge = B.buildUnmerge(S16, Reg);
+ for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
+ PackedRegs.push_back(Unmerge.getReg(I));
+ PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
+ Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
+ return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
+ }
+
+ if (StoreVT.getNumElements() == 4) {
+ SmallVector<Register, 4> PackedRegs;
+ Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
+ auto Unmerge = B.buildUnmerge(S32, Reg);
+ for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
+ PackedRegs.push_back(Unmerge.getReg(I));
+ PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
+ return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
+ }
+
+ llvm_unreachable("invalid data type");
+ }
+
+ return Reg;
}
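
The <3 x s16> case above relies on padding to six s16 lanes and bitcasting, so the resulting <3 x s32> dwords carry the original half-words in their low/high 16 bits. A small host-side illustration of that bit layout (little-endian lane packing assumed; this mirrors only the bit pattern, not the MIR itself):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      // <3 x s16> store data, padded to <6 x s16> with undef (here zero) lanes,
      // then reinterpreted as <3 x s32>, mirroring the repack above.
      uint16_t Padded[6] = {0x1111, 0x2222, 0x3333, 0, 0, 0};
      uint32_t Dwords[3];
      std::memcpy(Dwords, Padded, sizeof(Dwords));
      for (int I = 0; I != 3; ++I)
        std::printf("dword %d = 0x%08x\n", I, (unsigned)Dwords[I]);
      // Prints: 0x22221111, 0x00003333, 0x00000000
    }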
Register AMDGPULegalizerInfo::fixStoreSourceType(
@@ -3513,6 +3898,9 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
+ case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
+ case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
default:
llvm_unreachable("unhandled atomic opcode");
}
@@ -3523,12 +3911,20 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
Intrinsic::ID IID) const {
const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
+ const bool HasReturn = MI.getNumExplicitDefs() != 0;
- Register Dst = MI.getOperand(0).getReg();
- Register VData = MI.getOperand(2).getReg();
+ Register Dst;
- Register CmpVal;
int OpOffset = 0;
+ if (HasReturn) {
+ // A few FP atomics do not support return values.
+ Dst = MI.getOperand(0).getReg();
+ } else {
+ OpOffset = -1;
+ }
+
+ Register VData = MI.getOperand(2 + OpOffset).getReg();
+ Register CmpVal;
if (IsCmpSwap) {
CmpVal = MI.getOperand(3 + OpOffset).getReg();
@@ -3536,7 +3932,7 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
}
Register RSrc = MI.getOperand(3 + OpOffset).getReg();
- const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
+ const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
// The struct intrinsic variants add one additional operand over raw.
const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
@@ -3561,9 +3957,12 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
if (!VIndex)
VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
- auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
- .addDef(Dst)
- .addUse(VData); // vdata
+ auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
+
+ if (HasReturn)
+ MIB.addDef(Dst);
+
+ MIB.addUse(VData); // vdata
if (IsCmpSwap)
MIB.addReg(CmpVal);
@@ -3583,38 +3982,41 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
/// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
/// vector with s16 typed elements.
-static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
- SmallVectorImpl<Register> &PackedAddrs,
- int AddrIdx, int DimIdx, int EndIdx,
- int NumGradients) {
+static void packImageA16AddressToDwords(
+ MachineIRBuilder &B, MachineInstr &MI,
+ SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
+ const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
const LLT S16 = LLT::scalar(16);
const LLT V2S16 = LLT::vector(2, 16);
- for (int I = AddrIdx; I < EndIdx; ++I) {
- MachineOperand &SrcOp = MI.getOperand(I);
+ for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
+ MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
if (!SrcOp.isReg())
continue; // _L to _LZ may have eliminated this.
Register AddrReg = SrcOp.getReg();
- if (I < DimIdx) {
+ if (I < Intr->GradientStart) {
AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
PackedAddrs.push_back(AddrReg);
} else {
// Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
// derivatives dx/dh and dx/dv are packed with undef.
if (((I + 1) >= EndIdx) ||
- ((NumGradients / 2) % 2 == 1 &&
- (I == DimIdx + (NumGradients / 2) - 1 ||
- I == DimIdx + NumGradients - 1)) ||
+ ((Intr->NumGradients / 2) % 2 == 1 &&
+ (I == static_cast<unsigned>(Intr->GradientStart +
+ (Intr->NumGradients / 2) - 1) ||
+ I == static_cast<unsigned>(Intr->GradientStart +
+ Intr->NumGradients - 1))) ||
// Check for _L to _LZ optimization
- !MI.getOperand(I + 1).isReg()) {
+ !MI.getOperand(ArgOffset + I + 1).isReg()) {
PackedAddrs.push_back(
B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
.getReg(0));
} else {
PackedAddrs.push_back(
- B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
+ B.buildBuildVector(
+ V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
.getReg(0));
++I;
}
@@ -3673,43 +4075,37 @@ static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
/// now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
- MachineInstr &MI, MachineIRBuilder &B,
- GISelChangeObserver &Observer,
- const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
+ MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
+ const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
- const int NumDefs = MI.getNumExplicitDefs();
+ const unsigned NumDefs = MI.getNumExplicitDefs();
+ const unsigned ArgOffset = NumDefs + 1;
bool IsTFE = NumDefs == 2;
// We are only processing the operands of d16 image operations on subtargets
// that use the unpacked register layout, or need to repack the TFE result.
// TODO: Do we need to guard against already legalized intrinsics?
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
- AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
+ AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
MachineRegisterInfo *MRI = B.getMRI();
const LLT S32 = LLT::scalar(32);
const LLT S16 = LLT::scalar(16);
const LLT V2S16 = LLT::vector(2, 16);
- // Index of first address argument
- const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
-
- int NumVAddrs, NumGradients;
- std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
- const int DMaskIdx = BaseOpcode->Atomic ? -1 :
- getDMaskIdx(BaseOpcode, NumDefs);
unsigned DMask = 0;
// Check for 16 bit addresses and pack if true.
- int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
- LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
- LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
+ LLT GradTy =
+ MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
+ LLT AddrTy =
+ MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
const bool IsG16 = GradTy == S16;
const bool IsA16 = AddrTy == S16;
int DMaskLanes = 0;
if (!BaseOpcode->Atomic) {
- DMask = MI.getOperand(DMaskIdx).getImm();
+ DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
if (BaseOpcode->Gather4) {
DMaskLanes = 4;
} else if (DMask != 0) {
@@ -3736,7 +4132,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
if (IsTFE && DMask == 0) {
DMask = 0x1;
DMaskLanes = 1;
- MI.getOperand(DMaskIdx).setImm(DMask);
+ MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
}
if (BaseOpcode->Atomic) {
@@ -3757,41 +4153,41 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
}
}
- int CorrectedNumVAddrs = NumVAddrs;
+ unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
// Optimize _L to _LZ when _L is zero
if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
- AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
+ AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
const ConstantFP *ConstantLod;
- const int LodIdx = AddrIdx + NumVAddrs - 1;
- if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
+ if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
+ m_GFCst(ConstantLod))) {
if (ConstantLod->isZero() || ConstantLod->isNegative()) {
// Set new opcode to _lz variant of _l, and change the intrinsic ID.
- ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
- LZMappingInfo->LZ, ImageDimIntr->Dim);
+ const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
+ AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
+ Intr->Dim);
// The starting indexes should remain in the same place.
- --NumVAddrs;
--CorrectedNumVAddrs;
- MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
- static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
- MI.RemoveOperand(LodIdx);
+ MI.getOperand(MI.getNumExplicitDefs())
+ .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
+ MI.RemoveOperand(ArgOffset + Intr->LodIndex);
+ Intr = NewImageDimIntr;
}
}
}
// Optimize _mip away, when 'lod' is zero
- if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
+ if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
int64_t ConstantLod;
- const int LodIdx = AddrIdx + NumVAddrs - 1;
-
- if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
+ if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
+ m_ICst(ConstantLod))) {
if (ConstantLod == 0) {
// TODO: Change intrinsic opcode and remove operand instead of replacing
// it with 0, as the _L to _LZ handling is done above.
- MI.getOperand(LodIdx).ChangeToImmediate(0);
+ MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
--CorrectedNumVAddrs;
}
}
@@ -3806,18 +4202,17 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
} else if (!ST.hasG16())
return false;
- if (NumVAddrs > 1) {
+ if (Intr->NumVAddrs > 1) {
SmallVector<Register, 4> PackedRegs;
// Don't compress addresses for G16
- const int PackEndIdx =
- IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
- packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
- PackEndIdx, NumGradients);
+ const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
+ packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
+ PackEndIdx);
if (!IsA16) {
// Add uncompressed address
- for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
- int AddrReg = MI.getOperand(I).getReg();
+ for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
+ int AddrReg = MI.getOperand(ArgOffset + I).getReg();
assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
PackedRegs.push_back(AddrReg);
}
@@ -3833,9 +4228,9 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
PackedRegs.resize(1);
}
- const int NumPacked = PackedRegs.size();
- for (int I = 0; I != NumVAddrs; ++I) {
- MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
+ const unsigned NumPacked = PackedRegs.size();
+ for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
+ MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
if (!SrcOp.isReg()) {
assert(SrcOp.isImm() && SrcOp.getImm() == 0);
continue;
@@ -3843,8 +4238,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
assert(SrcOp.getReg() != AMDGPU::NoRegister);
- if (I < NumPacked)
- SrcOp.setReg(PackedRegs[I]);
+ if (I - Intr->VAddrStart < NumPacked)
+ SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
else
SrcOp.setReg(AMDGPU::NoRegister);
}
@@ -3863,8 +4258,9 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
// allocation when possible.
const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
- if (!UseNSA && NumVAddrs > 1)
- convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
+ if (!UseNSA && Intr->NumVAddrs > 1)
+ convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
+ Intr->NumVAddrs);
}
int Flags = 0;
@@ -3881,7 +4277,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
if (!Ty.isVector() || Ty.getElementType() != S16)
return true;
- Register RepackedReg = handleD16VData(B, *MRI, VData);
+ Register RepackedReg = handleD16VData(B, *MRI, VData, true);
if (RepackedReg != VData) {
MI.getOperand(1).setReg(RepackedReg);
}
@@ -4053,8 +4449,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
}
bool AMDGPULegalizerInfo::legalizeSBufferLoad(
- MachineInstr &MI, MachineIRBuilder &B,
- GISelChangeObserver &Observer) const {
+ LegalizerHelper &Helper, MachineInstr &MI) const {
+ MachineIRBuilder &B = Helper.MIRBuilder;
+ GISelChangeObserver &Observer = Helper.Observer;
+
Register Dst = MI.getOperand(0).getReg();
LLT Ty = B.getMRI()->getType(Dst);
unsigned Size = Ty.getSizeInBits();
@@ -4062,6 +4460,13 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
Observer.changingInstr(MI);
+ if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
+ Ty = getBitcastRegisterType(Ty);
+ Helper.bitcastDst(MI, Ty, 0);
+ Dst = MI.getOperand(0).getReg();
+ B.setInsertPt(B.getMBB(), MI);
+ }
+
// FIXME: We don't really need this intermediate instruction. The intrinsic
// should be fixed to have a memory operand. Since it's readnone, we're not
// allowed to add one.
@@ -4083,8 +4488,6 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
// always be legal. We may need to restore this to a 96-bit result if it turns
// out this needs to be converted to a vector load during RegBankSelect.
if (!isPowerOf2_32(Size)) {
- LegalizerHelper Helper(MF, *this, Observer, B);
-
if (Ty.isVector())
Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
else
@@ -4095,6 +4498,7 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
return true;
}
+// TODO: Move to selection
bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -4105,17 +4509,14 @@ bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
} else {
// Pass queue pointer to trap handler as input, and insert trap instruction
// Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
- const ArgDescriptor *Arg =
- getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
- if (!Arg)
- return false;
MachineRegisterInfo &MRI = *B.getMRI();
- Register SGPR01(AMDGPU::SGPR0_SGPR1);
- Register LiveIn = getLiveInRegister(
- B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
- /*InsertLiveInCopy=*/false);
- if (!loadInputValue(LiveIn, B, Arg))
+
+ Register LiveIn =
+ MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
+ if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
return false;
+
+ Register SGPR01(AMDGPU::SGPR0_SGPR1);
B.buildCopy(SGPR01, LiveIn);
B.buildInstr(AMDGPU::S_TRAP)
.addImm(GCNSubtarget::TrapIDLLVMTrap)
@@ -4146,6 +4547,78 @@ bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
return true;
}
+bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ const LLT S16 = LLT::scalar(16);
+ const LLT S32 = LLT::scalar(32);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register NodePtr = MI.getOperand(2).getReg();
+ Register RayExtent = MI.getOperand(3).getReg();
+ Register RayOrigin = MI.getOperand(4).getReg();
+ Register RayDir = MI.getOperand(5).getReg();
+ Register RayInvDir = MI.getOperand(6).getReg();
+ Register TDescr = MI.getOperand(7).getReg();
+
+ bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
+ bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
+ unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
+ : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
+ : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
+ : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
+
+ SmallVector<Register, 12> Ops;
+ if (Is64) {
+ auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
+ Ops.push_back(Unmerge.getReg(0));
+ Ops.push_back(Unmerge.getReg(1));
+ } else {
+ Ops.push_back(NodePtr);
+ }
+ Ops.push_back(RayExtent);
+
+ auto packLanes = [&Ops, &S32, &B] (Register Src) {
+ auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
+ Ops.push_back(Unmerge.getReg(0));
+ Ops.push_back(Unmerge.getReg(1));
+ Ops.push_back(Unmerge.getReg(2));
+ };
+
+ packLanes(RayOrigin);
+ if (IsA16) {
+ auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
+ auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
+ Register R1 = MRI.createGenericVirtualRegister(S32);
+ Register R2 = MRI.createGenericVirtualRegister(S32);
+ Register R3 = MRI.createGenericVirtualRegister(S32);
+ B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
+ B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
+ B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
+ Ops.push_back(R1);
+ Ops.push_back(R2);
+ Ops.push_back(R3);
+ } else {
+ packLanes(RayDir);
+ packLanes(RayInvDir);
+ }
+
+ auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
+ .addDef(DstReg)
+ .addImm(Opcode);
+
+ for (Register R : Ops) {
+ MIB.addUse(R);
+ }
+
+ MIB.addUse(TDescr)
+ .addImm(IsA16 ? 1 : 0)
+ .cloneMemRefs(MI);
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI) const {
MachineIRBuilder &B = Helper.MIRBuilder;
@@ -4158,7 +4631,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_else: {
MachineInstr *Br = nullptr;
MachineBasicBlock *UncondBrTarget = nullptr;
- if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
+ bool Negated = false;
+ if (MachineInstr *BrCond =
+ verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
const SIRegisterInfo *TRI
= static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
@@ -4166,6 +4641,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
Register Use = MI.getOperand(3).getReg();
MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
+
+ if (Negated)
+ std::swap(CondBrTarget, UncondBrTarget);
+
B.setInsertPt(B.getMBB(), BrCond->getIterator());
if (IntrID == Intrinsic::amdgcn_if) {
B.buildInstr(AMDGPU::SI_IF)
@@ -4174,10 +4653,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
.addMBB(UncondBrTarget);
} else {
B.buildInstr(AMDGPU::SI_ELSE)
- .addDef(Def)
- .addUse(Use)
- .addMBB(UncondBrTarget)
- .addImm(0);
+ .addDef(Def)
+ .addUse(Use)
+ .addMBB(UncondBrTarget);
}
if (Br) {
@@ -4201,13 +4679,18 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_loop: {
MachineInstr *Br = nullptr;
MachineBasicBlock *UncondBrTarget = nullptr;
- if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
+ bool Negated = false;
+ if (MachineInstr *BrCond =
+ verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
const SIRegisterInfo *TRI
= static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
Register Reg = MI.getOperand(2).getReg();
+ if (Negated)
+ std::swap(CondBrTarget, UncondBrTarget);
+
B.setInsertPt(B.getMBB(), BrCond->getIterator());
B.buildInstr(AMDGPU::SI_LOOP)
.addUse(Reg)
@@ -4280,7 +4763,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return true;
}
case Intrinsic::amdgcn_s_buffer_load:
- return legalizeSBufferLoad(MI, B, Helper.Observer);
+ return legalizeSBufferLoad(Helper, MI);
case Intrinsic::amdgcn_raw_buffer_store:
case Intrinsic::amdgcn_struct_buffer_store:
return legalizeBufferStore(MI, MRI, B, false, false);
@@ -4323,6 +4806,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_struct_buffer_atomic_inc:
case Intrinsic::amdgcn_raw_buffer_atomic_dec:
case Intrinsic::amdgcn_struct_buffer_atomic_dec:
+ case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
+ case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
return legalizeBufferAtomic(MI, B, IntrID);
@@ -4334,6 +4819,14 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return legalizeTrapIntrinsic(MI, MRI, B);
case Intrinsic::debugtrap:
return legalizeDebugTrapIntrinsic(MI, MRI, B);
+ case Intrinsic::amdgcn_rsq_clamp:
+ return legalizeRsqClampIntrinsic(MI, MRI, B);
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax:
+ return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
+ case Intrinsic::amdgcn_image_bvh_intersect_ray:
+ return legalizeBVHIntrinsic(MI, B);
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index ce32bbf76b34..87e8b2128a25 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -23,9 +23,13 @@ namespace llvm {
class GCNTargetMachine;
class LLVMContext;
class GCNSubtarget;
+class MachineIRBuilder;
+namespace AMDGPU {
+struct ImageDimIntrinsicInfo;
+}
/// This class provides the information for the target register banks.
-class AMDGPULegalizerInfo : public LegalizerInfo {
+class AMDGPULegalizerInfo final : public LegalizerInfo {
const GCNSubtarget &ST;
public:
@@ -44,6 +48,8 @@ public:
MachineIRBuilder &B) const;
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
+ bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI,
@@ -67,9 +73,7 @@ public:
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
- bool legalizeLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B,
- GISelChangeObserver &Observer) const;
+ bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const;
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
@@ -86,16 +90,11 @@ public:
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
- Register getLiveInRegister(MachineIRBuilder &B, MachineRegisterInfo &MRI,
- Register PhyReg, LLT Ty,
- bool InsertLiveInCopy = true) const;
- Register insertLiveInCopy(MachineIRBuilder &B, MachineRegisterInfo &MRI,
- Register LiveIn, Register PhyReg) const;
- const ArgDescriptor *
- getArgDescriptor(MachineIRBuilder &B,
- AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
bool loadInputValue(Register DstReg, MachineIRBuilder &B,
- const ArgDescriptor *Arg) const;
+ const ArgDescriptor *Arg,
+ const TargetRegisterClass *ArgRC, LLT ArgTy) const;
+ bool loadInputValue(Register DstReg, MachineIRBuilder &B,
+ AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
bool legalizePreloadedArgIntrin(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
@@ -130,9 +129,20 @@ public:
MachineIRBuilder &B) const;
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
+ bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
+ bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+
+ bool legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
+ MachineInstr &MI, Intrinsic::ID IID) const;
+
+ bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI,
@@ -142,7 +152,7 @@ public:
splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const;
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
- Register Reg) const;
+ Register Reg, bool ImageStore = false) const;
bool legalizeRawBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, bool IsFormat) const;
bool legalizeRawBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
@@ -154,19 +164,19 @@ public:
MachineIRBuilder &B, bool IsTyped,
bool IsFormat) const;
bool legalizeBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B, bool IsTyped,
- bool IsFormat) const;
+ MachineIRBuilder &B, bool IsFormat,
+ bool IsTyped) const;
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
Intrinsic::ID IID) const;
+ bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;
+
bool legalizeImageIntrinsic(
MachineInstr &MI, MachineIRBuilder &B,
GISelChangeObserver &Observer,
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const;
- bool legalizeSBufferLoad(
- MachineInstr &MI, MachineIRBuilder &B,
- GISelChangeObserver &Observer) const;
+ bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const;
bool legalizeAtomicIncDec(MachineInstr &MI, MachineIRBuilder &B,
bool IsInc) const;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 4a14259f1bdb..6b7f57252b7a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -13,27 +13,12 @@
#include "AMDGPU.h"
#include "AMDGPULibFunc.h"
-#include "AMDGPUSubtarget.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSet.h"
+#include "GCNSubtarget.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Loads.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
-#include <cmath>
-#include <vector>
#define DEBUG_TYPE "amdgpu-simplifylib"
@@ -495,8 +480,7 @@ bool AMDGPULibCalls::isUnsafeMath(const CallInst *CI) const {
}
bool AMDGPULibCalls::useNativeFunc(const StringRef F) const {
- return AllNative ||
- std::find(UseNative.begin(), UseNative.end(), F) != UseNative.end();
+ return AllNative || llvm::is_contained(UseNative, F);
}
void AMDGPULibCalls::initNativeFuncs() {
@@ -1289,6 +1273,7 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
BasicBlock * const CBB = CI->getParent();
int const MaxScan = 30;
+ bool Changed = false;
{ // fold in load value.
LoadInst *LI = dyn_cast<LoadInst>(CArgVal);
@@ -1296,6 +1281,7 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
BasicBlock::iterator BBI = LI->getIterator();
Value *AvailableVal = FindAvailableLoadedValue(LI, CBB, BBI, MaxScan, AA);
if (AvailableVal) {
+ Changed = true;
CArgVal->replaceAllUsesWith(AvailableVal);
if (CArgVal->getNumUses() == 0)
LI->eraseFromParent();
@@ -1331,7 +1317,8 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
if (UI) break;
}
- if (!UI) return false;
+ if (!UI)
+ return Changed;
// Merge the sin and cos.
@@ -1340,7 +1327,8 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo);
nf.getLeads()[0].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);
FunctionCallee Fsincos = getFunction(M, nf);
- if (!Fsincos) return false;
+ if (!Fsincos)
+ return Changed;
BasicBlock::iterator ItOld = B.GetInsertPoint();
AllocaInst *Alloc = insertAlloca(UI, B, "__sincos_");
@@ -1747,6 +1735,40 @@ bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) {
return Changed;
}
+PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ AMDGPULibCalls Simplifier(&TM);
+ Simplifier.initNativeFuncs();
+
+ bool Changed = false;
+ auto AA = &AM.getResult<AAManager>(F);
+
+ LLVM_DEBUG(dbgs() << "AMDIC: process function ";
+ F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);
+
+ for (auto &BB : F) {
+ for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) {
+ // Ignore non-calls.
+ CallInst *CI = dyn_cast<CallInst>(I);
+ ++I;
+ // Ignore intrinsics that do not become real instructions.
+ if (!CI || isa<DbgInfoIntrinsic>(CI) || CI->isLifetimeStartOrEnd())
+ continue;
+
+ // Ignore indirect calls.
+ Function *Callee = CI->getCalledFunction();
+ if (Callee == 0)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n";
+ dbgs().flush());
+ if (Simplifier.fold(CI, AA))
+ Changed = true;
+ }
+ }
+ return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
bool AMDGPUUseNativeCalls::runOnFunction(Function &F) {
if (skipFunction(F) || UseNative.empty())
return false;
@@ -1769,3 +1791,32 @@ bool AMDGPUUseNativeCalls::runOnFunction(Function &F) {
}
return Changed;
}
+
+PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ if (UseNative.empty())
+ return PreservedAnalyses::all();
+
+ AMDGPULibCalls Simplifier;
+ Simplifier.initNativeFuncs();
+
+ bool Changed = false;
+ for (auto &BB : F) {
+ for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) {
+ // Ignore non-calls.
+ CallInst *CI = dyn_cast<CallInst>(I);
+ ++I;
+ if (!CI)
+ continue;
+
+ // Ignore indirect calls.
+ Function *Callee = CI->getCalledFunction();
+ if (Callee == 0)
+ continue;
+
+ if (Simplifier.useNative(CI))
+ Changed = true;
+ }
+ }
+ return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
index 2b5143ba7506..646087cdb7db 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
@@ -12,17 +12,14 @@
#include "AMDGPULibFunc.h"
#include "AMDGPU.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringSwitch.h"
-#include "llvm/IR/Attributes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/Support/raw_ostream.h"
-#include <string>
using namespace llvm;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
index 54c15e4e4d39..714e74faaf13 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -8,12 +8,16 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
#define DEBUG_TYPE "amdgpu-lower-intrinsics"
@@ -131,7 +135,9 @@ bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const {
if (!CI)
continue;
- Changed |= AMDGPUSubtarget::get(TM, F).makeLIDRangeMetadata(CI);
+ Function *Caller = CI->getParent()->getParent();
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *Caller);
+ Changed |= ST.makeLIDRangeMetadata(CI);
}
return Changed;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index 62ab5bb55a16..8fb4f93fd4b3 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -12,30 +12,11 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "AMDGPUTargetMachine.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/CodeGen/Passes.h"
+#include "GCNSubtarget.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-
+#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
using namespace llvm;
@@ -108,10 +89,14 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
uint64_t ExplicitArgOffset = 0;
for (Argument &Arg : F.args()) {
- Type *ArgTy = Arg.getType();
- Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
- unsigned Size = DL.getTypeSizeInBits(ArgTy);
- unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
+ const bool IsByRef = Arg.hasByRefAttr();
+ Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
+ MaybeAlign ABITypeAlign = IsByRef ? Arg.getParamAlign() : None;
+ if (!ABITypeAlign)
+ ABITypeAlign = DL.getABITypeAlign(ArgTy);
+
+ uint64_t Size = DL.getTypeSizeInBits(ArgTy);
+ uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
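
These two alignTo lines are what lay arguments out in the kernarg segment: each argument starts at the running offset rounded up to its (byref or ABI) alignment, and the running offset then advances by the allocation size. A small standalone illustration with a hand-rolled alignTo and made-up argument types (not taken from the change):

    #include <cstdint>
    #include <cstdio>

    // Hypothetical stand-in for llvm::alignTo, for illustration only.
    static uint64_t alignTo(uint64_t Value, uint64_t Align) {
      return (Value + Align - 1) / Align * Align;
    }

    int main() {
      // e.g. kernel(i32 %a, <2 x double> %b), assuming the usual 4- and 16-byte
      // ABI alignments and a BaseOffset of 0.
      uint64_t ExplicitArgOffset = 0, BaseOffset = 0;

      uint64_t AOffset = alignTo(ExplicitArgOffset, 4) + BaseOffset;  // 0
      ExplicitArgOffset = alignTo(ExplicitArgOffset, 4) + 4;          // 4

      uint64_t BOffset = alignTo(ExplicitArgOffset, 16) + BaseOffset; // 16
      ExplicitArgOffset = alignTo(ExplicitArgOffset, 16) + 16;        // 32

      std::printf("%%a at kernarg offset %llu, %%b at %llu\n",
                  (unsigned long long)AOffset, (unsigned long long)BOffset);
    }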
@@ -119,6 +104,19 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
if (Arg.use_empty())
continue;
+ // If this is byref, the loads are already explicit in the function. We just
+ // need to rewrite the pointer values.
+ if (IsByRef) {
+ Value *ArgOffsetPtr = Builder.CreateConstInBoundsGEP1_64(
+ Builder.getInt8Ty(), KernArgSegment, EltOffset,
+ Arg.getName() + ".byval.kernarg.offset");
+
+ Value *CastOffsetPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ ArgOffsetPtr, Arg.getType());
+ Arg.replaceAllUsesWith(CastOffsetPtr);
+ continue;
+ }
+
if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
// FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
// modes on SI to know the high bits are 0 so pointer adds don't wrap. We
@@ -224,8 +222,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
Arg.getName() + ".load");
Arg.replaceAllUsesWith(NewVal);
} else if (IsV3) {
- Value *Shuf = Builder.CreateShuffleVector(Load, UndefValue::get(V4Ty),
- ArrayRef<int>{0, 1, 2},
+ Value *Shuf = Builder.CreateShuffleVector(Load, ArrayRef<int>{0, 1, 2},
Arg.getName() + ".load");
Arg.replaceAllUsesWith(Shuf);
} else {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index 00e12f808783..9ab6a5246ce5 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -13,13 +13,14 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
@@ -41,16 +42,11 @@ enum DispatchPackedOffsets {
};
class AMDGPULowerKernelAttributes : public ModulePass {
- Module *Mod = nullptr;
-
public:
static char ID;
AMDGPULowerKernelAttributes() : ModulePass(ID) {}
- bool processUse(CallInst *CI);
-
- bool doInitialization(Module &M) override;
bool runOnModule(Module &M) override;
StringRef getPassName() const override {
@@ -64,12 +60,7 @@ public:
} // end anonymous namespace
-bool AMDGPULowerKernelAttributes::doInitialization(Module &M) {
- Mod = &M;
- return false;
-}
-
-bool AMDGPULowerKernelAttributes::processUse(CallInst *CI) {
+static bool processUse(CallInst *CI) {
Function *F = CI->getParent()->getParent();
auto MD = F->getMetadata("reqd_work_group_size");
@@ -89,7 +80,7 @@ bool AMDGPULowerKernelAttributes::processUse(CallInst *CI) {
Value *GridSizeY = nullptr;
Value *GridSizeZ = nullptr;
- const DataLayout &DL = Mod->getDataLayout();
+ const DataLayout &DL = F->getParent()->getDataLayout();
// We expect to see several GEP users, casted to the appropriate type and
// loaded.
@@ -239,7 +230,7 @@ bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
StringRef DispatchPtrName
= Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr);
- Function *DispatchPtr = Mod->getFunction(DispatchPtrName);
+ Function *DispatchPtr = M.getFunction(DispatchPtrName);
if (!DispatchPtr) // Dispatch ptr not used.
return false;
@@ -267,3 +258,22 @@ char AMDGPULowerKernelAttributes::ID = 0;
ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
return new AMDGPULowerKernelAttributes();
}
+
+PreservedAnalyses
+AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
+ StringRef DispatchPtrName =
+ Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr);
+
+ Function *DispatchPtr = F.getParent()->getFunction(DispatchPtrName);
+ if (!DispatchPtr) // Dispatch ptr not used.
+ return PreservedAnalyses::all();
+
+ for (Instruction &I : instructions(F)) {
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ if (CI->getCalledFunction() == DispatchPtr)
+ processUse(CI);
+ }
+ }
+
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 99d229c9b74e..a8cba3f5cc5c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -13,12 +13,10 @@
//
#include "AMDGPUAsmPrinter.h"
-#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "R600AsmPrinter.h"
-#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/IR/Constants.h"
@@ -323,7 +321,10 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
// The isPseudo check really shouldn't be here, but unfortunately there are
// some negative lit tests that depend on being able to continue through
// here even when pseudo instructions haven't been lowered.
- if (!MI->isPseudo() && STI.isCPUStringValid(STI.getCPU())) {
+ //
+ // We also overestimate branch sizes with the offset bug.
+ if (!MI->isPseudo() && STI.isCPUStringValid(STI.getCPU()) &&
+ (!STI.hasOffset3fBug() || !MI->isBranch())) {
SmallVector<MCFixup, 4> Fixups;
SmallVector<char, 16> CodeBytes;
raw_svector_ostream CodeStream(CodeBytes);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
new file mode 100644
index 000000000000..c3441f81a78e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp
@@ -0,0 +1,38 @@
+//===- AMDGPUMIRFormatter.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Implementation of AMDGPU overrides of MIRFormatter.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMIRFormatter.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+
+using namespace llvm;
+
+bool AMDGPUMIRFormatter::parseCustomPseudoSourceValue(
+ StringRef Src, MachineFunction &MF, PerFunctionMIParsingState &PFS,
+ const PseudoSourceValue *&PSV, ErrorCallbackType ErrorCallback) const {
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const SIInstrInfo &TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo();
+ if (Src == "BufferResource") {
+ PSV = MFI->getBufferPSV(TII);
+ return false;
+ }
+ if (Src == "ImageResource") {
+ PSV = MFI->getImagePSV(TII);
+ return false;
+ }
+ if (Src == "GWSResource") {
+ PSV = MFI->getGWSPSV(TII);
+ return false;
+ }
+ llvm_unreachable("unknown MIR custom pseudo source value");
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
new file mode 100644
index 000000000000..a61f1f7b8182
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
@@ -0,0 +1,47 @@
+//===-- llvm/Target/AMDGPU/AMDGPUMIRFormatter.h -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// AMDGPU specific overrides of MIRFormatter.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPUMIRFORMATTER_H
+#define LLVM_LIB_TARGET_AMDGPUMIRFORMATTER_H
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/CodeGen/MIRFormatter.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
+
+namespace llvm {
+
+class MachineFunction;
+class MachineInstr;
+struct PerFunctionMIParsingState;
+struct SlotMapping;
+
+class AMDGPUMIRFormatter final : public MIRFormatter {
+public:
+ AMDGPUMIRFormatter() {}
+ virtual ~AMDGPUMIRFormatter() = default;
+
+ /// Implement target specific parsing of target custom pseudo source value.
+ virtual bool
+ parseCustomPseudoSourceValue(StringRef Src, MachineFunction &MF,
+ PerFunctionMIParsingState &PFS,
+ const PseudoSourceValue *&PSV,
+ ErrorCallbackType ErrorCallback) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
index f61af5a27943..b6a69b2819ee 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -11,36 +11,17 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
+#include "GCNSubtarget.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegionInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/Config/llvm-config.h"
-#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-#include <tuple>
-#include <utility>
using namespace llvm;
@@ -342,11 +323,11 @@ protected:
LinearizedRegion *Parent;
RegionMRT *RMRT;
- void storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg,
+ void storeLiveOutReg(MachineBasicBlock *MBB, Register Reg,
MachineInstr *DefInstr, const MachineRegisterInfo *MRI,
const TargetRegisterInfo *TRI, PHILinearize &PHIInfo);
- void storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg,
+ void storeLiveOutRegRegion(RegionMRT *Region, Register Reg,
MachineInstr *DefInstr,
const MachineRegisterInfo *MRI,
const TargetRegisterInfo *TRI,
@@ -397,7 +378,7 @@ public:
void replaceLiveOut(unsigned OldReg, unsigned NewReg);
- void replaceRegister(unsigned Register, unsigned NewRegister,
+ void replaceRegister(unsigned Register, class Register NewRegister,
MachineRegisterInfo *MRI, bool ReplaceInside,
bool ReplaceOutside, bool IncludeLoopPHIs);
@@ -690,12 +671,12 @@ RegionMRT *MRT::buildMRT(MachineFunction &MF,
return Result;
}
-void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg,
+void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, Register Reg,
MachineInstr *DefInstr,
const MachineRegisterInfo *MRI,
const TargetRegisterInfo *TRI,
PHILinearize &PHIInfo) {
- if (Register::isVirtualRegister(Reg)) {
+ if (Reg.isVirtual()) {
LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI)
<< "\n");
// If this is a source register to a PHI we are chaining, it
@@ -730,12 +711,12 @@ void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg,
}
}
-void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg,
+void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, Register Reg,
MachineInstr *DefInstr,
const MachineRegisterInfo *MRI,
const TargetRegisterInfo *TRI,
PHILinearize &PHIInfo) {
- if (Register::isVirtualRegister(Reg)) {
+ if (Reg.isVirtual()) {
LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI)
<< "\n");
for (auto &UI : MRI->use_operands(Reg)) {
@@ -907,7 +888,8 @@ void LinearizedRegion::replaceLiveOut(unsigned OldReg, unsigned NewReg) {
}
}
-void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister,
+void LinearizedRegion::replaceRegister(unsigned Register,
+ class Register NewRegister,
MachineRegisterInfo *MRI,
bool ReplaceInside, bool ReplaceOutside,
bool IncludeLoopPHI) {
@@ -950,7 +932,7 @@ void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister,
(IncludeLoopPHI && IsLoopPHI);
if (ShouldReplace) {
- if (Register::isPhysicalRegister(NewRegister)) {
+ if (NewRegister.isPhysical()) {
LLVM_DEBUG(dbgs() << "Trying to substitute physical register: "
<< printReg(NewRegister, MRI->getTargetRegisterInfo())
<< "\n");
@@ -1002,11 +984,11 @@ void LinearizedRegion::addMBBs(LinearizedRegion *InnerRegion) {
}
bool LinearizedRegion::contains(MachineBasicBlock *MBB) {
- return MBBs.count(MBB) == 1;
+ return MBBs.contains(MBB);
}
bool LinearizedRegion::isLiveOut(unsigned Reg) {
- return LiveOuts.count(Reg) == 1;
+ return LiveOuts.contains(Reg);
}
bool LinearizedRegion::hasNoDef(unsigned Reg, MachineRegisterInfo *MRI) {
@@ -1025,7 +1007,7 @@ void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) {
for (auto &RI : II.uses()) {
if (RI.isReg()) {
Register Reg = RI.getReg();
- if (Register::isVirtualRegister(Reg)) {
+ if (Reg.isVirtual()) {
if (hasNoDef(Reg, MRI))
continue;
if (!MRI->hasOneDef(Reg)) {
@@ -1168,7 +1150,7 @@ private:
void createEntryPHIs(LinearizedRegion *CurrentRegion);
void resolvePHIInfos(MachineBasicBlock *FunctionEntry);
- void replaceRegisterWith(unsigned Register, unsigned NewRegister);
+ void replaceRegisterWith(unsigned Register, class Register NewRegister);
MachineBasicBlock *createIfRegion(MachineBasicBlock *MergeBB,
MachineBasicBlock *CodeBB,
@@ -1872,7 +1854,7 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfBlock(
? SinglePred->findDebugLoc(SinglePred->getFirstTerminator())
: DebugLoc();
- unsigned Reg =
+ Register Reg =
TII->insertEQ(IfBB, IfBB->begin(), DL, IfReg,
SelectBB->getNumber() /* CodeBBStart->getNumber() */);
if (&(*(IfBB->getParent()->begin())) == IfBB) {
@@ -2224,8 +2206,8 @@ void AMDGPUMachineCFGStructurizer::createEntryPHIs(LinearizedRegion *CurrentRegi
PHIInfo.clear();
}
-void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register,
- unsigned NewRegister) {
+void AMDGPUMachineCFGStructurizer::replaceRegisterWith(
+ unsigned Register, class Register NewRegister) {
assert(Register != NewRegister && "Cannot replace a reg with itself");
for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Register),
@@ -2233,7 +2215,7 @@ void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register,
I != E;) {
MachineOperand &O = *I;
++I;
- if (Register::isPhysicalRegister(NewRegister)) {
+ if (NewRegister.isPhysical()) {
LLVM_DEBUG(dbgs() << "Trying to substitute physical register: "
<< printReg(NewRegister, MRI->getTargetRegisterInfo())
<< "\n");
@@ -2334,7 +2316,7 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
TII->removeBranch(*RegionExit);
// We need to create a backedge if there is a loop
- unsigned Reg = TII->insertNE(
+ Register Reg = TII->insertNE(
RegionExit, RegionExit->instr_end(), DL,
CurrentRegion->getRegionMRT()->getInnerOutputRegister(),
CurrentRegion->getRegionMRT()->getEntry()->getNumber());
@@ -2393,7 +2375,7 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
TII->removeBranch(*RegionExit);
// We need to create a backedge if there is a loop
- unsigned Reg =
+ Register Reg =
TII->insertNE(RegionExit, RegionExit->instr_end(), DL,
CurrentRegion->getRegionMRT()->getInnerOutputRegister(),
CurrentRegion->getRegionMRT()->getEntry()->getNumber());
@@ -2592,7 +2574,7 @@ static void removeOldExitPreds(RegionMRT *Region) {
static bool mbbHasBackEdge(MachineBasicBlock *MBB,
SmallPtrSet<MachineBasicBlock *, 8> &MBBs) {
for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) {
- if (MBBs.count(*SI) != 0) {
+ if (MBBs.contains(*SI)) {
return true;
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 64acd6efe028..717145b7af53 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -7,17 +7,20 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUMachineFunction.h"
-#include "AMDGPUSubtarget.h"
#include "AMDGPUPerfHintAnalysis.h"
+#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
-AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
- MachineFunctionInfo(),
- Mode(MF.getFunction()),
- IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())),
- NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) {
+AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF)
+ : MachineFunctionInfo(), Mode(MF.getFunction()),
+ IsEntryFunction(
+ AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())),
+ IsModuleEntryFunction(
+ AMDGPU::isModuleEntryFunctionCC(MF.getFunction().getCallingConv())),
+ NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) {
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
// FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
@@ -49,10 +52,27 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
/// TODO: We should sort these to minimize wasted space due to alignment
/// padding. Currently the padding is decided by the first encountered use
/// during lowering.
- unsigned Offset = LDSSize = alignTo(LDSSize, Alignment);
+ unsigned Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment);
Entry.first->second = Offset;
- LDSSize += DL.getTypeAllocSize(GV.getValueType());
+ StaticLDSSize += DL.getTypeAllocSize(GV.getValueType());
+
+ // Update the LDS size considering the padding to align the dynamic shared
+ // memory.
+ LDSSize = alignTo(StaticLDSSize, DynLDSAlign);
return Offset;
}
+
+void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL,
+ const GlobalVariable &GV) {
+ assert(DL.getTypeAllocSize(GV.getValueType()).isZero());
+
+ Align Alignment =
+ DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
+ if (Alignment <= DynLDSAlign)
+ return;
+
+ LDSSize = alignTo(StaticLDSSize, Alignment);
+ DynLDSAlign = Alignment;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index c504dd76bc65..07cac776082d 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -9,9 +9,9 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "Utils/AMDGPUBaseInfo.h"
namespace llvm {
@@ -29,13 +29,27 @@ protected:
/// Number of bytes in the LDS that are being used.
unsigned LDSSize = 0;
+ /// Number of bytes in the LDS allocated statically. This field is only used
+ /// in the instruction selector and not part of the machine function info.
+ unsigned StaticLDSSize = 0;
+
+ /// Align for dynamic shared memory if any. Dynamic shared memory is
+ /// allocated directly after the static one, i.e., LDSSize. Need to pad
+ /// LDSSize to ensure that dynamic one is aligned accordingly.
+ /// The maximal alignment is updated during IR translation or lowering
+ /// stages.
+ Align DynLDSAlign;
+
// State of MODE register, assumed FP mode.
AMDGPU::SIModeRegisterDefaults Mode;
- // Kernels + shaders. i.e. functions called by the driver and not called
+ // Kernels + shaders. i.e. functions called by the hardware and not called
// by other functions.
bool IsEntryFunction = false;
+ // Entry points called by other functions instead of directly by the hardware.
+ bool IsModuleEntryFunction = false;
+
bool NoSignedZerosFPMath = false;
// Function may be memory bound.
@@ -65,6 +79,8 @@ public:
return IsEntryFunction;
}
+ bool isModuleEntryFunction() const { return IsModuleEntryFunction; }
+
bool hasNoSignedZerosFPMath() const {
return NoSignedZerosFPMath;
}
@@ -78,6 +94,10 @@ public:
}
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
+
+ Align getDynLDSAlign() const { return DynLDSAlign; }
+
+ void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV);
};
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
index 4d9f08b3af01..6646cce8186b 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
@@ -13,7 +13,6 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUMachineModuleInfo.h"
-#include "llvm/IR/Module.h"
namespace llvm {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
index 2b0b8b42acfe..1b513c456307 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
@@ -15,11 +15,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEMODULEINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEMODULEINFO_H
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
-#include "llvm/IR/LLVMContext.h"
namespace llvm {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
index b05855d1afc6..c15c94ee17f8 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
@@ -12,10 +12,8 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUMacroFusion.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-
+#include "SIInstrInfo.h"
#include "llvm/CodeGen/MacroFusion.h"
using namespace llvm;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h
index da4b3cf8bc24..82c6d75bb060 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h
@@ -6,7 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include <memory>
namespace llvm {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
index 4f9ffa11bc73..d27eb68ca74b 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
@@ -34,16 +34,11 @@
#include "AMDGPU.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
-#include "llvm/IR/User.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "amdgpu-lower-enqueued-block"
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
index 8b69f51c1a0d..756bc948b1dd 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
@@ -16,6 +16,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPTNOTE_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPTNOTE_H
+namespace llvm {
namespace AMDGPU {
namespace ElfNote {
@@ -41,7 +42,7 @@ enum NoteType{
NT_AMDGPU_HSA_HLDEBUG_TARGET = 102
};
-}
-}
-
+} // End namespace ElfNote
+} // End namespace AMDGPU
+} // End namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUNOTETYPE_H
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
index 93079738ef99..2f6220e425cc 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -22,11 +22,7 @@
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
-#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/ValueMap.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
index 9599e09fbd96..99dbf5080741 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
@@ -17,7 +17,6 @@
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/IR/ValueMap.h"
-#include "llvm/Pass.h"
namespace llvm {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 098b0e993886..09e2c762abdb 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -11,35 +11,65 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/Support/Debug.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
using namespace llvm;
using namespace MIPatternMatch;
-struct FMinFMaxLegacyInfo {
- Register LHS;
- Register RHS;
- Register True;
- Register False;
- CmpInst::Predicate Pred;
+class AMDGPUPostLegalizerCombinerHelper {
+protected:
+ MachineIRBuilder &B;
+ MachineFunction &MF;
+ MachineRegisterInfo &MRI;
+ CombinerHelper &Helper;
+
+public:
+ AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
+ : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
+
+ struct FMinFMaxLegacyInfo {
+ Register LHS;
+ Register RHS;
+ Register True;
+ Register False;
+ CmpInst::Predicate Pred;
+ };
+
+ // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
+ bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
+ void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
+ const FMinFMaxLegacyInfo &Info);
+
+ bool matchUCharToFloat(MachineInstr &MI);
+ void applyUCharToFloat(MachineInstr &MI);
+
+ // FIXME: Should be able to have 2 separate matchdatas rather than custom
+ // struct boilerplate.
+ struct CvtF32UByteMatchInfo {
+ Register CvtVal;
+ unsigned ShiftOffset;
+ };
+
+ bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
+ void applyCvtF32UByteN(MachineInstr &MI,
+ const CvtF32UByteMatchInfo &MatchInfo);
};
-// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
-static bool matchFMinFMaxLegacy(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineFunction &MF, FMinFMaxLegacyInfo &Info) {
+bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
+ MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
// FIXME: Combines should have subtarget predicates, and we shouldn't need
// this here.
if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
@@ -77,12 +107,11 @@ static bool matchFMinFMaxLegacy(MachineInstr &MI, MachineRegisterInfo &MRI,
}
}
-static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
- const FMinFMaxLegacyInfo &Info) {
-
- auto buildNewInst = [&MI](unsigned Opc, Register X, Register Y) {
- MachineIRBuilder MIB(MI);
- MIB.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
+void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
+ MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
+ B.setInstrAndDebugLoc(MI);
+ auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
+ B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
};
switch (Info.Pred) {
@@ -127,8 +156,7 @@ static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
MI.eraseFromParent();
}
-static bool matchUCharToFloat(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineFunction &MF, CombinerHelper &Helper) {
+bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
Register DstReg = MI.getOperand(0).getReg();
// TODO: We could try to match extracting the higher bytes, which would be
@@ -147,15 +175,15 @@ static bool matchUCharToFloat(MachineInstr &MI, MachineRegisterInfo &MRI,
return false;
}
-static void applyUCharToFloat(MachineInstr &MI) {
- MachineIRBuilder B(MI);
+void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
+ B.setInstrAndDebugLoc(MI);
const LLT S32 = LLT::scalar(32);
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
- LLT Ty = B.getMRI()->getType(DstReg);
- LLT SrcTy = B.getMRI()->getType(SrcReg);
+ LLT Ty = MRI.getType(DstReg);
+ LLT SrcTy = MRI.getType(SrcReg);
if (SrcTy != S32)
SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);
@@ -171,16 +199,8 @@ static void applyUCharToFloat(MachineInstr &MI) {
MI.eraseFromParent();
}
-// FIXME: Should be able to have 2 separate matchdatas rather than custom struct
-// boilerplate.
-struct CvtF32UByteMatchInfo {
- Register CvtVal;
- unsigned ShiftOffset;
-};
-
-static bool matchCvtF32UByteN(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineFunction &MF,
- CvtF32UByteMatchInfo &MatchInfo) {
+bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
+ MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
Register SrcReg = MI.getOperand(1).getReg();
// Look through G_ZEXT.
@@ -207,14 +227,14 @@ static bool matchCvtF32UByteN(MachineInstr &MI, MachineRegisterInfo &MRI,
return false;
}
-static void applyCvtF32UByteN(MachineInstr &MI,
- const CvtF32UByteMatchInfo &MatchInfo) {
- MachineIRBuilder B(MI);
+void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
+ MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
+ B.setInstrAndDebugLoc(MI);
unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;
const LLT S32 = LLT::scalar(32);
Register CvtSrc = MatchInfo.CvtVal;
- LLT SrcTy = B.getMRI()->getType(MatchInfo.CvtVal);
+ LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
if (SrcTy != S32) {
assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
@@ -225,6 +245,18 @@ static void applyCvtF32UByteN(MachineInstr &MI,
MI.eraseFromParent();
}
+class AMDGPUPostLegalizerCombinerHelperState {
+protected:
+ CombinerHelper &Helper;
+ AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;
+
+public:
+ AMDGPUPostLegalizerCombinerHelperState(
+ CombinerHelper &Helper,
+ AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
+ : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
+};
+
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
@@ -234,7 +266,7 @@ namespace {
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
-class AMDGPUPostLegalizerCombinerInfo : public CombinerInfo {
+class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
GISelKnownBits *KB;
MachineDominatorTree *MDT;
@@ -258,10 +290,12 @@ public:
bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
MachineInstr &MI,
MachineIRBuilder &B) const {
- CombinerHelper Helper(Observer, B, KB, MDT);
- AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg);
+ CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
+ AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
+ AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
+ PostLegalizerHelper);
- if (Generated.tryCombineAll(Observer, MI, B, Helper))
+ if (Generated.tryCombineAll(Observer, MI, B))
return true;
switch (MI.getOpcode()) {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index 800ad2039f0e..e4b628bf6b23 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -11,17 +11,15 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPU.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/Support/Debug.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "amdgpu-prelegalizer-combiner"
@@ -37,7 +35,7 @@ namespace {
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
-class AMDGPUPreLegalizerCombinerInfo : public CombinerInfo {
+class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo {
GISelKnownBits *KB;
MachineDominatorTree *MDT;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index 524a34be876f..c8bd9b96b44f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -19,33 +19,21 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
-#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
using namespace llvm;
#define DEBUG_TYPE "printfToRuntime"
#define DWORD_ALIGN 4
namespace {
-class LLVM_LIBRARY_VISIBILITY AMDGPUPrintfRuntimeBinding final
- : public ModulePass {
+class AMDGPUPrintfRuntimeBinding final : public ModulePass {
public:
static char ID;
@@ -54,25 +42,36 @@ public:
private:
bool runOnModule(Module &M) override;
- void getConversionSpecifiers(SmallVectorImpl<char> &OpConvSpecifiers,
- StringRef fmt, size_t num_ops) const;
-
- bool shouldPrintAsStr(char Specifier, Type *OpType) const;
- bool
- lowerPrintfForGpu(Module &M,
- function_ref<const TargetLibraryInfo &(Function &)> GetTLI);
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
}
+};
- Value *simplify(Instruction *I, const TargetLibraryInfo *TLI) {
+class AMDGPUPrintfRuntimeBindingImpl {
+public:
+ AMDGPUPrintfRuntimeBindingImpl(
+ function_ref<const DominatorTree &(Function &)> GetDT,
+ function_ref<const TargetLibraryInfo &(Function &)> GetTLI)
+ : GetDT(GetDT), GetTLI(GetTLI) {}
+ bool run(Module &M);
+
+private:
+ void getConversionSpecifiers(SmallVectorImpl<char> &OpConvSpecifiers,
+ StringRef fmt, size_t num_ops) const;
+
+ bool shouldPrintAsStr(char Specifier, Type *OpType) const;
+ bool lowerPrintfForGpu(Module &M);
+
+ Value *simplify(Instruction *I, const TargetLibraryInfo *TLI,
+ const DominatorTree *DT) {
return SimplifyInstruction(I, {*TD, TLI, DT});
}
const DataLayout *TD;
- const DominatorTree *DT;
+ function_ref<const DominatorTree &(Function &)> GetDT;
+ function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
SmallVector<CallInst *, 32> Printfs;
};
} // namespace
@@ -95,12 +94,11 @@ ModulePass *createAMDGPUPrintfRuntimeBinding() {
}
} // namespace llvm
-AMDGPUPrintfRuntimeBinding::AMDGPUPrintfRuntimeBinding()
- : ModulePass(ID), TD(nullptr), DT(nullptr) {
+AMDGPUPrintfRuntimeBinding::AMDGPUPrintfRuntimeBinding() : ModulePass(ID) {
initializeAMDGPUPrintfRuntimeBindingPass(*PassRegistry::getPassRegistry());
}
-void AMDGPUPrintfRuntimeBinding::getConversionSpecifiers(
+void AMDGPUPrintfRuntimeBindingImpl::getConversionSpecifiers(
SmallVectorImpl<char> &OpConvSpecifiers, StringRef Fmt,
size_t NumOps) const {
// not all format characters are collected.
@@ -132,8 +130,8 @@ void AMDGPUPrintfRuntimeBinding::getConversionSpecifiers(
}
}
-bool AMDGPUPrintfRuntimeBinding::shouldPrintAsStr(char Specifier,
- Type *OpType) const {
+bool AMDGPUPrintfRuntimeBindingImpl::shouldPrintAsStr(char Specifier,
+ Type *OpType) const {
if (Specifier != 's')
return false;
const PointerType *PT = dyn_cast<PointerType>(OpType);
@@ -146,8 +144,7 @@ bool AMDGPUPrintfRuntimeBinding::shouldPrintAsStr(char Specifier,
return ElemIType->getBitWidth() == 8;
}
-bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
- Module &M, function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
+bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
LLVMContext &Ctx = M.getContext();
IRBuilder<> Builder(Ctx);
Type *I32Ty = Type::getInt32Ty(Ctx);
@@ -172,7 +169,8 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
}
if (auto I = dyn_cast<Instruction>(Op)) {
- Value *Op_simplified = simplify(I, &GetTLI(*I->getFunction()));
+ Value *Op_simplified =
+ simplify(I, &GetTLI(*I->getFunction()), &GetDT(*I->getFunction()));
if (Op_simplified)
Op = Op_simplified;
}
@@ -184,8 +182,8 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
StringRef Str("unknown");
if (GVar && GVar->hasInitializer()) {
- auto Init = GVar->getInitializer();
- if (auto CA = dyn_cast<ConstantDataArray>(Init)) {
+ auto *Init = GVar->getInitializer();
+ if (auto *CA = dyn_cast<ConstantDataArray>(Init)) {
if (CA->isString())
Str = CA->getAsCString();
} else if (isa<ConstantAggregateZero>(Init)) {
@@ -248,16 +246,15 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
}
}
if (shouldPrintAsStr(OpConvSpecifiers[ArgCount - 1], ArgType)) {
- if (ConstantExpr *ConstExpr = dyn_cast<ConstantExpr>(Arg)) {
- GlobalVariable *GV =
- dyn_cast<GlobalVariable>(ConstExpr->getOperand(0));
+ if (auto *ConstExpr = dyn_cast<ConstantExpr>(Arg)) {
+ auto *GV = dyn_cast<GlobalVariable>(ConstExpr->getOperand(0));
if (GV && GV->hasInitializer()) {
Constant *Init = GV->getInitializer();
- ConstantDataArray *CA = dyn_cast<ConstantDataArray>(Init);
- if (Init->isZeroValue() || CA->isString()) {
- size_t SizeStr = Init->isZeroValue()
- ? 1
- : (strlen(CA->getAsCString().data()) + 1);
+ bool IsZeroValue = Init->isZeroValue();
+ auto *CA = dyn_cast<ConstantDataArray>(Init);
+ if (IsZeroValue || (CA && CA->isString())) {
+ size_t SizeStr =
+ IsZeroValue ? 1 : (strlen(CA->getAsCString().data()) + 1);
size_t Rem = SizeStr % DWORD_ALIGN;
size_t NSizeStr = 0;
LLVM_DEBUG(dbgs() << "Printf string original size = " << SizeStr
@@ -379,9 +376,8 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
ConstantInt::get(Ctx, APInt(32, StringRef("0"), 10));
ZeroIdxList.push_back(zeroInt);
- GetElementPtrInst *BufferIdx =
- dyn_cast<GetElementPtrInst>(GetElementPtrInst::Create(
- nullptr, pcall, ZeroIdxList, "PrintBuffID", Brnch));
+ GetElementPtrInst *BufferIdx = GetElementPtrInst::Create(
+ nullptr, pcall, ZeroIdxList, "PrintBuffID", Brnch);
Type *idPointer = PointerType::get(I32Ty, AMDGPUAS::GLOBAL_ADDRESS);
Value *id_gep_cast =
@@ -395,8 +391,8 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
FourthIdxList.push_back(fourInt); // 1st 4 bytes hold the printf_id
// the following GEP is the buffer pointer
- BufferIdx = cast<GetElementPtrInst>(GetElementPtrInst::Create(
- nullptr, pcall, FourthIdxList, "PrintBuffGep", Brnch));
+ BufferIdx = GetElementPtrInst::Create(nullptr, pcall, FourthIdxList,
+ "PrintBuffGep", Brnch);
Type *Int32Ty = Type::getInt32Ty(Ctx);
Type *Int64Ty = Type::getInt64Ty(Ctx);
@@ -409,17 +405,15 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
if (ArgType->isFPOrFPVectorTy() && !isa<VectorType>(ArgType)) {
Type *IType = (ArgType->isFloatTy()) ? Int32Ty : Int64Ty;
if (OpConvSpecifiers[ArgCount - 1] == 'f') {
- ConstantFP *fpCons = dyn_cast<ConstantFP>(Arg);
- if (fpCons) {
- APFloat Val(fpCons->getValueAPF());
+ if (auto *FpCons = dyn_cast<ConstantFP>(Arg)) {
+ APFloat Val(FpCons->getValueAPF());
bool Lost = false;
Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
&Lost);
Arg = ConstantFP::get(Ctx, Val);
IType = Int32Ty;
- } else {
- FPExtInst *FpExt = dyn_cast<FPExtInst>(Arg);
- if (FpExt && FpExt->getType()->isDoubleTy() &&
+ } else if (auto *FpExt = dyn_cast<FPExtInst>(Arg)) {
+ if (FpExt->getType()->isDoubleTy() &&
FpExt->getOperand(0)->getType()->isFloatTy()) {
Arg = FpExt->getOperand(0);
IType = Int32Ty;
@@ -431,14 +425,14 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
} else if (ArgType->getTypeID() == Type::PointerTyID) {
if (shouldPrintAsStr(OpConvSpecifiers[ArgCount - 1], ArgType)) {
const char *S = NonLiteralStr;
- if (ConstantExpr *ConstExpr = dyn_cast<ConstantExpr>(Arg)) {
- GlobalVariable *GV =
- dyn_cast<GlobalVariable>(ConstExpr->getOperand(0));
+ if (auto *ConstExpr = dyn_cast<ConstantExpr>(Arg)) {
+ auto *GV = dyn_cast<GlobalVariable>(ConstExpr->getOperand(0));
if (GV && GV->hasInitializer()) {
Constant *Init = GV->getInitializer();
- ConstantDataArray *CA = dyn_cast<ConstantDataArray>(Init);
- if (Init->isZeroValue() || CA->isString()) {
- S = Init->isZeroValue() ? "" : CA->getAsCString().data();
+ bool IsZeroValue = Init->isZeroValue();
+ auto *CA = dyn_cast<ConstantDataArray>(Init);
+ if (IsZeroValue || (CA && CA->isString())) {
+ S = IsZeroValue ? "" : CA->getAsCString().data();
}
}
}
@@ -491,27 +485,27 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
switch (EleSize) {
default:
EleCount = TotalSize / 64;
- IType = dyn_cast<Type>(Type::getInt64Ty(ArgType->getContext()));
+ IType = Type::getInt64Ty(ArgType->getContext());
break;
case 8:
if (EleCount >= 8) {
EleCount = TotalSize / 64;
- IType = dyn_cast<Type>(Type::getInt64Ty(ArgType->getContext()));
+ IType = Type::getInt64Ty(ArgType->getContext());
} else if (EleCount >= 3) {
EleCount = 1;
- IType = dyn_cast<Type>(Type::getInt32Ty(ArgType->getContext()));
+ IType = Type::getInt32Ty(ArgType->getContext());
} else {
EleCount = 1;
- IType = dyn_cast<Type>(Type::getInt16Ty(ArgType->getContext()));
+ IType = Type::getInt16Ty(ArgType->getContext());
}
break;
case 16:
if (EleCount >= 3) {
EleCount = TotalSize / 64;
- IType = dyn_cast<Type>(Type::getInt64Ty(ArgType->getContext()));
+ IType = Type::getInt64Ty(ArgType->getContext());
} else {
EleCount = 1;
- IType = dyn_cast<Type>(Type::getInt32Ty(ArgType->getContext()));
+ IType = Type::getInt32Ty(ArgType->getContext());
}
break;
}
@@ -539,8 +533,8 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
(void)StBuff;
if (I + 1 == E && ArgCount + 1 == CI->getNumArgOperands())
break;
- BufferIdx = dyn_cast<GetElementPtrInst>(GetElementPtrInst::Create(
- nullptr, BufferIdx, BuffOffset, "PrintBuffNextPtr", Brnch));
+ BufferIdx = GetElementPtrInst::Create(nullptr, BufferIdx, BuffOffset,
+ "PrintBuffNextPtr", Brnch);
LLVM_DEBUG(dbgs() << "inserting gep to the printf buffer:\n"
<< *BufferIdx << '\n');
}
@@ -556,7 +550,7 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
return true;
}
-bool AMDGPUPrintfRuntimeBinding::runOnModule(Module &M) {
+bool AMDGPUPrintfRuntimeBindingImpl::run(Module &M) {
Triple TT(M.getTargetTriple());
if (TT.getArch() == Triple::r600)
return false;
@@ -585,11 +579,31 @@ bool AMDGPUPrintfRuntimeBinding::runOnModule(Module &M) {
}
TD = &M.getDataLayout();
- auto DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- DT = DTWP ? &DTWP->getDomTree() : nullptr;
+
+ return lowerPrintfForGpu(M);
+}
+
+bool AMDGPUPrintfRuntimeBinding::runOnModule(Module &M) {
+ auto GetDT = [this](Function &F) -> DominatorTree & {
+ return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+ };
auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
};
- return lowerPrintfForGpu(M, GetTLI);
+ return AMDGPUPrintfRuntimeBindingImpl(GetDT, GetTLI).run(M);
+}
+
+PreservedAnalyses
+AMDGPUPrintfRuntimeBindingPass::run(Module &M, ModuleAnalysisManager &AM) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto GetDT = [&FAM](Function &F) -> DominatorTree & {
+ return FAM.getResult<DominatorTreeAnalysis>(F);
+ };
+ auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
+ bool Changed = AMDGPUPrintfRuntimeBindingImpl(GetDT, GetTLI).run(M);
+ return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 727f71b35049..2a6ea838efc0 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -12,53 +12,15 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/ADT/Twine.h"
+#include "GCNSubtarget.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <map>
-#include <tuple>
-#include <utility>
-#include <vector>
#define DEBUG_TYPE "amdgpu-promote-alloca"
@@ -83,8 +45,26 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
+public:
+ static char ID;
+
+ AMDGPUPromoteAlloca() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
+
+ bool handleAlloca(AllocaInst &I, bool SufficientLDS);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+class AMDGPUPromoteAllocaImpl {
private:
- const TargetMachine *TM;
+ const TargetMachine &TM;
Module *Mod = nullptr;
const DataLayout *DL = nullptr;
@@ -116,28 +96,14 @@ private:
/// Check whether we have enough local memory for promotion.
bool hasSufficientLocalMem(const Function &F);
-public:
- static char ID;
-
- AMDGPUPromoteAlloca() : FunctionPass(ID) {}
-
- bool doInitialization(Module &M) override;
- bool runOnFunction(Function &F) override;
-
- StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
-
bool handleAlloca(AllocaInst &I, bool SufficientLDS);
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- FunctionPass::getAnalysisUsage(AU);
- }
+public:
+ AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {}
+ bool run(Function &F);
};
class AMDGPUPromoteAllocaToVector : public FunctionPass {
-private:
- unsigned MaxVGPRs;
-
public:
static char ID;
@@ -149,8 +115,6 @@ public:
return "AMDGPU Promote Alloca to vector";
}
- bool handleAlloca(AllocaInst &I);
-
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
FunctionPass::getAnalysisUsage(AU);
@@ -171,32 +135,41 @@ INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
-bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
- Mod = &M;
- DL = &Mod->getDataLayout();
+bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+ if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
+ return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>()).run(F);
+ }
return false;
}
-bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
+PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F);
+ if (Changed) {
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+ }
+ return PreservedAnalyses::all();
+}
- if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
- TM = &TPC->getTM<TargetMachine>();
- else
- return false;
+bool AMDGPUPromoteAllocaImpl::run(Function &F) {
+ Mod = F.getParent();
+ DL = &Mod->getDataLayout();
- const Triple &TT = TM->getTargetTriple();
+ const Triple &TT = TM.getTargetTriple();
IsAMDGCN = TT.getArch() == Triple::amdgcn;
IsAMDHSA = TT.getOS() == Triple::AMDHSA;
- const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
if (!ST.isPromoteAllocaEnabled())
return false;
if (IsAMDGCN) {
- const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
} else {
MaxVGPRs = 128;
@@ -221,9 +194,9 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
}
std::pair<Value *, Value *>
-AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
+AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
const Function &F = *Builder.GetInsertBlock()->getParent();
- const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
if (!IsAMDHSA) {
Function *LocalSizeYFn
@@ -308,9 +281,10 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
return std::make_pair(Y, LoadZU);
}
-Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
+Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
+ unsigned N) {
const AMDGPUSubtarget &ST =
- AMDGPUSubtarget::get(*TM, *Builder.GetInsertBlock()->getParent());
+ AMDGPUSubtarget::get(TM, *Builder.GetInsertBlock()->getParent());
Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
switch (N) {
@@ -592,11 +566,9 @@ static bool isCallPromotable(CallInst *CI) {
}
}
-bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
- Value *Val,
- Instruction *Inst,
- int OpIdx0,
- int OpIdx1) const {
+bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
+ Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
+ int OpIdx1) const {
// Figure out which operand is the one we might not be promoting.
Value *OtherOp = Inst->getOperand(OpIdx0);
if (Val == OtherOp)
@@ -605,7 +577,7 @@ bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
if (isa<ConstantPointerNull>(OtherOp))
return true;
- Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
+ Value *OtherObj = getUnderlyingObject(OtherOp);
if (!isa<AllocaInst>(OtherObj))
return false;
@@ -624,10 +596,8 @@ bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
return true;
}
-bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
- Value *BaseAlloca,
- Value *Val,
- std::vector<Value*> &WorkList) const {
+bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
+ Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const {
for (User *User : Val->users()) {
if (is_contained(WorkList, User))
@@ -727,10 +697,10 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
return true;
}
-bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
+bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
FunctionType *FTy = F.getFunctionType();
- const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
// If the function has any arguments in the local address space, then it's
// possible these arguments require the entire local memory space, so
@@ -749,35 +719,79 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
if (LocalMemLimit == 0)
return false;
- const DataLayout &DL = Mod->getDataLayout();
+ SmallVector<const Constant *, 16> Stack;
+ SmallPtrSet<const Constant *, 8> VisitedConstants;
+ SmallPtrSet<const GlobalVariable *, 8> UsedLDS;
+
+ auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
+ for (const User *U : Val->users()) {
+ if (const Instruction *Use = dyn_cast<Instruction>(U)) {
+ if (Use->getParent()->getParent() == &F)
+ return true;
+ } else {
+ const Constant *C = cast<Constant>(U);
+ if (VisitedConstants.insert(C).second)
+ Stack.push_back(C);
+ }
+ }
+
+ return false;
+ };
- // Check how much local memory is being used by global objects
- CurrentLocalMemUsage = 0;
for (GlobalVariable &GV : Mod->globals()) {
if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
continue;
- for (const User *U : GV.users()) {
- const Instruction *Use = dyn_cast<Instruction>(U);
- if (!Use)
- continue;
+ if (visitUsers(&GV, &GV)) {
+ UsedLDS.insert(&GV);
+ Stack.clear();
+ continue;
+ }
- if (Use->getParent()->getParent() == &F) {
- Align Alignment =
- DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
-
- // FIXME: Try to account for padding here. The padding is currently
- // determined from the inverse order of uses in the function. I'm not
- // sure if the use list order is in any way connected to this, so the
- // total reported size is likely incorrect.
- uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
- CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alignment);
- CurrentLocalMemUsage += AllocSize;
+ // For any ConstantExpr uses, we need to recursively search the users until
+ // we see a function.
+ while (!Stack.empty()) {
+ const Constant *C = Stack.pop_back_val();
+ if (visitUsers(&GV, C)) {
+ UsedLDS.insert(&GV);
+ Stack.clear();
break;
}
}
}
+ const DataLayout &DL = Mod->getDataLayout();
+ SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
+ AllocatedSizes.reserve(UsedLDS.size());
+
+ for (const GlobalVariable *GV : UsedLDS) {
+ Align Alignment =
+ DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
+ uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
+ AllocatedSizes.emplace_back(AllocSize, Alignment);
+ }
+
+ // Sort to try to estimate the worst case alignment padding
+ //
+ // FIXME: We should really do something to fix the addresses to a more optimal
+ // value instead
+ llvm::sort(AllocatedSizes, [](std::pair<uint64_t, Align> LHS,
+ std::pair<uint64_t, Align> RHS) {
+ return LHS.second < RHS.second;
+ });
+
+ // Check how much local memory is being used by global objects
+ CurrentLocalMemUsage = 0;
+
+ // FIXME: Try to account for padding here. The real padding and address is
+ // currently determined from the inverse order of uses in the function when
+ // legalizing, which could also potentially change. We try to estimate the
+ // worst case here, but we probably should fix the addresses earlier.
+ for (auto Alloc : AllocatedSizes) {
+ CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
+ CurrentLocalMemUsage += Alloc.first;
+ }
+
unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
F);
@@ -819,7 +833,7 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
}
// FIXME: Should try to pick the most likely to be profitable allocas first.
-bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
+bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
// Array allocations are probably not worth handling, since an allocation of
// the array type is the canonical form.
if (!I.isStaticAlloca() || I.isArrayAllocation())
@@ -860,7 +874,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
if (!SufficientLDS)
return false;
- const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction);
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
Align Alignment =
@@ -1039,22 +1053,29 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
return true;
}
-bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
- if (skipFunction(F) || DisablePromoteAllocaToVector)
+bool handlePromoteAllocaToVector(AllocaInst &I, unsigned MaxVGPRs) {
+ // Array allocations are probably not worth handling, since an allocation of
+ // the array type is the canonical form.
+ if (!I.isStaticAlloca() || I.isArrayAllocation())
return false;
- const TargetMachine *TM;
- if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
- TM = &TPC->getTM<TargetMachine>();
- else
+ LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
+
+ Module *Mod = I.getParent()->getParent()->getParent();
+ return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
+}
+
+bool promoteAllocasToVector(Function &F, TargetMachine &TM) {
+ if (DisablePromoteAllocaToVector)
return false;
- const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
if (!ST.isPromoteAllocaEnabled())
return false;
- if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
- const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+ unsigned MaxVGPRs;
+ if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
} else {
MaxVGPRs = 128;
@@ -1070,23 +1091,31 @@ bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
}
for (AllocaInst *AI : Allocas) {
- if (handleAlloca(*AI))
+ if (handlePromoteAllocaToVector(*AI, MaxVGPRs))
Changed = true;
}
return Changed;
}
-bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) {
- // Array allocations are probably not worth handling, since an allocation of
- // the array type is the canonical form.
- if (!I.isStaticAlloca() || I.isArrayAllocation())
+bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
+ if (skipFunction(F))
return false;
+ if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
+ return promoteAllocasToVector(F, TPC->getTM<TargetMachine>());
+ }
+ return false;
+}
- LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
-
- Module *Mod = I.getParent()->getParent()->getParent();
- return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
+PreservedAnalyses
+AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
+ bool Changed = promoteAllocasToVector(F, TM);
+ if (Changed) {
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+ }
+ return PreservedAnalyses::all();
}
FunctionPass *llvm::createAMDGPUPromoteAlloca() {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
index 982aae374884..cd71c7a16c73 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
@@ -27,16 +27,14 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Module.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include <string>
#define DEBUG_TYPE "amdgpu-propagate-attributes"
@@ -56,8 +54,10 @@ static constexpr const FeatureBitset TargetFeatures = {
};
// Attributes to propagate.
+// TODO: Support conservative min/max merging instead of cloning.
static constexpr const char* AttributeNames[] = {
- "amdgpu-waves-per-eu"
+ "amdgpu-waves-per-eu",
+ "amdgpu-flat-work-group-size"
};
static constexpr unsigned NumAttr =
@@ -371,15 +371,28 @@ AMDGPUPropagateAttributes::getFeatureString(const FeatureBitset &Features) const
}
bool AMDGPUPropagateAttributesEarly::runOnFunction(Function &F) {
- if (!TM || !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+ if (!TM) {
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ TM = &TPC->getTM<TargetMachine>();
+ }
+
+ if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
return false;
return AMDGPUPropagateAttributes(TM, false).process(F);
}
bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) {
- if (!TM)
- return false;
+ if (!TM) {
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ TM = &TPC->getTM<TargetMachine>();
+ }
return AMDGPUPropagateAttributes(TM, true).process(M);
}
@@ -393,3 +406,21 @@ ModulePass
*llvm::createAMDGPUPropagateAttributesLatePass(const TargetMachine *TM) {
return new AMDGPUPropagateAttributesLate(TM);
}
+
+PreservedAnalyses
+AMDGPUPropagateAttributesEarlyPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+ return PreservedAnalyses::all();
+
+ return AMDGPUPropagateAttributes(&TM, false).process(F)
+ ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
+
+PreservedAnalyses
+AMDGPUPropagateAttributesLatePass::run(Module &M, ModuleAnalysisManager &AM) {
+ return AMDGPUPropagateAttributes(&TM, true).process(M)
+ ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 71d82679b3ff..d644c0319286 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -11,19 +11,17 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
+#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/Support/Debug.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-
+#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "amdgpu-regbank-combiner"
using namespace llvm;
@@ -39,7 +37,7 @@ namespace {
#include "AMDGPUGenRegBankGICombiner.inc"
#undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_H
-class AMDGPURegBankCombinerInfo : public CombinerInfo {
+class AMDGPURegBankCombinerInfo final : public CombinerInfo {
GISelKnownBits *KB;
MachineDominatorTree *MDT;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index dfaf97bfb08e..502356d4f9a4 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -70,21 +70,17 @@
#include "AMDGPURegisterBankInfo.h"
+#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
-#include "AMDGPUSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
-#include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
-#include "llvm/IR/Constants.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"
@@ -187,7 +183,12 @@ public:
}
void changingInstr(MachineInstr &MI) override {}
- void changedInstr(MachineInstr &MI) override {}
+ void changedInstr(MachineInstr &MI) override {
+ // FIXME: In principle we should probably add the instruction to NewInsts,
+ // but the way the LegalizerHelper uses the observer, we will always see the
+ // registers we need to set the regbank on also referenced in a new
+ // instruction.
+ }
};
}
@@ -750,6 +751,9 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
for (MachineInstr &MI : Range) {
for (MachineOperand &Def : MI.defs()) {
+ if (MRI.use_nodbg_empty(Def.getReg()))
+ continue;
+
LLT ResTy = MRI.getType(Def.getReg());
const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
ResultRegs.push_back(Def.getReg());
@@ -847,7 +851,18 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
continue;
}
- LLT OpTy = MRI.getType(Op.getReg());
+ Register OpReg = Op.getReg();
+ LLT OpTy = MRI.getType(OpReg);
+
+ const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
+ if (OpBank != &AMDGPU::VGPRRegBank) {
+ // Insert copy from AGPR to VGPR before the loop.
+ B.setMBB(MBB);
+ OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
+ MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
+ B.setInstr(*I);
+ }
+
unsigned OpSize = OpTy.getSizeInBits();
// Can only do a readlane of 32-bit pieces.
@@ -857,11 +872,11 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
= MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
MRI.setType(CurrentLaneOpReg, OpTy);
- constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
+ constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
// Read the next variant <- also loop target.
BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
CurrentLaneOpReg)
- .addReg(Op.getReg());
+ .addReg(OpReg);
Register NewCondReg = MRI.createVirtualRegister(WaveRC);
bool First = CondReg == AMDGPU::NoRegister;
@@ -872,7 +887,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
.addDef(NewCondReg)
.addReg(CurrentLaneOpReg)
- .addReg(Op.getReg());
+ .addReg(OpReg);
Op.setReg(CurrentLaneOpReg);
if (!First) {
@@ -904,7 +919,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
// Insert the unmerge before the loop.
B.setMBB(MBB);
- auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
+ auto Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
B.setInstr(*I);
unsigned NumPieces = Unmerge->getNumOperands() - 1;
@@ -1039,7 +1054,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
// Return any unique registers used by \p MI at \p OpIndices that need to be
// handled in a waterfall loop. Returns these registers in \p
-// SGPROperandRegs. Returns true if there are any operansd to handle and a
+// SGPROperandRegs. Returns true if there are any operands to handle and a
// waterfall loop is necessary.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
@@ -1048,7 +1063,7 @@ bool AMDGPURegisterBankInfo::collectWaterfallOperands(
assert(MI.getOperand(Op).isUse());
Register Reg = MI.getOperand(Op).getReg();
const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
- if (OpBank->getID() == AMDGPU::VGPRRegBankID)
+ if (OpBank->getID() != AMDGPU::SGPRRegBankID)
SGPROperandRegs.insert(Reg);
}
@@ -1083,16 +1098,24 @@ void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
Register Reg = MI.getOperand(OpIdx).getReg();
const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
- if (Bank != &AMDGPU::VGPRRegBank)
+ if (Bank == &AMDGPU::SGPRRegBank)
return;
+ LLT Ty = MRI.getType(Reg);
MachineIRBuilder B(MI);
+
+ if (Bank != &AMDGPU::VGPRRegBank) {
+ // We need to copy from AGPR to VGPR
+ Reg = B.buildCopy(Ty, Reg).getReg(0);
+ MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
+ }
+
Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
.addDef(SGPR)
.addReg(Reg);
- MRI.setType(SGPR, MRI.getType(Reg));
+ MRI.setType(SGPR, Ty);
const TargetRegisterClass *Constrained =
constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
@@ -1149,10 +1172,8 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
// 96-bit loads are only available for vector loads. We need to split this
// into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
- MachineIRBuilder B(MI);
ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
- GISelObserverWrapper Observer(&O);
- B.setChangeObserver(Observer);
+ MachineIRBuilder B(MI, O);
if (MMO->getAlign() < Align(16)) {
LLT Part64, Part32;
@@ -1191,13 +1212,10 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
MRI.setType(BasePtrReg, PtrTy);
- MachineIRBuilder B(MI);
-
unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
- ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
- GISelObserverWrapper Observer(&O);
- B.setChangeObserver(Observer);
+ ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
+ MachineIRBuilder B(MI, Observer);
LegalizerHelper Helper(B.getMF(), Observer, B);
if (LoadTy.isVector()) {
@@ -1241,10 +1259,7 @@ bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Register SPReg = Info->getStackPtrOffsetReg();
ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
- GISelObserverWrapper Observer(&ApplyBank);
-
- MachineIRBuilder B(MI);
- B.setChangeObserver(Observer);
+ MachineIRBuilder B(MI, ApplyBank);
auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
@@ -1309,7 +1324,7 @@ static unsigned setBufferOffsets(MachineIRBuilder &B,
const LLT S32 = LLT::scalar(32);
MachineRegisterInfo *MRI = B.getMRI();
- if (Optional<int64_t> Imm = getConstantVRegVal(CombinedOffset, *MRI)) {
+ if (Optional<int64_t> Imm = getConstantVRegSExtVal(CombinedOffset, *MRI)) {
uint32_t SOffset, ImmOffset;
if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
Alignment)) {
@@ -1325,10 +1340,9 @@ static unsigned setBufferOffsets(MachineIRBuilder &B,
Register Base;
unsigned Offset;
- MachineInstr *Unused;
- std::tie(Base, Offset, Unused)
- = AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
+ std::tie(Base, Offset) =
+ AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
uint32_t SOffset, ImmOffset;
if (Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
@@ -1535,9 +1549,7 @@ bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
// The scalar form packs the offset and width in a single operand.
ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
- GISelObserverWrapper Observer(&ApplyBank);
- MachineIRBuilder B(MI);
- B.setChangeObserver(Observer);
+ MachineIRBuilder B(MI, ApplyBank);
// Ensure the high bits are clear to insert the offset.
auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
@@ -1922,7 +1934,7 @@ bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
const RegisterBank &IdxBank =
*OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
- bool IsDivergentIdx = IdxBank == AMDGPU::VGPRRegBank;
+ bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
LLT VecTy = MRI.getType(VecReg);
unsigned EltSize = VecTy.getScalarSizeInBits();
@@ -2004,7 +2016,7 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
const RegisterBank &IdxBank =
*OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
- bool IsDivergentIdx = IdxBank == AMDGPU::VGPRRegBank;
+ bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
LLT VecTy = MRI.getType(VecReg);
unsigned EltSize = VecTy.getScalarSizeInBits();
@@ -2129,9 +2141,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// Promote SGPR/VGPR booleans to s32
MachineFunction *MF = MI.getParent()->getParent();
ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
- GISelObserverWrapper Observer(&ApplyBank);
- MachineIRBuilder B(MI);
- LegalizerHelper Helper(*MF, Observer, B);
+ MachineIRBuilder B(MI, ApplyBank);
+ LegalizerHelper Helper(*MF, ApplyBank, B);
if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
llvm_unreachable("widen scalar should have succeeded");
@@ -2274,9 +2285,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
MachineFunction *MF = MI.getParent()->getParent();
ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
- GISelObserverWrapper Observer(&ApplyBank);
- MachineIRBuilder B(MI);
- LegalizerHelper Helper(*MF, Observer, B);
+ MachineIRBuilder B(MI, ApplyBank);
+ LegalizerHelper Helper(*MF, ApplyBank, B);
if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
LegalizerHelper::Legalized)
@@ -2319,15 +2329,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
setRegsToType(MRI, DefRegs, HalfTy);
- B.buildInstr(Opc)
- .addDef(DefRegs[0])
- .addUse(Src0Regs[0])
- .addUse(Src1Regs[0]);
-
- B.buildInstr(Opc)
- .addDef(DefRegs[1])
- .addUse(Src0Regs[1])
- .addUse(Src1Regs[1]);
+ B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
+ B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
MI.eraseFromParent();
@@ -2355,13 +2358,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
const LLT S32 = LLT::scalar(32);
MachineBasicBlock *MBB = MI.getParent();
MachineFunction *MF = MBB->getParent();
- MachineIRBuilder B(MI);
ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
- GISelObserverWrapper Observer(&ApplySALU);
+ MachineIRBuilder B(MI, ApplySALU);
if (DstTy.isVector()) {
- B.setChangeObserver(Observer);
-
Register WideSrc0Lo, WideSrc0Hi;
Register WideSrc1Lo, WideSrc1Hi;
@@ -2374,7 +2374,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
MI.eraseFromParent();
} else {
- LegalizerHelper Helper(*MF, Observer, B);
+ LegalizerHelper Helper(*MF, ApplySALU, B);
if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
llvm_unreachable("widen scalar should have succeeded");
@@ -2411,8 +2411,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
if (Ty == V2S16) {
ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
- GISelObserverWrapper Observer(&ApplySALU);
- B.setChangeObserver(Observer);
+ B.setChangeObserver(ApplySALU);
// Need to widen to s32, and expand as cmp + select, and avoid producing
// illegal vector extends or unmerges that would need further
@@ -2444,8 +2443,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
MI.eraseFromParent();
} else if (Ty == S16) {
ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
- GISelObserverWrapper Observer(&ApplySALU);
- LegalizerHelper Helper(*MF, Observer, B);
+ B.setChangeObserver(ApplySALU);
+ LegalizerHelper Helper(*MF, ApplySALU, B);
// Need to widen to s32, and expand as cmp + select.
if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
@@ -2499,9 +2498,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_CTPOP:
case AMDGPU::G_CTLZ_ZERO_UNDEF:
case AMDGPU::G_CTTZ_ZERO_UNDEF: {
- MachineIRBuilder B(MI);
- MachineFunction &MF = B.getMF();
-
const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
if (DstBank == &AMDGPU::SGPRRegBank)
@@ -2514,8 +2510,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
break;
ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
- GISelObserverWrapper Observer(&ApplyVALU);
- LegalizerHelper Helper(MF, Observer, B);
+ MachineIRBuilder B(MI, ApplyVALU);
+
+ MachineFunction &MF = B.getMF();
+ LegalizerHelper Helper(MF, ApplyVALU, B);
if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
llvm_unreachable("narrowScalar should have succeeded");
@@ -2693,8 +2691,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
Register BaseIdxReg;
unsigned ConstOffset;
- MachineInstr *OffsetDef;
- std::tie(BaseIdxReg, ConstOffset, OffsetDef) =
+ std::tie(BaseIdxReg, ConstOffset) =
AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
// See if the index is an add of a constant which will be foldable by moving
@@ -2825,9 +2822,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
Register BaseIdxReg;
unsigned ConstOffset;
- MachineInstr *OffsetDef;
- std::tie(BaseIdxReg, ConstOffset, OffsetDef) =
- AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
+ std::tie(BaseIdxReg, ConstOffset) =
+ AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
// See if the index is an add of a constant which will be foldable by moving
// the base register of the index later if this is going to be executed in a
@@ -2957,6 +2953,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
executeInWaterfallLoop(MI, MRI, {2, 5});
return;
}
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
+ applyDefaultMapping(OpdMapper);
+ executeInWaterfallLoop(MI, MRI, {2, 5});
+ return;
+ }
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
applyDefaultMapping(OpdMapper);
executeInWaterfallLoop(MI, MRI, {3, 6});
@@ -2989,7 +2990,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(MI, MRI, 3); // Index
return;
}
- case Intrinsic::amdgcn_ballot:
case Intrinsic::amdgcn_interp_p1:
case Intrinsic::amdgcn_interp_p2:
case Intrinsic::amdgcn_interp_mov:
@@ -3017,6 +3017,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case Intrinsic::amdgcn_ubfe:
applyMappingBFEIntrinsic(OpdMapper, false);
return;
+ case Intrinsic::amdgcn_ballot:
+ // Use default handling and insert copy to vcc source.
+ break;
}
break;
}
@@ -3031,6 +3034,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
return;
}
+ case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
+ unsigned N = MI.getNumExplicitOperands() - 2;
+ executeInWaterfallLoop(MI, MRI, { N });
+ return;
+ }
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
auto IntrID = MI.getIntrinsicID();
switch (IntrID) {
@@ -3106,6 +3114,59 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return applyDefaultMapping(OpdMapper);
}
+// vgpr, sgpr -> vgpr
+// vgpr, agpr -> vgpr
+// agpr, agpr -> agpr
+// agpr, sgpr -> vgpr
+static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
+ if (RB0 == AMDGPU::InvalidRegBankID)
+ return RB1;
+ if (RB1 == AMDGPU::InvalidRegBankID)
+ return RB0;
+
+ if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
+ return AMDGPU::SGPRRegBankID;
+
+ if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
+ return AMDGPU::AGPRRegBankID;
+
+ return AMDGPU::VGPRRegBankID;
+}
+
+static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
+ if (RB0 == AMDGPU::InvalidRegBankID)
+ return RB1;
+ if (RB1 == AMDGPU::InvalidRegBankID)
+ return RB0;
+
+ // vcc, vcc -> vcc
+ // vcc, sgpr -> vcc
+ // vcc, vgpr -> vcc
+ if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
+ return AMDGPU::VCCRegBankID;
+
+ // Any remaining non-vcc combination falls back to the plain union.
+ return regBankUnion(RB0, RB1);
+}
+
+unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
+ const MachineInstr &MI) const {
+ unsigned RegBank = AMDGPU::InvalidRegBankID;
+
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ if (!MI.getOperand(i).isReg())
+ continue;
+ Register Reg = MI.getOperand(i).getReg();
+ if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
+ RegBank = regBankUnion(RegBank, Bank->getID());
+ if (RegBank == AMDGPU::VGPRRegBankID)
+ break;
+ }
+ }
+
+ return RegBank;
+}
+
bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -3214,7 +3275,7 @@ AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
if (MustBeSGPR) {
// If this must be an SGPR, we must report whatever it is as legal.
- unsigned NewBank = getRegBankID(OpReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
+ unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
} else {
// Some operands must be VGPR, and these are easy to copy to.
@@ -3232,7 +3293,7 @@ AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
LLT PtrTy = MRI.getType(PtrReg);
unsigned Size = PtrTy.getSizeInBits();
if (Subtarget.useFlatForGlobal() ||
- !SITargetLowering::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
+ !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
// If we're using MUBUF instructions for global memory, an SGPR base register
@@ -3258,8 +3319,7 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
- if (PtrBank == &AMDGPU::SGPRRegBank &&
- SITargetLowering::isFlatGlobalAddrSpace(AS)) {
+ if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
if (isScalarLoadLegal(MI)) {
// We have a uniform instruction so we want to use an SMRD load
ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
@@ -3292,41 +3352,18 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
unsigned
AMDGPURegisterBankInfo::getRegBankID(Register Reg,
const MachineRegisterInfo &MRI,
- const TargetRegisterInfo &TRI,
unsigned Default) const {
- const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
+ const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
return Bank ? Bank->getID() : Default;
}
-
-static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
- return (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) ?
- AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
-}
-
-static int regBankBoolUnion(int RB0, int RB1) {
- if (RB0 == -1)
- return RB1;
- if (RB1 == -1)
- return RB0;
-
- // vcc, vcc -> vcc
- // vcc, sgpr -> vcc
- // vcc, vgpr -> vcc
- if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
- return AMDGPU::VCCRegBankID;
-
- // vcc, vgpr -> vgpr
- return regBankUnion(RB0, RB1);
-}
-
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI) const {
// Lie and claim anything is legal, even though this needs to be an SGPR
// applyMapping will have to deal with it as a waterfall loop.
- unsigned Bank = getRegBankID(Reg, MRI, TRI, AMDGPU::SGPRRegBankID);
+ unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
unsigned Size = getSizeInBits(Reg, MRI, TRI);
return AMDGPU::getValueMapping(Bank, Size);
}
@@ -3361,7 +3398,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
- if (MI.isCopy()) {
+ if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
// The default logic bothers to analyze impossible alternative mappings. We
// want the most straightforward mapping, so just directly handle this.
const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
@@ -3377,9 +3414,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
return getInvalidInstructionMapping();
const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
+ unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
+ SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
+ OpdsMapping[0] = &ValMap;
+ if (MI.getOpcode() == AMDGPU::G_FREEZE)
+ OpdsMapping[1] = &ValMap;
+
return getInstructionMapping(
1, /*Cost*/ 1,
- /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
+ /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
}
if (MI.isRegSequence()) {
@@ -3388,7 +3431,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
unsigned BankID = AMDGPU::SGPRRegBankID;
for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
- auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
+ auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
// It doesn't make sense to use vcc or scc banks here, so just ignore
// them.
if (OpBank != AMDGPU::SGPRRegBankID) {
@@ -3409,8 +3452,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
//
// TODO: There are additional exec masking dependencies to analyze.
if (MI.getOpcode() == TargetOpcode::G_PHI) {
- // TODO: Generate proper invalid bank enum.
- int ResultBank = -1;
+ unsigned ResultBank = AMDGPU::InvalidRegBankID;
Register DstReg = MI.getOperand(0).getReg();
// Sometimes the result may have already been assigned a bank.
@@ -3432,7 +3474,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
ResultBank = regBankBoolUnion(ResultBank, OpBank);
}
- assert(ResultBank != -1);
+ assert(ResultBank != AMDGPU::InvalidRegBankID);
unsigned Size = MRI.getType(DstReg).getSizeInBits();
@@ -3461,9 +3503,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const RegisterBank *DstBank
= getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
- unsigned TargetBankID = -1;
- unsigned BankLHS = -1;
- unsigned BankRHS = -1;
+ unsigned TargetBankID = AMDGPU::InvalidRegBankID;
+ unsigned BankLHS = AMDGPU::InvalidRegBankID;
+ unsigned BankRHS = AMDGPU::InvalidRegBankID;
if (DstBank) {
TargetBankID = DstBank->getID();
if (DstBank == &AMDGPU::VCCRegBank) {
@@ -3471,15 +3513,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
BankLHS = AMDGPU::VCCRegBankID;
BankRHS = AMDGPU::VCCRegBankID;
} else {
- BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
+ BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
AMDGPU::SGPRRegBankID);
- BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+ BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
AMDGPU::SGPRRegBankID);
}
} else {
- BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
+ BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
AMDGPU::VCCRegBankID);
- BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+ BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
AMDGPU::VCCRegBankID);
// Both inputs should be true booleans to produce a boolean result.
@@ -3507,10 +3549,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
} else {
OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
- unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/);
+ unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
- unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/);
+ unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
}
@@ -3542,6 +3584,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
return getDefaultMappingSOP(MI);
LLVM_FALLTHROUGH;
+ case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
+ case AMDGPU::G_SSUBSAT:
+ case AMDGPU::G_UADDSAT:
+ case AMDGPU::G_USUBSAT:
case AMDGPU::G_FADD:
case AMDGPU::G_FSUB:
case AMDGPU::G_FPTOSI:
@@ -3606,13 +3652,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_DYN_STACKALLOC: {
// Result is always uniform, and a wave reduction is needed for the source.
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
- unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
+ unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
break;
}
case AMDGPU::G_INSERT: {
- unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
- AMDGPU::VGPRRegBankID;
+ unsigned BankID = getMappingType(MRI, MI);
unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
@@ -3623,7 +3668,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case AMDGPU::G_EXTRACT: {
- unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
+ unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
@@ -3637,8 +3682,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
if (DstTy == LLT::vector(2, 16)) {
unsigned DstSize = DstTy.getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
- unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
- unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
+ unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
+ unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
@@ -3651,8 +3696,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case AMDGPU::G_MERGE_VALUES:
case AMDGPU::G_CONCAT_VECTORS: {
- unsigned Bank = isSALUMapping(MI) ?
- AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
+ unsigned Bank = getMappingType(MRI, MI);
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
@@ -3669,7 +3713,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_FABS:
case AMDGPU::G_FNEG: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
+ unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
break;
}
@@ -3677,7 +3721,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_CTTZ_ZERO_UNDEF:
case AMDGPU::G_CTPOP: {
unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
- unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
+ unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
// This should really be getValueMappingSGPR64Only, but allowing the generic
@@ -3689,7 +3733,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_TRUNC: {
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
- unsigned Bank = getRegBankID(Src, MRI, *TRI);
+ unsigned Bank = getRegBankID(Src, MRI);
unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
@@ -3726,7 +3770,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case AMDGPU::G_FCMP: {
unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
- unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
+ unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
OpdsMapping[1] = nullptr; // Predicate Operand.
OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
@@ -3751,10 +3795,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// See if the result register has already been constrained to vcc, which may
// happen due to control flow intrinsic lowering.
- unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
+ unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
AMDGPU::SGPRRegBankID);
- unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
- unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
+ unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
+ unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
Op2Bank == AMDGPU::SGPRRegBankID &&
@@ -3777,11 +3821,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case AMDGPU::G_EXTRACT_VECTOR_ELT: {
// VGPR index can be used for waterfall when indexing a SGPR vector.
- unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
+ unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
- unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
+ unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
@@ -3798,9 +3842,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
- unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(),
- MRI, *TRI);
- unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
+ unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
+ unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
@@ -3820,8 +3863,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case AMDGPU::G_UNMERGE_VALUES: {
- unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
- AMDGPU::VGPRRegBankID;
+ unsigned Bank = getMappingType(MRI, MI);
// Op1 and Dst should use the same register bank.
// FIXME: Shouldn't this be the default? Why do we need to handle this?
@@ -3876,7 +3918,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
- case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
// vdata_out
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
@@ -3958,6 +4001,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_rsq_legacy:
case Intrinsic::amdgcn_rsq_clamp:
case Intrinsic::amdgcn_fmul_legacy:
+ case Intrinsic::amdgcn_fma_legacy:
case Intrinsic::amdgcn_ldexp:
case Intrinsic::amdgcn_frexp_mant:
case Intrinsic::amdgcn_frexp_exp:
@@ -4011,10 +4055,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_wwm:
case Intrinsic::amdgcn_wqm:
case Intrinsic::amdgcn_softwqm:
+ case Intrinsic::amdgcn_set_inactive:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_kernarg_segment_ptr:
case Intrinsic::amdgcn_s_getpc:
- case Intrinsic::amdgcn_groupstaticsize: {
+ case Intrinsic::amdgcn_groupstaticsize:
+ case Intrinsic::amdgcn_reloc_constant:
+ case Intrinsic::returnaddress: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
@@ -4065,7 +4112,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// This must be an SGPR, but accept a VGPR.
Register IdxReg = MI.getOperand(3).getReg();
unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
- unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
+ unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
LLVM_FALLTHROUGH;
}
@@ -4080,10 +4127,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
Register SrcReg = MI.getOperand(2).getReg();
unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
- unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
+ unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
Register IdxReg = MI.getOperand(3).getReg();
unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
- unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
+ unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
// These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
@@ -4149,7 +4196,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_interp_p2_f16: {
const int M0Idx = MI.getNumOperands() - 1;
Register M0Reg = MI.getOperand(M0Idx).getReg();
- unsigned M0Bank = getRegBankID(M0Reg, MRI, *TRI, AMDGPU::SGPRRegBankID);
+ unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
@@ -4182,6 +4229,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
assert(RSrcIntrin->IsImage);
return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
}
+ case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
+ unsigned N = MI.getNumExplicitOperands() - 2;
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
+ OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
+ for (unsigned I = 2; I < N; ++I)
+ OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ break;
+ }
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
auto IntrID = MI.getIntrinsicID();
switch (IntrID) {
@@ -4193,15 +4248,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
- case Intrinsic::amdgcn_ds_fadd:
- case Intrinsic::amdgcn_ds_fmin:
- case Intrinsic::amdgcn_ds_fmax:
+ case Intrinsic::amdgcn_global_atomic_fadd:
+ case Intrinsic::amdgcn_global_atomic_csub:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
- unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+ unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
AMDGPU::SGPRRegBankID);
OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
@@ -4228,14 +4282,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_s_sendmsg:
case Intrinsic::amdgcn_s_sendmsghalt: {
// This must be an SGPR, but accept a VGPR.
- unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+ unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
AMDGPU::SGPRRegBankID);
OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
break;
}
case Intrinsic::amdgcn_s_setreg: {
// This must be an SGPR, but accept a VGPR.
- unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+ unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
AMDGPU::SGPRRegBankID);
OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
break;
@@ -4304,7 +4358,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
// This must be an SGPR, but accept a VGPR.
- unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+ unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
AMDGPU::SGPRRegBankID);
OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
break;
@@ -4313,7 +4367,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_ds_gws_sema_p:
case Intrinsic::amdgcn_ds_gws_sema_release_all: {
// This must be an SGPR, but accept a VGPR.
- unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
+ unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
AMDGPU::SGPRRegBankID);
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
break;
@@ -4325,16 +4379,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case AMDGPU::G_SELECT: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+ unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
AMDGPU::SGPRRegBankID);
- unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI,
+ unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
AMDGPU::SGPRRegBankID);
bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
Op3Bank == AMDGPU::SGPRRegBankID;
unsigned CondBankDefault = SGPRSrcs ?
AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
- unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
+ unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
CondBankDefault);
if (CondBank == AMDGPU::SGPRRegBankID)
CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
@@ -4380,7 +4434,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_ATOMICRMW_FADD:
case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
case AMDGPU::G_AMDGPU_ATOMIC_INC:
- case AMDGPU::G_AMDGPU_ATOMIC_DEC: {
+ case AMDGPU::G_AMDGPU_ATOMIC_DEC:
+ case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
+ case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
@@ -4394,7 +4450,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case AMDGPU::G_BRCOND: {
- unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
+ unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
AMDGPU::SGPRRegBankID);
assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
if (Bank != AMDGPU::SGPRRegBankID)
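
Aside (illustrative sketch, not part of the upstream diff): the regBankUnion/regBankBoolUnion helpers added above act as a small merge lattice. SGPR or AGPR survives only when both inputs agree, any mixed pair degrades to VGPR, and VCC dominates in the boolean variant. A self-contained sketch of that logic with stand-in enum values; the real code uses the generated AMDGPU::*RegBankID constants.

#include <cassert>

// Stand-in IDs; the real values come from AMDGPUGenRegisterBank.inc.
enum BankID { Invalid, SGPR, VGPR, AGPR, VCC };

static BankID bankUnion(BankID A, BankID B) {
  if (A == Invalid) return B;
  if (B == Invalid) return A;
  if (A == SGPR && B == SGPR) return SGPR; // uniform stays uniform
  if (A == AGPR && B == AGPR) return AGPR; // pure AGPR stays AGPR
  return VGPR;                             // any mix degrades to VGPR
}

static BankID bankBoolUnion(BankID A, BankID B) {
  if (A == Invalid) return B;
  if (B == Invalid) return A;
  if (A == VCC || B == VCC) return VCC;    // wave-mask booleans win
  return bankUnion(A, B);                  // remaining non-vcc combinations
}

int main() {
  assert(bankUnion(SGPR, AGPR) == VGPR);   // agpr, sgpr -> vgpr
  assert(bankBoolUnion(VCC, SGPR) == VCC); // vcc, sgpr -> vcc
  return 0;
}
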
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index 8f38ec4eeb3a..1c1441729e30 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -20,7 +20,6 @@
#define GET_REGBANK_DECLARATIONS
#include "AMDGPUGenRegisterBank.inc"
-#undef GET_REGBANK_DECLARATIONS
namespace llvm {
@@ -39,7 +38,8 @@ protected:
#define GET_TARGET_REGBANK_CLASS
#include "AMDGPUGenRegisterBank.inc"
};
-class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
+
+class AMDGPURegisterBankInfo final : public AMDGPUGenRegisterBankInfo {
public:
const GCNSubtarget &Subtarget;
const SIRegisterInfo *TRI;
@@ -105,7 +105,6 @@ public:
getInstrMappingForLoad(const MachineInstr &MI) const;
unsigned getRegBankID(Register Reg, const MachineRegisterInfo &MRI,
- const TargetRegisterInfo &TRI,
unsigned Default = AMDGPU::VGPRRegBankID) const;
// Return a value mapping for an operand that is required to be an SGPR.
@@ -150,6 +149,9 @@ public:
getInstrAlternativeMappingsIntrinsicWSideEffects(
const MachineInstr &MI, const MachineRegisterInfo &MRI) const;
+ unsigned getMappingType(const MachineRegisterInfo &MRI,
+ const MachineInstr &MI) const;
+
bool isSALUMapping(const MachineInstr &MI) const;
const InstructionMapping &getDefaultMappingSOP(const MachineInstr &MI) const;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index 9f6ebd00cd97..6c70b53b23c1 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
def SGPRRegBank : RegisterBank<"SGPR",
- [SReg_LO16, SReg_32, SReg_64, SReg_128, SReg_160, SReg_192, SReg_256, SReg_512, SReg_1024]
+ [SReg_LO16, SReg_32, SReg_64, SReg_96, SReg_128, SReg_160, SReg_192, SReg_256, SReg_512, SReg_1024]
>;
def VGPRRegBank : RegisterBank<"VGPR",
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
index 9c3d96de6d68..e2aafa25142e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -43,35 +43,16 @@
#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
-#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-#include <utility>
#define DEBUG_TYPE "amdgpu-rewrite-out-arguments"
@@ -303,8 +284,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
for (ReturnInst *RI : Returns) {
BasicBlock *BB = RI->getParent();
- MemDepResult Q = MDA->getPointerDependencyFrom(MemoryLocation(OutArg),
- true, BB->end(), BB, RI);
+ MemDepResult Q = MDA->getPointerDependencyFrom(
+ MemoryLocation::getBeforeOrAfter(OutArg), true, BB->end(), BB, RI);
StoreInst *SI = nullptr;
if (Q.isDef())
SI = dyn_cast<StoreInst>(Q.getInst());
@@ -325,9 +306,10 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
Value *ReplVal = Store.second->getValueOperand();
auto &ValVec = Replacements[Store.first];
- if (llvm::find_if(ValVec,
- [OutArg](const std::pair<Argument *, Value *> &Entry) {
- return Entry.first == OutArg;}) != ValVec.end()) {
+ if (llvm::any_of(ValVec,
+ [OutArg](const std::pair<Argument *, Value *> &Entry) {
+ return Entry.first == OutArg;
+ })) {
LLVM_DEBUG(dbgs()
<< "Saw multiple out arg stores" << *OutArg << '\n');
// It is possible to see stores to the same argument multiple times,
@@ -408,8 +390,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
if (DL->getTypeSizeInBits(EffectiveEltTy) !=
DL->getTypeSizeInBits(Val->getType())) {
assert(isVec3ToVec4Shuffle(EffectiveEltTy, Val->getType()));
- Val = B.CreateShuffleVector(Val, UndefValue::get(Val->getType()),
- ArrayRef<int>{0, 1, 2});
+ Val = B.CreateShuffleVector(Val, ArrayRef<int>{0, 1, 2});
}
Val = B.CreateBitCast(Val, EffectiveEltTy);
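
Aside (illustrative sketch, not part of the upstream diff): the Replacements hunk above swaps the find_if(...) != end() comparison for llvm::any_of, which states the membership question directly. A minimal example of the two spellings with made-up pair values; both range helpers come from llvm/ADT/STLExtras.h.

#include "llvm/ADT/STLExtras.h"
#include <utility>
#include <vector>

int main() {
  // Hypothetical stand-in for the (Argument *, Value *) pairs in ValVec.
  std::vector<std::pair<int, int>> ValVec = {{1, 10}, {2, 20}};
  int OutArg = 2;

  // Before: predicate plus iterator comparison.
  bool SeenBefore =
      llvm::find_if(ValVec, [OutArg](const std::pair<int, int> &Entry) {
        return Entry.first == OutArg;
      }) != ValVec.end();

  // After: same predicate, intent stated as a yes/no query.
  bool SeenAfter =
      llvm::any_of(ValVec, [OutArg](const std::pair<int, int> &Entry) {
        return Entry.first == OutArg;
      });

  return SeenBefore == SeenAfter ? 0 : 1;
}
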
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index bc68310b2f5c..fd65727f04d4 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -225,6 +225,7 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_or>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>;
@@ -238,6 +239,7 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_or>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_ps_live>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 213788ae0f67..f1a7d7463676 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -13,18 +13,21 @@
#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
-#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
+#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>
using namespace llvm;
@@ -50,6 +53,15 @@ static cl::opt<bool> EnableVGPRIndexMode(
cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
cl::init(false));
+static cl::opt<bool> EnableFlatScratch(
+ "amdgpu-enable-flat-scratch",
+ cl::desc("Use flat scratch instructions"),
+ cl::init(false));
+
+static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
+ cl::desc("Enable the use of AA during codegen."),
+ cl::init(true));
+
GCNSubtarget::~GCNSubtarget() = default;
R600Subtarget &
@@ -57,7 +69,7 @@ R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
StringRef GPU, StringRef FS) {
SmallString<256> FullFS("+promote-alloca,");
FullFS += FS;
- ParseSubtargetFeatures(GPU, FullFS);
+ ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
HasMulU24 = getGeneration() >= EVERGREEN;
HasMulI24 = hasCaymanISA();
@@ -77,11 +89,11 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
// Similarly we want enable-prt-strict-null to be on by default and not to
// unset everything else if it is disabled
- // Assuming ECC is enabled is the conservative default.
- SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");
+ SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");
- if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
- FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
+ // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by default.
+ if (isAmdHsaOS())
+ FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
@@ -97,17 +109,38 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
FullFS += FS;
- ParseSubtargetFeatures(GPU, FullFS);
+ ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
+
+ // Implement the "generic" processors, which act as the default when no
+ // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
+ // the first amdgcn target that supports flat addressing. Other OSes default
+ // to the first amdgcn target.
+ if (Gen == AMDGPUSubtarget::INVALID) {
+ Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
+ : AMDGPUSubtarget::SOUTHERN_ISLANDS;
+ }
// We don't support FP64 for EG/NI atm.
assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
- // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
- // on VI and newer hardware to avoid assertion failures due to missing ADDR64
- // variants of MUBUF instructions.
- if (!hasAddr64() && !FS.contains("flat-for-global")) {
+ // Targets must support 64-bit offsets for MUBUF instructions, and/or
+ // support flat operations; otherwise they cannot access a 64-bit global
+ // address space.
+ assert(hasAddr64() || hasFlat());
+ // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
+ // that do not support ADDR64 variants of MUBUF instructions. Such targets
+ // cannot use a 64-bit offset with a MUBUF instruction to access the global
+ // address space.
+ if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
+ ToggleFeature(AMDGPU::FeatureFlatForGlobal);
FlatForGlobal = true;
}
+ // Unless +-flat-for-global is specified, use MUBUF instructions for global
+ // address space access if flat operations are not available.
+ if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
+ ToggleFeature(AMDGPU::FeatureFlatForGlobal);
+ FlatForGlobal = false;
+ }
// Set defaults if needed.
if (MaxPrivateElementSize == 0)
@@ -131,20 +164,12 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
- // Disable XNACK on targets where it is not enabled by default unless it is
- // explicitly requested.
- if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
- ToggleFeature(AMDGPU::FeatureXNACK);
- EnableXNACK = false;
- }
+ TargetID.setTargetIDFromFeaturesString(FS);
- // ECC is on by default, but turn it off if the hardware doesn't support it
- // anyway. This matters for the gfx9 targets with d16 loads, but don't support
- // ECC.
- if (DoesNotSupportSRAMECC && EnableSRAMECC) {
- ToggleFeature(AMDGPU::FeatureSRAMECC);
- EnableSRAMECC = false;
- }
+ LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
+ << TargetID.getXnackSetting() << '\n');
+ LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
+ << TargetID.getSramEccSetting() << '\n');
return *this;
}
@@ -170,10 +195,11 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
const GCNTargetMachine &TM) :
- AMDGPUGenSubtargetInfo(TT, GPU, FS),
+ AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
AMDGPUSubtarget(TT),
TargetTriple(TT),
- Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
+ TargetID(*this),
+ Gen(INVALID),
InstrItins(getInstrItineraryForCPU(GPU)),
LDSBankCount(0),
MaxPrivateElementSize(0),
@@ -184,13 +210,12 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
FlatForGlobal(false),
AutoWaitcntBeforeBarrier(false),
- CodeObjectV3(false),
UnalignedScratchAccess(false),
- UnalignedBufferAccess(false),
+ UnalignedAccessMode(false),
HasApertureRegs(false),
+ SupportsXNACK(false),
EnableXNACK(false),
- DoesNotSupportXNACK(false),
EnableCuMode(false),
TrapHandler(false),
@@ -239,8 +264,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasMAIInsts(false),
HasPkFmacF16Inst(false),
HasAtomicFaddInsts(false),
+ SupportsSRAMECC(false),
EnableSRAMECC(false),
- DoesNotSupportSRAMECC(false),
HasNoSdstCMPX(false),
HasVscnt(false),
HasGetWaveIdInst(false),
@@ -257,6 +282,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasUnpackedD16VMem(false),
LDSMisalignedBug(false),
HasMFMAInlineLiteralBug(false),
+ UnalignedBufferAccess(false),
+ UnalignedDSAccess(false),
ScalarizeGlobal(false),
@@ -269,6 +296,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasNSAtoVMEMBug(false),
HasOffset3fBug(false),
HasFlatSegmentOffsetBug(false),
+ HasImageStoreD16Bug(false),
+ HasImageGather4D16Bug(false),
FeatureDisable(false),
InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
@@ -283,20 +312,24 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
*this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
+bool GCNSubtarget::enableFlatScratch() const {
+ return EnableFlatScratch && hasFlatScratchInsts();
+}
+
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
if (getGeneration() < GFX10)
return 1;
switch (Opcode) {
- case AMDGPU::V_LSHLREV_B64:
+ case AMDGPU::V_LSHLREV_B64_e64:
case AMDGPU::V_LSHLREV_B64_gfx10:
- case AMDGPU::V_LSHL_B64:
- case AMDGPU::V_LSHRREV_B64:
+ case AMDGPU::V_LSHL_B64_e64:
+ case AMDGPU::V_LSHRREV_B64_e64:
case AMDGPU::V_LSHRREV_B64_gfx10:
- case AMDGPU::V_LSHR_B64:
- case AMDGPU::V_ASHRREV_I64:
+ case AMDGPU::V_LSHR_B64_e64:
+ case AMDGPU::V_ASHRREV_I64_e64:
case AMDGPU::V_ASHRREV_I64_gfx10:
- case AMDGPU::V_ASHR_I64:
+ case AMDGPU::V_ASHR_I64_e64:
return 1;
}
@@ -436,6 +469,25 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
return Requested;
}
+static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
+ auto Node = Kernel.getMetadata("reqd_work_group_size");
+ if (Node && Node->getNumOperands() == 3)
+ return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
+ return std::numeric_limits<unsigned>::max();
+}
+
+bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
+ return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
+}
+
+unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
+ unsigned Dimension) const {
+ unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
+ if (ReqdSize != std::numeric_limits<unsigned>::max())
+ return ReqdSize - 1;
+ return getFlatWorkGroupSizes(Kernel).second - 1;
+}
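
For example, a kernel carrying the metadata !reqd_work_group_size !{i32 64, i32 1, i32 1} makes getMaxWorkitemID return 63 for dimension 0 and 0 for dimensions 1 and 2; without that metadata the bound falls back to the upper end of the flat work group size range minus one (1023 if the usual 1024 limit applies). The numbers here are illustrative; the exact fallback depends on the function's amdgpu-flat-work-group-size attribute.
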
+
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
Function *Kernel = I->getParent()->getParent();
unsigned MinSize = 0;
@@ -472,11 +524,11 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
default:
break;
}
+
if (Dim <= 3) {
- if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
- if (Node->getNumOperands() == 3)
- MinSize = MaxSize = mdconst::extract<ConstantInt>(
- Node->getOperand(Dim))->getZExtValue();
+ unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
+ if (ReqdSize != std::numeric_limits<unsigned>::max())
+ MinSize = MaxSize = ReqdSize;
}
}
}
@@ -498,6 +550,12 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
return true;
}
+unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
+ if (isMesaKernel(F))
+ return 16;
+ return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
+}
+
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
Align &MaxAlign) const {
assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
@@ -508,12 +566,15 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
MaxAlign = Align(1);
for (const Argument &Arg : F.args()) {
- Type *ArgTy = Arg.getType();
+ const bool IsByRef = Arg.hasByRefAttr();
+ Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
+ MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
+ if (!Alignment)
+ Alignment = DL.getABITypeAlign(ArgTy);
- const Align Alignment = DL.getABITypeAlign(ArgTy);
uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
- MaxAlign = std::max(MaxAlign, Alignment);
+ MaxAlign = max(MaxAlign, Alignment);
}
return ExplicitArgBytes;
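
A worked example of the accumulation above, assuming a 64-bit target and an argument list of an i32, a global pointer, and a 16-byte aggregate passed byref with align 16: the i32 lands at offset 0 and occupies 4 bytes, the pointer is padded up to offset 8 and occupies 8 bytes, and the byref aggregate is placed at its explicit 16-byte alignment at offset 16 for 16 bytes, giving ExplicitArgBytes = 32 and MaxAlign = 16. The byref case is the new part: size and alignment are taken from the pointee type and the parameter's own alignment rather than from the pointer type.
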
@@ -536,9 +597,14 @@ unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
return alignTo(TotalSize, 4);
}
+AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
+ return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
+ : AMDGPUDwarfFlavour::Wave64;
+}
+
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
const TargetMachine &TM) :
- R600GenSubtargetInfo(TT, GPU, FS),
+ R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
AMDGPUSubtarget(TT),
InstrInfo(*this),
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
@@ -571,13 +637,15 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
}
bool GCNSubtarget::hasMadF16() const {
- return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
+ return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}
bool GCNSubtarget::useVGPRIndexMode() const {
return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}
+bool GCNSubtarget::useAA() const { return UseAA; }
+
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
if (getGeneration() >= AMDGPUSubtarget::GFX10)
return getMaxWavesPerEU();
@@ -787,7 +855,7 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {
for (unsigned I = 0; I < Succs.size(); ++I) {
for (const SDep &SI : Succs[I]->Succs) {
const SUnit *SU = SI.getSUnit();
- if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
+ if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
Succs.push_back(SU);
}
}
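
For reference, the rewritten checks are equivalent to the old form:

    // before: llvm::find(Succs, SU) != Succs.end()
    // after:  llvm::is_contained(Succs, SU)   // same result, reads as a predicate
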
@@ -795,7 +863,7 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {
SmallPtrSet<const SUnit*, 32> Visited;
while (!Preds.empty()) {
const SUnit *SU = Preds.pop_back_val();
- if (llvm::find(Succs, SU) != Succs.end())
+ if (llvm::is_contained(Succs, SU))
return false;
Visited.insert(SU);
for (const SDep &SI : SU->Preds)
@@ -859,8 +927,8 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {
for (SUnit &SU : DAG->SUnits) {
MachineInstr &MAI = *SU.getInstr();
if (!TII->isMAI(MAI) ||
- MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
- MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
+ MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
+ MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
continue;
unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index c833bfbcf936..ba3a8acae551 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -1,4 +1,4 @@
-//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====//
+//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -7,58 +7,38 @@
//==-----------------------------------------------------------------------===//
//
/// \file
-/// AMDGPU specific subclass of TargetSubtarget.
+/// Base class for AMDGPU specific classes of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
-#include "AMDGPU.h"
-#include "AMDGPUCallLowering.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "R600FrameLowering.h"
-#include "R600ISelLowering.h"
-#include "R600InstrInfo.h"
-#include "SIFrameLowering.h"
-#include "SIISelLowering.h"
-#include "SIInstrInfo.h"
-#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
-#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
-#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
-#include "llvm/MC/MCInstrItineraries.h"
-#include "llvm/Support/MathExtras.h"
-#include <cassert>
-#include <cstdint>
-#include <memory>
-#include <utility>
-
-#define GET_SUBTARGETINFO_HEADER
-#include "AMDGPUGenSubtargetInfo.inc"
-#define GET_SUBTARGETINFO_HEADER
-#include "R600GenSubtargetInfo.inc"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Support/Alignment.h"
namespace llvm {
-class StringRef;
+enum AMDGPUDwarfFlavour : unsigned;
+class Function;
+class Instruction;
+class MachineFunction;
+class TargetMachine;
class AMDGPUSubtarget {
public:
enum Generation {
- R600 = 0,
- R700 = 1,
- EVERGREEN = 2,
- NORTHERN_ISLANDS = 3,
- SOUTHERN_ISLANDS = 4,
- SEA_ISLANDS = 5,
- VOLCANIC_ISLANDS = 6,
- GFX9 = 7,
- GFX10 = 8
+ INVALID = 0,
+ R600 = 1,
+ R700 = 2,
+ EVERGREEN = 3,
+ NORTHERN_ISLANDS = 4,
+ SOUTHERN_ISLANDS = 5,
+ SEA_ISLANDS = 6,
+ VOLCANIC_ISLANDS = 7,
+ GFX9 = 8,
+ GFX10 = 9
};
private:
@@ -78,7 +58,7 @@ protected:
bool EnablePromoteAlloca;
bool HasTrigReducedRange;
unsigned MaxWavesPerEU;
- int LocalMemorySize;
+ unsigned LocalMemorySize;
char WavefrontSizeLog2;
public:
@@ -134,9 +114,7 @@ public:
return TargetTriple.getOS() == Triple::Mesa3D;
}
- bool isMesaKernel(const Function &F) const {
- return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
- }
+ bool isMesaKernel(const Function &F) const;
bool isAmdHsaOrMesa(const Function &F) const {
return isAmdHsaOS() || isMesaKernel(F);
@@ -202,7 +180,7 @@ public:
return WavefrontSizeLog2;
}
- int getLocalMemorySize() const {
+ unsigned getLocalMemorySize() const {
return LocalMemorySize;
}
@@ -239,1150 +217,26 @@ public:
/// subtarget without any kind of limitation.
unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
- /// Creates value range metadata on an workitemid.* inrinsic call or load.
+ /// Return the maximum workitem ID value in the function, for the given (0, 1,
+ /// 2) dimension.
+ unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
+
+  /// Creates value range metadata on a workitemid.* intrinsic call or load.
bool makeLIDRangeMetadata(Instruction *I) const;
/// \returns Number of bytes of arguments that are passed to a shader or
/// kernel in addition to the explicit ones declared for the function.
- unsigned getImplicitArgNumBytes(const Function &F) const {
- if (isMesaKernel(F))
- return 16;
- return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
- }
+ unsigned getImplicitArgNumBytes(const Function &F) const;
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
  /// \returns Corresponding DWARF register number mapping flavour for the
/// \p WavefrontSize.
- AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const {
- return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
- : AMDGPUDwarfFlavour::Wave64;
- }
+ AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const;
virtual ~AMDGPUSubtarget() {}
};
-class GCNSubtarget : public AMDGPUGenSubtargetInfo,
- public AMDGPUSubtarget {
-
- using AMDGPUSubtarget::getMaxWavesPerEU;
-
-public:
- enum TrapHandlerAbi {
- TrapHandlerAbiNone = 0,
- TrapHandlerAbiHsa = 1
- };
-
- enum TrapID {
- TrapIDHardwareReserved = 0,
- TrapIDHSADebugTrap = 1,
- TrapIDLLVMTrap = 2,
- TrapIDLLVMDebugTrap = 3,
- TrapIDDebugBreakpoint = 7,
- TrapIDDebugReserved8 = 8,
- TrapIDDebugReservedFE = 0xfe,
- TrapIDDebugReservedFF = 0xff
- };
-
- enum TrapRegValues {
- LLVMTrapHandlerRegValue = 1
- };
-
-private:
- /// GlobalISel related APIs.
- std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
- std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
- std::unique_ptr<InstructionSelector> InstSelector;
- std::unique_ptr<LegalizerInfo> Legalizer;
- std::unique_ptr<RegisterBankInfo> RegBankInfo;
-
-protected:
- // Basic subtarget description.
- Triple TargetTriple;
- unsigned Gen;
- InstrItineraryData InstrItins;
- int LDSBankCount;
- unsigned MaxPrivateElementSize;
-
- // Possibly statically set by tablegen, but may want to be overridden.
- bool FastFMAF32;
- bool FastDenormalF32;
- bool HalfRate64Ops;
-
-  // Dynamically set bits that enable features.
- bool FlatForGlobal;
- bool AutoWaitcntBeforeBarrier;
- bool CodeObjectV3;
- bool UnalignedScratchAccess;
- bool UnalignedBufferAccess;
- bool HasApertureRegs;
- bool EnableXNACK;
- bool DoesNotSupportXNACK;
- bool EnableCuMode;
- bool TrapHandler;
-
- // Used as options.
- bool EnableLoadStoreOpt;
- bool EnableUnsafeDSOffsetFolding;
- bool EnableSIScheduler;
- bool EnableDS128;
- bool EnablePRTStrictNull;
- bool DumpCode;
-
-  // Subtarget properties statically set by tablegen
- bool FP64;
- bool FMA;
- bool MIMG_R128;
- bool IsGCN;
- bool GCN3Encoding;
- bool CIInsts;
- bool GFX8Insts;
- bool GFX9Insts;
- bool GFX10Insts;
- bool GFX10_3Insts;
- bool GFX7GFX8GFX9Insts;
- bool SGPRInitBug;
- bool HasSMemRealTime;
- bool HasIntClamp;
- bool HasFmaMixInsts;
- bool HasMovrel;
- bool HasVGPRIndexMode;
- bool HasScalarStores;
- bool HasScalarAtomics;
- bool HasSDWAOmod;
- bool HasSDWAScalar;
- bool HasSDWASdst;
- bool HasSDWAMac;
- bool HasSDWAOutModsVOPC;
- bool HasDPP;
- bool HasDPP8;
- bool HasR128A16;
- bool HasGFX10A16;
- bool HasG16;
- bool HasNSAEncoding;
- bool GFX10_BEncoding;
- bool HasDLInsts;
- bool HasDot1Insts;
- bool HasDot2Insts;
- bool HasDot3Insts;
- bool HasDot4Insts;
- bool HasDot5Insts;
- bool HasDot6Insts;
- bool HasMAIInsts;
- bool HasPkFmacF16Inst;
- bool HasAtomicFaddInsts;
- bool EnableSRAMECC;
- bool DoesNotSupportSRAMECC;
- bool HasNoSdstCMPX;
- bool HasVscnt;
- bool HasGetWaveIdInst;
- bool HasSMemTimeInst;
- bool HasRegisterBanking;
- bool HasVOP3Literal;
- bool HasNoDataDepHazard;
- bool FlatAddressSpace;
- bool FlatInstOffsets;
- bool FlatGlobalInsts;
- bool FlatScratchInsts;
- bool ScalarFlatScratchInsts;
- bool AddNoCarryInsts;
- bool HasUnpackedD16VMem;
- bool R600ALUInst;
- bool CaymanISA;
- bool CFALUBug;
- bool LDSMisalignedBug;
- bool HasMFMAInlineLiteralBug;
- bool HasVertexCache;
- short TexVTXClauseSize;
- bool ScalarizeGlobal;
-
- bool HasVcmpxPermlaneHazard;
- bool HasVMEMtoScalarWriteHazard;
- bool HasSMEMtoVectorWriteHazard;
- bool HasInstFwdPrefetchBug;
- bool HasVcmpxExecWARHazard;
- bool HasLdsBranchVmemWARHazard;
- bool HasNSAtoVMEMBug;
- bool HasOffset3fBug;
- bool HasFlatSegmentOffsetBug;
-
- // Dummy feature to use for assembler in tablegen.
- bool FeatureDisable;
-
- SelectionDAGTargetInfo TSInfo;
-private:
- SIInstrInfo InstrInfo;
- SITargetLowering TLInfo;
- SIFrameLowering FrameLowering;
-
- // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword.
- static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1);
-
-public:
- GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
- const GCNTargetMachine &TM);
- ~GCNSubtarget() override;
-
- GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
- StringRef GPU, StringRef FS);
-
- const SIInstrInfo *getInstrInfo() const override {
- return &InstrInfo;
- }
-
- const SIFrameLowering *getFrameLowering() const override {
- return &FrameLowering;
- }
-
- const SITargetLowering *getTargetLowering() const override {
- return &TLInfo;
- }
-
- const SIRegisterInfo *getRegisterInfo() const override {
- return &InstrInfo.getRegisterInfo();
- }
-
- const CallLowering *getCallLowering() const override {
- return CallLoweringInfo.get();
- }
-
- const InlineAsmLowering *getInlineAsmLowering() const override {
- return InlineAsmLoweringInfo.get();
- }
-
- InstructionSelector *getInstructionSelector() const override {
- return InstSelector.get();
- }
-
- const LegalizerInfo *getLegalizerInfo() const override {
- return Legalizer.get();
- }
-
- const RegisterBankInfo *getRegBankInfo() const override {
- return RegBankInfo.get();
- }
-
- // Nothing implemented, just prevent crashes on use.
- const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
- return &TSInfo;
- }
-
- const InstrItineraryData *getInstrItineraryData() const override {
- return &InstrItins;
- }
-
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
-
- Generation getGeneration() const {
- return (Generation)Gen;
- }
-
-  /// Return the number of high bits known to be zero for a frame index.
- unsigned getKnownHighZeroBitsForFrameIndex() const {
- return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2();
- }
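
Worked numbers for this bound, assuming the constants stay as defined above: MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1) = 1024 * 8191 = 8,387,584 bytes, whose highest set bit is bit 22, so countLeadingZeros on the 32-bit value yields 9; adding getWavefrontSizeLog2() (6 for wave64, 5 for wave32) means roughly the top 15 (or 14) bits of any frame index are known to be zero.
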
-
- int getLDSBankCount() const {
- return LDSBankCount;
- }
-
- unsigned getMaxPrivateElementSize() const {
- return MaxPrivateElementSize;
- }
-
- unsigned getConstantBusLimit(unsigned Opcode) const;
-
- bool hasIntClamp() const {
- return HasIntClamp;
- }
-
- bool hasFP64() const {
- return FP64;
- }
-
- bool hasMIMG_R128() const {
- return MIMG_R128;
- }
-
- bool hasHWFP64() const {
- return FP64;
- }
-
- bool hasFastFMAF32() const {
- return FastFMAF32;
- }
-
- bool hasHalfRate64Ops() const {
- return HalfRate64Ops;
- }
-
- bool hasAddr64() const {
- return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
- }
-
- // Return true if the target only has the reverse operand versions of VALU
- // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
- bool hasOnlyRevVALUShifts() const {
- return getGeneration() >= VOLCANIC_ISLANDS;
- }
-
- bool hasFractBug() const {
- return getGeneration() == SOUTHERN_ISLANDS;
- }
-
- bool hasBFE() const {
- return true;
- }
-
- bool hasBFI() const {
- return true;
- }
-
- bool hasBFM() const {
- return hasBFE();
- }
-
- bool hasBCNT(unsigned Size) const {
- return true;
- }
-
- bool hasFFBL() const {
- return true;
- }
-
- bool hasFFBH() const {
- return true;
- }
-
- bool hasMed3_16() const {
- return getGeneration() >= AMDGPUSubtarget::GFX9;
- }
-
- bool hasMin3Max3_16() const {
- return getGeneration() >= AMDGPUSubtarget::GFX9;
- }
-
- bool hasFmaMixInsts() const {
- return HasFmaMixInsts;
- }
-
- bool hasCARRY() const {
- return true;
- }
-
- bool hasFMA() const {
- return FMA;
- }
-
- bool hasSwap() const {
- return GFX9Insts;
- }
-
- bool hasScalarPackInsts() const {
- return GFX9Insts;
- }
-
- bool hasScalarMulHiInsts() const {
- return GFX9Insts;
- }
-
- TrapHandlerAbi getTrapHandlerAbi() const {
- return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
- }
-
- /// True if the offset field of DS instructions works as expected. On SI, the
- /// offset uses a 16-bit adder and does not always wrap properly.
- bool hasUsableDSOffset() const {
- return getGeneration() >= SEA_ISLANDS;
- }
-
- bool unsafeDSOffsetFoldingEnabled() const {
- return EnableUnsafeDSOffsetFolding;
- }
-
- /// Condition output from div_scale is usable.
- bool hasUsableDivScaleConditionOutput() const {
- return getGeneration() != SOUTHERN_ISLANDS;
- }
-
- /// Extra wait hazard is needed in some cases before
- /// s_cbranch_vccnz/s_cbranch_vccz.
- bool hasReadVCCZBug() const {
- return getGeneration() <= SEA_ISLANDS;
- }
-
- /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
- bool partialVCCWritesUpdateVCCZ() const {
- return getGeneration() >= GFX10;
- }
-
-  /// A read of an SGPR by an SMRD instruction requires 4 wait states when the SGPR
- /// was written by a VALU instruction.
- bool hasSMRDReadVALUDefHazard() const {
- return getGeneration() == SOUTHERN_ISLANDS;
- }
-
- /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
- /// SGPR was written by a VALU Instruction.
- bool hasVMEMReadSGPRVALUDefHazard() const {
- return getGeneration() >= VOLCANIC_ISLANDS;
- }
-
- bool hasRFEHazards() const {
- return getGeneration() >= VOLCANIC_ISLANDS;
- }
-
- /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
- unsigned getSetRegWaitStates() const {
- return getGeneration() <= SEA_ISLANDS ? 1 : 2;
- }
-
- bool dumpCode() const {
- return DumpCode;
- }
-
- /// Return the amount of LDS that can be used that will not restrict the
- /// occupancy lower than WaveCount.
- unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
- const Function &) const;
-
- bool supportsMinMaxDenormModes() const {
- return getGeneration() >= AMDGPUSubtarget::GFX9;
- }
-
- /// \returns If target supports S_DENORM_MODE.
- bool hasDenormModeInst() const {
- return getGeneration() >= AMDGPUSubtarget::GFX10;
- }
-
- bool useFlatForGlobal() const {
- return FlatForGlobal;
- }
-
- /// \returns If target supports ds_read/write_b128 and user enables generation
- /// of ds_read/write_b128.
- bool useDS128() const {
- return CIInsts && EnableDS128;
- }
-
- /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
- bool haveRoundOpsF64() const {
- return CIInsts;
- }
-
- /// \returns If MUBUF instructions always perform range checking, even for
- /// buffer resources used for private memory access.
- bool privateMemoryResourceIsRangeChecked() const {
- return getGeneration() < AMDGPUSubtarget::GFX9;
- }
-
- /// \returns If target requires PRT Struct NULL support (zero result registers
- /// for sparse texture support).
- bool usePRTStrictNull() const {
- return EnablePRTStrictNull;
- }
-
- bool hasAutoWaitcntBeforeBarrier() const {
- return AutoWaitcntBeforeBarrier;
- }
-
- bool hasCodeObjectV3() const {
- // FIXME: Need to add code object v3 support for mesa and pal.
- return isAmdHsaOS() ? CodeObjectV3 : false;
- }
-
- bool hasUnalignedBufferAccess() const {
- return UnalignedBufferAccess;
- }
-
- bool hasUnalignedScratchAccess() const {
- return UnalignedScratchAccess;
- }
-
- bool hasApertureRegs() const {
- return HasApertureRegs;
- }
-
- bool isTrapHandlerEnabled() const {
- return TrapHandler;
- }
-
- bool isXNACKEnabled() const {
- return EnableXNACK;
- }
-
- bool isCuModeEnabled() const {
- return EnableCuMode;
- }
-
- bool hasFlatAddressSpace() const {
- return FlatAddressSpace;
- }
-
- bool hasFlatScrRegister() const {
- return hasFlatAddressSpace();
- }
-
- bool hasFlatInstOffsets() const {
- return FlatInstOffsets;
- }
-
- bool hasFlatGlobalInsts() const {
- return FlatGlobalInsts;
- }
-
- bool hasFlatScratchInsts() const {
- return FlatScratchInsts;
- }
-
- bool hasScalarFlatScratchInsts() const {
- return ScalarFlatScratchInsts;
- }
-
- bool hasGlobalAddTidInsts() const {
- return GFX10_BEncoding;
- }
-
- bool hasAtomicCSub() const {
- return GFX10_BEncoding;
- }
-
- bool hasMultiDwordFlatScratchAddressing() const {
- return getGeneration() >= GFX9;
- }
-
- bool hasFlatSegmentOffsetBug() const {
- return HasFlatSegmentOffsetBug;
- }
-
- bool hasFlatLgkmVMemCountInOrder() const {
- return getGeneration() > GFX9;
- }
-
- bool hasD16LoadStore() const {
- return getGeneration() >= GFX9;
- }
-
- bool d16PreservesUnusedBits() const {
- return hasD16LoadStore() && !isSRAMECCEnabled();
- }
-
- bool hasD16Images() const {
- return getGeneration() >= VOLCANIC_ISLANDS;
- }
-
-  /// Return true if most LDS instructions have an m0 use that requires m0 to be
-  /// initialized.
- bool ldsRequiresM0Init() const {
- return getGeneration() < GFX9;
- }
-
- // True if the hardware rewinds and replays GWS operations if a wave is
- // preempted.
- //
- // If this is false, a GWS operation requires testing if a nack set the
- // MEM_VIOL bit, and repeating if so.
- bool hasGWSAutoReplay() const {
- return getGeneration() >= GFX9;
- }
-
- /// \returns if target has ds_gws_sema_release_all instruction.
- bool hasGWSSemaReleaseAll() const {
- return CIInsts;
- }
-
- bool hasAddNoCarry() const {
- return AddNoCarryInsts;
- }
-
- bool hasUnpackedD16VMem() const {
- return HasUnpackedD16VMem;
- }
-
- // Covers VS/PS/CS graphics shaders
- bool isMesaGfxShader(const Function &F) const {
- return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
- }
-
- bool hasMad64_32() const {
- return getGeneration() >= SEA_ISLANDS;
- }
-
- bool hasSDWAOmod() const {
- return HasSDWAOmod;
- }
-
- bool hasSDWAScalar() const {
- return HasSDWAScalar;
- }
-
- bool hasSDWASdst() const {
- return HasSDWASdst;
- }
-
- bool hasSDWAMac() const {
- return HasSDWAMac;
- }
-
- bool hasSDWAOutModsVOPC() const {
- return HasSDWAOutModsVOPC;
- }
-
- bool hasDLInsts() const {
- return HasDLInsts;
- }
-
- bool hasDot1Insts() const {
- return HasDot1Insts;
- }
-
- bool hasDot2Insts() const {
- return HasDot2Insts;
- }
-
- bool hasDot3Insts() const {
- return HasDot3Insts;
- }
-
- bool hasDot4Insts() const {
- return HasDot4Insts;
- }
-
- bool hasDot5Insts() const {
- return HasDot5Insts;
- }
-
- bool hasDot6Insts() const {
- return HasDot6Insts;
- }
-
- bool hasMAIInsts() const {
- return HasMAIInsts;
- }
-
- bool hasPkFmacF16Inst() const {
- return HasPkFmacF16Inst;
- }
-
- bool hasAtomicFaddInsts() const {
- return HasAtomicFaddInsts;
- }
-
- bool isSRAMECCEnabled() const {
- return EnableSRAMECC;
- }
-
- bool hasNoSdstCMPX() const {
- return HasNoSdstCMPX;
- }
-
- bool hasVscnt() const {
- return HasVscnt;
- }
-
- bool hasGetWaveIdInst() const {
- return HasGetWaveIdInst;
- }
-
- bool hasSMemTimeInst() const {
- return HasSMemTimeInst;
- }
-
- bool hasRegisterBanking() const {
- return HasRegisterBanking;
- }
-
- bool hasVOP3Literal() const {
- return HasVOP3Literal;
- }
-
- bool hasNoDataDepHazard() const {
- return HasNoDataDepHazard;
- }
-
- bool vmemWriteNeedsExpWaitcnt() const {
- return getGeneration() < SEA_ISLANDS;
- }
-
- // Scratch is allocated in 256 dword per wave blocks for the entire
-  // wavefront. When viewed from the perspective of an arbitrary workitem, this
- // is 4-byte aligned.
- //
- // Only 4-byte alignment is really needed to access anything. Transformations
- // on the pointer value itself may rely on the alignment / known low bits of
- // the pointer. Set this to something above the minimum to avoid needing
- // dynamic realignment in common cases.
- Align getStackAlignment() const { return Align(16); }
-
- bool enableMachineScheduler() const override {
- return true;
- }
-
- bool enableSubRegLiveness() const override {
- return true;
- }
-
- void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
- bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
-
- // static wrappers
- static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
-
- // XXX - Why is this here if it isn't in the default pass set?
- bool enableEarlyIfConversion() const override {
- return true;
- }
-
- void overrideSchedPolicy(MachineSchedPolicy &Policy,
- unsigned NumRegionInstrs) const override;
-
- unsigned getMaxNumUserSGPRs() const {
- return 16;
- }
-
- bool hasSMemRealTime() const {
- return HasSMemRealTime;
- }
-
- bool hasMovrel() const {
- return HasMovrel;
- }
-
- bool hasVGPRIndexMode() const {
- return HasVGPRIndexMode;
- }
-
- bool useVGPRIndexMode() const;
-
- bool hasScalarCompareEq64() const {
- return getGeneration() >= VOLCANIC_ISLANDS;
- }
-
- bool hasScalarStores() const {
- return HasScalarStores;
- }
-
- bool hasScalarAtomics() const {
- return HasScalarAtomics;
- }
-
- bool hasLDSFPAtomics() const {
- return GFX8Insts;
- }
-
- bool hasDPP() const {
- return HasDPP;
- }
-
- bool hasDPPBroadcasts() const {
- return HasDPP && getGeneration() < GFX10;
- }
-
- bool hasDPPWavefrontShifts() const {
- return HasDPP && getGeneration() < GFX10;
- }
-
- bool hasDPP8() const {
- return HasDPP8;
- }
-
- bool hasR128A16() const {
- return HasR128A16;
- }
-
- bool hasGFX10A16() const {
- return HasGFX10A16;
- }
-
- bool hasA16() const { return hasR128A16() || hasGFX10A16(); }
-
- bool hasG16() const { return HasG16; }
-
- bool hasOffset3fBug() const {
- return HasOffset3fBug;
- }
-
- bool hasNSAEncoding() const {
- return HasNSAEncoding;
- }
-
- bool hasGFX10_BEncoding() const {
- return GFX10_BEncoding;
- }
-
- bool hasGFX10_3Insts() const {
- return GFX10_3Insts;
- }
-
- bool hasMadF16() const;
-
- bool enableSIScheduler() const {
- return EnableSIScheduler;
- }
-
- bool loadStoreOptEnabled() const {
- return EnableLoadStoreOpt;
- }
-
- bool hasSGPRInitBug() const {
- return SGPRInitBug;
- }
-
- bool hasMFMAInlineLiteralBug() const {
- return HasMFMAInlineLiteralBug;
- }
-
- bool has12DWordStoreHazard() const {
- return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
- }
-
- // \returns true if the subtarget supports DWORDX3 load/store instructions.
- bool hasDwordx3LoadStores() const {
- return CIInsts;
- }
-
- bool hasSMovFedHazard() const {
- return getGeneration() == AMDGPUSubtarget::GFX9;
- }
-
- bool hasReadM0MovRelInterpHazard() const {
- return getGeneration() == AMDGPUSubtarget::GFX9;
- }
-
- bool hasReadM0SendMsgHazard() const {
- return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
- getGeneration() <= AMDGPUSubtarget::GFX9;
- }
-
- bool hasVcmpxPermlaneHazard() const {
- return HasVcmpxPermlaneHazard;
- }
-
- bool hasVMEMtoScalarWriteHazard() const {
- return HasVMEMtoScalarWriteHazard;
- }
-
- bool hasSMEMtoVectorWriteHazard() const {
- return HasSMEMtoVectorWriteHazard;
- }
-
- bool hasLDSMisalignedBug() const {
- return LDSMisalignedBug && !EnableCuMode;
- }
-
- bool hasInstFwdPrefetchBug() const {
- return HasInstFwdPrefetchBug;
- }
-
- bool hasVcmpxExecWARHazard() const {
- return HasVcmpxExecWARHazard;
- }
-
- bool hasLdsBranchVmemWARHazard() const {
- return HasLdsBranchVmemWARHazard;
- }
-
- bool hasNSAtoVMEMBug() const {
- return HasNSAtoVMEMBug;
- }
-
- bool hasHardClauses() const { return getGeneration() >= GFX10; }
-
- /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
- /// SGPRs
- unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
-
- /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
- /// VGPRs
- unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
-
- /// Return occupancy for the given function. Used LDS and a number of
- /// registers if provided.
- /// Note, occupancy can be affected by the scratch allocation as well, but
- /// we do not have enough information to compute it.
- unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
- unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
-
- /// \returns true if the flat_scratch register should be initialized with the
- /// pointer to the wave's scratch memory rather than a size and offset.
- bool flatScratchIsPointer() const {
- return getGeneration() >= AMDGPUSubtarget::GFX9;
- }
-
- /// \returns true if the machine has merged shaders in which s0-s7 are
- /// reserved by the hardware and user SGPRs start at s8
- bool hasMergedShaders() const {
- return getGeneration() >= GFX9;
- }
-
- /// \returns SGPR allocation granularity supported by the subtarget.
- unsigned getSGPRAllocGranule() const {
- return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
- }
-
- /// \returns SGPR encoding granularity supported by the subtarget.
- unsigned getSGPREncodingGranule() const {
- return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
- }
-
- /// \returns Total number of SGPRs supported by the subtarget.
- unsigned getTotalNumSGPRs() const {
- return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
- }
-
- /// \returns Addressable number of SGPRs supported by the subtarget.
- unsigned getAddressableNumSGPRs() const {
- return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
- }
-
- /// \returns Minimum number of SGPRs that meets the given number of waves per
- /// execution unit requirement supported by the subtarget.
- unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
- return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
- }
-
- /// \returns Maximum number of SGPRs that meets the given number of waves per
- /// execution unit requirement supported by the subtarget.
- unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
- return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
- }
-
- /// \returns Reserved number of SGPRs for given function \p MF.
- unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
-
- /// \returns Maximum number of SGPRs that meets number of waves per execution
- /// unit requirement for function \p MF, or number of SGPRs explicitly
- /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
- ///
- /// \returns Value that meets number of waves per execution unit requirement
- /// if explicitly requested value cannot be converted to integer, violates
- /// subtarget's specifications, or does not meet number of waves per execution
- /// unit requirement.
- unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
-
- /// \returns VGPR allocation granularity supported by the subtarget.
- unsigned getVGPRAllocGranule() const {
- return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
- }
-
- /// \returns VGPR encoding granularity supported by the subtarget.
- unsigned getVGPREncodingGranule() const {
- return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
- }
-
- /// \returns Total number of VGPRs supported by the subtarget.
- unsigned getTotalNumVGPRs() const {
- return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
- }
-
- /// \returns Addressable number of VGPRs supported by the subtarget.
- unsigned getAddressableNumVGPRs() const {
- return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
- }
-
- /// \returns Minimum number of VGPRs that meets given number of waves per
- /// execution unit requirement supported by the subtarget.
- unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
- return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
- }
-
- /// \returns Maximum number of VGPRs that meets given number of waves per
- /// execution unit requirement supported by the subtarget.
- unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
- return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
- }
-
- /// \returns Maximum number of VGPRs that meets number of waves per execution
- /// unit requirement for function \p MF, or number of VGPRs explicitly
- /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
- ///
- /// \returns Value that meets number of waves per execution unit requirement
- /// if explicitly requested value cannot be converted to integer, violates
- /// subtarget's specifications, or does not meet number of waves per execution
- /// unit requirement.
- unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
-
- void getPostRAMutations(
- std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
- const override;
-
- bool isWave32() const {
- return getWavefrontSize() == 32;
- }
-
- const TargetRegisterClass *getBoolRC() const {
- return getRegisterInfo()->getBoolRC();
- }
-
- /// \returns Maximum number of work groups per compute unit supported by the
- /// subtarget and limited by given \p FlatWorkGroupSize.
- unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
- return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
- }
-
- /// \returns Minimum flat work group size supported by the subtarget.
- unsigned getMinFlatWorkGroupSize() const override {
- return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
- }
-
- /// \returns Maximum flat work group size supported by the subtarget.
- unsigned getMaxFlatWorkGroupSize() const override {
- return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
- }
-
- /// \returns Number of waves per execution unit required to support the given
- /// \p FlatWorkGroupSize.
- unsigned
- getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
- return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
- }
-
- /// \returns Minimum number of waves per execution unit supported by the
- /// subtarget.
- unsigned getMinWavesPerEU() const override {
- return AMDGPU::IsaInfo::getMinWavesPerEU(this);
- }
-
- void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
- SDep &Dep) const override;
-};
-
-class R600Subtarget final : public R600GenSubtargetInfo,
- public AMDGPUSubtarget {
-private:
- R600InstrInfo InstrInfo;
- R600FrameLowering FrameLowering;
- bool FMA;
- bool CaymanISA;
- bool CFALUBug;
- bool HasVertexCache;
- bool R600ALUInst;
- bool FP64;
- short TexVTXClauseSize;
- Generation Gen;
- R600TargetLowering TLInfo;
- InstrItineraryData InstrItins;
- SelectionDAGTargetInfo TSInfo;
-
-public:
- R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
- const TargetMachine &TM);
-
- const R600InstrInfo *getInstrInfo() const override { return &InstrInfo; }
-
- const R600FrameLowering *getFrameLowering() const override {
- return &FrameLowering;
- }
-
- const R600TargetLowering *getTargetLowering() const override {
- return &TLInfo;
- }
-
- const R600RegisterInfo *getRegisterInfo() const override {
- return &InstrInfo.getRegisterInfo();
- }
-
- const InstrItineraryData *getInstrItineraryData() const override {
- return &InstrItins;
- }
-
- // Nothing implemented, just prevent crashes on use.
- const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
- return &TSInfo;
- }
-
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
-
- Generation getGeneration() const {
- return Gen;
- }
-
- Align getStackAlignment() const { return Align(4); }
-
- R600Subtarget &initializeSubtargetDependencies(const Triple &TT,
- StringRef GPU, StringRef FS);
-
- bool hasBFE() const {
- return (getGeneration() >= EVERGREEN);
- }
-
- bool hasBFI() const {
- return (getGeneration() >= EVERGREEN);
- }
-
- bool hasBCNT(unsigned Size) const {
- if (Size == 32)
- return (getGeneration() >= EVERGREEN);
-
- return false;
- }
-
- bool hasBORROW() const {
- return (getGeneration() >= EVERGREEN);
- }
-
- bool hasCARRY() const {
- return (getGeneration() >= EVERGREEN);
- }
-
- bool hasCaymanISA() const {
- return CaymanISA;
- }
-
- bool hasFFBL() const {
- return (getGeneration() >= EVERGREEN);
- }
-
- bool hasFFBH() const {
- return (getGeneration() >= EVERGREEN);
- }
-
- bool hasFMA() const { return FMA; }
-
- bool hasCFAluBug() const { return CFALUBug; }
-
- bool hasVertexCache() const { return HasVertexCache; }
-
- short getTexVTXClauseSize() const { return TexVTXClauseSize; }
-
- bool enableMachineScheduler() const override {
- return true;
- }
-
- bool enableSubRegLiveness() const override {
- return true;
- }
-
- /// \returns Maximum number of work groups per compute unit supported by the
- /// subtarget and limited by given \p FlatWorkGroupSize.
- unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
- return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
- }
-
- /// \returns Minimum flat work group size supported by the subtarget.
- unsigned getMinFlatWorkGroupSize() const override {
- return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
- }
-
- /// \returns Maximum flat work group size supported by the subtarget.
- unsigned getMaxFlatWorkGroupSize() const override {
- return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
- }
-
- /// \returns Number of waves per execution unit required to support the given
- /// \p FlatWorkGroupSize.
- unsigned
- getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
- return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
- }
-
- /// \returns Minimum number of waves per execution unit supported by the
- /// subtarget.
- unsigned getMinWavesPerEU() const override {
- return AMDGPU::IsaInfo::getMinWavesPerEU(this);
- }
-};
-
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b4b10835837c..ce7c82e2a88a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -15,45 +15,40 @@
#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
-#include "AMDGPUCallLowering.h"
#include "AMDGPUExportClustering.h"
-#include "AMDGPUInstructionSelector.h"
-#include "AMDGPULegalizerInfo.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "R600MachineScheduler.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
+#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
-#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
+#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/IPO/GlobalDCE.h"
+#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include "llvm/Transforms/Vectorize.h"
-#include <memory>
using namespace llvm;
@@ -216,7 +211,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSILowerSGPRSpillsPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
initializeSIFixVGPRCopiesPass(*PR);
- initializeSIFixupVectorISelPass(*PR);
initializeSIFoldOperandsPass(*PR);
initializeSIPeepholeSDWAPass(*PR);
initializeSIShrinkInstructionsPass(*PR);
@@ -237,6 +231,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUPromoteAllocaPass(*PR);
initializeAMDGPUPromoteAllocaToVectorPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
+ initializeAMDGPULateCodeGenPreparePass(*PR);
initializeAMDGPUPropagateAttributesEarlyPass(*PR);
initializeAMDGPUPropagateAttributesLatePass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
@@ -260,7 +255,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUExternalAAWrapperPass(*PR);
initializeAMDGPUUseNativeCallsPass(*PR);
initializeAMDGPUSimplifyLibCallsPass(*PR);
- initializeAMDGPUInlinerPass(*PR);
initializeAMDGPUPrintfRuntimeBindingPass(*PR);
initializeGCNRegBankReassignPass(*PR);
initializeGCNNSAReassignPass(*PR);
@@ -284,7 +278,6 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
- DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
return DAG;
@@ -295,7 +288,6 @@ createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
auto DAG = new GCNIterativeScheduler(C,
GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
- DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
return DAG;
}
@@ -309,7 +301,6 @@ createIterativeILPMachineScheduler(MachineSchedContext *C) {
auto DAG = new GCNIterativeScheduler(C,
GCNIterativeScheduler::SCHEDULE_ILP);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
- DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
return DAG;
}
@@ -345,15 +336,15 @@ GCNILPSchedRegistry("gcn-ilp",
static StringRef computeDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::r600) {
// 32-bit pointers.
- return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
+ return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
}
// 32-bit private, local, and region pointers. 64-bit global, constant and
// flat, non-integral buffer fat pointers.
- return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
+ return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
"-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
"-ni:7";
}
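
The only change to both layout strings is the new -G1 component, which marks address space 1 (global) as the default address space for module globals. A minimal check of what that component means, assuming DataLayout::getDefaultGlobalsAddressSpace() is the accessor for the G specification (as in LLVM of this vintage):

    #include "llvm/IR/DataLayout.h"

    void checkGlobalsAddrSpace() {
      llvm::DataLayout DL("e-p:64:64-G1");               // abbreviated layout string
      unsigned AS = DL.getDefaultGlobalsAddressSpace();  // == 1
      (void)AS;
    }
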
@@ -402,16 +393,14 @@ AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
Attribute GPUAttr = F.getFnAttribute("target-cpu");
- return GPUAttr.hasAttribute(Attribute::None) ?
- getTargetCPU() : GPUAttr.getValueAsString();
+ return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
}
StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
Attribute FSAttr = F.getFnAttribute("target-features");
- return FSAttr.hasAttribute(Attribute::None) ?
- getTargetFeatureString() :
- FSAttr.getValueAsString();
+ return FSAttr.isValid() ? FSAttr.getValueAsString()
+ : getTargetFeatureString();
}
/// Predicate for Internalize pass.
@@ -433,7 +422,7 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
if (EnableFunctionCalls) {
delete Builder.Inliner;
- Builder.Inliner = createAMDGPUFunctionInliningPass();
+ Builder.Inliner = createFunctionInliningPass();
}
Builder.addExtension(
@@ -487,6 +476,133 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
});
}
+void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
+ AAM.registerFunctionAnalysis<AMDGPUAA>();
+}
+
+void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
+ bool DebugPassManager) {
+ PB.registerPipelineParsingCallback(
+ [this](StringRef PassName, ModulePassManager &PM,
+ ArrayRef<PassBuilder::PipelineElement>) {
+ if (PassName == "amdgpu-propagate-attributes-late") {
+ PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
+ return true;
+ }
+ if (PassName == "amdgpu-unify-metadata") {
+ PM.addPass(AMDGPUUnifyMetadataPass());
+ return true;
+ }
+ if (PassName == "amdgpu-printf-runtime-binding") {
+ PM.addPass(AMDGPUPrintfRuntimeBindingPass());
+ return true;
+ }
+ if (PassName == "amdgpu-always-inline") {
+ PM.addPass(AMDGPUAlwaysInlinePass());
+ return true;
+ }
+ return false;
+ });
+ PB.registerPipelineParsingCallback(
+ [this](StringRef PassName, FunctionPassManager &PM,
+ ArrayRef<PassBuilder::PipelineElement>) {
+ if (PassName == "amdgpu-simplifylib") {
+ PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
+ return true;
+ }
+ if (PassName == "amdgpu-usenative") {
+ PM.addPass(AMDGPUUseNativeCallsPass());
+ return true;
+ }
+ if (PassName == "amdgpu-promote-alloca") {
+ PM.addPass(AMDGPUPromoteAllocaPass(*this));
+ return true;
+ }
+ if (PassName == "amdgpu-promote-alloca-to-vector") {
+ PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
+ return true;
+ }
+ if (PassName == "amdgpu-lower-kernel-attributes") {
+ PM.addPass(AMDGPULowerKernelAttributesPass());
+ return true;
+ }
+ if (PassName == "amdgpu-propagate-attributes-early") {
+ PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
+ return true;
+ }
+
+ return false;
+ });
+
+ PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
+ FAM.registerPass([&] { return AMDGPUAA(); });
+ });
+
+ PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
+ if (AAName == "amdgpu-aa") {
+ AAM.registerFunctionAnalysis<AMDGPUAA>();
+ return true;
+ }
+ return false;
+ });
+
+ PB.registerPipelineStartEPCallback([this, DebugPassManager](
+ ModulePassManager &PM,
+ PassBuilder::OptimizationLevel Level) {
+ FunctionPassManager FPM(DebugPassManager);
+ FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
+ FPM.addPass(AMDGPUUseNativeCallsPass());
+ if (EnableLibCallSimplify && Level != PassBuilder::OptimizationLevel::O0)
+ FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
+ PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+ });
+
+ PB.registerPipelineEarlySimplificationEPCallback(
+ [this](ModulePassManager &PM, PassBuilder::OptimizationLevel Level) {
+ if (Level == PassBuilder::OptimizationLevel::O0)
+ return;
+
+ PM.addPass(AMDGPUUnifyMetadataPass());
+ PM.addPass(AMDGPUPrintfRuntimeBindingPass());
+
+ if (InternalizeSymbols) {
+ PM.addPass(InternalizePass(mustPreserveGV));
+ }
+ PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
+ if (InternalizeSymbols) {
+ PM.addPass(GlobalDCEPass());
+ }
+ if (EarlyInlineAll && !EnableFunctionCalls)
+ PM.addPass(AMDGPUAlwaysInlinePass());
+ });
+
+ PB.registerCGSCCOptimizerLateEPCallback(
+ [this, DebugPassManager](CGSCCPassManager &PM,
+ PassBuilder::OptimizationLevel Level) {
+ if (Level == PassBuilder::OptimizationLevel::O0)
+ return;
+
+ FunctionPassManager FPM(DebugPassManager);
+
+ // Add infer address spaces pass to the opt pipeline after inlining
+ // but before SROA to increase SROA opportunities.
+ FPM.addPass(InferAddressSpacesPass());
+
+ // This should run after inlining to have any chance of doing
+ // anything, and before other cleanup optimizations.
+ FPM.addPass(AMDGPULowerKernelAttributesPass());
+
+ if (Level != PassBuilder::OptimizationLevel::O0) {
+ // Promote alloca to vector before SROA and loop unroll. If we
+ // manage to eliminate allocas before unroll we may choose to unroll
+ // less.
+ FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
+ }
+
+ PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
+ });
+}
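
With these callbacks in place, the AMDGPU IR passes become reachable from new-pass-manager textual pipelines. Assuming an opt binary built with the AMDGPU target (opt creates the target machine from the triple and lets it register these callbacks), an invocation along the lines of opt -mtriple=amdgcn-- -passes=amdgpu-promote-alloca,amdgpu-lower-kernel-attributes should resolve the pass names through the pipeline-parsing callbacks above, and -aa-pipeline=amdgpu-aa through the registerParseAACallback hook.
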
+
//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//
@@ -526,6 +642,39 @@ const R600Subtarget *R600TargetMachine::getSubtargetImpl(
return I.get();
}
+int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
+ return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+ AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
+ AddrSpace == AMDGPUAS::REGION_ADDRESS)
+ ? -1
+ : 0;
+}
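
The distinction matters because in the private, local, and region address spaces offset 0 is a perfectly valid object address, so the all-ones pattern serves as the null sentinel instead: getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS) is -1 while getNullPointerValue(AMDGPUAS::GLOBAL_ADDRESS) is 0, which is what an addrspacecast of a flat null pointer into scratch has to produce.
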
+
+bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
+ unsigned DestAS) const {
+ return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
+ AMDGPU::isFlatGlobalAddrSpace(DestAS);
+}
+
+unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
+ const auto *LD = dyn_cast<LoadInst>(V);
+ if (!LD)
+ return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
+
+ // It must be a generic pointer loaded.
+ assert(V->getType()->isPointerTy() &&
+ V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
+
+ const auto *Ptr = LD->getPointerOperand();
+ if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+ return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
+  // A generic pointer loaded from constant memory can be assumed to be a
+  // global pointer, since constant memory is only populated on the host side.
+  // As implied by the offload programming model, only global pointers can be
+  // referenced on the host side.
+ return AMDGPUAS::GLOBAL_ADDRESS;
+}
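
Concretely, this targets the pattern emitted by offloading front ends where kernel arguments live in constant memory and may hold generic (flat) pointers: with typed pointers, a load like %p = load i8*, i8* addrspace(4)* %arg (addrspace(4) being AMDGPU's constant address space, used here for illustration) is now assumed to yield a global pointer, which lets InferAddressSpaces rewrite the downstream accesses to addrspace(1).
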
+
TargetTransformInfo
R600TargetMachine::getTargetTransformInfo(const Function &F) {
return TargetTransformInfo(R600TTIImpl(this, F));
@@ -593,7 +742,6 @@ public:
createMachineScheduler(MachineSchedContext *C) const override {
ScheduleDAGMILive *DAG = createGenericSchedLive(C);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
- DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
return DAG;
}
@@ -866,6 +1014,7 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
bool GCNPassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();
+ addPass(createAMDGPULateCodeGenPreparePass());
if (EnableAtomicOptimizations) {
addPass(createAMDGPUAtomicOptimizerPass());
}
@@ -930,19 +1079,12 @@ bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector();
addPass(&SIFixSGPRCopiesID);
addPass(createSILowerI1CopiesPass());
- // TODO: We have to add FinalizeISel
- // to expand V_ADD/SUB_U64_PSEUDO before SIFixupVectorISel
- // that expects V_ADD/SUB -> A_ADDC/SUBB pairs expanded.
- // Will be removed as soon as SIFixupVectorISel is changed
- // to work with V_ADD/SUB_U64_PSEUDO instead.
- addPass(&FinalizeISelID);
- addPass(createSIFixupVectorISelPass());
addPass(createSIAddIMGInitPass());
return false;
}
bool GCNPassConfig::addIRTranslator() {
- addPass(new IRTranslator());
+ addPass(new IRTranslator(getOptLevel()));
return false;
}
@@ -969,6 +1111,10 @@ bool GCNPassConfig::addRegBankSelect() {
bool GCNPassConfig::addGlobalInstructionSelect() {
addPass(new InstructionSelect());
+ // TODO: Fix instruction selection to do the right thing for image
+ // instructions with tfe or lwe in the first place, instead of running a
+ // separate pass to fix them up?
+ addPass(createSIAddIMGInitPass());
return false;
}
@@ -976,7 +1122,6 @@ void GCNPassConfig::addPreRegAlloc() {
if (LateCFGStructurize) {
addPass(createAMDGPUMachineCFGStructurizerPass());
}
- addPass(createSIWholeQuadModePass());
}
void GCNPassConfig::addFastRegAlloc() {
@@ -988,13 +1133,18 @@ void GCNPassConfig::addFastRegAlloc() {
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
- // This must be run just after RegisterCoalescing.
- insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);
+ insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
+ insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);
TargetPassConfig::addFastRegAlloc();
}
void GCNPassConfig::addOptimizedRegAlloc() {
+ // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
+ // instructions that cause scheduling barriers.
+ insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
+ insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);
+
if (OptExecMaskPreRA)
insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
@@ -1004,9 +1154,6 @@ void GCNPassConfig::addOptimizedRegAlloc() {
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
- // This must be run just after RegisterCoalescing.
- insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);
-
if (EnableDCEInRA)
insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
@@ -1041,6 +1188,12 @@ void GCNPassConfig::addPreEmitPass() {
addPass(createSIShrinkInstructionsPass());
addPass(createSIModeRegisterPass());
+ if (getOptLevel() > CodeGenOpt::None)
+ addPass(&SIInsertHardClausesID);
+
+ addPass(&SIRemoveShortExecBranchesID);
+ addPass(&SIInsertSkipsPassID);
+ addPass(&SIPreEmitPeepholeID);
// The hazard recognizer that runs as part of the post-ra scheduler does not
// guarantee to be able handle all hazards correctly. This is because if there
// are multiple scheduling regions in a basic block, the regions are scheduled
@@ -1049,16 +1202,7 @@ void GCNPassConfig::addPreEmitPass() {
//
// Here we add a stand-alone hazard recognizer pass which can handle all
// cases.
- //
- // FIXME: This stand-alone pass will emit indiv. S_NOP 0, as needed. It would
- // be better for it to emit S_NOP <N> when possible.
addPass(&PostRAHazardRecognizerID);
- if (getOptLevel() > CodeGenOpt::None)
- addPass(&SIInsertHardClausesID);
-
- addPass(&SIRemoveShortExecBranchesID);
- addPass(&SIInsertSkipsPassID);
- addPass(&SIPreEmitPeepholeID);
addPass(&BranchRelaxationPassID);
}
@@ -1087,6 +1231,12 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
MFI->initializeBaseYamlFields(YamlMFI);
+ if (MFI->Occupancy == 0) {
+ // Fixup the subtarget dependent default value.
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
+ }
+
auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
Register TempReg;
if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index e223fecc8819..95aefa23c24c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -14,14 +14,9 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
-#include "AMDGPUSubtarget.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Support/CodeGen.h"
+#include "GCNSubtarget.h"
+#include "R600Subtarget.h"
#include "llvm/Target/TargetMachine.h"
-#include <memory>
namespace llvm {
@@ -56,12 +51,16 @@ public:
void adjustPassManager(PassManagerBuilder &) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB,
+ bool DebugPassManager) override;
+ void registerDefaultAliasAnalyses(AAManager &) override;
+
/// Get the integer value of a null pointer in the given address space.
- static int64_t getNullPointerValue(unsigned AddrSpace) {
- return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
- AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
- AddrSpace == AMDGPUAS::REGION_ADDRESS) ? -1 : 0;
- }
+ static int64_t getNullPointerValue(unsigned AddrSpace);
+
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
+
+ unsigned getAssumedAddrSpace(const Value *V) const override;
};
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
index 6569980d2c75..f854c8c16e5a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
@@ -7,13 +7,10 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUTargetObjectFile.h"
-#include "AMDGPU.h"
-#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCSectionELF.h"
-
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 542a5f006c0f..7b8a79640bb2 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -15,40 +15,12 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUTargetTransformInfo.h"
-#include "AMDGPUSubtarget.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/STLExtras.h"
+#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/MC/SubtargetFeature.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MachineValueType.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-#include <algorithm>
-#include <cassert>
-#include <limits>
-#include <utility>
+#include "llvm/Support/KnownBits.h"
using namespace llvm;
@@ -82,7 +54,25 @@ static cl::opt<bool> UseLegacyDA(
static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
"amdgpu-unroll-max-block-to-analyze",
cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
- cl::init(20), cl::Hidden);
+ cl::init(32), cl::Hidden);
+
+static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
+ cl::Hidden, cl::init(4000),
+ cl::desc("Cost of alloca argument"));
+
+// If the amount of scratch memory to eliminate exceeds our ability to allocate
+// it into registers, we gain nothing by aggressively inlining functions based on
+// this heuristic.
+static cl::opt<unsigned>
+ ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
+ cl::init(256),
+ cl::desc("Maximum alloca size to use for inline cost"));
+
+// Inliner constraint to achieve reasonable compilation time.
+static cl::opt<size_t> InlineMaxBB(
+ "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
+ cl::desc("Maximum number of BBs allowed in a function after inlining"
+ " (compile time constraint)"));
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
unsigned Depth = 0) {
@@ -103,6 +93,12 @@ static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
return false;
}
+AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()),
+ TargetTriple(TM->getTargetTriple()),
+ ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
+ TLI(ST->getTargetLowering()) {}
+
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
const Function &F = *L->getHeader()->getParent();
@@ -116,6 +112,26 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
const unsigned MaxAlloca = (256 - 16) * 4;
unsigned ThresholdPrivate = UnrollThresholdPrivate;
unsigned ThresholdLocal = UnrollThresholdLocal;
+
+ // If this loop has the amdgpu.loop.unroll.threshold metadata, use the
+ // provided value as the default for Threshold.
+ if (MDNode *LoopUnrollThreshold =
+ findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
+ if (LoopUnrollThreshold->getNumOperands() == 2) {
+ ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
+ LoopUnrollThreshold->getOperand(1));
+ if (MetaThresholdValue) {
+ // We will also use the supplied value for PartialThreshold for now.
+ // We may introduce additional metadata if it becomes necessary in the
+ // future.
+ UP.Threshold = MetaThresholdValue->getSExtValue();
+ UP.PartialThreshold = UP.Threshold;
+ ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
+ ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
+ }
+ }
+ }
+
unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
for (const BasicBlock *BB : L->getBlocks()) {
const DataLayout &DL = BB->getModule()->getDataLayout();
@@ -169,7 +185,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
const Value *Ptr = GEP->getPointerOperand();
const AllocaInst *Alloca =
- dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
+ dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
if (!Alloca || !Alloca->isStaticAlloca())
continue;
Type *Ty = Alloca->getAllocatedType();
@@ -231,7 +247,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
// If we got a GEP in a small BB from an inner loop, then increase the max trip
// count to analyze for a better cost estimate when unrolling.
- if (L->empty() && BB->size() < UnrollMaxBlockToAnalyze)
+ if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
UP.MaxIterationsCountToAnalyze = 32;
}
}
@@ -240,6 +256,41 @@ void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP) {
BaseT::getPeelingPreferences(L, SE, PP);
}
+
+const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
+ // Codegen control options which don't matter.
+ AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
+ AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
+ AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
+ AMDGPU::FeatureUnalignedAccessMode,
+
+ AMDGPU::FeatureAutoWaitcntBeforeBarrier,
+
+ // Property of the kernel/environment which can't actually differ.
+ AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
+ AMDGPU::FeatureTrapHandler,
+
+ // The default assumption needs to be that ECC is enabled, but no directly
+ // exposed operations depend on it, so it can be safely inlined.
+ AMDGPU::FeatureSRAMECC,
+
+ // Perf-tuning features
+ AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
+
+GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()),
+ ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
+ TLI(ST->getTargetLowering()), CommonTTI(TM, F),
+ IsGraphics(AMDGPU::isGraphics(F.getCallingConv())),
+ MaxVGPRs(ST->getMaxNumVGPRs(
+ std::max(ST->getWavesPerEU(F).first,
+ ST->getWavesPerEUForWorkGroup(
+ ST->getFlatWorkGroupSizes(F).second)))) {
+ AMDGPU::SIModeRegisterDefaults Mode(F);
+ HasFP32Denormals = Mode.allFP32Denormals();
+ HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
+}
+
unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
// The concept of vector registers doesn't really exist. Some packed vector
// operations operate on the normal 32-bit registers.
@@ -267,6 +318,12 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
return 32;
}
+unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
+ if (Opcode == Instruction::Load || Opcode == Instruction::Store)
+ return 32 * 4 / ElemWidth;
+ return (ElemWidth == 16 && ST->has16BitInsts()) ? 2 : 1;
+}
+
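For reference, the 32 * 4 / ElemWidth clamp above amounts to one 128-bit (four-dword) access per vector; a standalone sanity check of that arithmetic (illustrative only, not part of the diff):

    // Sketch: lane counts implied by getMaximumVF's 128-bit load/store clamp.
    static_assert(32 * 4 / 8 == 16, "i8 loads/stores: up to 16 lanes");
    static_assert(32 * 4 / 16 == 8, "i16 loads/stores: up to 8 lanes");
    static_assert(32 * 4 / 32 == 4, "i32 loads/stores: up to 4 lanes");
    static_assert(32 * 4 / 64 == 2, "i64 loads/stores: up to 2 lanes");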
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
unsigned ChainSizeInBytes,
VectorType *VecTy) const {
@@ -451,9 +508,50 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// FIXME: We're having to query the throughput cost so that the basic
// implementation tries to generate legalize and scalarization costs. Maybe
// we could hoist the scalarization code here?
- return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
- Opd1Info, Opd2Info,
- Opd1PropInfo, Opd2PropInfo);
+ if (CostKind != TTI::TCK_CodeSize)
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
+ Opd1Info, Opd2Info, Opd1PropInfo,
+ Opd2PropInfo, Args, CxtI);
+ // Scalarization
+
+ // Check if any of the operands are vector operands.
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+
+ bool IsFloat = Ty->isFPOrFPVectorTy();
+ // Assume that floating point arithmetic operations cost twice as much as
+ // integer operations.
+ unsigned OpCost = (IsFloat ? 2 : 1);
+
+ if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
+ // The operation is legal. Assume it costs 1.
+ // TODO: Once we have extract/insert subvector cost we need to use them.
+ return LT.first * OpCost;
+ }
+
+ if (!TLI->isOperationExpand(ISD, LT.second)) {
+ // If the operation is custom lowered, then assume that the code is twice
+ // as expensive.
+ return LT.first * 2 * OpCost;
+ }
+
+ // Else, assume that we need to scalarize this op.
+ // TODO: If one of the types get legalized by splitting, handle this
+ // similarly to what getCastInstrCost() does.
+ if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+ unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
+ unsigned Cost = getArithmeticInstrCost(
+ Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo, Args, CxtI);
+ // Return the cost of multiple scalar invocation plus the cost of
+ // inserting and extracting the values.
+ return getScalarizationOverhead(VTy, Args) + Num * Cost;
+ }
+
+ // We don't know anything about this scalar instruction.
+ return OpCost;
}
// Legalize the type.
@@ -472,7 +570,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
case ISD::SRL:
case ISD::SRA:
if (SLT == MVT::i64)
- return get64BitInstrCost() * LT.first * NElts;
+ return get64BitInstrCost(CostKind) * LT.first * NElts;
if (ST->has16BitInsts() && SLT == MVT::i16)
NElts = (NElts + 1) / 2;
@@ -494,7 +592,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
return LT.first * NElts * getFullRateInstrCost();
case ISD::MUL: {
- const int QuarterRateCost = getQuarterRateInstrCost();
+ const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
if (SLT == MVT::i64) {
const int FullRateCost = getFullRateInstrCost();
return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
@@ -506,11 +604,32 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// i32
return QuarterRateCost * NElts * LT.first;
}
+ case ISD::FMUL:
+ // Check for a possible fusion {fadd|fsub}(a,fmul(b,c)) and return zero cost for
+ // the fmul(b,c), assuming the fadd|fsub will carry the estimated cost of the
+ // whole fused operation.
+ if (CxtI && CxtI->hasOneUse())
+ if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
+ const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
+ if (OPC == ISD::FADD || OPC == ISD::FSUB) {
+ if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
+ return TargetTransformInfo::TCC_Free;
+ if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
+ return TargetTransformInfo::TCC_Free;
+
+ // Assume that any type may be fused when contract/unsafe-math flags allow it.
+ const TargetOptions &Options = TLI->getTargetMachine().Options;
+ if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ Options.UnsafeFPMath ||
+ (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
+ return TargetTransformInfo::TCC_Free;
+ }
+ }
+ LLVM_FALLTHROUGH;
case ISD::FADD:
case ISD::FSUB:
- case ISD::FMUL:
if (SLT == MVT::f64)
- return LT.first * NElts * get64BitInstrCost();
+ return LT.first * NElts * get64BitInstrCost(CostKind);
if (ST->has16BitInsts() && SLT == MVT::f16)
NElts = (NElts + 1) / 2;
@@ -523,7 +642,9 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// FIXME: frem should be handled separately. The fdiv in it is most of it,
// but the current lowering is also not entirely correct.
if (SLT == MVT::f64) {
- int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
+ int Cost = 7 * get64BitInstrCost(CostKind) +
+ getQuarterRateInstrCost(CostKind) +
+ 3 * getHalfRateInstrCost(CostKind);
// Add cost of workaround.
if (!ST->hasUsableDivScaleConditionOutput())
Cost += 3 * getFullRateInstrCost();
@@ -535,7 +656,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// TODO: This is more complicated, unsafe flags etc.
if ((SLT == MVT::f32 && !HasFP32Denormals) ||
(SLT == MVT::f16 && ST->has16BitInsts())) {
- return LT.first * getQuarterRateInstrCost() * NElts;
+ return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
}
}
@@ -545,12 +666,15 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// f32 fmul
// v_cvt_f16_f32
// f16 div_fixup
- int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
+ int Cost =
+ 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
return LT.first * Cost * NElts;
}
if (SLT == MVT::f32 || SLT == MVT::f16) {
- int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
+ // 4 more v_cvt_* insts without f16 insts support
+ int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
+ 1 * getQuarterRateInstrCost(CostKind);
if (!HasFP32Denormals) {
// FP mode switches.
@@ -568,18 +692,21 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
break;
}
- return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
- Opd2Info,
- Opd1PropInfo, Opd2PropInfo);
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo, Args, CxtI);
}
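To make the reworked f64 fdiv/frem cost concrete, here is a small standalone sketch using the new rate helpers at the reciprocal-throughput cost kind; the subtarget assumptions (no HalfRate64Ops, usable DIV_SCALE condition output) are mine, not taken from the patch:

    // Sketch: TCC_Basic is 1, so half rate = 2 and quarter rate = 4.
    constexpr int FullRate = 1;
    constexpr int HalfRate = 2 * FullRate;
    constexpr int QuarterRate = 4 * FullRate;
    constexpr int R64 = QuarterRate; // get64BitInstrCost without HalfRate64Ops
    constexpr int F64DivCost = 7 * R64 + QuarterRate + 3 * HalfRate; // 28 + 4 + 6
    static_assert(F64DivCost == 38, "per legalized element, before any workaround cost");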
-// Return true if there's a potential benefit from using v2f16 instructions for
-// an intrinsic, even if it requires nontrivial legalization.
+// Return true if there's a potential benefit from using v2f16/v2i16
+// instructions for an intrinsic, even if it requires nontrivial legalization.
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
switch (ID) {
case Intrinsic::fma: // TODO: fmuladd
// There's a small benefit to using vector ops in the legalized code.
case Intrinsic::round:
+ case Intrinsic::uadd_sat:
+ case Intrinsic::usub_sat:
+ case Intrinsic::sadd_sat:
+ case Intrinsic::ssub_sat:
return true;
default:
return false;
@@ -597,7 +724,48 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
Type *RetTy = ICA.getReturnType();
EVT OrigTy = TLI->getValueType(DL, RetTy);
if (!OrigTy.isSimple()) {
- return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+ if (CostKind != TTI::TCK_CodeSize)
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+
+ // TODO: Combine these two logic paths.
+ if (ICA.isTypeBasedOnly())
+ return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
+
+ Type *RetTy = ICA.getReturnType();
+ unsigned VF = ICA.getVectorFactor().getFixedValue();
+ unsigned RetVF =
+ (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
+ : 1);
+ assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
+ const IntrinsicInst *I = ICA.getInst();
+ const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
+ FastMathFlags FMF = ICA.getFlags();
+ // Assume that we need to scalarize this intrinsic.
+ SmallVector<Type *, 4> Types;
+ for (const Value *Op : Args) {
+ Type *OpTy = Op->getType();
+ assert(VF == 1 || !OpTy->isVectorTy());
+ Types.push_back(VF == 1 ? OpTy : FixedVectorType::get(OpTy, VF));
+ }
+
+ if (VF > 1 && !RetTy->isVoidTy())
+ RetTy = FixedVectorType::get(RetTy, VF);
+
+ // Compute the scalarization overhead based on Args for a vector
+ // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
+ // CostModel will pass a vector RetTy and a VF of 1.
+ unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
+ if (RetVF > 1 || VF > 1) {
+ ScalarizationCost = 0;
+ if (!RetTy->isVoidTy())
+ ScalarizationCost +=
+ getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
+ ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
+ }
+
+ IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, Types, FMF,
+ ScalarizationCost, I);
+ return getIntrinsicInstrCost(Attrs, CostKind);
}
// Legalize the type.
@@ -609,16 +777,16 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
if (SLT == MVT::f64)
- return LT.first * NElts * get64BitInstrCost();
+ return LT.first * NElts * get64BitInstrCost(CostKind);
if (ST->has16BitInsts() && SLT == MVT::f16)
NElts = (NElts + 1) / 2;
// TODO: Get more refined intrinsic costs?
- unsigned InstRate = getQuarterRateInstrCost();
+ unsigned InstRate = getQuarterRateInstrCost(CostKind);
if (ICA.getID() == Intrinsic::fma) {
- InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost()
- : getQuarterRateInstrCost();
+ InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
+ : getQuarterRateInstrCost(CostKind);
}
return LT.first * NElts * InstRate;
@@ -669,7 +837,7 @@ int GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
CostKind);
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
- return LT.first * getHalfRateInstrCost();
+ return LT.first * getHalfRateInstrCost(CostKind);
}
int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
@@ -697,32 +865,6 @@ int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
}
}
-static bool isArgPassedInSGPR(const Argument *A) {
- const Function *F = A->getParent();
-
- // Arguments to compute shaders are never a source of divergence.
- CallingConv::ID CC = F->getCallingConv();
- switch (CC) {
- case CallingConv::AMDGPU_KERNEL:
- case CallingConv::SPIR_KERNEL:
- return true;
- case CallingConv::AMDGPU_VS:
- case CallingConv::AMDGPU_LS:
- case CallingConv::AMDGPU_HS:
- case CallingConv::AMDGPU_ES:
- case CallingConv::AMDGPU_GS:
- case CallingConv::AMDGPU_PS:
- case CallingConv::AMDGPU_CS:
- // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
- // Everything else is in VGPRs.
- return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
- F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
- default:
- // TODO: Should calls support inreg for SGPR inputs?
- return false;
- }
-}
-
/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
/// this is analyzing the collective result of all output registers. Otherwise,
/// this is only querying a specific result index if this returns multiple
@@ -779,7 +921,7 @@ bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
/// different across workitems in a wavefront.
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
if (const Argument *A = dyn_cast<Argument>(V))
- return !isArgPassedInSGPR(A);
+ return !AMDGPU::isArgPassedInSGPR(A);
// Loads from the private and flat address spaces are divergent, because
// threads can execute the load instruction with the same inputs and get
@@ -921,7 +1063,10 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
Type *MaskTy = MaskOp->getType();
bool DoTruncate = false;
- if (!getTLI()->isNoopAddrSpaceCast(OldAS, NewAS)) {
+
+ const GCNTargetMachine &TM =
+ static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
+ if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
// All valid 64-bit to 32-bit casts work by chopping off the high
// bits. Any masking only clearing the low bits will also apply in the new
// address space.
@@ -993,7 +1138,47 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
// no way to support merge for backend defined attributes.
AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
- return CallerMode.isInlineCompatible(CalleeMode);
+ if (!CallerMode.isInlineCompatible(CalleeMode))
+ return false;
+
+ // Hack to make compile times reasonable.
+ if (InlineMaxBB && !Callee->hasFnAttribute(Attribute::InlineHint)) {
+ // Single BB does not increase total BB amount, thus subtract 1.
+ size_t BBSize = Caller->size() + Callee->size() - 1;
+ return BBSize <= InlineMaxBB;
+ }
+
+ return true;
+}
+
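A rough standalone sketch of the compile-time guard above; the caller/callee sizes are invented, and 1100 is simply the default of -amdgpu-inline-max-bb introduced in this patch:

    #include <cstddef>

    // Sketch: without an inlinehint attribute, inlining is refused once the
    // combined basic-block count would exceed the InlineMaxBB cap.
    bool allowInline(std::size_t CallerBBs, std::size_t CalleeBBs,
                     std::size_t InlineMaxBB = 1100) {
      // A single-BB callee does not grow the total BB count, hence the -1.
      std::size_t BBSize = CallerBBs + CalleeBBs - 1;
      return BBSize <= InlineMaxBB;
    }
    // allowInline(900, 250) == false (1149 > 1100); allowInline(900, 150) == true.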
+unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
+ // If a pointer to a private array is passed into a function, it will not be
+ // optimized out, leaving scratch usage behind.
+ // Increase the inline threshold to allow inlining in this case.
+ uint64_t AllocaSize = 0;
+ SmallPtrSet<const AllocaInst *, 8> AIVisited;
+ for (Value *PtrArg : CB->args()) {
+ PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
+ if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
+ Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
+ continue;
+
+ PtrArg = getUnderlyingObject(PtrArg);
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
+ if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
+ continue;
+ AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
+ // If the amount of stack memory is excessive we will not be able
+ // to get rid of the scratch anyway, so bail out.
+ if (AllocaSize > ArgAllocaCutoff) {
+ AllocaSize = 0;
+ break;
+ }
+ }
+ }
+ if (AllocaSize)
+ return ArgAllocaCost;
+ return 0;
}
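A compact sketch of the alloca-size accumulation above, kept standalone; the 256-byte cutoff and 4000-point bonus are the defaults of -amdgpu-inline-arg-alloca-cutoff and -amdgpu-inline-arg-alloca-cost added by this patch, and the argument sizes are invented:

    #include <cstdint>
    #include <initializer_list>

    // Sketch: sum the sizes of static allocas passed by pointer; exceeding the
    // cutoff cancels the whole bonus, since the scratch cannot be eliminated.
    unsigned allocaInlineBonus(std::initializer_list<std::uint64_t> AllocaBytes,
                               std::uint64_t Cutoff = 256, unsigned Bonus = 4000) {
      std::uint64_t Total = 0;
      for (std::uint64_t Size : AllocaBytes) {
        Total += Size;
        if (Total > Cutoff)
          return 0;
      }
      return Total ? Bonus : 0;
    }
    // allocaInlineBonus({64, 128}) == 4000; allocaInlineBonus({512}) == 0.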
void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
@@ -1006,6 +1191,16 @@ void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
CommonTTI.getPeelingPreferences(L, SE, PP);
}
+int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
+ return ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
+ : getQuarterRateInstrCost(CostKind);
+}
+
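For reference, a minimal sketch of how the rate helpers now depend on the cost kind (the numbers follow the header changes later in this diff, with TCC_Basic equal to 1):

    // Sketch: half/quarter-rate costs collapse to a uniform size cost of 2
    // when queried for code size instead of reciprocal throughput.
    enum class CostKind { RecipThroughput, CodeSize };

    constexpr int halfRateCost(CostKind CK) {
      return CK == CostKind::CodeSize ? 2 : 2 * 1 /*TCC_Basic*/;
    }
    constexpr int quarterRateCost(CostKind CK) {
      return CK == CostKind::CodeSize ? 2 : 4 * 1 /*TCC_Basic*/;
    }
    constexpr int cost64Bit(bool HasHalfRate64Ops, CostKind CK) {
      return HasHalfRate64Ops ? halfRateCost(CK) : quarterRateCost(CK);
    }
    static_assert(cost64Bit(false, CostKind::RecipThroughput) == 4, "fp64 at quarter rate");
    static_assert(cost64Bit(true, CostKind::CodeSize) == 2, "uniform size cost");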
+R600TTIImpl::R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()),
+ ST(static_cast<const R600Subtarget *>(TM->getSubtargetImpl(F))),
+ TLI(ST->getTargetLowering()), CommonTTI(TM, F) {}
+
unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 3364a9bcaccb..b29c94180fb8 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -19,22 +19,18 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
-#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
-#include "llvm/IR/Function.h"
-#include "llvm/MC/SubtargetFeature.h"
-#include "llvm/Support/MathExtras.h"
-#include <cassert>
namespace llvm {
class AMDGPUTargetLowering;
+class GCNSubtarget;
+class InstCombiner;
class Loop;
+class R600Subtarget;
class ScalarEvolution;
+class SITargetLowering;
class Type;
class Value;
@@ -46,18 +42,14 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
Triple TargetTriple;
- const GCNSubtarget *ST;
+ const TargetSubtargetInfo *ST;
const TargetLoweringBase *TLI;
const TargetSubtargetInfo *getST() const { return ST; }
const TargetLoweringBase *getTLI() const { return TLI; }
public:
- explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
- : BaseT(TM, F.getParent()->getDataLayout()),
- TargetTriple(TM->getTargetTriple()),
- ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
- TLI(ST->getTargetLowering()) {}
+ explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
@@ -75,73 +67,41 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
const GCNSubtarget *ST;
const SITargetLowering *TLI;
AMDGPUTTIImpl CommonTTI;
- bool IsGraphicsShader;
+ bool IsGraphics;
bool HasFP32Denormals;
+ bool HasFP64FP16Denormals;
unsigned MaxVGPRs;
- const FeatureBitset InlineFeatureIgnoreList = {
- // Codegen control options which don't matter.
- AMDGPU::FeatureEnableLoadStoreOpt,
- AMDGPU::FeatureEnableSIScheduler,
- AMDGPU::FeatureEnableUnsafeDSOffsetFolding,
- AMDGPU::FeatureFlatForGlobal,
- AMDGPU::FeaturePromoteAlloca,
- AMDGPU::FeatureUnalignedBufferAccess,
- AMDGPU::FeatureUnalignedScratchAccess,
-
- AMDGPU::FeatureAutoWaitcntBeforeBarrier,
-
- // Property of the kernel/environment which can't actually differ.
- AMDGPU::FeatureSGPRInitBug,
- AMDGPU::FeatureXNACK,
- AMDGPU::FeatureTrapHandler,
- AMDGPU::FeatureCodeObjectV3,
-
- // The default assumption needs to be ecc is enabled, but no directly
- // exposed operations depend on it, so it can be safely inlined.
- AMDGPU::FeatureSRAMECC,
-
- // Perf-tuning features
- AMDGPU::FeatureFastFMAF32,
- AMDGPU::HalfRate64Ops
- };
+ static const FeatureBitset InlineFeatureIgnoreList;
const GCNSubtarget *getST() const { return ST; }
- const AMDGPUTargetLowering *getTLI() const { return TLI; }
+ const SITargetLowering *getTLI() const { return TLI; }
static inline int getFullRateInstrCost() {
return TargetTransformInfo::TCC_Basic;
}
- static inline int getHalfRateInstrCost() {
- return 2 * TargetTransformInfo::TCC_Basic;
+ static inline int getHalfRateInstrCost(
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
+ return CostKind == TTI::TCK_CodeSize ? 2
+ : 2 * TargetTransformInfo::TCC_Basic;
}
// TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
// should be 2 or 4.
- static inline int getQuarterRateInstrCost() {
- return 3 * TargetTransformInfo::TCC_Basic;
+ static inline int getQuarterRateInstrCost(
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
+ return CostKind == TTI::TCK_CodeSize ? 2
+ : 4 * TargetTransformInfo::TCC_Basic;
}
- // On some parts, normal fp64 operations are half rate, and others
- // quarter. This also applies to some integer operations.
- inline int get64BitInstrCost() const {
- return ST->hasHalfRate64Ops() ?
- getHalfRateInstrCost() : getQuarterRateInstrCost();
- }
+ // On some parts, normal fp64 operations are half rate, and others
+ // quarter. This also applies to some integer operations.
+ int get64BitInstrCost(
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
public:
- explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
- : BaseT(TM, F.getParent()->getDataLayout()),
- ST(static_cast<const GCNSubtarget*>(TM->getSubtargetImpl(F))),
- TLI(ST->getTargetLowering()),
- CommonTTI(TM, F),
- IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())),
- HasFP32Denormals(AMDGPU::SIModeRegisterDefaults(F).allFP32Denormals()),
- MaxVGPRs(ST->getMaxNumVGPRs(
- std::max(ST->getWavesPerEU(F).first,
- ST->getWavesPerEUForWorkGroup(
- ST->getFlatWorkGroupSizes(F).second)))) {}
+ explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
bool hasBranchDivergence() { return true; }
bool useGPUDivergenceAnalysis() const;
@@ -162,6 +122,7 @@ public:
unsigned getNumberOfRegisters(unsigned RCID) const;
unsigned getRegisterBitWidth(bool Vector) const;
unsigned getMinVectorRegisterBitWidth() const;
+ unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
unsigned ChainSizeInBytes,
VectorType *VecTy) const;
@@ -213,7 +174,7 @@ public:
unsigned getFlatAddressSpace() const {
// Don't bother running InferAddressSpaces pass on graphics shaders which
// don't use flat addressing.
- if (IsGraphicsShader)
+ if (IsGraphics)
return -1;
return AMDGPUAS::FLAT_ADDRESS;
}
@@ -223,6 +184,16 @@ public:
Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const;
+ bool canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
+ InstCombiner &IC) const;
+ Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
+ IntrinsicInst &II) const;
+ Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
+ InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
+ APInt &UndefElts2, APInt &UndefElts3,
+ std::function<void(Instruction *, unsigned, APInt, APInt &)>
+ SimplifyAndSetOp) const;
+
unsigned getVectorSplitCost() { return 0; }
unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
@@ -232,6 +203,7 @@ public:
const Function *Callee) const;
unsigned getInliningThresholdMultiplier() { return 11; }
+ unsigned adjustInliningThreshold(const CallBase *CB) const;
int getInlinerVectorBonusPercent() { return 0; }
@@ -259,11 +231,7 @@ class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
AMDGPUTTIImpl CommonTTI;
public:
- explicit R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
- : BaseT(TM, F.getParent()->getDataLayout()),
- ST(static_cast<const R600Subtarget*>(TM->getSubtargetImpl(F))),
- TLI(ST->getTargetLowering()),
- CommonTTI(TM, F) {}
+ explicit R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
const R600Subtarget *getST() const { return ST; }
const AMDGPUTargetLowering *getTLI() const { return TLI; }
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 418296684d76..84d72e1b579f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -20,21 +20,25 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "SIDefines.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
@@ -70,17 +74,25 @@ char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;
INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
"Unify divergent function exit nodes", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
"Unify divergent function exit nodes", false, false)
void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
- // TODO: Preserve dominator tree.
+ if (RequireAndPreserveDomTree)
+ AU.addRequired<DominatorTreeWrapperPass>();
+
AU.addRequired<PostDominatorTreeWrapperPass>();
AU.addRequired<LegacyDivergenceAnalysis>();
+ if (RequireAndPreserveDomTree) {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ // FIXME: preserve PostDominatorTreeWrapperPass
+ }
+
// No divergent values are changed, only blocks and branch edges.
AU.addPreserved<LegacyDivergenceAnalysis>();
@@ -133,7 +145,7 @@ static void removeDoneExport(Function &F) {
}
}
-static BasicBlock *unifyReturnBlockSet(Function &F,
+static BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
ArrayRef<BasicBlock *> ReturningBlocks,
bool InsertExport,
const TargetTransformInfo &TTI,
@@ -153,7 +165,7 @@ static BasicBlock *unifyReturnBlockSet(Function &F,
Value *Undef = UndefValue::get(B.getFloatTy());
B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
{
- B.getInt32(9), // target, SQ_EXP_NULL
+ B.getInt32(AMDGPU::Exp::ET_NULL),
B.getInt32(0), // enabled channels
Undef, Undef, Undef, Undef, // values
B.getTrue(), // done
@@ -174,6 +186,8 @@ static BasicBlock *unifyReturnBlockSet(Function &F,
// Loop over all of the blocks, replacing the return instruction with an
// unconditional branch.
+ std::vector<DominatorTree::UpdateType> Updates;
+ Updates.reserve(ReturningBlocks.size());
for (BasicBlock *BB : ReturningBlocks) {
// Add an incoming element to the PHI node for every return instruction that
// is merging into this new block...
@@ -183,17 +197,27 @@ static BasicBlock *unifyReturnBlockSet(Function &F,
// Remove and delete the return inst.
BB->getTerminator()->eraseFromParent();
BranchInst::Create(NewRetBlock, BB);
+ Updates.push_back({DominatorTree::Insert, BB, NewRetBlock});
}
+ if (RequireAndPreserveDomTree)
+ DTU.applyUpdates(Updates);
+ Updates.clear();
+
for (BasicBlock *BB : ReturningBlocks) {
// Cleanup possible branch to unconditional branch to the return.
- simplifyCFG(BB, TTI, {2});
+ simplifyCFG(BB, TTI, RequireAndPreserveDomTree ? &DTU : nullptr,
+ SimplifyCFGOptions().bonusInstThreshold(2));
}
return NewRetBlock;
}
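The dominator-tree bookkeeping added throughout this file follows one pattern; the sketch below condenses it using only calls that appear in the diff itself (the function name and parameters are mine):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/Analysis/DomTreeUpdater.h"
    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/Instructions.h"
    #include <vector>

    using namespace llvm;

    // Sketch: record CFG edge insertions while rewriting terminators, then
    // flush them through an eager DomTreeUpdater so the DominatorTree stays
    // valid for later passes.
    static void redirectToUnifiedBlock(ArrayRef<BasicBlock *> Blocks,
                                       BasicBlock *UnifiedBB, DominatorTree *DT) {
      std::vector<DominatorTree::UpdateType> Updates;
      Updates.reserve(Blocks.size());
      for (BasicBlock *BB : Blocks) {
        BB->getTerminator()->eraseFromParent();
        BranchInst::Create(UnifiedBB, BB);
        Updates.push_back({DominatorTree::Insert, BB, UnifiedBB});
      }
      DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
      DTU.applyUpdates(Updates);
    }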
bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
+ DominatorTree *DT = nullptr;
+ if (RequireAndPreserveDomTree)
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
// If there's only one exit, we don't need to do anything, unless this is a
@@ -216,6 +240,8 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
bool InsertExport = false;
bool Changed = false;
+ std::vector<DominatorTree::UpdateType> Updates;
+
for (BasicBlock *BB : PDT.roots()) {
if (isa<ReturnInst>(BB->getTerminator())) {
if (!isUniformlyReached(DA, *BB))
@@ -272,14 +298,28 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
BI->eraseFromParent(); // Delete the unconditional branch.
// Add a new conditional branch with a dummy edge to the return block.
BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
+ Updates.push_back({DominatorTree::Insert, BB, DummyReturnBB});
} else { // Conditional branch.
+ SmallVector<BasicBlock *, 2> Successors(succ_begin(BB), succ_end(BB));
+
// Create a new transition block to hold the conditional branch.
BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
+ Updates.reserve(Updates.size() + 2 * Successors.size() + 2);
+
+ // 'Successors' become successors of TransitionBB instead of BB,
+ // and TransitionBB becomes a single successor of BB.
+ Updates.push_back({DominatorTree::Insert, BB, TransitionBB});
+ for (BasicBlock *Successor : Successors) {
+ Updates.push_back({DominatorTree::Insert, TransitionBB, Successor});
+ Updates.push_back({DominatorTree::Delete, BB, Successor});
+ }
+
// Create a branch that will always branch to the transition block and
// references DummyReturnBB.
BB->getTerminator()->eraseFromParent();
BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
+ Updates.push_back({DominatorTree::Insert, BB, DummyReturnBB});
}
Changed = true;
}
@@ -295,10 +335,12 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
"UnifiedUnreachableBlock", &F);
new UnreachableInst(F.getContext(), UnreachableBlock);
+ Updates.reserve(Updates.size() + UnreachableBlocks.size());
for (BasicBlock *BB : UnreachableBlocks) {
// Remove and delete the unreachable inst.
BB->getTerminator()->eraseFromParent();
BranchInst::Create(UnreachableBlock, BB);
+ Updates.push_back({DominatorTree::Insert, BB, UnreachableBlock});
}
Changed = true;
}
@@ -328,6 +370,12 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
}
}
+ // FIXME: add PDT here once simplifycfg is ready.
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ if (RequireAndPreserveDomTree)
+ DTU.applyUpdates(Updates);
+ Updates.clear();
+
// Now handle return blocks.
if (ReturningBlocks.empty())
return Changed; // No blocks return
@@ -345,11 +393,10 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
// uniformly reached block with the "done" bit cleared.
auto BlocksToUnify = std::move(ReturningBlocks);
if (InsertExport) {
- BlocksToUnify.insert(BlocksToUnify.end(), UniformlyReachedRetBlocks.begin(),
- UniformlyReachedRetBlocks.end());
+ llvm::append_range(BlocksToUnify, UniformlyReachedRetBlocks);
}
- unifyReturnBlockSet(F, BlocksToUnify, InsertExport, TTI,
+ unifyReturnBlockSet(F, DTU, BlocksToUnify, InsertExport, TTI,
"UnifiedReturnBlock");
return true;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
index 281ae6d646e9..240b6c2ff462 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
@@ -12,14 +12,10 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
-#include <algorithm>
-#include <cassert>
using namespace llvm;
@@ -45,6 +41,7 @@ namespace {
private:
bool runOnModule(Module &M) override;
+ };
/// Unify version metadata.
/// \return true if changes are made.
@@ -96,7 +93,7 @@ namespace {
SmallVector<Metadata *, 4> All;
for (auto MD : NamedMD->operands())
for (const auto &Op : MD->operands())
- if (std::find(All.begin(), All.end(), Op.get()) == All.end())
+ if (!llvm::is_contained(All, Op.get()))
All.push_back(Op.get());
NamedMD->eraseFromParent();
@@ -106,41 +103,42 @@ namespace {
return true;
}
-};
-} // end anonymous namespace
+ bool unifyMetadataImpl(Module &M) {
+ const char *Vers[] = {kOCLMD::SpirVer, kOCLMD::OCLVer};
+ const char *Exts[] = {kOCLMD::UsedExt, kOCLMD::UsedOptCoreFeat,
+ kOCLMD::CompilerOptions, kOCLMD::LLVMIdent};
-char AMDGPUUnifyMetadata::ID = 0;
+ bool Changed = false;
-char &llvm::AMDGPUUnifyMetadataID = AMDGPUUnifyMetadata::ID;
+ for (auto &I : Vers)
+ Changed |= unifyVersionMD(M, I, true);
-INITIALIZE_PASS(AMDGPUUnifyMetadata, "amdgpu-unify-metadata",
- "Unify multiple OpenCL metadata due to linking",
- false, false)
+ for (auto &I : Exts)
+ Changed |= unifyExtensionMD(M, I);
-ModulePass* llvm::createAMDGPUUnifyMetadataPass() {
- return new AMDGPUUnifyMetadata();
-}
+ return Changed;
+ }
-bool AMDGPUUnifyMetadata::runOnModule(Module &M) {
- const char* Vers[] = {
- kOCLMD::SpirVer,
- kOCLMD::OCLVer
- };
- const char* Exts[] = {
- kOCLMD::UsedExt,
- kOCLMD::UsedOptCoreFeat,
- kOCLMD::CompilerOptions,
- kOCLMD::LLVMIdent
- };
+ } // end anonymous namespace
- bool Changed = false;
+ char AMDGPUUnifyMetadata::ID = 0;
- for (auto &I : Vers)
- Changed |= unifyVersionMD(M, I, true);
+ char &llvm::AMDGPUUnifyMetadataID = AMDGPUUnifyMetadata::ID;
- for (auto &I : Exts)
- Changed |= unifyExtensionMD(M, I);
+ INITIALIZE_PASS(AMDGPUUnifyMetadata, "amdgpu-unify-metadata",
+ "Unify multiple OpenCL metadata due to linking", false, false)
- return Changed;
-}
+ ModulePass *llvm::createAMDGPUUnifyMetadataPass() {
+ return new AMDGPUUnifyMetadata();
+ }
+
+ bool AMDGPUUnifyMetadata::runOnModule(Module &M) {
+ return unifyMetadataImpl(M);
+ }
+
+ PreservedAnalyses AMDGPUUnifyMetadataPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ return unifyMetadataImpl(M) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+ }
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index 1f28688a7296..b9a8c6bd005d 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -7,42 +7,17 @@
//==-----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "R600InstrInfo.h"
#include "R600RegisterInfo.h"
-#include "llvm/ADT/DepthFirstIterator.h"
+#include "R600Subtarget.h"
#include "llvm/ADT/SCCIterator.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachinePostDominators.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/LLVMContext.h"
#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MachineValueType.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-#include <cstddef>
-#include <deque>
-#include <iterator>
-#include <map>
-#include <utility>
-#include <vector>
using namespace llvm;
@@ -467,7 +442,7 @@ MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB,
const DebugLoc &DL) {
MachineInstr *MI =
MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL);
- if (MBB->begin() != MBB->end())
+ if (!MBB->empty())
MBB->insert(MBB->begin(), MI);
else
MBB->push_back(MI);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h
index 3e658a144c1f..654153ea5151 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h
@@ -11,12 +11,8 @@
#ifndef AMDKERNELCODET_H
#define AMDKERNELCODET_H
-#include "llvm/MC/SubtargetFeature.h"
-
-#include <cstddef>
#include <cstdint>
-#include "llvm/Support/Debug.h"
//---------------------------------------------------------------------------//
// AMD Kernel Code, and its dependencies //
//---------------------------------------------------------------------------//
@@ -527,7 +523,7 @@ typedef struct hsa_ext_control_directives_s {
/// the kernarg segment is constant for the duration of the kernel execution.
///
-typedef struct amd_kernel_code_s {
+struct amd_kernel_code_t {
uint32_t amd_kernel_code_version_major;
uint32_t amd_kernel_code_version_minor;
uint16_t amd_machine_kind;
@@ -650,6 +646,6 @@ typedef struct amd_kernel_code_s {
uint8_t reserved3[12];
uint64_t runtime_loader_kernel_symbol;
uint64_t control_directives[16];
-} amd_kernel_code_t;
+};
#endif // AMDKERNELCODET_H
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 013b7a0cf25d..af4a47935e3f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -6,7 +6,6 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPU.h"
#include "AMDKernelCodeT.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
@@ -17,49 +16,23 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDKernelCodeTUtils.h"
#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallBitVector.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/StringSet.h"
#include "llvm/ADT/Twine.h"
-#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
-#include "llvm/MC/MCParser/MCAsmParserExtension.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Error.h"
#include "llvm/Support/MachineValueType.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/SMLoc.h"
#include "llvm/Support/TargetParser.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <cstring>
-#include <iterator>
-#include <map>
-#include <memory>
-#include <string>
using namespace llvm;
using namespace llvm::AMDGPU;
@@ -188,6 +161,12 @@ public:
ImmTyEndpgm,
};
+ enum ImmKindTy {
+ ImmKindTyNone,
+ ImmKindTyLiteral,
+ ImmKindTyConst,
+ };
+
private:
struct TokOp {
const char *Data;
@@ -198,6 +177,7 @@ private:
int64_t Val;
ImmTy Type;
bool IsFPImm;
+ mutable ImmKindTy Kind;
Modifiers Mods;
};
@@ -233,6 +213,29 @@ public:
return Kind == Immediate;
}
+ void setImmKindNone() const {
+ assert(isImm());
+ Imm.Kind = ImmKindTyNone;
+ }
+
+ void setImmKindLiteral() const {
+ assert(isImm());
+ Imm.Kind = ImmKindTyLiteral;
+ }
+
+ void setImmKindConst() const {
+ assert(isImm());
+ Imm.Kind = ImmKindTyConst;
+ }
+
+ bool IsImmKindLiteral() const {
+ return isImm() && Imm.Kind == ImmKindTyLiteral;
+ }
+
+ bool isImmKindConst() const {
+ return isImm() && Imm.Kind == ImmKindTyConst;
+ }
+
bool isInlinableImm(MVT type) const;
bool isLiteralImm(MVT type) const;
@@ -335,11 +338,14 @@ public:
bool isLDS() const { return isImmTy(ImmTyLDS); }
bool isDLC() const { return isImmTy(ImmTyDLC); }
bool isGLC() const { return isImmTy(ImmTyGLC); }
+ // "GLC_1" is a MatchClass of the GLC_1 operand with the default and forced
+ // value of the GLC operand.
+ bool isGLC_1() const { return isImmTy(ImmTyGLC); }
bool isSLC() const { return isImmTy(ImmTySLC); }
bool isSWZ() const { return isImmTy(ImmTySWZ); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
bool isD16() const { return isImmTy(ImmTyD16); }
- bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<8>(getImm()); }
+ bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<7>(getImm()); }
bool isBankMask() const { return isImmTy(ImmTyDppBankMask); }
bool isRowMask() const { return isImmTy(ImmTyDppRowMask); }
bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); }
@@ -689,6 +695,11 @@ public:
return Imm.Val;
}
+ void setImm(int64_t Val) {
+ assert(isImm());
+ Imm.Val = Val;
+ }
+
ImmTy getImmTy() const {
assert(isImm());
return Imm.Type;
@@ -903,6 +914,7 @@ public:
auto Op = std::make_unique<AMDGPUOperand>(Immediate, AsmParser);
Op->Imm.Val = Val;
Op->Imm.IsFPImm = IsFPImm;
+ Op->Imm.Kind = ImmKindTyNone;
Op->Imm.Type = Type;
Op->Imm.Mods = Modifiers();
Op->StartLoc = Loc;
@@ -1065,7 +1077,7 @@ private:
std::string &CollectString);
bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth,
- RegisterKind RegKind, unsigned Reg1);
+ RegisterKind RegKind, unsigned Reg1, SMLoc Loc);
bool ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
unsigned &RegNum, unsigned &RegWidth,
bool RestoreOnFailure = false);
@@ -1083,7 +1095,8 @@ private:
bool ParseRegRange(unsigned& Num, unsigned& Width);
unsigned getRegularReg(RegisterKind RegKind,
unsigned RegNum,
- unsigned RegWidth);
+ unsigned RegWidth,
+ SMLoc Loc);
bool isRegister();
bool isRegister(const AsmToken &Token, const AsmToken &NextToken) const;
@@ -1127,7 +1140,7 @@ public:
// AsmParser::parseDirectiveSet() cannot be specialized for specific target.
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
MCContext &Ctx = getContext();
- if (ISA.Major >= 6 && AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) {
+ if (ISA.Major >= 6 && isHsaAbiVersion3(&getSTI())) {
MCSymbol *Sym =
Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number"));
Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
@@ -1144,7 +1157,7 @@ public:
Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx));
}
- if (ISA.Major >= 6 && AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) {
+ if (ISA.Major >= 6 && isHsaAbiVersion3(&getSTI())) {
initializeGprCountSymbol(IS_VGPR);
initializeGprCountSymbol(IS_SGPR);
} else
@@ -1184,10 +1197,16 @@ public:
return AMDGPU::isGFX9(getSTI());
}
+ bool isGFX9Plus() const {
+ return AMDGPU::isGFX9Plus(getSTI());
+ }
+
bool isGFX10() const {
return AMDGPU::isGFX10(getSTI());
}
+ bool isGFX10Plus() const { return AMDGPU::isGFX10Plus(getSTI()); }
+
bool isGFX10_BEncoding() const {
return AMDGPU::isGFX10_BEncoding(getSTI());
}
@@ -1204,9 +1223,7 @@ public:
return !isVI() && !isGFX9();
}
- bool hasSGPR104_SGPR105() const {
- return isGFX10();
- }
+ bool hasSGPR104_SGPR105() const { return isGFX10Plus(); }
bool hasIntClamp() const {
return getFeatureBits()[AMDGPU::FeatureIntClamp];
@@ -1240,6 +1257,7 @@ public:
bool isForcedDPP() const { return ForcedDPP; }
bool isForcedSDWA() const { return ForcedSDWA; }
ArrayRef<unsigned> getMatchedVariants() const;
+ StringRef getMatchedVariantName() const;
std::unique_ptr<AMDGPUOperand> parseRegister(bool RestoreOnFailure = false);
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc,
@@ -1279,7 +1297,8 @@ public:
parseNamedBit(const char *Name, OperandVector &Operands,
AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone);
OperandMatchResultTy parseStringWithPrefix(StringRef Prefix,
- StringRef &Value);
+ StringRef &Value,
+ SMLoc &StringLoc);
bool isModifier();
bool isOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const;
@@ -1295,7 +1314,15 @@ public:
OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands);
OperandMatchResultTy parseRegWithIntInputMods(OperandVector &Operands);
OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands);
- OperandMatchResultTy parseDfmtNfmt(OperandVector &Operands);
+ OperandMatchResultTy parseDfmtNfmt(int64_t &Format);
+ OperandMatchResultTy parseUfmt(int64_t &Format);
+ OperandMatchResultTy parseSymbolicSplitFormat(StringRef FormatStr, SMLoc Loc, int64_t &Format);
+ OperandMatchResultTy parseSymbolicUnifiedFormat(StringRef FormatStr, SMLoc Loc, int64_t &Format);
+ OperandMatchResultTy parseFORMAT(OperandVector &Operands);
+ OperandMatchResultTy parseSymbolicOrNumericFormat(int64_t &Format);
+ OperandMatchResultTy parseNumericFormat(int64_t &Format);
+ bool tryParseFmt(const char *Pref, int64_t MaxVal, int64_t &Val);
+ bool matchDfmtNfmt(int64_t &Dfmt, int64_t &Nfmt, StringRef FormatStr, SMLoc Loc);
void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands);
void cvtDS(MCInst &Inst, const OperandVector &Operands) { cvtDSImpl(Inst, Operands, false); }
@@ -1308,6 +1335,7 @@ public:
private:
struct OperandInfoTy {
+ SMLoc Loc;
int64_t Id;
bool IsSymbolic = false;
bool IsDefined = false;
@@ -1318,30 +1346,35 @@ private:
bool parseSendMsgBody(OperandInfoTy &Msg, OperandInfoTy &Op, OperandInfoTy &Stream);
bool validateSendMsg(const OperandInfoTy &Msg,
const OperandInfoTy &Op,
- const OperandInfoTy &Stream,
- const SMLoc Loc);
+ const OperandInfoTy &Stream);
- bool parseHwregBody(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width);
+ bool parseHwregBody(OperandInfoTy &HwReg,
+ OperandInfoTy &Offset,
+ OperandInfoTy &Width);
bool validateHwreg(const OperandInfoTy &HwReg,
- const int64_t Offset,
- const int64_t Width,
- const SMLoc Loc);
+ const OperandInfoTy &Offset,
+ const OperandInfoTy &Width);
- void errorExpTgt();
- OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val);
SMLoc getFlatOffsetLoc(const OperandVector &Operands) const;
SMLoc getSMEMOffsetLoc(const OperandVector &Operands) const;
+ SMLoc getOperandLoc(std::function<bool(const AMDGPUOperand&)> Test,
+ const OperandVector &Operands) const;
+ SMLoc getImmLoc(AMDGPUOperand::ImmTy Type, const OperandVector &Operands) const;
+ SMLoc getRegLoc(unsigned Reg, const OperandVector &Operands) const;
+ SMLoc getLitLoc(const OperandVector &Operands) const;
+ SMLoc getConstLoc(const OperandVector &Operands) const;
+
bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc, const OperandVector &Operands);
bool validateFlatOffset(const MCInst &Inst, const OperandVector &Operands);
bool validateSMEMOffset(const MCInst &Inst, const OperandVector &Operands);
bool validateSOPLiteral(const MCInst &Inst) const;
- bool validateConstantBusLimitations(const MCInst &Inst);
- bool validateEarlyClobberLimitations(const MCInst &Inst);
+ bool validateConstantBusLimitations(const MCInst &Inst, const OperandVector &Operands);
+ bool validateEarlyClobberLimitations(const MCInst &Inst, const OperandVector &Operands);
bool validateIntClampSupported(const MCInst &Inst);
bool validateMIMGAtomicDMask(const MCInst &Inst);
bool validateMIMGGatherDMask(const MCInst &Inst);
- bool validateMovrels(const MCInst &Inst);
+ bool validateMovrels(const MCInst &Inst, const OperandVector &Operands);
bool validateMIMGDataSize(const MCInst &Inst);
bool validateMIMGAddrSize(const MCInst &Inst);
bool validateMIMGD16(const MCInst &Inst);
@@ -1349,13 +1382,23 @@ private:
bool validateLdsDirect(const MCInst &Inst);
bool validateOpSel(const MCInst &Inst);
bool validateVccOperand(unsigned Reg) const;
- bool validateVOP3Literal(const MCInst &Inst) const;
- bool validateMAIAccWrite(const MCInst &Inst);
+ bool validateVOP3Literal(const MCInst &Inst, const OperandVector &Operands);
+ bool validateMAIAccWrite(const MCInst &Inst, const OperandVector &Operands);
+ bool validateDivScale(const MCInst &Inst);
+ bool validateCoherencyBits(const MCInst &Inst, const OperandVector &Operands,
+ const SMLoc &IDLoc);
unsigned getConstantBusLimit(unsigned Opcode) const;
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const;
+ bool isSupportedMnemo(StringRef Mnemo,
+ const FeatureBitset &FBS);
+ bool isSupportedMnemo(StringRef Mnemo,
+ const FeatureBitset &FBS,
+ ArrayRef<unsigned> Variants);
+ bool checkUnsupportedInstruction(StringRef Name, const SMLoc &IDLoc);
+
bool isId(const StringRef Id) const;
bool isId(const AsmToken &Token, const StringRef Id) const;
bool isToken(const AsmToken::TokenKind Kind) const;
@@ -1364,9 +1407,11 @@ private:
bool trySkipToken(const AsmToken::TokenKind Kind);
bool skipToken(const AsmToken::TokenKind Kind, const StringRef ErrMsg);
bool parseString(StringRef &Val, const StringRef ErrMsg = "expected a string");
+ bool parseId(StringRef &Val, const StringRef ErrMsg = "");
+
void peekTokens(MutableArrayRef<AsmToken> Tokens);
AsmToken::TokenKind getTokenKind() const;
- bool parseExpr(int64_t &Imm);
+ bool parseExpr(int64_t &Imm, StringRef Expected = "");
bool parseExpr(OperandVector &Operands);
StringRef getTokenStr() const;
AsmToken peekToken();
@@ -1385,6 +1430,11 @@ public:
OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands);
OperandMatchResultTy parseBoolReg(OperandVector &Operands);
+ bool parseSwizzleOperand(int64_t &Op,
+ const unsigned MinVal,
+ const unsigned MaxVal,
+ const StringRef ErrMsg,
+ SMLoc &Loc);
bool parseSwizzleOperands(const unsigned OpNum, int64_t* Op,
const unsigned MinVal,
const unsigned MaxVal,
@@ -1409,6 +1459,7 @@ public:
AMDGPUOperand::Ptr defaultDLC() const;
AMDGPUOperand::Ptr defaultGLC() const;
+ AMDGPUOperand::Ptr defaultGLC_1() const;
AMDGPUOperand::Ptr defaultSLC() const;
AMDGPUOperand::Ptr defaultSMRDOffset8() const;
@@ -1429,10 +1480,14 @@ public:
void cvtMIMG(MCInst &Inst, const OperandVector &Operands,
bool IsAtomic = false);
void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands);
+ void cvtIntersectRay(MCInst &Inst, const OperandVector &Operands);
OperandMatchResultTy parseDim(OperandVector &Operands);
OperandMatchResultTy parseDPP8(OperandVector &Operands);
OperandMatchResultTy parseDPPCtrl(OperandVector &Operands);
+ bool isSupportedDPPCtrl(StringRef Ctrl, const OperandVector &Operands);
+ int64_t parseDPPCtrlSel(StringRef Ctrl);
+ int64_t parseDPPCtrlPerm();
AMDGPUOperand::Ptr defaultRowMask() const;
AMDGPUOperand::Ptr defaultBankMask() const;
AMDGPUOperand::Ptr defaultBoundCtrl() const;
@@ -1673,7 +1728,7 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const {
bool AMDGPUOperand::isSDWAOperand(MVT type) const {
if (AsmParser->isVI())
return isVReg32();
- else if (AsmParser->isGFX9() || AsmParser->isGFX10())
+ else if (AsmParser->isGFX9Plus())
return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(type);
else
return false;
@@ -1726,6 +1781,7 @@ void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers
} else {
assert(!isImmTy(ImmTyNone) || !hasModifiers());
Inst.addOperand(MCOperand::createImm(Imm.Val));
+ setImmKindNone();
}
}
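
The setImmKind* calls added in this and the following hunks record, per operand, whether an immediate was emitted unchanged, folded to an inline constant, or kept as a 32-bit literal; the new getLitLoc()/getConstLoc() helpers later use that record to point diagnostics at the offending operand instead of the instruction mnemonic. A minimal standalone sketch of that bookkeeping, using hypothetical names and no LLVM types:

  #include <cstddef>
  #include <cstdint>
  #include <vector>

  // Hypothetical stand-in for the parser's per-operand immediate bookkeeping.
  enum class ImmKind { None, Literal, Const };

  struct TrackedImm {
    int64_t Val;
    ImmKind Kind;
  };

  // Record how an immediate ended up encoded so later checks can locate it.
  inline void addImm(std::vector<TrackedImm> &Ops, int64_t Val, bool Inlinable) {
    Ops.push_back({Val, Inlinable ? ImmKind::Const : ImmKind::Literal});
  }

  // Analogue of getLitLoc(): index of the first operand kept as a literal,
  // or -1 when every immediate was inlined.
  inline int firstLiteral(const std::vector<TrackedImm> &Ops) {
    for (std::size_t I = 0; I < Ops.size(); ++I)
      if (Ops[I].Kind == ImmKind::Literal)
        return static_cast<int>(I);
    return -1;
  }
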
@@ -1753,6 +1809,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(),
AsmParser->hasInv2PiInlineImm())) {
Inst.addOperand(MCOperand::createImm(Literal.getZExtValue()));
+ setImmKindConst();
return;
}
@@ -1766,6 +1823,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
}
Inst.addOperand(MCOperand::createImm(Literal.lshr(32).getZExtValue()));
+ setImmKindLiteral();
return;
}
@@ -1802,6 +1860,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
uint64_t ImmVal = FPLiteral.bitcastToAPInt().getZExtValue();
Inst.addOperand(MCOperand::createImm(ImmVal));
+ setImmKindLiteral();
return;
}
default:
@@ -1826,10 +1885,12 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val),
AsmParser->hasInv2PiInlineImm())) {
Inst.addOperand(MCOperand::createImm(Val));
+ setImmKindConst();
return;
}
Inst.addOperand(MCOperand::createImm(Val & 0xffffffff));
+ setImmKindLiteral();
return;
case AMDGPU::OPERAND_REG_IMM_INT64:
@@ -1838,10 +1899,12 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
if (AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) {
Inst.addOperand(MCOperand::createImm(Val));
+ setImmKindConst();
return;
}
Inst.addOperand(MCOperand::createImm(Lo_32(Val)));
+ setImmKindLiteral();
return;
case AMDGPU::OPERAND_REG_IMM_INT16:
@@ -1854,10 +1917,12 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val),
AsmParser->hasInv2PiInlineImm())) {
Inst.addOperand(MCOperand::createImm(Val));
+ setImmKindConst();
return;
}
Inst.addOperand(MCOperand::createImm(Val & 0xffff));
+ setImmKindLiteral();
return;
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
@@ -1879,6 +1944,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
template <unsigned Bitwidth>
void AMDGPUOperand::addKImmFPOperands(MCInst &Inst, unsigned N) const {
APInt Literal(64, Imm.Val);
+ setImmKindNone();
if (!Imm.IsFPImm) {
// We got int literal token.
@@ -2051,7 +2117,8 @@ OperandMatchResultTy AMDGPUAsmParser::tryParseRegister(unsigned &RegNo,
}
bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth,
- RegisterKind RegKind, unsigned Reg1) {
+ RegisterKind RegKind, unsigned Reg1,
+ SMLoc Loc) {
switch (RegKind) {
case IS_SPECIAL:
if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) {
@@ -2084,12 +2151,14 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth,
RegWidth = 2;
return true;
}
+ Error(Loc, "register does not fit in the list");
return false;
case IS_VGPR:
case IS_SGPR:
case IS_AGPR:
case IS_TTMP:
if (Reg1 != Reg + RegWidth) {
+ Error(Loc, "registers in a list must have consecutive indices");
return false;
}
RegWidth++;
@@ -2172,7 +2241,8 @@ AMDGPUAsmParser::isRegister()
unsigned
AMDGPUAsmParser::getRegularReg(RegisterKind RegKind,
unsigned RegNum,
- unsigned RegWidth) {
+ unsigned RegWidth,
+ SMLoc Loc) {
assert(isRegularReg(RegKind));
@@ -2183,18 +2253,24 @@ AMDGPUAsmParser::getRegularReg(RegisterKind RegKind,
AlignSize = std::min(RegWidth, 4u);
}
- if (RegNum % AlignSize != 0)
+ if (RegNum % AlignSize != 0) {
+ Error(Loc, "invalid register alignment");
return AMDGPU::NoRegister;
+ }
unsigned RegIdx = RegNum / AlignSize;
int RCID = getRegClass(RegKind, RegWidth);
- if (RCID == -1)
+ if (RCID == -1) {
+ Error(Loc, "invalid or unsupported register size");
return AMDGPU::NoRegister;
+ }
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
const MCRegisterClass RC = TRI->getRegClass(RCID);
- if (RegIdx >= RC.getNumRegs())
+ if (RegIdx >= RC.getNumRegs()) {
+ Error(Loc, "register index is out of range");
return AMDGPU::NoRegister;
+ }
return RC.getRegister(RegIdx);
}
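
getRegularReg() now reports why a register spec was rejected. The alignment rule visible in the hunk is that, for the register kinds covered by the branch just above the excerpt, the starting index must be a multiple of min(RegWidth, 4) dwords. A hedged sketch of that check, with placeholder names and a plain string instead of the parser's Error():

  #include <algorithm>
  #include <optional>
  #include <string>

  // Placeholder helper, not the parser's interface: the starting index of a
  // register group must be a multiple of min(RegWidth, 4) dwords, otherwise
  // "invalid register alignment" is reported and no register is returned.
  std::optional<unsigned> regularRegIndex(unsigned RegNum, unsigned RegWidth,
                                          std::string &Err) {
    unsigned AlignSize = std::min(RegWidth, 4u);
    if (RegNum % AlignSize != 0) {
      Err = "invalid register alignment";
      return std::nullopt;
    }
    return RegNum / AlignSize; // index within the register class
  }
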
@@ -2202,24 +2278,40 @@ AMDGPUAsmParser::getRegularReg(RegisterKind RegKind,
bool
AMDGPUAsmParser::ParseRegRange(unsigned& Num, unsigned& Width) {
int64_t RegLo, RegHi;
- if (!trySkipToken(AsmToken::LBrac))
+ if (!skipToken(AsmToken::LBrac, "missing register index"))
return false;
+ SMLoc FirstIdxLoc = getLoc();
+ SMLoc SecondIdxLoc;
+
if (!parseExpr(RegLo))
return false;
if (trySkipToken(AsmToken::Colon)) {
+ SecondIdxLoc = getLoc();
if (!parseExpr(RegHi))
return false;
} else {
RegHi = RegLo;
}
- if (!trySkipToken(AsmToken::RBrac))
+ if (!skipToken(AsmToken::RBrac, "expected a closing square bracket"))
return false;
- if (!isUInt<32>(RegLo) || !isUInt<32>(RegHi) || RegLo > RegHi)
+ if (!isUInt<32>(RegLo)) {
+ Error(FirstIdxLoc, "invalid register index");
return false;
+ }
+
+ if (!isUInt<32>(RegHi)) {
+ Error(SecondIdxLoc, "invalid register index");
+ return false;
+ }
+
+ if (RegLo > RegHi) {
+ Error(FirstIdxLoc, "first register index should not exceed second index");
+ return false;
+ }
Num = static_cast<unsigned>(RegLo);
Width = (RegHi - RegLo) + 1;
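
ParseRegRange() replaces the old silent failures with a targeted message for each way a v[lo:hi]/s[lo:hi] range can go wrong. A small illustrative helper showing the same checks and the width computation (hypothetical interface, not the parser's):

  #include <cstdint>
  #include <optional>
  #include <string>
  #include <utility>

  // Same checks as the hunk, as a free function: both indices must fit in
  // 32 bits and the first must not exceed the second; the width is then
  // (hi - lo) + 1 registers.  e.g. v[4:7] -> lo = 4, width = 4.
  std::optional<std::pair<unsigned, unsigned>>
  regRange(int64_t RegLo, int64_t RegHi, std::string &Err) {
    if (RegLo < 0 || RegLo > UINT32_MAX || RegHi < 0 || RegHi > UINT32_MAX) {
      Err = "invalid register index";
      return std::nullopt;
    }
    if (RegLo > RegHi) {
      Err = "first register index should not exceed second index";
      return std::nullopt;
    }
    return std::make_pair(static_cast<unsigned>(RegLo),
                          static_cast<unsigned>(RegHi - RegLo) + 1);
  }
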
@@ -2246,10 +2338,14 @@ unsigned AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind,
SmallVectorImpl<AsmToken> &Tokens) {
assert(isToken(AsmToken::Identifier));
StringRef RegName = getTokenStr();
+ auto Loc = getLoc();
const RegInfo *RI = getRegularRegInfo(RegName);
- if (!RI)
+ if (!RI) {
+ Error(Loc, "invalid register name");
return AMDGPU::NoRegister;
+ }
+
Tokens.push_back(getToken());
lex(); // skip register name
@@ -2257,8 +2353,10 @@ unsigned AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind,
StringRef RegSuffix = RegName.substr(RI->Name.size());
if (!RegSuffix.empty()) {
// Single 32-bit register: vXX.
- if (!getRegNum(RegSuffix, RegNum))
+ if (!getRegNum(RegSuffix, RegNum)) {
+ Error(Loc, "invalid register index");
return AMDGPU::NoRegister;
+ }
RegWidth = 1;
} else {
// Range of registers: v[XX:YY]. ":YY" is optional.
@@ -2266,44 +2364,59 @@ unsigned AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind,
return AMDGPU::NoRegister;
}
- return getRegularReg(RegKind, RegNum, RegWidth);
+ return getRegularReg(RegKind, RegNum, RegWidth, Loc);
}
unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum,
unsigned &RegWidth,
SmallVectorImpl<AsmToken> &Tokens) {
unsigned Reg = AMDGPU::NoRegister;
+ auto ListLoc = getLoc();
- if (!trySkipToken(AsmToken::LBrac))
+ if (!skipToken(AsmToken::LBrac,
+ "expected a register or a list of registers")) {
return AMDGPU::NoRegister;
+ }
// List of consecutive registers, e.g.: [s0,s1,s2,s3]
+ auto Loc = getLoc();
if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth))
return AMDGPU::NoRegister;
- if (RegWidth != 1)
+ if (RegWidth != 1) {
+ Error(Loc, "expected a single 32-bit register");
return AMDGPU::NoRegister;
+ }
for (; trySkipToken(AsmToken::Comma); ) {
RegisterKind NextRegKind;
unsigned NextReg, NextRegNum, NextRegWidth;
+ Loc = getLoc();
- if (!ParseAMDGPURegister(NextRegKind, NextReg, NextRegNum, NextRegWidth,
- Tokens))
+ if (!ParseAMDGPURegister(NextRegKind, NextReg,
+ NextRegNum, NextRegWidth,
+ Tokens)) {
return AMDGPU::NoRegister;
- if (NextRegWidth != 1)
+ }
+ if (NextRegWidth != 1) {
+ Error(Loc, "expected a single 32-bit register");
return AMDGPU::NoRegister;
- if (NextRegKind != RegKind)
+ }
+ if (NextRegKind != RegKind) {
+ Error(Loc, "registers in a list must be of the same kind");
return AMDGPU::NoRegister;
- if (!AddNextRegisterToList(Reg, RegWidth, RegKind, NextReg))
+ }
+ if (!AddNextRegisterToList(Reg, RegWidth, RegKind, NextReg, Loc))
return AMDGPU::NoRegister;
}
- if (!trySkipToken(AsmToken::RBrac))
+ if (!skipToken(AsmToken::RBrac,
+ "expected a comma or a closing square bracket")) {
return AMDGPU::NoRegister;
+ }
if (isRegularReg(RegKind))
- Reg = getRegularReg(RegKind, RegNum, RegWidth);
+ Reg = getRegularReg(RegKind, RegNum, RegWidth, ListLoc);
return Reg;
}
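
The register-list path gains diagnostics as well: every element of a [s0,s1,s2,s3]-style list must be a single 32-bit register of the same kind with consecutive indices, so the list folds into one wide register. A compact sketch of that folding over plain indices (stand-ins for the real register operands):

  #include <cstddef>
  #include <string>
  #include <vector>

  bool foldRegList(const std::vector<unsigned> &Indices, unsigned &Start,
                   unsigned &Width, std::string &Err) {
    if (Indices.empty())
      return false;
    Start = Indices.front();
    Width = 1;
    for (std::size_t I = 1; I < Indices.size(); ++I) {
      if (Indices[I] != Start + Width) {
        Err = "registers in a list must have consecutive indices";
        return false;
      }
      ++Width;
    }
    return true; // [s0,s1,s2,s3] -> Start = 0, Width = 4
  }
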
@@ -2311,6 +2424,7 @@ unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum,
bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
unsigned &RegNum, unsigned &RegWidth,
SmallVectorImpl<AsmToken> &Tokens) {
+ auto Loc = getLoc();
Reg = AMDGPU::NoRegister;
if (isToken(AsmToken::Identifier)) {
@@ -2322,12 +2436,26 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
}
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
- return Reg != AMDGPU::NoRegister && subtargetHasRegister(*TRI, Reg);
+ if (Reg == AMDGPU::NoRegister) {
+ assert(Parser.hasPendingError());
+ return false;
+ }
+
+ if (!subtargetHasRegister(*TRI, Reg)) {
+ if (Reg == AMDGPU::SGPR_NULL) {
+ Error(Loc, "'null' operand is not supported on this GPU");
+ } else {
+ Error(Loc, "register not available on this GPU");
+ }
+ return false;
+ }
+
+ return true;
}
bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
unsigned &RegNum, unsigned &RegWidth,
- bool RestoreOnFailure) {
+ bool RestoreOnFailure /*=false*/) {
Reg = AMDGPU::NoRegister;
SmallVector<AsmToken, 1> Tokens;
@@ -2377,11 +2505,11 @@ bool AMDGPUAsmParser::updateGprCountSymbols(RegisterKind RegKind,
int64_t OldCount;
if (!Sym->isVariable())
- return !Error(getParser().getTok().getLoc(),
+ return !Error(getLoc(),
".amdgcn.next_free_{v,s}gpr symbols must be variable");
if (!Sym->getVariableValue(false)->evaluateAsAbsolute(OldCount))
return !Error(
- getParser().getTok().getLoc(),
+ getLoc(),
".amdgcn.next_free_{v,s}gpr symbols must be absolute expressions");
if (OldCount <= NewMax)
@@ -2392,18 +2520,16 @@ bool AMDGPUAsmParser::updateGprCountSymbols(RegisterKind RegKind,
std::unique_ptr<AMDGPUOperand>
AMDGPUAsmParser::parseRegister(bool RestoreOnFailure) {
- const auto &Tok = Parser.getTok();
+ const auto &Tok = getToken();
SMLoc StartLoc = Tok.getLoc();
SMLoc EndLoc = Tok.getEndLoc();
RegisterKind RegKind;
unsigned Reg, RegNum, RegWidth;
if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) {
- //FIXME: improve error messages (bug 41303).
- Error(StartLoc, "not a valid operand.");
return nullptr;
}
- if (AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) {
+ if (isHsaAbiVersion3(&getSTI())) {
if (!updateGprCountSymbols(RegKind, RegNum, RegWidth))
return nullptr;
} else
@@ -2466,7 +2592,7 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands, bool HasSP3AbsModifier) {
// This syntax is not compatible with syntax of standard
// MC expressions (due to the trailing '|').
SMLoc EndLoc;
- if (getParser().parsePrimaryExpr(Expr, EndLoc))
+ if (getParser().parsePrimaryExpr(Expr, EndLoc, nullptr))
return MatchOperand_ParseFail;
} else {
if (Parser.parseExpression(Expr))
@@ -2761,6 +2887,15 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
return Match_Success;
}
+static ArrayRef<unsigned> getAllVariants() {
+ static const unsigned Variants[] = {
+ AMDGPUAsmVariants::DEFAULT, AMDGPUAsmVariants::VOP3,
+ AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::SDWA9, AMDGPUAsmVariants::DPP
+ };
+
+ return makeArrayRef(Variants);
+}
+
// What asm variants we should check
ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const {
if (getForcedEncodingSize() == 32) {
@@ -2784,12 +2919,23 @@ ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const {
return makeArrayRef(Variants);
}
- static const unsigned Variants[] = {
- AMDGPUAsmVariants::DEFAULT, AMDGPUAsmVariants::VOP3,
- AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::SDWA9, AMDGPUAsmVariants::DPP
- };
+ return getAllVariants();
+}
- return makeArrayRef(Variants);
+StringRef AMDGPUAsmParser::getMatchedVariantName() const {
+ if (getForcedEncodingSize() == 32)
+ return "e32";
+
+ if (isForcedVOP3())
+ return "e64";
+
+ if (isForcedSDWA())
+ return "sdwa";
+
+ if (isForcedDPP())
+ return "dpp";
+
+ return "";
}
unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const {
@@ -2858,20 +3004,20 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,
}
unsigned AMDGPUAsmParser::getConstantBusLimit(unsigned Opcode) const {
- if (!isGFX10())
+ if (!isGFX10Plus())
return 1;
switch (Opcode) {
// 64-bit shift instructions can use only one scalar value input
- case AMDGPU::V_LSHLREV_B64:
+ case AMDGPU::V_LSHLREV_B64_e64:
case AMDGPU::V_LSHLREV_B64_gfx10:
- case AMDGPU::V_LSHL_B64:
- case AMDGPU::V_LSHRREV_B64:
+ case AMDGPU::V_LSHRREV_B64_e64:
case AMDGPU::V_LSHRREV_B64_gfx10:
- case AMDGPU::V_LSHR_B64:
- case AMDGPU::V_ASHRREV_I64:
+ case AMDGPU::V_ASHRREV_I64_e64:
case AMDGPU::V_ASHRREV_I64_gfx10:
- case AMDGPU::V_ASHR_I64:
+ case AMDGPU::V_LSHL_B64_e64:
+ case AMDGPU::V_LSHR_B64_e64:
+ case AMDGPU::V_ASHR_I64_e64:
return 1;
default:
return 2;
@@ -2885,15 +3031,19 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) {
} else if (MO.isReg()) {
auto Reg = MO.getReg();
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
- return isSGPR(mc2PseudoReg(Reg), TRI) && Reg != SGPR_NULL;
+ auto PReg = mc2PseudoReg(Reg);
+ return isSGPR(PReg, TRI) && PReg != SGPR_NULL;
} else {
return true;
}
}
-bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) {
+bool
+AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst,
+ const OperandVector &Operands) {
const unsigned Opcode = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opcode);
+ unsigned LastSGPR = AMDGPU::NoRegister;
unsigned ConstantBusUseCount = 0;
unsigned NumLiterals = 0;
unsigned LiteralSize;
@@ -2927,15 +3077,15 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) {
const MCOperand &MO = Inst.getOperand(OpIdx);
if (usesConstantBus(Inst, OpIdx)) {
if (MO.isReg()) {
- const unsigned Reg = mc2PseudoReg(MO.getReg());
+ LastSGPR = mc2PseudoReg(MO.getReg());
      // Pairs of registers with a partial intersection like these
// s0, s[0:1]
// flat_scratch_lo, flat_scratch
// flat_scratch_lo, flat_scratch_hi
// are theoretically valid but they are disabled anyway.
// Note that this code mimics SIInstrInfo::verifyInstruction
- if (!SGPRsUsed.count(Reg)) {
- SGPRsUsed.insert(Reg);
+ if (!SGPRsUsed.count(LastSGPR)) {
+ SGPRsUsed.insert(LastSGPR);
++ConstantBusUseCount;
}
} else { // Expression or a literal
@@ -2967,10 +3117,19 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) {
}
ConstantBusUseCount += NumLiterals;
- return ConstantBusUseCount <= getConstantBusLimit(Opcode);
+ if (ConstantBusUseCount <= getConstantBusLimit(Opcode))
+ return true;
+
+ SMLoc LitLoc = getLitLoc(Operands);
+ SMLoc RegLoc = getRegLoc(LastSGPR, Operands);
+ SMLoc Loc = (LitLoc.getPointer() < RegLoc.getPointer()) ? RegLoc : LitLoc;
+ Error(Loc, "invalid operand (violates constant bus restrictions)");
+ return false;
}
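
For reference, the accounting behind the new constant-bus diagnostic: each distinct SGPR source occupies the bus once, literals occupy it too, and the total must stay within the per-opcode limit returned by getConstantBusLimit() (1 before GFX10; 2 on GFX10+ except for the 64-bit shifts). A standalone sketch, with a stand-in Src type and the duplicate-literal subtleties elided:

  #include <set>
  #include <vector>

  // Src is a stand-in for MCOperand; duplicate-literal handling is elided.
  struct Src {
    bool IsSGPR;
    unsigned SGPR;    // valid when IsSGPR
    bool IsLiteral;   // non-inlinable immediate or expression
  };

  bool fitsConstantBus(const std::vector<Src> &Srcs, unsigned Limit) {
    std::set<unsigned> SGPRsUsed;
    unsigned Uses = 0;
    for (const Src &S : Srcs) {
      if (S.IsSGPR) {
        if (SGPRsUsed.insert(S.SGPR).second)
          ++Uses;               // each distinct SGPR occupies the bus once
      } else if (S.IsLiteral) {
        ++Uses;                 // literals also travel over the constant bus
      }
    }
    return Uses <= Limit;
  }
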
-bool AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst) {
+bool
+AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst,
+ const OperandVector &Operands) {
const unsigned Opcode = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opcode);
@@ -2999,6 +3158,8 @@ bool AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst) {
if (Src.isReg()) {
const unsigned SrcReg = mc2PseudoReg(Src.getReg());
if (isRegIntersect(DstReg, SrcReg, TRI)) {
+ Error(getRegLoc(SrcReg, Operands),
+ "destination must be different than all sources");
return false;
}
}
@@ -3034,8 +3195,9 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) {
int TFEIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::tfe);
assert(VDataIdx != -1);
- assert(DMaskIdx != -1);
- assert(TFEIdx != -1);
+
+ if (DMaskIdx == -1 || TFEIdx == -1) // intersect_ray
+ return true;
unsigned VDataSize = AMDGPU::getRegOperandSize(getMRI(), Desc, VDataIdx);
unsigned TFESize = Inst.getOperand(TFEIdx).getImm()? 1 : 0;
@@ -3058,10 +3220,11 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) {
const unsigned Opc = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opc);
- if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0 || !isGFX10())
+ if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0 || !isGFX10Plus())
return true;
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
+
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
@@ -3070,9 +3233,11 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) {
assert(VAddr0Idx != -1);
assert(SrsrcIdx != -1);
- assert(DimIdx != -1);
assert(SrsrcIdx > VAddr0Idx);
+ if (DimIdx == -1)
+ return true; // intersect_ray
+
unsigned Dim = Inst.getOperand(DimIdx).getImm();
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim);
bool IsNSA = SrsrcIdx - VAddr0Idx > 1;
@@ -3148,7 +3313,8 @@ static bool IsMovrelsSDWAOpcode(const unsigned Opcode)
// movrels* opcodes should only allow VGPRs as src0.
// This is specified in the .td description for vop1/vop3,
// but sdwa is handled differently. See isSDWAOperand.
-bool AMDGPUAsmParser::validateMovrels(const MCInst &Inst) {
+bool AMDGPUAsmParser::validateMovrels(const MCInst &Inst,
+ const OperandVector &Operands) {
const unsigned Opc = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opc);
@@ -3159,16 +3325,24 @@ bool AMDGPUAsmParser::validateMovrels(const MCInst &Inst) {
const int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
assert(Src0Idx != -1);
+ SMLoc ErrLoc;
const MCOperand &Src0 = Inst.getOperand(Src0Idx);
- if (!Src0.isReg())
- return false;
+ if (Src0.isReg()) {
+ auto Reg = mc2PseudoReg(Src0.getReg());
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+ if (!isSGPR(Reg, TRI))
+ return true;
+ ErrLoc = getRegLoc(Reg, Operands);
+ } else {
+ ErrLoc = getConstLoc(Operands);
+ }
- auto Reg = Src0.getReg();
- const MCRegisterInfo *TRI = getContext().getRegisterInfo();
- return !isSGPR(mc2PseudoReg(Reg), TRI);
+ Error(ErrLoc, "source operand must be a VGPR");
+ return false;
}
-bool AMDGPUAsmParser::validateMAIAccWrite(const MCInst &Inst) {
+bool AMDGPUAsmParser::validateMAIAccWrite(const MCInst &Inst,
+ const OperandVector &Operands) {
const unsigned Opc = Inst.getOpcode();
@@ -3182,16 +3356,45 @@ bool AMDGPUAsmParser::validateMAIAccWrite(const MCInst &Inst) {
if (!Src0.isReg())
return true;
- auto Reg = Src0.getReg();
+ auto Reg = mc2PseudoReg(Src0.getReg());
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
- if (isSGPR(mc2PseudoReg(Reg), TRI)) {
- Error(getLoc(), "source operand must be either a VGPR or an inline constant");
+ if (isSGPR(Reg, TRI)) {
+ Error(getRegLoc(Reg, Operands),
+ "source operand must be either a VGPR or an inline constant");
return false;
}
return true;
}
+bool AMDGPUAsmParser::validateDivScale(const MCInst &Inst) {
+ switch (Inst.getOpcode()) {
+ default:
+ return true;
+ case V_DIV_SCALE_F32_gfx6_gfx7:
+ case V_DIV_SCALE_F32_vi:
+ case V_DIV_SCALE_F32_gfx10:
+ case V_DIV_SCALE_F64_gfx6_gfx7:
+ case V_DIV_SCALE_F64_vi:
+ case V_DIV_SCALE_F64_gfx10:
+ break;
+ }
+
+ // TODO: Check that src0 = src1 or src2.
+
+ for (auto Name : {AMDGPU::OpName::src0_modifiers,
+ AMDGPU::OpName::src1_modifiers,
+ AMDGPU::OpName::src2_modifiers}) {
+ if (Inst.getOperand(AMDGPU::getNamedOperandIdx(Inst.getOpcode(), Name))
+ .getImm() &
+ SISrcMods::ABS) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
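validateDivScale() enforces the VOP3B restriction that v_div_scale_f32/f64 sources may not carry the ABS source modifier (the matching error text appears further down as "ABS not allowed in VOP3B instructions"). An illustrative check over three source-modifier words; the bit position is an assumption made for this sketch, not taken from the diff:

  #include <array>
  #include <cstdint>

  // Assumed bit layout for this sketch only: NEG = bit 0, ABS = bit 1.
  constexpr uint32_t SrcModAbs = 1u << 1;

  bool divScaleModifiersOk(const std::array<uint32_t, 3> &SrcMods) {
    for (uint32_t Mods : SrcMods)
      if (Mods & SrcModAbs)
        return false;           // "ABS not allowed in VOP3B instructions"
    return true;
  }
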
bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) {
const unsigned Opc = Inst.getOpcode();
@@ -3239,8 +3442,8 @@ static bool IsRevOpcode(const unsigned Opcode)
case AMDGPU::V_SUBREV_F32_e64_gfx6_gfx7:
case AMDGPU::V_SUBREV_F32_e64_vi:
- case AMDGPU::V_SUBREV_I32_e32:
- case AMDGPU::V_SUBREV_I32_e64:
+ case AMDGPU::V_SUBREV_CO_U32_e32:
+ case AMDGPU::V_SUBREV_CO_U32_e64:
case AMDGPU::V_SUBREV_I32_e32_gfx6_gfx7:
case AMDGPU::V_SUBREV_I32_e64_gfx6_gfx7:
@@ -3328,15 +3531,15 @@ static bool IsRevOpcode(const unsigned Opcode)
case AMDGPU::V_ASHRREV_I16_e64_vi:
case AMDGPU::V_ASHRREV_I16_gfx10:
- case AMDGPU::V_LSHLREV_B64:
+ case AMDGPU::V_LSHLREV_B64_e64:
case AMDGPU::V_LSHLREV_B64_gfx10:
case AMDGPU::V_LSHLREV_B64_vi:
- case AMDGPU::V_LSHRREV_B64:
+ case AMDGPU::V_LSHRREV_B64_e64:
case AMDGPU::V_LSHRREV_B64_gfx10:
case AMDGPU::V_LSHRREV_B64_vi:
- case AMDGPU::V_ASHRREV_I64:
+ case AMDGPU::V_ASHRREV_I64_e64:
case AMDGPU::V_ASHRREV_I64_gfx10:
case AMDGPU::V_ASHRREV_I64_vi:
@@ -3419,22 +3622,20 @@ bool AMDGPUAsmParser::validateFlatOffset(const MCInst &Inst,
return false;
}
- // Address offset is 12-bit signed for GFX10, 13-bit for GFX9.
// For FLAT segment the offset must be positive;
// MSB is ignored and forced to zero.
- unsigned OffsetSize = isGFX9() ? 13 : 12;
- if (TSFlags & SIInstrFlags::IsNonFlatSeg) {
+ if (TSFlags & (SIInstrFlags::IsFlatGlobal | SIInstrFlags::IsFlatScratch)) {
+ unsigned OffsetSize = AMDGPU::getNumFlatOffsetBits(getSTI(), true);
if (!isIntN(OffsetSize, Op.getImm())) {
Error(getFlatOffsetLoc(Operands),
- isGFX9() ? "expected a 13-bit signed offset" :
- "expected a 12-bit signed offset");
+ Twine("expected a ") + Twine(OffsetSize) + "-bit signed offset");
return false;
}
} else {
- if (!isUIntN(OffsetSize - 1, Op.getImm())) {
+ unsigned OffsetSize = AMDGPU::getNumFlatOffsetBits(getSTI(), false);
+ if (!isUIntN(OffsetSize, Op.getImm())) {
Error(getFlatOffsetLoc(Operands),
- isGFX9() ? "expected a 12-bit unsigned offset" :
- "expected an 11-bit unsigned offset");
+ Twine("expected a ") + Twine(OffsetSize) + "-bit unsigned offset");
return false;
}
}
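
The rewritten offset check asks AMDGPU::getNumFlatOffsetBits() for the field width instead of hard-coding 12/13 bits: global/scratch forms take a signed immediate of that width, while plain FLAT takes a non-negative value whose MSB is ignored. A hedged sketch of the range test itself (the width lookup is assumed to have happened already):

  #include <cstdint>

  // OffsetBits stands for the value returned by AMDGPU::getNumFlatOffsetBits()
  // for the given subtarget and addressing mode; assumes 1 <= OffsetBits <= 62.
  bool flatOffsetInRange(int64_t Offset, unsigned OffsetBits, bool IsSegment) {
    if (IsSegment) {
      // global/scratch: signed OffsetBits-bit immediate
      const int64_t Lo = -(int64_t(1) << (OffsetBits - 1));
      const int64_t Hi = (int64_t(1) << (OffsetBits - 1)) - 1;
      return Offset >= Lo && Offset <= Hi;
    }
    // plain FLAT: offset must be positive (MSB ignored and forced to zero),
    // i.e. it must fit in OffsetBits unsigned bits
    return Offset >= 0 && Offset < (int64_t(1) << OffsetBits);
  }
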
@@ -3443,7 +3644,8 @@ bool AMDGPUAsmParser::validateFlatOffset(const MCInst &Inst,
}
SMLoc AMDGPUAsmParser::getSMEMOffsetLoc(const OperandVector &Operands) const {
- for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+ // Start with second operand because SMEM Offset cannot be dst or src0.
+ for (unsigned i = 2, e = Operands.size(); i != e; ++i) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
if (Op.isSMEMOffset())
return Op.getStartLoc();
@@ -3539,7 +3741,8 @@ bool AMDGPUAsmParser::validateVccOperand(unsigned Reg) const {
}
// A VOP3 literal is only allowed on GFX10+ and only one can be used
-bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst) const {
+bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst,
+ const OperandVector &Operands) {
unsigned Opcode = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opcode);
if (!(Desc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)))
@@ -3565,8 +3768,11 @@ bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst) const {
continue;
if (OpIdx == Src2Idx && (Desc.TSFlags & SIInstrFlags::IsMAI) &&
- getFeatureBits()[AMDGPU::FeatureMFMAInlineLiteralBug])
+ getFeatureBits()[AMDGPU::FeatureMFMAInlineLiteralBug]) {
+ Error(getConstLoc(Operands),
+ "inline constants are not allowed for this operand");
return false;
+ }
if (MO.isImm() && !isInlineConstant(Inst, OpIdx)) {
uint32_t Value = static_cast<uint32_t>(MO.getImm());
@@ -3580,51 +3786,74 @@ bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst) const {
}
NumLiterals += NumExprs;
- return !NumLiterals ||
- (NumLiterals == 1 && getFeatureBits()[AMDGPU::FeatureVOP3Literal]);
+ if (!NumLiterals)
+ return true;
+
+ if (!getFeatureBits()[AMDGPU::FeatureVOP3Literal]) {
+ Error(getLitLoc(Operands), "literal operands are not supported");
+ return false;
+ }
+
+ if (NumLiterals > 1) {
+ Error(getLitLoc(Operands), "only one literal operand is allowed");
+ return false;
+ }
+
+ return true;
+}
+
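Restated as a standalone predicate, the rule validateVOP3Literal() now reports with operand-level locations: a VOP3/VOP3P encoding may carry a literal only on targets with the VOP3Literal feature (GFX10+), and even there only one literal is allowed. Sketch with hypothetical parameters:

  #include <string>

  bool vop3LiteralsOk(unsigned NumLiterals, bool HasVOP3Literal,
                      std::string &Err) {
    if (NumLiterals == 0)
      return true;
    if (!HasVOP3Literal) {
      Err = "literal operands are not supported";   // pre-GFX10 VOP3
      return false;
    }
    if (NumLiterals > 1) {
      Err = "only one literal operand is allowed";
      return false;
    }
    return true;
  }
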
+bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
+ const OperandVector &Operands,
+ const SMLoc &IDLoc) {
+ int GLCPos = AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::glc1);
+ if (GLCPos != -1) {
+ // -1 is set by GLC_1 default operand. In all cases "glc" must be present
+ // in the asm string, and the default value means it is not present.
+ if (Inst.getOperand(GLCPos).getImm() == -1) {
+ Error(IDLoc, "instruction must use glc");
+ return false;
+ }
+ }
+
+ return true;
}
bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
const SMLoc &IDLoc,
const OperandVector &Operands) {
if (!validateLdsDirect(Inst)) {
- Error(IDLoc,
+ Error(getRegLoc(AMDGPU::LDS_DIRECT, Operands),
"invalid use of lds_direct");
return false;
}
if (!validateSOPLiteral(Inst)) {
- Error(IDLoc,
+ Error(getLitLoc(Operands),
"only one literal operand is allowed");
return false;
}
- if (!validateVOP3Literal(Inst)) {
- Error(IDLoc,
- "invalid literal operand");
+ if (!validateVOP3Literal(Inst, Operands)) {
return false;
}
- if (!validateConstantBusLimitations(Inst)) {
- Error(IDLoc,
- "invalid operand (violates constant bus restrictions)");
+ if (!validateConstantBusLimitations(Inst, Operands)) {
return false;
}
- if (!validateEarlyClobberLimitations(Inst)) {
- Error(IDLoc,
- "destination must be different than all sources");
+ if (!validateEarlyClobberLimitations(Inst, Operands)) {
return false;
}
if (!validateIntClampSupported(Inst)) {
- Error(IDLoc,
+ Error(getImmLoc(AMDGPUOperand::ImmTyClampSI, Operands),
"integer clamping is not supported on this GPU");
return false;
}
if (!validateOpSel(Inst)) {
- Error(IDLoc,
+ Error(getImmLoc(AMDGPUOperand::ImmTyOpSel, Operands),
"invalid op_sel operand");
return false;
}
// For MUBUF/MTBUF d16 is a part of opcode, so there is nothing to validate.
if (!validateMIMGD16(Inst)) {
- Error(IDLoc,
+ Error(getImmLoc(AMDGPUOperand::ImmTyD16, Operands),
"d16 modifier is not supported on this GPU");
return false;
}
@@ -3643,17 +3872,16 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
return false;
}
if (!validateMIMGAtomicDMask(Inst)) {
- Error(IDLoc,
+ Error(getImmLoc(AMDGPUOperand::ImmTyDMask, Operands),
"invalid atomic image dmask");
return false;
}
if (!validateMIMGGatherDMask(Inst)) {
- Error(IDLoc,
+ Error(getImmLoc(AMDGPUOperand::ImmTyDMask, Operands),
"invalid image_gather dmask: only one bit must be set");
return false;
}
- if (!validateMovrels(Inst)) {
- Error(IDLoc, "source operand must be a VGPR");
+ if (!validateMovrels(Inst, Operands)) {
return false;
}
if (!validateFlatOffset(Inst, Operands)) {
@@ -3662,7 +3890,14 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
if (!validateSMEMOffset(Inst, Operands)) {
return false;
}
- if (!validateMAIAccWrite(Inst)) {
+ if (!validateMAIAccWrite(Inst, Operands)) {
+ return false;
+ }
+ if (!validateDivScale(Inst)) {
+ Error(IDLoc, "ABS not allowed in VOP3B instructions");
+ return false;
+ }
+ if (!validateCoherencyBits(Inst, Operands, IDLoc)) {
return false;
}
@@ -3673,6 +3908,57 @@ static std::string AMDGPUMnemonicSpellCheck(StringRef S,
const FeatureBitset &FBS,
unsigned VariantID = 0);
+static bool AMDGPUCheckMnemonic(StringRef Mnemonic,
+ const FeatureBitset &AvailableFeatures,
+ unsigned VariantID);
+
+bool AMDGPUAsmParser::isSupportedMnemo(StringRef Mnemo,
+ const FeatureBitset &FBS) {
+ return isSupportedMnemo(Mnemo, FBS, getAllVariants());
+}
+
+bool AMDGPUAsmParser::isSupportedMnemo(StringRef Mnemo,
+ const FeatureBitset &FBS,
+ ArrayRef<unsigned> Variants) {
+ for (auto Variant : Variants) {
+ if (AMDGPUCheckMnemonic(Mnemo, FBS, Variant))
+ return true;
+ }
+
+ return false;
+}
+
+bool AMDGPUAsmParser::checkUnsupportedInstruction(StringRef Mnemo,
+ const SMLoc &IDLoc) {
+ FeatureBitset FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
+
+ // Check if requested instruction variant is supported.
+ if (isSupportedMnemo(Mnemo, FBS, getMatchedVariants()))
+ return false;
+
+ // This instruction is not supported.
+ // Clear any other pending errors because they are no longer relevant.
+ getParser().clearPendingErrors();
+
+ // Requested instruction variant is not supported.
+ // Check if any other variants are supported.
+ StringRef VariantName = getMatchedVariantName();
+ if (!VariantName.empty() && isSupportedMnemo(Mnemo, FBS)) {
+ return Error(IDLoc,
+ Twine(VariantName,
+ " variant of this instruction is not supported"));
+ }
+
+ // Finally check if this instruction is supported on any other GPU.
+ if (isSupportedMnemo(Mnemo, FeatureBitset().set())) {
+ return Error(IDLoc, "instruction not supported on this GPU");
+ }
+
+ // Instruction not supported on any GPU. Probably a typo.
+ std::string Suggestion = AMDGPUMnemonicSpellCheck(Mnemo, FBS);
+ return Error(IDLoc, "invalid instruction" + Suggestion);
+}
+
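checkUnsupportedInstruction() turns the old blanket "invalid instruction" into a cascade: if the requested variant is supported, the matcher's own error stands; otherwise the parser reports that a different encoding variant would work, that the instruction exists on another GPU, or falls back to a spell-check suggestion. The decision order, sketched with placeholder predicates in place of the generated isSupportedMnemo() tables:

  #include <string>

  // Placeholder flags in place of the generated isSupportedMnemo() queries.
  struct MnemoInfo {
    bool SupportedInRequestedVariant;
    bool SupportedInOtherVariant;
    bool SupportedOnOtherGpu;
    std::string VariantName;   // "e32", "e64", "sdwa", "dpp" or empty
  };

  // Returns the diagnostic that would be emitted, or "" to let the normal
  // operand-matching error stand.
  std::string diagnoseMnemonic(const MnemoInfo &M, const std::string &Suggestion) {
    if (M.SupportedInRequestedVariant)
      return std::string();
    if (!M.VariantName.empty() && M.SupportedInOtherVariant)
      return M.VariantName + " variant of this instruction is not supported";
    if (M.SupportedOnOtherGpu)
      return "instruction not supported on this GPU";
    return "invalid instruction" + Suggestion;   // Suggestion may be empty
  }
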
bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
@@ -3702,27 +3988,28 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
break;
}
- switch (Result) {
- default: break;
- case Match_Success:
+ if (Result == Match_Success) {
if (!validateInstruction(Inst, IDLoc, Operands)) {
return true;
}
Inst.setLoc(IDLoc);
Out.emitInstruction(Inst, getSTI());
return false;
+ }
- case Match_MissingFeature:
- return Error(IDLoc, "instruction not supported on this GPU");
-
- case Match_MnemonicFail: {
- FeatureBitset FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
- std::string Suggestion = AMDGPUMnemonicSpellCheck(
- ((AMDGPUOperand &)*Operands[0]).getToken(), FBS);
- return Error(IDLoc, "invalid instruction" + Suggestion,
- ((AMDGPUOperand &)*Operands[0]).getLocRange());
+ StringRef Mnemo = ((AMDGPUOperand &)*Operands[0]).getToken();
+ if (checkUnsupportedInstruction(Mnemo, IDLoc)) {
+ return true;
}
+ switch (Result) {
+ default: break;
+ case Match_MissingFeature:
+ // It has been verified that the specified instruction
+ // mnemonic is valid. A match was found but it requires
+ // features which are not supported on this GPU.
+ return Error(IDLoc, "operands are not valid for this GPU or mode");
+
case Match_InvalidOperand: {
SMLoc ErrorLoc = IDLoc;
if (ErrorInfo != ~0ULL) {
@@ -3739,13 +4026,15 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_PreferE32:
return Error(IDLoc, "internal error: instruction without _e64 suffix "
"should be encoded as e32");
+ case Match_MnemonicFail:
+ llvm_unreachable("Invalid instructions should have been handled already");
}
llvm_unreachable("Implement any new match types added!");
}
bool AMDGPUAsmParser::ParseAsAbsoluteExpression(uint32_t &Ret) {
int64_t Tmp = -1;
- if (getLexer().isNot(AsmToken::Integer) && getLexer().isNot(AsmToken::Identifier)) {
+ if (!isToken(AsmToken::Integer) && !isToken(AsmToken::Identifier)) {
return true;
}
if (getParser().parseAbsoluteExpression(Tmp)) {
@@ -3760,9 +4049,8 @@ bool AMDGPUAsmParser::ParseDirectiveMajorMinor(uint32_t &Major,
if (ParseAsAbsoluteExpression(Major))
return TokError("invalid major version");
- if (getLexer().isNot(AsmToken::Comma))
+ if (!trySkipToken(AsmToken::Comma))
return TokError("minor version number required, comma expected");
- Lex();
if (ParseAsAbsoluteExpression(Minor))
return TokError("invalid minor version");
@@ -3776,25 +4064,24 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGCNTarget() {
std::string Target;
- SMLoc TargetStart = getTok().getLoc();
+ SMLoc TargetStart = getLoc();
if (getParser().parseEscapedString(Target))
return true;
- SMRange TargetRange = SMRange(TargetStart, getTok().getLoc());
+ SMRange TargetRange = SMRange(TargetStart, getLoc());
std::string ExpectedTarget;
raw_string_ostream ExpectedTargetOS(ExpectedTarget);
IsaInfo::streamIsaVersion(&getSTI(), ExpectedTargetOS);
if (Target != ExpectedTargetOS.str())
- return getParser().Error(TargetRange.Start, "target must match options",
- TargetRange);
+ return Error(TargetRange.Start, "target must match options", TargetRange);
getTargetStreamer().EmitDirectiveAMDGCNTarget(Target);
return false;
}
bool AMDGPUAsmParser::OutOfRangeError(SMRange Range) {
- return getParser().Error(Range.Start, "value out of range", Range);
+ return Error(Range.Start, "value out of range", Range);
}
bool AMDGPUAsmParser::calculateGPRBlocks(
@@ -3865,15 +4152,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
Optional<bool> EnableWavefrontSize32;
while (true) {
- while (getLexer().is(AsmToken::EndOfStatement))
- Lex();
-
- if (getLexer().isNot(AsmToken::Identifier))
- return TokError("expected .amdhsa_ directive or .end_amdhsa_kernel");
+ while (trySkipToken(AsmToken::EndOfStatement));
- StringRef ID = getTok().getIdentifier();
+ StringRef ID;
SMRange IDRange = getTok().getLocRange();
- Lex();
+ if (!parseId(ID, "expected .amdhsa_ directive or .end_amdhsa_kernel"))
+ return true;
if (ID == ".end_amdhsa_kernel")
break;
@@ -3882,11 +4166,11 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
return TokError(".amdhsa_ directives cannot be repeated");
Seen.insert(ID);
- SMLoc ValStart = getTok().getLoc();
+ SMLoc ValStart = getLoc();
int64_t IVal;
if (getParser().parseAbsoluteExpression(IVal))
return true;
- SMLoc ValEnd = getTok().getLoc();
+ SMLoc ValEnd = getLoc();
SMRange ValRange = SMRange(ValStart, ValEnd);
if (IVal < 0)
@@ -3951,8 +4235,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
UserSGPRCount += 1;
} else if (ID == ".amdhsa_wavefront_size32") {
if (IVersion.Major < 10)
- return getParser().Error(IDRange.Start, "directive requires gfx10+",
- IDRange);
+ return Error(IDRange.Start, "directive requires gfx10+", IDRange);
EnableWavefrontSize32 = Val;
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
@@ -3960,7 +4243,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
} else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") {
PARSE_BITS_ENTRY(
KD.compute_pgm_rsrc2,
- COMPUTE_PGM_RSRC2_ENABLE_SGPR_PRIVATE_SEGMENT_WAVEFRONT_OFFSET, Val,
+ COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val,
ValRange);
} else if (ID == ".amdhsa_system_sgpr_workgroup_id_x") {
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
@@ -3994,15 +4277,13 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
ReserveVCC = Val;
} else if (ID == ".amdhsa_reserve_flat_scratch") {
if (IVersion.Major < 7)
- return getParser().Error(IDRange.Start, "directive requires gfx7+",
- IDRange);
+ return Error(IDRange.Start, "directive requires gfx7+", IDRange);
if (!isUInt<1>(Val))
return OutOfRangeError(ValRange);
ReserveFlatScr = Val;
} else if (ID == ".amdhsa_reserve_xnack_mask") {
if (IVersion.Major < 8)
- return getParser().Error(IDRange.Start, "directive requires gfx8+",
- IDRange);
+ return Error(IDRange.Start, "directive requires gfx8+", IDRange);
if (!isUInt<1>(Val))
return OutOfRangeError(ValRange);
ReserveXNACK = Val;
@@ -4027,26 +4308,22 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
Val, ValRange);
} else if (ID == ".amdhsa_fp16_overflow") {
if (IVersion.Major < 9)
- return getParser().Error(IDRange.Start, "directive requires gfx9+",
- IDRange);
+ return Error(IDRange.Start, "directive requires gfx9+", IDRange);
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FP16_OVFL, Val,
ValRange);
} else if (ID == ".amdhsa_workgroup_processor_mode") {
if (IVersion.Major < 10)
- return getParser().Error(IDRange.Start, "directive requires gfx10+",
- IDRange);
+ return Error(IDRange.Start, "directive requires gfx10+", IDRange);
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_WGP_MODE, Val,
ValRange);
} else if (ID == ".amdhsa_memory_ordered") {
if (IVersion.Major < 10)
- return getParser().Error(IDRange.Start, "directive requires gfx10+",
- IDRange);
+ return Error(IDRange.Start, "directive requires gfx10+", IDRange);
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_MEM_ORDERED, Val,
ValRange);
} else if (ID == ".amdhsa_forward_progress") {
if (IVersion.Major < 10)
- return getParser().Error(IDRange.Start, "directive requires gfx10+",
- IDRange);
+ return Error(IDRange.Start, "directive requires gfx10+", IDRange);
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FWD_PROGRESS, Val,
ValRange);
} else if (ID == ".amdhsa_exception_fp_ieee_invalid_op") {
@@ -4080,8 +4357,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO,
Val, ValRange);
} else {
- return getParser().Error(IDRange.Start,
- "unknown .amdhsa_kernel directive", IDRange);
+ return Error(IDRange.Start, "unknown .amdhsa_kernel directive", IDRange);
}
#undef PARSE_BITS_ENTRY
@@ -4145,7 +4421,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
// If this directive has no arguments, then use the ISA version for the
// targeted GPU.
- if (getLexer().is(AsmToken::EndOfStatement)) {
+ if (isToken(AsmToken::EndOfStatement)) {
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
getTargetStreamer().EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor,
ISA.Stepping,
@@ -4156,32 +4432,23 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
if (ParseDirectiveMajorMinor(Major, Minor))
return true;
- if (getLexer().isNot(AsmToken::Comma))
+ if (!trySkipToken(AsmToken::Comma))
return TokError("stepping version number required, comma expected");
- Lex();
if (ParseAsAbsoluteExpression(Stepping))
return TokError("invalid stepping version");
- if (getLexer().isNot(AsmToken::Comma))
+ if (!trySkipToken(AsmToken::Comma))
return TokError("vendor name required, comma expected");
- Lex();
-
- if (getLexer().isNot(AsmToken::String))
- return TokError("invalid vendor name");
- VendorName = getLexer().getTok().getStringContents();
- Lex();
+ if (!parseString(VendorName, "invalid vendor name"))
+ return true;
- if (getLexer().isNot(AsmToken::Comma))
+ if (!trySkipToken(AsmToken::Comma))
return TokError("arch name required, comma expected");
- Lex();
- if (getLexer().isNot(AsmToken::String))
- return TokError("invalid arch name");
-
- ArchName = getLexer().getTok().getStringContents();
- Lex();
+ if (!parseString(ArchName, "invalid arch name"))
+ return true;
getTargetStreamer().EmitDirectiveHSACodeObjectISA(Major, Minor, Stepping,
VendorName, ArchName);
@@ -4206,7 +4473,7 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
if (ID == "enable_wavefront_size32") {
if (Header.code_properties & AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32) {
- if (!isGFX10())
+ if (!isGFX10Plus())
return TokError("enable_wavefront_size32=1 is only allowed on GFX10+");
if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize32])
return TokError("enable_wavefront_size32=1 requires +WavefrontSize32");
@@ -4218,7 +4485,7 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
if (ID == "wavefront_size") {
if (Header.wavefront_size == 5) {
- if (!isGFX10())
+ if (!isGFX10Plus())
return TokError("wavefront_size=5 is only allowed on GFX10+");
if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize32])
return TokError("wavefront_size=5 requires +WavefrontSize32");
@@ -4229,17 +4496,20 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
}
if (ID == "enable_wgp_mode") {
- if (G_00B848_WGP_MODE(Header.compute_pgm_resource_registers) && !isGFX10())
+ if (G_00B848_WGP_MODE(Header.compute_pgm_resource_registers) &&
+ !isGFX10Plus())
return TokError("enable_wgp_mode=1 is only allowed on GFX10+");
}
if (ID == "enable_mem_ordered") {
- if (G_00B848_MEM_ORDERED(Header.compute_pgm_resource_registers) && !isGFX10())
+ if (G_00B848_MEM_ORDERED(Header.compute_pgm_resource_registers) &&
+ !isGFX10Plus())
return TokError("enable_mem_ordered=1 is only allowed on GFX10+");
}
if (ID == "enable_fwd_progress") {
- if (G_00B848_FWD_PROGRESS(Header.compute_pgm_resource_registers) && !isGFX10())
+ if (G_00B848_FWD_PROGRESS(Header.compute_pgm_resource_registers) &&
+ !isGFX10Plus())
return TokError("enable_fwd_progress=1 is only allowed on GFX10+");
}
@@ -4253,14 +4523,11 @@ bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() {
while (true) {
// Lex EndOfStatement. This is in a while loop, because lexing a comment
// will set the current token to EndOfStatement.
- while(getLexer().is(AsmToken::EndOfStatement))
- Lex();
+ while (trySkipToken(AsmToken::EndOfStatement));
- if (getLexer().isNot(AsmToken::Identifier))
- return TokError("expected value identifier or .end_amd_kernel_code_t");
-
- StringRef ID = getLexer().getTok().getIdentifier();
- Lex();
+ StringRef ID;
+ if (!parseId(ID, "expected value identifier or .end_amd_kernel_code_t"))
+ return true;
if (ID == ".end_amd_kernel_code_t")
break;
@@ -4275,34 +4542,32 @@ bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() {
}
bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() {
- if (getLexer().isNot(AsmToken::Identifier))
- return TokError("expected symbol name");
-
- StringRef KernelName = Parser.getTok().getString();
+ StringRef KernelName;
+ if (!parseId(KernelName, "expected symbol name"))
+ return true;
getTargetStreamer().EmitAMDGPUSymbolType(KernelName,
ELF::STT_AMDGPU_HSA_KERNEL);
- Lex();
- if (!AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI()))
- KernelScope.initialize(getContext());
+
+ KernelScope.initialize(getContext());
return false;
}
bool AMDGPUAsmParser::ParseDirectiveISAVersion() {
if (getSTI().getTargetTriple().getArch() != Triple::amdgcn) {
- return Error(getParser().getTok().getLoc(),
+ return Error(getLoc(),
".amd_amdgpu_isa directive is not available on non-amdgcn "
"architectures");
}
- auto ISAVersionStringFromASM = getLexer().getTok().getStringContents();
+ auto ISAVersionStringFromASM = getToken().getStringContents();
std::string ISAVersionStringFromSTI;
raw_string_ostream ISAVersionStreamFromSTI(ISAVersionStringFromSTI);
IsaInfo::streamIsaVersion(&getSTI(), ISAVersionStreamFromSTI);
if (ISAVersionStringFromASM != ISAVersionStreamFromSTI.str()) {
- return Error(getParser().getTok().getLoc(),
+ return Error(getLoc(),
".amd_amdgpu_isa directive does not match triple and/or mcpu "
"arguments specified through the command line");
}
@@ -4317,14 +4582,14 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
const char *AssemblerDirectiveBegin;
const char *AssemblerDirectiveEnd;
std::tie(AssemblerDirectiveBegin, AssemblerDirectiveEnd) =
- AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())
+ isHsaAbiVersion3(&getSTI())
? std::make_tuple(HSAMD::V3::AssemblerDirectiveBegin,
HSAMD::V3::AssemblerDirectiveEnd)
: std::make_tuple(HSAMD::AssemblerDirectiveBegin,
HSAMD::AssemblerDirectiveEnd);
if (getSTI().getTargetTriple().getOS() != Triple::AMDHSA) {
- return Error(getParser().getTok().getLoc(),
+ return Error(getLoc(),
(Twine(AssemblerDirectiveBegin) + Twine(" directive is "
"not available on non-amdhsa OSes")).str());
}
@@ -4334,12 +4599,12 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
HSAMetadataString))
return true;
- if (IsaInfo::hasCodeObjectV3(&getSTI())) {
+ if (isHsaAbiVersion3(&getSTI())) {
if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString))
- return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
+ return Error(getLoc(), "invalid HSA metadata");
} else {
if (!getTargetStreamer().EmitHSAMetadataV2(HSAMetadataString))
- return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
+ return Error(getLoc(), "invalid HSA metadata");
}
return false;
@@ -4356,19 +4621,15 @@ bool AMDGPUAsmParser::ParseToEndDirective(const char *AssemblerDirectiveBegin,
getLexer().setSkipSpace(false);
bool FoundEnd = false;
- while (!getLexer().is(AsmToken::Eof)) {
- while (getLexer().is(AsmToken::Space)) {
- CollectStream << getLexer().getTok().getString();
+ while (!isToken(AsmToken::Eof)) {
+ while (isToken(AsmToken::Space)) {
+ CollectStream << getTokenStr();
Lex();
}
- if (getLexer().is(AsmToken::Identifier)) {
- StringRef ID = getLexer().getTok().getIdentifier();
- if (ID == AssemblerDirectiveEnd) {
- Lex();
- FoundEnd = true;
- break;
- }
+ if (trySkipId(AssemblerDirectiveEnd)) {
+ FoundEnd = true;
+ break;
}
CollectStream << Parser.parseStringToEndOfStatement()
@@ -4379,7 +4640,7 @@ bool AMDGPUAsmParser::ParseToEndDirective(const char *AssemblerDirectiveBegin,
getLexer().setSkipSpace(true);
- if (getLexer().is(AsmToken::Eof) && !FoundEnd) {
+ if (isToken(AsmToken::Eof) && !FoundEnd) {
return TokError(Twine("expected directive ") +
Twine(AssemblerDirectiveEnd) + Twine(" not found"));
}
@@ -4397,14 +4658,14 @@ bool AMDGPUAsmParser::ParseDirectivePALMetadataBegin() {
auto PALMetadata = getTargetStreamer().getPALMetadata();
if (!PALMetadata->setFromString(String))
- return Error(getParser().getTok().getLoc(), "invalid PAL metadata");
+ return Error(getLoc(), "invalid PAL metadata");
return false;
}
/// Parse the assembler directive for old linear-format PAL metadata.
bool AMDGPUAsmParser::ParseDirectivePALMetadata() {
if (getSTI().getTargetTriple().getOS() != Triple::AMDPAL) {
- return Error(getParser().getTok().getLoc(),
+ return Error(getLoc(),
(Twine(PALMD::AssemblerDirective) + Twine(" directive is "
"not available on non-amdpal OSes")).str());
}
@@ -4417,19 +4678,17 @@ bool AMDGPUAsmParser::ParseDirectivePALMetadata() {
return TokError(Twine("invalid value in ") +
Twine(PALMD::AssemblerDirective));
}
- if (getLexer().isNot(AsmToken::Comma)) {
+ if (!trySkipToken(AsmToken::Comma)) {
return TokError(Twine("expected an even number of values in ") +
Twine(PALMD::AssemblerDirective));
}
- Lex();
if (ParseAsAbsoluteExpression(Value)) {
return TokError(Twine("invalid value in ") +
Twine(PALMD::AssemblerDirective));
}
PALMetadata->setRegister(Key, Value);
- if (getLexer().isNot(AsmToken::Comma))
+ if (!trySkipToken(AsmToken::Comma))
break;
- Lex();
}
return false;
}
@@ -4441,7 +4700,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() {
return true;
StringRef Name;
- SMLoc NameLoc = getLexer().getLoc();
+ SMLoc NameLoc = getLoc();
if (getParser().parseIdentifier(Name))
return TokError("expected identifier in directive");
@@ -4452,7 +4711,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() {
unsigned LocalMemorySize = AMDGPU::IsaInfo::getLocalMemorySize(&getSTI());
int64_t Size;
- SMLoc SizeLoc = getLexer().getLoc();
+ SMLoc SizeLoc = getLoc();
if (getParser().parseAbsoluteExpression(Size))
return true;
if (Size < 0)
@@ -4461,9 +4720,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() {
return Error(SizeLoc, "size is too large");
int64_t Alignment = 4;
- if (getLexer().is(AsmToken::Comma)) {
- Lex();
- SMLoc AlignLoc = getLexer().getLoc();
+ if (trySkipToken(AsmToken::Comma)) {
+ SMLoc AlignLoc = getLoc();
if (getParser().parseAbsoluteExpression(Alignment))
return true;
if (Alignment < 0 || !isPowerOf2_64(Alignment))
@@ -4491,7 +4749,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() {
bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getString();
- if (AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) {
+ if (isHsaAbiVersion3(&getSTI())) {
if (IDVal == ".amdgcn_target")
return ParseDirectiveAMDGCNTarget();
@@ -4539,7 +4797,7 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
for (MCRegAliasIterator R(AMDGPU::TTMP12_TTMP13_TTMP14_TTMP15, &MRI, true);
R.isValid(); ++R) {
if (*R == RegNo)
- return isGFX9() || isGFX10();
+ return isGFX9Plus();
}
// GFX10 has 2 more SGPRs 104 and 105.
@@ -4555,20 +4813,20 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
case AMDGPU::SRC_PRIVATE_BASE:
case AMDGPU::SRC_PRIVATE_LIMIT:
case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
- return !isCI() && !isSI() && !isVI();
+ return isGFX9Plus();
case AMDGPU::TBA:
case AMDGPU::TBA_LO:
case AMDGPU::TBA_HI:
case AMDGPU::TMA:
case AMDGPU::TMA_LO:
case AMDGPU::TMA_HI:
- return !isGFX9() && !isGFX10();
+ return !isGFX9Plus();
case AMDGPU::XNACK_MASK:
case AMDGPU::XNACK_MASK_LO:
case AMDGPU::XNACK_MASK_HI:
- return !isCI() && !isSI() && !isGFX10() && hasXNACK();
+ return (isVI() || isGFX9()) && hasXNACK();
case AMDGPU::SGPR_NULL:
- return isGFX10();
+ return isGFX10Plus();
default:
break;
}
@@ -4576,7 +4834,7 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
if (isCI())
return true;
- if (isSI() || isGFX10()) {
+ if (isSI() || isGFX10Plus()) {
// No flat_scr on SI.
// On GFX10 flat scratch is not a valid register operand and can only be
// accessed with s_setreg/s_getreg.
@@ -4614,35 +4872,33 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic,
// are appending default values to the Operands list. This is only done
// by custom parser, so we shouldn't continue on to the generic parsing.
if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail ||
- getLexer().is(AsmToken::EndOfStatement))
+ isToken(AsmToken::EndOfStatement))
return ResTy;
- if (Mode == OperandMode_NSA && getLexer().is(AsmToken::LBrac)) {
+ SMLoc RBraceLoc;
+ SMLoc LBraceLoc = getLoc();
+ if (Mode == OperandMode_NSA && trySkipToken(AsmToken::LBrac)) {
unsigned Prefix = Operands.size();
- SMLoc LBraceLoc = getTok().getLoc();
- Parser.Lex(); // eat the '['
for (;;) {
ResTy = parseReg(Operands);
if (ResTy != MatchOperand_Success)
return ResTy;
- if (getLexer().is(AsmToken::RBrac))
+ RBraceLoc = getLoc();
+ if (trySkipToken(AsmToken::RBrac))
break;
- if (getLexer().isNot(AsmToken::Comma))
+ if (!trySkipToken(AsmToken::Comma))
return MatchOperand_ParseFail;
- Parser.Lex();
}
if (Operands.size() - Prefix > 1) {
Operands.insert(Operands.begin() + Prefix,
AMDGPUOperand::CreateToken(this, "[", LBraceLoc));
- Operands.push_back(AMDGPUOperand::CreateToken(this, "]",
- getTok().getLoc()));
+ Operands.push_back(AMDGPUOperand::CreateToken(this, "]", RBraceLoc));
}
- Parser.Lex(); // eat the ']'
return MatchOperand_Success;
}
@@ -4680,32 +4936,28 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
bool IsMIMG = Name.startswith("image_");
- while (!getLexer().is(AsmToken::EndOfStatement)) {
+ while (!trySkipToken(AsmToken::EndOfStatement)) {
OperandMode Mode = OperandMode_Default;
- if (IsMIMG && isGFX10() && Operands.size() == 2)
+ if (IsMIMG && isGFX10Plus() && Operands.size() == 2)
Mode = OperandMode_NSA;
OperandMatchResultTy Res = parseOperand(Operands, Name, Mode);
// Eat the comma or space if there is one.
- if (getLexer().is(AsmToken::Comma))
- Parser.Lex();
+ trySkipToken(AsmToken::Comma);
- switch (Res) {
- case MatchOperand_Success: break;
- case MatchOperand_ParseFail:
+ if (Res != MatchOperand_Success) {
+ checkUnsupportedInstruction(Name, NameLoc);
+ if (!Parser.hasPendingError()) {
// FIXME: use real operand location rather than the current location.
- Error(getLexer().getLoc(), "failed parsing operand.");
- while (!getLexer().is(AsmToken::EndOfStatement)) {
- Parser.Lex();
- }
- return true;
- case MatchOperand_NoMatch:
- // FIXME: use real operand location rather than the current location.
- Error(getLexer().getLoc(), "not a valid operand.");
- while (!getLexer().is(AsmToken::EndOfStatement)) {
- Parser.Lex();
- }
- return true;
+ StringRef Msg =
+ (Res == MatchOperand_ParseFail) ? "failed parsing operand." :
+ "not a valid operand.";
+ Error(getLoc(), Msg);
+ }
+ while (!trySkipToken(AsmToken::EndOfStatement)) {
+ lex();
+ }
+ return true;
}
}
@@ -4794,14 +5046,14 @@ OperandMatchResultTy
AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
AMDGPUOperand::ImmTy ImmTy) {
int64_t Bit = 0;
- SMLoc S = Parser.getTok().getLoc();
+ SMLoc S = getLoc();
// We are at the end of the statement, and this is a default argument, so
// use a default value.
- if (getLexer().isNot(AsmToken::EndOfStatement)) {
- switch(getLexer().getKind()) {
+ if (!isToken(AsmToken::EndOfStatement)) {
+ switch(getTokenKind()) {
case AsmToken::Identifier: {
- StringRef Tok = Parser.getTok().getString();
+ StringRef Tok = getTokenStr();
if (Tok == Name) {
if (Tok == "r128" && !hasMIMG_R128())
Error(S, "r128 modifier is not supported on this GPU");
@@ -4822,7 +5074,7 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
}
}
- if (!isGFX10() && ImmTy == AMDGPUOperand::ImmTyDLC)
+ if (!isGFX10Plus() && ImmTy == AMDGPUOperand::ImmTyDLC)
return MatchOperand_ParseFail;
if (isGFX9() && ImmTy == AMDGPUOperand::ImmTyA16)
@@ -4847,73 +5099,273 @@ static void addOptionalImmOperand(
}
OperandMatchResultTy
-AMDGPUAsmParser::parseStringWithPrefix(StringRef Prefix, StringRef &Value) {
- if (getLexer().isNot(AsmToken::Identifier)) {
+AMDGPUAsmParser::parseStringWithPrefix(StringRef Prefix,
+ StringRef &Value,
+ SMLoc &StringLoc) {
+ if (!trySkipId(Prefix, AsmToken::Colon))
return MatchOperand_NoMatch;
+
+ StringLoc = getLoc();
+ return parseId(Value, "expected an identifier") ? MatchOperand_Success
+ : MatchOperand_ParseFail;
+}
+
+//===----------------------------------------------------------------------===//
+// MTBUF format
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUAsmParser::tryParseFmt(const char *Pref,
+ int64_t MaxVal,
+ int64_t &Fmt) {
+ int64_t Val;
+ SMLoc Loc = getLoc();
+
+ auto Res = parseIntWithPrefix(Pref, Val);
+ if (Res == MatchOperand_ParseFail)
+ return false;
+ if (Res == MatchOperand_NoMatch)
+ return true;
+
+ if (Val < 0 || Val > MaxVal) {
+ Error(Loc, Twine("out of range ", StringRef(Pref)));
+ return false;
}
- StringRef Tok = Parser.getTok().getString();
- if (Tok != Prefix) {
+
+ Fmt = Val;
+ return true;
+}
+
+// dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their
+// values to live in a joint format operand in the MCInst encoding.
+OperandMatchResultTy
+AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) {
+ using namespace llvm::AMDGPU::MTBUFFormat;
+
+ int64_t Dfmt = DFMT_UNDEF;
+ int64_t Nfmt = NFMT_UNDEF;
+
+ // dfmt and nfmt can appear in either order, and each is optional.
+ for (int I = 0; I < 2; ++I) {
+ if (Dfmt == DFMT_UNDEF && !tryParseFmt("dfmt", DFMT_MAX, Dfmt))
+ return MatchOperand_ParseFail;
+
+ if (Nfmt == NFMT_UNDEF && !tryParseFmt("nfmt", NFMT_MAX, Nfmt)) {
+ return MatchOperand_ParseFail;
+ }
+ // Skip optional comma between dfmt/nfmt
+ // but guard against 2 commas following each other.
+ if ((Dfmt == DFMT_UNDEF) != (Nfmt == NFMT_UNDEF) &&
+ !peekToken().is(AsmToken::Comma)) {
+ trySkipToken(AsmToken::Comma);
+ }
+ }
+
+ if (Dfmt == DFMT_UNDEF && Nfmt == NFMT_UNDEF)
return MatchOperand_NoMatch;
+
+ Dfmt = (Dfmt == DFMT_UNDEF) ? DFMT_DEFAULT : Dfmt;
+ Nfmt = (Nfmt == NFMT_UNDEF) ? NFMT_DEFAULT : Nfmt;
+
+ Format = encodeDfmtNfmt(Dfmt, Nfmt);
+ return MatchOperand_Success;
+}
+
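The joint MTBUF format operand keeps the layout the removed code spelled out literally (Dfmt | Nfmt << 4, with dfmt limited to 4 bits and nfmt to 3); the new path merely routes through encodeDfmtNfmt() and the named *_UNDEF/*_DEFAULT constants. A minimal reimplementation for illustration:

  #include <cstdint>

  constexpr int64_t encodeDfmtNfmtSketch(int64_t Dfmt, int64_t Nfmt) {
    return Dfmt | (Nfmt << 4);   // dfmt: bits [3:0], nfmt: bits [6:4]
  }

  static_assert(encodeDfmtNfmtSketch(1, 0) == 1, "dfmt occupies the low bits");
  static_assert(encodeDfmtNfmtSketch(0, 1) == 16, "nfmt is shifted left by 4");
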
+OperandMatchResultTy
+AMDGPUAsmParser::parseUfmt(int64_t &Format) {
+ using namespace llvm::AMDGPU::MTBUFFormat;
+
+ int64_t Fmt = UFMT_UNDEF;
+
+ if (!tryParseFmt("format", UFMT_MAX, Fmt))
+ return MatchOperand_ParseFail;
+
+ if (Fmt == UFMT_UNDEF)
+ return MatchOperand_NoMatch;
+
+ Format = Fmt;
+ return MatchOperand_Success;
+}
+
+bool AMDGPUAsmParser::matchDfmtNfmt(int64_t &Dfmt,
+ int64_t &Nfmt,
+ StringRef FormatStr,
+ SMLoc Loc) {
+ using namespace llvm::AMDGPU::MTBUFFormat;
+ int64_t Format;
+
+ Format = getDfmt(FormatStr);
+ if (Format != DFMT_UNDEF) {
+ Dfmt = Format;
+ return true;
}
- Parser.Lex();
- if (getLexer().isNot(AsmToken::Colon)) {
+ Format = getNfmt(FormatStr, getSTI());
+ if (Format != NFMT_UNDEF) {
+ Nfmt = Format;
+ return true;
+ }
+
+ Error(Loc, "unsupported format");
+ return false;
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseSymbolicSplitFormat(StringRef FormatStr,
+ SMLoc FormatLoc,
+ int64_t &Format) {
+ using namespace llvm::AMDGPU::MTBUFFormat;
+
+ int64_t Dfmt = DFMT_UNDEF;
+ int64_t Nfmt = NFMT_UNDEF;
+ if (!matchDfmtNfmt(Dfmt, Nfmt, FormatStr, FormatLoc))
return MatchOperand_ParseFail;
+
+ if (trySkipToken(AsmToken::Comma)) {
+ StringRef Str;
+ SMLoc Loc = getLoc();
+ if (!parseId(Str, "expected a format string") ||
+ !matchDfmtNfmt(Dfmt, Nfmt, Str, Loc)) {
+ return MatchOperand_ParseFail;
+ }
+ if (Dfmt == DFMT_UNDEF) {
+ Error(Loc, "duplicate numeric format");
+ return MatchOperand_ParseFail;
+ } else if (Nfmt == NFMT_UNDEF) {
+ Error(Loc, "duplicate data format");
+ return MatchOperand_ParseFail;
+ }
}
- Parser.Lex();
- if (getLexer().isNot(AsmToken::Identifier)) {
+ Dfmt = (Dfmt == DFMT_UNDEF) ? DFMT_DEFAULT : Dfmt;
+ Nfmt = (Nfmt == NFMT_UNDEF) ? NFMT_DEFAULT : Nfmt;
+
+ if (isGFX10Plus()) {
+ auto Ufmt = convertDfmtNfmt2Ufmt(Dfmt, Nfmt);
+ if (Ufmt == UFMT_UNDEF) {
+ Error(FormatLoc, "unsupported format");
+ return MatchOperand_ParseFail;
+ }
+ Format = Ufmt;
+ } else {
+ Format = encodeDfmtNfmt(Dfmt, Nfmt);
+ }
+
+ return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseSymbolicUnifiedFormat(StringRef FormatStr,
+ SMLoc Loc,
+ int64_t &Format) {
+ using namespace llvm::AMDGPU::MTBUFFormat;
+
+ auto Id = getUnifiedFormat(FormatStr);
+ if (Id == UFMT_UNDEF)
+ return MatchOperand_NoMatch;
+
+ if (!isGFX10Plus()) {
+ Error(Loc, "unified format is not supported on this GPU");
return MatchOperand_ParseFail;
}
- Value = Parser.getTok().getString();
+ Format = Id;
return MatchOperand_Success;
}
-// dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their
-// values to live in a joint format operand in the MCInst encoding.
OperandMatchResultTy
-AMDGPUAsmParser::parseDfmtNfmt(OperandVector &Operands) {
- SMLoc S = Parser.getTok().getLoc();
- int64_t Dfmt = 0, Nfmt = 0;
- // dfmt and nfmt can appear in either order, and each is optional.
- bool GotDfmt = false, GotNfmt = false;
- while (!GotDfmt || !GotNfmt) {
- if (!GotDfmt) {
- auto Res = parseIntWithPrefix("dfmt", Dfmt);
- if (Res != MatchOperand_NoMatch) {
- if (Res != MatchOperand_Success)
- return Res;
- if (Dfmt >= 16) {
- Error(Parser.getTok().getLoc(), "out of range dfmt");
- return MatchOperand_ParseFail;
- }
- GotDfmt = true;
- Parser.Lex();
- continue;
- }
- }
- if (!GotNfmt) {
- auto Res = parseIntWithPrefix("nfmt", Nfmt);
- if (Res != MatchOperand_NoMatch) {
- if (Res != MatchOperand_Success)
- return Res;
- if (Nfmt >= 8) {
- Error(Parser.getTok().getLoc(), "out of range nfmt");
- return MatchOperand_ParseFail;
- }
- GotNfmt = true;
- Parser.Lex();
- continue;
- }
- }
- break;
+AMDGPUAsmParser::parseNumericFormat(int64_t &Format) {
+ using namespace llvm::AMDGPU::MTBUFFormat;
+ SMLoc Loc = getLoc();
+
+ if (!parseExpr(Format))
+ return MatchOperand_ParseFail;
+ if (!isValidFormatEncoding(Format, getSTI())) {
+ Error(Loc, "out of range format");
+ return MatchOperand_ParseFail;
}
- if (!GotDfmt && !GotNfmt)
+
+ return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseSymbolicOrNumericFormat(int64_t &Format) {
+ using namespace llvm::AMDGPU::MTBUFFormat;
+
+ if (!trySkipId("format", AsmToken::Colon))
return MatchOperand_NoMatch;
- auto Format = Dfmt | Nfmt << 4;
+
+ if (trySkipToken(AsmToken::LBrac)) {
+ StringRef FormatStr;
+ SMLoc Loc = getLoc();
+ if (!parseId(FormatStr, "expected a format string"))
+ return MatchOperand_ParseFail;
+
+ auto Res = parseSymbolicUnifiedFormat(FormatStr, Loc, Format);
+ if (Res == MatchOperand_NoMatch)
+ Res = parseSymbolicSplitFormat(FormatStr, Loc, Format);
+ if (Res != MatchOperand_Success)
+ return Res;
+
+ if (!skipToken(AsmToken::RBrac, "expected a closing square bracket"))
+ return MatchOperand_ParseFail;
+
+ return MatchOperand_Success;
+ }
+
+ return parseNumericFormat(Format);
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseFORMAT(OperandVector &Operands) {
+ using namespace llvm::AMDGPU::MTBUFFormat;
+
+ int64_t Format = getDefaultFormatEncoding(getSTI());
+ OperandMatchResultTy Res;
+ SMLoc Loc = getLoc();
+
+ // Parse legacy format syntax.
+ Res = isGFX10Plus() ? parseUfmt(Format) : parseDfmtNfmt(Format);
+ if (Res == MatchOperand_ParseFail)
+ return Res;
+
+ bool FormatFound = (Res == MatchOperand_Success);
+
Operands.push_back(
- AMDGPUOperand::CreateImm(this, Format, S, AMDGPUOperand::ImmTyFORMAT));
+ AMDGPUOperand::CreateImm(this, Format, Loc, AMDGPUOperand::ImmTyFORMAT));
+
+ if (FormatFound)
+ trySkipToken(AsmToken::Comma);
+
+ if (isToken(AsmToken::EndOfStatement)) {
+ // We are expecting an soffset operand,
+ // but let the matcher handle the error.
+ return MatchOperand_Success;
+ }
+
+ // Parse soffset.
+ Res = parseRegOrImm(Operands);
+ if (Res != MatchOperand_Success)
+ return Res;
+
+ trySkipToken(AsmToken::Comma);
+
+ if (!FormatFound) {
+ Res = parseSymbolicOrNumericFormat(Format);
+ if (Res == MatchOperand_ParseFail)
+ return Res;
+ if (Res == MatchOperand_Success) {
+ auto Size = Operands.size();
+ AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands[Size - 2]);
+ assert(Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyFORMAT);
+ Op.setImm(Format);
+ }
+ return MatchOperand_Success;
+ }
+
+ if (isId("format") && peekToken().is(AsmToken::Colon)) {
+ Error(getLoc(), "duplicate format");
+ return MatchOperand_ParseFail;
+ }
return MatchOperand_Success;
}
@@ -5122,12 +5574,14 @@ AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
int64_t Waitcnt = getWaitcntBitMask(ISA);
SMLoc S = getLoc();
- // If parse failed, do not return error code
- // to avoid excessive error messages.
if (isToken(AsmToken::Identifier) && peekToken().is(AsmToken::LParen)) {
- while (parseCnt(Waitcnt) && !isToken(AsmToken::EndOfStatement));
+ while (!isToken(AsmToken::EndOfStatement)) {
+ if (!parseCnt(Waitcnt))
+ return MatchOperand_ParseFail;
+ }
} else {
- parseExpr(Waitcnt);
+ if (!parseExpr(Waitcnt))
+ return MatchOperand_ParseFail;
}
Operands.push_back(AMDGPUOperand::CreateImm(this, Waitcnt, S));
@@ -5145,16 +5599,17 @@ AMDGPUOperand::isSWaitCnt() const {
bool
AMDGPUAsmParser::parseHwregBody(OperandInfoTy &HwReg,
- int64_t &Offset,
- int64_t &Width) {
+ OperandInfoTy &Offset,
+ OperandInfoTy &Width) {
using namespace llvm::AMDGPU::Hwreg;
// The register may be specified by name or using a numeric code
+ HwReg.Loc = getLoc();
if (isToken(AsmToken::Identifier) &&
(HwReg.Id = getHwregId(getTokenStr())) >= 0) {
HwReg.IsSymbolic = true;
- lex(); // skip message name
- } else if (!parseExpr(HwReg.Id)) {
+ lex(); // skip register name
+ } else if (!parseExpr(HwReg.Id, "a register name")) {
return false;
}
@@ -5162,33 +5617,45 @@ AMDGPUAsmParser::parseHwregBody(OperandInfoTy &HwReg,
return true;
// parse optional params
- return
- skipToken(AsmToken::Comma, "expected a comma or a closing parenthesis") &&
- parseExpr(Offset) &&
- skipToken(AsmToken::Comma, "expected a comma") &&
- parseExpr(Width) &&
- skipToken(AsmToken::RParen, "expected a closing parenthesis");
+ if (!skipToken(AsmToken::Comma, "expected a comma or a closing parenthesis"))
+ return false;
+
+ Offset.Loc = getLoc();
+ if (!parseExpr(Offset.Id))
+ return false;
+
+ if (!skipToken(AsmToken::Comma, "expected a comma"))
+ return false;
+
+ Width.Loc = getLoc();
+ return parseExpr(Width.Id) &&
+ skipToken(AsmToken::RParen, "expected a closing parenthesis");
}
bool
AMDGPUAsmParser::validateHwreg(const OperandInfoTy &HwReg,
- const int64_t Offset,
- const int64_t Width,
- const SMLoc Loc) {
+ const OperandInfoTy &Offset,
+ const OperandInfoTy &Width) {
using namespace llvm::AMDGPU::Hwreg;
if (HwReg.IsSymbolic && !isValidHwreg(HwReg.Id, getSTI())) {
- Error(Loc, "specified hardware register is not supported on this GPU");
+ Error(HwReg.Loc,
+ "specified hardware register is not supported on this GPU");
return false;
- } else if (!isValidHwreg(HwReg.Id)) {
- Error(Loc, "invalid code of hardware register: only 6-bit values are legal");
+ }
+ if (!isValidHwreg(HwReg.Id)) {
+ Error(HwReg.Loc,
+ "invalid code of hardware register: only 6-bit values are legal");
return false;
- } else if (!isValidHwregOffset(Offset)) {
- Error(Loc, "invalid bit offset: only 5-bit values are legal");
+ }
+ if (!isValidHwregOffset(Offset.Id)) {
+ Error(Offset.Loc, "invalid bit offset: only 5-bit values are legal");
return false;
- } else if (!isValidHwregWidth(Width)) {
- Error(Loc, "invalid bitfield width: only values from 1 to 32 are legal");
+ }
+ if (!isValidHwregWidth(Width.Id)) {
+ Error(Width.Loc,
+ "invalid bitfield width: only values from 1 to 32 are legal");
return false;
}
return true;
@@ -5201,19 +5668,23 @@ AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
int64_t ImmVal = 0;
SMLoc Loc = getLoc();
- // If parse failed, do not return error code
- // to avoid excessive error messages.
if (trySkipId("hwreg", AsmToken::LParen)) {
OperandInfoTy HwReg(ID_UNKNOWN_);
- int64_t Offset = OFFSET_DEFAULT_;
- int64_t Width = WIDTH_DEFAULT_;
+ OperandInfoTy Offset(OFFSET_DEFAULT_);
+ OperandInfoTy Width(WIDTH_DEFAULT_);
if (parseHwregBody(HwReg, Offset, Width) &&
- validateHwreg(HwReg, Offset, Width, Loc)) {
- ImmVal = encodeHwreg(HwReg.Id, Offset, Width);
+ validateHwreg(HwReg, Offset, Width)) {
+ ImmVal = encodeHwreg(HwReg.Id, Offset.Id, Width.Id);
+ } else {
+ return MatchOperand_ParseFail;
}
- } else if (parseExpr(ImmVal)) {
- if (ImmVal < 0 || !isUInt<16>(ImmVal))
+ } else if (parseExpr(ImmVal, "a hwreg macro")) {
+ if (ImmVal < 0 || !isUInt<16>(ImmVal)) {
Error(Loc, "invalid immediate: only 16-bit values are legal");
+ return MatchOperand_ParseFail;
+ }
+ } else {
+ return MatchOperand_ParseFail;
}
Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, AMDGPUOperand::ImmTyHwreg));
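
For reference, a hedged sketch of the immediate that encodeHwreg() builds
above. The field sizes match the 6-bit id / 5-bit offset / 1..32 width checks
in validateHwreg(), but the exact bit positions below are an assumption of
this sketch, not something visible in this hunk.

  #include <cstdint>

  // Assumed layout: id in bits [5:0], offset in [10:6], width-1 in [15:11].
  constexpr int64_t encodeHwregSketch(int64_t Id, int64_t Offset, int64_t Width) {
    return Id | (Offset << 6) | ((Width - 1) << 11);
  }

  // e.g. hwreg(1, 0, 4) with a plain numeric register id -> 0x1801.
  static_assert(encodeHwregSketch(1, 0, 4) == 0x1801, "hwreg packing sketch");
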
@@ -5234,24 +5705,27 @@ AMDGPUAsmParser::parseSendMsgBody(OperandInfoTy &Msg,
OperandInfoTy &Stream) {
using namespace llvm::AMDGPU::SendMsg;
+ Msg.Loc = getLoc();
if (isToken(AsmToken::Identifier) && (Msg.Id = getMsgId(getTokenStr())) >= 0) {
Msg.IsSymbolic = true;
lex(); // skip message name
- } else if (!parseExpr(Msg.Id)) {
+ } else if (!parseExpr(Msg.Id, "a message name")) {
return false;
}
if (trySkipToken(AsmToken::Comma)) {
Op.IsDefined = true;
+ Op.Loc = getLoc();
if (isToken(AsmToken::Identifier) &&
(Op.Id = getMsgOpId(Msg.Id, getTokenStr())) >= 0) {
lex(); // skip operation name
- } else if (!parseExpr(Op.Id)) {
+ } else if (!parseExpr(Op.Id, "an operation name")) {
return false;
}
if (trySkipToken(AsmToken::Comma)) {
Stream.IsDefined = true;
+ Stream.Loc = getLoc();
if (!parseExpr(Stream.Id))
return false;
}
@@ -5263,8 +5737,7 @@ AMDGPUAsmParser::parseSendMsgBody(OperandInfoTy &Msg,
bool
AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg,
const OperandInfoTy &Op,
- const OperandInfoTy &Stream,
- const SMLoc S) {
+ const OperandInfoTy &Stream) {
using namespace llvm::AMDGPU::SendMsg;
// Validation strictness depends on whether message is specified
@@ -5273,21 +5746,27 @@ AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg,
bool Strict = Msg.IsSymbolic;
if (!isValidMsgId(Msg.Id, getSTI(), Strict)) {
- Error(S, "invalid message id");
+ Error(Msg.Loc, "invalid message id");
return false;
- } else if (Strict && (msgRequiresOp(Msg.Id) != Op.IsDefined)) {
- Error(S, Op.IsDefined ?
- "message does not support operations" :
- "missing message operation");
+ }
+ if (Strict && (msgRequiresOp(Msg.Id) != Op.IsDefined)) {
+ if (Op.IsDefined) {
+ Error(Op.Loc, "message does not support operations");
+ } else {
+ Error(Msg.Loc, "missing message operation");
+ }
return false;
- } else if (!isValidMsgOp(Msg.Id, Op.Id, Strict)) {
- Error(S, "invalid operation id");
+ }
+ if (!isValidMsgOp(Msg.Id, Op.Id, Strict)) {
+ Error(Op.Loc, "invalid operation id");
return false;
- } else if (Strict && !msgSupportsStream(Msg.Id, Op.Id) && Stream.IsDefined) {
- Error(S, "message operation does not support streams");
+ }
+ if (Strict && !msgSupportsStream(Msg.Id, Op.Id) && Stream.IsDefined) {
+ Error(Stream.Loc, "message operation does not support streams");
return false;
- } else if (!isValidMsgStream(Msg.Id, Op.Id, Stream.Id, Strict)) {
- Error(S, "invalid message stream id");
+ }
+ if (!isValidMsgStream(Msg.Id, Op.Id, Stream.Id, Strict)) {
+ Error(Stream.Loc, "invalid message stream id");
return false;
}
return true;
@@ -5300,19 +5779,23 @@ AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) {
int64_t ImmVal = 0;
SMLoc Loc = getLoc();
- // If parse failed, do not return error code
- // to avoid excessive error messages.
if (trySkipId("sendmsg", AsmToken::LParen)) {
OperandInfoTy Msg(ID_UNKNOWN_);
OperandInfoTy Op(OP_NONE_);
OperandInfoTy Stream(STREAM_ID_NONE_);
if (parseSendMsgBody(Msg, Op, Stream) &&
- validateSendMsg(Msg, Op, Stream, Loc)) {
+ validateSendMsg(Msg, Op, Stream)) {
ImmVal = encodeMsg(Msg.Id, Op.Id, Stream.Id);
+ } else {
+ return MatchOperand_ParseFail;
}
- } else if (parseExpr(ImmVal)) {
- if (ImmVal < 0 || !isUInt<16>(ImmVal))
+ } else if (parseExpr(ImmVal, "a sendmsg macro")) {
+ if (ImmVal < 0 || !isUInt<16>(ImmVal)) {
Error(Loc, "invalid immediate: only 16-bit values are legal");
+ return MatchOperand_ParseFail;
+ }
+ } else {
+ return MatchOperand_ParseFail;
}
Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, AMDGPUOperand::ImmTySendMsg));
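
Similarly, a hedged sketch of the sendmsg() immediate produced by encodeMsg().
The message id, operation id and stream id are assumed to occupy bits [3:0],
[6:4] and [9:8] respectively; that layout is conventional for s_sendmsg but is
not spelled out in this hunk, so treat the shifts as illustrative.

  #include <cstdint>

  // Assumed layout: msg [3:0], op [6:4], stream [9:8].
  constexpr int64_t encodeMsgSketch(int64_t MsgId, int64_t OpId, int64_t StreamId) {
    return MsgId | (OpId << 4) | (StreamId << 8);
  }

  // e.g. sendmsg(2, 2, 1) with plain numeric ids -> 0x122.
  static_assert(encodeMsgSketch(2, 2, 1) == 0x122, "sendmsg packing sketch");
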
@@ -5328,34 +5811,40 @@ bool AMDGPUOperand::isSendMsg() const {
//===----------------------------------------------------------------------===//
OperandMatchResultTy AMDGPUAsmParser::parseInterpSlot(OperandVector &Operands) {
- if (getLexer().getKind() != AsmToken::Identifier)
+ StringRef Str;
+ SMLoc S = getLoc();
+
+ if (!parseId(Str))
return MatchOperand_NoMatch;
- StringRef Str = Parser.getTok().getString();
int Slot = StringSwitch<int>(Str)
.Case("p10", 0)
.Case("p20", 1)
.Case("p0", 2)
.Default(-1);
- SMLoc S = Parser.getTok().getLoc();
- if (Slot == -1)
+ if (Slot == -1) {
+ Error(S, "invalid interpolation slot");
return MatchOperand_ParseFail;
+ }
- Parser.Lex();
Operands.push_back(AMDGPUOperand::CreateImm(this, Slot, S,
AMDGPUOperand::ImmTyInterpSlot));
return MatchOperand_Success;
}
OperandMatchResultTy AMDGPUAsmParser::parseInterpAttr(OperandVector &Operands) {
- if (getLexer().getKind() != AsmToken::Identifier)
- return MatchOperand_NoMatch;
+ StringRef Str;
+ SMLoc S = getLoc();
- StringRef Str = Parser.getTok().getString();
- if (!Str.startswith("attr"))
+ if (!parseId(Str))
return MatchOperand_NoMatch;
+ if (!Str.startswith("attr")) {
+ Error(S, "invalid interpolation attribute");
+ return MatchOperand_ParseFail;
+ }
+
StringRef Chan = Str.take_back(2);
int AttrChan = StringSwitch<int>(Chan)
.Case(".x", 0)
@@ -5363,20 +5852,22 @@ OperandMatchResultTy AMDGPUAsmParser::parseInterpAttr(OperandVector &Operands) {
.Case(".z", 2)
.Case(".w", 3)
.Default(-1);
- if (AttrChan == -1)
+ if (AttrChan == -1) {
+ Error(S, "invalid or missing interpolation attribute channel");
return MatchOperand_ParseFail;
+ }
Str = Str.drop_back(2).drop_front(4);
uint8_t Attr;
- if (Str.getAsInteger(10, Attr))
+ if (Str.getAsInteger(10, Attr)) {
+ Error(S, "invalid or missing interpolation attribute number");
return MatchOperand_ParseFail;
+ }
- SMLoc S = Parser.getTok().getLoc();
- Parser.Lex();
if (Attr > 63) {
- Error(S, "out of bounds attr");
- return MatchOperand_Success;
+ Error(S, "out of bounds interpolation attribute number");
+ return MatchOperand_ParseFail;
}
SMLoc SChan = SMLoc::getFromPointer(Chan.data());
@@ -5392,86 +5883,24 @@ OperandMatchResultTy AMDGPUAsmParser::parseInterpAttr(OperandVector &Operands) {
// exp
//===----------------------------------------------------------------------===//
-void AMDGPUAsmParser::errorExpTgt() {
- Error(Parser.getTok().getLoc(), "invalid exp target");
-}
-
-OperandMatchResultTy AMDGPUAsmParser::parseExpTgtImpl(StringRef Str,
- uint8_t &Val) {
- if (Str == "null") {
- Val = 9;
- return MatchOperand_Success;
- }
-
- if (Str.startswith("mrt")) {
- Str = Str.drop_front(3);
- if (Str == "z") { // == mrtz
- Val = 8;
- return MatchOperand_Success;
- }
-
- if (Str.getAsInteger(10, Val))
- return MatchOperand_ParseFail;
-
- if (Val > 7)
- errorExpTgt();
-
- return MatchOperand_Success;
- }
-
- if (Str.startswith("pos")) {
- Str = Str.drop_front(3);
- if (Str.getAsInteger(10, Val))
- return MatchOperand_ParseFail;
-
- if (Val > 4 || (Val == 4 && !isGFX10()))
- errorExpTgt();
-
- Val += 12;
- return MatchOperand_Success;
- }
-
- if (isGFX10() && Str == "prim") {
- Val = 20;
- return MatchOperand_Success;
- }
-
- if (Str.startswith("param")) {
- Str = Str.drop_front(5);
- if (Str.getAsInteger(10, Val))
- return MatchOperand_ParseFail;
-
- if (Val >= 32)
- errorExpTgt();
+OperandMatchResultTy AMDGPUAsmParser::parseExpTgt(OperandVector &Operands) {
+ using namespace llvm::AMDGPU::Exp;
- Val += 32;
- return MatchOperand_Success;
- }
+ StringRef Str;
+ SMLoc S = getLoc();
- if (Str.startswith("invalid_target_")) {
- Str = Str.drop_front(15);
- if (Str.getAsInteger(10, Val))
- return MatchOperand_ParseFail;
+ if (!parseId(Str))
+ return MatchOperand_NoMatch;
- errorExpTgt();
- return MatchOperand_Success;
+ unsigned Id = getTgtId(Str);
+ if (Id == ET_INVALID || !isSupportedTgtId(Id, getSTI())) {
+ Error(S, (Id == ET_INVALID) ?
+ "invalid exp target" :
+ "exp target is not supported on this GPU");
+ return MatchOperand_ParseFail;
}
- return MatchOperand_NoMatch;
-}
-
-OperandMatchResultTy AMDGPUAsmParser::parseExpTgt(OperandVector &Operands) {
- uint8_t Val;
- StringRef Str = Parser.getTok().getString();
-
- auto Res = parseExpTgtImpl(Str, Val);
- if (Res != MatchOperand_Success)
- return Res;
-
- SMLoc S = Parser.getTok().getLoc();
- Parser.Lex();
-
- Operands.push_back(AMDGPUOperand::CreateImm(this, Val, S,
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Id, S,
AMDGPUOperand::ImmTyExpTgt));
return MatchOperand_Success;
}
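
For reference, getTgtId() resolves the symbolic names to the same ids the
removed open-coded parser above produced: mrt0-mrt7 map to 0-7, mrtz to 8,
null to 9, pos0-pos4 to 12-16 (pos4 only on GFX10+), prim to 20 (GFX10+ only),
and param0-param31 to 32-63.
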
@@ -5534,8 +5963,23 @@ AMDGPUAsmParser::skipToken(const AsmToken::TokenKind Kind,
}
bool
-AMDGPUAsmParser::parseExpr(int64_t &Imm) {
- return !getParser().parseAbsoluteExpression(Imm);
+AMDGPUAsmParser::parseExpr(int64_t &Imm, StringRef Expected) {
+ SMLoc S = getLoc();
+
+ const MCExpr *Expr;
+ if (Parser.parseExpression(Expr))
+ return false;
+
+ if (Expr->evaluateAsAbsolute(Imm))
+ return true;
+
+ if (Expected.empty()) {
+ Error(S, "expected absolute expression");
+ } else {
+ Error(S, Twine("expected ", Expected) +
+ Twine(" or an absolute expression"));
+ }
+ return false;
}
bool
@@ -5567,6 +6011,19 @@ AMDGPUAsmParser::parseString(StringRef &Val, const StringRef ErrMsg) {
}
}
+bool
+AMDGPUAsmParser::parseId(StringRef &Val, const StringRef ErrMsg) {
+ if (isToken(AsmToken::Identifier)) {
+ Val = getTokenStr();
+ lex();
+ return true;
+ } else {
+ if (!ErrMsg.empty())
+ Error(getLoc(), ErrMsg);
+ return false;
+ }
+}
+
AsmToken
AMDGPUAsmParser::getToken() const {
return Parser.getTok();
@@ -5574,7 +6031,7 @@ AMDGPUAsmParser::getToken() const {
AsmToken
AMDGPUAsmParser::peekToken() {
- return getLexer().peekTok();
+ return isToken(AsmToken::EndOfStatement) ? getToken() : getLexer().peekTok();
}
void
@@ -5605,6 +6062,49 @@ AMDGPUAsmParser::lex() {
Parser.Lex();
}
+SMLoc
+AMDGPUAsmParser::getOperandLoc(std::function<bool(const AMDGPUOperand&)> Test,
+ const OperandVector &Operands) const {
+ for (unsigned i = Operands.size() - 1; i > 0; --i) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+ if (Test(Op))
+ return Op.getStartLoc();
+ }
+ return ((AMDGPUOperand &)*Operands[0]).getStartLoc();
+}
+
+SMLoc
+AMDGPUAsmParser::getImmLoc(AMDGPUOperand::ImmTy Type,
+ const OperandVector &Operands) const {
+ auto Test = [=](const AMDGPUOperand& Op) { return Op.isImmTy(Type); };
+ return getOperandLoc(Test, Operands);
+}
+
+SMLoc
+AMDGPUAsmParser::getRegLoc(unsigned Reg,
+ const OperandVector &Operands) const {
+ auto Test = [=](const AMDGPUOperand& Op) {
+ return Op.isRegKind() && Op.getReg() == Reg;
+ };
+ return getOperandLoc(Test, Operands);
+}
+
+SMLoc
+AMDGPUAsmParser::getLitLoc(const OperandVector &Operands) const {
+ auto Test = [](const AMDGPUOperand& Op) {
+ return Op.IsImmKindLiteral() || Op.isExpr();
+ };
+ return getOperandLoc(Test, Operands);
+}
+
+SMLoc
+AMDGPUAsmParser::getConstLoc(const OperandVector &Operands) const {
+ auto Test = [](const AMDGPUOperand& Op) {
+ return Op.isImmKindConst();
+ };
+ return getOperandLoc(Test, Operands);
+}
+
//===----------------------------------------------------------------------===//
// swizzle
//===----------------------------------------------------------------------===//
@@ -5623,22 +6123,35 @@ encodeBitmaskPerm(const unsigned AndMask,
}
bool
+AMDGPUAsmParser::parseSwizzleOperand(int64_t &Op,
+ const unsigned MinVal,
+ const unsigned MaxVal,
+ const StringRef ErrMsg,
+ SMLoc &Loc) {
+ if (!skipToken(AsmToken::Comma, "expected a comma")) {
+ return false;
+ }
+ Loc = getLoc();
+ if (!parseExpr(Op)) {
+ return false;
+ }
+ if (Op < MinVal || Op > MaxVal) {
+ Error(Loc, ErrMsg);
+ return false;
+ }
+
+ return true;
+}
+
+bool
AMDGPUAsmParser::parseSwizzleOperands(const unsigned OpNum, int64_t* Op,
const unsigned MinVal,
const unsigned MaxVal,
const StringRef ErrMsg) {
+ SMLoc Loc;
for (unsigned i = 0; i < OpNum; ++i) {
- if (!skipToken(AsmToken::Comma, "expected a comma")){
+ if (!parseSwizzleOperand(Op[i], MinVal, MaxVal, ErrMsg, Loc))
return false;
- }
- SMLoc ExprLoc = Parser.getTok().getLoc();
- if (!parseExpr(Op[i])) {
- return false;
- }
- if (Op[i] < MinVal || Op[i] > MaxVal) {
- Error(ExprLoc, ErrMsg);
- return false;
- }
}
return true;
@@ -5664,22 +6177,24 @@ bool
AMDGPUAsmParser::parseSwizzleBroadcast(int64_t &Imm) {
using namespace llvm::AMDGPU::Swizzle;
- SMLoc S = Parser.getTok().getLoc();
+ SMLoc Loc;
int64_t GroupSize;
int64_t LaneIdx;
- if (!parseSwizzleOperands(1, &GroupSize,
- 2, 32,
- "group size must be in the interval [2,32]")) {
+ if (!parseSwizzleOperand(GroupSize,
+ 2, 32,
+ "group size must be in the interval [2,32]",
+ Loc)) {
return false;
}
if (!isPowerOf2_64(GroupSize)) {
- Error(S, "group size must be a power of two");
+ Error(Loc, "group size must be a power of two");
return false;
}
- if (parseSwizzleOperands(1, &LaneIdx,
- 0, GroupSize - 1,
- "lane id must be in the interval [0,group size - 1]")) {
+ if (parseSwizzleOperand(LaneIdx,
+ 0, GroupSize - 1,
+ "lane id must be in the interval [0,group size - 1]",
+ Loc)) {
Imm = encodeBitmaskPerm(BITMASK_MAX - GroupSize + 1, LaneIdx, 0);
return true;
}
@@ -5690,15 +6205,17 @@ bool
AMDGPUAsmParser::parseSwizzleReverse(int64_t &Imm) {
using namespace llvm::AMDGPU::Swizzle;
- SMLoc S = Parser.getTok().getLoc();
+ SMLoc Loc;
int64_t GroupSize;
- if (!parseSwizzleOperands(1, &GroupSize,
- 2, 32, "group size must be in the interval [2,32]")) {
+ if (!parseSwizzleOperand(GroupSize,
+ 2, 32,
+ "group size must be in the interval [2,32]",
+ Loc)) {
return false;
}
if (!isPowerOf2_64(GroupSize)) {
- Error(S, "group size must be a power of two");
+ Error(Loc, "group size must be a power of two");
return false;
}
@@ -5710,15 +6227,17 @@ bool
AMDGPUAsmParser::parseSwizzleSwap(int64_t &Imm) {
using namespace llvm::AMDGPU::Swizzle;
- SMLoc S = Parser.getTok().getLoc();
+ SMLoc Loc;
int64_t GroupSize;
- if (!parseSwizzleOperands(1, &GroupSize,
- 1, 16, "group size must be in the interval [1,16]")) {
+ if (!parseSwizzleOperand(GroupSize,
+ 1, 16,
+ "group size must be in the interval [1,16]",
+ Loc)) {
return false;
}
if (!isPowerOf2_64(GroupSize)) {
- Error(S, "group size must be a power of two");
+ Error(Loc, "group size must be a power of two");
return false;
}
@@ -5735,7 +6254,7 @@ AMDGPUAsmParser::parseSwizzleBitmaskPerm(int64_t &Imm) {
}
StringRef Ctl;
- SMLoc StrLoc = Parser.getTok().getLoc();
+ SMLoc StrLoc = getLoc();
if (!parseString(Ctl)) {
return false;
}
@@ -5776,9 +6295,9 @@ AMDGPUAsmParser::parseSwizzleBitmaskPerm(int64_t &Imm) {
bool
AMDGPUAsmParser::parseSwizzleOffset(int64_t &Imm) {
- SMLoc OffsetLoc = Parser.getTok().getLoc();
+ SMLoc OffsetLoc = getLoc();
- if (!parseExpr(Imm)) {
+ if (!parseExpr(Imm, "a swizzle macro")) {
return false;
}
if (!isUInt<16>(Imm)) {
@@ -5794,7 +6313,7 @@ AMDGPUAsmParser::parseSwizzleMacro(int64_t &Imm) {
  if (skipToken(AsmToken::LParen, "expected a left parenthesis")) {
- SMLoc ModeLoc = Parser.getTok().getLoc();
+ SMLoc ModeLoc = getLoc();
bool Ok = false;
if (trySkipId(IdSymbolic[ID_QUAD_PERM])) {
@@ -5819,7 +6338,7 @@ AMDGPUAsmParser::parseSwizzleMacro(int64_t &Imm) {
OperandMatchResultTy
AMDGPUAsmParser::parseSwizzleOp(OperandVector &Operands) {
- SMLoc S = Parser.getTok().getLoc();
+ SMLoc S = getLoc();
int64_t Imm = 0;
if (trySkipId("offset")) {
@@ -5864,7 +6383,7 @@ int64_t AMDGPUAsmParser::parseGPRIdxMacro() {
while (true) {
unsigned Mode = 0;
- SMLoc S = Parser.getTok().getLoc();
+ SMLoc S = getLoc();
for (unsigned ModeId = ID_MIN; ModeId <= ID_MAX; ++ModeId) {
if (trySkipId(IdSymbolic[ModeId])) {
@@ -5877,12 +6396,12 @@ int64_t AMDGPUAsmParser::parseGPRIdxMacro() {
Error(S, (Imm == 0)?
"expected a VGPR index mode or a closing parenthesis" :
"expected a VGPR index mode");
- break;
+ return UNDEF;
}
if (Imm & Mode) {
Error(S, "duplicate VGPR index mode");
- break;
+ return UNDEF;
}
Imm |= Mode;
@@ -5890,7 +6409,7 @@ int64_t AMDGPUAsmParser::parseGPRIdxMacro() {
break;
if (!skipToken(AsmToken::Comma,
"expected a comma or a closing parenthesis"))
- break;
+ return UNDEF;
}
return Imm;
@@ -5899,25 +6418,21 @@ int64_t AMDGPUAsmParser::parseGPRIdxMacro() {
OperandMatchResultTy
AMDGPUAsmParser::parseGPRIdxMode(OperandVector &Operands) {
- int64_t Imm = 0;
- SMLoc S = Parser.getTok().getLoc();
-
- if (getLexer().getKind() == AsmToken::Identifier &&
- Parser.getTok().getString() == "gpr_idx" &&
- getLexer().peekTok().is(AsmToken::LParen)) {
+ using namespace llvm::AMDGPU::VGPRIndexMode;
- Parser.Lex();
- Parser.Lex();
+ int64_t Imm = 0;
+ SMLoc S = getLoc();
- // If parse failed, trigger an error but do not return error code
- // to avoid excessive error messages.
+ if (trySkipId("gpr_idx", AsmToken::LParen)) {
Imm = parseGPRIdxMacro();
-
+ if (Imm == UNDEF)
+ return MatchOperand_ParseFail;
} else {
if (getParser().parseAbsoluteExpression(Imm))
- return MatchOperand_NoMatch;
+ return MatchOperand_ParseFail;
if (Imm < 0 || !isUInt<4>(Imm)) {
Error(S, "invalid immediate: only 4-bit values are legal");
+ return MatchOperand_ParseFail;
}
}
@@ -5943,22 +6458,22 @@ AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) {
if (isRegister() || isModifier())
return MatchOperand_NoMatch;
- if (parseExpr(Operands)) {
+ if (!parseExpr(Operands))
+ return MatchOperand_ParseFail;
- AMDGPUOperand &Opr = ((AMDGPUOperand &)*Operands[Operands.size() - 1]);
- assert(Opr.isImm() || Opr.isExpr());
- SMLoc Loc = Opr.getStartLoc();
+ AMDGPUOperand &Opr = ((AMDGPUOperand &)*Operands[Operands.size() - 1]);
+ assert(Opr.isImm() || Opr.isExpr());
+ SMLoc Loc = Opr.getStartLoc();
- // Currently we do not support arbitrary expressions as branch targets.
- // Only labels and absolute expressions are accepted.
- if (Opr.isExpr() && !Opr.isSymbolRefExpr()) {
- Error(Loc, "expected an absolute expression or a label");
- } else if (Opr.isImm() && !Opr.isS16Imm()) {
- Error(Loc, "expected a 16-bit signed jump offset");
- }
+ // Currently we do not support arbitrary expressions as branch targets.
+ // Only labels and absolute expressions are accepted.
+ if (Opr.isExpr() && !Opr.isSymbolRefExpr()) {
+ Error(Loc, "expected an absolute expression or a label");
+ } else if (Opr.isImm() && !Opr.isS16Imm()) {
+ Error(Loc, "expected a 16-bit signed jump offset");
}
- return MatchOperand_Success; // avoid excessive error messages
+ return MatchOperand_Success;
}
//===----------------------------------------------------------------------===//
@@ -5982,6 +6497,10 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultGLC() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyGLC);
}
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultGLC_1() const {
+ return AMDGPUOperand::CreateImm(this, -1, SMLoc(), AMDGPUOperand::ImmTyGLC);
+}
+
AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSLC() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTySLC);
}
@@ -6046,8 +6565,9 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
}
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
- if (!IsAtomic) { // glc is hard-coded.
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
+ if (!IsAtomic || IsAtomicReturn) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC,
+ IsAtomicReturn ? -1 : 0);
}
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
@@ -6055,7 +6575,7 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
}
- if (isGFX10())
+ if (isGFX10Plus())
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC);
}
@@ -6095,7 +6615,7 @@ void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) {
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
- if (isGFX10())
+ if (isGFX10Plus())
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC);
}
@@ -6132,22 +6652,22 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
}
}
- bool IsGFX10 = isGFX10();
+ bool IsGFX10Plus = isGFX10Plus();
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask);
- if (IsGFX10)
+ if (IsGFX10Plus)
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDim, -1);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
- if (IsGFX10)
+ if (IsGFX10Plus)
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128A16);
- if (IsGFX10)
+ if (IsGFX10Plus)
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyA16);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
- if (!IsGFX10)
+ if (!IsGFX10Plus)
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyD16);
}
@@ -6156,6 +6676,17 @@ void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands)
cvtMIMG(Inst, Operands, true);
}
+void AMDGPUAsmParser::cvtIntersectRay(MCInst &Inst,
+ const OperandVector &Operands) {
+ for (unsigned I = 1; I < Operands.size(); ++I) {
+ auto &Operand = (AMDGPUOperand &)*Operands[I];
+ if (Operand.isReg())
+ Operand.addRegOperands(Inst, 1);
+ }
+
+ Inst.addOperand(MCOperand::createImm(1)); // a16
+}
+
//===----------------------------------------------------------------------===//
// smrd
//===----------------------------------------------------------------------===//
@@ -6242,7 +6773,6 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"offset", AMDGPUOperand::ImmTyOffset, false, nullptr},
{"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr},
{"dlc", AMDGPUOperand::ImmTyDLC, true, nullptr},
- {"format", AMDGPUOperand::ImmTyFORMAT, false, nullptr},
{"glc", AMDGPUOperand::ImmTyGLC, true, nullptr},
{"slc", AMDGPUOperand::ImmTySLC, true, nullptr},
{"swz", AMDGPUOperand::ImmTySWZ, true, nullptr},
@@ -6327,8 +6857,6 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOpr(OperandVector &Operands)
Op.ConvertResult);
} else if (Op.Type == AMDGPUOperand::ImmTyDim) {
res = parseDim(Operands);
- } else if (Op.Type == AMDGPUOperand::ImmTyFORMAT && !isGFX10()) {
- res = parseDfmtNfmt(Operands);
} else {
res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult);
}
@@ -6340,7 +6868,7 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOpr(OperandVector &Operands)
}
OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands) {
- StringRef Name = Parser.getTok().getString();
+ StringRef Name = getTokenStr();
if (Name == "mul") {
return parseIntWithPrefix("mul", Operands,
AMDGPUOperand::ImmTyOModSI, ConvertOmodMul);
@@ -6479,15 +7007,19 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
if (Opc == AMDGPU::V_MAC_F32_e64_gfx6_gfx7 ||
Opc == AMDGPU::V_MAC_F32_e64_gfx10 ||
Opc == AMDGPU::V_MAC_F32_e64_vi ||
+ Opc == AMDGPU::V_MAC_LEGACY_F32_e64_gfx6_gfx7 ||
+ Opc == AMDGPU::V_MAC_LEGACY_F32_e64_gfx10 ||
Opc == AMDGPU::V_MAC_F16_e64_vi ||
Opc == AMDGPU::V_FMAC_F32_e64_gfx10 ||
Opc == AMDGPU::V_FMAC_F32_e64_vi ||
+ Opc == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 ||
Opc == AMDGPU::V_FMAC_F16_e64_gfx10) {
auto it = Inst.begin();
std::advance(it, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers));
it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2
++it;
- Inst.insert(it, Inst.getOperand(0)); // src2 = dst
+ // Copy the operand to ensure it's not invalidated when Inst grows.
+ Inst.insert(it, MCOperand(Inst.getOperand(0))); // src2 = dst
}
}
@@ -6636,35 +7168,27 @@ bool AMDGPUOperand::isU16Imm() const {
}
OperandMatchResultTy AMDGPUAsmParser::parseDim(OperandVector &Operands) {
- if (!isGFX10())
+ if (!isGFX10Plus())
return MatchOperand_NoMatch;
- SMLoc S = Parser.getTok().getLoc();
+ SMLoc S = getLoc();
- if (getLexer().isNot(AsmToken::Identifier))
- return MatchOperand_NoMatch;
- if (getLexer().getTok().getString() != "dim")
+ if (!trySkipId("dim", AsmToken::Colon))
return MatchOperand_NoMatch;
- Parser.Lex();
- if (getLexer().isNot(AsmToken::Colon))
- return MatchOperand_ParseFail;
-
- Parser.Lex();
-
// We want to allow "dim:1D" etc., but the initial 1 is tokenized as an
// integer.
std::string Token;
- if (getLexer().is(AsmToken::Integer)) {
- SMLoc Loc = getLexer().getTok().getEndLoc();
- Token = std::string(getLexer().getTok().getString());
- Parser.Lex();
- if (getLexer().getTok().getLoc() != Loc)
+ if (isToken(AsmToken::Integer)) {
+ SMLoc Loc = getToken().getEndLoc();
+ Token = std::string(getTokenStr());
+ lex();
+ if (getLoc() != Loc)
return MatchOperand_ParseFail;
}
- if (getLexer().isNot(AsmToken::Identifier))
+ if (!isToken(AsmToken::Identifier))
return MatchOperand_ParseFail;
- Token += getLexer().getTok().getString();
+ Token += getTokenStr();
StringRef DimId = Token;
if (DimId.startswith("SQ_RSRC_IMG_"))
@@ -6674,7 +7198,7 @@ OperandMatchResultTy AMDGPUAsmParser::parseDim(OperandVector &Operands) {
if (!DimInfo)
return MatchOperand_ParseFail;
- Parser.Lex();
+ lex();
Operands.push_back(AMDGPUOperand::CreateImm(this, DimInfo->Encoding, S,
AMDGPUOperand::ImmTyDim));
@@ -6682,52 +7206,33 @@ OperandMatchResultTy AMDGPUAsmParser::parseDim(OperandVector &Operands) {
}
OperandMatchResultTy AMDGPUAsmParser::parseDPP8(OperandVector &Operands) {
- SMLoc S = Parser.getTok().getLoc();
- StringRef Prefix;
-
- if (getLexer().getKind() == AsmToken::Identifier) {
- Prefix = Parser.getTok().getString();
- } else {
- return MatchOperand_NoMatch;
- }
+ SMLoc S = getLoc();
- if (Prefix != "dpp8")
- return parseDPPCtrl(Operands);
- if (!isGFX10())
+ if (!isGFX10Plus() || !trySkipId("dpp8", AsmToken::Colon))
return MatchOperand_NoMatch;
// dpp8:[%d,%d,%d,%d,%d,%d,%d,%d]
int64_t Sels[8];
- Parser.Lex();
- if (getLexer().isNot(AsmToken::Colon))
- return MatchOperand_ParseFail;
-
- Parser.Lex();
- if (getLexer().isNot(AsmToken::LBrac))
+ if (!skipToken(AsmToken::LBrac, "expected an opening square bracket"))
return MatchOperand_ParseFail;
- Parser.Lex();
- if (getParser().parseAbsoluteExpression(Sels[0]))
- return MatchOperand_ParseFail;
- if (0 > Sels[0] || 7 < Sels[0])
- return MatchOperand_ParseFail;
-
- for (size_t i = 1; i < 8; ++i) {
- if (getLexer().isNot(AsmToken::Comma))
+ for (size_t i = 0; i < 8; ++i) {
+ if (i > 0 && !skipToken(AsmToken::Comma, "expected a comma"))
return MatchOperand_ParseFail;
- Parser.Lex();
+ SMLoc Loc = getLoc();
if (getParser().parseAbsoluteExpression(Sels[i]))
return MatchOperand_ParseFail;
- if (0 > Sels[i] || 7 < Sels[i])
+ if (0 > Sels[i] || 7 < Sels[i]) {
+ Error(Loc, "expected a 3-bit value");
return MatchOperand_ParseFail;
+ }
}
- if (getLexer().isNot(AsmToken::RBrac))
+ if (!skipToken(AsmToken::RBrac, "expected a closing square bracket"))
return MatchOperand_ParseFail;
- Parser.Lex();
unsigned DPP8 = 0;
for (size_t i = 0; i < 8; ++i)
@@ -6737,119 +7242,138 @@ OperandMatchResultTy AMDGPUAsmParser::parseDPP8(OperandVector &Operands) {
return MatchOperand_Success;
}
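
A hedged sketch of how the eight dpp8 lane selectors collapse into the single
DPP8 immediate assembled right after this hunk: each selector is three bits
wide (hence the range check above), and packing low lane first is an
assumption of this sketch rather than something shown in the excerpt.

  // Assumption: lane 0 occupies bits [2:0], lane 1 bits [5:3], and so on.
  constexpr unsigned packDpp8(unsigned S0, unsigned S1, unsigned S2, unsigned S3,
                              unsigned S4, unsigned S5, unsigned S6, unsigned S7) {
    return S0 | (S1 << 3) | (S2 << 6) | (S3 << 9) |
           (S4 << 12) | (S5 << 15) | (S6 << 18) | (S7 << 21);
  }

  // e.g. dpp8:[1,0,0,0,0,0,0,0] -> 0x1, and dpp8:[0,1,2,3,4,5,6,7] is the
  // identity selection within each group of eight lanes.
  static_assert(packDpp8(1, 0, 0, 0, 0, 0, 0, 0) == 1, "dpp8 packing sketch");
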
-OperandMatchResultTy
-AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
+bool
+AMDGPUAsmParser::isSupportedDPPCtrl(StringRef Ctrl,
+ const OperandVector &Operands) {
+ if (Ctrl == "row_share" ||
+ Ctrl == "row_xmask")
+ return isGFX10Plus();
+
+ if (Ctrl == "wave_shl" ||
+ Ctrl == "wave_shr" ||
+ Ctrl == "wave_rol" ||
+ Ctrl == "wave_ror" ||
+ Ctrl == "row_bcast")
+ return isVI() || isGFX9();
+
+ return Ctrl == "row_mirror" ||
+ Ctrl == "row_half_mirror" ||
+ Ctrl == "quad_perm" ||
+ Ctrl == "row_shl" ||
+ Ctrl == "row_shr" ||
+ Ctrl == "row_ror";
+}
+
+int64_t
+AMDGPUAsmParser::parseDPPCtrlPerm() {
+ // quad_perm:[%d,%d,%d,%d]
+
+ if (!skipToken(AsmToken::LBrac, "expected an opening square bracket"))
+ return -1;
+
+ int64_t Val = 0;
+ for (int i = 0; i < 4; ++i) {
+ if (i > 0 && !skipToken(AsmToken::Comma, "expected a comma"))
+ return -1;
+
+ int64_t Temp;
+ SMLoc Loc = getLoc();
+ if (getParser().parseAbsoluteExpression(Temp))
+ return -1;
+ if (Temp < 0 || Temp > 3) {
+ Error(Loc, "expected a 2-bit value");
+ return -1;
+ }
+
+ Val += (Temp << i * 2);
+ }
+
+ if (!skipToken(AsmToken::RBrac, "expected a closing square bracket"))
+ return -1;
+
+ return Val;
+}
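
A compact sketch of the quad_perm packing performed by the loop above: each of
the four 2-bit lane selectors lands at bit position 2*i, so the identity
permutation quad_perm:[0,1,2,3] encodes as 0xE4.

  #include <cstdint>

  // Mirrors the Val += (Temp << i * 2) accumulation above; illustrative only.
  constexpr int64_t packQuadPerm(int64_t L0, int64_t L1, int64_t L2, int64_t L3) {
    return L0 | (L1 << 2) | (L2 << 4) | (L3 << 6);
  }

  static_assert(packQuadPerm(0, 1, 2, 3) == 0xE4, "quad_perm packing sketch");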
+
+int64_t
+AMDGPUAsmParser::parseDPPCtrlSel(StringRef Ctrl) {
using namespace AMDGPU::DPP;
- SMLoc S = Parser.getTok().getLoc();
- StringRef Prefix;
- int64_t Int;
+ // sel:%d
+
+ int64_t Val;
+ SMLoc Loc = getLoc();
- if (getLexer().getKind() == AsmToken::Identifier) {
- Prefix = Parser.getTok().getString();
+ if (getParser().parseAbsoluteExpression(Val))
+ return -1;
+
+ struct DppCtrlCheck {
+ int64_t Ctrl;
+ int Lo;
+ int Hi;
+ };
+
+ DppCtrlCheck Check = StringSwitch<DppCtrlCheck>(Ctrl)
+ .Case("wave_shl", {DppCtrl::WAVE_SHL1, 1, 1})
+ .Case("wave_rol", {DppCtrl::WAVE_ROL1, 1, 1})
+ .Case("wave_shr", {DppCtrl::WAVE_SHR1, 1, 1})
+ .Case("wave_ror", {DppCtrl::WAVE_ROR1, 1, 1})
+ .Case("row_shl", {DppCtrl::ROW_SHL0, 1, 15})
+ .Case("row_shr", {DppCtrl::ROW_SHR0, 1, 15})
+ .Case("row_ror", {DppCtrl::ROW_ROR0, 1, 15})
+ .Case("row_share", {DppCtrl::ROW_SHARE_FIRST, 0, 15})
+ .Case("row_xmask", {DppCtrl::ROW_XMASK_FIRST, 0, 15})
+ .Default({-1, 0, 0});
+
+ bool Valid;
+ if (Check.Ctrl == -1) {
+ Valid = (Ctrl == "row_bcast" && (Val == 15 || Val == 31));
+ Val = (Val == 15)? DppCtrl::BCAST15 : DppCtrl::BCAST31;
} else {
- return MatchOperand_NoMatch;
+ Valid = Check.Lo <= Val && Val <= Check.Hi;
+ Val = (Check.Lo == Check.Hi) ? Check.Ctrl : (Check.Ctrl | Val);
}
- if (Prefix == "row_mirror") {
- Int = DppCtrl::ROW_MIRROR;
- Parser.Lex();
- } else if (Prefix == "row_half_mirror") {
- Int = DppCtrl::ROW_HALF_MIRROR;
- Parser.Lex();
- } else {
- // Check to prevent parseDPPCtrlOps from eating invalid tokens
- if (Prefix != "quad_perm"
- && Prefix != "row_shl"
- && Prefix != "row_shr"
- && Prefix != "row_ror"
- && Prefix != "wave_shl"
- && Prefix != "wave_rol"
- && Prefix != "wave_shr"
- && Prefix != "wave_ror"
- && Prefix != "row_bcast"
- && Prefix != "row_share"
- && Prefix != "row_xmask") {
- return MatchOperand_NoMatch;
- }
-
- if (!isGFX10() && (Prefix == "row_share" || Prefix == "row_xmask"))
- return MatchOperand_NoMatch;
-
- if (!isVI() && !isGFX9() &&
- (Prefix == "wave_shl" || Prefix == "wave_shr" ||
- Prefix == "wave_rol" || Prefix == "wave_ror" ||
- Prefix == "row_bcast"))
- return MatchOperand_NoMatch;
-
- Parser.Lex();
- if (getLexer().isNot(AsmToken::Colon))
- return MatchOperand_ParseFail;
+ if (!Valid) {
+ Error(Loc, Twine("invalid ", Ctrl) + Twine(" value"));
+ return -1;
+ }
- if (Prefix == "quad_perm") {
- // quad_perm:[%d,%d,%d,%d]
- Parser.Lex();
- if (getLexer().isNot(AsmToken::LBrac))
- return MatchOperand_ParseFail;
- Parser.Lex();
+ return Val;
+}
- if (getParser().parseAbsoluteExpression(Int) || !(0 <= Int && Int <=3))
- return MatchOperand_ParseFail;
+OperandMatchResultTy
+AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
+ using namespace AMDGPU::DPP;
- for (int i = 0; i < 3; ++i) {
- if (getLexer().isNot(AsmToken::Comma))
- return MatchOperand_ParseFail;
- Parser.Lex();
+ if (!isToken(AsmToken::Identifier) ||
+ !isSupportedDPPCtrl(getTokenStr(), Operands))
+ return MatchOperand_NoMatch;
- int64_t Temp;
- if (getParser().parseAbsoluteExpression(Temp) || !(0 <= Temp && Temp <=3))
- return MatchOperand_ParseFail;
- const int shift = i*2 + 2;
- Int += (Temp << shift);
- }
+ SMLoc S = getLoc();
+ int64_t Val = -1;
+ StringRef Ctrl;
- if (getLexer().isNot(AsmToken::RBrac))
- return MatchOperand_ParseFail;
- Parser.Lex();
- } else {
- // sel:%d
- Parser.Lex();
- if (getParser().parseAbsoluteExpression(Int))
- return MatchOperand_ParseFail;
+ parseId(Ctrl);
- if (Prefix == "row_shl" && 1 <= Int && Int <= 15) {
- Int |= DppCtrl::ROW_SHL0;
- } else if (Prefix == "row_shr" && 1 <= Int && Int <= 15) {
- Int |= DppCtrl::ROW_SHR0;
- } else if (Prefix == "row_ror" && 1 <= Int && Int <= 15) {
- Int |= DppCtrl::ROW_ROR0;
- } else if (Prefix == "wave_shl" && 1 == Int) {
- Int = DppCtrl::WAVE_SHL1;
- } else if (Prefix == "wave_rol" && 1 == Int) {
- Int = DppCtrl::WAVE_ROL1;
- } else if (Prefix == "wave_shr" && 1 == Int) {
- Int = DppCtrl::WAVE_SHR1;
- } else if (Prefix == "wave_ror" && 1 == Int) {
- Int = DppCtrl::WAVE_ROR1;
- } else if (Prefix == "row_bcast") {
- if (Int == 15) {
- Int = DppCtrl::BCAST15;
- } else if (Int == 31) {
- Int = DppCtrl::BCAST31;
- } else {
- return MatchOperand_ParseFail;
- }
- } else if (Prefix == "row_share" && 0 <= Int && Int <= 15) {
- Int |= DppCtrl::ROW_SHARE_FIRST;
- } else if (Prefix == "row_xmask" && 0 <= Int && Int <= 15) {
- Int |= DppCtrl::ROW_XMASK_FIRST;
+ if (Ctrl == "row_mirror") {
+ Val = DppCtrl::ROW_MIRROR;
+ } else if (Ctrl == "row_half_mirror") {
+ Val = DppCtrl::ROW_HALF_MIRROR;
+ } else {
+ if (skipToken(AsmToken::Colon, "expected a colon")) {
+ if (Ctrl == "quad_perm") {
+ Val = parseDPPCtrlPerm();
} else {
- return MatchOperand_ParseFail;
+ Val = parseDPPCtrlSel(Ctrl);
}
}
}
- Operands.push_back(AMDGPUOperand::CreateImm(this, Int, S, AMDGPUOperand::ImmTyDppCtrl));
+ if (Val == -1)
+ return MatchOperand_ParseFail;
+
+ Operands.push_back(
+ AMDGPUOperand::CreateImm(this, Val, S, AMDGPUOperand::ImmTyDppCtrl));
return MatchOperand_Success;
}
@@ -6947,11 +7471,12 @@ AMDGPUAsmParser::parseSDWASel(OperandVector &Operands, StringRef Prefix,
AMDGPUOperand::ImmTy Type) {
using namespace llvm::AMDGPU::SDWA;
- SMLoc S = Parser.getTok().getLoc();
+ SMLoc S = getLoc();
StringRef Value;
OperandMatchResultTy res;
- res = parseStringWithPrefix(Prefix, Value);
+ SMLoc StringLoc;
+ res = parseStringWithPrefix(Prefix, Value, StringLoc);
if (res != MatchOperand_Success) {
return res;
}
@@ -6966,9 +7491,9 @@ AMDGPUAsmParser::parseSDWASel(OperandVector &Operands, StringRef Prefix,
.Case("WORD_1", SdwaSel::WORD_1)
.Case("DWORD", SdwaSel::DWORD)
.Default(0xffffffff);
- Parser.Lex(); // eat last token
if (Int == 0xffffffff) {
+ Error(StringLoc, "invalid " + Twine(Prefix) + " value");
return MatchOperand_ParseFail;
}
@@ -6980,11 +7505,12 @@ OperandMatchResultTy
AMDGPUAsmParser::parseSDWADstUnused(OperandVector &Operands) {
using namespace llvm::AMDGPU::SDWA;
- SMLoc S = Parser.getTok().getLoc();
+ SMLoc S = getLoc();
StringRef Value;
OperandMatchResultTy res;
- res = parseStringWithPrefix("dst_unused", Value);
+ SMLoc StringLoc;
+ res = parseStringWithPrefix("dst_unused", Value, StringLoc);
if (res != MatchOperand_Success) {
return res;
}
@@ -6995,9 +7521,9 @@ AMDGPUAsmParser::parseSDWADstUnused(OperandVector &Operands) {
.Case("UNUSED_SEXT", DstUnused::UNUSED_SEXT)
.Case("UNUSED_PRESERVE", DstUnused::UNUSED_PRESERVE)
.Default(0xffffffff);
- Parser.Lex(); // eat last token
if (Int == 0xffffffff) {
+ Error(StringLoc, "invalid dst_unused value");
return MatchOperand_ParseFail;
}
@@ -7146,6 +7672,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmParser() {
#define GET_REGISTER_MATCHER
#define GET_MATCHER_IMPLEMENTATION
#define GET_MNEMONIC_SPELL_CHECKER
+#define GET_MNEMONIC_CHECKER
#include "AMDGPUGenAsmMatcher.inc"
// This function should be defined after the auto-generated include so that we have
@@ -7210,7 +7737,7 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
//===----------------------------------------------------------------------===//
OperandMatchResultTy AMDGPUAsmParser::parseEndpgmOp(OperandVector &Operands) {
- SMLoc S = Parser.getTok().getLoc();
+ SMLoc S = getLoc();
int64_t Imm = 0;
if (!parseExpr(Imm)) {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td
index fa42ddc54b56..5dc5481df49e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -114,6 +114,7 @@ class MTBUF_Real <MTBUF_Pseudo ps> :
let isCodeGenOnly = 0;
// copy relevant pseudo op flags
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
let SubtargetPredicate = ps.SubtargetPredicate;
let AsmMatchConverter = ps.AsmMatchConverter;
let Constraints = ps.Constraints;
@@ -168,29 +169,29 @@ class getMTBUFIns<int addrKind, list<RegisterClass> vdataList=[]> {
class getMTBUFAsmOps<int addrKind> {
string Pfx =
- !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $format, $soffset",
+ !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc,$format $soffset",
!if(!eq(addrKind, BUFAddrKind.OffEn),
- "$vaddr, $srsrc, $format, $soffset offen",
+ "$vaddr, $srsrc,$format $soffset offen",
!if(!eq(addrKind, BUFAddrKind.IdxEn),
- "$vaddr, $srsrc, $format, $soffset idxen",
+ "$vaddr, $srsrc,$format $soffset idxen",
!if(!eq(addrKind, BUFAddrKind.BothEn),
- "$vaddr, $srsrc, $format, $soffset idxen offen",
+ "$vaddr, $srsrc,$format $soffset idxen offen",
!if(!eq(addrKind, BUFAddrKind.Addr64),
- "$vaddr, $srsrc, $format, $soffset addr64",
+ "$vaddr, $srsrc,$format $soffset addr64",
"")))));
string ret = Pfx # "$offset";
}
class MTBUF_SetupAddr<int addrKind> {
- bits<1> offen = !if(!eq(addrKind, BUFAddrKind.OffEn), 1,
- !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0));
+ bits<1> offen = !or(!eq(addrKind, BUFAddrKind.OffEn),
+ !eq(addrKind, BUFAddrKind.BothEn));
- bits<1> idxen = !if(!eq(addrKind, BUFAddrKind.IdxEn), 1,
- !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0));
+ bits<1> idxen = !or(!eq(addrKind, BUFAddrKind.IdxEn),
+ !eq(addrKind, BUFAddrKind.BothEn));
- bits<1> addr64 = !if(!eq(addrKind, BUFAddrKind.Addr64), 1, 0);
+ bits<1> addr64 = !eq(addrKind, BUFAddrKind.Addr64);
- bits<1> has_vaddr = !if(!eq(addrKind, BUFAddrKind.Offset), 0, 1);
+ bits<1> has_vaddr = !ne(addrKind, BUFAddrKind.Offset);
}
class MTBUF_Load_Pseudo <string opName,
@@ -349,11 +350,13 @@ class MUBUF_Real <MUBUF_Pseudo ps> :
let isCodeGenOnly = 0;
// copy relevant pseudo op flags
- let SubtargetPredicate = ps.SubtargetPredicate;
- let AsmMatchConverter = ps.AsmMatchConverter;
- let Constraints = ps.Constraints;
- let DisableEncoding = ps.DisableEncoding;
- let TSFlags = ps.TSFlags;
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let OtherPredicates = ps.OtherPredicates;
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+ let TSFlags = ps.TSFlags;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
bits<12> offset;
bits<1> glc;
@@ -461,15 +464,15 @@ class getMUBUFAsmOps<int addrKind> {
}
class MUBUF_SetupAddr<int addrKind> {
- bits<1> offen = !if(!eq(addrKind, BUFAddrKind.OffEn), 1,
- !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0));
+ bits<1> offen = !or(!eq(addrKind, BUFAddrKind.OffEn),
+ !eq(addrKind, BUFAddrKind.BothEn));
- bits<1> idxen = !if(!eq(addrKind, BUFAddrKind.IdxEn), 1,
- !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0));
+ bits<1> idxen = !or(!eq(addrKind, BUFAddrKind.IdxEn),
+ !eq(addrKind, BUFAddrKind.BothEn));
- bits<1> addr64 = !if(!eq(addrKind, BUFAddrKind.Addr64), 1, 0);
+ bits<1> addr64 = !eq(addrKind, BUFAddrKind.Addr64);
- bits<1> has_vaddr = !if(!eq(addrKind, BUFAddrKind.Offset), 0, 1);
+ bits<1> has_vaddr = !ne(addrKind, BUFAddrKind.Offset);
}
class MUBUF_Load_Pseudo <string opName,
@@ -485,7 +488,7 @@ class MUBUF_Load_Pseudo <string opName,
!con(getMUBUFIns<addrKindCopy, [], isLds>.ret,
!if(HasTiedDest, (ins getVregSrcForVT<vdata_vt>.ret:$vdata_in), (ins))),
" $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc" #
- !if(isLds, " lds", "$tfe") # "$dlc" # "$swz",
+ !if(isLds, " lds", "$tfe") # "$dlc$swz",
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # !if(isLds, "_lds", "") #
@@ -497,7 +500,7 @@ class MUBUF_Load_Pseudo <string opName,
let mayStore = 0;
let maybeAtomic = 1;
let Uses = !if(isLds, [EXEC, M0], [EXEC]);
- let has_tfe = !if(isLds, 0, 1);
+ let has_tfe = !not(isLds);
let lds = isLds;
let elements = getMUBUFElements<vdata_vt>.ret;
}
@@ -528,21 +531,23 @@ multiclass MUBUF_Pseudo_Loads<string opName,
bit TiedDest = 0,
bit isLds = 0> {
- def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, load_vt, TiedDest, isLds>,
+ defvar legal_load_vt = !if(!eq(!cast<string>(load_vt), !cast<string>(v3f16)), v4f16, load_vt);
+
+ def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, legal_load_vt, TiedDest, isLds>,
MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>;
- def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, load_vt, TiedDest, isLds>,
+ def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, legal_load_vt, TiedDest, isLds>,
MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>;
- def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, load_vt, TiedDest, isLds>;
- def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, load_vt, TiedDest, isLds>;
- def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, load_vt, TiedDest, isLds>;
+ def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, legal_load_vt, TiedDest, isLds>;
+ def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, legal_load_vt, TiedDest, isLds>;
+ def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, legal_load_vt, TiedDest, isLds>;
let DisableWQM = 1 in {
- def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, load_vt, TiedDest, isLds>;
- def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, load_vt, TiedDest, isLds>;
- def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, load_vt, TiedDest, isLds>;
- def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, load_vt, TiedDest, isLds>;
+ def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, legal_load_vt, TiedDest, isLds>;
+ def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, legal_load_vt, TiedDest, isLds>;
+ def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, legal_load_vt, TiedDest, isLds>;
+ def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, legal_load_vt, TiedDest, isLds>;
}
}
@@ -576,25 +581,27 @@ multiclass MUBUF_Pseudo_Stores<string opName,
ValueType store_vt = i32,
SDPatternOperator st = null_frag> {
- def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, store_vt,
- [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
+ defvar legal_store_vt = !if(!eq(!cast<string>(store_vt), !cast<string>(v3f16)), v4f16, store_vt);
+
+ def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, legal_store_vt,
+ [(st legal_store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>,
MUBUFAddr64Table<0, NAME>;
- def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, store_vt,
- [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+ def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, legal_store_vt,
+ [(st legal_store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>,
MUBUFAddr64Table<1, NAME>;
- def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, store_vt>;
- def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, store_vt>;
- def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, store_vt>;
+ def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, legal_store_vt>;
+ def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, legal_store_vt>;
+ def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, legal_store_vt>;
let DisableWQM = 1 in {
- def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, store_vt>;
- def _OFFEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, store_vt>;
- def _IDXEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, store_vt>;
- def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, store_vt>;
+ def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, legal_store_vt>;
+ def _OFFEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, legal_store_vt>;
+ def _IDXEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, legal_store_vt>;
+ def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, legal_store_vt>;
}
}
@@ -622,9 +629,9 @@ class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in,
dag ret = !if(vdata_in,
!if(!empty(vaddrList),
(ins vdataClass:$vdata_in,
- SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc),
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC_1:$glc1, SLC:$slc),
(ins vdataClass:$vdata_in, vaddrClass:$vaddr,
- SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc)
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC_1:$glc1, SLC:$slc)
),
!if(!empty(vaddrList),
(ins vdataClass:$vdata,
@@ -701,7 +708,7 @@ class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
: MUBUF_Atomic_Pseudo<opName, addrKindCopy,
(outs vdataClassCopy:$vdata),
getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 1>.ret,
- " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # " glc$slc",
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc1$slc",
pattern>,
AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 1> {
let PseudoInstr = opName # "_rtn_" # getAddrName<addrKindCopy>.ret;
@@ -1006,7 +1013,7 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
let SubtargetPredicate = HasGFX10_BEncoding in
defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics_RTN <
- "buffer_atomic_csub", VGPR_32, i32, atomic_csub_global_32
+ "buffer_atomic_csub", VGPR_32, i32, int_amdgcn_global_atomic_csub
>;
let SubtargetPredicate = isGFX8GFX9 in {
@@ -1093,14 +1100,12 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
int_amdgcn_buffer_wbinvl1>;
let SubtargetPredicate = HasAtomicFaddInsts in {
-
defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN <
- "buffer_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret
+ "buffer_atomic_add_f32", VGPR_32, f32, atomic_load_fadd_global_noret_32
>;
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
- "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_pk_fadd_global_noret
+ "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_load_fadd_v2f16_global_noret_32
>;
-
} // End SubtargetPredicate = HasAtomicFaddInsts
//===----------------------------------------------------------------------===//
@@ -1163,9 +1168,11 @@ let SubtargetPredicate = isGFX10Plus in {
//===----------------------------------------------------------------------===//
multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
- string opcode> {
+ string opcode, ValueType memoryVt = vt> {
+ defvar st = !if(!eq(!cast<string>(memoryVt), !cast<string>(vt)), name, mubuf_intrinsic_load<name, memoryVt>);
+
def : GCNPat<
- (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
@@ -1173,7 +1180,7 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
@@ -1181,7 +1188,7 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, timm)),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
@@ -1189,7 +1196,7 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, timm)),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
@@ -1213,6 +1220,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in {
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XY_gfx80">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v3i32, "BUFFER_LOAD_FORMAT_D16_XYZ_gfx80">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i32, "BUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
@@ -1222,6 +1230,8 @@ let SubtargetPredicate = HasPackedD16VMem in {
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_X">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i16, "BUFFER_LOAD_FORMAT_D16_XY">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4f16, "BUFFER_LOAD_FORMAT_D16_XYZ", v3f16>;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i16, "BUFFER_LOAD_FORMAT_D16_XYZ", v3i16>;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4f16, "BUFFER_LOAD_FORMAT_D16_XYZW">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i16, "BUFFER_LOAD_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
@@ -1244,9 +1254,11 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ubyte, i32, "BUFFER_LOAD_UBYTE">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ushort, i32, "BUFFER_LOAD_USHORT">;
multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
- string opcode> {
+ string opcode, ValueType memoryVt = vt> {
+ defvar st = !if(!eq(!cast<string>(memoryVt), !cast<string>(vt)), name, mubuf_intrinsic_store<name, memoryVt>);
+
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+ (st vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
@@ -1254,7 +1266,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+ (st vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
(as_i16timm $offset), (extract_glc $auxiliary),
@@ -1263,7 +1275,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+ (st vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, timm),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
(as_i16timm $offset), (extract_glc $auxiliary),
@@ -1272,7 +1284,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
+ (st vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, timm),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact)
getVregSrcForVT<vt>.ret:$vdata,
@@ -1297,6 +1309,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in {
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i16, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i32, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i32, "BUFFER_STORE_FORMAT_D16_XY_gfx80">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v3i32, "BUFFER_STORE_FORMAT_D16_XYZ_gfx80">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i32, "BUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
@@ -1306,6 +1319,8 @@ let SubtargetPredicate = HasPackedD16VMem in {
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i32, "BUFFER_STORE_FORMAT_D16_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2f16, "BUFFER_STORE_FORMAT_D16_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i16, "BUFFER_STORE_FORMAT_D16_XY">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4f16, "BUFFER_STORE_FORMAT_D16_XYZ", v3f16>;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i16, "BUFFER_STORE_FORMAT_D16_XYZ", v3i16>;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4f16, "BUFFER_STORE_FORMAT_D16_XYZW">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i16, "BUFFER_STORE_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
@@ -1367,6 +1382,7 @@ multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt,
}
defm : BufferAtomicPatterns<SIbuffer_atomic_swap, i32, "BUFFER_ATOMIC_SWAP">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_swap, f32, "BUFFER_ATOMIC_SWAP">;
defm : BufferAtomicPatterns<SIbuffer_atomic_add, i32, "BUFFER_ATOMIC_ADD">;
defm : BufferAtomicPatterns<SIbuffer_atomic_sub, i32, "BUFFER_ATOMIC_SUB">;
defm : BufferAtomicPatterns<SIbuffer_atomic_smin, i32, "BUFFER_ATOMIC_SMIN">;
@@ -1392,46 +1408,56 @@ defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i64, "BUFFER_ATOMIC_XOR_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_inc, i64, "BUFFER_ATOMIC_INC_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_dec, i64, "BUFFER_ATOMIC_DEC_X2">;
+class NoUseBufferAtomic<SDPatternOperator Op, ValueType vt> : PatFrag <
+ (ops node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5, node:$src6, node:$src7),
+ (vt (Op $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7)),
+ [{ return SDValue(N, 0).use_empty(); }]> {
+
+ let GISelPredicateCode = [{
+ return MRI.use_nodbg_empty(MI.getOperand(0).getReg());
+ }];
+}
+
multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
- (name vt:$vdata_in, v4i32:$rsrc, 0,
- 0, i32:$soffset, timm:$offset,
- timm:$cachepolicy, 0),
- (!cast<MUBUF_Pseudo>(opcode # _OFFSET) $vdata_in, $rsrc, $soffset,
- (as_i16imm $offset), (extract_slc $cachepolicy))
+ (NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, 0,
+ 0, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, 0),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFSET) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
- (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
- 0, i32:$soffset, timm:$offset,
- timm:$cachepolicy, timm),
- (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vdata_in, $vindex, $rsrc, $soffset,
- (as_i16imm $offset), (extract_slc $cachepolicy))
+ (NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
+ 0, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, timm),
+ (!cast<MUBUF_Pseudo>(opcode # _IDXEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
- (name vt:$vdata_in, v4i32:$rsrc, 0,
- i32:$voffset, i32:$soffset, timm:$offset,
- timm:$cachepolicy, 0),
- (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $vdata_in, $voffset, $rsrc, $soffset,
- (as_i16imm $offset), (extract_slc $cachepolicy))
+ (NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, 0,
+ i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, 0),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
- (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
- i32:$voffset, i32:$soffset, timm:$offset,
- timm:$cachepolicy, timm),
+ (NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
+ i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, timm),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
- $vdata_in,
- (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy))
+ getVregSrcForVT<vt>.ret:$vdata_in,
+ (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy))
>;
}
let SubtargetPredicate = HasAtomicFaddInsts in {
defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, f32, "BUFFER_ATOMIC_ADD_F32">;
-defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_pk_fadd, v2f16, "BUFFER_ATOMIC_PK_ADD_F16">;
+defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, v2f16, "BUFFER_ATOMIC_PK_ADD_F16">;
}
def : GCNPat<
@@ -1568,6 +1594,7 @@ multiclass MUBUFScratchLoadPat_D16 <MUBUF_Pseudo InstrOffen,
>;
}
+let OtherPredicates = [DisableFlatScratch] in {
defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i32, sextloadi8_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, extloadi8_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, zextloadi8_private>;
@@ -1586,7 +1613,7 @@ defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSE
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX3_OFFEN, BUFFER_LOAD_DWORDX3_OFFSET, v3i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>;
-let OtherPredicates = [D16PreservesUnusedBits] in {
+let OtherPredicates = [D16PreservesUnusedBits, DisableFlatScratch] in {
defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, v2i16, load_d16_hi_private>;
defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, v2i16, az_extloadi8_d16_hi_private>;
defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, v2i16, sextloadi8_d16_hi_private>;
@@ -1602,6 +1629,8 @@ defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D1
defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, v2f16, sextloadi8_d16_lo_private>;
}
+} // End OtherPredicates = [DisableFlatScratch]
+
multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
ValueType vt, PatFrag atomic_st> {
// Store follows atomic op convention so address is first
@@ -1652,6 +1681,7 @@ multiclass MUBUFScratchStorePat <MUBUF_Pseudo InstrOffen,
>;
}
+let OtherPredicates = [DisableFlatScratch] in {
defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i32, truncstorei8_private>;
defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i32, truncstorei16_private>;
defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_private>;
@@ -1666,7 +1696,7 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX3_OFFEN, BUFFER_STORE_DWORDX3_OF
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private, VReg_128>;
-let OtherPredicates = [D16PreservesUnusedBits] in {
+let OtherPredicates = [D16PreservesUnusedBits, DisableFlatScratch] in {
// Hiding the extract high pattern in the PatFrag seems to not
// automatically increase the complexity.
let AddedComplexity = 1 in {
@@ -1674,6 +1704,7 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_D16_HI_OFFEN, BUFFER_STORE_SHORT
defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_D16_HI_OFFEN, BUFFER_STORE_BYTE_D16_HI_OFFSET, i32, truncstorei8_hi16_private>;
}
}
+} // End OtherPredicates = [DisableFlatScratch]
//===----------------------------------------------------------------------===//
// MTBUF Patterns
@@ -1684,9 +1715,11 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_D16_HI_OFFEN, BUFFER_STORE_BYTE_D
//===----------------------------------------------------------------------===//
multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
- string opcode> {
+ string opcode, ValueType memoryVt = vt> {
+ defvar st = !if(!eq(!cast<string>(memoryVt), !cast<string>(vt)), name, mtbuf_intrinsic_load<name, memoryVt>);
+
def : GCNPat<
- (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0)),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(as_i8timm $format),
@@ -1695,7 +1728,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, timm)),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(as_i8timm $format),
@@ -1704,7 +1737,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0)),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(as_i8timm $format),
@@ -1713,7 +1746,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, timm)),
(!cast<MTBUF_Pseudo>(opcode # _BOTHEN)
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
@@ -1737,6 +1770,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in {
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XY_gfx80">;
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v3i32, "TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4i32, "TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
@@ -1744,13 +1778,16 @@ let SubtargetPredicate = HasPackedD16VMem in {
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">;
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4f16, "TBUFFER_LOAD_FORMAT_D16_XYZ", v3f16>;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4f16, "TBUFFER_LOAD_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
- string opcode> {
+ string opcode, ValueType memoryVt = vt> {
+ defvar st = !if(!eq(!cast<string>(memoryVt), !cast<string>(vt)), name, mtbuf_intrinsic_store<name, memoryVt>);
+
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+ (st vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset,
(as_i16timm $offset), (as_i8timm $format),
@@ -1759,7 +1796,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+ (st vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, timm),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
(as_i16timm $offset), (as_i8timm $format),
@@ -1768,7 +1805,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+ (st vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
(as_i16timm $offset), (as_i8timm $format),
@@ -1777,7 +1814,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset,
+ (st vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset,
timm:$offset, timm:$format, timm:$auxiliary, timm),
(!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact)
getVregSrcForVT<vt>.ret:$vdata,
@@ -1801,6 +1838,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in {
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X_gfx80">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X_gfx80">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2i32, "TBUFFER_STORE_FORMAT_D16_XY_gfx80">;
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v3i32, "TBUFFER_STORE_FORMAT_D16_XYZ_gfx80">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4i32, "TBUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
@@ -1808,6 +1846,7 @@ let SubtargetPredicate = HasPackedD16VMem in {
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2f16, "TBUFFER_STORE_FORMAT_D16_XY">;
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4f16, "TBUFFER_STORE_FORMAT_D16_XYZ", v3f16>;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4f16, "TBUFFER_STORE_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
@@ -1825,7 +1864,7 @@ class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef> :
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
- let Inst{16} = !if(ps.lds, 1, 0);
+ let Inst{16} = ps.lds;
let Inst{24-18} = op;
let Inst{31-26} = 0x38;
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
@@ -2176,7 +2215,7 @@ class MUBUF_Real_vi <bits<7> op, MUBUF_Pseudo ps> :
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
- let Inst{16} = !if(ps.lds, 1, 0);
+ let Inst{16} = ps.lds;
let Inst{17} = !if(ps.has_slc, slc, ?);
let Inst{24-18} = op;
let Inst{31-26} = 0x38; //encoding
@@ -2226,7 +2265,7 @@ class MUBUF_Real_gfx80 <bits<7> op, MUBUF_Pseudo ps> :
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
- let Inst{16} = !if(ps.lds, 1, 0);
+ let Inst{16} = ps.lds;
let Inst{17} = !if(ps.has_slc, slc, ?);
let Inst{24-18} = op;
let Inst{31-26} = 0x38; //encoding
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td
index beb01b1abf0f..328c81005df4 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -53,7 +53,7 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
}
class DS_Real <DS_Pseudo ds> :
- InstSI <ds.OutOperandList, ds.InOperandList, ds.Mnemonic # " " # ds.AsmOperands, []>,
+ InstSI <ds.OutOperandList, ds.InOperandList, ds.Mnemonic # ds.AsmOperands, []>,
Enc64 {
let isPseudo = 0;
@@ -87,7 +87,7 @@ class DS_0A1D_NORET<string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
(outs),
(ins rc:$data0, offset:$offset, gds:$gds),
- "$data0$offset$gds"> {
+ " $data0$offset$gds"> {
let has_addr = 0;
let has_data1 = 0;
@@ -98,7 +98,7 @@ class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
(outs),
(ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds),
- "$addr, $data0$offset$gds"> {
+ " $addr, $data0$offset$gds"> {
let has_data1 = 0;
let has_vdst = 0;
@@ -118,7 +118,7 @@ class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
(outs),
(ins VGPR_32:$addr, rc:$data0, rc:$data1, offset:$offset, gds:$gds),
- "$addr, $data0, $data1"#"$offset"#"$gds"> {
+ " $addr, $data0, $data1$offset$gds"> {
let has_vdst = 0;
}
@@ -138,7 +138,7 @@ class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32>
(outs),
(ins VGPR_32:$addr, rc:$data0, rc:$data1,
offset0:$offset0, offset1:$offset1, gds:$gds),
- "$addr, $data0, $data1$offset0$offset1$gds"> {
+ " $addr, $data0, $data1$offset0$offset1$gds"> {
let has_vdst = 0;
let has_offset = 0;
@@ -157,7 +157,7 @@ class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
(outs rc:$vdst),
(ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds),
- "$vdst, $addr, $data0$offset$gds"> {
+ " $vdst, $addr, $data0$offset$gds"> {
let hasPostISelHook = 1;
let has_data1 = 0;
@@ -166,12 +166,12 @@ class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32>
multiclass DS_1A1D_RET_mc <string opName, RegisterClass rc = VGPR_32,
string NoRetOp = ""> {
def "" : DS_1A1D_RET<opName, rc>,
- AtomicNoRet<NoRetOp, !if(!eq(NoRetOp, ""), 0, 1)>;
+ AtomicNoRet<NoRetOp, !ne(NoRetOp, "")>;
let has_m0_read = 0 in {
def _gfx9 : DS_1A1D_RET<opName, rc>,
AtomicNoRet<!if(!eq(NoRetOp, ""), "", NoRetOp#"_gfx9"),
- !if(!eq(NoRetOp, ""), 0, 1)>;
+ !ne(NoRetOp, "")>;
}
}
@@ -181,7 +181,7 @@ class DS_1A2D_RET<string opName,
: DS_Pseudo<opName,
(outs rc:$vdst),
(ins VGPR_32:$addr, src:$data0, src:$data1, offset:$offset, gds:$gds),
- "$vdst, $addr, $data0, $data1$offset$gds"> {
+ " $vdst, $addr, $data0, $data1$offset$gds"> {
let hasPostISelHook = 1;
}
@@ -191,11 +191,11 @@ multiclass DS_1A2D_RET_mc<string opName,
string NoRetOp = "",
RegisterClass src = rc> {
def "" : DS_1A2D_RET<opName, rc, src>,
- AtomicNoRet<NoRetOp, !if(!eq(NoRetOp, ""), 0, 1)>;
+ AtomicNoRet<NoRetOp, !ne(NoRetOp, "")>;
let has_m0_read = 0 in {
def _gfx9 : DS_1A2D_RET<opName, rc, src>,
- AtomicNoRet<NoRetOp#"_gfx9", !if(!eq(NoRetOp, ""), 0, 1)>;
+ AtomicNoRet<NoRetOp#"_gfx9", !ne(NoRetOp, "")>;
}
}
@@ -205,7 +205,7 @@ class DS_1A2D_Off8_RET<string opName,
: DS_Pseudo<opName,
(outs rc:$vdst),
(ins VGPR_32:$addr, src:$data0, src:$data1, offset0:$offset0, offset1:$offset1, gds:$gds),
- "$vdst, $addr, $data0, $data1$offset0$offset1$gds"> {
+ " $vdst, $addr, $data0, $data1$offset0$offset1$gds"> {
let has_offset = 0;
let AsmMatchConverter = "cvtDSOffset01";
@@ -230,7 +230,7 @@ class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0
!if(HasTiedOutput,
(ins VGPR_32:$addr, ofs:$offset, gds:$gds, rc:$vdst_in),
(ins VGPR_32:$addr, ofs:$offset, gds:$gds)),
- "$vdst, $addr$offset$gds"> {
+ " $vdst, $addr$offset$gds"> {
let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
let has_data0 = 0;
@@ -252,7 +252,7 @@ class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
(outs rc:$vdst),
(ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1, gds:$gds),
- "$vdst, $addr$offset0$offset1$gds"> {
+ " $vdst, $addr$offset0$offset1$gds"> {
let has_offset = 0;
let has_data0 = 0;
@@ -271,7 +271,7 @@ multiclass DS_1A_Off8_RET_mc <string opName, RegisterClass rc = VGPR_32> {
class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName,
(outs VGPR_32:$vdst),
(ins VGPR_32:$addr, offset:$offset),
- "$vdst, $addr$offset gds"> {
+ " $vdst, $addr$offset gds"> {
let has_data0 = 0;
let has_data1 = 0;
@@ -283,7 +283,7 @@ class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName,
class DS_0A_RET <string opName> : DS_Pseudo<opName,
(outs VGPR_32:$vdst),
(ins offset:$offset, gds:$gds),
- "$vdst$offset$gds"> {
+ " $vdst$offset$gds"> {
let mayLoad = 1;
let mayStore = 1;
@@ -296,7 +296,7 @@ class DS_0A_RET <string opName> : DS_Pseudo<opName,
class DS_1A <string opName> : DS_Pseudo<opName,
(outs),
(ins VGPR_32:$addr, offset:$offset, gds:$gds),
- "$addr$offset$gds"> {
+ " $addr$offset$gds"> {
let mayLoad = 1;
let mayStore = 1;
@@ -330,13 +330,13 @@ class DS_GWS <string opName, dag ins, string asmOps>
class DS_GWS_0D <string opName>
: DS_GWS<opName,
- (ins offset:$offset, gds:$gds), "$offset gds"> {
+ (ins offset:$offset), "$offset gds"> {
let hasSideEffects = 1;
}
class DS_GWS_1D <string opName>
: DS_GWS<opName,
- (ins VGPR_32:$data0, offset:$offset, gds:$gds), "$data0$offset gds"> {
+ (ins VGPR_32:$data0, offset:$offset), " $data0$offset gds"> {
let has_gws_data0 = 1;
let hasSideEffects = 1;
@@ -364,7 +364,7 @@ class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag>
: DS_Pseudo<opName,
(outs VGPR_32:$vdst),
(ins VGPR_32:$addr, VGPR_32:$data0, offset:$offset),
- "$vdst, $addr, $data0$offset",
+ " $vdst, $addr, $data0$offset",
[(set i32:$vdst,
(node (DS1Addr1Offset i32:$addr, i16:$offset), i32:$data0))] > {
@@ -680,7 +680,29 @@ foreach vt = VReg_64.RegTypes in {
defm : DSReadPat_mc <DS_READ_B64, vt, "load_align8_local">;
}
-defm : DSReadPat_mc <DS_READ_B128, v4i32, "load_align16_local">;
+let SubtargetPredicate = isGFX7Plus in {
+
+foreach vt = VReg_96.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B96, vt, "load_align16_local">;
+}
+
+foreach vt = VReg_128.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B128, vt, "load_align16_local">;
+}
+
+let SubtargetPredicate = HasUnalignedAccessMode in {
+
+foreach vt = VReg_96.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B96, vt, "load_local">;
+}
+
+foreach vt = VReg_128.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B128, vt, "load_local">;
+}
+
+} // End SubtargetPredicate = HasUnalignedAccessMode
+
+} // End SubtargetPredicate = isGFX7Plus
} // End AddedComplexity = 100
@@ -719,7 +741,7 @@ multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
// normal store.
class DSAtomicWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
(frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value),
- (inst $ptr, $value, offset:$offset, (i1 0))
+ (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 0))
>;
multiclass DSAtomicWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
@@ -761,6 +783,18 @@ class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, ValueType vt, PatFrag frag> :
(i1 0))
>;
+class DS128Bit8ByteAlignedReadPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
+ (vt:$value (frag (DS128Bit8ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))),
+ (inst $ptr, $offset0, $offset1, (i1 0))
+>;
+
+class DS128Bit8ByteAlignedWritePat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat<
+ (frag vt:$value, (DS128Bit8ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)),
+ (inst $ptr, (i64 (EXTRACT_SUBREG VReg_128:$value, sub0_sub1)),
+ (i64 (EXTRACT_SUBREG VReg_128:$value, sub2_sub3)), $offset0, $offset1,
+ (i1 0))
+>;
+
multiclass DS64Bit4ByteAlignedPat_mc<ValueType vt> {
let OtherPredicates = [LDSRequiresM0Init, isGFX7Plus] in {
def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, vt, load_local_m0>;
@@ -773,21 +807,60 @@ multiclass DS64Bit4ByteAlignedPat_mc<ValueType vt> {
}
}
+multiclass DS128Bit8ByteAlignedPat_mc<ValueType vt> {
+ let OtherPredicates = [LDSRequiresM0Init, isGFX7Plus] in {
+ def : DS128Bit8ByteAlignedReadPat<DS_READ2_B64, vt, load_local_m0>;
+ def : DS128Bit8ByteAlignedWritePat<DS_WRITE2_B64, vt, store_local_m0>;
+ }
+
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ def : DS128Bit8ByteAlignedReadPat<DS_READ2_B64_gfx9, vt, load_local>;
+ def : DS128Bit8ByteAlignedWritePat<DS_WRITE2_B64_gfx9, vt, store_local>;
+ }
+}
+
// v2i32 loads are split into i32 loads on SI during lowering, due to a bug
// related to bounds checking.
foreach vt = VReg_64.RegTypes in {
defm : DS64Bit4ByteAlignedPat_mc<vt>;
}
+foreach vt = VReg_128.RegTypes in {
+defm : DS128Bit8ByteAlignedPat_mc<vt>;
+}
+
let AddedComplexity = 100 in {
foreach vt = VReg_64.RegTypes in {
defm : DSWritePat_mc <DS_WRITE_B64, vt, "store_align8_local">;
}
-defm : DSWritePat_mc <DS_WRITE_B128, v4i32, "store_align16_local">;
+let SubtargetPredicate = isGFX7Plus in {
+
+foreach vt = VReg_96.RegTypes in {
+defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_align16_local">;
+}
+
+foreach vt = VReg_128.RegTypes in {
+defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align16_local">;
+}
+
+let SubtargetPredicate = HasUnalignedAccessMode in {
+
+foreach vt = VReg_96.RegTypes in {
+defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_local">;
+}
+
+foreach vt = VReg_128.RegTypes in {
+defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_local">;
+}
+
+} // End SubtargetPredicate = HasUnalignedAccessMode
+
+} // End SubtargetPredicate = isGFX7Plus
} // End AddedComplexity = 100
+
class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
(frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value),
(inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 9c2f2e7eecd1..8061c6c509e0 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -17,42 +17,24 @@
// ToDo: What to do with instruction suffixes (v_mov_b32 vs v_mov_b32_e32)?
#include "Disassembler/AMDGPUDisassembler.h"
-#include "AMDGPU.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIDefines.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm-c/Disassembler.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/BinaryFormat/ELF.h"
+#include "llvm-c/DisassemblerTypes.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/Endian.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <iterator>
-#include <tuple>
-#include <vector>
using namespace llvm;
#define DEBUG_TYPE "amdgpu-disassembler"
-#define SGPR_MAX (isGFX10() ? AMDGPU::EncValues::SGPR_MAX_GFX10 \
- : AMDGPU::EncValues::SGPR_MAX_SI)
+#define SGPR_MAX \
+ (isGFX10Plus() ? AMDGPU::EncValues::SGPR_MAX_GFX10 \
+ : AMDGPU::EncValues::SGPR_MAX_SI)
using DecodeStatus = llvm::MCDisassembler::DecodeStatus;
@@ -63,7 +45,7 @@ AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI,
TargetMaxInstBytes(Ctx.getAsmInfo()->getMaxInstLength(&STI)) {
// ToDo: AMDGPUDisassembler supports only VI ISA.
- if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding] && !isGFX10())
+ if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding] && !isGFX10Plus())
report_fatal_error("Disassembly not yet supported for subtarget");
}
@@ -139,6 +121,8 @@ DECODE_OPERAND_REG(VS_128)
DECODE_OPERAND_REG(VReg_64)
DECODE_OPERAND_REG(VReg_96)
DECODE_OPERAND_REG(VReg_128)
+DECODE_OPERAND_REG(VReg_256)
+DECODE_OPERAND_REG(VReg_512)
DECODE_OPERAND_REG(SReg_32)
DECODE_OPERAND_REG(SReg_32_XM0_XEXEC)
@@ -382,15 +366,24 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi ||
MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx6_gfx7 ||
MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx10 ||
+ MI.getOpcode() == AMDGPU::V_MAC_LEGACY_F32_e64_gfx6_gfx7 ||
+ MI.getOpcode() == AMDGPU::V_MAC_LEGACY_F32_e64_gfx10 ||
MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi ||
MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi ||
MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_gfx10 ||
+ MI.getOpcode() == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 ||
MI.getOpcode() == AMDGPU::V_FMAC_F16_e64_gfx10)) {
// Insert dummy unused src2_modifiers.
insertNamedMCOperand(MI, MCOperand::createImm(0),
AMDGPU::OpName::src2_modifiers);
}
+ if (Res && (MCII->get(MI.getOpcode()).TSFlags &
+ (SIInstrFlags::MUBUF | SIInstrFlags::FLAT)) &&
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::glc1) != -1) {
+ insertNamedMCOperand(MI, MCOperand::createImm(1), AMDGPU::OpName::glc1);
+ }
+
if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG)) {
int VAddr0Idx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
@@ -499,8 +492,16 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
AMDGPU::OpName::d16);
assert(VDataIdx != -1);
- assert(DMaskIdx != -1);
- assert(TFEIdx != -1);
+ if (DMaskIdx == -1 || TFEIdx == -1) {// intersect_ray
+ if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::a16) > -1) {
+ assert(MI.getOpcode() == AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa ||
+ MI.getOpcode() == AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa ||
+ MI.getOpcode() == AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa ||
+ MI.getOpcode() == AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa);
+ addOperand(MI, MCOperand::createImm(1));
+ }
+ return MCDisassembler::Success;
+ }
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
bool IsAtomic = (VDstIdx != -1);
@@ -544,9 +545,8 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
DstSize = (DstSize + 1) / 2;
}
- // FIXME: Add tfe support
if (MI.getOperand(TFEIdx).getImm())
- return MCDisassembler::Success;
+ DstSize += 1;
if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords)
return MCDisassembler::Success;
@@ -996,10 +996,8 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const {
using namespace AMDGPU::EncValues;
- unsigned TTmpMin =
- (isGFX9() || isGFX10()) ? TTMP_GFX9_GFX10_MIN : TTMP_VI_MIN;
- unsigned TTmpMax =
- (isGFX9() || isGFX10()) ? TTMP_GFX9_GFX10_MAX : TTMP_VI_MAX;
+ unsigned TTmpMin = isGFX9Plus() ? TTMP_GFX9PLUS_MIN : TTMP_VI_MIN;
+ unsigned TTmpMax = isGFX9Plus() ? TTMP_GFX9PLUS_MAX : TTMP_VI_MAX;
return (TTmpMin <= Val && Val <= TTmpMax)? Val - TTmpMin : -1;
}
@@ -1017,7 +1015,8 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c
: getVgprClassId(Width), Val - VGPR_MIN);
}
if (Val <= SGPR_MAX) {
- assert(SGPR_MIN == 0); // "SGPR_MIN <= Val" is always true and causes compilation warning.
+ // "SGPR_MIN <= Val" is always true and causes compilation warning.
+ static_assert(SGPR_MIN == 0, "");
return createSRegOperand(getSgprClassId(Width), Val - SGPR_MIN);
}
@@ -1054,7 +1053,8 @@ MCOperand AMDGPUDisassembler::decodeDstOp(const OpWidthTy Width, unsigned Val) c
assert(Width == OPW256 || Width == OPW512);
if (Val <= SGPR_MAX) {
- assert(SGPR_MIN == 0); // "SGPR_MIN <= Val" is always true and causes compilation warning.
+ // "SGPR_MIN <= Val" is always true and causes compilation warning.
+ static_assert(SGPR_MIN == 0, "");
return createSRegOperand(getSgprClassId(Width), Val - SGPR_MIN);
}
@@ -1137,8 +1137,8 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width,
Val - SDWA9EncValues::SRC_VGPR_MIN);
}
if (SDWA9EncValues::SRC_SGPR_MIN <= Val &&
- Val <= (isGFX10() ? SDWA9EncValues::SRC_SGPR_MAX_GFX10
- : SDWA9EncValues::SRC_SGPR_MAX_SI)) {
+ Val <= (isGFX10Plus() ? SDWA9EncValues::SRC_SGPR_MAX_GFX10
+ : SDWA9EncValues::SRC_SGPR_MAX_SI)) {
return createSRegOperand(getSgprClassId(Width),
Val - SDWA9EncValues::SRC_SGPR_MIN);
}
@@ -1207,12 +1207,358 @@ bool AMDGPUDisassembler::isVI() const {
return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands];
}
-bool AMDGPUDisassembler::isGFX9() const {
- return STI.getFeatureBits()[AMDGPU::FeatureGFX9];
+bool AMDGPUDisassembler::isGFX9() const { return AMDGPU::isGFX9(STI); }
+
+bool AMDGPUDisassembler::isGFX9Plus() const { return AMDGPU::isGFX9Plus(STI); }
+
+bool AMDGPUDisassembler::isGFX10() const { return AMDGPU::isGFX10(STI); }
+
+bool AMDGPUDisassembler::isGFX10Plus() const {
+ return AMDGPU::isGFX10Plus(STI);
+}
+
+//===----------------------------------------------------------------------===//
+// AMDGPU specific symbol handling
+//===----------------------------------------------------------------------===//
+#define PRINT_DIRECTIVE(DIRECTIVE, MASK) \
+ do { \
+ KdStream << Indent << DIRECTIVE " " \
+ << ((FourByteBuffer & MASK) >> (MASK##_SHIFT)) << '\n'; \
+ } while (0)
+
+// NOLINTNEXTLINE(readability-identifier-naming)
+MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1(
+ uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
+ using namespace amdhsa;
+ StringRef Indent = "\t";
+
+ // We cannot accurately backward compute #VGPRs used from
+ // GRANULATED_WORKITEM_VGPR_COUNT. But we are concerned with getting the same
+ // value of GRANULATED_WORKITEM_VGPR_COUNT in the reassembled binary. So we
+ // simply calculate the inverse of what the assembler does.
+
+ uint32_t GranulatedWorkitemVGPRCount =
+ (FourByteBuffer & COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT) >>
+ COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT;
+
+ uint32_t NextFreeVGPR = (GranulatedWorkitemVGPRCount + 1) *
+ AMDGPU::IsaInfo::getVGPREncodingGranule(&STI);
+
+ KdStream << Indent << ".amdhsa_next_free_vgpr " << NextFreeVGPR << '\n';
+
+ // We cannot backward compute values used to calculate
+ // GRANULATED_WAVEFRONT_SGPR_COUNT. Hence the original values for following
+ // directives can't be computed:
+ // .amdhsa_reserve_vcc
+ // .amdhsa_reserve_flat_scratch
+ // .amdhsa_reserve_xnack_mask
+ // They take their respective default values if not specified in the assembly.
+ //
+ // GRANULATED_WAVEFRONT_SGPR_COUNT
+ // = f(NEXT_FREE_SGPR + VCC + FLAT_SCRATCH + XNACK_MASK)
+ //
+ // We compute the inverse as though all directives apart from NEXT_FREE_SGPR
+ // are set to 0. So while disassembling we consider that:
+ //
+ // GRANULATED_WAVEFRONT_SGPR_COUNT
+ // = f(NEXT_FREE_SGPR + 0 + 0 + 0)
+ //
+ // The disassembler cannot recover the original values of those 3 directives.
+
+ uint32_t GranulatedWavefrontSGPRCount =
+ (FourByteBuffer & COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT) >>
+ COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT;
+
+ if (isGFX10Plus() && GranulatedWavefrontSGPRCount)
+ return MCDisassembler::Fail;
+
+ uint32_t NextFreeSGPR = (GranulatedWavefrontSGPRCount + 1) *
+ AMDGPU::IsaInfo::getSGPREncodingGranule(&STI);
+
+ KdStream << Indent << ".amdhsa_reserve_vcc " << 0 << '\n';
+ KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n';
+ KdStream << Indent << ".amdhsa_reserve_xnack_mask " << 0 << '\n';
+ KdStream << Indent << ".amdhsa_next_free_sgpr " << NextFreeSGPR << "\n";
+
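For reference, a minimal standalone C++ sketch of the VGPR round-trip described in the comments above; it is illustrative only and not part of the patch. The helper names and the granule value of 4 are assumptions chosen for the example (the real granule comes from AMDGPU::IsaInfo::getVGPREncodingGranule), and the assembler-side formula is only approximated.

#include <cstdint>

// Hypothetical helpers, illustrative only.
constexpr uint32_t granulatedVGPRField(uint32_t NextFreeVGPR, uint32_t Granule) {
  // Assembler direction (sketch; ignores the NextFreeVGPR == 0 case):
  // round up to the granule, then store count - 1.
  return (NextFreeVGPR + Granule - 1) / Granule - 1;
}

constexpr uint32_t nextFreeVGPRFromField(uint32_t Field, uint32_t Granule) {
  // Disassembler direction, matching decodeCOMPUTE_PGM_RSRC1 above:
  // (field + 1) * granule.
  return (Field + 1) * Granule;
}

// Assuming a granule of 4: 21 in-use VGPRs encode as field value 5, the
// disassembler prints ".amdhsa_next_free_vgpr 24", and re-assembling 24
// yields field value 5 again. The field round-trips even though the exact
// original count (anything in 21..24) is not recoverable.
static_assert(granulatedVGPRField(21, 4) == 5, "");
static_assert(nextFreeVGPRFromField(5, 4) == 24, "");
static_assert(granulatedVGPRField(24, 4) == 5, "");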
+ if (FourByteBuffer & COMPUTE_PGM_RSRC1_PRIORITY)
+ return MCDisassembler::Fail;
+
+ PRINT_DIRECTIVE(".amdhsa_float_round_mode_32",
+ COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32);
+ PRINT_DIRECTIVE(".amdhsa_float_round_mode_16_64",
+ COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64);
+ PRINT_DIRECTIVE(".amdhsa_float_denorm_mode_32",
+ COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32);
+ PRINT_DIRECTIVE(".amdhsa_float_denorm_mode_16_64",
+ COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64);
+
+ if (FourByteBuffer & COMPUTE_PGM_RSRC1_PRIV)
+ return MCDisassembler::Fail;
+
+ PRINT_DIRECTIVE(".amdhsa_dx10_clamp", COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP);
+
+ if (FourByteBuffer & COMPUTE_PGM_RSRC1_DEBUG_MODE)
+ return MCDisassembler::Fail;
+
+ PRINT_DIRECTIVE(".amdhsa_ieee_mode", COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE);
+
+ if (FourByteBuffer & COMPUTE_PGM_RSRC1_BULKY)
+ return MCDisassembler::Fail;
+
+ if (FourByteBuffer & COMPUTE_PGM_RSRC1_CDBG_USER)
+ return MCDisassembler::Fail;
+
+ PRINT_DIRECTIVE(".amdhsa_fp16_overflow", COMPUTE_PGM_RSRC1_FP16_OVFL);
+
+ if (FourByteBuffer & COMPUTE_PGM_RSRC1_RESERVED0)
+ return MCDisassembler::Fail;
+
+ if (isGFX10Plus()) {
+ PRINT_DIRECTIVE(".amdhsa_workgroup_processor_mode",
+ COMPUTE_PGM_RSRC1_WGP_MODE);
+ PRINT_DIRECTIVE(".amdhsa_memory_ordered", COMPUTE_PGM_RSRC1_MEM_ORDERED);
+ PRINT_DIRECTIVE(".amdhsa_forward_progress", COMPUTE_PGM_RSRC1_FWD_PROGRESS);
+ }
+ return MCDisassembler::Success;
}
-bool AMDGPUDisassembler::isGFX10() const {
- return STI.getFeatureBits()[AMDGPU::FeatureGFX10];
+// NOLINTNEXTLINE(readability-identifier-naming)
+MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC2(
+ uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
+ using namespace amdhsa;
+ StringRef Indent = "\t";
+ PRINT_DIRECTIVE(
+ ".amdhsa_system_sgpr_private_segment_wavefront_offset",
+ COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
+ PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_x",
+ COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
+ PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_y",
+ COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y);
+ PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_z",
+ COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z);
+ PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_info",
+ COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO);
+ PRINT_DIRECTIVE(".amdhsa_system_vgpr_workitem_id",
+ COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID);
+
+ if (FourByteBuffer & COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_ADDRESS_WATCH)
+ return MCDisassembler::Fail;
+
+ if (FourByteBuffer & COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_MEMORY)
+ return MCDisassembler::Fail;
+
+ if (FourByteBuffer & COMPUTE_PGM_RSRC2_GRANULATED_LDS_SIZE)
+ return MCDisassembler::Fail;
+
+ PRINT_DIRECTIVE(
+ ".amdhsa_exception_fp_ieee_invalid_op",
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION);
+ PRINT_DIRECTIVE(".amdhsa_exception_fp_denorm_src",
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE);
+ PRINT_DIRECTIVE(
+ ".amdhsa_exception_fp_ieee_div_zero",
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO);
+ PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_overflow",
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW);
+ PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_underflow",
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW);
+ PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_inexact",
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT);
+ PRINT_DIRECTIVE(".amdhsa_exception_int_div_zero",
+ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO);
+
+ if (FourByteBuffer & COMPUTE_PGM_RSRC2_RESERVED0)
+ return MCDisassembler::Fail;
+
+ return MCDisassembler::Success;
+}
+
+#undef PRINT_DIRECTIVE
+
+MCDisassembler::DecodeStatus
+AMDGPUDisassembler::decodeKernelDescriptorDirective(
+ DataExtractor::Cursor &Cursor, ArrayRef<uint8_t> Bytes,
+ raw_string_ostream &KdStream) const {
+#define PRINT_DIRECTIVE(DIRECTIVE, MASK) \
+ do { \
+ KdStream << Indent << DIRECTIVE " " \
+ << ((TwoByteBuffer & MASK) >> (MASK##_SHIFT)) << '\n'; \
+ } while (0)
+
+ uint16_t TwoByteBuffer = 0;
+ uint32_t FourByteBuffer = 0;
+ uint64_t EightByteBuffer = 0;
+
+ StringRef ReservedBytes;
+ StringRef Indent = "\t";
+
+ assert(Bytes.size() == 64);
+ DataExtractor DE(Bytes, /*IsLittleEndian=*/true, /*AddressSize=*/8);
+
+ switch (Cursor.tell()) {
+ case amdhsa::GROUP_SEGMENT_FIXED_SIZE_OFFSET:
+ FourByteBuffer = DE.getU32(Cursor);
+ KdStream << Indent << ".amdhsa_group_segment_fixed_size " << FourByteBuffer
+ << '\n';
+ return MCDisassembler::Success;
+
+ case amdhsa::PRIVATE_SEGMENT_FIXED_SIZE_OFFSET:
+ FourByteBuffer = DE.getU32(Cursor);
+ KdStream << Indent << ".amdhsa_private_segment_fixed_size "
+ << FourByteBuffer << '\n';
+ return MCDisassembler::Success;
+
+ case amdhsa::RESERVED0_OFFSET:
+ // 8 reserved bytes, must be 0.
+ EightByteBuffer = DE.getU64(Cursor);
+ if (EightByteBuffer) {
+ return MCDisassembler::Fail;
+ }
+ return MCDisassembler::Success;
+
+ case amdhsa::KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET:
+ // KERNEL_CODE_ENTRY_BYTE_OFFSET
+ // So far no directive controls this for Code Object V3, so simply skip for
+ // disassembly.
+ DE.skip(Cursor, 8);
+ return MCDisassembler::Success;
+
+ case amdhsa::RESERVED1_OFFSET:
+ // 20 reserved bytes, must be 0.
+ ReservedBytes = DE.getBytes(Cursor, 20);
+ for (int I = 0; I < 20; ++I) {
+ if (ReservedBytes[I] != 0) {
+ return MCDisassembler::Fail;
+ }
+ }
+ return MCDisassembler::Success;
+
+ case amdhsa::COMPUTE_PGM_RSRC3_OFFSET:
+ // COMPUTE_PGM_RSRC3
+ // - Only set for GFX10; GFX6-9 require this to be 0.
+ // - Currently no directives directly control this.
+ FourByteBuffer = DE.getU32(Cursor);
+ if (!isGFX10Plus() && FourByteBuffer) {
+ return MCDisassembler::Fail;
+ }
+ return MCDisassembler::Success;
+
+ case amdhsa::COMPUTE_PGM_RSRC1_OFFSET:
+ FourByteBuffer = DE.getU32(Cursor);
+ if (decodeCOMPUTE_PGM_RSRC1(FourByteBuffer, KdStream) ==
+ MCDisassembler::Fail) {
+ return MCDisassembler::Fail;
+ }
+ return MCDisassembler::Success;
+
+ case amdhsa::COMPUTE_PGM_RSRC2_OFFSET:
+ FourByteBuffer = DE.getU32(Cursor);
+ if (decodeCOMPUTE_PGM_RSRC2(FourByteBuffer, KdStream) ==
+ MCDisassembler::Fail) {
+ return MCDisassembler::Fail;
+ }
+ return MCDisassembler::Success;
+
+ case amdhsa::KERNEL_CODE_PROPERTIES_OFFSET:
+ using namespace amdhsa;
+ TwoByteBuffer = DE.getU16(Cursor);
+
+ PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer",
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
+ PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_ptr",
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
+ PRINT_DIRECTIVE(".amdhsa_user_sgpr_queue_ptr",
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR);
+ PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_segment_ptr",
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
+ PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_id",
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
+ PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init",
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
+ PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
+
+ if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0)
+ return MCDisassembler::Fail;
+
+ // Reserved for GFX9
+ if (isGFX9() &&
+ (TwoByteBuffer & KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32)) {
+ return MCDisassembler::Fail;
+ } else if (isGFX10Plus()) {
+ PRINT_DIRECTIVE(".amdhsa_wavefront_size32",
+ KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
+ }
+
+ if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED1)
+ return MCDisassembler::Fail;
+
+ return MCDisassembler::Success;
+
+ case amdhsa::RESERVED2_OFFSET:
+ // 6 bytes from here are reserved, must be 0.
+ ReservedBytes = DE.getBytes(Cursor, 6);
+ for (int I = 0; I < 6; ++I) {
+ if (ReservedBytes[I] != 0)
+ return MCDisassembler::Fail;
+ }
+ return MCDisassembler::Success;
+
+ default:
+ llvm_unreachable("Unhandled index. Case statements cover everything.");
+ return MCDisassembler::Fail;
+ }
+#undef PRINT_DIRECTIVE
+}
+
+MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeKernelDescriptor(
+ StringRef KdName, ArrayRef<uint8_t> Bytes, uint64_t KdAddress) const {
+ // CP microcode requires the kernel descriptor to be 64-byte aligned.
+ if (Bytes.size() != 64 || KdAddress % 64 != 0)
+ return MCDisassembler::Fail;
+
+ std::string Kd;
+ raw_string_ostream KdStream(Kd);
+ KdStream << ".amdhsa_kernel " << KdName << '\n';
+
+ DataExtractor::Cursor C(0);
+ while (C && C.tell() < Bytes.size()) {
+ MCDisassembler::DecodeStatus Status =
+ decodeKernelDescriptorDirective(C, Bytes, KdStream);
+
+ cantFail(C.takeError());
+
+ if (Status == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
+ }
+ KdStream << ".end_amdhsa_kernel\n";
+ outs() << KdStream.str();
+ return MCDisassembler::Success;
+}
+
+Optional<MCDisassembler::DecodeStatus>
+AMDGPUDisassembler::onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &CStream) const {
+ // Right now only kernel descriptor needs to be handled.
+ // We ignore all other symbols for target specific handling.
+ // TODO:
+ // Fix the spurious symbol issue for AMDGPU kernels. Exists for both Code
+ // Object V2 and V3 when symbols are marked protected.
+
+ // amd_kernel_code_t for Code Object V2.
+ if (Symbol.Type == ELF::STT_AMDGPU_HSA_KERNEL) {
+ Size = 256;
+ return MCDisassembler::Fail;
+ }
+
+ // Code Object V3 kernel descriptors.
+ StringRef Name = Symbol.Name;
+ if (Symbol.Type == ELF::STT_OBJECT && Name.endswith(StringRef(".kd"))) {
+ Size = 64; // Size = 64 regardless of success or failure.
+ return decodeKernelDescriptor(Name.drop_back(3), Bytes, Address);
+ }
+ return None;
}
//===----------------------------------------------------------------------===//
@@ -1233,11 +1579,10 @@ bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst,
if (!Symbols)
return false;
- auto Result = std::find_if(Symbols->begin(), Symbols->end(),
- [Value](const SymbolInfoTy& Val) {
- return Val.Addr == static_cast<uint64_t>(Value)
- && Val.Type == ELF::STT_NOTYPE;
- });
+ auto Result = llvm::find_if(*Symbols, [Value](const SymbolInfoTy &Val) {
+ return Val.Addr == static_cast<uint64_t>(Value) &&
+ Val.Type == ELF::STT_NOTYPE;
+ });
if (Result != Symbols->end()) {
auto *Sym = Ctx.getOrCreateSymbol(Result->Name);
const auto *Add = MCSymbolRefExpr::create(Sym, Ctx);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index f975af409a09..714dabbc5184 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -15,15 +15,9 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H
#define LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
-#include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
-#include "llvm/MC/MCDisassembler/MCSymbolizer.h"
-
-#include <algorithm>
-#include <cstdint>
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/DataExtractor.h"
#include <memory>
namespace llvm {
@@ -66,6 +60,33 @@ public:
DecodeStatus tryDecodeInst(const uint8_t* Table, MCInst &MI, uint64_t Inst,
uint64_t Address) const;
+ Optional<DecodeStatus> onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &CStream) const override;
+
+ DecodeStatus decodeKernelDescriptor(StringRef KdName, ArrayRef<uint8_t> Bytes,
+ uint64_t KdAddress) const;
+
+ DecodeStatus
+ decodeKernelDescriptorDirective(DataExtractor::Cursor &Cursor,
+ ArrayRef<uint8_t> Bytes,
+ raw_string_ostream &KdStream) const;
+
+ /// Decode as directives that handle COMPUTE_PGM_RSRC1.
+ /// \param FourByteBuffer - Bytes holding contents of COMPUTE_PGM_RSRC1.
+ /// \param KdStream - Stream to write the disassembled directives to.
+ // NOLINTNEXTLINE(readability-identifier-naming)
+ DecodeStatus decodeCOMPUTE_PGM_RSRC1(uint32_t FourByteBuffer,
+ raw_string_ostream &KdStream) const;
+
+ /// Decode as directives that handle COMPUTE_PGM_RSRC2.
+ /// \param FourByteBuffer - Bytes holding contents of COMPUTE_PGM_RSRC2.
+ /// \param KdStream - Stream to write the disassembled directives to.
+ // NOLINTNEXTLINE(readability-identifier-naming)
+ DecodeStatus decodeCOMPUTE_PGM_RSRC2(uint32_t FourByteBuffer,
+ raw_string_ostream &KdStream) const;
+
DecodeStatus convertSDWAInst(MCInst &MI) const;
DecodeStatus convertDPP8Inst(MCInst &MI) const;
DecodeStatus convertMIMGInst(MCInst &MI) const;
@@ -140,7 +161,9 @@ public:
bool isVI() const;
bool isGFX9() const;
+ bool isGFX9Plus() const;
bool isGFX10() const;
+ bool isGFX10Plus() const;
};
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/EXPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/EXPInstructions.td
new file mode 100644
index 000000000000..b3b55ddd2c97
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/EXPInstructions.td
@@ -0,0 +1,125 @@
+//===-- EXPInstructions.td - Export Instruction Definitions ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// EXP classes
+//===----------------------------------------------------------------------===//
+
+class EXPCommon<bit done, string asm = ""> : InstSI<
+ (outs),
+ (ins exp_tgt:$tgt,
+ ExpSrc0:$src0, ExpSrc1:$src1, ExpSrc2:$src2, ExpSrc3:$src3,
+ exp_vm:$vm, exp_compr:$compr, i32imm:$en),
+ asm> {
+ let EXP = 1;
+ let EXP_CNT = 1;
+ let mayLoad = done;
+ let mayStore = 1;
+ let UseNamedOperandTable = 1;
+ let Uses = [EXEC];
+ let SchedRW = [WriteExport];
+ let DisableWQM = 1;
+}
+
+class EXP_Pseudo<bit done> : EXPCommon<done>,
+ SIMCInstr <NAME, SIEncodingFamily.NONE> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+}
+
+class EXP_Real<bit done, string pseudo, int subtarget>
+ : EXPCommon<done, "exp$tgt $src0, $src1, $src2, $src3"#!if(done, " done", "")
+ #"$compr$vm">,
+ SIMCInstr <pseudo, subtarget> {
+ let AsmMatchConverter = "cvtExp";
+}
+
+//===----------------------------------------------------------------------===//
+// EXP Instructions
+//===----------------------------------------------------------------------===//
+
+// Split EXP instruction into EXP and EXP_DONE so we can set
+// mayLoad for done=1.
+def EXP : EXP_Pseudo<0>;
+def EXP_DONE : EXP_Pseudo<1>;
+
+//===----------------------------------------------------------------------===//
+// SI
+//===----------------------------------------------------------------------===//
+
+class EXP_Real_si<bit _done, string pseudo>
+ : EXP_Real<_done, pseudo, SIEncodingFamily.SI>, EXPe {
+ let AssemblerPredicate = isGFX6GFX7;
+ let DecoderNamespace = "GFX6GFX7";
+ let done = _done;
+}
+
+def EXP_si : EXP_Real_si<0, "EXP">;
+def EXP_DONE_si : EXP_Real_si<1, "EXP_DONE">;
+
+//===----------------------------------------------------------------------===//
+// VI
+//===----------------------------------------------------------------------===//
+
+class EXP_Real_vi<bit _done, string pseudo>
+ : EXP_Real<_done, pseudo, SIEncodingFamily.VI>, EXPe_vi {
+ let AssemblerPredicate = isGFX8GFX9;
+ let DecoderNamespace = "GFX8";
+ let done = _done;
+}
+
+def EXP_vi : EXP_Real_vi<0, "EXP">;
+def EXP_DONE_vi : EXP_Real_vi<1, "EXP_DONE">;
+
+//===----------------------------------------------------------------------===//
+// GFX10+
+//===----------------------------------------------------------------------===//
+
+class EXP_Real_gfx10<bit _done, string pseudo>
+ : EXP_Real<_done, pseudo, SIEncodingFamily.GFX10>, EXPe {
+ let AssemblerPredicate = isGFX10Plus;
+ let DecoderNamespace = "GFX10";
+ let done = _done;
+}
+
+def EXP_gfx10 : EXP_Real_gfx10<0, "EXP">;
+def EXP_DONE_gfx10 : EXP_Real_gfx10<1, "EXP_DONE">;
+
+//===----------------------------------------------------------------------===//
+// EXP Patterns
+//===----------------------------------------------------------------------===//
+
+class ExpPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
+ (int_amdgcn_exp timm:$tgt, timm:$en,
+ (vt ExpSrc0:$src0), (vt ExpSrc1:$src1),
+ (vt ExpSrc2:$src2), (vt ExpSrc3:$src3),
+ done_val, timm:$vm),
+ (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1,
+ ExpSrc2:$src2, ExpSrc3:$src3, timm:$vm, 0, timm:$en)
+>;
+
+class ExpComprPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
+ (int_amdgcn_exp_compr timm:$tgt, timm:$en,
+ (vt ExpSrc0:$src0), (vt ExpSrc1:$src1),
+ done_val, timm:$vm),
+ (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1,
+ (IMPLICIT_DEF), (IMPLICIT_DEF), timm:$vm, 1, timm:$en)
+>;
+
+// FIXME: The generated DAG matcher seems to have strange behavior
+// with a 1-bit literal to match, so use a -1 for checking a true
+// 1-bit value.
+def : ExpPattern<i32, EXP, 0>;
+def : ExpPattern<i32, EXP_DONE, -1>;
+def : ExpPattern<f32, EXP, 0>;
+def : ExpPattern<f32, EXP_DONE, -1>;
+
+def : ExpComprPattern<v2i16, EXP, 0>;
+def : ExpComprPattern<v2i16, EXP_DONE, -1>;
+def : ExpComprPattern<v2f16, EXP, 0>;
+def : ExpComprPattern<v2f16, EXP_DONE, -1>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index 97104a242d8c..8d3e138ba56a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -30,6 +30,15 @@ class EGOrCaymanPat<dag pattern, dag result> : AMDGPUPat<pattern, result> {
let SubtargetPredicate = isEGorCayman;
}
+def IMMZeroBasedBitfieldMask : ImmLeaf <i32, [{
+ return isMask_32(Imm);
+}]>;
+
+def IMMPopCount : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(countPopulation(N->getZExtValue()), SDLoc(N),
+ MVT::i32);
+}]>;
+
//===----------------------------------------------------------------------===//
// Evergreen / Cayman store instructions
//===----------------------------------------------------------------------===//
@@ -69,7 +78,7 @@ multiclass RAT_ATOMIC<bits<6> op_ret, bits<6> op_noret, string name> {
def _RTN: CF_MEM_RAT <op_ret, 0, 0xf,
(ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr),
(outs R600_Reg128:$out_gpr),
- name # "_RTN" # " $rw_gpr, $index_gpr", [] >;
+ name # "_RTN $rw_gpr, $index_gpr", [] >;
def _NORET: CF_MEM_RAT <op_noret, 0, 0xf,
(ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr),
(outs R600_Reg128:$out_gpr),
@@ -394,7 +403,41 @@ def BFE_INT_eg : R600_3OP <0x5, "BFE_INT",
VecALU
>;
-defm : BFEPattern <BFE_UINT_eg, BFE_INT_eg, MOV_IMM_I32>;
+// Bitfield extract patterns
+
+def : AMDGPUPat <
+ (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask),
+ (BFE_UINT_eg $src, $rshift, (MOV_IMM_I32 (i32 (IMMPopCount $mask))))
+>;
+
+// x & ((1 << y) - 1)
+def : AMDGPUPat <
+ (and i32:$src, (add_oneuse (shl_oneuse 1, i32:$width), -1)),
+ (BFE_UINT_eg $src, (MOV_IMM_I32 (i32 0)), $width)
+>;
+
+// x & ~(-1 << y)
+def : AMDGPUPat <
+ (and i32:$src, (xor_oneuse (shl_oneuse -1, i32:$width), -1)),
+ (BFE_UINT_eg $src, (MOV_IMM_I32 (i32 0)), $width)
+>;
+
+// x & (-1 >> (bitwidth - y))
+def : AMDGPUPat <
+ (and i32:$src, (srl_oneuse -1, (sub 32, i32:$width))),
+ (BFE_UINT_eg $src, (MOV_IMM_I32 (i32 0)), $width)
+>;
+
+// x << (bitwidth - y) >> (bitwidth - y)
+def : AMDGPUPat <
+ (srl (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)),
+ (BFE_UINT_eg $src, (MOV_IMM_I32 (i32 0)), $width)
+>;
+
+def : AMDGPUPat <
+ (sra (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)),
+ (BFE_INT_eg $src, (MOV_IMM_I32 (i32 0)), $width)
+>;
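All four unsigned forms above describe the same operation: take the low $width bits of $src, i.e. an unsigned bitfield extract at offset 0. A quick C++ check of that equivalence; a sketch only, assuming width stays in 1..31 so the 32-bit shifts are well defined:

```cpp
#include <cassert>
#include <cstdint>

// Reference: unsigned bitfield extract of 'width' bits at offset 0.
static uint32_t bfe_u32(uint32_t x, unsigned width) {
  return x & ((1u << width) - 1u);
}

int main() {
  const uint32_t x = 0xDEADBEEFu;
  for (unsigned width = 1; width < 32; ++width) {
    uint32_t a = x & ((1u << width) - 1u);            // x & ((1 << y) - 1)
    uint32_t b = x & ~(~0u << width);                 // x & ~(-1 << y)
    uint32_t c = x & (~0u >> (32 - width));           // x & (-1 >> (32 - y))
    uint32_t d = (x << (32 - width)) >> (32 - width); // shift up, then back down
    assert(a == bfe_u32(x, width) && a == b && b == c && c == d);
  }
  return 0;
}
```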
def BFI_INT_eg : R600_3OP <0x06, "BFI_INT",
[(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))],
@@ -408,7 +451,74 @@ def : EGOrCaymanPat<(i32 (sext_inreg i32:$src, i8)),
def : EGOrCaymanPat<(i32 (sext_inreg i32:$src, i16)),
(BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>;
-defm : BFIPatterns <BFI_INT_eg, MOV_IMM_I32, R600_Reg64>;
+// BFI patterns
+
+// Definition from ISA doc:
+// (y & x) | (z & ~x)
+def : AMDGPUPat <
+ (or (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))),
+ (BFI_INT_eg $x, $y, $z)
+>;
+
+// 64-bit version
+def : AMDGPUPat <
+ (or (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))),
+ (REG_SEQUENCE R600_Reg64,
+ (BFI_INT_eg (i32 (EXTRACT_SUBREG R600_Reg64:$x, sub0)),
+ (i32 (EXTRACT_SUBREG R600_Reg64:$y, sub0)),
+ (i32 (EXTRACT_SUBREG R600_Reg64:$z, sub0))), sub0,
+ (BFI_INT_eg (i32 (EXTRACT_SUBREG R600_Reg64:$x, sub1)),
+ (i32 (EXTRACT_SUBREG R600_Reg64:$y, sub1)),
+ (i32 (EXTRACT_SUBREG R600_Reg64:$z, sub1))), sub1)
+>;
+
+// SHA-256 Ch function
+// z ^ (x & (y ^ z))
+def : AMDGPUPat <
+ (xor i32:$z, (and i32:$x, (xor i32:$y, i32:$z))),
+ (BFI_INT_eg $x, $y, $z)
+>;
+
+// 64-bit version
+def : AMDGPUPat <
+ (xor i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
+ (REG_SEQUENCE R600_Reg64,
+ (BFI_INT_eg (i32 (EXTRACT_SUBREG R600_Reg64:$x, sub0)),
+ (i32 (EXTRACT_SUBREG R600_Reg64:$y, sub0)),
+ (i32 (EXTRACT_SUBREG R600_Reg64:$z, sub0))), sub0,
+ (BFI_INT_eg (i32 (EXTRACT_SUBREG R600_Reg64:$x, sub1)),
+ (i32 (EXTRACT_SUBREG R600_Reg64:$y, sub1)),
+ (i32 (EXTRACT_SUBREG R600_Reg64:$z, sub1))), sub1)
+>;
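The ISA-doc form and the SHA-256 Ch form above are the same bitwise select: where a bit of x is set the result takes y's bit, otherwise z's bit, which is what BFI_INT computes. A short C++ check of the identity:

```cpp
#include <cassert>
#include <cstdint>

// ISA-doc definition of the bitfield insert / bitwise select.
static uint32_t bfi(uint32_t x, uint32_t y, uint32_t z) {
  return (y & x) | (z & ~x);
}

// SHA-256 "Ch" form of the same select.
static uint32_t sha256_ch(uint32_t x, uint32_t y, uint32_t z) {
  return z ^ (x & (y ^ z));
}

int main() {
  const uint32_t x = 0xF0F0F0F0u, y = 0x12345678u, z = 0x9ABCDEF0u;
  assert(bfi(x, y, z) == sha256_ch(x, y, z));
  return 0;
}
```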
+
+def : AMDGPUPat <
+ (fcopysign f32:$src0, f32:$src1),
+ (BFI_INT_eg (MOV_IMM_I32 (i32 0x7fffffff)), $src0, $src1)
+>;
+
+def : AMDGPUPat <
+ (fcopysign f32:$src0, f64:$src1),
+ (BFI_INT_eg (MOV_IMM_I32 (i32 0x7fffffff)), $src0,
+ (i32 (EXTRACT_SUBREG R600_Reg64:$src1, sub1)))
+>;
+
+def : AMDGPUPat <
+ (fcopysign f64:$src0, f64:$src1),
+ (REG_SEQUENCE R600_Reg64,
+ (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+ (BFI_INT_eg (MOV_IMM_I32 (i32 0x7fffffff)),
+ (i32 (EXTRACT_SUBREG R600_Reg64:$src0, sub1)),
+ (i32 (EXTRACT_SUBREG R600_Reg64:$src1, sub1))), sub1)
+>;
+
+def : AMDGPUPat <
+ (fcopysign f64:$src0, f32:$src1),
+ (REG_SEQUENCE R600_Reg64,
+ (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+ (BFI_INT_eg (MOV_IMM_I32 (i32 0x7fffffff)),
+ (i32 (EXTRACT_SUBREG R600_Reg64:$src0, sub1)),
+ $src1), sub1)
+>;
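The fcopysign patterns above feed BFI the constant 0x7fffffff so that the magnitude bits come from $src0 and only the sign bit comes from $src1. A C++ sketch of the same bit manipulation for the f32 case:

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

// BFI(mask, a, b) = (a & mask) | (b & ~mask); with mask = 0x7fffffff this
// keeps the exponent/mantissa of 'a' and takes the sign bit of 'b'.
static float copysign_via_bfi(float src0, float src1) {
  uint32_t a, b;
  std::memcpy(&a, &src0, sizeof(a));
  std::memcpy(&b, &src1, sizeof(b));
  uint32_t r = (a & 0x7fffffffu) | (b & ~0x7fffffffu);
  float out;
  std::memcpy(&out, &r, sizeof(out));
  return out;
}

int main() {
  assert(copysign_via_bfi(3.5f, -1.0f) == std::copysign(3.5f, -1.0f));
  assert(copysign_via_bfi(-2.0f, 1.0f) == std::copysign(-2.0f, 1.0f));
  return 0;
}
```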
def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT",
[(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))],
@@ -692,8 +802,26 @@ def : EGOrCaymanPat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>;
def : EGOrCaymanPat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>;
-// SHA-256 Patterns
-defm : SHA256MaPattern <BFI_INT_eg, XOR_INT, R600_Reg64>;
+// SHA-256 Ma patterns
+
+// ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y
+def : AMDGPUPat <
+ (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))),
+ (BFI_INT_eg (XOR_INT i32:$x, i32:$y), i32:$z, i32:$y)
+>;
+
+def : AMDGPUPat <
+ (or (and i64:$x, i64:$z), (and i64:$y, (or i64:$x, i64:$z))),
+ (REG_SEQUENCE R600_Reg64,
+ (BFI_INT_eg (XOR_INT (i32 (EXTRACT_SUBREG R600_Reg64:$x, sub0)),
+ (i32 (EXTRACT_SUBREG R600_Reg64:$y, sub0))),
+ (i32 (EXTRACT_SUBREG R600_Reg64:$z, sub0)),
+ (i32 (EXTRACT_SUBREG R600_Reg64:$y, sub0))), sub0,
+ (BFI_INT_eg (XOR_INT (i32 (EXTRACT_SUBREG R600_Reg64:$x, sub1)),
+ (i32 (EXTRACT_SUBREG R600_Reg64:$y, sub1))),
+ (i32 (EXTRACT_SUBREG R600_Reg64:$z, sub1)),
+ (i32 (EXTRACT_SUBREG R600_Reg64:$y, sub1))), sub1)
+>;
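The comment above states the rewrite Maj(x, y, z) = (x & z) | (y & (x | z)) -> BFI(x ^ y, z, y). A brief C++ check of that identity, with BFI written out as the bitwise select it implements:

```cpp
#include <cassert>
#include <cstdint>

// BFI(s, a, b): where a bit of s is set, take a's bit, else b's bit.
static uint32_t bfi(uint32_t s, uint32_t a, uint32_t b) {
  return (a & s) | (b & ~s);
}

// SHA-256 majority function as matched by the pattern above.
static uint32_t maj(uint32_t x, uint32_t y, uint32_t z) {
  return (x & z) | (y & (x | z));
}

int main() {
  const uint32_t x = 0xCAFEBABEu, y = 0x0F0F0F0Fu, z = 0x13579BDFu;
  assert(maj(x, y, z) == bfi(x ^ y, z, y));
  return 0;
}
```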
def EG_ExportSwz : ExportSwzInst {
let Word1{19-16} = 0; // BURST_COUNT
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 69facada2e96..57a355a55a02 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -6,11 +6,12 @@
//
//===----------------------------------------------------------------------===//
-def FLATAtomic : ComplexPattern<i64, 3, "SelectFlatAtomic", [], [SDNPWantRoot], -10>;
-def FLATOffset : ComplexPattern<i64, 3, "SelectFlatOffset<false>", [], [SDNPWantRoot], -10>;
+def FLATOffset : ComplexPattern<i64, 2, "SelectFlatOffset<false>", [], [SDNPWantRoot], -10>;
+def FLATOffsetSigned : ComplexPattern<i64, 2, "SelectFlatOffset<true>", [], [SDNPWantRoot], -10>;
+def ScratchOffset : ComplexPattern<i32, 2, "SelectFlatOffset<true>", [], [SDNPWantRoot], -10>;
-def FLATOffsetSigned : ComplexPattern<i64, 3, "SelectFlatOffset<true>", [], [SDNPWantRoot], -10>;
-def FLATSignedAtomic : ComplexPattern<i64, 3, "SelectFlatAtomicSigned", [], [SDNPWantRoot], -10>;
+def GlobalSAddr : ComplexPattern<i64, 3, "SelectGlobalSAddr", [], [SDNPWantRoot], -10>;
+def ScratchSAddr : ComplexPattern<i32, 2, "SelectScratchSAddr", [], [SDNPWantRoot], -10>;
//===----------------------------------------------------------------------===//
// FLAT classes
@@ -64,9 +65,11 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
// Buffer instruction; so, they increment both VM_CNT and LGKM_CNT
// and are not considered done until both have been decremented.
let VM_CNT = 1;
- let LGKM_CNT = !if(!or(is_flat_global, is_flat_scratch), 0, 1);
+ let LGKM_CNT = !not(!or(is_flat_global, is_flat_scratch));
- let IsNonFlatSeg = !if(!or(is_flat_global, is_flat_scratch), 1, 0);
+ let IsFlatGlobal = is_flat_global;
+
+ let IsFlatScratch = is_flat_scratch;
}
class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
@@ -79,6 +82,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
let AsmMatchConverter = ps.AsmMatchConverter;
+ let OtherPredicates = ps.OtherPredicates;
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
@@ -140,10 +144,13 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
(outs regClass:$vdst),
!con(
!con(
- !con((ins VReg_64:$vaddr),
- !if(EnableSaddr, (ins SReg_64:$saddr), (ins))),
- (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
- !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))),
+ !if(EnableSaddr,
+ (ins SReg_64:$saddr, VGPR_32:$vaddr),
+ (ins VReg_64:$vaddr)),
+ (ins flat_offset:$offset)),
+ // FIXME: Operands with default values do not work when followed by non-optional operands.
+ !if(HasTiedOutput, (ins GLC:$glc, SLC:$slc, DLC:$dlc, regClass:$vdst_in),
+ (ins GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc))),
" $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc"> {
let has_data = 0;
let mayLoad = 1;
@@ -161,9 +168,10 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
opName,
(outs),
!con(
- !con((ins VReg_64:$vaddr, vdataClass:$vdata),
- !if(EnableSaddr, (ins SReg_64:$saddr), (ins))),
- (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
+ !if(EnableSaddr,
+ (ins VGPR_32:$vaddr, vdataClass:$vdata, SReg_64:$saddr),
+ (ins VReg_64:$vaddr, vdataClass:$vdata)),
+ (ins flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc)),
" $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc"> {
let mayLoad = 0;
let mayStore = 1;
@@ -184,24 +192,34 @@ multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit Ha
}
class FLAT_Global_Load_AddTid_Pseudo <string opName, RegisterClass regClass,
- bit HasTiedOutput = 0, bit HasSignedOffset = 0> : FLAT_Pseudo<
+ bit HasTiedOutput = 0, bit HasSignedOffset = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
opName,
(outs regClass:$vdst),
- !con((ins SReg_64:$saddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc),
+ !con(!if(EnableSaddr, (ins SReg_64:$saddr), (ins)),
+ (ins flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc),
!if(HasTiedOutput, (ins regClass:$vdst_in), (ins))),
- " $vdst, $saddr$offset$glc$slc$dlc"> {
+ " $vdst, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
let is_flat_global = 1;
let has_data = 0;
let mayLoad = 1;
let has_vaddr = 0;
let has_saddr = 1;
- let enabled_saddr = 1;
+ let enabled_saddr = EnableSaddr;
let maybeAtomic = 1;
+ let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");
let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
}
+multiclass FLAT_Global_Load_AddTid_Pseudo<string opName, RegisterClass regClass,
+ bit HasTiedOutput = 0, bit HasSignedOffset = 0> {
+ def "" : FLAT_Global_Load_AddTid_Pseudo<opName, regClass, HasTiedOutput, HasSignedOffset>,
+ GlobalSaddrTable<0, opName>;
+ def _SADDR : FLAT_Global_Load_AddTid_Pseudo<opName, regClass, HasTiedOutput, HasSignedOffset, 1>,
+ GlobalSaddrTable<1, opName>;
+}
+
multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
def "" : FLAT_Store_Pseudo<opName, regClass, 1>,
@@ -212,68 +230,107 @@ multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
}
class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterClass vdataClass,
- bit HasSignedOffset = 0> : FLAT_Pseudo<
+ bit HasSignedOffset = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
opName,
(outs),
- !con(
- (ins vdataClass:$vdata, SReg_64:$saddr),
- (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
- " $vdata, $saddr$offset$glc$slc$dlc"> {
+ !con(!if(EnableSaddr, (ins vdataClass:$vdata, SReg_64:$saddr), (ins vdataClass:$vdata)),
+ (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
+ " $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
let is_flat_global = 1;
let mayLoad = 0;
let mayStore = 1;
let has_vdst = 0;
let has_vaddr = 0;
let has_saddr = 1;
- let enabled_saddr = 1;
+ let enabled_saddr = EnableSaddr;
let maybeAtomic = 1;
+ let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");
+}
+
+multiclass FLAT_Global_Store_AddTid_Pseudo<string opName, RegisterClass regClass,
+ bit HasSignedOffset = 0> {
+ def "" : FLAT_Global_Store_AddTid_Pseudo<opName, regClass, HasSignedOffset>,
+ GlobalSaddrTable<0, opName>;
+ def _SADDR : FLAT_Global_Store_AddTid_Pseudo<opName, regClass, HasSignedOffset, 1>,
+ GlobalSaddrTable<1, opName>;
+}
+
+class FlatScratchInst <string sv_op, string mode> {
+ string SVOp = sv_op;
+ string Mode = mode;
}
class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
- bit EnableSaddr = 0>: FLAT_Pseudo<
+ bit HasTiedOutput = 0,
+ bit EnableSaddr = 0,
+ bit EnableVaddr = !not(EnableSaddr)>
+ : FLAT_Pseudo<
opName,
(outs regClass:$vdst),
- !if(EnableSaddr,
- (ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc),
- (ins VGPR_32:$vaddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
- " $vdst, "#!if(EnableSaddr, "off", "$vaddr")#!if(EnableSaddr, ", $saddr", ", off")#"$offset$glc$slc$dlc"> {
+ !con(
+ !if(EnableSaddr,
+ (ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset),
+ !if(EnableVaddr,
+ (ins VGPR_32:$vaddr, flat_offset:$offset),
+ (ins flat_offset:$offset))),
+ !if(HasTiedOutput, (ins GLC:$glc, SLC:$slc, DLC:$dlc, regClass:$vdst_in),
+ (ins GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc))),
+ " $vdst, "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
let has_data = 0;
let mayLoad = 1;
let has_saddr = 1;
let enabled_saddr = EnableSaddr;
- let has_vaddr = !if(EnableSaddr, 0, 1);
- let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");
+ let has_vaddr = EnableVaddr;
+ let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", !if(EnableVaddr, "", "_ST"));
let maybeAtomic = 1;
+
+ let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
+ let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
}
-class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit EnableSaddr = 0> : FLAT_Pseudo<
+class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit EnableSaddr = 0,
+ bit EnableVaddr = !not(EnableSaddr)> : FLAT_Pseudo<
opName,
(outs),
!if(EnableSaddr,
- (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc),
- (ins vdataClass:$vdata, VGPR_32:$vaddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
- " "#!if(EnableSaddr, "off", "$vaddr")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
+ (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc),
+ !if(EnableVaddr,
+ (ins vdataClass:$vdata, VGPR_32:$vaddr, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc),
+ (ins vdataClass:$vdata, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc))),
+ " "#!if(EnableVaddr, "$vaddr", "off")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
let mayLoad = 0;
let mayStore = 1;
let has_vdst = 0;
let has_saddr = 1;
let enabled_saddr = EnableSaddr;
- let has_vaddr = !if(EnableSaddr, 0, 1);
- let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");
+ let has_vaddr = EnableVaddr;
+ let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", !if(EnableVaddr, "", "_ST"));
let maybeAtomic = 1;
}
-multiclass FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass> {
+multiclass FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedOutput = 0> {
let is_flat_scratch = 1 in {
- def "" : FLAT_Scratch_Load_Pseudo<opName, regClass>;
- def _SADDR : FLAT_Scratch_Load_Pseudo<opName, regClass, 1>;
+ def "" : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput>,
+ FlatScratchInst<opName, "SV">;
+ def _SADDR : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 1>,
+ FlatScratchInst<opName, "SS">;
+
+ let SubtargetPredicate = HasFlatScratchSTMode in
+ def _ST : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 0, 0>,
+ FlatScratchInst<opName, "ST">;
}
}
multiclass FLAT_Scratch_Store_Pseudo<string opName, RegisterClass regClass> {
let is_flat_scratch = 1 in {
- def "" : FLAT_Scratch_Store_Pseudo<opName, regClass>;
- def _SADDR : FLAT_Scratch_Store_Pseudo<opName, regClass, 1>;
+ def "" : FLAT_Scratch_Store_Pseudo<opName, regClass>,
+ FlatScratchInst<opName, "SV">;
+ def _SADDR : FLAT_Scratch_Store_Pseudo<opName, regClass, 1>,
+ FlatScratchInst<opName, "SS">;
+
+ let SubtargetPredicate = HasFlatScratchSTMode in
+ def _ST : FLAT_Scratch_Store_Pseudo<opName, regClass, 0, 0>,
+ FlatScratchInst<opName, "ST">;
}
}
@@ -310,7 +367,7 @@ multiclass FLAT_Atomic_Pseudo<
bit isFP = isFloatType<data_vt>.ret> {
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC_0:$slc),
" $vaddr, $vdata$offset$slc">,
GlobalSaddrTable<0, opName>,
AtomicNoRet <opName, 0> {
@@ -321,10 +378,10 @@ multiclass FLAT_Atomic_Pseudo<
def _RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC:$slc),
- " $vdst, $vaddr, $vdata$offset glc$slc",
+ (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, GLC_1:$glc1, SLC_0:$slc),
+ " $vdst, $vaddr, $vdata$offset$glc1$slc",
[(set vt:$vdst,
- (atomic (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
+ (atomic (FLATOffset i64:$vaddr, i16:$offset), data_vt:$vdata))]>,
GlobalSaddrTable<0, opName#"_rtn">,
AtomicNoRet <opName, 1>{
let FPAtomic = isFP;
@@ -343,7 +400,7 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC_0:$slc),
" $vaddr, $vdata, off$offset$slc">,
GlobalSaddrTable<0, opName>,
AtomicNoRet <opName, 0> {
@@ -354,7 +411,7 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, flat_offset:$offset, SLC:$slc),
+ (ins VGPR_32:$vaddr, data_rc:$vdata, SReg_64:$saddr, flat_offset:$offset, SLC_0:$slc),
" $vaddr, $vdata, $saddr$offset$slc">,
GlobalSaddrTable<1, opName>,
AtomicNoRet <opName#"_saddr", 0> {
@@ -376,10 +433,10 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
def _RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC:$slc),
- " $vdst, $vaddr, $vdata, off$offset glc$slc",
+ (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, GLC_1:$glc1, SLC_0:$slc),
+ " $vdst, $vaddr, $vdata, off$offset$glc1$slc",
[(set vt:$vdst,
- (atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
+ (atomic (FLATOffsetSigned i64:$vaddr, i16:$offset), data_vt:$vdata))]>,
GlobalSaddrTable<0, opName#"_rtn">,
AtomicNoRet <opName, 1> {
let has_saddr = 1;
@@ -388,8 +445,8 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, flat_offset:$offset, SLC:$slc),
- " $vdst, $vaddr, $vdata, $saddr$offset glc$slc">,
+ (ins VGPR_32:$vaddr, data_rc:$vdata, SReg_64:$saddr, flat_offset:$offset, GLC_1:$glc1, SLC_0:$slc),
+ " $vdst, $vaddr, $vdata, $saddr$offset$glc1$slc">,
GlobalSaddrTable<1, opName#"_rtn">,
AtomicNoRet <opName#"_saddr", 1> {
let has_saddr = 1;
@@ -564,7 +621,7 @@ defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16_
defm GLOBAL_LOAD_SHORT_D16 : FLAT_Global_Load_Pseudo <"global_load_short_d16", VGPR_32, 1>;
defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Global_Load_Pseudo <"global_load_short_d16_hi", VGPR_32, 1>;
let OtherPredicates = [HasGFX10_BEncoding] in
-def GLOBAL_LOAD_DWORD_ADDTID : FLAT_Global_Load_AddTid_Pseudo <"global_load_dword_addtid", VGPR_32>;
+defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Global_Load_AddTid_Pseudo <"global_load_dword_addtid", VGPR_32>;
defm GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>;
defm GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>;
@@ -573,7 +630,7 @@ defm GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VR
defm GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>;
defm GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>;
let OtherPredicates = [HasGFX10_BEncoding] in
-def GLOBAL_STORE_DWORD_ADDTID : FLAT_Global_Store_AddTid_Pseudo <"global_store_dword_addtid", VGPR_32>;
+defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Global_Store_AddTid_Pseudo <"global_store_dword_addtid", VGPR_32>;
defm GLOBAL_STORE_BYTE_D16_HI : FLAT_Global_Store_Pseudo <"global_store_byte_d16_hi", VGPR_32>;
defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d16_hi", VGPR_32>;
@@ -662,7 +719,7 @@ defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_dec_x2",
let SubtargetPredicate = HasGFX10_BEncoding in
defm GLOBAL_ATOMIC_CSUB : FLAT_Global_Atomic_Pseudo_RTN <"global_atomic_csub",
- VGPR_32, i32, atomic_csub_global_32>;
+ VGPR_32, i32, int_amdgcn_global_atomic_csub>;
} // End is_flat_global = 1
@@ -677,12 +734,12 @@ defm SCRATCH_LOAD_DWORDX2 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx2", V
defm SCRATCH_LOAD_DWORDX3 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx3", VReg_96>;
defm SCRATCH_LOAD_DWORDX4 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx4", VReg_128>;
-defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte_d16", VGPR_32>;
-defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte_d16_hi", VGPR_32>;
-defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte_d16", VGPR_32>;
-defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte_d16_hi", VGPR_32>;
-defm SCRATCH_LOAD_SHORT_D16 : FLAT_Scratch_Load_Pseudo <"scratch_load_short_d16", VGPR_32>;
-defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_short_d16_hi", VGPR_32>;
+defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte_d16", VGPR_32, 1>;
+defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte_d16_hi", VGPR_32, 1>;
+defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte_d16", VGPR_32, 1>;
+defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte_d16_hi", VGPR_32, 1>;
+defm SCRATCH_LOAD_SHORT_D16 : FLAT_Scratch_Load_Pseudo <"scratch_load_short_d16", VGPR_32, 1>;
+defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_short_d16_hi", VGPR_32, 1>;
defm SCRATCH_STORE_BYTE : FLAT_Scratch_Store_Pseudo <"scratch_store_byte", VGPR_32>;
defm SCRATCH_STORE_SHORT : FLAT_Scratch_Store_Pseudo <"scratch_store_short", VGPR_32>;
@@ -711,16 +768,16 @@ let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in {
FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>;
} // End SubtargetPredicate = isGFX10Plus, is_flat_global = 1
-let SubtargetPredicate = HasAtomicFaddInsts, is_flat_global = 1 in {
-
-defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN <
- "global_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret
->;
-defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN <
- "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_pk_fadd_global_noret
->;
-
-} // End SubtargetPredicate = HasAtomicFaddInsts
+let is_flat_global = 1 in {
+let OtherPredicates = [HasAtomicFaddInsts] in {
+ defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN <
+ "global_atomic_add_f32", VGPR_32, f32
+ >;
+ defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN <
+ "global_atomic_pk_add_f16", VGPR_32, v2f16
+ >;
+} // End OtherPredicates = [HasAtomicFaddInsts]
+} // End is_flat_global = 1
//===----------------------------------------------------------------------===//
// Flat Patterns
@@ -728,69 +785,135 @@ defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN <
// Patterns for global loads with no offset.
class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))),
- (inst $vaddr, $offset, 0, 0, $slc)
+ (vt (node (FLATOffset i64:$vaddr, i16:$offset))),
+ (inst $vaddr, $offset)
>;
class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node (FLATOffset (i64 VReg_64:$vaddr), i16:$offset, i1:$slc), vt:$in),
- (inst $vaddr, $offset, 0, 0, $slc, $in)
+ (node (FLATOffset (i64 VReg_64:$vaddr), i16:$offset), vt:$in),
+ (inst $vaddr, $offset, 0, 0, 0, $in)
>;
class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node (FLATOffsetSigned (i64 VReg_64:$vaddr), i16:$offset, i1:$slc), vt:$in),
- (inst $vaddr, $offset, 0, 0, $slc, $in)
+ (node (FLATOffsetSigned (i64 VReg_64:$vaddr), i16:$offset), vt:$in),
+ (inst $vaddr, $offset, 0, 0, 0, $in)
>;
-class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (FLATAtomic (i64 VReg_64:$vaddr), i16:$offset, i1:$slc))),
- (inst $vaddr, $offset, 0, 0, $slc)
+class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset), vt:$in)),
+ (inst $saddr, $voffset, $offset, 0, 0, 0, $in)
>;
class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (FLATOffsetSigned (i64 VReg_64:$vaddr), i16:$offset, i1:$slc))),
- (inst $vaddr, $offset, 0, 0, $slc)
+ (vt (node (FLATOffsetSigned (i64 VReg_64:$vaddr), i16:$offset))),
+ (inst $vaddr, $offset)
+>;
+
+class GlobalLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset))),
+ (inst $saddr, $voffset, $offset, 0, 0, 0)
>;
-class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, RegisterClass rc = VGPR_32> : GCNPat <
- (node vt:$data, (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)),
- (inst $vaddr, rc:$data, $offset, 0, 0, $slc)
+class GlobalStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
+ ValueType vt> : GCNPat <
+ (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset)),
+ (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
>;
-class FlatStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, RegisterClass rc = VGPR_32> : GCNPat <
- (node vt:$data, (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)),
- (inst $vaddr, rc:$data, $offset, 0, 0, $slc)
+class GlobalAtomicStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
+ ValueType vt> : GCNPat <
+ (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset), vt:$data),
+ (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
>;
-class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, RegisterClass rc = VGPR_32> : GCNPat <
+class GlobalAtomicSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
+ ValueType vt, ValueType data_vt = vt> : GCNPat <
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset), data_vt:$data)),
+ (inst $voffset, getVregSrcForVT<data_vt>.ret:$data, $saddr, $offset)
+>;
+
+class GlobalAtomicNoRtnSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
+ ValueType vt> : GCNPat <
+ (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset), vt:$data),
+ (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
+>;
+
+class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (node vt:$data, (FLATOffset i64:$vaddr, i16:$offset)),
+ (inst $vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
+>;
+
+class FlatStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (node vt:$data, (FLATOffsetSigned i64:$vaddr, i16:$offset)),
+ (inst $vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
+>;
+
+class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
// atomic store follows atomic binop convention so the address comes
// first.
- (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data),
- (inst $vaddr, rc:$data, $offset, 0, 0, $slc)
+ (node (FLATOffset i64:$vaddr, i16:$offset), vt:$data),
+ (inst $vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
>;
-class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, RegisterClass rc = VGPR_32> : GCNPat <
+class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node,
+ ValueType vt, ValueType data_vt = vt> : GCNPat <
// atomic store follows atomic binop convention so the address comes
// first.
- (node (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data),
- (inst $vaddr, rc:$data, $offset, 0, 0, $slc)
+ (node (FLATOffset i64:$vaddr, i16:$offset), data_vt:$data),
+ (inst $vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
>;
class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
ValueType data_vt = vt> : GCNPat <
- (vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$data)),
- (inst $vaddr, $data, $offset, $slc)
+ (vt (node (FLATOffset i64:$vaddr, i16:$offset), data_vt:$data)),
+ (inst $vaddr, $data, $offset)
>;
class FlatAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data),
- (inst $vaddr, $data, $offset, $slc)
+ (node (FLATOffset i64:$vaddr, i16:$offset), vt:$data),
+ (inst VReg_64:$vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
+>;
+
+class FlatSignedAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (node (FLATOffsetSigned i64:$vaddr, i16:$offset), vt:$data),
+ (inst VReg_64:$vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
>;
class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
- ValueType data_vt = vt> : GCNPat <
- (vt (node (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$data)),
- (inst $vaddr, $data, $offset, $slc)
+ ValueType data_vt = vt> : GCNPat <
+ (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset), data_vt:$data)),
+ (inst $vaddr, $data, $offset)
+>;
+
+class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset))),
+ (inst $vaddr, $offset)
+>;
+
+class ScratchLoadSignedPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (node (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset), vt:$in),
+ (inst $vaddr, $offset, 0, 0, 0, $in)
+>;
+
+class ScratchStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (node vt:$data, (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset)),
+ (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $offset)
+>;
+
+class ScratchLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i16:$offset))),
+ (inst $saddr, $offset)
+>;
+
+class ScratchLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i16:$offset), vt:$in)),
+ (inst $saddr, $offset, 0, 0, 0, $in)
+>;
+
+class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
+ ValueType vt> : GCNPat <
+ (node vt:$data, (ScratchSAddr (i32 SGPR_32:$saddr), i16:$offset)),
+ (inst getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
>;
let OtherPredicates = [HasFlatAddressSpace] in {
@@ -807,8 +930,8 @@ def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
-def : FlatLoadAtomicPat <FLAT_LOAD_DWORD, atomic_load_32_flat, i32>;
-def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_load_64_flat, i64>;
+def : FlatLoadPat <FLAT_LOAD_DWORD, atomic_load_32_flat, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX2, atomic_load_64_flat, i64>;
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i32>;
def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_flat, i32>;
@@ -819,19 +942,19 @@ def : FlatStorePat <FLAT_STORE_DWORD, store_flat, vt>;
}
foreach vt = VReg_64.RegTypes in {
-def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, vt, VReg_64>;
+def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, vt>;
def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, vt>;
}
-def : FlatStorePat <FLAT_STORE_DWORDX3, store_flat, v3i32, VReg_96>;
+def : FlatStorePat <FLAT_STORE_DWORDX3, store_flat, v3i32>;
foreach vt = VReg_128.RegTypes in {
def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, vt>;
-def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, vt, VReg_128>;
+def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, vt>;
}
def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_store_flat_32, i32>;
-def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_store_flat_64, i64, VReg_64>;
+def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_store_flat_64, i64>;
def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_load_add_global_32, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_load_sub_global_32, i32>;
@@ -885,101 +1008,258 @@ def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
} // End OtherPredicates = [HasFlatAddressSpace]
-let OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 in {
-def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, extloadi8_global, i32>;
-def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, zextloadi8_global, i32>;
-def : FlatLoadSignedPat <GLOBAL_LOAD_SBYTE, sextloadi8_global, i32>;
-def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, extloadi8_global, i16>;
-def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, zextloadi8_global, i16>;
-def : FlatLoadSignedPat <GLOBAL_LOAD_SBYTE, sextloadi8_global, i16>;
-def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, extloadi16_global, i32>;
-def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, zextloadi16_global, i32>;
-def : FlatLoadSignedPat <GLOBAL_LOAD_SSHORT, sextloadi16_global, i32>;
-def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, load_global, i16>;
+multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadSignedPat <inst, node, vt> {
+ let AddedComplexity = 10;
+ }
+
+ def : GlobalLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 11;
+ }
+}
+
+multiclass GlobalFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatSignedLoadPat_D16 <inst, node, vt> {
+ let AddedComplexity = 10;
+ }
+
+ def : GlobalLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 11;
+ }
+}
+
+multiclass GlobalFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
+ ValueType vt> {
+ def : FlatStoreSignedPat <inst, node, vt> {
+ let AddedComplexity = 10;
+ }
+
+ def : GlobalStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 11;
+ }
+}
+
+// Deal with swapped operands for atomic_store vs. regular store
+multiclass GlobalFLATAtomicStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatStoreSignedAtomicPat <inst, node, vt> {
+ let AddedComplexity = 10;
+ }
+
+ def : GlobalAtomicStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 11;
+ }
+}
+
+multiclass GlobalFLATAtomicPats<string nortn_inst_name, SDPatternOperator node,
+ ValueType vt, ValueType data_vt = vt> {
+ def : FlatSignedAtomicPat <!cast<FLAT_Pseudo>(nortn_inst_name#"_RTN"), node, vt, data_vt> {
+ let AddedComplexity = 10;
+ }
+
+ def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(nortn_inst_name#"_SADDR_RTN"), node, vt, data_vt> {
+ let AddedComplexity = 11;
+ }
+}
+
+multiclass GlobalFLATNoRtnAtomicPats<FLAT_Pseudo inst, SDPatternOperator node,
+ ValueType vt> {
+ def : FlatSignedAtomicPatNoRtn <inst, node, vt> {
+ let AddedComplexity = 10;
+ }
+
+ def : GlobalAtomicNoRtnSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 11;
+ }
+}
+
+multiclass ScratchFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : ScratchLoadSignedPat <inst, node, vt> {
+ let AddedComplexity = 25;
+ }
+
+ def : ScratchLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 26;
+ }
+}
+
+multiclass ScratchFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
+ ValueType vt> {
+ def : ScratchStoreSignedPat <inst, node, vt> {
+ let AddedComplexity = 25;
+ }
+
+ def : ScratchStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 26;
+ }
+}
+
+multiclass ScratchFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : ScratchLoadSignedPat_D16 <inst, node, vt> {
+ let AddedComplexity = 25;
+ }
+
+ def : ScratchLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 26;
+ }
+}
+
+let OtherPredicates = [HasFlatGlobalInsts] in {
+
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, extloadi8_global, i32>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, zextloadi8_global, i32>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_SBYTE, sextloadi8_global, i32>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, extloadi8_global, i16>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, zextloadi8_global, i16>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_SBYTE, sextloadi8_global, i16>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, extloadi16_global, i32>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, zextloadi16_global, i32>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_SSHORT, sextloadi16_global, i32>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, load_global, i16>;
foreach vt = Reg32Types.types in {
-def : FlatLoadSignedPat <GLOBAL_LOAD_DWORD, load_global, vt>;
-def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, vt, VGPR_32>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORD, load_global, vt>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_DWORD, store_global, vt>;
}
foreach vt = VReg_64.RegTypes in {
-def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX2, load_global, vt>;
-def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, vt, VReg_64>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, load_global, vt>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, store_global, vt>;
}
-def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX3, load_global, v3i32>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX3, load_global, v3i32>;
foreach vt = VReg_128.RegTypes in {
-def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX4, load_global, vt>;
-def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX4, store_global, vt, VReg_128>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX4, load_global, vt>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX4, store_global, vt>;
}
-def : FlatLoadAtomicPat <GLOBAL_LOAD_DWORD, atomic_load_32_global, i32>;
-def : FlatLoadAtomicPat <GLOBAL_LOAD_DWORDX2, atomic_load_64_global, i64>;
+// Atomic loads are not distinguished from ordinary loads during
+// selection; the memory legalizer will set the cache bits and insert
+// the appropriate waits.
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORD, atomic_load_32_global, i32>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_64_global, i64>;
-def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i32, VGPR_32>;
-def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i16, VGPR_32>;
-def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, truncstorei16_global, i32, VGPR_32>;
-def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, store_global, i16, VGPR_32>;
-def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX3, store_global, v3i32, VReg_96>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i32>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i16>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, truncstorei16_global, i32>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, store_global, i16>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX3, store_global, v3i32>;
let OtherPredicates = [D16PreservesUnusedBits] in {
-def : FlatStoreSignedPat <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>;
-def : FlatStoreSignedPat <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>;
-
-def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2i16>;
-def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2f16>;
-def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_global, v2i16>;
-def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_global, v2f16>;
-def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16_HI, load_d16_hi_global, v2i16>;
-def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16_HI, load_d16_hi_global, v2f16>;
-
-def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_d16_lo_global, v2i16>;
-def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_d16_lo_global, v2f16>;
-def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_d16_lo_global, v2i16>;
-def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_d16_lo_global, v2f16>;
-def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2i16>;
-def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2f16>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>;
+
+defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2i16>;
+defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2f16>;
+defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_global, v2i16>;
+defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_global, v2f16>;
+defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SHORT_D16_HI, load_d16_hi_global, v2i16>;
+defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SHORT_D16_HI, load_d16_hi_global, v2f16>;
+
+defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_d16_lo_global, v2i16>;
+defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_d16_lo_global, v2f16>;
+defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_d16_lo_global, v2i16>;
+defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_d16_lo_global, v2f16>;
+defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2i16>;
+defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2f16>;
}
-def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORD, atomic_store_global_32, i32>;
-def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORDX2, atomic_store_global_64, i64, VReg_64>;
-
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_RTN, atomic_load_add_global_32, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_RTN, atomic_load_sub_global_32, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_INC_RTN, atomic_inc_global_32, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_DEC_RTN, atomic_dec_global_32, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_AND_RTN, atomic_load_and_global_32, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMAX_RTN, atomic_load_max_global_32, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMAX_RTN, atomic_load_umax_global_32, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMIN_RTN, atomic_load_min_global_32, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMIN_RTN, atomic_load_umin_global_32, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_RTN, atomic_load_or_global_32, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_RTN, atomic_swap_global_32, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_RTN, AMDGPUatomic_cmp_swap_global_32, i32, v2i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_RTN, atomic_load_xor_global_32, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CSUB_RTN, atomic_csub_global_32, i32>;
-
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_X2_RTN, atomic_load_add_global_64, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_X2_RTN, atomic_load_sub_global_64, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_INC_X2_RTN, atomic_inc_global_64, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_DEC_X2_RTN, atomic_dec_global_64, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_AND_X2_RTN, atomic_load_and_global_64, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMAX_X2_RTN, atomic_load_max_global_64, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMAX_X2_RTN, atomic_load_umax_global_64, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMIN_X2_RTN, atomic_load_min_global_64, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMIN_X2_RTN, atomic_load_umin_global_64, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_X2_RTN, atomic_load_or_global_64, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_X2_RTN, atomic_swap_global_64, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global_64, i64, v2i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_load_xor_global_64, i64>;
-
-def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_ADD_F32, atomic_fadd_global_noret, f32>;
-def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_PK_ADD_F16, atomic_pk_fadd_global_noret, v2f16>;
+defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_DWORD, atomic_store_global_32, i32>;
+defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_DWORDX2, atomic_store_global_64, i64>;
+
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", atomic_load_add_global_32, i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", atomic_load_sub_global_32, i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC", atomic_inc_global_32, i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC", atomic_dec_global_32, i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_AND", atomic_load_and_global_32, i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMAX", atomic_load_max_global_32, i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMAX", atomic_load_umax_global_32, i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMIN", atomic_load_min_global_32, i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMIN", atomic_load_umin_global_32, i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR", atomic_load_or_global_32, i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", atomic_swap_global_32, i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", AMDGPUatomic_cmp_swap_global_32, i32, v2i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", atomic_load_xor_global_32, i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CSUB", int_amdgcn_global_atomic_csub, i32>;
+
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", atomic_load_add_global_64, i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", atomic_load_sub_global_64, i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC_X2", atomic_inc_global_64, i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC_X2", atomic_dec_global_64, i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_AND_X2", atomic_load_and_global_64, i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMAX_X2", atomic_load_max_global_64, i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMAX_X2", atomic_load_umax_global_64, i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMIN_X2", atomic_load_min_global_64, i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMIN_X2", atomic_load_umin_global_64, i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR_X2", atomic_load_or_global_64, i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP_X2", atomic_swap_global_64, i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", AMDGPUatomic_cmp_swap_global_64, i64, v2i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", atomic_load_xor_global_64, i64>;
+
+let OtherPredicates = [HasAtomicFaddInsts] in {
+defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_ADD_F32, atomic_load_fadd_global_noret_32, f32>;
+defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_PK_ADD_F16, atomic_load_fadd_v2f16_global_noret_32, v2f16>;
+}
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
+let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in {
+
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, extloadi8_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, zextloadi8_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_SBYTE, sextloadi8_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, extloadi8_private, i16>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, zextloadi8_private, i16>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_SBYTE, sextloadi8_private, i16>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_USHORT, extloadi16_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_USHORT, zextloadi16_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_SSHORT, sextloadi16_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_USHORT, load_private, i16>;
+
+foreach vt = Reg32Types.types in {
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_DWORD, load_private, vt>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_DWORD, store_private, vt>;
+}
+
+foreach vt = VReg_64.RegTypes in {
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_DWORDX2, load_private, vt>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_DWORDX2, store_private, vt>;
+}
+
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_DWORDX3, load_private, v3i32>;
+
+foreach vt = VReg_128.RegTypes in {
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_DWORDX4, load_private, vt>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_DWORDX4, store_private, vt>;
+}
+
+defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, i32>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, i16>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, truncstorei16_private, i32>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, store_private, i16>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_DWORDX3, store_private, v3i32>;
+
+let OtherPredicates = [D16PreservesUnusedBits, HasFlatScratchInsts, EnableFlatScratch] in {
+defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT_D16_HI, truncstorei16_hi16_private, i32>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE_D16_HI, truncstorei8_hi16_private, i32>;
+
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_private, v2f16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_private, v2f16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16_HI, load_d16_hi_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16_HI, load_d16_hi_private, v2f16>;
+
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16, az_extloadi8_d16_lo_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16, az_extloadi8_d16_lo_private, v2f16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SBYTE_D16, sextloadi8_d16_lo_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SBYTE_D16, sextloadi8_d16_lo_private, v2f16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f16>;
+}
+
+} // End OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch]
//===----------------------------------------------------------------------===//
// Target
@@ -1246,6 +1526,13 @@ multiclass FLAT_Real_SADDR_RTN_gfx10<bits<7> op> {
FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN")>;
}
+multiclass FLAT_Real_ST_gfx10<bits<7> op> {
+ def _ST_gfx10 :
+ FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_ST")> {
+ let Inst{54-48} = !cast<int>(EXEC_HI.HWEncoding);
+ let OtherPredicates = [HasFlatScratchSTMode];
+ }
+}
multiclass FLAT_Real_AllAddr_gfx10<bits<7> op> :
FLAT_Real_Base_gfx10<op>,
@@ -1264,6 +1551,11 @@ multiclass FLAT_Real_GlblAtomics_RTN_gfx10<bits<7> op> :
FLAT_Real_RTN_gfx10<op>,
FLAT_Real_SADDR_RTN_gfx10<op>;
+multiclass FLAT_Real_ScratchAllAddr_gfx10<bits<7> op> :
+ FLAT_Real_Base_gfx10<op>,
+ FLAT_Real_SADDR_gfx10<op>,
+ FLAT_Real_ST_gfx10<op>;
+
// ENC_FLAT.
defm FLAT_LOAD_UBYTE : FLAT_Real_Base_gfx10<0x008>;
defm FLAT_LOAD_SBYTE : FLAT_Real_Base_gfx10<0x009>;
@@ -1377,32 +1669,32 @@ defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Real_GlblAtomics_gfx10<0x05d>;
defm GLOBAL_ATOMIC_FCMPSWAP_X2 : FLAT_Real_GlblAtomics_gfx10<0x05e>;
defm GLOBAL_ATOMIC_FMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x05f>;
defm GLOBAL_ATOMIC_FMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x060>;
-defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Real_Base_gfx10<0x016>;
-defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Real_Base_gfx10<0x017>;
+defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Real_AllAddr_gfx10<0x016>;
+defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Real_AllAddr_gfx10<0x017>;
// ENC_FLAT_SCRATCH.
-defm SCRATCH_LOAD_UBYTE : FLAT_Real_AllAddr_gfx10<0x008>;
-defm SCRATCH_LOAD_SBYTE : FLAT_Real_AllAddr_gfx10<0x009>;
-defm SCRATCH_LOAD_USHORT : FLAT_Real_AllAddr_gfx10<0x00a>;
-defm SCRATCH_LOAD_SSHORT : FLAT_Real_AllAddr_gfx10<0x00b>;
-defm SCRATCH_LOAD_DWORD : FLAT_Real_AllAddr_gfx10<0x00c>;
-defm SCRATCH_LOAD_DWORDX2 : FLAT_Real_AllAddr_gfx10<0x00d>;
-defm SCRATCH_LOAD_DWORDX4 : FLAT_Real_AllAddr_gfx10<0x00e>;
-defm SCRATCH_LOAD_DWORDX3 : FLAT_Real_AllAddr_gfx10<0x00f>;
-defm SCRATCH_STORE_BYTE : FLAT_Real_AllAddr_gfx10<0x018>;
-defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x019>;
-defm SCRATCH_STORE_SHORT : FLAT_Real_AllAddr_gfx10<0x01a>;
-defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Real_AllAddr_gfx10<0x01b>;
-defm SCRATCH_STORE_DWORD : FLAT_Real_AllAddr_gfx10<0x01c>;
-defm SCRATCH_STORE_DWORDX2 : FLAT_Real_AllAddr_gfx10<0x01d>;
-defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_gfx10<0x01e>;
-defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_gfx10<0x01f>;
-defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Real_AllAddr_gfx10<0x020>;
-defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x021>;
-defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Real_AllAddr_gfx10<0x022>;
-defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x023>;
-defm SCRATCH_LOAD_SHORT_D16 : FLAT_Real_AllAddr_gfx10<0x024>;
-defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Real_AllAddr_gfx10<0x025>;
+defm SCRATCH_LOAD_UBYTE : FLAT_Real_ScratchAllAddr_gfx10<0x008>;
+defm SCRATCH_LOAD_SBYTE : FLAT_Real_ScratchAllAddr_gfx10<0x009>;
+defm SCRATCH_LOAD_USHORT : FLAT_Real_ScratchAllAddr_gfx10<0x00a>;
+defm SCRATCH_LOAD_SSHORT : FLAT_Real_ScratchAllAddr_gfx10<0x00b>;
+defm SCRATCH_LOAD_DWORD : FLAT_Real_ScratchAllAddr_gfx10<0x00c>;
+defm SCRATCH_LOAD_DWORDX2 : FLAT_Real_ScratchAllAddr_gfx10<0x00d>;
+defm SCRATCH_LOAD_DWORDX4 : FLAT_Real_ScratchAllAddr_gfx10<0x00e>;
+defm SCRATCH_LOAD_DWORDX3 : FLAT_Real_ScratchAllAddr_gfx10<0x00f>;
+defm SCRATCH_STORE_BYTE : FLAT_Real_ScratchAllAddr_gfx10<0x018>;
+defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Real_ScratchAllAddr_gfx10<0x019>;
+defm SCRATCH_STORE_SHORT : FLAT_Real_ScratchAllAddr_gfx10<0x01a>;
+defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Real_ScratchAllAddr_gfx10<0x01b>;
+defm SCRATCH_STORE_DWORD : FLAT_Real_ScratchAllAddr_gfx10<0x01c>;
+defm SCRATCH_STORE_DWORDX2 : FLAT_Real_ScratchAllAddr_gfx10<0x01d>;
+defm SCRATCH_STORE_DWORDX4 : FLAT_Real_ScratchAllAddr_gfx10<0x01e>;
+defm SCRATCH_STORE_DWORDX3 : FLAT_Real_ScratchAllAddr_gfx10<0x01f>;
+defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Real_ScratchAllAddr_gfx10<0x020>;
+defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Real_ScratchAllAddr_gfx10<0x021>;
+defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Real_ScratchAllAddr_gfx10<0x022>;
+defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Real_ScratchAllAddr_gfx10<0x023>;
+defm SCRATCH_LOAD_SHORT_D16 : FLAT_Real_ScratchAllAddr_gfx10<0x024>;
+defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Real_ScratchAllAddr_gfx10<0x025>;
let SubtargetPredicate = HasAtomicFaddInsts in {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 719a968b8314..e4eacd101ce8 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -38,22 +38,10 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/Pass.h"
-#include <cassert>
using namespace llvm;
@@ -274,14 +262,14 @@ static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
default: break;
case AMDGPU::V_ADD_U32_e32:
case AMDGPU::V_ADD_U32_e64:
- case AMDGPU::V_ADD_I32_e32:
- case AMDGPU::V_ADD_I32_e64:
+ case AMDGPU::V_ADD_CO_U32_e32:
+ case AMDGPU::V_ADD_CO_U32_e64:
case AMDGPU::V_OR_B32_e32:
case AMDGPU::V_OR_B32_e64:
case AMDGPU::V_SUBREV_U32_e32:
case AMDGPU::V_SUBREV_U32_e64:
- case AMDGPU::V_SUBREV_I32_e32:
- case AMDGPU::V_SUBREV_I32_e64:
+ case AMDGPU::V_SUBREV_CO_U32_e32:
+ case AMDGPU::V_SUBREV_CO_U32_e64:
case AMDGPU::V_MAX_U32_e32:
case AMDGPU::V_MAX_U32_e64:
case AMDGPU::V_XOR_B32_e32:
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 8482dbfec250..ed1dc77bd545 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -11,25 +11,11 @@
//===----------------------------------------------------------------------===//
#include "GCNHazardRecognizer.h"
-#include "AMDGPUSubtarget.h"
-#include "SIDefines.h"
-#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/ScheduleDAG.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/Support/ErrorHandling.h"
-#include <algorithm>
-#include <cassert>
-#include <limits>
-#include <set>
-#include <vector>
+#include "llvm/Support/TargetParser.h"
using namespace llvm;
@@ -50,6 +36,10 @@ GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
TSchedModel.init(&ST);
}
+void GCNHazardRecognizer::Reset() {
+ EmittedInstrs.clear();
+}
+
void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
EmitInstruction(SU->getInstr());
}
@@ -59,7 +49,7 @@ void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
}
static bool isDivFMas(unsigned Opcode) {
- return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64;
+ return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}
static bool isSGetReg(unsigned Opcode) {
@@ -67,7 +57,14 @@ static bool isSGetReg(unsigned Opcode) {
}
static bool isSSetReg(unsigned Opcode) {
- return Opcode == AMDGPU::S_SETREG_B32 || Opcode == AMDGPU::S_SETREG_IMM32_B32;
+ switch (Opcode) {
+ case AMDGPU::S_SETREG_B32:
+ case AMDGPU::S_SETREG_B32_mode:
+ case AMDGPU::S_SETREG_IMM32_B32:
+ case AMDGPU::S_SETREG_IMM32_B32_mode:
+ return true;
+ }
+ return false;
}
static bool isRWLane(unsigned Opcode) {
@@ -118,8 +115,8 @@ static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
static bool isPermlane(const MachineInstr &MI) {
unsigned Opcode = MI.getOpcode();
- return Opcode == AMDGPU::V_PERMLANE16_B32 ||
- Opcode == AMDGPU::V_PERMLANEX16_B32;
+ return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
+ Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
}
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
@@ -131,75 +128,83 @@ static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
MachineInstr *MI = SU->getInstr();
+ // If we are not in "HazardRecognizerMode" and therefore not being run from
+ // the scheduler, track possible stalls from hazards but don't insert noops.
+ auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
+
if (MI->isBundle())
return NoHazard;
if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
- return NoopHazard;
+ return HazardType;
// FIXME: Should flat be considered vmem?
if ((SIInstrInfo::isVMEM(*MI) ||
SIInstrInfo::isFLAT(*MI))
&& checkVMEMHazards(MI) > 0)
- return NoopHazard;
+ return HazardType;
if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
- return NoopHazard;
+ return HazardType;
if (checkFPAtomicToDenormModeHazard(MI) > 0)
- return NoopHazard;
+ return HazardType;
if (ST.hasNoDataDepHazard())
return NoHazard;
if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
- return NoopHazard;
+ return HazardType;
if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
- return NoopHazard;
+ return HazardType;
if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
- return NoopHazard;
+ return HazardType;
if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
- return NoopHazard;
+ return HazardType;
if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
- return NoopHazard;
+ return HazardType;
if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
- return NoopHazard;
+ return HazardType;
if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
- return NoopHazard;
+ return HazardType;
if (ST.hasReadM0MovRelInterpHazard() &&
(TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
checkReadM0Hazards(MI) > 0)
- return NoopHazard;
+ return HazardType;
if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
checkReadM0Hazards(MI) > 0)
- return NoopHazard;
+ return HazardType;
if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
- return NoopHazard;
+ return HazardType;
- if (MI->mayLoadOrStore() && checkMAILdStHazards(MI) > 0)
- return NoopHazard;
+ if ((SIInstrInfo::isVMEM(*MI) ||
+ SIInstrInfo::isFLAT(*MI) ||
+ SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
+ return HazardType;
if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
- return NoopHazard;
-
- if (checkAnyInstHazards(MI) > 0)
- return NoopHazard;
+ return HazardType;
return NoHazard;
}
-static void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) {
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
- .addImm(0);
+static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
+ unsigned Quantity) {
+ while (Quantity > 0) {
+ unsigned Arg = std::min(Quantity, 8u);
+ Quantity -= Arg;
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
+ .addImm(Arg - 1);
+ }
}
void GCNHazardRecognizer::processBundle() {
@@ -210,11 +215,11 @@ void GCNHazardRecognizer::processBundle() {
CurrCycleInstr = &*MI;
unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
- if (IsHazardRecognizerMode)
+ if (IsHazardRecognizerMode) {
fixHazards(CurrCycleInstr);
- for (unsigned i = 0; i < WaitStates; ++i)
- insertNoopInBundle(CurrCycleInstr, TII);
+ insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
+ }
  // It's unnecessary to track more than MaxLookAhead instructions. Since we
// include the bundled MI directly after, only add a maximum of
@@ -241,7 +246,7 @@ unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
if (MI->isBundle())
return 0;
- int WaitStates = std::max(0, checkAnyInstHazards(MI));
+ int WaitStates = 0;
if (SIInstrInfo::isSMRD(*MI))
return std::max(WaitStates, checkSMRDHazards(MI));
@@ -291,7 +296,9 @@ unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
if (SIInstrInfo::isMAI(*MI))
return std::max(WaitStates, checkMAIHazards(MI));
- if (MI->mayLoadOrStore())
+ if (SIInstrInfo::isVMEM(*MI) ||
+ SIInstrInfo::isFLAT(*MI) ||
+ SIInstrInfo::isDS(*MI))
return std::max(WaitStates, checkMAILdStHazards(MI));
return WaitStates;
@@ -304,15 +311,19 @@ void GCNHazardRecognizer::EmitNoop() {
void GCNHazardRecognizer::AdvanceCycle() {
// When the scheduler detects a stall, it will call AdvanceCycle() without
// emitting any instructions.
- if (!CurrCycleInstr)
+ if (!CurrCycleInstr) {
+ EmittedInstrs.push_front(nullptr);
return;
+ }
// Do not track non-instructions which do not affect the wait states.
// If included, these instructions can lead to buffer overflow such that
// detectable hazards are missed.
if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
- CurrCycleInstr->isKill())
+ CurrCycleInstr->isKill()) {
+ CurrCycleInstr = nullptr;
return;
+ }
if (CurrCycleInstr->isBundle()) {
processBundle();
@@ -367,7 +378,7 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
if (IsHazard(&*I))
return WaitStates;
- if (I->isInlineAsm() || I->isImplicitDef() || I->isDebugInstr())
+ if (I->isInlineAsm() || I->isMetaInstruction())
continue;
WaitStates += SIInstrInfo::getNumWaitStates(*I);
@@ -460,8 +471,8 @@ int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
// No-op Hazard Detection
//===----------------------------------------------------------------------===//
-static void addRegUnits(const SIRegisterInfo &TRI,
- BitVector &BV, unsigned Reg) {
+static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
+ MCRegister Reg) {
for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
BV.set(*RUI);
}
@@ -471,7 +482,7 @@ static void addRegsToSet(const SIRegisterInfo &TRI,
BitVector &Set) {
for (const MachineOperand &Op : Ops) {
if (Op.isReg())
- addRegUnits(TRI, Set, Op.getReg());
+ addRegUnits(TRI, Set, Op.getReg().asMCReg());
}
}
@@ -718,8 +729,9 @@ int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
return -1;
}
-int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
- const MachineRegisterInfo &MRI) {
+int
+GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
+ const MachineRegisterInfo &MRI) {
// Helper to check for the hazard where VMEM instructions that store more than
// 8 bytes can have their store data overwritten by the next instruction.
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -821,34 +833,6 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
return RFEWaitStates - WaitStatesNeeded;
}
-int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
- if (MI->isDebugInstr())
- return 0;
-
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- if (!ST.hasSMovFedHazard())
- return 0;
-
- // Check for any instruction reading an SGPR after a write from
- // s_mov_fed_b32.
- int MovFedWaitStates = 1;
- int WaitStatesNeeded = 0;
-
- for (const MachineOperand &Use : MI->uses()) {
- if (!Use.isReg() || TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
- continue;
- auto IsHazardFn = [] (MachineInstr *MI) {
- return MI->getOpcode() == AMDGPU::S_MOV_FED_B32;
- };
- int WaitStatesNeededForUse =
- MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn,
- MovFedWaitStates);
- WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
- }
-
- return WaitStatesNeeded;
-}
-
int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
const SIInstrInfo *TII = ST.getInstrInfo();
const int SMovRelWaitStates = 1;
@@ -930,10 +914,12 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
return false;
};
- auto IsExpiredFn = [] (MachineInstr *MI, int) {
+ auto IsExpiredFn = [](MachineInstr *MI, int) {
return MI && (SIInstrInfo::isVALU(*MI) ||
(MI->getOpcode() == AMDGPU::S_WAITCNT &&
- !MI->getOperand(0).getImm()));
+ !MI->getOperand(0).getImm()) ||
+ (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+ MI->getOperand(0).getImm() == 0xffe3));
};
if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
@@ -941,7 +927,9 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
return false;
const SIInstrInfo *TII = ST.getInstrInfo();
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(0xffe3);
return true;
}
@@ -955,7 +943,6 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
unsigned SDSTName;
switch (MI->getOpcode()) {
case AMDGPU::V_READLANE_B32:
- case AMDGPU::V_READLANE_B32_gfx10:
case AMDGPU::V_READFIRSTLANE_B32:
SDSTName = AMDGPU::OpName::vdst;
break;
@@ -1183,7 +1170,7 @@ int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
case AMDGPU::S_WAITCNT_VMCNT:
case AMDGPU::S_WAITCNT_EXPCNT:
case AMDGPU::S_WAITCNT_LGKMCNT:
- case AMDGPU::S_WAITCNT_IDLE:
+ case AMDGPU::S_WAIT_IDLE:
return true;
default:
break;
@@ -1207,7 +1194,7 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
return SIInstrInfo::isVALU(*MI);
};
- if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
+ if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
const int LegacyVALUWritesVGPRWaitStates = 2;
const int VALUWritesExecWaitStates = 4;
const int MaxWaitStates = 4;
@@ -1235,15 +1222,15 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
auto IsMFMAFn = [] (MachineInstr *MI) {
return SIInstrInfo::isMAI(*MI) &&
- MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
- MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
+ MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
};
for (const MachineOperand &Op : MI->explicit_operands()) {
if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
continue;
- if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
+ if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
continue;
const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
@@ -1277,7 +1264,7 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
int OpNo = MI->getOperandNo(&Op);
if (OpNo == SrcCIdx) {
NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
- } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
+ } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
switch (HazardDefLatency) {
case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
break;
@@ -1287,7 +1274,7 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
break;
}
- } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
+ } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
switch (HazardDefLatency) {
case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
break;
@@ -1306,7 +1293,7 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
return WaitStatesNeeded; // Early exit.
auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
- if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
+ if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
return false;
Register DstReg = MI->getOperand(0).getReg();
return TRI.regsOverlap(Reg, DstReg);
@@ -1318,7 +1305,7 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
if (OpNo == SrcCIdx)
NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
- else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
+ else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
WaitStatesNeededForUse = NeedWaitStates -
@@ -1329,7 +1316,7 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
return WaitStatesNeeded; // Early exit.
}
- if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
+ if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
@@ -1373,7 +1360,7 @@ int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
int WaitStatesNeeded = 0;
auto IsAccVgprReadFn = [] (MachineInstr *MI) {
- return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
+ return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
};
for (const MachineOperand &Op : MI->explicit_uses()) {
@@ -1383,7 +1370,7 @@ int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
Register Reg = Op.getReg();
const int AccVgprReadLdStWaitStates = 2;
- const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1;
+ const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
const int MaxWaitStates = 2;
int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
@@ -1393,8 +1380,9 @@ int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
if (WaitStatesNeeded == MaxWaitStates)
return WaitStatesNeeded; // Early exit.
- auto IsVALUAccVgprReadCheckFn = [Reg, this] (MachineInstr *MI) {
- if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
+ auto IsVALUAccVgprRdWrCheckFn = [Reg, this](MachineInstr *MI) {
+ if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
+ MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
return false;
auto IsVALUFn = [] (MachineInstr *MI) {
return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
@@ -1403,10 +1391,34 @@ int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
std::numeric_limits<int>::max();
};
- WaitStatesNeededForUse = VALUWriteAccVgprReadLdStDepVALUWaitStates -
- getWaitStatesSince(IsVALUAccVgprReadCheckFn, MaxWaitStates);
+ WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
+ getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
}
return WaitStatesNeeded;
}
+
+bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
+ if (!SU->isInstr())
+ return false;
+
+ MachineInstr *MAI = nullptr;
+ auto IsMFMAFn = [&MAI] (MachineInstr *MI) {
+ MAI = nullptr;
+ if (SIInstrInfo::isMAI(*MI) &&
+ MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64)
+ MAI = MI;
+ return MAI != nullptr;
+ };
+
+ MachineInstr *MI = SU->getInstr();
+ if (IsMFMAFn(MI)) {
+ int W = getWaitStatesSince(IsMFMAFn, 16);
+ if (MAI)
+ return W < (int)TSchedModel.computeInstrLatency(MAI);
+ }
+
+ return false;
+}
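
The insertNoopsInBundle change in this file caps each emitted S_NOP at eight wait states and encodes the count as (cycles - 1) in the immediate, as the std::min(Quantity, 8u) / addImm(Arg - 1) pair implies. A minimal standalone sketch of just that splitting arithmetic, using a hypothetical splitIntoNopImms helper instead of LLVM's MachineInstr builders:

// Standalone sketch (not part of the patch): split a wait-state count into
// S_NOP immediates, where each S_NOP covers (imm + 1) cycles, at most 8.
#include <algorithm>
#include <cstdio>
#include <vector>

static std::vector<unsigned> splitIntoNopImms(unsigned Quantity) {
  std::vector<unsigned> Imms;
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u); // cover up to 8 wait states at once
    Quantity -= Arg;
    Imms.push_back(Arg - 1);               // S_NOP immediate = cycles - 1
  }
  return Imms;
}

int main() {
  for (unsigned Imm : splitIntoNopImms(13)) // 13 wait states -> imms 7 and 4
    std::printf("s_nop %u\n", Imm);
  return 0;
}
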
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index cd17f2755bd1..447ca828ae64 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -83,7 +83,6 @@ private:
int checkRWLaneHazards(MachineInstr *RWLane);
int checkRFEHazards(MachineInstr *RFE);
int checkInlineAsmHazards(MachineInstr *IA);
- int checkAnyInstHazards(MachineInstr *MI);
int checkReadM0Hazards(MachineInstr *SMovRel);
int checkNSAtoVMEMHazard(MachineInstr *MI);
int checkFPAtomicToDenormModeHazard(MachineInstr *MI);
@@ -109,6 +108,8 @@ public:
unsigned PreEmitNoopsCommon(MachineInstr *);
void AdvanceCycle() override;
void RecedeCycle() override;
+ bool ShouldPreferAnother(SUnit *SU) override;
+ void Reset() override;
};
} // end namespace llvm
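
The two overrides added to this header, Reset() and ShouldPreferAnother(SUnit *), are ScheduleHazardRecognizer hooks the machine scheduler can invoke between regions and while ranking candidates. A toy, self-contained model of that interaction, with plain strings standing in for SUnits and MachineInstrs (an illustration of the idea only, not the actual LLVM API):

// Toy model: a recognizer that remembers what was emitted, asks the picker to
// defer an MFMA-like candidate that would follow another one too closely, and
// is reset when a new scheduling region begins.
#include <cstdio>
#include <string>
#include <vector>

struct ToyHazardRecognizer {
  std::vector<std::string> Emitted;          // stand-in for EmittedInstrs
  void Reset() { Emitted.clear(); }          // forget history between regions
  bool ShouldPreferAnother(const std::string &Cand) const {
    if (Cand != "mfma")
      return false;
    for (const std::string &I : Emitted)
      if (I == "mfma")
        return true;                         // another MFMA is still in flight
    return false;
  }
  void Emit(const std::string &I) { Emitted.push_back(I); }
};

int main() {
  ToyHazardRecognizer HR;
  std::vector<std::string> Ready = {"mfma", "valu", "mfma"};
  for (const std::string &Cand : Ready) {
    if (HR.ShouldPreferAnother(Cand)) {
      std::printf("defer %s\n", Cand.c_str());
      continue;
    }
    std::printf("emit %s\n", Cand.c_str());
    HR.Emit(Cand);
  }
  HR.Reset(); // a new scheduling region starts with a clean slate
  return 0;
}
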
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNILPSched.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNILPSched.cpp
index 39072af7d871..1eb617640c32 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNILPSched.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNILPSched.cpp
@@ -11,7 +11,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/ScheduleDAG.h"
-#include "llvm/Support/Debug.h"
using namespace llvm;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 75a02c839034..f3f9eb53355f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -12,29 +12,8 @@
//===----------------------------------------------------------------------===//
#include "GCNIterativeScheduler.h"
-#include "AMDGPUSubtarget.h"
-#include "GCNRegPressure.h"
#include "GCNSchedStrategy.h"
#include "SIMachineFunctionInfo.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/RegisterPressure.h"
-#include "llvm/CodeGen/ScheduleDAG.h"
-#include "llvm/Config/llvm-config.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-#include <iterator>
-#include <limits>
-#include <memory>
-#include <type_traits>
-#include <vector>
using namespace llvm;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h
index a0d4f432aa48..c0228540b7a2 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h
@@ -18,13 +18,7 @@
#define LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
#include "GCNRegPressure.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineScheduler.h"
-#include "llvm/Support/Allocator.h"
-#include <limits>
-#include <memory>
-#include <vector>
namespace llvm {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
index 884b2e17289c..443472a3b99a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
@@ -13,20 +13,7 @@
///
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/ilist_node.h"
-#include "llvm/ADT/simple_ilist.h"
#include "llvm/CodeGen/ScheduleDAG.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-#include <cstdint>
-#include <limits>
-#include <vector>
-
using namespace llvm;
#define DEBUG_TYPE "machine-scheduler"
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
index 57346087d017..fc7105bc15a7 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
@@ -14,18 +14,13 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
+#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"
-#include "llvm/Support/MathExtras.h"
-#include <algorithm>
using namespace llvm;
@@ -114,15 +109,15 @@ GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
unsigned NumRegs = Intervals.size();
for (unsigned N = 0; N < NumRegs; ++N)
- if (VRM->hasPhys(Intervals[N]->reg))
+ if (VRM->hasPhys(Intervals[N]->reg()))
LRM->unassign(*Intervals[N]);
for (unsigned N = 0; N < NumRegs; ++N)
- if (LRM->checkInterference(*Intervals[N], StartReg + N))
+ if (LRM->checkInterference(*Intervals[N], MCRegister::from(StartReg + N)))
return false;
for (unsigned N = 0; N < NumRegs; ++N)
- LRM->assign(*Intervals[N], StartReg + N);
+ LRM->assign(*Intervals[N], MCRegister::from(StartReg + N));
return true;
}
@@ -175,7 +170,7 @@ GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
Register Reg = Op.getReg();
- if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
+ if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
return NSA_Status::FIXED;
Register PhysReg = VRM->getPhys(Reg);
@@ -273,13 +268,13 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);
SmallVector<LiveInterval *, 16> Intervals;
- SmallVector<unsigned, 16> OrigRegs;
+ SmallVector<MCRegister, 16> OrigRegs;
SlotIndex MinInd, MaxInd;
for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
Register Reg = Op.getReg();
LiveInterval *LI = &LIS->getInterval(Reg);
- if (llvm::find(Intervals, LI) != Intervals.end()) {
+ if (llvm::is_contained(Intervals, LI)) {
// Same register used, unable to make sequential
Intervals.clear();
break;
@@ -302,14 +297,15 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
<< "\tOriginal allocation:\t";
- for(auto *LI : Intervals)
- dbgs() << " " << llvm::printReg((VRM->getPhys(LI->reg)), TRI);
+ for (auto *LI
+ : Intervals) dbgs()
+ << " " << llvm::printReg((VRM->getPhys(LI->reg())), TRI);
dbgs() << '\n');
bool Success = scavengeRegs(Intervals);
if (!Success) {
LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
- if (VRM->hasPhys(Intervals.back()->reg)) // Did not change allocation.
+ if (VRM->hasPhys(Intervals.back()->reg())) // Did not change allocation.
continue;
} else {
// Check we did not make it worse for other instructions.
@@ -328,7 +324,7 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
if (!Success) {
for (unsigned I = 0; I < Info->VAddrDwords; ++I)
- if (VRM->hasPhys(Intervals[I]->reg))
+ if (VRM->hasPhys(Intervals[I]->reg()))
LRM->unassign(*Intervals[I]);
for (unsigned I = 0; I < Info->VAddrDwords; ++I)
@@ -339,11 +335,12 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
C.second = true;
++NumNSAConverted;
- LLVM_DEBUG(dbgs() << "\tNew allocation:\t\t ["
- << llvm::printReg((VRM->getPhys(Intervals.front()->reg)), TRI)
- << " : "
- << llvm::printReg((VRM->getPhys(Intervals.back()->reg)), TRI)
- << "]\n");
+ LLVM_DEBUG(
+ dbgs() << "\tNew allocation:\t\t ["
+ << llvm::printReg((VRM->getPhys(Intervals.front()->reg())), TRI)
+ << " : "
+ << llvm::printReg((VRM->getPhys(Intervals.back()->reg())), TRI)
+ << "]\n");
Changed = true;
}
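
One hunk above swaps the llvm::find(...) != end() pattern for llvm::is_contained(Intervals, LI); the helper is a readability wrapper over the same membership test. A self-contained sketch of the idiom using only the standard library (isContained here is a stand-in for illustration, not the LLVM helper itself):

// Membership test equivalent to the is_contained change above.
#include <algorithm>
#include <cstdio>
#include <vector>

template <typename Range, typename T>
static bool isContained(const Range &R, const T &Value) {
  return std::find(std::begin(R), std::end(R), Value) != std::end(R);
}

int main() {
  std::vector<int> Intervals = {1, 2, 3};
  std::printf("%d\n", isContained(Intervals, 2)); // 1: duplicate, bail out
  std::printf("%d\n", isContained(Intervals, 7)); // 0: not seen yet
  return 0;
}
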
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNProcessors.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNProcessors.td
index 17e6098d880d..7447ec2db188 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -32,20 +32,24 @@ def : ProcessorModel<"gfx601", SIQuarterSpeedModel,
FeatureISAVersion6_0_1.Features
>;
-def : ProcessorModel<"hainan", SIQuarterSpeedModel,
+def : ProcessorModel<"pitcairn", SIQuarterSpeedModel,
FeatureISAVersion6_0_1.Features
>;
-def : ProcessorModel<"oland", SIQuarterSpeedModel,
+def : ProcessorModel<"verde", SIQuarterSpeedModel,
FeatureISAVersion6_0_1.Features
>;
-def : ProcessorModel<"pitcairn", SIQuarterSpeedModel,
- FeatureISAVersion6_0_1.Features
+def : ProcessorModel<"gfx602", SIQuarterSpeedModel,
+ FeatureISAVersion6_0_2.Features
>;
-def : ProcessorModel<"verde", SIQuarterSpeedModel,
- FeatureISAVersion6_0_1.Features
+def : ProcessorModel<"hainan", SIQuarterSpeedModel,
+ FeatureISAVersion6_0_2.Features
+>;
+
+def : ProcessorModel<"oland", SIQuarterSpeedModel,
+ FeatureISAVersion6_0_2.Features
>;
//===------------------------------------------------------------===//
@@ -92,6 +96,10 @@ def : ProcessorModel<"bonaire", SIQuarterSpeedModel,
FeatureISAVersion7_0_4.Features
>;
+def : ProcessorModel<"gfx705", SIQuarterSpeedModel,
+ FeatureISAVersion7_0_5.Features
+>;
+
//===------------------------------------------------------------===//
// GCN GFX8 (Volcanic Islands (VI)).
//===------------------------------------------------------------===//
@@ -132,6 +140,14 @@ def : ProcessorModel<"polaris11", SIQuarterSpeedModel,
FeatureISAVersion8_0_3.Features
>;
+def : ProcessorModel<"gfx805", SIQuarterSpeedModel,
+ FeatureISAVersion8_0_5.Features
+>;
+
+def : ProcessorModel<"tongapro", SIQuarterSpeedModel,
+ FeatureISAVersion8_0_5.Features
+>;
+
def : ProcessorModel<"gfx810", SIQuarterSpeedModel,
FeatureISAVersion8_1_0.Features
>;
@@ -168,6 +184,10 @@ def : ProcessorModel<"gfx909", SIQuarterSpeedModel,
FeatureISAVersion9_0_9.Features
>;
+def : ProcessorModel<"gfx90c", SIQuarterSpeedModel,
+ FeatureISAVersion9_0_C.Features
+>;
+
//===----------------------------------------------------------------------===//
// GCN GFX10.
//===----------------------------------------------------------------------===//
@@ -187,3 +207,15 @@ def : ProcessorModel<"gfx1012", GFX10SpeedModel,
def : ProcessorModel<"gfx1030", GFX10SpeedModel,
FeatureISAVersion10_3_0.Features
>;
+
+def : ProcessorModel<"gfx1031", GFX10SpeedModel,
+ FeatureISAVersion10_3_0.Features
+>;
+
+def : ProcessorModel<"gfx1032", GFX10SpeedModel,
+ FeatureISAVersion10_3_0.Features
+>;
+
+def : ProcessorModel<"gfx1033", GFX10SpeedModel,
+ FeatureISAVersion10_3_0.Features
+>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
index 98d971630ca4..a12e9ab03e1d 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
@@ -31,20 +31,15 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIInstrInfo.h"
+#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"
-#include "llvm/Support/MathExtras.h"
using namespace llvm;
@@ -76,41 +71,59 @@ class GCNRegBankReassign : public MachineFunctionPass {
public:
OperandMask(unsigned r, unsigned s, unsigned m)
: Reg(r), SubReg(s), Mask(m) {}
- unsigned Reg;
+ Register Reg;
unsigned SubReg;
unsigned Mask;
};
class Candidate {
public:
- Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks,
- unsigned weight)
- : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {}
-
- bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; }
+ Candidate(MachineInstr *mi, Register reg, unsigned subreg,
+ unsigned freebanks)
+ : MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks) {}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void dump(const GCNRegBankReassign *P) const {
MI->dump();
dbgs() << P->printReg(Reg) << " to banks ";
dumpFreeBanks(FreeBanks);
- dbgs() << " weight " << Weight << '\n';
+ dbgs() << '\n';
}
#endif
MachineInstr *MI;
- unsigned Reg;
+ Register Reg;
+ unsigned SubReg;
unsigned FreeBanks;
- unsigned Weight;
};
- class CandidateList : public std::list<Candidate> {
+ class CandidateList : public std::map<unsigned, std::list<Candidate>> {
public:
- // Speedup subsequent sort.
- void push(const Candidate&& C) {
- if (C.Weight) push_back(C);
- else push_front(C);
+ void push(unsigned Weight, const Candidate&& C) {
+ operator[](Weight).push_front(C);
+ }
+
+ Candidate &back() {
+ return rbegin()->second.back();
+ }
+
+ void pop_back() {
+ rbegin()->second.pop_back();
+ if (rbegin()->second.empty())
+ erase(rbegin()->first);
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dump(const GCNRegBankReassign *P) const {
+ dbgs() << "\nCandidates:\n\n";
+ for (auto &B : *this) {
+ dbgs() << " Weight " << B.first << ":\n";
+ for (auto &C : B.second)
+ C.dump(P);
+ }
+ dbgs() << "\n\n";
}
+#endif
};
public:
@@ -162,32 +175,32 @@ private:
const MCPhysReg *CSRegs;
// Returns bank for a phys reg.
- unsigned getPhysRegBank(unsigned Reg) const;
+ unsigned getPhysRegBank(Register Reg, unsigned SubReg) const;
// Return a bit set for each register bank used. 4 banks for VGPRs and
// 8 banks for SGPRs.
// Registers already processed and recorded in RegsUsed are excluded.
// If Bank is not -1 assume Reg:SubReg to belong to that Bank.
- uint32_t getRegBankMask(unsigned Reg, unsigned SubReg, int Bank);
+ uint32_t getRegBankMask(Register Reg, unsigned SubReg, int Bank);
// Analyze one instruction returning the number of stalls and a mask of the
// banks used by all operands.
// If Reg and Bank are provided, assume all uses of Reg will be replaced with
// a register chosen from Bank.
std::pair<unsigned, unsigned> analyzeInst(const MachineInstr &MI,
- unsigned Reg = AMDGPU::NoRegister,
- int Bank = -1);
+ Register Reg = Register(),
+ unsigned SubReg = 0, int Bank = -1);
// Return true if register is regular VGPR or SGPR or their tuples.
// Returns false for special registers like m0, vcc etc.
- bool isReassignable(unsigned Reg) const;
+ bool isReassignable(Register Reg) const;
// Check if registers' defs are old and may be pre-loaded.
// Returns 0 if both registers are old enough, 1 or 2 if one or both
// registers will not likely be pre-loaded.
unsigned getOperandGatherWeight(const MachineInstr& MI,
- unsigned Reg1,
- unsigned Reg2,
+ Register Reg1,
+ Register Reg2,
unsigned StallCycles) const;
@@ -197,7 +210,7 @@ private:
// Find all bank bits in UsedBanks where Mask can be relocated to.
// Bank is relative to the register and not its subregister component.
// Returns 0 if a register is not reassignable.
- unsigned getFreeBanks(unsigned Reg, unsigned SubReg, unsigned Mask,
+ unsigned getFreeBanks(Register Reg, unsigned SubReg, unsigned Mask,
unsigned UsedBanks) const;
// Add candidate instruction to the work list.
@@ -209,18 +222,20 @@ private:
unsigned collectCandidates(MachineFunction &MF, bool Collect = true);
// Remove all candidates that read specified register.
- void removeCandidates(unsigned Reg);
+ void removeCandidates(Register Reg);
// Compute stalls within the uses of SrcReg replaced by a register from
// Bank. If Bank is -1 does not perform substitution. If Collect is set
// candidates are collected and added to work list.
- unsigned computeStallCycles(unsigned SrcReg,
- unsigned Reg = AMDGPU::NoRegister,
- int Bank = -1, bool Collect = false);
+ unsigned computeStallCycles(Register SrcReg,
+ Register Reg = Register(),
+ unsigned SubReg = 0, int Bank = -1,
+ bool Collect = false);
// Search for a register in Bank unused within LI.
// Returns phys reg or NoRegister.
- unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const;
+ MCRegister scavengeReg(LiveInterval &LI, unsigned Bank,
+ unsigned SubReg) const;
// Try to reassign candidate. Returns number of stall cycles saved.
unsigned tryReassign(Candidate &C);
@@ -231,9 +246,9 @@ private:
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
public:
- Printable printReg(unsigned Reg, unsigned SubReg = 0) const {
+ Printable printReg(Register Reg, unsigned SubReg = 0) const {
return Printable([Reg, SubReg, this](raw_ostream &OS) {
- if (Register::isPhysicalRegister(Reg)) {
+ if (Reg.isPhysical()) {
OS << llvm::printReg(Reg, TRI);
return;
}
@@ -277,28 +292,37 @@ char GCNRegBankReassign::ID = 0;
char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
-unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
- assert(Register::isPhysicalRegister(Reg));
+unsigned GCNRegBankReassign::getPhysRegBank(Register Reg,
+ unsigned SubReg) const {
+ assert(Reg.isPhysical());
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
unsigned Size = TRI->getRegSizeInBits(*RC);
if (Size == 16)
Reg = TRI->get32BitRegister(Reg);
- else if (Size > 32)
- Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
+ else if (Size > 32) {
+ if (SubReg) {
+ const TargetRegisterClass *SubRC = TRI->getSubRegClass(RC, SubReg);
+ Reg = TRI->getSubReg(Reg, SubReg);
+ if (TRI->getRegSizeInBits(*SubRC) > 32)
+ Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
+ } else {
+ Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
+ }
+ }
if (TRI->hasVGPRs(RC)) {
- Reg -= AMDGPU::VGPR0;
- return Reg % NUM_VGPR_BANKS;
+ unsigned RegNo = Reg - AMDGPU::VGPR0;
+ return RegNo % NUM_VGPR_BANKS;
}
- Reg = TRI->getEncodingValue(Reg) / 2;
- return Reg % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
+ unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2;
+ return RegNo % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
}
-uint32_t GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
+uint32_t GCNRegBankReassign::getRegBankMask(Register Reg, unsigned SubReg,
int Bank) {
- if (Register::isVirtualRegister(Reg)) {
+ if (Reg.isVirtual()) {
if (!VRM->isAssignedReg(Reg))
return 0;
@@ -323,23 +347,23 @@ uint32_t GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
if (TRI->hasVGPRs(RC)) {
// VGPRs have 4 banks assigned in a round-robin fashion.
- Reg -= AMDGPU::VGPR0;
+ unsigned RegNo = Reg - AMDGPU::VGPR0;
uint32_t Mask = maskTrailingOnes<uint32_t>(Size);
unsigned Used = 0;
// Bitmask lacks an extract method
for (unsigned I = 0; I < Size; ++I)
- if (RegsUsed.test(Reg + I))
+ if (RegsUsed.test(RegNo + I))
Used |= 1 << I;
- RegsUsed.set(Reg, Reg + Size);
+ RegsUsed.set(RegNo, RegNo + Size);
Mask &= ~Used;
- Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : uint32_t(Bank);
+ Mask <<= (Bank == -1) ? RegNo % NUM_VGPR_BANKS : uint32_t(Bank);
return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
}
// SGPRs have 8 banks holding 2 consecutive registers each.
- Reg = TRI->getEncodingValue(Reg) / 2;
+ unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2;
unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs();
- if (Reg + StartBit >= RegsUsed.size())
+ if (RegNo + StartBit >= RegsUsed.size())
return 0;
if (Size > 1)
@@ -347,11 +371,11 @@ uint32_t GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
unsigned Mask = (1 << Size) - 1;
unsigned Used = 0;
for (unsigned I = 0; I < Size; ++I)
- if (RegsUsed.test(StartBit + Reg + I))
+ if (RegsUsed.test(StartBit + RegNo + I))
Used |= 1 << I;
- RegsUsed.set(StartBit + Reg, StartBit + Reg + Size);
+ RegsUsed.set(StartBit + RegNo, StartBit + RegNo + Size);
Mask &= ~Used;
- Mask <<= (Bank == -1) ? Reg % NUM_SGPR_BANKS
+ Mask <<= (Bank == -1) ? RegNo % NUM_SGPR_BANKS
: unsigned(Bank - SGPR_BANK_OFFSET);
Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
// Reserve 4 bank ids for VGPRs.
@@ -359,8 +383,8 @@ uint32_t GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
}
std::pair<unsigned, unsigned>
-GCNRegBankReassign::analyzeInst(const MachineInstr &MI, unsigned Reg,
- int Bank) {
+GCNRegBankReassign::analyzeInst(const MachineInstr &MI, Register Reg,
+ unsigned SubReg, int Bank) {
unsigned StallCycles = 0;
unsigned UsedBanks = 0;
@@ -375,26 +399,39 @@ GCNRegBankReassign::analyzeInst(const MachineInstr &MI, unsigned Reg,
if (!Op.isReg() || Op.isUndef())
continue;
- Register R = Op.getReg();
- if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R)))
- continue;
+ const Register R = Op.getReg();
+ const TargetRegisterClass *RC = TRI->getRegClassForReg(*MRI, R);
- unsigned ShiftedBank = Bank;
+ // Do not compute stalls for AGPRs
+ if (TRI->hasAGPRs(RC))
+ continue;
- if (Bank != -1 && R == Reg && Op.getSubReg()) {
- unsigned Offset = TRI->getChannelFromSubReg(Op.getSubReg());
+ // Do not compute stalls if sub-register covers all banks
+ if (Op.getSubReg()) {
LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg());
- if (Offset && Bank < NUM_VGPR_BANKS) {
- // If a register spans all banks we cannot shift it to avoid conflict.
+ if (TRI->hasVGPRs(RC)) {
if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS)
continue;
- ShiftedBank = (Bank + Offset) % NUM_VGPR_BANKS;
- } else if (Offset > 1 && Bank >= SGPR_BANK_OFFSET) {
- // If a register spans all banks we cannot shift it to avoid conflict.
+ } else {
if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS)
continue;
+ }
+ }
+
+ unsigned ShiftedBank = Bank;
+
+ if (Bank != -1 && R == Reg && (Op.getSubReg() || SubReg)) {
+ unsigned RegOffset =
+ TRI->getChannelFromSubReg(SubReg ? SubReg : (unsigned)AMDGPU::sub0);
+ unsigned Offset = TRI->getChannelFromSubReg(
+ Op.getSubReg() ? Op.getSubReg() : (unsigned)AMDGPU::sub0);
+ if (Bank < NUM_VGPR_BANKS) {
+ unsigned Shift = ((NUM_VGPR_BANKS + Offset) - RegOffset);
+ ShiftedBank = (Bank + Shift) % NUM_VGPR_BANKS;
+ } else if (Bank >= SGPR_BANK_OFFSET) {
+ unsigned Shift = (NUM_SGPR_BANKS + (Offset >> 1)) - (RegOffset >> 1);
ShiftedBank = SGPR_BANK_OFFSET +
- (Bank - SGPR_BANK_OFFSET + (Offset >> 1)) % NUM_SGPR_BANKS;
+ (Bank - SGPR_BANK_OFFSET + Shift) % NUM_SGPR_BANKS;
}
}
@@ -409,8 +446,8 @@ GCNRegBankReassign::analyzeInst(const MachineInstr &MI, unsigned Reg,
}
unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI,
- unsigned Reg1,
- unsigned Reg2,
+ Register Reg1,
+ Register Reg2,
unsigned StallCycles) const
{
unsigned Defs = 0;
@@ -430,8 +467,8 @@ unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI,
return countPopulation(Defs);
}
-bool GCNRegBankReassign::isReassignable(unsigned Reg) const {
- if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
+bool GCNRegBankReassign::isReassignable(Register Reg) const {
+ if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
return false;
const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
@@ -506,7 +543,7 @@ unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask,
return FreeBanks;
}
-unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg,
+unsigned GCNRegBankReassign::getFreeBanks(Register Reg,
unsigned SubReg,
unsigned Mask,
unsigned UsedBanks) const {
@@ -556,8 +593,8 @@ void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
if (!(OperandMasks[I].Mask & OperandMasks[J].Mask))
continue;
- unsigned Reg1 = OperandMasks[I].Reg;
- unsigned Reg2 = OperandMasks[J].Reg;
+ Register Reg1 = OperandMasks[I].Reg;
+ Register Reg2 = OperandMasks[J].Reg;
unsigned SubReg1 = OperandMasks[I].SubReg;
unsigned SubReg2 = OperandMasks[J].SubReg;
unsigned Mask1 = OperandMasks[I].Mask;
@@ -576,17 +613,17 @@ void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks);
unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks);
if (FreeBanks1)
- Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight
- + ((Size2 > Size1) ? 1 : 0)));
+ Candidates.push(Weight + ((Size2 > Size1) ? 1 : 0),
+ Candidate(&MI, Reg1, SubReg1, FreeBanks1));
if (FreeBanks2)
- Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight
- + ((Size1 > Size2) ? 1 : 0)));
+ Candidates.push(Weight + ((Size1 > Size2) ? 1 : 0),
+ Candidate(&MI, Reg2, SubReg2, FreeBanks2));
}
}
}
-unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
- unsigned Reg, int Bank,
+unsigned GCNRegBankReassign::computeStallCycles(Register SrcReg, Register Reg,
+ unsigned SubReg, int Bank,
bool Collect) {
unsigned TotalStallCycles = 0;
SmallSet<const MachineInstr *, 16> Visited;
@@ -598,7 +635,7 @@ unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
continue;
unsigned StallCycles;
unsigned UsedBanks;
- std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, Bank);
+ std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, SubReg, Bank);
TotalStallCycles += StallCycles;
if (Collect)
collectCandidates(MI, UsedBanks, StallCycles);
@@ -607,26 +644,26 @@ unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
return TotalStallCycles;
}
-unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI,
- unsigned Bank) const {
- const TargetRegisterClass *RC = MRI->getRegClass(LI.reg);
+MCRegister GCNRegBankReassign::scavengeReg(LiveInterval &LI, unsigned Bank,
+ unsigned SubReg) const {
+ const TargetRegisterClass *RC = MRI->getRegClass(LI.reg());
unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs
: MaxNumSGPRs;
unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0
: AMDGPU::SGPR0);
- for (unsigned Reg : RC->getRegisters()) {
+ for (MCRegister Reg : RC->getRegisters()) {
// Check occupancy limit.
if (TRI->isSubRegisterEq(Reg, MaxReg))
break;
- if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank)
+ if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg, SubReg) != Bank)
continue;
for (unsigned I = 0; CSRegs[I]; ++I)
if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
!LRM->isPhysRegUsed(CSRegs[I]))
- return AMDGPU::NoRegister;
+ return MCRegister::from(AMDGPU::NoRegister);
LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n');
@@ -634,7 +671,7 @@ unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI,
return Reg;
}
- return AMDGPU::NoRegister;
+ return MCRegister::from(AMDGPU::NoRegister);
}
unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
@@ -669,7 +706,7 @@ unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
for (int Bank = 0; Bank < NUM_BANKS; ++Bank) {
if (C.FreeBanks & (1 << Bank)) {
LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n');
- unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank);
+ unsigned Stalls = computeStallCycles(C.Reg, C.Reg, C.SubReg, Bank);
if (Stalls < OrigStalls) {
LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> "
<< Stalls << '\n');
@@ -679,11 +716,11 @@ unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
}
llvm::sort(BankStalls);
- Register OrigReg = VRM->getPhys(C.Reg);
+ MCRegister OrigReg = VRM->getPhys(C.Reg);
LRM->unassign(LI);
while (!BankStalls.empty()) {
BankStall BS = BankStalls.pop_back_val();
- unsigned Reg = scavengeReg(LI, BS.Bank);
+ MCRegister Reg = scavengeReg(LI, BS.Bank, C.SubReg);
if (Reg == AMDGPU::NoRegister) {
LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank)
<< '\n');
@@ -735,10 +772,16 @@ unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF,
return TotalStallCycles;
}
-void GCNRegBankReassign::removeCandidates(unsigned Reg) {
- Candidates.remove_if([Reg, this](const Candidate& C) {
- return C.MI->readsRegister(Reg, TRI);
- });
+void GCNRegBankReassign::removeCandidates(Register Reg) {
+ typename CandidateList::iterator Next;
+ for (auto I = Candidates.begin(), E = Candidates.end(); I != E; I = Next) {
+ Next = std::next(I);
+ I->second.remove_if([Reg, this](const Candidate& C) {
+ return C.MI->readsRegister(Reg, TRI);
+ });
+ if (I->second.empty())
+ Candidates.erase(I);
+ }
}
bool GCNRegBankReassign::verifyCycles(MachineFunction &MF,
@@ -770,9 +813,10 @@ bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs);
CSRegs = MRI->getCalleeSavedRegs();
-
- RegsUsed.resize(AMDGPU::VGPR_32RegClass.getNumRegs() +
- TRI->getEncodingValue(AMDGPU::SGPR_NULL) / 2 + 1);
+ unsigned NumRegBanks = AMDGPU::VGPR_32RegClass.getNumRegs() +
+ // Not a tight bound
+ AMDGPU::SReg_32RegClass.getNumRegs() / 2 + 1;
+ RegsUsed.resize(NumRegBanks);
LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName()
<< '\n');
@@ -783,11 +827,7 @@ bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in "
"function " << MF.getName() << '\n');
- Candidates.sort();
-
- LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
- for (auto C : Candidates) C.dump(this);
- dbgs() << "\n\n");
+ LLVM_DEBUG(Candidates.dump(this));
unsigned CyclesSaved = 0;
while (!Candidates.empty()) {
@@ -801,13 +841,9 @@ bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
Candidates.pop_back();
if (LocalCyclesSaved) {
removeCandidates(C.Reg);
- computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true);
- Candidates.sort();
+ computeStallCycles(C.Reg, AMDGPU::NoRegister, 0, -1, true);
- LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
- for (auto C : Candidates)
- C.dump(this);
- dbgs() << "\n\n");
+ LLVM_DEBUG(Candidates.dump(this));
}
}
NumStallsRecovered += CyclesSaved;
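
The reworked CandidateList in this file buckets candidates by weight in an ordered map instead of keeping one flat list that has to be re-sorted, so back() always comes from the highest-weight bucket and pop_back() drops empty buckets. A standalone sketch of those push/back/pop_back semantics, with plain ints standing in for Candidate objects:

// Weight-bucketed work list mirroring the new CandidateList shape.
#include <cstdio>
#include <list>
#include <map>

struct BucketedList : std::map<unsigned, std::list<int>> {
  void push(unsigned Weight, int C) { (*this)[Weight].push_front(C); }
  int &back() { return rbegin()->second.back(); }
  void pop_back() {
    rbegin()->second.pop_back();
    if (rbegin()->second.empty())
      erase(rbegin()->first);
  }
};

int main() {
  BucketedList Candidates;
  Candidates.push(/*Weight=*/0, 10);
  Candidates.push(/*Weight=*/2, 20);
  Candidates.push(/*Weight=*/2, 21);
  while (!Candidates.empty()) {
    std::printf("process candidate %d\n", Candidates.back());
    Candidates.pop_back(); // drains the heaviest bucket first: 20, 21, then 10
  }
  return 0;
}
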
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 86a3cb9af32f..aeec3e886327 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -12,25 +12,7 @@
//===----------------------------------------------------------------------===//
#include "GCNRegPressure.h"
-#include "AMDGPUSubtarget.h"
-#include "SIRegisterInfo.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/LiveInterval.h"
-#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
-#include "llvm/CodeGen/SlotIndexes.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/Config/llvm-config.h"
-#include "llvm/MC/LaneBitmask.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
using namespace llvm;
@@ -87,9 +69,9 @@ bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1,
///////////////////////////////////////////////////////////////////////////////
// GCNRegPressure
-unsigned GCNRegPressure::getRegKind(unsigned Reg,
+unsigned GCNRegPressure::getRegKind(Register Reg,
const MachineRegisterInfo &MRI) {
- assert(Register::isVirtualRegister(Reg));
+ assert(Reg.isVirtual());
const auto RC = MRI.getRegClass(Reg);
auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
return STI->isSGPRClass(RC) ?
@@ -199,7 +181,7 @@ void GCNRegPressure::print(raw_ostream &OS, const GCNSubtarget *ST) const {
static LaneBitmask getDefRegMask(const MachineOperand &MO,
const MachineRegisterInfo &MRI) {
- assert(MO.isDef() && MO.isReg() && Register::isVirtualRegister(MO.getReg()));
+ assert(MO.isDef() && MO.isReg() && MO.getReg().isVirtual());
// We don't rely on read-undef flag because in case of tentative schedule
// tracking it isn't set correctly yet. This works correctly however since
@@ -212,7 +194,7 @@ static LaneBitmask getDefRegMask(const MachineOperand &MO,
static LaneBitmask getUsedRegMask(const MachineOperand &MO,
const MachineRegisterInfo &MRI,
const LiveIntervals &LIS) {
- assert(MO.isUse() && MO.isReg() && Register::isVirtualRegister(MO.getReg()));
+ assert(MO.isUse() && MO.isReg() && MO.getReg().isVirtual());
if (auto SubReg = MO.getSubReg())
return MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);
@@ -233,7 +215,7 @@ collectVirtualRegUses(const MachineInstr &MI, const LiveIntervals &LIS,
const MachineRegisterInfo &MRI) {
SmallVector<RegisterMaskPair, 8> Res;
for (const auto &MO : MI.operands()) {
- if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()))
+ if (!MO.isReg() || !MO.getReg().isVirtual())
continue;
if (!MO.isUse() || !MO.readsReg())
continue;
@@ -241,9 +223,8 @@ collectVirtualRegUses(const MachineInstr &MI, const LiveIntervals &LIS,
auto const UsedMask = getUsedRegMask(MO, MRI, LIS);
auto Reg = MO.getReg();
- auto I = std::find_if(Res.begin(), Res.end(), [Reg](const RegisterMaskPair &RM) {
- return RM.RegUnit == Reg;
- });
+ auto I = llvm::find_if(
+ Res, [Reg](const RegisterMaskPair &RM) { return RM.RegUnit == Reg; });
if (I != Res.end())
I->LaneMask |= UsedMask;
else
@@ -330,8 +311,7 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
MaxPressure = max(AtMIPressure, MaxPressure);
for (const auto &MO : MI.operands()) {
- if (!MO.isReg() || !MO.isDef() ||
- !Register::isVirtualRegister(MO.getReg()) || MO.isDead())
+ if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual() || MO.isDead())
continue;
auto Reg = MO.getReg();
@@ -410,7 +390,7 @@ void GCNDownwardRPTracker::advanceToNext() {
if (!MO.isReg() || !MO.isDef())
continue;
Register Reg = MO.getReg();
- if (!Register::isVirtualRegister(Reg))
+ if (!Reg.isVirtual())
continue;
auto &LiveMask = LiveRegs[Reg];
auto PrevMask = LiveMask;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 2ef79410719f..ba8c85aa502b 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -17,21 +17,15 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
#define LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
-#include "AMDGPUSubtarget.h"
-#include "llvm/ADT/DenseMap.h"
+#include "GCNSubtarget.h"
#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/SlotIndexes.h"
-#include "llvm/MC/LaneBitmask.h"
-#include "llvm/Support/Debug.h"
#include <algorithm>
-#include <limits>
namespace llvm {
class MachineRegisterInfo;
class raw_ostream;
+class SlotIndex;
struct GCNRegPressure {
enum RegKind {
@@ -90,7 +84,7 @@ struct GCNRegPressure {
private:
unsigned Value[TOTAL_KINDS];
- static unsigned getRegKind(unsigned Reg, const MachineRegisterInfo &MRI);
+ static unsigned getRegKind(Register Reg, const MachineRegisterInfo &MRI);
friend GCNRegPressure max(const GCNRegPressure &P1,
const GCNRegPressure &P2);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index deed50b6db7d..6e2550298dc6 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -12,13 +12,7 @@
//===----------------------------------------------------------------------===//
#include "GCNSchedStrategy.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/CodeGen/RegisterClassInfo.h"
-#include "llvm/Support/MathExtras.h"
#define DEBUG_TYPE "machine-scheduler"
@@ -567,8 +561,10 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
SavedMutations.swap(Mutations);
for (auto Region : Regions) {
- if (Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx])
+ if (Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx]) {
+ ++RegionIdx;
continue;
+ }
RegionBegin = Region.first;
RegionEnd = Region.second;
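
The finalizeSchedule hunk above fixes a skipped-index bug: RegionIdx has to advance even when an UnclusteredReschedule region is skipped, otherwise later iterations read the wrong RescheduleRegions bit. A minimal standalone illustration of the pattern with toy data (not the scheduler's actual types):

// A range-for with a parallel index: every early `continue` must still bump
// the index, or the flags and regions fall out of sync.
#include <cstdio>
#include <vector>

int main() {
  std::vector<const char *> Regions = {"r0", "r1", "r2"};
  std::vector<bool> Reschedule = {false, true, true};

  unsigned RegionIdx = 0;
  for (const char *R : Regions) {
    if (!Reschedule[RegionIdx]) {
      ++RegionIdx; // the fix: keep the index in sync before skipping
      continue;
    }
    std::printf("reschedule %s (flag index %u)\n", R, RegionIdx);
    ++RegionIdx;
  }
  return 0;
}
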
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h
new file mode 100644
index 000000000000..7a7178126444
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -0,0 +1,1064 @@
+//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// AMD GCN specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
+#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
+
+#include "AMDGPUCallLowering.h"
+#include "AMDGPUSubtarget.h"
+#include "SIFrameLowering.h"
+#include "SIISelLowering.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+class MCInst;
+class MCInstrInfo;
+
+} // namespace llvm
+
+#define GET_SUBTARGETINFO_HEADER
+#include "AMDGPUGenSubtargetInfo.inc"
+
+namespace llvm {
+
+class GCNTargetMachine;
+
+class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
+ public AMDGPUSubtarget {
+
+ using AMDGPUSubtarget::getMaxWavesPerEU;
+
+public:
+ enum TrapHandlerAbi {
+ TrapHandlerAbiNone = 0,
+ TrapHandlerAbiHsa = 1
+ };
+
+ enum TrapID {
+ TrapIDHardwareReserved = 0,
+ TrapIDHSADebugTrap = 1,
+ TrapIDLLVMTrap = 2,
+ TrapIDLLVMDebugTrap = 3,
+ TrapIDDebugBreakpoint = 7,
+ TrapIDDebugReserved8 = 8,
+ TrapIDDebugReservedFE = 0xfe,
+ TrapIDDebugReservedFF = 0xff
+ };
+
+ enum TrapRegValues {
+ LLVMTrapHandlerRegValue = 1
+ };
+
+private:
+ /// GlobalISel related APIs.
+ std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
+ std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+
+protected:
+ // Basic subtarget description.
+ Triple TargetTriple;
+ AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
+ unsigned Gen;
+ InstrItineraryData InstrItins;
+ int LDSBankCount;
+ unsigned MaxPrivateElementSize;
+
+ // Possibly statically set by tablegen, but may want to be overridden.
+ bool FastFMAF32;
+ bool FastDenormalF32;
+ bool HalfRate64Ops;
+
+ // Dynamically set bits that enable features.
+ bool FlatForGlobal;
+ bool AutoWaitcntBeforeBarrier;
+ bool UnalignedScratchAccess;
+ bool UnalignedAccessMode;
+ bool HasApertureRegs;
+ bool SupportsXNACK;
+
+ // This should not be used directly. 'TargetID' tracks the dynamic settings
+ // for XNACK.
+ bool EnableXNACK;
+
+ bool EnableCuMode;
+ bool TrapHandler;
+
+ // Used as options.
+ bool EnableLoadStoreOpt;
+ bool EnableUnsafeDSOffsetFolding;
+ bool EnableSIScheduler;
+ bool EnableDS128;
+ bool EnablePRTStrictNull;
+ bool DumpCode;
+
+  // Subtarget properties statically set by tablegen
+ bool FP64;
+ bool FMA;
+ bool MIMG_R128;
+ bool GCN3Encoding;
+ bool CIInsts;
+ bool GFX8Insts;
+ bool GFX9Insts;
+ bool GFX10Insts;
+ bool GFX10_3Insts;
+ bool GFX7GFX8GFX9Insts;
+ bool SGPRInitBug;
+ bool HasSMemRealTime;
+ bool HasIntClamp;
+ bool HasFmaMixInsts;
+ bool HasMovrel;
+ bool HasVGPRIndexMode;
+ bool HasScalarStores;
+ bool HasScalarAtomics;
+ bool HasSDWAOmod;
+ bool HasSDWAScalar;
+ bool HasSDWASdst;
+ bool HasSDWAMac;
+ bool HasSDWAOutModsVOPC;
+ bool HasDPP;
+ bool HasDPP8;
+ bool HasR128A16;
+ bool HasGFX10A16;
+ bool HasG16;
+ bool HasNSAEncoding;
+ bool GFX10_BEncoding;
+ bool HasDLInsts;
+ bool HasDot1Insts;
+ bool HasDot2Insts;
+ bool HasDot3Insts;
+ bool HasDot4Insts;
+ bool HasDot5Insts;
+ bool HasDot6Insts;
+ bool HasMAIInsts;
+ bool HasPkFmacF16Inst;
+ bool HasAtomicFaddInsts;
+ bool SupportsSRAMECC;
+
+ // This should not be used directly. 'TargetID' tracks the dynamic settings
+ // for SRAMECC.
+ bool EnableSRAMECC;
+
+ bool HasNoSdstCMPX;
+ bool HasVscnt;
+ bool HasGetWaveIdInst;
+ bool HasSMemTimeInst;
+ bool HasRegisterBanking;
+ bool HasVOP3Literal;
+ bool HasNoDataDepHazard;
+ bool FlatAddressSpace;
+ bool FlatInstOffsets;
+ bool FlatGlobalInsts;
+ bool FlatScratchInsts;
+ bool ScalarFlatScratchInsts;
+ bool AddNoCarryInsts;
+ bool HasUnpackedD16VMem;
+ bool LDSMisalignedBug;
+ bool HasMFMAInlineLiteralBug;
+ bool UnalignedBufferAccess;
+ bool UnalignedDSAccess;
+ bool ScalarizeGlobal;
+
+ bool HasVcmpxPermlaneHazard;
+ bool HasVMEMtoScalarWriteHazard;
+ bool HasSMEMtoVectorWriteHazard;
+ bool HasInstFwdPrefetchBug;
+ bool HasVcmpxExecWARHazard;
+ bool HasLdsBranchVmemWARHazard;
+ bool HasNSAtoVMEMBug;
+ bool HasOffset3fBug;
+ bool HasFlatSegmentOffsetBug;
+ bool HasImageStoreD16Bug;
+ bool HasImageGather4D16Bug;
+
+ // Dummy feature to use for assembler in tablegen.
+ bool FeatureDisable;
+
+ SelectionDAGTargetInfo TSInfo;
+private:
+ SIInstrInfo InstrInfo;
+ SITargetLowering TLInfo;
+ SIFrameLowering FrameLowering;
+
+public:
+  // See COMPUTE_TMPRING_SIZE.WAVESIZE, a 13-bit field in units of 256 dwords.
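+  // That works out to 1024 bytes per unit * 8191 units = 8,387,584 bytes of
+  // scratch per wave.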
+ static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1);
+
+ GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
+ const GCNTargetMachine &TM);
+ ~GCNSubtarget() override;
+
+ GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
+ StringRef GPU, StringRef FS);
+
+ const SIInstrInfo *getInstrInfo() const override {
+ return &InstrInfo;
+ }
+
+ const SIFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+
+ const SITargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+
+ const SIRegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
+
+ const CallLowering *getCallLowering() const override {
+ return CallLoweringInfo.get();
+ }
+
+ const InlineAsmLowering *getInlineAsmLowering() const override {
+ return InlineAsmLoweringInfo.get();
+ }
+
+ InstructionSelector *getInstructionSelector() const override {
+ return InstSelector.get();
+ }
+
+ const LegalizerInfo *getLegalizerInfo() const override {
+ return Legalizer.get();
+ }
+
+ const RegisterBankInfo *getRegBankInfo() const override {
+ return RegBankInfo.get();
+ }
+
+ // Nothing implemented, just prevent crashes on use.
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+
+ const InstrItineraryData *getInstrItineraryData() const override {
+ return &InstrItins;
+ }
+
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
+
+ Generation getGeneration() const {
+ return (Generation)Gen;
+ }
+
+  /// Return the number of high bits known to be zero for a frame index.
+ unsigned getKnownHighZeroBitsForFrameIndex() const {
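+    // A frame index is a per-lane byte offset, bounded by MaxWaveScratchSize
+    // divided by the wavefront size, so the wavefront-size log2 adds that many
+    // extra known-zero high bits.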
+ return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2();
+ }
+
+ int getLDSBankCount() const {
+ return LDSBankCount;
+ }
+
+ unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
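+    // When flat scratch is used and no buffer rsrc is required, private
+    // elements of up to 16 bytes are supported regardless of the subtarget
+    // default.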
+ return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
+ }
+
+ unsigned getConstantBusLimit(unsigned Opcode) const;
+
+ bool hasIntClamp() const {
+ return HasIntClamp;
+ }
+
+ bool hasFP64() const {
+ return FP64;
+ }
+
+ bool hasMIMG_R128() const {
+ return MIMG_R128;
+ }
+
+ bool hasHWFP64() const {
+ return FP64;
+ }
+
+ bool hasFastFMAF32() const {
+ return FastFMAF32;
+ }
+
+ bool hasHalfRate64Ops() const {
+ return HalfRate64Ops;
+ }
+
+ bool hasAddr64() const {
+ return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
+ }
+
+ bool hasFlat() const {
+ return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
+ }
+
+ // Return true if the target only has the reverse operand versions of VALU
+ // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
+ bool hasOnlyRevVALUShifts() const {
+ return getGeneration() >= VOLCANIC_ISLANDS;
+ }
+
+ bool hasFractBug() const {
+ return getGeneration() == SOUTHERN_ISLANDS;
+ }
+
+ bool hasBFE() const {
+ return true;
+ }
+
+ bool hasBFI() const {
+ return true;
+ }
+
+ bool hasBFM() const {
+ return hasBFE();
+ }
+
+ bool hasBCNT(unsigned Size) const {
+ return true;
+ }
+
+ bool hasFFBL() const {
+ return true;
+ }
+
+ bool hasFFBH() const {
+ return true;
+ }
+
+ bool hasMed3_16() const {
+ return getGeneration() >= AMDGPUSubtarget::GFX9;
+ }
+
+ bool hasMin3Max3_16() const {
+ return getGeneration() >= AMDGPUSubtarget::GFX9;
+ }
+
+ bool hasFmaMixInsts() const {
+ return HasFmaMixInsts;
+ }
+
+ bool hasCARRY() const {
+ return true;
+ }
+
+ bool hasFMA() const {
+ return FMA;
+ }
+
+ bool hasSwap() const {
+ return GFX9Insts;
+ }
+
+ bool hasScalarPackInsts() const {
+ return GFX9Insts;
+ }
+
+ bool hasScalarMulHiInsts() const {
+ return GFX9Insts;
+ }
+
+ TrapHandlerAbi getTrapHandlerAbi() const {
+ return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
+ }
+
+ /// True if the offset field of DS instructions works as expected. On SI, the
+ /// offset uses a 16-bit adder and does not always wrap properly.
+ bool hasUsableDSOffset() const {
+ return getGeneration() >= SEA_ISLANDS;
+ }
+
+ bool unsafeDSOffsetFoldingEnabled() const {
+ return EnableUnsafeDSOffsetFolding;
+ }
+
+ /// Condition output from div_scale is usable.
+ bool hasUsableDivScaleConditionOutput() const {
+ return getGeneration() != SOUTHERN_ISLANDS;
+ }
+
+ /// Extra wait hazard is needed in some cases before
+ /// s_cbranch_vccnz/s_cbranch_vccz.
+ bool hasReadVCCZBug() const {
+ return getGeneration() <= SEA_ISLANDS;
+ }
+
+ /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
+ bool partialVCCWritesUpdateVCCZ() const {
+ return getGeneration() >= GFX10;
+ }
+
+ /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
+ /// was written by a VALU instruction.
+ bool hasSMRDReadVALUDefHazard() const {
+ return getGeneration() == SOUTHERN_ISLANDS;
+ }
+
+ /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
+ /// SGPR was written by a VALU Instruction.
+ bool hasVMEMReadSGPRVALUDefHazard() const {
+ return getGeneration() >= VOLCANIC_ISLANDS;
+ }
+
+ bool hasRFEHazards() const {
+ return getGeneration() >= VOLCANIC_ISLANDS;
+ }
+
+ /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
+ unsigned getSetRegWaitStates() const {
+ return getGeneration() <= SEA_ISLANDS ? 1 : 2;
+ }
+
+ bool dumpCode() const {
+ return DumpCode;
+ }
+
+  /// Return the amount of LDS that can be used without restricting the
+  /// occupancy below WaveCount.
+ unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
+ const Function &) const;
+
+ bool supportsMinMaxDenormModes() const {
+ return getGeneration() >= AMDGPUSubtarget::GFX9;
+ }
+
+  /// \returns true if the target supports S_DENORM_MODE.
+ bool hasDenormModeInst() const {
+ return getGeneration() >= AMDGPUSubtarget::GFX10;
+ }
+
+ bool useFlatForGlobal() const {
+ return FlatForGlobal;
+ }
+
+  /// \returns true if the target supports ds_read/write_b128 and the user has
+  /// enabled generation of ds_read/write_b128.
+ bool useDS128() const {
+ return CIInsts && EnableDS128;
+ }
+
+  /// \returns true if the target supports ds_read/write_b96/128.
+ bool hasDS96AndDS128() const {
+ return CIInsts;
+ }
+
+ /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
+ bool haveRoundOpsF64() const {
+ return CIInsts;
+ }
+
+  /// \returns true if MUBUF instructions always perform range checking, even
+  /// for buffer resources used for private memory access.
+ bool privateMemoryResourceIsRangeChecked() const {
+ return getGeneration() < AMDGPUSubtarget::GFX9;
+ }
+
+  /// \returns true if the target requires PRT Struct NULL support (zero result
+  /// registers for sparse texture support).
+ bool usePRTStrictNull() const {
+ return EnablePRTStrictNull;
+ }
+
+ bool hasAutoWaitcntBeforeBarrier() const {
+ return AutoWaitcntBeforeBarrier;
+ }
+
+ bool hasUnalignedBufferAccess() const {
+ return UnalignedBufferAccess;
+ }
+
+ bool hasUnalignedBufferAccessEnabled() const {
+ return UnalignedBufferAccess && UnalignedAccessMode;
+ }
+
+ bool hasUnalignedDSAccess() const {
+ return UnalignedDSAccess;
+ }
+
+ bool hasUnalignedDSAccessEnabled() const {
+ return UnalignedDSAccess && UnalignedAccessMode;
+ }
+
+ bool hasUnalignedScratchAccess() const {
+ return UnalignedScratchAccess;
+ }
+
+ bool hasUnalignedAccessMode() const {
+ return UnalignedAccessMode;
+ }
+
+ bool hasApertureRegs() const {
+ return HasApertureRegs;
+ }
+
+ bool isTrapHandlerEnabled() const {
+ return TrapHandler;
+ }
+
+ bool isXNACKEnabled() const {
+ return TargetID.isXnackOnOrAny();
+ }
+
+ bool isCuModeEnabled() const {
+ return EnableCuMode;
+ }
+
+ bool hasFlatAddressSpace() const {
+ return FlatAddressSpace;
+ }
+
+ bool hasFlatScrRegister() const {
+ return hasFlatAddressSpace();
+ }
+
+ bool hasFlatInstOffsets() const {
+ return FlatInstOffsets;
+ }
+
+ bool hasFlatGlobalInsts() const {
+ return FlatGlobalInsts;
+ }
+
+ bool hasFlatScratchInsts() const {
+ return FlatScratchInsts;
+ }
+
+  // Check if the target supports the ST addressing mode with FLAT scratch
+  // instructions. The ST addressing mode means no registers are used, neither
+  // VGPR nor SGPR; only the immediate offset is swizzled and added to the FLAT
+  // scratch base.
+ bool hasFlatScratchSTMode() const {
+ return hasFlatScratchInsts() && hasGFX10_3Insts();
+ }
+
+ bool hasScalarFlatScratchInsts() const {
+ return ScalarFlatScratchInsts;
+ }
+
+ bool hasGlobalAddTidInsts() const {
+ return GFX10_BEncoding;
+ }
+
+ bool hasAtomicCSub() const {
+ return GFX10_BEncoding;
+ }
+
+ bool hasMultiDwordFlatScratchAddressing() const {
+ return getGeneration() >= GFX9;
+ }
+
+ bool hasFlatSegmentOffsetBug() const {
+ return HasFlatSegmentOffsetBug;
+ }
+
+ bool hasFlatLgkmVMemCountInOrder() const {
+ return getGeneration() > GFX9;
+ }
+
+ bool hasD16LoadStore() const {
+ return getGeneration() >= GFX9;
+ }
+
+ bool d16PreservesUnusedBits() const {
+ return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
+ }
+
+ bool hasD16Images() const {
+ return getGeneration() >= VOLCANIC_ISLANDS;
+ }
+
+  /// Return true if most LDS instructions have an m0 use that requires m0 to
+  /// be initialized.
+ bool ldsRequiresM0Init() const {
+ return getGeneration() < GFX9;
+ }
+
+ // True if the hardware rewinds and replays GWS operations if a wave is
+ // preempted.
+ //
+ // If this is false, a GWS operation requires testing if a nack set the
+ // MEM_VIOL bit, and repeating if so.
+ bool hasGWSAutoReplay() const {
+ return getGeneration() >= GFX9;
+ }
+
+  /// \returns true if the target has the ds_gws_sema_release_all instruction.
+ bool hasGWSSemaReleaseAll() const {
+ return CIInsts;
+ }
+
+ /// \returns true if the target has integer add/sub instructions that do not
+ /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
+ /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
+ /// for saturation.
+ bool hasAddNoCarry() const {
+ return AddNoCarryInsts;
+ }
+
+ bool hasUnpackedD16VMem() const {
+ return HasUnpackedD16VMem;
+ }
+
+ // Covers VS/PS/CS graphics shaders
+ bool isMesaGfxShader(const Function &F) const {
+ return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
+ }
+
+ bool hasMad64_32() const {
+ return getGeneration() >= SEA_ISLANDS;
+ }
+
+ bool hasSDWAOmod() const {
+ return HasSDWAOmod;
+ }
+
+ bool hasSDWAScalar() const {
+ return HasSDWAScalar;
+ }
+
+ bool hasSDWASdst() const {
+ return HasSDWASdst;
+ }
+
+ bool hasSDWAMac() const {
+ return HasSDWAMac;
+ }
+
+ bool hasSDWAOutModsVOPC() const {
+ return HasSDWAOutModsVOPC;
+ }
+
+ bool hasDLInsts() const {
+ return HasDLInsts;
+ }
+
+ bool hasDot1Insts() const {
+ return HasDot1Insts;
+ }
+
+ bool hasDot2Insts() const {
+ return HasDot2Insts;
+ }
+
+ bool hasDot3Insts() const {
+ return HasDot3Insts;
+ }
+
+ bool hasDot4Insts() const {
+ return HasDot4Insts;
+ }
+
+ bool hasDot5Insts() const {
+ return HasDot5Insts;
+ }
+
+ bool hasDot6Insts() const {
+ return HasDot6Insts;
+ }
+
+ bool hasMAIInsts() const {
+ return HasMAIInsts;
+ }
+
+ bool hasPkFmacF16Inst() const {
+ return HasPkFmacF16Inst;
+ }
+
+ bool hasAtomicFaddInsts() const {
+ return HasAtomicFaddInsts;
+ }
+
+ bool hasNoSdstCMPX() const {
+ return HasNoSdstCMPX;
+ }
+
+ bool hasVscnt() const {
+ return HasVscnt;
+ }
+
+ bool hasGetWaveIdInst() const {
+ return HasGetWaveIdInst;
+ }
+
+ bool hasSMemTimeInst() const {
+ return HasSMemTimeInst;
+ }
+
+ bool hasRegisterBanking() const {
+ return HasRegisterBanking;
+ }
+
+ bool hasVOP3Literal() const {
+ return HasVOP3Literal;
+ }
+
+ bool hasNoDataDepHazard() const {
+ return HasNoDataDepHazard;
+ }
+
+ bool vmemWriteNeedsExpWaitcnt() const {
+ return getGeneration() < SEA_ISLANDS;
+ }
+
+  // Scratch is allocated in blocks of 256 dwords per wave for the entire
+  // wavefront. When viewed from the perspective of an arbitrary workitem, this
+  // is 4-byte aligned.
+ //
+ // Only 4-byte alignment is really needed to access anything. Transformations
+ // on the pointer value itself may rely on the alignment / known low bits of
+ // the pointer. Set this to something above the minimum to avoid needing
+ // dynamic realignment in common cases.
+ Align getStackAlignment() const { return Align(16); }
+
+ bool enableMachineScheduler() const override {
+ return true;
+ }
+
+ bool useAA() const override;
+
+ bool enableSubRegLiveness() const override {
+ return true;
+ }
+
+ void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
+ bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
+
+ // static wrappers
+ static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
+
+ // XXX - Why is this here if it isn't in the default pass set?
+ bool enableEarlyIfConversion() const override {
+ return true;
+ }
+
+ bool enableFlatScratch() const;
+
+ void overrideSchedPolicy(MachineSchedPolicy &Policy,
+ unsigned NumRegionInstrs) const override;
+
+ unsigned getMaxNumUserSGPRs() const {
+ return 16;
+ }
+
+ bool hasSMemRealTime() const {
+ return HasSMemRealTime;
+ }
+
+ bool hasMovrel() const {
+ return HasMovrel;
+ }
+
+ bool hasVGPRIndexMode() const {
+ return HasVGPRIndexMode;
+ }
+
+ bool useVGPRIndexMode() const;
+
+ bool hasScalarCompareEq64() const {
+ return getGeneration() >= VOLCANIC_ISLANDS;
+ }
+
+ bool hasScalarStores() const {
+ return HasScalarStores;
+ }
+
+ bool hasScalarAtomics() const {
+ return HasScalarAtomics;
+ }
+
+ bool hasLDSFPAtomics() const {
+ return GFX8Insts;
+ }
+
+ bool hasDPP() const {
+ return HasDPP;
+ }
+
+ bool hasDPPBroadcasts() const {
+ return HasDPP && getGeneration() < GFX10;
+ }
+
+ bool hasDPPWavefrontShifts() const {
+ return HasDPP && getGeneration() < GFX10;
+ }
+
+ bool hasDPP8() const {
+ return HasDPP8;
+ }
+
+ bool hasR128A16() const {
+ return HasR128A16;
+ }
+
+ bool hasGFX10A16() const {
+ return HasGFX10A16;
+ }
+
+ bool hasA16() const { return hasR128A16() || hasGFX10A16(); }
+
+ bool hasG16() const { return HasG16; }
+
+ bool hasOffset3fBug() const {
+ return HasOffset3fBug;
+ }
+
+ bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
+
+ bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
+
+ bool hasNSAEncoding() const { return HasNSAEncoding; }
+
+ bool hasGFX10_BEncoding() const {
+ return GFX10_BEncoding;
+ }
+
+ bool hasGFX10_3Insts() const {
+ return GFX10_3Insts;
+ }
+
+ bool hasMadF16() const;
+
+ bool enableSIScheduler() const {
+ return EnableSIScheduler;
+ }
+
+ bool loadStoreOptEnabled() const {
+ return EnableLoadStoreOpt;
+ }
+
+ bool hasSGPRInitBug() const {
+ return SGPRInitBug;
+ }
+
+ bool hasMFMAInlineLiteralBug() const {
+ return HasMFMAInlineLiteralBug;
+ }
+
+ bool has12DWordStoreHazard() const {
+ return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
+ }
+
+  /// \returns true if the subtarget supports DWORDX3 load/store instructions.
+ bool hasDwordx3LoadStores() const {
+ return CIInsts;
+ }
+
+ bool hasReadM0MovRelInterpHazard() const {
+ return getGeneration() == AMDGPUSubtarget::GFX9;
+ }
+
+ bool hasReadM0SendMsgHazard() const {
+ return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+ getGeneration() <= AMDGPUSubtarget::GFX9;
+ }
+
+ bool hasVcmpxPermlaneHazard() const {
+ return HasVcmpxPermlaneHazard;
+ }
+
+ bool hasVMEMtoScalarWriteHazard() const {
+ return HasVMEMtoScalarWriteHazard;
+ }
+
+ bool hasSMEMtoVectorWriteHazard() const {
+ return HasSMEMtoVectorWriteHazard;
+ }
+
+ bool hasLDSMisalignedBug() const {
+ return LDSMisalignedBug && !EnableCuMode;
+ }
+
+ bool hasInstFwdPrefetchBug() const {
+ return HasInstFwdPrefetchBug;
+ }
+
+ bool hasVcmpxExecWARHazard() const {
+ return HasVcmpxExecWARHazard;
+ }
+
+ bool hasLdsBranchVmemWARHazard() const {
+ return HasLdsBranchVmemWARHazard;
+ }
+
+ bool hasNSAtoVMEMBug() const {
+ return HasNSAtoVMEMBug;
+ }
+
+ bool hasHardClauses() const { return getGeneration() >= GFX10; }
+
+  /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
+  /// SGPRs.
+ unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
+
+  /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
+  /// VGPRs.
+ unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
+
+  /// Return the occupancy for the given function, taking into account the used
+  /// LDS and the number of registers, if provided.
+  /// Note, occupancy can also be affected by the scratch allocation, but we do
+  /// not have enough information to compute it.
+ unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
+ unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
+
+ /// \returns true if the flat_scratch register should be initialized with the
+ /// pointer to the wave's scratch memory rather than a size and offset.
+ bool flatScratchIsPointer() const {
+ return getGeneration() >= AMDGPUSubtarget::GFX9;
+ }
+
+ /// \returns true if the machine has merged shaders in which s0-s7 are
+ /// reserved by the hardware and user SGPRs start at s8
+ bool hasMergedShaders() const {
+ return getGeneration() >= GFX9;
+ }
+
+ /// \returns SGPR allocation granularity supported by the subtarget.
+ unsigned getSGPRAllocGranule() const {
+ return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
+ }
+
+ /// \returns SGPR encoding granularity supported by the subtarget.
+ unsigned getSGPREncodingGranule() const {
+ return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
+ }
+
+ /// \returns Total number of SGPRs supported by the subtarget.
+ unsigned getTotalNumSGPRs() const {
+ return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
+ }
+
+ /// \returns Addressable number of SGPRs supported by the subtarget.
+ unsigned getAddressableNumSGPRs() const {
+ return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
+ }
+
+ /// \returns Minimum number of SGPRs that meets the given number of waves per
+ /// execution unit requirement supported by the subtarget.
+ unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
+ return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
+ }
+
+ /// \returns Maximum number of SGPRs that meets the given number of waves per
+ /// execution unit requirement supported by the subtarget.
+ unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
+ return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
+ }
+
+ /// \returns Reserved number of SGPRs for given function \p MF.
+ unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
+
+  /// \returns Maximum number of SGPRs that meets the number of waves per
+  /// execution unit requirement for function \p MF, or the number of SGPRs
+  /// explicitly requested using the "amdgpu-num-sgpr" attribute attached to
+  /// function \p MF.
+  ///
+  /// \returns Value that meets the number of waves per execution unit
+  /// requirement if the explicitly requested value cannot be converted to an
+  /// integer, violates the subtarget's specifications, or does not meet the
+  /// number of waves per execution unit requirement.
+ unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
+
+ /// \returns VGPR allocation granularity supported by the subtarget.
+ unsigned getVGPRAllocGranule() const {
+ return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
+ }
+
+ /// \returns VGPR encoding granularity supported by the subtarget.
+ unsigned getVGPREncodingGranule() const {
+ return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
+ }
+
+ /// \returns Total number of VGPRs supported by the subtarget.
+ unsigned getTotalNumVGPRs() const {
+ return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
+ }
+
+ /// \returns Addressable number of VGPRs supported by the subtarget.
+ unsigned getAddressableNumVGPRs() const {
+ return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
+ }
+
+  /// \returns Minimum number of VGPRs that meets the given number of waves per
+ /// execution unit requirement supported by the subtarget.
+ unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
+ return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
+ }
+
+  /// \returns Maximum number of VGPRs that meets the given number of waves per
+ /// execution unit requirement supported by the subtarget.
+ unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
+ return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
+ }
+
+  /// \returns Maximum number of VGPRs that meets the number of waves per
+  /// execution unit requirement for function \p MF, or the number of VGPRs
+  /// explicitly requested using the "amdgpu-num-vgpr" attribute attached to
+  /// function \p MF.
+  ///
+  /// \returns Value that meets the number of waves per execution unit
+  /// requirement if the explicitly requested value cannot be converted to an
+  /// integer, violates the subtarget's specifications, or does not meet the
+  /// number of waves per execution unit requirement.
+ unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
+
+ void getPostRAMutations(
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
+ const override;
+
+ bool isWave32() const {
+ return getWavefrontSize() == 32;
+ }
+
+ bool isWave64() const {
+ return getWavefrontSize() == 64;
+ }
+
+ const TargetRegisterClass *getBoolRC() const {
+ return getRegisterInfo()->getBoolRC();
+ }
+
+ /// \returns Maximum number of work groups per compute unit supported by the
+ /// subtarget and limited by given \p FlatWorkGroupSize.
+ unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
+ return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
+ }
+
+ /// \returns Minimum flat work group size supported by the subtarget.
+ unsigned getMinFlatWorkGroupSize() const override {
+ return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
+ }
+
+ /// \returns Maximum flat work group size supported by the subtarget.
+ unsigned getMaxFlatWorkGroupSize() const override {
+ return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
+ }
+
+ /// \returns Number of waves per execution unit required to support the given
+ /// \p FlatWorkGroupSize.
+ unsigned
+ getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
+ return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
+ }
+
+ /// \returns Minimum number of waves per execution unit supported by the
+ /// subtarget.
+ unsigned getMinWavesPerEU() const override {
+ return AMDGPU::IsaInfo::getMinWavesPerEU(this);
+ }
+
+ void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
+ SDep &Dep) const override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/InstCombineTables.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/InstCombineTables.td
new file mode 100644
index 000000000000..98b2adc442fa
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/InstCombineTables.td
@@ -0,0 +1,11 @@
+include "llvm/TableGen/SearchableTable.td"
+include "llvm/IR/Intrinsics.td"
+
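+// Searchable table over the AMDGPUImageDMaskIntrinsic class, keyed by
+// intrinsic ID and queried through getAMDGPUImageDMaskIntrinsic().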
+def AMDGPUImageDMaskIntrinsicTable : GenericTable {
+ let FilterClass = "AMDGPUImageDMaskIntrinsic";
+ let Fields = ["Intr"];
+
+ let PrimaryKey = ["Intr"];
+ let PrimaryKeyName = "getAMDGPUImageDMaskIntrinsic";
+ let PrimaryKeyEarlyOut = 1;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index ea6e9038fd1e..dd0db6c7b655 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -9,17 +9,14 @@
#include "MCTargetDesc/AMDGPUFixupKinds.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/BinaryFormat/ELF.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCValue.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/TargetRegistry.h"
-#include "Utils/AMDGPUBaseInfo.h"
using namespace llvm;
using namespace llvm::AMDGPU;
@@ -61,7 +58,6 @@ void AMDGPUAsmBackend::relaxInstruction(MCInst &Inst,
Res.setOpcode(RelaxedOpcode);
Res.addOperand(Inst.getOperand(0));
Inst = std::move(Res);
- return;
}
bool AMDGPUAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
@@ -237,7 +233,6 @@ MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T,
const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options) {
- // Use 64-bit ELF for amdgcn
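+  // The backend's ABI version now comes from the subtarget's HSA ABI version,
+  // defaulting to 0 when none is reported.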
return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple(),
- IsaInfo::hasCodeObjectV3(&STI) ? 1 : 0);
+ getHsaAbiVersion(&STI).getValueOr(0));
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 619fde74e88d..426648d19d55 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -8,15 +8,9 @@
#include "AMDGPUFixupKinds.h"
#include "AMDGPUMCTargetDesc.h"
-#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFObjectWriter.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCFixup.h"
-#include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
index 40437d8fa1a4..1ce7012040da 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
@@ -7,10 +7,9 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUELFStreamer.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCObjectWriter.h"
using namespace llvm;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
index 9fbf53c944ef..b56f75132135 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
@@ -14,13 +14,15 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUELFSTREAMER_H
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUELFSTREAMER_H
-#include "llvm/MC/MCELFStreamer.h"
-
+#include <memory>
namespace llvm {
class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCSubtargetInfo;
+class MCELFStreamer;
+class Triple;
+class MCObjectWriter;
MCELFStreamer *createAMDGPUELFStreamer(const Triple &T, MCContext &Context,
std::unique_ptr<MCAsmBackend> MAB,
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index fe063d33ea3e..fbf7dc2a72db 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -16,13 +16,9 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
+#include "llvm/Support/TargetParser.h"
using namespace llvm;
using namespace llvm::AMDGPU;
@@ -136,7 +132,7 @@ void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
uint16_t Imm = MI->getOperand(OpNo).getImm();
if (Imm != 0) {
- O << ((OpNo == 0)? "offset:" : " offset:");
+ O << " offset:";
printU16ImmDecOperand(MI, OpNo, O);
}
}
@@ -146,15 +142,16 @@ void AMDGPUInstPrinter::printFlatOffset(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
uint16_t Imm = MI->getOperand(OpNo).getImm();
if (Imm != 0) {
- O << ((OpNo == 0)? "offset:" : " offset:");
+ O << " offset:";
const MCInstrDesc &Desc = MII.get(MI->getOpcode());
- bool IsFlatSeg = !(Desc.TSFlags & SIInstrFlags::IsNonFlatSeg);
+ bool IsFlatSeg = !(Desc.TSFlags &
+ (SIInstrFlags::IsFlatGlobal | SIInstrFlags::IsFlatScratch));
if (IsFlatSeg) { // Unsigned offset
printU16ImmDecOperand(MI, OpNo, O);
} else { // Signed offset
- if (AMDGPU::isGFX10(STI)) {
+ if (AMDGPU::isGFX10Plus(STI)) {
O << formatDec(SignExtend32<12>(MI->getOperand(OpNo).getImm()));
} else {
O << formatDec(SignExtend32<13>(MI->getOperand(OpNo).getImm()));
@@ -206,7 +203,7 @@ void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printDLC(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
- if (AMDGPU::isGFX10(STI))
+ if (AMDGPU::isGFX10Plus(STI))
printNamedBit(MI, OpNo, O, "dlc");
}
@@ -285,26 +282,58 @@ void AMDGPUInstPrinter::printD16(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printExpCompr(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- if (MI->getOperand(OpNo).getImm())
- O << " compr";
+ printNamedBit(MI, OpNo, O, "compr");
}
void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- if (MI->getOperand(OpNo).getImm())
- O << " vm";
+ printNamedBit(MI, OpNo, O, "vm");
}
void AMDGPUInstPrinter::printFORMAT(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- if (unsigned Val = MI->getOperand(OpNo).getImm()) {
- if (AMDGPU::isGFX10(STI))
+}
+
+void AMDGPUInstPrinter::printSymbolicFormat(const MCInst *MI,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ using namespace llvm::AMDGPU::MTBUFFormat;
+
+ int OpNo =
+ AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::format);
+ assert(OpNo != -1);
+
+ unsigned Val = MI->getOperand(OpNo).getImm();
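+  // GFX10+ uses a single unified format value; earlier targets encode separate
+  // dfmt and nfmt fields.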
+ if (AMDGPU::isGFX10Plus(STI)) {
+ if (Val == UFMT_DEFAULT)
+ return;
+ if (isValidUnifiedFormat(Val)) {
+ O << " format:[" << getUnifiedFormatName(Val) << ']';
+ } else {
+ O << " format:" << Val;
+ }
+ } else {
+ if (Val == DFMT_NFMT_DEFAULT)
+ return;
+ if (isValidDfmtNfmt(Val, STI)) {
+ unsigned Dfmt;
+ unsigned Nfmt;
+ decodeDfmtNfmt(Val, Dfmt, Nfmt);
+ O << " format:[";
+ if (Dfmt != DFMT_DEFAULT) {
+ O << getDfmtName(Dfmt);
+ if (Nfmt != NFMT_DEFAULT) {
+ O << ',';
+ }
+ }
+ if (Nfmt != NFMT_DEFAULT) {
+ O << getNfmtName(Nfmt, STI);
+ }
+ O << ']';
+ } else {
O << " format:" << Val;
- else {
- O << " dfmt:" << (Val & 15);
- O << ", nfmt:" << (Val >> 4);
}
}
}
@@ -382,10 +411,12 @@ void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm,
const MCSubtargetInfo &STI,
raw_ostream &O) {
int16_t SImm = static_cast<int16_t>(Imm);
- if (isInlinableIntLiteral(SImm))
+ if (isInlinableIntLiteral(SImm)) {
O << SImm;
- else
- O << formatHex(static_cast<uint64_t>(Imm));
+ } else {
+ uint64_t Imm16 = static_cast<uint16_t>(Imm);
+ O << formatHex(Imm16);
+ }
}
void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
@@ -413,11 +444,13 @@ void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
O<< "4.0";
else if (Imm == 0xC400)
O<< "-4.0";
- else if (Imm == 0x3118) {
- assert(STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]);
+ else if (Imm == 0x3118 &&
+ STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]) {
O << "0.15915494";
- } else
- O << formatHex(static_cast<uint64_t>(Imm));
+ } else {
+ uint64_t Imm16 = static_cast<uint16_t>(Imm);
+ O << formatHex(Imm16);
+ }
}
void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm,
@@ -669,6 +702,14 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
printDefaultVccOperand(OpNo, STI, O);
break;
}
+
+ if (Desc.TSFlags & SIInstrFlags::MTBUF) {
+ int SOffsetIdx =
+ AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::soffset);
+ assert(SOffsetIdx != -1);
+ if ((int)OpNo == SOffsetIdx)
+ printSymbolicFormat(MI, STI, O);
+ }
}
void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI,
@@ -735,11 +776,11 @@ void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI,
void AMDGPUInstPrinter::printDPP8(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- if (!AMDGPU::isGFX10(STI))
+ if (!AMDGPU::isGFX10Plus(STI))
llvm_unreachable("dpp8 is not supported on ASICs earlier than GFX10");
unsigned Imm = MI->getOperand(OpNo).getImm();
- O << " dpp8:[" << formatDec(Imm & 0x7);
+ O << "dpp8:[" << formatDec(Imm & 0x7);
for (size_t i = 1; i < 8; ++i) {
O << ',' << formatDec((Imm >> (3 * i)) & 0x7);
}
@@ -753,81 +794,81 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
unsigned Imm = MI->getOperand(OpNo).getImm();
if (Imm <= DppCtrl::QUAD_PERM_LAST) {
- O << " quad_perm:[";
+ O << "quad_perm:[";
O << formatDec(Imm & 0x3) << ',';
O << formatDec((Imm & 0xc) >> 2) << ',';
O << formatDec((Imm & 0x30) >> 4) << ',';
O << formatDec((Imm & 0xc0) >> 6) << ']';
} else if ((Imm >= DppCtrl::ROW_SHL_FIRST) &&
(Imm <= DppCtrl::ROW_SHL_LAST)) {
- O << " row_shl:";
+ O << "row_shl:";
printU4ImmDecOperand(MI, OpNo, O);
} else if ((Imm >= DppCtrl::ROW_SHR_FIRST) &&
(Imm <= DppCtrl::ROW_SHR_LAST)) {
- O << " row_shr:";
+ O << "row_shr:";
printU4ImmDecOperand(MI, OpNo, O);
} else if ((Imm >= DppCtrl::ROW_ROR_FIRST) &&
(Imm <= DppCtrl::ROW_ROR_LAST)) {
- O << " row_ror:";
+ O << "row_ror:";
printU4ImmDecOperand(MI, OpNo, O);
} else if (Imm == DppCtrl::WAVE_SHL1) {
- if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
- O << " /* wave_shl is not supported starting from GFX10 */";
+ if (AMDGPU::isGFX10Plus(STI)) {
+ O << "/* wave_shl is not supported starting from GFX10 */";
return;
}
- O << " wave_shl:1";
+ O << "wave_shl:1";
} else if (Imm == DppCtrl::WAVE_ROL1) {
- if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
- O << " /* wave_rol is not supported starting from GFX10 */";
+ if (AMDGPU::isGFX10Plus(STI)) {
+ O << "/* wave_rol is not supported starting from GFX10 */";
return;
}
- O << " wave_rol:1";
+ O << "wave_rol:1";
} else if (Imm == DppCtrl::WAVE_SHR1) {
- if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
- O << " /* wave_shr is not supported starting from GFX10 */";
+ if (AMDGPU::isGFX10Plus(STI)) {
+ O << "/* wave_shr is not supported starting from GFX10 */";
return;
}
- O << " wave_shr:1";
+ O << "wave_shr:1";
} else if (Imm == DppCtrl::WAVE_ROR1) {
- if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
- O << " /* wave_ror is not supported starting from GFX10 */";
+ if (AMDGPU::isGFX10Plus(STI)) {
+ O << "/* wave_ror is not supported starting from GFX10 */";
return;
}
- O << " wave_ror:1";
+ O << "wave_ror:1";
} else if (Imm == DppCtrl::ROW_MIRROR) {
- O << " row_mirror";
+ O << "row_mirror";
} else if (Imm == DppCtrl::ROW_HALF_MIRROR) {
- O << " row_half_mirror";
+ O << "row_half_mirror";
} else if (Imm == DppCtrl::BCAST15) {
- if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
- O << " /* row_bcast is not supported starting from GFX10 */";
+ if (AMDGPU::isGFX10Plus(STI)) {
+ O << "/* row_bcast is not supported starting from GFX10 */";
return;
}
- O << " row_bcast:15";
+ O << "row_bcast:15";
} else if (Imm == DppCtrl::BCAST31) {
- if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
- O << " /* row_bcast is not supported starting from GFX10 */";
+ if (AMDGPU::isGFX10Plus(STI)) {
+ O << "/* row_bcast is not supported starting from GFX10 */";
return;
}
- O << " row_bcast:31";
+ O << "row_bcast:31";
} else if ((Imm >= DppCtrl::ROW_SHARE_FIRST) &&
(Imm <= DppCtrl::ROW_SHARE_LAST)) {
- if (!AMDGPU::isGFX10(STI)) {
- O << " /* row_share is not supported on ASICs earlier than GFX10 */";
+ if (!AMDGPU::isGFX10Plus(STI)) {
+ O << "/* row_share is not supported on ASICs earlier than GFX10 */";
return;
}
- O << " row_share:";
+ O << "row_share:";
printU4ImmDecOperand(MI, OpNo, O);
} else if ((Imm >= DppCtrl::ROW_XMASK_FIRST) &&
(Imm <= DppCtrl::ROW_XMASK_LAST)) {
- if (!AMDGPU::isGFX10(STI)) {
- O << " /* row_xmask is not supported on ASICs earlier than GFX10 */";
+ if (!AMDGPU::isGFX10Plus(STI)) {
+ O << "/* row_xmask is not supported on ASICs earlier than GFX10 */";
return;
}
O << "row_xmask:";
printU4ImmDecOperand(MI, OpNo, O);
} else {
- O << " /* Invalid dpp_ctrl value */";
+ O << "/* Invalid dpp_ctrl value */";
}
}
@@ -917,10 +958,9 @@ void AMDGPUInstPrinter::printSDWADstUnused(const MCInst *MI, unsigned OpNo,
}
}
-template <unsigned N>
void AMDGPUInstPrinter::printExpSrcN(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O,
+ unsigned N) {
unsigned Opc = MI->getOpcode();
int EnIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::en);
unsigned En = MI->getOperand(EnIdx).getImm();
@@ -928,12 +968,8 @@ void AMDGPUInstPrinter::printExpSrcN(const MCInst *MI, unsigned OpNo,
int ComprIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::compr);
// If compr is set, print as src0, src0, src1, src1
- if (MI->getOperand(ComprIdx).getImm()) {
- if (N == 1 || N == 2)
- --OpNo;
- else if (N == 3)
- OpNo -= 2;
- }
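+  // With compr the sources are packed as src0, src0, src1, src1, so source N
+  // reads the operand at offset N/2 from the base.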
+ if (MI->getOperand(ComprIdx).getImm())
+ OpNo = OpNo - N + N / 2;
if (En & (1 << N))
printRegOperand(MI->getOperand(OpNo).getReg(), O, MRI);
@@ -944,48 +980,43 @@ void AMDGPUInstPrinter::printExpSrcN(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printExpSrc0(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- printExpSrcN<0>(MI, OpNo, STI, O);
+ printExpSrcN(MI, OpNo, STI, O, 0);
}
void AMDGPUInstPrinter::printExpSrc1(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- printExpSrcN<1>(MI, OpNo, STI, O);
+ printExpSrcN(MI, OpNo, STI, O, 1);
}
void AMDGPUInstPrinter::printExpSrc2(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- printExpSrcN<2>(MI, OpNo, STI, O);
+ printExpSrcN(MI, OpNo, STI, O, 2);
}
void AMDGPUInstPrinter::printExpSrc3(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- printExpSrcN<3>(MI, OpNo, STI, O);
+ printExpSrcN(MI, OpNo, STI, O, 3);
}
void AMDGPUInstPrinter::printExpTgt(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
+ using namespace llvm::AMDGPU::Exp;
+
// This is really a 6 bit field.
- uint32_t Tgt = MI->getOperand(OpNo).getImm() & ((1 << 6) - 1);
-
- if (Tgt <= 7)
- O << " mrt" << Tgt;
- else if (Tgt == 8)
- O << " mrtz";
- else if (Tgt == 9)
- O << " null";
- else if ((Tgt >= 12 && Tgt <= 15) || (Tgt == 16 && AMDGPU::isGFX10(STI)))
- O << " pos" << Tgt - 12;
- else if (AMDGPU::isGFX10(STI) && Tgt == 20)
- O << " prim";
- else if (Tgt >= 32 && Tgt <= 63)
- O << " param" << Tgt - 32;
- else {
- // Reserved values 10, 11
- O << " invalid_target_" << Tgt;
+ unsigned Id = MI->getOperand(OpNo).getImm() & ((1 << 6) - 1);
+
+ int Index;
+ StringRef TgtName;
+ if (getTgtName(Id, TgtName, Index) && isSupportedTgtId(Id, STI)) {
+ O << ' ' << TgtName;
+ if (Index >= 0)
+ O << Index;
+ } else {
+ O << " invalid_target_" << Id;
}
}
@@ -1124,9 +1155,9 @@ void AMDGPUInstPrinter::printVGPRIndexMode(const MCInst *MI, unsigned OpNo,
unsigned Val = MI->getOperand(OpNo).getImm();
if ((Val & ~ENABLE_MASK) != 0) {
- O << " " << formatHex(static_cast<uint64_t>(Val));
+ O << formatHex(static_cast<uint64_t>(Val));
} else {
- O << " gpr_idx(";
+ O << "gpr_idx(";
bool NeedComma = false;
for (unsigned ModeId = ID_MIN; ModeId <= ID_MAX; ++ModeId) {
if (Val & (1 << ModeId)) {
@@ -1171,15 +1202,13 @@ void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printHigh(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- if (MI->getOperand(OpNo).getImm())
- O << " high";
+ printNamedBit(MI, OpNo, O, "high");
}
void AMDGPUInstPrinter::printClampSI(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- if (MI->getOperand(OpNo).getImm())
- O << " clamp";
+ printNamedBit(MI, OpNo, O, "clamp");
}
void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo,
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index 6dfd23ea72e6..8d13aa682211 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -24,6 +24,7 @@ public:
//Autogenerated by tblgen
void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
void printInstruction(const MCInst *MI, uint64_t Address,
const MCSubtargetInfo &STI, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
@@ -99,6 +100,8 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printFORMAT(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printSymbolicFormat(const MCInst *MI,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printRegOperand(unsigned RegNo, raw_ostream &O);
void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -109,8 +112,6 @@ private:
raw_ostream &O);
void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printImmediateIntV216(uint32_t Imm, const MCSubtargetInfo &STI,
- raw_ostream &O);
void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
@@ -178,10 +179,8 @@ private:
void printDefaultVccOperand(unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
-
- template <unsigned N>
- void printExpSrcN(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O);
+ void printExpSrcN(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O, unsigned N);
void printExpSrc0(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printExpSrc1(const MCInst *MI, unsigned OpNo,
@@ -253,6 +252,7 @@ public:
void printInst(const MCInst *MI, uint64_t Address, StringRef Annot,
const MCSubtargetInfo &STI, raw_ostream &O) override;
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 687cfef4559f..1836237c8df5 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -40,7 +40,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT,
HasAggressiveSymbolFolding = true;
COMMDirectiveAlignmentIsInBytes = false;
HasNoDeadStrip = true;
- WeakRefDirective = ".weakref\t";
//===--- Dwarf Emission Directives -----------------------------------===//
SupportsDebugInformation = true;
DwarfRegNumForCFI = true;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
index d7d8c8181b02..1a7ca7e1a330 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -15,7 +15,7 @@
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
namespace llvm {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 7d3235efc59e..34b2cd1fc1e4 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -20,15 +20,15 @@
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInstPrinter.h"
#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCRegister.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MachineLocation.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -74,8 +74,8 @@ MCRegisterInfo *llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour) {
static MCSubtargetInfo *
createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
if (TT.getArch() == Triple::r600)
- return createR600MCSubtargetInfoImpl(TT, CPU, FS);
- return createAMDGPUMCSubtargetInfoImpl(TT, CPU, FS);
+ return createR600MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS);
+ return createAMDGPUMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS);
}
static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T,
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index b9cdbc6502e5..71b44a509108 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -15,8 +15,6 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCTARGETDESC_H
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCTARGETDESC_H
-#include "llvm/Support/DataTypes.h"
-
#include <memory>
namespace llvm {
@@ -33,7 +31,7 @@ class Target;
class Triple;
class raw_pwrite_stream;
-enum AMDGPUDwarfFlavour { Wave64 = 0, Wave32 = 1 };
+enum AMDGPUDwarfFlavour : unsigned { Wave64 = 0, Wave32 = 1 };
MCRegisterInfo *createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour);
@@ -58,34 +56,24 @@ createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
#define GET_REGINFO_ENUM
#include "AMDGPUGenRegisterInfo.inc"
-#undef GET_REGINFO_ENUM
#define GET_REGINFO_ENUM
#include "R600GenRegisterInfo.inc"
-#undef GET_REGINFO_ENUM
#define GET_INSTRINFO_ENUM
#define GET_INSTRINFO_OPERAND_ENUM
#define GET_INSTRINFO_SCHED_ENUM
#include "AMDGPUGenInstrInfo.inc"
-#undef GET_INSTRINFO_SCHED_ENUM
-#undef GET_INSTRINFO_OPERAND_ENUM
-#undef GET_INSTRINFO_ENUM
#define GET_INSTRINFO_ENUM
#define GET_INSTRINFO_OPERAND_ENUM
#define GET_INSTRINFO_SCHED_ENUM
#include "R600GenInstrInfo.inc"
-#undef GET_INSTRINFO_SCHED_ENUM
-#undef GET_INSTRINFO_OPERAND_ENUM
-#undef GET_INSTRINFO_ENUM
#define GET_SUBTARGETINFO_ENUM
#include "AMDGPUGenSubtargetInfo.inc"
-#undef GET_SUBTARGETINFO_ENUM
#define GET_SUBTARGETINFO_ENUM
#include "R600GenSubtargetInfo.inc"
-#undef GET_SUBTARGETINFO_ENUM
#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 3d202d7960d6..f0eb11b70c97 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -11,31 +11,21 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUTargetStreamer.h"
-#include "AMDGPU.h"
-#include "SIDefines.h"
+#include "AMDGPUPTNote.h"
+#include "AMDKernelCodeT.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDKernelCodeTUtils.h"
-#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFStreamer.h"
-#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/AMDGPUMetadata.h"
+#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/TargetParser.h"
-
-namespace llvm {
-#include "AMDGPUPTNote.h"
-}
using namespace llvm;
using namespace llvm::AMDGPU;
-using namespace llvm::AMDGPU::HSAMD;
//===----------------------------------------------------------------------===//
// AMDGPUTargetStreamer
@@ -43,9 +33,8 @@ using namespace llvm::AMDGPU::HSAMD;
bool AMDGPUTargetStreamer::EmitHSAMetadataV2(StringRef HSAMetadataString) {
HSAMD::Metadata HSAMetadata;
- if (HSAMD::fromString(std::string(HSAMetadataString), HSAMetadata))
+ if (HSAMD::fromString(HSAMetadataString, HSAMetadata))
return false;
-
return EmitHSAMetadata(HSAMetadata);
}
@@ -79,14 +68,17 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
case ELF::EF_AMDGPU_MACH_R600_TURKS: AK = GK_TURKS; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX600: AK = GK_GFX600; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX601: AK = GK_GFX601; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX602: AK = GK_GFX602; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX700: AK = GK_GFX700; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX701: AK = GK_GFX701; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX702: AK = GK_GFX702; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX703: AK = GK_GFX703; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX704: AK = GK_GFX704; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX705: AK = GK_GFX705; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX801: AK = GK_GFX801; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX802: AK = GK_GFX802; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX803: AK = GK_GFX803; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX805: AK = GK_GFX805; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX810: AK = GK_GFX810; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX900: AK = GK_GFX900; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902: AK = GK_GFX902; break;
@@ -94,10 +86,14 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX908: AK = GK_GFX908; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: AK = GK_GFX90C; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1030: AK = GK_GFX1030; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1031: AK = GK_GFX1031; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1032: AK = GK_GFX1032; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1033: AK = GK_GFX1033; break;
case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break;
}
@@ -131,14 +127,17 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
case GK_TURKS: return ELF::EF_AMDGPU_MACH_R600_TURKS;
case GK_GFX600: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX600;
case GK_GFX601: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX601;
+ case GK_GFX602: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX602;
case GK_GFX700: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX700;
case GK_GFX701: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX701;
case GK_GFX702: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX702;
case GK_GFX703: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX703;
case GK_GFX704: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX704;
+ case GK_GFX705: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX705;
case GK_GFX801: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX801;
case GK_GFX802: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX802;
case GK_GFX803: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX803;
+ case GK_GFX805: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX805;
case GK_GFX810: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX810;
case GK_GFX900: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX900;
case GK_GFX902: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX902;
@@ -146,10 +145,14 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
case GK_GFX906: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX906;
case GK_GFX908: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX908;
case GK_GFX909: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909;
+ case GK_GFX90C: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C;
case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010;
case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011;
case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012;
case GK_GFX1030: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1030;
+ case GK_GFX1031: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1031;
+ case GK_GFX1032: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1032;
+ case GK_GFX1033: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1033;
case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE;
}
@@ -166,10 +169,15 @@ AMDGPUTargetAsmStreamer::AMDGPUTargetAsmStreamer(MCStreamer &S,
// A hook for emitting stuff at the end.
// We use it for emitting the accumulated PAL metadata as directives.
+// The PAL metadata is reset after it is emitted.
void AMDGPUTargetAsmStreamer::finish() {
std::string S;
getPALMetadata()->toString(S);
OS << S;
+
+  // Reset the PAL metadata so its data will not affect a compilation that
+ // reuses this object.
+ getPALMetadata()->reset();
}
void AMDGPUTargetAsmStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {
@@ -228,15 +236,15 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
if (HSAMD::toString(HSAMetadata, HSAMetadataString))
return false;
- OS << '\t' << AssemblerDirectiveBegin << '\n';
+ OS << '\t' << HSAMD::AssemblerDirectiveBegin << '\n';
OS << HSAMetadataString << '\n';
- OS << '\t' << AssemblerDirectiveEnd << '\n';
+ OS << '\t' << HSAMD::AssemblerDirectiveEnd << '\n';
return true;
}
bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
msgpack::Document &HSAMetadataDoc, bool Strict) {
- V3::MetadataVerifier Verifier(Strict);
+ HSAMD::V3::MetadataVerifier Verifier(Strict);
if (!Verifier.verify(HSAMetadataDoc.getRoot()))
return false;
@@ -244,9 +252,9 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
raw_string_ostream StrOS(HSAMetadataString);
HSAMetadataDoc.toYAML(StrOS);
- OS << '\t' << V3::AssemblerDirectiveBegin << '\n';
+ OS << '\t' << HSAMD::V3::AssemblerDirectiveBegin << '\n';
OS << StrOS.str() << '\n';
- OS << '\t' << V3::AssemblerDirectiveEnd << '\n';
+ OS << '\t' << HSAMD::V3::AssemblerDirectiveEnd << '\n';
return true;
}
@@ -302,7 +310,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PRINT_FIELD(
OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", KD,
compute_pgm_rsrc2,
- amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_PRIVATE_SEGMENT_WAVEFRONT_OFFSET);
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD,
compute_pgm_rsrc2,
amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
@@ -421,6 +429,7 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
// A hook for emitting stuff at the end.
// We use it for emitting the accumulated PAL metadata as a .note record.
+// The PAL metadata is reset after it is emitted.
void AMDGPUTargetELFStreamer::finish() {
std::string Blob;
const char *Vendor = getPALMetadata()->getVendor();
@@ -430,6 +439,10 @@ void AMDGPUTargetELFStreamer::finish() {
return;
EmitNote(Vendor, MCConstantExpr::create(Blob.size(), getContext()), Type,
[&](MCELFStreamer &OS) { OS.emitBytes(Blob); });
+
+  // Reset the PAL metadata so its data will not affect a compilation that
+ // reuses this object.
+ getPALMetadata()->reset();
}
void AMDGPUTargetELFStreamer::EmitNote(
@@ -554,7 +567,7 @@ bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) {
bool AMDGPUTargetELFStreamer::EmitHSAMetadata(msgpack::Document &HSAMetadataDoc,
bool Strict) {
- V3::MetadataVerifier Verifier(Strict);
+ HSAMD::V3::MetadataVerifier Verifier(Strict);
if (!Verifier.verify(HSAMetadataDoc.getRoot()))
return false;
@@ -644,9 +657,10 @@ void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
KernelCodeSymbol->setVisibility(ELF::STV_PROTECTED);
Streamer.emitLabel(KernelDescriptorSymbol);
- Streamer.emitBytes(StringRef(
- (const char*)&(KernelDescriptor),
- offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset)));
+ Streamer.emitInt32(KernelDescriptor.group_segment_fixed_size);
+ Streamer.emitInt32(KernelDescriptor.private_segment_fixed_size);
+ for (uint8_t Res : KernelDescriptor.reserved0)
+ Streamer.emitInt8(Res);
// FIXME: Remove the use of VK_AMDGPU_REL64 in the expression below. The
// expression being created is:
// (start of kernel code) - (start of kernel descriptor)
@@ -658,11 +672,12 @@ void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
KernelDescriptorSymbol, MCSymbolRefExpr::VK_None, Context),
Context),
sizeof(KernelDescriptor.kernel_code_entry_byte_offset));
- Streamer.emitBytes(StringRef(
- (const char*)&(KernelDescriptor) +
- offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset) +
- sizeof(KernelDescriptor.kernel_code_entry_byte_offset),
- sizeof(KernelDescriptor) -
- offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset) -
- sizeof(KernelDescriptor.kernel_code_entry_byte_offset)));
+ for (uint8_t Res : KernelDescriptor.reserved1)
+ Streamer.emitInt8(Res);
+ Streamer.emitInt32(KernelDescriptor.compute_pgm_rsrc3);
+ Streamer.emitInt32(KernelDescriptor.compute_pgm_rsrc1);
+ Streamer.emitInt32(KernelDescriptor.compute_pgm_rsrc2);
+ Streamer.emitInt16(KernelDescriptor.kernel_code_properties);
+ for (uint8_t Res : KernelDescriptor.reserved2)
+ Streamer.emitInt8(Res);
}
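
The hunk above stops emitting the first part of the kernel descriptor as a raw dump of the in-memory struct and instead writes each field through the streamer's sized integer emitters (emitInt32/emitInt16/emitInt8). As a rough standalone illustration of why explicit, field-by-field little-endian writes are more robust than copying struct bytes (no dependence on host endianness or compiler padding), here is a plain C++ sketch; the struct layout and helper names are illustrative only and are not taken from the patch.

    #include <cstdint>
    #include <vector>

    // Cut-down, illustrative descriptor; not the real amdhsa layout.
    struct DemoKernelDescriptor {
      uint32_t group_segment_fixed_size;
      uint32_t private_segment_fixed_size;
      uint8_t  reserved0[8];
      uint32_t compute_pgm_rsrc1;
      uint16_t kernel_code_properties;
    };

    // Append V to Out as Bytes little-endian bytes, the way emitInt32/emitInt16
    // pin down the emitted layout regardless of the host's byte order.
    static void emitLE(std::vector<uint8_t> &Out, uint64_t V, unsigned Bytes) {
      for (unsigned I = 0; I < Bytes; ++I)
        Out.push_back(static_cast<uint8_t>(V >> (8 * I)));
    }

    std::vector<uint8_t> serialize(const DemoKernelDescriptor &KD) {
      std::vector<uint8_t> Out;
      emitLE(Out, KD.group_segment_fixed_size, 4);
      emitLE(Out, KD.private_segment_fixed_size, 4);
      for (uint8_t R : KD.reserved0)   // reserved bytes go out one at a time
        Out.push_back(R);
      emitLE(Out, KD.compute_pgm_rsrc1, 4);
      emitLE(Out, KD.kernel_code_properties, 2);
      return Out;                      // 22 bytes, independent of struct padding
    }

A raw byte dump also cannot express the relocated kernel_code_entry_byte_offset field, which the patch continues to emit via emitValue with a symbol-difference expression.
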
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index a19d4646deb2..1ad64532931c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -9,16 +9,12 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
-#include "AMDKernelCodeT.h"
#include "Utils/AMDGPUPALMetadata.h"
-#include "llvm/BinaryFormat/MsgPackDocument.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/AMDGPUMetadata.h"
-#include "llvm/Support/AMDHSAKernelDescriptor.h"
+
+struct amd_kernel_code_t;
namespace llvm {
-#include "AMDGPUPTNote.h"
class DataLayout;
class Function;
@@ -28,6 +24,16 @@ class MDNode;
class Module;
class Type;
+namespace AMDGPU {
+namespace HSAMD {
+struct Metadata;
+}
+} // namespace AMDGPU
+
+namespace amdhsa {
+struct kernel_descriptor_t;
+}
+
class AMDGPUTargetStreamer : public MCTargetStreamer {
AMDGPUPALMetadata PALMetadata;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
index f61470573050..bbca8cbb742c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -13,22 +13,15 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/AMDGPUFixupKinds.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "R600Defines.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/Endian.h"
+#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/EndianStream.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-#include <cstdint>
using namespace llvm;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 2cd6c3a81d2b..1a1ffcda3b4e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -12,29 +12,15 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPU.h"
#include "MCTargetDesc/AMDGPUFixupKinds.h"
#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCFixup.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-#include <cstdint>
-#include <cstdlib>
using namespace llvm;
@@ -303,7 +289,7 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
}
// NSA encoding.
- if (AMDGPU::isGFX10(STI) && Desc.TSFlags & SIInstrFlags::MIMG) {
+ if (AMDGPU::isGFX10Plus(STI) && Desc.TSFlags & SIInstrFlags::MIMG) {
int vaddr0 = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::vaddr0);
int srsrc = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 2bfc2d579533..54c8cdf196ac 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -51,7 +51,7 @@ def MIMGBaseOpcodesTable : GenericTable {
let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
"Gather4", "NumExtraArgs", "Gradients", "G16", "Coordinates",
"LodOrClampOrMip", "HasD16"];
- GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
+ string TypeOf_BaseOpcode = "MIMGBaseOpcode";
let PrimaryKey = ["BaseOpcode"];
let PrimaryKeyName = "getMIMGBaseOpcodeInfo";
@@ -65,7 +65,7 @@ def MIMGDimInfoTable : GenericTable {
let FilterClass = "AMDGPUDimProps";
let CppTypeName = "MIMGDimInfo";
let Fields = ["Dim", "NumCoords", "NumGradients", "DA", "Encoding", "AsmSuffix"];
- GenericEnum TypeOf_Dim = MIMGDim;
+ string TypeOf_Dim = "MIMGDim";
let PrimaryKey = ["Dim"];
let PrimaryKeyName = "getMIMGDimInfo";
@@ -95,8 +95,8 @@ def MIMGLZMappingTable : GenericTable {
let FilterClass = "MIMGLZMapping";
let CppTypeName = "MIMGLZMappingInfo";
let Fields = ["L", "LZ"];
- GenericEnum TypeOf_L = MIMGBaseOpcode;
- GenericEnum TypeOf_LZ = MIMGBaseOpcode;
+ string TypeOf_L = "MIMGBaseOpcode";
+ string TypeOf_LZ = "MIMGBaseOpcode";
let PrimaryKey = ["L"];
let PrimaryKeyName = "getMIMGLZMappingInfo";
@@ -111,8 +111,8 @@ def MIMGMIPMappingTable : GenericTable {
let FilterClass = "MIMGMIPMapping";
let CppTypeName = "MIMGMIPMappingInfo";
let Fields = ["MIP", "NONMIP"];
- GenericEnum TypeOf_MIP = MIMGBaseOpcode;
- GenericEnum TypeOf_NONMIP = MIMGBaseOpcode;
+ string TypeOf_MIP = "MIMGBaseOpcode";
+ string TypeOf_NONMIP = "MIMGBaseOpcode";
let PrimaryKey = ["MIP"];
let PrimaryKeyName = "getMIMGMIPMappingInfo";
@@ -127,8 +127,8 @@ def MIMGG16MappingTable : GenericTable {
let FilterClass = "MIMGG16Mapping";
let CppTypeName = "MIMGG16MappingInfo";
let Fields = ["G", "G16"];
- GenericEnum TypeOf_G = MIMGBaseOpcode;
- GenericEnum TypeOf_G16 = MIMGBaseOpcode;
+ string TypeOf_G = "MIMGBaseOpcode";
+ string TypeOf_G16 = "MIMGBaseOpcode";
let PrimaryKey = ["G"];
let PrimaryKeyName = "getMIMGG16MappingInfo";
@@ -148,7 +148,7 @@ class MIMG_Base <dag outs, string dns = "">
let hasSideEffects = 0; // XXX ????
let DecoderNamespace = dns;
- let isAsmParserOnly = !if(!eq(dns,""), 1, 0);
+ let isAsmParserOnly = !eq(dns, "");
}
class MIMG <dag outs, string dns = "">
@@ -168,8 +168,8 @@ def MIMGInfoTable : GenericTable {
let FilterClass = "MIMG";
let CppTypeName = "MIMGInfo";
let Fields = ["Opcode", "BaseOpcode", "MIMGEncoding", "VDataDwords", "VAddrDwords"];
- GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
- GenericEnum TypeOf_MIMGEncoding = MIMGEncoding;
+ string TypeOf_BaseOpcode = "MIMGBaseOpcode";
+ string TypeOf_MIMGEncoding = "MIMGEncoding";
let PrimaryKey = ["BaseOpcode", "MIMGEncoding", "VDataDwords", "VAddrDwords"];
let PrimaryKeyName = "getMIMGOpcodeHelper";
@@ -180,14 +180,14 @@ def getMIMGInfo : SearchIndex {
let Key = ["Opcode"];
}
-// This is a separate class so that TableGen memoizes the computations.
+// This class used to use !foldl to memoize the AddrAsmNames list.
+// It turned out that this was much slower than using !filter.
class MIMGNSAHelper<int num_addrs> {
list<string> AddrAsmNames =
- !foldl([]<string>, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], lhs, i,
- !if(!lt(i, num_addrs), !listconcat(lhs, ["vaddr"#!size(lhs)]), lhs));
+ !foreach(i, !filter(i, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
+ !lt(i, num_addrs)), "vaddr" # i);
dag AddrIns = !dag(ins, !foreach(arg, AddrAsmNames, VGPR_32), AddrAsmNames);
- string AddrAsm = "[" # !foldl("$" # !head(AddrAsmNames), !tail(AddrAsmNames), lhs, rhs,
- lhs # ", $" # rhs) # "]";
+ string AddrAsm = "[$" # !interleave(AddrAsmNames, ", $") # "]";
int NSA = !if(!le(num_addrs, 1), ?,
!if(!le(num_addrs, 5), 1,
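
In the MIMGNSAHelper change above, the list of NSA address operands is now produced with !filter/!foreach and the assembly string with !interleave, instead of the old !foldl reductions. For readers less used to TableGen string operators, a small standalone C++ equivalent of that join (the function and variable names here are purely illustrative) is:

    #include <iostream>
    #include <string>
    #include <vector>

    // Build "[$vaddr0, $vaddr1, ...]", the same result as
    //   "[$" # !interleave(AddrAsmNames, ", $") # "]"
    std::string makeAddrAsm(const std::vector<std::string> &Names) {
      std::string Result = "[";
      for (size_t I = 0; I < Names.size(); ++I) {
        if (I)
          Result += ", ";
        Result += "$" + Names[I];
      }
      return Result + "]";
    }

    int main() {
      std::cout << makeAddrAsm({"vaddr0", "vaddr1", "vaddr2"}) << "\n";
      // Prints: [$vaddr0, $vaddr1, $vaddr2]
    }
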
@@ -308,13 +308,13 @@ multiclass MIMG_NoSampler_Src_Helper <bits<8> op, string asm,
multiclass MIMG_NoSampler <bits<8> op, string asm, bit has_d16, bit mip = 0,
bit isResInfo = 0> {
def "" : MIMGBaseOpcode {
- let Coordinates = !if(isResInfo, 0, 1);
+ let Coordinates = !not(isResInfo);
let LodOrClampOrMip = mip;
let HasD16 = has_d16;
}
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME),
- mayLoad = !if(isResInfo, 0, 1) in {
+ mayLoad = !not(isResInfo) in {
let VDataDwords = 1 in
defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>;
let VDataDwords = 2 in
@@ -413,6 +413,8 @@ multiclass MIMG_Store <bits<8> op, string asm, bit has_d16, bit mip = 0> {
defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 0>;
let VDataDwords = 4 in
defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 0>;
+ let VDataDwords = 5 in
+ defm _V5 : MIMG_Store_Addr_Helper <op, asm, VReg_160, 0>;
}
}
@@ -665,12 +667,12 @@ multiclass MIMG_Sampler <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0,
bit isG16 = 0, bit isGetLod = 0,
string asm = "image_sample"#sample.LowerCaseMod#!if(isG16, "_g16", "")> {
def "" : MIMG_Sampler_BaseOpcode<sample> {
- let HasD16 = !if(isGetLod, 0, 1);
+ let HasD16 = !not(isGetLod);
let G16 = isG16;
}
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
- mayLoad = !if(isGetLod, 0, 1) in {
+ mayLoad = !not(isGetLod) in {
let VDataDwords = 1 in
defm _V1 : MIMG_Sampler_Src_Helper<op, asm, sample, VGPR_32, 1>;
let VDataDwords = 2 in
@@ -708,6 +710,55 @@ multiclass MIMG_Gather <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0,
multiclass MIMG_Gather_WQM <bits<8> op, AMDGPUSampleVariant sample>
: MIMG_Gather<op, sample, 1>;
+class MIMG_IntersectRay_gfx10<int op, string opcode, RegisterClass AddrRC, bit A16>
+ : MIMG_gfx10<op, (outs VReg_128:$vdata), "AMDGPU"> {
+
+ let InOperandList = !con((ins AddrRC:$vaddr0, SReg_128:$srsrc),
+ !if(A16, (ins GFX10A16:$a16), (ins)));
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc"#!if(A16, "$a16", "");
+
+ let nsa = 0;
+}
+
+class MIMG_IntersectRay_nsa_gfx10<int op, string opcode, int num_addrs, bit A16>
+ : MIMG_nsa_gfx10<op, (outs VReg_128:$vdata), num_addrs, "AMDGPU"> {
+ let InOperandList = !con(nsah.AddrIns,
+ (ins SReg_128:$srsrc),
+ !if(A16, (ins GFX10A16:$a16), (ins)));
+ let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc"#!if(A16, "$a16", "");
+}
+
+multiclass MIMG_IntersectRay<int op, string opcode, int num_addrs, bit A16> {
+ def "" : MIMGBaseOpcode;
+ let SubtargetPredicate = HasGFX10_BEncoding,
+ AssemblerPredicate = HasGFX10_BEncoding,
+ AsmMatchConverter = !if(A16, "cvtIntersectRay", ""),
+ dmask = 0xf,
+ unorm = 1,
+ d16 = 0,
+ glc = 0,
+ slc = 0,
+ dlc = 0,
+ tfe = 0,
+ lwe = 0,
+ r128 = 1,
+ ssamp = 0,
+ dim = {0, 0, 0},
+ a16 = A16,
+ d16 = 0,
+ BaseOpcode = !cast<MIMGBaseOpcode>(NAME),
+ VDataDwords = 4 in {
+ // TODO: MIMGAddrSize will choose VReg_512 which is a 16 register tuple,
+ // when we only need 9, 11 or 12 depending on A16 field and ptr size.
+ def "_sa" : MIMG_IntersectRay_gfx10<op, opcode, MIMGAddrSize<num_addrs, 0>.RegClass, A16> {
+ let VAddrDwords = !srl(MIMGAddrSize<num_addrs, 0>.RegClass.Size, 5);
+ }
+ def _nsa : MIMG_IntersectRay_nsa_gfx10<op, opcode, num_addrs, A16> {
+ let VAddrDwords = num_addrs;
+ }
+ }
+}
+
//===----------------------------------------------------------------------===//
// MIMG Instructions
//===----------------------------------------------------------------------===//
@@ -832,6 +883,11 @@ defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <0x000000ef, AMDGPUSample_c_cd_cl
let SubtargetPredicate = HasGFX10_BEncoding in
defm IMAGE_MSAA_LOAD : MIMG_NoSampler <0x00000080, "image_msaa_load", 1>;
+defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay<0xe6, "image_bvh_intersect_ray", 11, 0>;
+defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay<0xe6, "image_bvh_intersect_ray", 8, 1>;
+defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay<0xe7, "image_bvh64_intersect_ray", 12, 0>;
+defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay<0xe7, "image_bvh64_intersect_ray", 9, 1>;
+
/********** ========================================= **********/
/********** Table of dimension-aware image intrinsics **********/
/********** ========================================= **********/
@@ -840,13 +896,40 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
Intrinsic Intr = I;
MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod));
AMDGPUDimProps Dim = I.P.Dim;
+ AMDGPUImageDimIntrinsicEval DimEval = AMDGPUImageDimIntrinsicEval<I.P>;
+
+ bits<8> NumGradients = DimEval.NumGradientArgs;
+ bits<8> NumDmask = DimEval.NumDmaskArgs;
+ bits<8> NumData = DimEval.NumDataArgs;
+ bits<8> NumVAddrs = DimEval.NumVAddrArgs;
+ bits<8> NumArgs = !add(DimEval.CachePolicyArgIndex, 1);
+
+ bits<8> DMaskIndex = DimEval.DmaskArgIndex;
+ bits<8> VAddrStart = DimEval.VAddrArgIndex;
+ bits<8> GradientStart = DimEval.GradientArgIndex;
+ bits<8> CoordStart = DimEval.CoordArgIndex;
+ bits<8> LodIndex = DimEval.LodArgIndex;
+ bits<8> MipIndex = DimEval.MipArgIndex;
+ bits<8> VAddrEnd = !add(DimEval.VAddrArgIndex, DimEval.NumVAddrArgs);
+ bits<8> RsrcIndex = DimEval.RsrcArgIndex;
+ bits<8> SampIndex = DimEval.SampArgIndex;
+ bits<8> UnormIndex = DimEval.UnormArgIndex;
+ bits<8> TexFailCtrlIndex = DimEval.TexFailCtrlArgIndex;
+ bits<8> CachePolicyIndex = DimEval.CachePolicyArgIndex;
+
+ bits<8> GradientTyArg = !add(I.P.NumRetAndDataAnyTypes,
+ !foldl(0, I.P.ExtraAddrArgs, cnt, arg, !add(cnt, arg.Type.isAny)));
+ bits<8> CoordTyArg = !add(GradientTyArg, !if(I.P.Gradients, 1, 0));
}
def ImageDimIntrinsicTable : GenericTable {
let FilterClass = "ImageDimIntrinsicInfo";
- let Fields = ["Intr", "BaseOpcode", "Dim"];
- GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
- GenericEnum TypeOf_Dim = MIMGDim;
+ let Fields = ["Intr", "BaseOpcode", "Dim", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs",
+ "DMaskIndex", "VAddrStart", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd",
+ "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex",
+ "GradientTyArg", "CoordTyArg"];
+ string TypeOf_BaseOpcode = "MIMGBaseOpcode";
+ string TypeOf_Dim = "MIMGDim";
let PrimaryKey = ["Intr"];
let PrimaryKeyName = "getImageDimIntrinsicInfo";
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp
index d363baa15507..a96fc7ef234e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp
@@ -15,10 +15,10 @@
//===----------------------------------------------------------------------===//
#include "R600AsmPrinter.h"
-#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "R600Defines.h"
#include "R600MachineFunctionInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "R600Subtarget.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
index 290a960ae901..a19d00b62502 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
@@ -13,17 +13,8 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "R600Defines.h"
-#include "R600InstrInfo.h"
-#include "R600MachineFunctionInfo.h"
-#include "R600RegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "R600Subtarget.h"
using namespace llvm;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index 8124df68f688..ca1e61393e9a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -13,35 +13,10 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "R600Defines.h"
-#include "R600InstrInfo.h"
-#include "R600MachineFunctionInfo.h"
-#include "R600RegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
+#include "R600MachineFunctionInfo.h"
+#include "R600Subtarget.h"
#include <set>
-#include <utility>
-#include <vector>
using namespace llvm;
@@ -84,12 +59,7 @@ unsigned CFStack::getLoopDepth() {
}
bool CFStack::branchStackContains(CFStack::StackItem Item) {
- for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
- E = BranchStack.end(); I != E; ++I) {
- if (*I == Item)
- return true;
- }
- return false;
+ return llvm::is_contained(BranchStack, Item);
}
bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
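
The branchStackContains hunk above replaces a hand-written const_iterator loop with llvm::is_contained. A minimal standard-library-only equivalent of that helper, shown purely for illustration (this is not the LLVM ADT implementation):

    #include <algorithm>
    #include <iterator>

    // True if Item occurs anywhere in the container; this mirrors what
    // llvm::is_contained(BranchStack, Item) does for the branch stack.
    template <typename Container, typename T>
    bool isContained(const Container &C, const T &Item) {
      return std::find(std::begin(C), std::end(C), Item) != std::end(C);
    }
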
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Defines.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Defines.h
index d72534908dcf..613a59ae81f0 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Defines.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Defines.h
@@ -10,8 +10,6 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_R600DEFINES_H
#define LLVM_LIB_TARGET_AMDGPU_R600DEFINES_H
-#include "llvm/MC/MCRegisterInfo.h"
-
// Operand Flags
#define MO_FLAG_CLAMP (1 << 0)
#define MO_FLAG_NEG (1 << 1)
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
index b97e3c8b8dd7..664e134889e9 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
@@ -14,25 +14,9 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "R600Defines.h"
-#include "R600InstrInfo.h"
-#include "R600RegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/ErrorHandling.h"
-#include <cassert>
-#include <cstdint>
-#include <utility>
-#include <vector>
+#include "R600Defines.h"
+#include "R600Subtarget.h"
using namespace llvm;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index 5f682d86d26e..81dc91ab922f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -14,21 +14,9 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "R600Defines.h"
-#include "R600InstrInfo.h"
-#include "R600RegisterInfo.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/Pass.h"
-#include <cassert>
-#include <cstdint>
-#include <iterator>
+#include "R600Defines.h"
+#include "R600Subtarget.h"
using namespace llvm;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp
index c568a4aa61c3..abd4086db62c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp
@@ -7,19 +7,16 @@
//==-----------------------------------------------------------------------===//
#include "R600FrameLowering.h"
-#include "AMDGPUSubtarget.h"
-#include "R600RegisterInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/Support/MathExtras.h"
+#include "R600Subtarget.h"
using namespace llvm;
R600FrameLowering::~R600FrameLowering() = default;
/// \returns The number of registers allocated for \p FI.
-int R600FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
- Register &FrameReg) const {
+StackOffset
+R600FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const R600RegisterInfo *RI
= MF.getSubtarget<R600Subtarget>().getRegisterInfo();
@@ -44,5 +41,5 @@ int R600FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
if (FI != -1)
OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlign(FI));
- return OffsetBytes / (getStackWidth(MF) * 4);
+ return StackOffset::getFixed(OffsetBytes / (getStackWidth(MF) * 4));
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600FrameLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600FrameLowering.h
index b877ecd29829..f171bc4fea78 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600FrameLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600FrameLowering.h
@@ -24,8 +24,8 @@ public:
MachineBasicBlock &MBB) const override {}
void emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const override {}
- int getFrameIndexReference(const MachineFunction &MF, int FI,
- Register &FrameReg) const override;
+ StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const override;
bool hasFP(const MachineFunction &MF) const override {
return false;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index dc2e73e1f94e..c0120903396c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -12,42 +12,14 @@
//===----------------------------------------------------------------------===//
#include "R600ISelLowering.h"
-#include "AMDGPUFrameLowering.h"
-#include "AMDGPUSubtarget.h"
+#include "AMDGPU.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "R600Defines.h"
-#include "R600FrameLowering.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/CodeGen/DAGCombine.h"
-#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
+#include "R600Subtarget.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MachineValueType.h"
-#include "llvm/Support/MathExtras.h"
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <utility>
-#include <vector>
using namespace llvm;
@@ -338,7 +310,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case R600::MASK_WRITE: {
Register maskedRegister = MI.getOperand(0).getReg();
- assert(Register::isVirtualRegister(maskedRegister));
+ assert(maskedRegister.isVirtual());
MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
TII->addFlag(*defInstr, 0, MO_FLAG_MASK);
break;
@@ -1550,10 +1522,10 @@ SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
unsigned FrameIndex = FIN->getIndex();
Register IgnoredFrameReg;
- unsigned Offset =
- TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
- return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
- Op.getValueType());
+ StackOffset Offset =
+ TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
+ return DAG.getConstant(Offset.getFixed() * 4 * TFL->getStackWidth(MF),
+ SDLoc(Op), Op.getValueType());
}
CCAssignFn *R600TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
@@ -1608,7 +1580,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
}
if (AMDGPU::isShader(CallConv)) {
- unsigned Reg = MF.addLiveIn(VA.getLocReg(), &R600::R600_Reg128RegClass);
+ Register Reg = MF.addLiveIn(VA.getLocReg(), &R600::R600_Reg128RegClass);
SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
InVals.push_back(Register);
continue;
@@ -1747,7 +1719,7 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
for (unsigned i = 0; i < 4; i++) {
RemapSwizzle[i] = i;
if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
- unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
+ unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
->getZExtValue();
if (i == Idx)
isUnmovable[Idx] = true;
@@ -1756,7 +1728,7 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
for (unsigned i = 0; i < 4; i++) {
if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
- unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
+ unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
->getZExtValue();
if (isUnmovable[Idx])
continue;
@@ -2160,7 +2132,7 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
uint64_t ImmValue = 0;
if (Src.getMachineOpcode() == R600::MOV_IMM_F32) {
- ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
+ ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
float FloatValue = FPC->getValueAPF().convertToFloat();
if (FloatValue == 0.0) {
ImmReg = R600::ZERO;
@@ -2172,7 +2144,7 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
}
} else {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
+ ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
uint64_t Value = C->getZExtValue();
if (Value == 0) {
ImmReg = R600::ZERO;
@@ -2189,8 +2161,7 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
if (ImmReg == R600::ALU_LITERAL_X) {
if (!Imm.getNode())
return false;
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
- assert(C);
+ ConstantSDNode *C = cast<ConstantSDNode>(Imm);
if (C->getZExtValue())
return false;
Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
index 088cf16d8ed2..7a623f3e304e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -13,33 +13,10 @@
#include "R600InstrInfo.h"
#include "AMDGPU.h"
-#include "AMDGPUInstrInfo.h"
-#include "AMDGPUSubtarget.h"
-#include "R600Defines.h"
-#include "R600FrameLowering.h"
-#include "R600RegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/BitVector.h"
+#include "R600Defines.h"
+#include "R600Subtarget.h"
#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
-#include "llvm/Support/ErrorHandling.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <cstring>
-#include <iterator>
-#include <utility>
-#include <vector>
using namespace llvm;
@@ -97,7 +74,7 @@ bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) const {
for (MachineInstr::const_mop_iterator I = MBBI->operands_begin(),
E = MBBI->operands_end(); I != E; ++I) {
- if (I->isReg() && !Register::isVirtualRegister(I->getReg()) && I->isUse() &&
+ if (I->isReg() && !I->getReg().isVirtual() && I->isUse() &&
RI.isPhysRegLiveAcrossClauses(I->getReg()))
return false;
}
@@ -242,7 +219,7 @@ bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const {
for (MachineInstr::const_mop_iterator I = MI.operands_begin(),
E = MI.operands_end();
I != E; ++I) {
- if (!I->isReg() || !I->isUse() || Register::isVirtualRegister(I->getReg()))
+ if (!I->isReg() || !I->isUse() || I->getReg().isVirtual())
continue;
if (R600::R600_LDS_SRC_REGRegClass.contains(I->getReg()))
@@ -963,8 +940,9 @@ R600InstrInfo::reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) con
return false;
}
-bool R600InstrInfo::DefinesPredicate(MachineInstr &MI,
- std::vector<MachineOperand> &Pred) const {
+bool R600InstrInfo::ClobbersPredicate(MachineInstr &MI,
+ std::vector<MachineOperand> &Pred,
+ bool SkipDead) const {
return isPredicateSetter(MI.getOpcode());
}
@@ -1191,15 +1169,15 @@ int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass();
for (std::pair<unsigned, unsigned> LI : MRI.liveins()) {
- unsigned Reg = LI.first;
- if (Register::isVirtualRegister(Reg) || !IndirectRC->contains(Reg))
+ Register Reg = LI.first;
+ if (Reg.isVirtual() || !IndirectRC->contains(Reg))
continue;
unsigned RegIndex;
unsigned RegEnd;
for (RegIndex = 0, RegEnd = IndirectRC->getNumRegs(); RegIndex != RegEnd;
++RegIndex) {
- if (IndirectRC->getRegister(RegIndex) == Reg)
+ if (IndirectRC->getRegister(RegIndex) == (unsigned)Reg)
break;
}
Offset = std::max(Offset, (int)RegIndex);
@@ -1225,7 +1203,7 @@ int R600InstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
const R600FrameLowering *TFL = ST.getFrameLowering();
Register IgnoredFrameReg;
- Offset = TFL->getFrameIndexReference(MF, -1, IgnoredFrameReg);
+ Offset = TFL->getFrameIndexReference(MF, -1, IgnoredFrameReg).getFixed();
return getIndirectIndexBegin(MF) + Offset;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.h
index 873ee08470cb..1e249c6348f1 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.h
@@ -194,8 +194,8 @@ public:
unsigned NumFCycles, unsigned ExtraFCycles,
BranchProbability Probability) const override;
- bool DefinesPredicate(MachineInstr &MI,
- std::vector<MachineOperand> &Pred) const override;
+ bool ClobbersPredicate(MachineInstr &MI, std::vector<MachineOperand> &Pred,
+ bool SkipDead) const override;
bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
MachineBasicBlock &FMBB) const override;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Instructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Instructions.td
index 2cc21364c439..055e2de59ea1 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Instructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -353,8 +353,8 @@ class LoadVtxId1 <PatFrag load> : PatFrag <
const MemSDNode *LD = cast<MemSDNode>(N);
return LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
(LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
- !isa<GlobalValue>(GetUnderlyingObject(
- LD->getMemOperand()->getValue(), CurDAG->getDataLayout())));
+ !isa<GlobalValue>(getUnderlyingObject(
+ LD->getMemOperand()->getValue())));
}]>;
def vtx_id1_az_extloadi8 : LoadVtxId1 <az_extloadi8>;
@@ -365,8 +365,8 @@ class LoadVtxId2 <PatFrag load> : PatFrag <
(ops node:$ptr), (load node:$ptr), [{
const MemSDNode *LD = cast<MemSDNode>(N);
return LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
- isa<GlobalValue>(GetUnderlyingObject(
- LD->getMemOperand()->getValue(), CurDAG->getDataLayout()));
+ isa<GlobalValue>(getUnderlyingObject(
+ LD->getMemOperand()->getValue()));
}]>;
def vtx_id2_az_extloadi8 : LoadVtxId2 <az_extloadi8>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp
index 7569a2629539..f85a68706287 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp
@@ -12,13 +12,8 @@
//===----------------------------------------------------------------------===//
#include "R600MachineScheduler.h"
-#include "AMDGPUSubtarget.h"
-#include "R600InstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/raw_ostream.h"
+#include "R600Subtarget.h"
using namespace llvm;
@@ -45,7 +40,7 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
std::vector<SUnit *> &QDst)
{
- QDst.insert(QDst.end(), QSrc.begin(), QSrc.end());
+ llvm::append_range(QDst, QSrc);
QSrc.clear();
}
@@ -183,7 +178,7 @@ isPhysicalRegCopy(MachineInstr *MI) {
if (MI->getOpcode() != R600::COPY)
return false;
- return !Register::isVirtualRegister(MI->getOperand(1).getReg());
+ return !MI->getOperand(1).getReg().isVirtual();
}
void R600SchedStrategy::releaseTopNode(SUnit *SU) {
@@ -207,9 +202,9 @@ void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
}
-bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
+bool R600SchedStrategy::regBelongsToClass(Register Reg,
const TargetRegisterClass *RC) const {
- if (!Register::isVirtualRegister(Reg)) {
+ if (!Reg.isVirtual()) {
return RC->contains(Reg);
} else {
return MRI->getRegClass(Reg) == RC;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.h
index bc66f2ef5907..abcc37f8400d 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.h
@@ -80,7 +80,7 @@ private:
bool VLIW5;
int getInstKind(SUnit *SU);
- bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const;
+ bool regBelongsToClass(Register Reg, const TargetRegisterClass *RC) const;
AluKind getAluKind(SUnit *SU) const;
void LoadAlu();
unsigned AvailablesAluCount() const;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
index 1fe92d2269d3..5fd912e0fb39 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
@@ -27,27 +27,12 @@
#include "AMDGPU.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
using namespace llvm;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index b0620663a230..8f19a3e478e8 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -27,30 +27,11 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "R600Defines.h"
-#include "R600InstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "R600Defines.h"
+#include "R600Subtarget.h"
#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-#include <utility>
-#include <vector>
using namespace llvm;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
index 176269f9b68c..eaac938b098a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -14,17 +14,12 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "R600InstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "R600Subtarget.h"
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/ScheduleDAG.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
index 78ef71cdf8e3..e4f7d89bf4c9 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
@@ -12,11 +12,9 @@
//===----------------------------------------------------------------------===//
#include "R600RegisterInfo.h"
-#include "AMDGPUTargetMachine.h"
-#include "R600Defines.h"
-#include "R600InstrInfo.h"
-#include "R600MachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "R600Defines.h"
+#include "R600Subtarget.h"
using namespace llvm;
@@ -94,8 +92,8 @@ const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass(
}
}
-bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const {
- assert(!Register::isVirtualRegister(Reg));
+bool R600RegisterInfo::isPhysRegLiveAcrossClauses(Register Reg) const {
+ assert(!Reg.isVirtual());
switch (Reg) {
case R600::OQAP:
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.h
index 06981c4cf9c5..1308e9fff1fe 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.h
@@ -45,7 +45,7 @@ struct R600RegisterInfo final : public R600GenRegisterInfo {
// \returns true if \p Reg can be defined in one ALU clause and used in
// another.
- bool isPhysRegLiveAcrossClauses(unsigned Reg) const;
+ bool isPhysRegLiveAcrossClauses(Register Reg) const;
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Subtarget.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Subtarget.h
new file mode 100644
index 000000000000..07238da18c67
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Subtarget.h
@@ -0,0 +1,174 @@
+//=====-- R600Subtarget.h - Define Subtarget for AMDGPU R600 ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// AMDGPU R600 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600SUBTARGET_H
+#define LLVM_LIB_TARGET_AMDGPU_R600SUBTARGET_H
+
+#include "AMDGPUSubtarget.h"
+#include "R600FrameLowering.h"
+#include "R600ISelLowering.h"
+#include "R600InstrInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+class MCInst;
+class MCInstrInfo;
+
+} // namespace llvm
+
+#define GET_SUBTARGETINFO_HEADER
+#include "R600GenSubtargetInfo.inc"
+
+namespace llvm {
+
+class R600Subtarget final : public R600GenSubtargetInfo,
+ public AMDGPUSubtarget {
+private:
+ R600InstrInfo InstrInfo;
+ R600FrameLowering FrameLowering;
+ bool FMA;
+ bool CaymanISA;
+ bool CFALUBug;
+ bool HasVertexCache;
+ bool R600ALUInst;
+ bool FP64;
+ short TexVTXClauseSize;
+ Generation Gen;
+ R600TargetLowering TLInfo;
+ InstrItineraryData InstrItins;
+ SelectionDAGTargetInfo TSInfo;
+
+public:
+ R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ const TargetMachine &TM);
+
+ const R600InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+
+ const R600FrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+
+ const R600TargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+
+ const R600RegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
+
+ const InstrItineraryData *getInstrItineraryData() const override {
+ return &InstrItins;
+ }
+
+ // Nothing implemented, just prevent crashes on use.
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
+
+ Generation getGeneration() const {
+ return Gen;
+ }
+
+ Align getStackAlignment() const { return Align(4); }
+
+ R600Subtarget &initializeSubtargetDependencies(const Triple &TT,
+ StringRef GPU, StringRef FS);
+
+ bool hasBFE() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasBFI() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasBCNT(unsigned Size) const {
+ if (Size == 32)
+ return (getGeneration() >= EVERGREEN);
+
+ return false;
+ }
+
+ bool hasBORROW() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasCARRY() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasCaymanISA() const {
+ return CaymanISA;
+ }
+
+ bool hasFFBL() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasFFBH() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasFMA() const { return FMA; }
+
+ bool hasCFAluBug() const { return CFALUBug; }
+
+ bool hasVertexCache() const { return HasVertexCache; }
+
+ short getTexVTXClauseSize() const { return TexVTXClauseSize; }
+
+ bool enableMachineScheduler() const override {
+ return true;
+ }
+
+ bool enableSubRegLiveness() const override {
+ return true;
+ }
+
+ /// \returns Maximum number of work groups per compute unit supported by the
+ /// subtarget and limited by given \p FlatWorkGroupSize.
+ unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
+ return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
+ }
+
+ /// \returns Minimum flat work group size supported by the subtarget.
+ unsigned getMinFlatWorkGroupSize() const override {
+ return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
+ }
+
+ /// \returns Maximum flat work group size supported by the subtarget.
+ unsigned getMaxFlatWorkGroupSize() const override {
+ return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
+ }
+
+ /// \returns Number of waves per execution unit required to support the given
+ /// \p FlatWorkGroupSize.
+ unsigned
+ getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
+ return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
+ }
+
+ /// \returns Minimum number of waves per execution unit supported by the
+ /// subtarget.
+ unsigned getMinWavesPerEU() const override {
+ return AMDGPU::IsaInfo::getMinWavesPerEU(this);
+ }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_R600SUBTARGET_H
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
index 90e48c63b5dc..3b753cb66ead 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
@@ -16,15 +16,9 @@
//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "si-img-init"
@@ -80,9 +74,8 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
- // Check for instructions that don't have tfe or lwe fields
- // There shouldn't be any at this point.
- assert( (TFE && LWE) && "Expected tfe and lwe operands in instruction");
+ if (!TFE && !LWE) // intersect_ray
+ continue;
unsigned TFEVal = TFE->getImm();
unsigned LWEVal = LWE->getImm();
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 3c41bf1fef5e..625749deb3a8 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -12,36 +12,18 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
+#include "GCNSubtarget.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
-#include <cassert>
-#include <utility>
using namespace llvm;
@@ -313,8 +295,15 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
Value *Exec = popSaved();
Instruction *FirstInsertionPt = &*BB->getFirstInsertionPt();
- if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt))
+ if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt)) {
+ Instruction *ExecDef = cast<Instruction>(Exec);
+ BasicBlock *DefBB = ExecDef->getParent();
+ if (!DT->dominates(DefBB, BB)) {
+ // Split edge to make Def dominate Use
+ FirstInsertionPt = &*SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt();
+ }
CallInst::Create(EndCf, Exec, "", FirstInsertionPt);
+ }
}
/// Annotate the control flow with intrinsics so the backend can
@@ -327,7 +316,6 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
const TargetMachine &TM = TPC.getTM<TargetMachine>();
initialize(*F.getParent(), TM.getSubtarget<GCNSubtarget>(F));
-
for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
E = df_end(&F.getEntryBlock()); I != E; ++I) {
BasicBlock *BB = *I;
@@ -344,7 +332,8 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
if (isTopOfStack(BB))
closeControlFlow(BB);
- handleLoop(Term);
+ if (DT->dominates(Term->getSuccessor(1), BB))
+ handleLoop(Term);
continue;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h
index 4f7d255eb450..c83802b323c3 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -33,26 +33,27 @@ enum : uint64_t {
VOP2 = 1 << 8,
VOPC = 1 << 9,
- // TODO: Should this be spilt into VOP3 a and b?
+ // TODO: Should this be split into VOP3 a and b?
VOP3 = 1 << 10,
VOP3P = 1 << 12,
VINTRP = 1 << 13,
SDWA = 1 << 14,
DPP = 1 << 15,
+ TRANS = 1 << 16,
// Memory instruction formats.
- MUBUF = 1 << 16,
- MTBUF = 1 << 17,
- SMRD = 1 << 18,
- MIMG = 1 << 19,
- EXP = 1 << 20,
- FLAT = 1 << 21,
- DS = 1 << 22,
+ MUBUF = 1 << 17,
+ MTBUF = 1 << 18,
+ SMRD = 1 << 19,
+ MIMG = 1 << 20,
+ EXP = 1 << 21,
+ FLAT = 1 << 22,
+ DS = 1 << 23,
// Pseudo instruction formats.
- VGPRSpill = 1 << 23,
- SGPRSpill = 1 << 24,
+ VGPRSpill = 1 << 24,
+ SGPRSpill = 1 << 25,
// High bits - other information.
VM_CNT = UINT64_C(1) << 32,
@@ -89,8 +90,8 @@ enum : uint64_t {
// Is a D16 buffer instruction.
D16Buf = UINT64_C(1) << 50,
- // FLAT instruction accesses FLAT_GLBL or FLAT_SCRATCH segment.
- IsNonFlatSeg = UINT64_C(1) << 51,
+ // FLAT instruction accesses FLAT_GLBL segment.
+ IsFlatGlobal = UINT64_C(1) << 51,
// Uses floating point double precision rounding mode
FPDPRounding = UINT64_C(1) << 52,
@@ -102,7 +103,10 @@ enum : uint64_t {
IsMAI = UINT64_C(1) << 54,
// Is a DOT instruction.
- IsDOT = UINT64_C(1) << 55
+ IsDOT = UINT64_C(1) << 55,
+
+ // FLAT instruction accesses FLAT_SCRATCH segment.
+ IsFlatScratch = UINT64_C(1) << 56
};
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
@@ -217,7 +221,8 @@ enum EncBits : unsigned {
SRC1_ENABLE = 1 << ID_SRC1,
SRC2_ENABLE = 1 << ID_SRC2,
DST_ENABLE = 1 << ID_DST,
- ENABLE_MASK = SRC0_ENABLE | SRC1_ENABLE | SRC2_ENABLE | DST_ENABLE
+ ENABLE_MASK = SRC0_ENABLE | SRC1_ENABLE | SRC2_ENABLE | DST_ENABLE,
+ UNDEF = 0xFFFF
};
} // namespace VGPRIndexMode
@@ -242,8 +247,8 @@ enum : unsigned {
SGPR_MAX_GFX10 = 105,
TTMP_VI_MIN = 112,
TTMP_VI_MAX = 123,
- TTMP_GFX9_GFX10_MIN = 108,
- TTMP_GFX9_GFX10_MAX = 123,
+ TTMP_GFX9PLUS_MIN = 108,
+ TTMP_GFX9PLUS_MAX = 123,
INLINE_INTEGER_C_MIN = 128,
INLINE_INTEGER_C_POSITIVE_MAX = 192, // 64
INLINE_INTEGER_C_MAX = 208,
@@ -392,6 +397,172 @@ enum ModeRegisterMasks : uint32_t {
} // namespace Hwreg
+namespace MTBUFFormat {
+
+enum DataFormat : int64_t {
+ DFMT_INVALID = 0,
+ DFMT_8,
+ DFMT_16,
+ DFMT_8_8,
+ DFMT_32,
+ DFMT_16_16,
+ DFMT_10_11_11,
+ DFMT_11_11_10,
+ DFMT_10_10_10_2,
+ DFMT_2_10_10_10,
+ DFMT_8_8_8_8,
+ DFMT_32_32,
+ DFMT_16_16_16_16,
+ DFMT_32_32_32,
+ DFMT_32_32_32_32,
+ DFMT_RESERVED_15,
+
+ DFMT_MIN = DFMT_INVALID,
+ DFMT_MAX = DFMT_RESERVED_15,
+
+ DFMT_UNDEF = -1,
+ DFMT_DEFAULT = DFMT_8,
+
+ DFMT_SHIFT = 0,
+ DFMT_MASK = 0xF
+};
+
+enum NumFormat : int64_t {
+ NFMT_UNORM = 0,
+ NFMT_SNORM,
+ NFMT_USCALED,
+ NFMT_SSCALED,
+ NFMT_UINT,
+ NFMT_SINT,
+ NFMT_RESERVED_6, // VI and GFX9
+ NFMT_SNORM_OGL = NFMT_RESERVED_6, // SI and CI only
+ NFMT_FLOAT,
+
+ NFMT_MIN = NFMT_UNORM,
+ NFMT_MAX = NFMT_FLOAT,
+
+ NFMT_UNDEF = -1,
+ NFMT_DEFAULT = NFMT_UNORM,
+
+ NFMT_SHIFT = 4,
+ NFMT_MASK = 7
+};
+
+enum MergedFormat : int64_t {
+ DFMT_NFMT_UNDEF = -1,
+ DFMT_NFMT_DEFAULT = ((DFMT_DEFAULT & DFMT_MASK) << DFMT_SHIFT) |
+ ((NFMT_DEFAULT & NFMT_MASK) << NFMT_SHIFT),
+
+
+ DFMT_NFMT_MASK = (DFMT_MASK << DFMT_SHIFT) | (NFMT_MASK << NFMT_SHIFT),
+
+ DFMT_NFMT_MAX = DFMT_NFMT_MASK
+};
+
+enum UnifiedFormat : int64_t {
+ UFMT_INVALID = 0,
+
+ UFMT_8_UNORM,
+ UFMT_8_SNORM,
+ UFMT_8_USCALED,
+ UFMT_8_SSCALED,
+ UFMT_8_UINT,
+ UFMT_8_SINT,
+
+ UFMT_16_UNORM,
+ UFMT_16_SNORM,
+ UFMT_16_USCALED,
+ UFMT_16_SSCALED,
+ UFMT_16_UINT,
+ UFMT_16_SINT,
+ UFMT_16_FLOAT,
+
+ UFMT_8_8_UNORM,
+ UFMT_8_8_SNORM,
+ UFMT_8_8_USCALED,
+ UFMT_8_8_SSCALED,
+ UFMT_8_8_UINT,
+ UFMT_8_8_SINT,
+
+ UFMT_32_UINT,
+ UFMT_32_SINT,
+ UFMT_32_FLOAT,
+
+ UFMT_16_16_UNORM,
+ UFMT_16_16_SNORM,
+ UFMT_16_16_USCALED,
+ UFMT_16_16_SSCALED,
+ UFMT_16_16_UINT,
+ UFMT_16_16_SINT,
+ UFMT_16_16_FLOAT,
+
+ UFMT_10_11_11_UNORM,
+ UFMT_10_11_11_SNORM,
+ UFMT_10_11_11_USCALED,
+ UFMT_10_11_11_SSCALED,
+ UFMT_10_11_11_UINT,
+ UFMT_10_11_11_SINT,
+ UFMT_10_11_11_FLOAT,
+
+ UFMT_11_11_10_UNORM,
+ UFMT_11_11_10_SNORM,
+ UFMT_11_11_10_USCALED,
+ UFMT_11_11_10_SSCALED,
+ UFMT_11_11_10_UINT,
+ UFMT_11_11_10_SINT,
+ UFMT_11_11_10_FLOAT,
+
+ UFMT_10_10_10_2_UNORM,
+ UFMT_10_10_10_2_SNORM,
+ UFMT_10_10_10_2_USCALED,
+ UFMT_10_10_10_2_SSCALED,
+ UFMT_10_10_10_2_UINT,
+ UFMT_10_10_10_2_SINT,
+
+ UFMT_2_10_10_10_UNORM,
+ UFMT_2_10_10_10_SNORM,
+ UFMT_2_10_10_10_USCALED,
+ UFMT_2_10_10_10_SSCALED,
+ UFMT_2_10_10_10_UINT,
+ UFMT_2_10_10_10_SINT,
+
+ UFMT_8_8_8_8_UNORM,
+ UFMT_8_8_8_8_SNORM,
+ UFMT_8_8_8_8_USCALED,
+ UFMT_8_8_8_8_SSCALED,
+ UFMT_8_8_8_8_UINT,
+ UFMT_8_8_8_8_SINT,
+
+ UFMT_32_32_UINT,
+ UFMT_32_32_SINT,
+ UFMT_32_32_FLOAT,
+
+ UFMT_16_16_16_16_UNORM,
+ UFMT_16_16_16_16_SNORM,
+ UFMT_16_16_16_16_USCALED,
+ UFMT_16_16_16_16_SSCALED,
+ UFMT_16_16_16_16_UINT,
+ UFMT_16_16_16_16_SINT,
+ UFMT_16_16_16_16_FLOAT,
+
+ UFMT_32_32_32_UINT,
+ UFMT_32_32_32_SINT,
+ UFMT_32_32_32_FLOAT,
+ UFMT_32_32_32_32_UINT,
+ UFMT_32_32_32_32_SINT,
+ UFMT_32_32_32_32_FLOAT,
+
+ UFMT_FIRST = UFMT_INVALID,
+ UFMT_LAST = UFMT_32_32_32_32_FLOAT,
+
+ UFMT_MAX = 127,
+
+ UFMT_UNDEF = -1,
+ UFMT_DEFAULT = UFMT_8_UNORM
+};
+
+} // namespace MTBUFFormat
+
namespace Swizzle { // Encoding of swizzle macro used in ds_swizzle_b32.
enum Id : unsigned { // id of symbolic names
@@ -518,19 +689,67 @@ enum DppFiMode {
};
} // namespace DPP
+
+namespace Exp {
+
+enum Target : unsigned {
+ ET_MRT0 = 0,
+ ET_MRT7 = 7,
+ ET_MRTZ = 8,
+ ET_NULL = 9,
+ ET_POS0 = 12,
+ ET_POS3 = 15,
+ ET_POS4 = 16, // GFX10+
+ ET_POS_LAST = ET_POS4, // Highest pos used on any subtarget
+ ET_PRIM = 20, // GFX10+
+ ET_PARAM0 = 32,
+ ET_PARAM31 = 63,
+
+ ET_NULL_MAX_IDX = 0,
+ ET_MRTZ_MAX_IDX = 0,
+ ET_PRIM_MAX_IDX = 0,
+ ET_MRT_MAX_IDX = 7,
+ ET_POS_MAX_IDX = 4,
+ ET_PARAM_MAX_IDX = 31,
+
+ ET_INVALID = 255,
+};
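// Illustrative sketch, not part of this change: indexed export targets are
// formed by adding the index to the corresponding base value, with the
// *_MAX_IDX constants above giving the largest valid index. The helper name
// is hypothetical.
static inline unsigned getParamExportTarget(unsigned Idx) {
  // Valid for Idx in [0, ET_PARAM_MAX_IDX]; e.g. Idx == 5 yields ET_PARAM0 + 5.
  return ET_PARAM0 + Idx;
}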
+
+} // namespace Exp
} // namespace AMDGPU
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028
+#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0)
+#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6)
+#define S_00B028_MEM_ORDERED(x) (((x) & 0x1) << 25)
+#define G_00B028_MEM_ORDERED(x) (((x) >> 25) & 0x1)
+#define C_00B028_MEM_ORDERED 0xFDFFFFFF
+
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS 0x00B02C
#define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8)
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS 0x00B128
+#define S_00B128_MEM_ORDERED(x) (((x) & 0x1) << 27)
+#define G_00B128_MEM_ORDERED(x) (((x) >> 27) & 0x1)
+#define C_00B128_MEM_ORDERED 0xF7FFFFFF
+
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS 0x00B228
+#define S_00B228_WGP_MODE(x) (((x) & 0x1) << 27)
+#define G_00B228_WGP_MODE(x) (((x) >> 27) & 0x1)
+#define C_00B228_WGP_MODE 0xF7FFFFFF
+#define S_00B228_MEM_ORDERED(x) (((x) & 0x1) << 25)
+#define G_00B228_MEM_ORDERED(x) (((x) >> 25) & 0x1)
+#define C_00B228_MEM_ORDERED 0xFDFFFFFF
+
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES 0x00B328
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS 0x00B428
+#define S_00B428_WGP_MODE(x) (((x) & 0x1) << 26)
+#define G_00B428_WGP_MODE(x) (((x) >> 26) & 0x1)
+#define C_00B428_WGP_MODE 0xFBFFFFFF
+#define S_00B428_MEM_ORDERED(x) (((x) & 0x1) << 24)
+#define G_00B428_MEM_ORDERED(x) (((x) >> 24) & 0x1)
+#define C_00B428_MEM_ORDERED 0xFEFFFFFF
+
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS 0x00B528
-#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848
-#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0)
-#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6)
#define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C
#define S_00B84C_SCRATCH_EN(x) (((x) & 0x1) << 0)
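// Illustrative sketch, not part of this change: each S_*/G_*/C_* macro triple
// packs, extracts, or clears one field of a packed hardware register value.
// The helper below is a hypothetical example updating the MEM_ORDERED field
// of an SPI_SHADER_PGM_RSRC1_PS value.
static inline unsigned setPsMemOrdered(unsigned Rsrc1, unsigned MemOrdered) {
  return (Rsrc1 & C_00B028_MEM_ORDERED) | S_00B028_MEM_ORDERED(MemOrdered);
}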
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index ef64c5674bd1..34f59bf34dd5 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -65,37 +65,11 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CodeGen.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <list>
-#include <map>
-#include <tuple>
-#include <utility>
using namespace llvm;
@@ -122,7 +96,7 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
- void processPHINode(MachineInstr &MI);
+ MachineBasicBlock *processPHINode(MachineInstr &MI);
StringRef getPassName() const override { return "SI Fix SGPR copies"; }
@@ -154,8 +128,7 @@ static bool hasVectorOperands(const MachineInstr &MI,
const SIRegisterInfo *TRI) {
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
- if (!MI.getOperand(i).isReg() ||
- !Register::isVirtualRegister(MI.getOperand(i).getReg()))
+ if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
continue;
if (TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg())))
@@ -171,14 +144,14 @@ getCopyRegClasses(const MachineInstr &Copy,
Register DstReg = Copy.getOperand(0).getReg();
Register SrcReg = Copy.getOperand(1).getReg();
- const TargetRegisterClass *SrcRC = Register::isVirtualRegister(SrcReg)
+ const TargetRegisterClass *SrcRC = SrcReg.isVirtual()
? MRI.getRegClass(SrcReg)
: TRI.getPhysRegClass(SrcReg);
// We don't really care about the subregister here.
// SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg());
- const TargetRegisterClass *DstRC = Register::isVirtualRegister(DstReg)
+ const TargetRegisterClass *DstRC = DstReg.isVirtual()
? MRI.getRegClass(DstReg)
: TRI.getPhysRegClass(DstReg);
@@ -206,8 +179,7 @@ static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
auto &Src = MI.getOperand(1);
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = Src.getReg();
- if (!Register::isVirtualRegister(SrcReg) ||
- !Register::isVirtualRegister(DstReg))
+ if (!SrcReg.isVirtual() || !DstReg.isVirtual())
return false;
for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) {
@@ -215,8 +187,12 @@ static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
if (UseMI == &MI)
continue;
if (MO.isDef() || UseMI->getParent() != MI.getParent() ||
- UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END ||
- !TII->isOperandLegal(*UseMI, UseMI->getOperandNo(&MO), &Src))
+ UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
+ return false;
+
+ unsigned OpIdx = UseMI->getOperandNo(&MO);
+ if (OpIdx >= UseMI->getDesc().getNumOperands() ||
+ !TII->isOperandLegal(*UseMI, OpIdx, &Src))
return false;
}
// Change VGPR to SGPR destination.
@@ -255,7 +231,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
return false;
// It is illegal to have vreg inputs to a physreg defining reg_sequence.
- if (Register::isPhysicalRegister(CopyUse.getOperand(0).getReg()))
+ if (CopyUse.getOperand(0).getReg().isPhysical())
return false;
const TargetRegisterClass *SrcRC, *DstRC;
@@ -306,7 +282,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC);
Register TmpAReg = MRI.createVirtualRegister(NewSrcRC);
unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
- AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
+ AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::COPY;
BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc),
TmpAReg)
.addReg(TmpReg, RegState::Kill);
@@ -362,8 +338,7 @@ bool searchPredecessors(const MachineBasicBlock *MBB,
return false;
DenseSet<const MachineBasicBlock *> Visited;
- SmallVector<MachineBasicBlock *, 4> Worklist(MBB->pred_begin(),
- MBB->pred_end());
+ SmallVector<MachineBasicBlock *, 4> Worklist(MBB->predecessors());
while (!Worklist.empty()) {
MachineBasicBlock *MBB = Worklist.pop_back_val();
@@ -388,17 +363,13 @@ static bool isReachable(const MachineInstr *From,
const MachineInstr *To,
const MachineBasicBlock *CutOff,
MachineDominatorTree &MDT) {
- // If either From block dominates To block or instructions are in the same
- // block and From is higher.
if (MDT.dominates(From, To))
return true;
const MachineBasicBlock *MBBFrom = From->getParent();
const MachineBasicBlock *MBBTo = To->getParent();
- if (MBBFrom == MBBTo)
- return false;
- // Instructions are in different blocks, do predecessor search.
+ // Do predecessor search.
// We should almost never get here since we do not usually produce M0 stores
// other than -1.
return searchPredecessors(MBBTo, CutOff, [MBBFrom]
@@ -598,13 +569,11 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
TII = ST.getInstrInfo();
MDT = &getAnalysis<MachineDominatorTree>();
- SmallVector<MachineInstr *, 16> Worklist;
-
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
- MachineBasicBlock &MBB = *BI;
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
- I != E; ++I) {
+ MachineBasicBlock *MBB = &*BI;
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+ ++I) {
MachineInstr &MI = *I;
switch (MI.getOpcode()) {
@@ -619,7 +588,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
const TargetRegisterClass *SrcRC, *DstRC;
std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);
- if (!Register::isVirtualRegister(DstReg)) {
+ if (!DstReg.isVirtual()) {
// If the destination register is a physical register there isn't
// really much we can do to fix this.
// Some special instructions use M0 as an input. Some even only use
@@ -628,9 +597,9 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
Register TmpReg
= MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- BuildMI(MBB, MI, MI.getDebugLoc(),
+ BuildMI(*MBB, MI, MI.getDebugLoc(),
TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
- .add(MI.getOperand(1));
+ .add(MI.getOperand(1));
MI.getOperand(1).setReg(TmpReg);
}
@@ -639,8 +608,16 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
Register SrcReg = MI.getOperand(1).getReg();
- if (!Register::isVirtualRegister(SrcReg)) {
- TII->moveToVALU(MI, MDT);
+ if (!SrcReg.isVirtual()) {
+ MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
+ if (NewBB && NewBB != MBB) {
+ MBB = NewBB;
+ E = MBB->end();
+ BI = MachineFunction::iterator(MBB);
+ BE = MF.end();
+ }
+ assert((!NewBB || NewBB == I->getParent()) &&
+ "moveToVALU did not return the right basic block");
break;
}
@@ -655,7 +632,15 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
MI.setDesc(TII->get(SMovOp));
break;
}
- TII->moveToVALU(MI, MDT);
+ MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
+ if (NewBB && NewBB != MBB) {
+ MBB = NewBB;
+ E = MBB->end();
+ BI = MachineFunction::iterator(MBB);
+ BE = MF.end();
+ }
+ assert((!NewBB || NewBB == I->getParent()) &&
+ "moveToVALU did not return the right basic block");
} else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
}
@@ -663,10 +648,18 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
break;
}
case AMDGPU::PHI: {
- processPHINode(MI);
+ MachineBasicBlock *NewBB = processPHINode(MI);
+ if (NewBB && NewBB != MBB) {
+ MBB = NewBB;
+ E = MBB->end();
+ BI = MachineFunction::iterator(MBB);
+ BE = MF.end();
+ }
+ assert((!NewBB || NewBB == I->getParent()) &&
+ "moveToVALU did not return the right basic block");
break;
}
- case AMDGPU::REG_SEQUENCE:
+ case AMDGPU::REG_SEQUENCE: {
if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
!hasVectorOperands(MI, TRI)) {
foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI);
@@ -675,8 +668,17 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
- TII->moveToVALU(MI, MDT);
+ MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
+ if (NewBB && NewBB != MBB) {
+ MBB = NewBB;
+ E = MBB->end();
+ BI = MachineFunction::iterator(MBB);
+ BE = MF.end();
+ }
+ assert((!NewBB || NewBB == I->getParent()) &&
+ "moveToVALU did not return the right basic block");
break;
+ }
case AMDGPU::INSERT_SUBREG: {
const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
DstRC = MRI->getRegClass(MI.getOperand(0).getReg());
@@ -686,7 +688,15 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
(TRI->hasVectorRegisters(Src0RC) ||
TRI->hasVectorRegisters(Src1RC))) {
LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
- TII->moveToVALU(MI, MDT);
+ MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
+ if (NewBB && NewBB != MBB) {
+ MBB = NewBB;
+ E = MBB->end();
+ BI = MachineFunction::iterator(MBB);
+ BE = MF.end();
+ }
+ assert((!NewBB || NewBB == I->getParent()) &&
+ "moveToVALU did not return the right basic block");
}
break;
}
@@ -721,7 +731,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
// that can't be resolved in later operand folding pass
bool Resolved = false;
for (MachineOperand *MO : {&Src0, &Src1}) {
- if (Register::isVirtualRegister(MO->getReg())) {
+ if (MO->getReg().isVirtual()) {
MachineInstr *DefMI = MRI->getVRegDef(MO->getReg());
if (DefMI && TII->isFoldableCopy(*DefMI)) {
const MachineOperand &Def = DefMI->getOperand(0);
@@ -761,17 +771,18 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
return true;
}
-void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
+MachineBasicBlock *SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
unsigned numVGPRUses = 0;
bool AllAGPRUses = true;
SetVector<const MachineInstr *> worklist;
SmallSet<const MachineInstr *, 4> Visited;
SetVector<MachineInstr *> PHIOperands;
+ MachineBasicBlock *CreatedBB = nullptr;
worklist.insert(&MI);
Visited.insert(&MI);
while (!worklist.empty()) {
const MachineInstr *Instr = worklist.pop_back_val();
- unsigned Reg = Instr->getOperand(0).getReg();
+ Register Reg = Instr->getOperand(0).getReg();
for (const auto &Use : MRI->use_operands(Reg)) {
const MachineInstr *UseMI = Use.getParent();
AllAGPRUses &= (UseMI->isCopy() &&
@@ -820,11 +831,11 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
bool hasVGPRInput = false;
for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
- unsigned InputReg = MI.getOperand(i).getReg();
+ Register InputReg = MI.getOperand(i).getReg();
MachineInstr *Def = MRI->getVRegDef(InputReg);
if (TRI->isVectorRegister(*MRI, InputReg)) {
if (Def->isCopy()) {
- unsigned SrcReg = Def->getOperand(1).getReg();
+ Register SrcReg = Def->getOperand(1).getReg();
const TargetRegisterClass *RC =
TRI->getRegClassForReg(*MRI, SrcReg);
if (TRI->isSGPRClass(RC))
@@ -858,7 +869,7 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
RC0 != &AMDGPU::VReg_1RegClass) &&
(hasVGPRInput || numVGPRUses > 1)) {
LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
- TII->moveToVALU(MI);
+ CreatedBB = TII->moveToVALU(MI);
}
else {
LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
@@ -869,4 +880,5 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
while (!PHIOperands.empty()) {
processPHINode(*PHIOperands.pop_back_val());
}
+ return CreatedBB;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
index 29484668a01d..f7e3ea5fc072 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
@@ -12,8 +12,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixupVectorISel.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixupVectorISel.cpp
deleted file mode 100644
index 8e3402b537b3..000000000000
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixupVectorISel.cpp
+++ /dev/null
@@ -1,239 +0,0 @@
-//===-- SIFixupVectorISel.cpp - Fixup post ISel vector issues -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-/// \file
-/// SIFixupVectorISel pass cleans up post ISEL Vector issues.
-/// Currently this will convert GLOBAL_{LOAD|STORE}_*
-/// and GLOBAL_Atomic_* instructions into their _SADDR variants,
-/// feeding the sreg into the saddr field of the new instruction.
-/// We currently handle a REG_SEQUENCE feeding the vaddr
-/// and decompose it into a base and index.
-///
-/// Transform:
-/// %17:vgpr_32, %19:sreg_64_xexec = V_ADD_I32_e64 %21:sgpr_32, %22:vgpr_32
-/// %18:vgpr_32, %20:sreg_64_xexec = V_ADDC_U32_e64 %25:vgpr_32,
-/// %24:vgpr_32, %19:sreg_64_xexec
-/// %16:vreg_64 = REG_SEQUENCE %17:vgpr_32, %sub0, %18:vgpr_32, %sub1
-/// %11:vreg_64 = COPY %16:vreg_64
-/// %10:vgpr_32 = GLOBAL_LOAD_DWORD killed %11:vreg_64, 16, 0, 0
-/// Into:
-/// %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64, 36, 0
-/// %14:vreg_64 = REG_SEQUENCE %6:vgpr_32, %sub0, %15:vgpr_32, %sub1
-/// %10:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %14:vreg_64, %4:sreg_64_xexec,16...
-///
-//===----------------------------------------------------------------------===//
-//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetMachine.h"
-#define DEBUG_TYPE "si-fixup-vector-isel"
-
-using namespace llvm;
-
-static cl::opt<bool> EnableGlobalSGPRAddr(
- "amdgpu-enable-global-sgpr-addr",
- cl::desc("Enable use of SGPR regs for GLOBAL LOAD/STORE instructions"),
- cl::init(false));
-
-STATISTIC(NumSGPRGlobalOccurs, "Number of global ld/st opportunities");
-STATISTIC(NumSGPRGlobalSaddrs, "Number of global sgpr instructions converted");
-
-namespace {
-
-class SIFixupVectorISel : public MachineFunctionPass {
-public:
- static char ID;
-
-public:
- SIFixupVectorISel() : MachineFunctionPass(ID) {
- initializeSIFixupVectorISelPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS(SIFixupVectorISel, DEBUG_TYPE,
- "SI Fixup Vector ISel", false, false)
-
-char SIFixupVectorISel::ID = 0;
-
-char &llvm::SIFixupVectorISelID = SIFixupVectorISel::ID;
-
-FunctionPass *llvm::createSIFixupVectorISelPass() {
- return new SIFixupVectorISel();
-}
-
-static bool findSRegBaseAndIndex(MachineOperand *Op,
- unsigned &BaseReg,
- unsigned &IndexReg,
- MachineRegisterInfo &MRI,
- const SIRegisterInfo *TRI) {
- SmallVector<MachineOperand *, 8> Worklist;
- Worklist.push_back(Op);
- while (!Worklist.empty()) {
- MachineOperand *WOp = Worklist.pop_back_val();
- if (!WOp->isReg() || !Register::isVirtualRegister(WOp->getReg()))
- continue;
- MachineInstr *DefInst = MRI.getUniqueVRegDef(WOp->getReg());
- switch (DefInst->getOpcode()) {
- default:
- continue;
- case AMDGPU::COPY:
- Worklist.push_back(&DefInst->getOperand(1));
- break;
- case AMDGPU::REG_SEQUENCE:
- if (DefInst->getNumOperands() != 5)
- continue;
- Worklist.push_back(&DefInst->getOperand(1));
- Worklist.push_back(&DefInst->getOperand(3));
- break;
- case AMDGPU::V_ADD_I32_e64:
- // The V_ADD_* and its analogous V_ADDCV_* are generated by
- // a previous pass which lowered from an ADD_64_PSEUDO,
- // which generates subregs to break up the 64 bit args.
- if (DefInst->getOperand(2).getSubReg() != AMDGPU::NoSubRegister)
- continue;
- BaseReg = DefInst->getOperand(2).getReg();
- if (DefInst->getOperand(3).getSubReg() != AMDGPU::NoSubRegister)
- continue;
- IndexReg = DefInst->getOperand(3).getReg();
- // Chase the IndexReg.
- MachineInstr *MI = MRI.getUniqueVRegDef(IndexReg);
- if (!MI || !MI->isCopy())
- continue;
- // Make sure the reg class is 64 bit for Index.
- // If the Index register is a subreg, we want it to reference
- // a 64 bit register which we will use as the Index reg.
- const TargetRegisterClass *IdxRC, *BaseRC;
- IdxRC = MRI.getRegClass(MI->getOperand(1).getReg());
- if (AMDGPU::getRegBitWidth(IdxRC->getID()) != 64)
- continue;
- IndexReg = MI->getOperand(1).getReg();
- // Chase the BaseReg.
- MI = MRI.getUniqueVRegDef(BaseReg);
- if (!MI || !MI->isCopy())
- continue;
- // Make sure the register class is 64 bit for Base.
- BaseReg = MI->getOperand(1).getReg();
- BaseRC = MRI.getRegClass(BaseReg);
- if (AMDGPU::getRegBitWidth(BaseRC->getID()) != 64)
- continue;
- // Make sure Base is SReg and Index is VReg.
- if (!TRI->isSGPRReg(MRI, BaseReg))
- return false;
- if (!TRI->hasVGPRs(MRI.getRegClass(IndexReg)))
- return false;
- // clear any killed flags on Index and Base regs, used later.
- MRI.clearKillFlags(IndexReg);
- MRI.clearKillFlags(BaseReg);
- return true;
- }
- }
- return false;
-}
-
-// Identify Global LOAD|STORE/ATOMIC and try to convert to _SADDR.
-static bool fixupGlobalSaddr(MachineBasicBlock &MBB,
- MachineFunction &MF,
- MachineRegisterInfo &MRI,
- const GCNSubtarget &ST,
- const SIInstrInfo *TII,
- const SIRegisterInfo *TRI) {
- if (!EnableGlobalSGPRAddr)
- return false;
- bool FuncModified = false;
- MachineBasicBlock::iterator I, Next;
- for (I = MBB.begin(); I != MBB.end(); I = Next) {
- Next = std::next(I);
- MachineInstr &MI = *I;
- int NewOpcd = AMDGPU::getGlobalSaddrOp(MI.getOpcode());
- if (NewOpcd < 0)
- continue;
- // Update our statistics on opportunities seen.
- ++NumSGPRGlobalOccurs;
- LLVM_DEBUG(dbgs() << "Global Mem opp " << MI << '\n');
- // Need a Base and Index or we cant transform to _SADDR.
- unsigned BaseReg = 0;
- unsigned IndexReg = 0;
- MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
- if (!findSRegBaseAndIndex(Op, BaseReg, IndexReg, MRI, TRI))
- continue;
- ++NumSGPRGlobalSaddrs;
- FuncModified = true;
- // Create the new _SADDR Memory instruction.
- bool HasVdst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst) != nullptr;
- MachineOperand *VData = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
- MachineInstr *NewGlob = nullptr;
- NewGlob = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcd));
- if (HasVdst)
- NewGlob->addOperand(MF, MI.getOperand(0));
- NewGlob->addOperand(MF, MachineOperand::CreateReg(IndexReg, false));
- if (VData)
- NewGlob->addOperand(MF, *VData);
- NewGlob->addOperand(MF, MachineOperand::CreateReg(BaseReg, false));
- NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::offset));
-
- MachineOperand *Glc = TII->getNamedOperand(MI, AMDGPU::OpName::glc);
- // Atomics dont have a GLC, so omit the field if not there.
- if (Glc)
- NewGlob->addOperand(MF, *Glc);
-
- MachineOperand *DLC = TII->getNamedOperand(MI, AMDGPU::OpName::dlc);
- if (DLC)
- NewGlob->addOperand(MF, *DLC);
-
- NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::slc));
- // _D16 have an vdst_in operand, copy it in.
- MachineOperand *VDstInOp = TII->getNamedOperand(MI,
- AMDGPU::OpName::vdst_in);
- if (VDstInOp)
- NewGlob->addOperand(MF, *VDstInOp);
- NewGlob->copyImplicitOps(MF, MI);
- NewGlob->cloneMemRefs(MF, MI);
- // Remove the old Global Memop instruction.
- MI.eraseFromParent();
- LLVM_DEBUG(dbgs() << "New Global Mem " << *NewGlob << '\n');
- }
- return FuncModified;
-}
-
-bool SIFixupVectorISel::runOnMachineFunction(MachineFunction &MF) {
- // Only need to run this in SelectionDAG path.
- if (MF.getProperties().hasProperty(
- MachineFunctionProperties::Property::Selected))
- return false;
-
- if (skipFunction(MF.getFunction()))
- return false;
-
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = ST.getInstrInfo();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
-
- bool FuncModified = false;
- for (MachineBasicBlock &MBB : MF) {
- // Cleanup missed Saddr opportunites from ISel.
- FuncModified |= fixupGlobalSaddr(MBB, MF, MRI, ST, TII, TRI);
- }
- return FuncModified;
-}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 92980d2406cf..d5fa9afded27 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -9,18 +9,11 @@
//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;
@@ -35,7 +28,7 @@ struct FoldCandidate {
int FrameIndexToFold;
};
int ShrinkOpcode;
- unsigned char UseOpNo;
+ unsigned UseOpNo;
MachineOperand::MachineOperandType Kind;
bool Commuted;
@@ -129,6 +122,23 @@ char SIFoldOperands::ID = 0;
char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
+// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
+static unsigned macToMad(unsigned Opc) {
+ switch (Opc) {
+ case AMDGPU::V_MAC_F32_e64:
+ return AMDGPU::V_MAD_F32_e64;
+ case AMDGPU::V_MAC_F16_e64:
+ return AMDGPU::V_MAD_F16_e64;
+ case AMDGPU::V_FMAC_F32_e64:
+ return AMDGPU::V_FMA_F32_e64;
+ case AMDGPU::V_FMAC_F16_e64:
+ return AMDGPU::V_FMA_F16_gfx9_e64;
+ case AMDGPU::V_FMAC_LEGACY_F32_e64:
+ return AMDGPU::V_FMA_LEGACY_F32_e64;
+ }
+ return AMDGPU::INSTRUCTION_LIST_END;
+}
+
// Wrapper around isInlineConstant that understands special cases when
// instruction types are replaced during operand folding.
static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
@@ -139,31 +149,18 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
return true;
unsigned Opc = UseMI.getOpcode();
- switch (Opc) {
- case AMDGPU::V_MAC_F32_e64:
- case AMDGPU::V_MAC_F16_e64:
- case AMDGPU::V_FMAC_F32_e64:
- case AMDGPU::V_FMAC_F16_e64: {
+ unsigned NewOpc = macToMad(Opc);
+ if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
// Special case for mac. Since this is replaced with mad when folded into
// src2, we need to check the legality for the final instruction.
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
if (static_cast<int>(OpNo) == Src2Idx) {
- bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 ||
- Opc == AMDGPU::V_FMAC_F16_e64;
- bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 ||
- Opc == AMDGPU::V_FMAC_F32_e64;
-
- unsigned Opc = IsFMA ?
- (IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) :
- (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
- const MCInstrDesc &MadDesc = TII->get(Opc);
+ const MCInstrDesc &MadDesc = TII->get(NewOpc);
return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
}
- return false;
- }
- default:
- return false;
}
+
+ return false;
}
// TODO: Add heuristic that the frame index might not fit in the addressing mode
@@ -172,9 +169,23 @@ static bool frameIndexMayFold(const SIInstrInfo *TII,
const MachineInstr &UseMI,
int OpNo,
const MachineOperand &OpToFold) {
- return OpToFold.isFI() &&
- (TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) &&
- OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr);
+ if (!OpToFold.isFI())
+ return false;
+
+ if (TII->isMUBUF(UseMI))
+ return OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(),
+ AMDGPU::OpName::vaddr);
+ if (!TII->isFLATScratch(UseMI))
+ return false;
+
+ int SIdx = AMDGPU::getNamedOperandIdx(UseMI.getOpcode(),
+ AMDGPU::OpName::saddr);
+ if (OpNo == SIdx)
+ return true;
+
+ int VIdx = AMDGPU::getNamedOperandIdx(UseMI.getOpcode(),
+ AMDGPU::OpName::vaddr);
+ return OpNo == VIdx && SIdx == -1;
}
FunctionPass *llvm::createSIFoldOperandsPass() {
@@ -282,9 +293,6 @@ static bool updateOperand(FoldCandidate &Fold,
assert(!Fold.needsShrink() && "not handled");
if (Fold.isImm()) {
- // FIXME: ChangeToImmediate should probably clear the subreg flags. It's
- // reinterpreted as TargetFlags.
- Old.setSubReg(0);
Old.ChangeToImmediate(Fold.ImmToFold);
return true;
}
@@ -335,17 +343,8 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
// Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
unsigned Opc = MI->getOpcode();
- if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
- Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
- (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
- bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 ||
- Opc == AMDGPU::V_FMAC_F16_e64;
- bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 ||
- Opc == AMDGPU::V_FMAC_F32_e64;
- unsigned NewOpc = IsFMA ?
- (IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) :
- (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
-
+ unsigned NewOpc = macToMad(Opc);
+ if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
// Check if changing this to a v_mad_{f16, f32} instruction will allow us
// to fold the operand.
MI->setDesc(TII->get(NewOpc));
@@ -358,10 +357,17 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
}
// Special case for s_setreg_b32
- if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
- MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
- appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
- return true;
+ if (OpToFold->isImm()) {
+ unsigned ImmOpc = 0;
+ if (Opc == AMDGPU::S_SETREG_B32)
+ ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
+ else if (Opc == AMDGPU::S_SETREG_B32_mode)
+ ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
+ if (ImmOpc) {
+ MI->setDesc(TII->get(ImmOpc));
+ appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
+ return true;
+ }
}
// If we are already folding into another operand of MI, then
@@ -399,9 +405,9 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
return false;
if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
- if ((Opc == AMDGPU::V_ADD_I32_e64 ||
- Opc == AMDGPU::V_SUB_I32_e64 ||
- Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
+ if ((Opc == AMDGPU::V_ADD_CO_U32_e64 ||
+ Opc == AMDGPU::V_SUB_CO_U32_e64 ||
+ Opc == AMDGPU::V_SUBREV_CO_U32_e64) && // FIXME
(OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
@@ -463,7 +469,18 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
static bool isUseSafeToFold(const SIInstrInfo *TII,
const MachineInstr &MI,
const MachineOperand &UseMO) {
- return !UseMO.isUndef() && !TII->isSDWA(MI);
+ if (UseMO.isUndef() || TII->isSDWA(MI))
+ return false;
+
+ switch (MI.getOpcode()) {
+ case AMDGPU::V_MOV_B32_e32:
+ case AMDGPU::V_MOV_B32_e64:
+ case AMDGPU::V_MOV_B64_PSEUDO:
+ // Do not fold into an indirect mov.
+ return !MI.hasRegisterImplicitUseOperand(AMDGPU::M0);
+ }
+
+ return true;
//return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}
@@ -528,11 +545,12 @@ static bool tryToFoldACImm(const SIInstrInfo *TII,
return false;
Register UseReg = OpToFold.getReg();
- if (!Register::isVirtualRegister(UseReg))
+ if (!UseReg.isVirtual())
return false;
- if (llvm::find_if(FoldList, [UseMI](const FoldCandidate &FC) {
- return FC.UseMI == UseMI; }) != FoldList.end())
+ if (llvm::any_of(FoldList, [UseMI](const FoldCandidate &FC) {
+ return FC.UseMI == UseMI;
+ }))
return false;
MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();
@@ -587,9 +605,9 @@ void SIFoldOperands::foldOperand(
Register RegSeqDstReg = UseMI->getOperand(0).getReg();
unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
- MachineRegisterInfo::use_iterator Next;
- for (MachineRegisterInfo::use_iterator
- RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
+ MachineRegisterInfo::use_nodbg_iterator Next;
+ for (MachineRegisterInfo::use_nodbg_iterator
+ RSUse = MRI->use_nodbg_begin(RegSeqDstReg), RSE = MRI->use_nodbg_end();
RSUse != RSE; RSUse = Next) {
Next = std::next(RSUse);
@@ -616,25 +634,30 @@ void SIFoldOperands::foldOperand(
// Sanity check that this is a stack access.
// FIXME: Should probably use stack pseudos before frame lowering.
- if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
- MFI->getScratchRSrcReg())
- return;
+ if (TII->isMUBUF(*UseMI)) {
+ if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
+ MFI->getScratchRSrcReg())
+ return;
- // Ensure this is either relative to the current frame or the current wave.
- MachineOperand &SOff =
- *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
- if ((!SOff.isReg() || SOff.getReg() != MFI->getStackPtrOffsetReg()) &&
- (!SOff.isImm() || SOff.getImm() != 0))
- return;
+ // Ensure this is either relative to the current frame or the current
+ // wave.
+ MachineOperand &SOff =
+ *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
+ if (!SOff.isImm() || SOff.getImm() != 0)
+ return;
+ }
// A frame index will resolve to a positive constant, so it should always be
// safe to fold the addressing mode, even pre-GFX9.
UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
- // If this is relative to the current wave, update it to be relative to the
- // current frame.
- if (SOff.isImm())
- SOff.ChangeToRegister(MFI->getStackPtrOffsetReg(), false);
+ if (TII->isFLATScratch(*UseMI) &&
+ AMDGPU::getNamedOperandIdx(UseMI->getOpcode(),
+ AMDGPU::OpName::vaddr) != -1) {
+ unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(UseMI->getOpcode());
+ UseMI->setDesc(TII->get(NewOpc));
+ }
+
return;
}
@@ -643,42 +666,46 @@ void SIFoldOperands::foldOperand(
if (FoldingImmLike && UseMI->isCopy()) {
Register DestReg = UseMI->getOperand(0).getReg();
+ Register SrcReg = UseMI->getOperand(1).getReg();
+ assert(SrcReg.isVirtual());
- // Don't fold into a copy to a physical register. Doing so would interfere
- // with the register coalescer's logic which would avoid redundant
- // initalizations.
- if (DestReg.isPhysical())
- return;
+ const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
- const TargetRegisterClass *DestRC = MRI->getRegClass(DestReg);
+ // Don't fold into a copy to a physical register with the same class. Doing
+ // so would interfere with the register coalescer's logic which would avoid
+ // redundant initializations.
+ if (DestReg.isPhysical() && SrcRC->contains(DestReg))
+ return;
- Register SrcReg = UseMI->getOperand(1).getReg();
- if (SrcReg.isVirtual()) { // XXX - This can be an assert?
- const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg);
+ const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
+ if (!DestReg.isPhysical()) {
if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
- MachineRegisterInfo::use_iterator NextUse;
+ MachineRegisterInfo::use_nodbg_iterator NextUse;
SmallVector<FoldCandidate, 4> CopyUses;
- for (MachineRegisterInfo::use_iterator
- Use = MRI->use_begin(DestReg), E = MRI->use_end();
- Use != E; Use = NextUse) {
+ for (MachineRegisterInfo::use_nodbg_iterator Use = MRI->use_nodbg_begin(DestReg),
+ E = MRI->use_nodbg_end();
+ Use != E; Use = NextUse) {
NextUse = std::next(Use);
- FoldCandidate FC = FoldCandidate(Use->getParent(),
- Use.getOperandNo(), &UseMI->getOperand(1));
+ // There's no point trying to fold into an implicit operand.
+ if (Use->isImplicit())
+ continue;
+
+ FoldCandidate FC = FoldCandidate(Use->getParent(), Use.getOperandNo(),
+ &UseMI->getOperand(1));
CopyUses.push_back(FC);
- }
- for (auto & F : CopyUses) {
- foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo,
- FoldList, CopiesToReplace);
+ }
+ for (auto &F : CopyUses) {
+ foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList, CopiesToReplace);
}
}
- }
- if (DestRC == &AMDGPU::AGPR_32RegClass &&
- TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
- UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
- UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
- CopiesToReplace.push_back(UseMI);
- return;
+ if (DestRC == &AMDGPU::AGPR_32RegClass &&
+ TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+ UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
+ UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+ CopiesToReplace.push_back(UseMI);
+ return;
+ }
}
// In order to fold immediates into copies, we need to change the
@@ -738,7 +765,7 @@ void SIFoldOperands::foldOperand(
auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
BuildMI(MBB, UseMI, DL,
- TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), Tmp).addImm(Imm);
+ TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
B.addReg(Tmp);
} else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
auto Src = getRegSubRegPair(*Def);
@@ -780,7 +807,7 @@ void SIFoldOperands::foldOperand(
}
auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
BuildMI(MBB, UseMI, DL,
- TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), Tmp).addReg(Vgpr);
+ TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
B.addReg(Tmp);
}
@@ -794,10 +821,10 @@ void SIFoldOperands::foldOperand(
return;
if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()))
- UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
+ UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
- UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32));
+ UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
return;
}
@@ -819,8 +846,6 @@ void SIFoldOperands::foldOperand(
UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
- // FIXME: ChangeToImmediate should clear subreg
- UseMI->getOperand(1).setSubReg(0);
if (OpToFold.isImm())
UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
else
@@ -991,8 +1016,7 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
MachineOperand &Op) {
if (Op.isReg()) {
// If this has a subregister, it obviously is a register source.
- if (Op.getSubReg() != AMDGPU::NoSubRegister ||
- !Register::isVirtualRegister(Op.getReg()))
+ if (Op.getSubReg() != AMDGPU::NoSubRegister || !Op.getReg().isVirtual())
return &Op;
MachineInstr *Def = MRI.getVRegDef(Op.getReg());
@@ -1032,25 +1056,6 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
if (!Src0->isImm() && !Src1->isImm())
return false;
- if (MI->getOpcode() == AMDGPU::V_LSHL_OR_B32 ||
- MI->getOpcode() == AMDGPU::V_LSHL_ADD_U32 ||
- MI->getOpcode() == AMDGPU::V_AND_OR_B32) {
- if (Src0->isImm() && Src0->getImm() == 0) {
- // v_lshl_or_b32 0, X, Y -> copy Y
- // v_lshl_or_b32 0, X, K -> v_mov_b32 K
- // v_lshl_add_b32 0, X, Y -> copy Y
- // v_lshl_add_b32 0, X, K -> v_mov_b32 K
- // v_and_or_b32 0, X, Y -> copy Y
- // v_and_or_b32 0, X, K -> v_mov_b32 K
- bool UseCopy = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->isReg();
- MI->RemoveOperand(Src1Idx);
- MI->RemoveOperand(Src0Idx);
-
- MI->setDesc(TII->get(UseCopy ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32));
- return true;
- }
- }
-
// and k0, k1 -> v_mov_b32 (k0 & k1)
// or k0, k1 -> v_mov_b32 (k0 | k1)
// xor k0, k1 -> v_mov_b32 (k0 ^ k1)
@@ -1178,9 +1183,9 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
MachineOperand *NonInlineUse = nullptr;
int NonInlineUseOpNo = -1;
- MachineRegisterInfo::use_iterator NextUse;
- for (MachineRegisterInfo::use_iterator
- Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
+ MachineRegisterInfo::use_nodbg_iterator NextUse;
+ for (MachineRegisterInfo::use_nodbg_iterator
+ Use = MRI->use_nodbg_begin(Dst.getReg()), E = MRI->use_nodbg_end();
Use != E; Use = NextUse) {
NextUse = std::next(Use);
MachineInstr *UseMI = Use->getParent();
@@ -1202,7 +1207,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
// instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
// again. The same constant folded instruction could also have a second
// use operand.
- NextUse = MRI->use_begin(Dst.getReg());
+ NextUse = MRI->use_nodbg_begin(Dst.getReg());
FoldList.clear();
continue;
}
@@ -1241,9 +1246,9 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
}
} else {
// Folding register.
- SmallVector <MachineRegisterInfo::use_iterator, 4> UsesToProcess;
- for (MachineRegisterInfo::use_iterator
- Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
+ SmallVector <MachineRegisterInfo::use_nodbg_iterator, 4> UsesToProcess;
+ for (MachineRegisterInfo::use_nodbg_iterator
+ Use = MRI->use_nodbg_begin(Dst.getReg()), E = MRI->use_nodbg_end();
Use != E; ++Use) {
UsesToProcess.push_back(Use);
}
@@ -1260,9 +1265,12 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
for (MachineInstr *Copy : CopiesToReplace)
Copy->addImplicitDefUseOperands(*MF);
+ SmallPtrSet<MachineInstr *, 16> Folded;
for (FoldCandidate &Fold : FoldList) {
assert(!Fold.isReg() || Fold.OpToFold);
- if (Fold.isReg() && Register::isVirtualRegister(Fold.OpToFold->getReg())) {
+ if (Folded.count(Fold.UseMI))
+ continue;
+ if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
Register Reg = Fold.OpToFold->getReg();
MachineInstr *DefMI = Fold.OpToFold->getParent();
if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
@@ -1281,7 +1289,8 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
<< static_cast<int>(Fold.UseOpNo) << " of "
<< *Fold.UseMI << '\n');
- tryFoldInst(TII, Fold.UseMI);
+ if (tryFoldInst(TII, Fold.UseMI))
+ Folded.insert(Fold.UseMI);
} else if (Fold.isCommuted()) {
// Restoring instruction's original operand order if fold has failed.
TII->commuteInstruction(*Fold.UseMI, false);
@@ -1296,7 +1305,7 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
switch (Op) {
case AMDGPU::V_MAX_F32_e64:
case AMDGPU::V_MAX_F16_e64:
- case AMDGPU::V_MAX_F64:
+ case AMDGPU::V_MAX_F64_e64:
case AMDGPU::V_PK_MAX_F16: {
if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
return nullptr;
@@ -1557,7 +1566,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
if (!FoldingImm && !OpToFold.isReg())
continue;
- if (OpToFold.isReg() && !Register::isVirtualRegister(OpToFold.getReg()))
+ if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
continue;
// Prevent folding operands backwards in the function. For example,
@@ -1567,7 +1576,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
// ...
// %vgpr0 = V_MOV_B32_e32 1, implicit %exec
MachineOperand &Dst = MI.getOperand(0);
- if (Dst.isReg() && !Register::isVirtualRegister(Dst.getReg()))
+ if (Dst.isReg() && !Dst.getReg().isVirtual())
continue;
foldInstOperand(MI, OpToFold);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index 8ef02e73865d..a12e013b4fe6 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -14,15 +14,8 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
#include "GCNRegPressure.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
using namespace llvm;
@@ -60,9 +53,14 @@ public:
MachineFunctionPass::getAnalysisUsage(AU);
}
+ MachineFunctionProperties getClearedProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::IsSSA);
+ }
+
private:
template <typename Callable>
- void forAllLanes(unsigned Reg, LaneBitmask LaneMask, Callable Func) const;
+ void forAllLanes(Register Reg, LaneBitmask LaneMask, Callable Func) const;
bool canBundle(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const;
bool checkPressure(const MachineInstr &MI, GCNDownwardRPTracker &RPT);
@@ -145,15 +143,15 @@ static unsigned getMopState(const MachineOperand &MO) {
S |= RegState::Kill;
if (MO.isEarlyClobber())
S |= RegState::EarlyClobber;
- if (Register::isPhysicalRegister(MO.getReg()) && MO.isRenamable())
+ if (MO.getReg().isPhysical() && MO.isRenamable())
S |= RegState::Renamable;
return S;
}
template <typename Callable>
-void SIFormMemoryClauses::forAllLanes(unsigned Reg, LaneBitmask LaneMask,
+void SIFormMemoryClauses::forAllLanes(Register Reg, LaneBitmask LaneMask,
Callable Func) const {
- if (LaneMask.all() || Register::isPhysicalRegister(Reg) ||
+ if (LaneMask.all() || Reg.isPhysical() ||
LaneMask == MRI->getMaxLaneMaskForVReg(Reg)) {
Func(0);
return;
@@ -228,7 +226,7 @@ bool SIFormMemoryClauses::canBundle(const MachineInstr &MI,
if (Conflict == Map.end())
continue;
- if (Register::isPhysicalRegister(Reg))
+ if (Reg.isPhysical())
return false;
LaneBitmask Mask = TRI->getSubRegIndexLaneMask(MO.getSubReg());
@@ -270,7 +268,7 @@ void SIFormMemoryClauses::collectRegUses(const MachineInstr &MI,
if (!Reg)
continue;
- LaneBitmask Mask = Register::isVirtualRegister(Reg)
+ LaneBitmask Mask = Reg.isVirtual()
? TRI->getSubRegIndexLaneMask(MO.getSubReg())
: LaneBitmask::getAll();
RegUse &Map = MO.isDef() ? Defs : Uses;
@@ -324,6 +322,7 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
MF.getFunction(), "amdgpu-max-memory-clause", MaxClause);
for (MachineBasicBlock &MBB : MF) {
+ GCNDownwardRPTracker RPT(*LIS);
MachineBasicBlock::instr_iterator Next;
for (auto I = MBB.instr_begin(), E = MBB.instr_end(); I != E; I = Next) {
MachineInstr &MI = *I;
@@ -334,12 +333,19 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
if (!isValidClauseInst(MI, IsVMEM))
continue;
- RegUse Defs, Uses;
- GCNDownwardRPTracker RPT(*LIS);
- RPT.reset(MI);
+ if (!RPT.getNext().isValid())
+ RPT.reset(MI);
+ else { // Advance the state to the current MI.
+ RPT.advance(MachineBasicBlock::const_iterator(MI));
+ RPT.advanceBeforeNext();
+ }
- if (!processRegUses(MI, Defs, Uses, RPT))
+ const GCNRPTracker::LiveRegSet LiveRegsCopy(RPT.getLiveRegs());
+ RegUse Defs, Uses;
+ if (!processRegUses(MI, Defs, Uses, RPT)) {
+ RPT.reset(MI, &LiveRegsCopy);
continue;
+ }
unsigned Length = 1;
for ( ; Next != E && Length < FuncMaxClause; ++Next) {
@@ -354,8 +360,10 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
++Length;
}
- if (Length < 2)
+ if (Length < 2) {
+ RPT.reset(MI, &LiveRegsCopy);
continue;
+ }
Changed = true;
MFI->limitOccupancy(LastRecordedOccupancy);
@@ -363,6 +371,9 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
auto B = BuildMI(MBB, I, DebugLoc(), TII->get(TargetOpcode::BUNDLE));
Ind->insertMachineInstrInMaps(*B);
+ // Restore the state after processing the bundle.
+ RPT.reset(*B, &LiveRegsCopy);
+
for (auto BI = I; BI != Next; ++BI) {
BI->bundleWithPred();
Ind->removeSingleMachineInstrFromMaps(*BI);
@@ -388,17 +399,17 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
}
for (auto &&R : Defs) {
- unsigned Reg = R.first;
+ Register Reg = R.first;
Uses.erase(Reg);
- if (Register::isPhysicalRegister(Reg))
+ if (Reg.isPhysical())
continue;
LIS->removeInterval(Reg);
LIS->createAndComputeVirtRegInterval(Reg);
}
for (auto &&R : Uses) {
- unsigned Reg = R.first;
- if (Register::isPhysicalRegister(Reg))
+ Register Reg = R.first;
+ if (Reg.isPhysical())
continue;
LIS->removeInterval(Reg);
LIS->createAndComputeVirtRegInterval(Reg);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index a2e802009d09..0398d27756db 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -7,17 +7,14 @@
//==-----------------------------------------------------------------------===//
#include "SIFrameLowering.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-
+#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -112,15 +109,19 @@ static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
// 3: There's no free lane to spill, and no free register to save FP/BP,
// so we're forced to spill another VGPR to use for the spill.
FrameIndex = NewFI;
+
+ LLVM_DEBUG(
+ auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+ dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to "
+ << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';);
} else {
+ // Remove dead <NewFI> index
+ MF.getFrameInfo().RemoveStackObject(NewFI);
// 4: If all else fails, spill the FP/BP to memory.
FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4));
+ LLVM_DEBUG(dbgs() << "Reserved FI " << FrameIndex << " for spilling "
+ << (IsFP ? "FP" : "BP") << '\n');
}
-
- LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
- dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to "
- << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
- << '\n';);
} else {
LLVM_DEBUG(dbgs() << "Saving " << (IsFP ? "FP" : "BP") << " with copy to "
<< printReg(TempSGPR, TRI) << '\n');
@@ -130,7 +131,8 @@ static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
// We need to specially emit stack operations here because a different frame
// register is used than in the rest of the function, as getFrameRegister would
// use.
-static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
+static void buildPrologSpill(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
+ MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const SIInstrInfo *TII, Register SpillReg,
Register ScratchRsrcReg, Register SPReg, int FI) {
@@ -143,7 +145,19 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
MFI.getObjectAlign(FI));
- if (isUInt<12>(Offset)) {
+ if (ST.enableFlatScratch()) {
+ if (TII->isLegalFLATOffset(Offset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_STORE_DWORD_SADDR))
+ .addReg(SpillReg, RegState::Kill)
+ .addReg(SPReg)
+ .addImm(Offset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // dlc
+ .addMemOperand(MMO);
+ return;
+ }
+ } else if (SIInstrInfo::isLegalMUBUFImmOffset(Offset)) {
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
.addReg(SpillReg, RegState::Kill)
.addReg(ScratchRsrcReg)
@@ -158,27 +172,52 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
return;
}
- MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
- MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
+ // Don't clobber the TmpVGPR if we also need a scratch reg for the stack
+ // offset in the spill.
+ LiveRegs.addReg(SpillReg);
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
- .addImm(Offset);
+ if (ST.enableFlatScratch()) {
+ MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
+ MF->getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0RegClass);
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
- .addReg(SpillReg, RegState::Kill)
- .addReg(OffsetReg, RegState::Kill)
- .addReg(ScratchRsrcReg)
- .addReg(SPReg)
- .addImm(0)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .addImm(0) // dlc
- .addImm(0) // swz
- .addMemOperand(MMO);
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), OffsetReg)
+ .addReg(SPReg)
+ .addImm(Offset);
+
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_STORE_DWORD_SADDR))
+ .addReg(SpillReg, RegState::Kill)
+ .addReg(OffsetReg, RegState::Kill)
+ .addImm(0)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // dlc
+ .addMemOperand(MMO);
+ } else {
+ MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
+ MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
+
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
+ .addImm(Offset);
+
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
+ .addReg(SpillReg, RegState::Kill)
+ .addReg(OffsetReg, RegState::Kill)
+ .addReg(ScratchRsrcReg)
+ .addReg(SPReg)
+ .addImm(0)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addImm(0) // dlc
+ .addImm(0) // swz
+ .addMemOperand(MMO);
+ }
+
+ LiveRegs.removeReg(SpillReg);
}
-static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
+static void buildEpilogReload(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
+ MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const SIInstrInfo *TII, Register SpillReg,
Register ScratchRsrcReg, Register SPReg, int FI) {
@@ -190,7 +229,36 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
MFI.getObjectAlign(FI));
- if (isUInt<12>(Offset)) {
+ if (ST.enableFlatScratch()) {
+ if (TII->isLegalFLATOffset(Offset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
+ BuildMI(MBB, I, DebugLoc(),
+ TII->get(AMDGPU::SCRATCH_LOAD_DWORD_SADDR), SpillReg)
+ .addReg(SPReg)
+ .addImm(Offset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // dlc
+ .addMemOperand(MMO);
+ return;
+ }
+ MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
+ MF->getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0RegClass);
+
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), OffsetReg)
+ .addReg(SPReg)
+ .addImm(Offset);
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_LOAD_DWORD_SADDR),
+ SpillReg)
+ .addReg(OffsetReg, RegState::Kill)
+ .addImm(0)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // dlc
+ .addMemOperand(MMO);
+ return;
+ }
+
+ if (SIInstrInfo::isLegalMUBUFImmOffset(Offset)) {
BuildMI(MBB, I, DebugLoc(),
TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg)
.addReg(ScratchRsrcReg)
@@ -225,6 +293,31 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
.addMemOperand(MMO);
}
+static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, const SIInstrInfo *TII,
+ Register TargetReg) {
+ MachineFunction *MF = MBB.getParent();
+ const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+ const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
+ Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
+ Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);
+
+ if (MFI->getGITPtrHigh() != 0xffffffff) {
+ BuildMI(MBB, I, DL, SMovB32, TargetHi)
+ .addImm(MFI->getGITPtrHigh())
+ .addReg(TargetReg, RegState::ImplicitDefine);
+ } else {
+ const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
+ BuildMI(MBB, I, DL, GetPC64, TargetReg);
+ }
+ Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
+ MF->getRegInfo().addLiveIn(GitPtrLo);
+ MBB.addLiveIn(GitPtrLo);
+ BuildMI(MBB, I, DL, SMovB32, TargetLo)
+ .addReg(GitPtrLo);
+}
+
// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
@@ -244,15 +337,74 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
// pointer. Because we only detect if flat instructions are used at all,
// this will be used more often than necessary on VI.
- Register FlatScratchInitReg =
- MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
+ Register FlatScrInitLo;
+ Register FlatScrInitHi;
- MachineRegisterInfo &MRI = MF.getRegInfo();
- MRI.addLiveIn(FlatScratchInitReg);
- MBB.addLiveIn(FlatScratchInitReg);
+ if (ST.isAmdPalOS()) {
+ // Extract the scratch offset from the descriptor in the GIT
+ LivePhysRegs LiveRegs;
+ LiveRegs.init(*TRI);
+ LiveRegs.addLiveIns(MBB);
+
+ // Find unused reg to load flat scratch init into
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ Register FlatScrInit = AMDGPU::NoRegister;
+ ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
+ unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
+ AllSGPR64s = AllSGPR64s.slice(
+ std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
+ Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
+ for (MCPhysReg Reg : AllSGPR64s) {
+ if (LiveRegs.available(MRI, Reg) && MRI.isAllocatable(Reg) &&
+ !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
+ FlatScrInit = Reg;
+ break;
+ }
+ }
+ assert(FlatScrInit && "Failed to find free register for scratch init");
+
+ FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
+ FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);
- Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
- Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
+ buildGitPtr(MBB, I, DL, TII, FlatScrInit);
+
+ // We now have the GIT ptr - now get the scratch descriptor from the entry
+ // at offset 0 (or offset 16 for a compute shader).
+ MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+ const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
+ auto *MMO = MF.getMachineMemOperand(
+ PtrInfo,
+ MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
+ MachineMemOperand::MODereferenceable,
+ 8, Align(4));
+ unsigned Offset =
+ MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
+ const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
+ unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
+ BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
+ .addReg(FlatScrInit)
+ .addImm(EncodedOffset) // offset
+ .addImm(0) // glc
+ .addImm(0) // dlc
+ .addMemOperand(MMO);
+
+ // Mask the offset in [47:0] of the descriptor
+ const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
+ BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
+ .addReg(FlatScrInitHi)
+ .addImm(0xffff);
+ } else {
+ Register FlatScratchInitReg =
+ MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
+ assert(FlatScratchInitReg);
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MRI.addLiveIn(FlatScratchInitReg);
+ MBB.addLiveIn(FlatScratchInitReg);
+
+ FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+ FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
+ }
// Do a 64-bit pointer add.
if (ST.flatScratchIsPointer()) {
@@ -274,6 +426,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
return;
}
+ // For GFX9.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
.addReg(FlatScrInitLo)
.addReg(ScratchWaveOffsetReg);
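As a rough standalone illustration of the carry handling in this hunk (plain C++, not MIR; the helper name and the explicit carry variable are invented, and the matching high-half add lives in the surrounding context), the flat scratch base is a 32-bit low add whose carry-out feeds the high add, the same shape as the s_add_u32 / s_addc_u32 pair:

#include <cstdint>

// Sketch only: low add produces a carry (SCC) consumed by the high add.
static uint64_t addScratchBase(uint32_t BaseLo, uint32_t BaseHi,
                               uint32_t WaveOffset) {
  uint64_t LoSum = static_cast<uint64_t>(BaseLo) + WaveOffset; // s_add_u32
  uint32_t Carry = static_cast<uint32_t>(LoSum >> 32);         // SCC
  uint32_t NewHi = BaseHi + Carry;                             // s_addc_u32 with 0
  return (static_cast<uint64_t>(NewHi) << 32) |
         static_cast<uint32_t>(LoSum);
}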
@@ -284,7 +437,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
return;
}
- assert(ST.getGeneration() < AMDGPUSubtarget::GFX10);
+ assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);
// Copy the size in bytes.
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
@@ -302,6 +455,18 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
.addImm(8);
}
+// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
+// memory. They should have been removed by now.
+static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
+ for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
+ I != E; ++I) {
+ if (!MFI.isDeadObjectIndex(I))
+ return false;
+ }
+
+ return true;
+}
+
// Shift down registers reserved for the scratch RSRC.
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
MachineFunction &MF) const {
@@ -316,7 +481,8 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
Register ScratchRsrcReg = MFI->getScratchRSrcReg();
- if (!ScratchRsrcReg || !MRI.isPhysRegUsed(ScratchRsrcReg))
+ if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
+ allStackObjectsAreDead(MF.getFrameInfo())))
return Register();
if (ST.hasSGPRInitBug() ||
@@ -354,6 +520,10 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
return ScratchRsrcReg;
}
+static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
+ return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
+}
+
void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
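A minimal sketch of what the new getScratchScaleFactor distinction means for the stack-pointer immediates used below (standalone C++; the function and parameter names here are illustrative, not the in-tree ones): with flat scratch the SP counts per-lane bytes, while MUBUF scratch pre-scales by the wavefront size because the hardware swizzles lanes.

#include <cstdint>

static uint64_t spIncrementBytes(uint64_t FrameBytesPerLane,
                                 bool UseFlatScratch,
                                 unsigned WavefrontSize) {
  unsigned Scale = UseFlatScratch ? 1u : WavefrontSize;
  return FrameBytesPerLane * Scale;
}

// e.g. a 64-byte per-lane frame: MUBUF on wave64 bumps SP by 4096,
// flat scratch bumps SP by 64.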
@@ -390,7 +560,9 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
//
// This will return `Register()` in cases where there are no actual
// uses of the SRSRC.
- Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
+ Register ScratchRsrcReg;
+ if (!ST.enableFlatScratch())
+ ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
// Make the selected register live throughout the function.
if (ScratchRsrcReg) {
@@ -446,11 +618,11 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
}
assert(ScratchWaveOffsetReg);
- if (MF.getFrameInfo().hasCalls()) {
+ if (requiresStackPointerReference(MF)) {
Register SPReg = MFI->getStackPtrOffsetReg();
assert(SPReg != AMDGPU::SP_REG);
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
- .addImm(MF.getFrameInfo().getStackSize() * ST.getWavefrontSize());
+ .addImm(MF.getFrameInfo().getStackSize() * getScratchScaleFactor(ST));
}
if (hasFP(MF)) {
@@ -490,26 +662,9 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
if (ST.isAmdPalOS()) {
// The pointer to the GIT is formed from the offset passed in and either
// the amdgpu-git-ptr-high function attribute or the top part of the PC
- Register RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
- Register RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
- const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
-
- if (MFI->getGITPtrHigh() != 0xffffffff) {
- BuildMI(MBB, I, DL, SMovB32, RsrcHi)
- .addImm(MFI->getGITPtrHigh())
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
- } else {
- const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
- BuildMI(MBB, I, DL, GetPC64, Rsrc01);
- }
- Register GitPtrLo = MFI->getGITPtrLoReg(MF);
- MF.getRegInfo().addLiveIn(GitPtrLo);
- MBB.addLiveIn(GitPtrLo);
- BuildMI(MBB, I, DL, SMovB32, RsrcLo)
- .addReg(GitPtrLo)
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ buildGitPtr(MBB, I, DL, TII, Rsrc01);
// We now have the GIT ptr - now get the scratch descriptor from the entry
// at offset 0 (or offset 16 for a compute shader).
@@ -629,7 +784,7 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
case TargetStackID::NoAlloc:
case TargetStackID::SGPRSpill:
return true;
- case TargetStackID::SVEVector:
+ case TargetStackID::ScalableVector:
return false;
}
llvm_unreachable("Invalid TargetStackID::Value");
@@ -769,7 +924,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
if (!ScratchExecCopy)
ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
- buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
+ buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, Reg.VGPR,
FuncInfo->getScratchRSrcReg(),
StackPtrReg,
Reg.FI.getValue());
@@ -787,7 +942,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
.addReg(FramePtrReg);
- buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
+ buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR,
FuncInfo->getScratchRSrcReg(), StackPtrReg,
FuncInfo->FramePointerSaveIndex.getValue());
}
@@ -804,7 +959,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
.addReg(BasePtrReg);
- buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
+ buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR,
FuncInfo->getScratchRSrcReg(), StackPtrReg,
*FuncInfo->BasePointerSaveIndex);
}
@@ -830,8 +985,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
// Save FP before setting it up.
// FIXME: This should respect spillSGPRToVGPR;
- BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
- Spill[0].VGPR)
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR)
.addReg(FramePtrReg)
.addImm(Spill[0].Lane)
.addReg(Spill[0].VGPR, RegState::Undef);
@@ -849,8 +1003,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
// Save BP before setting it up.
// FIXME: This should respect spillSGPRToVGPR;
- BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
- Spill[0].VGPR)
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR)
.addReg(BasePtrReg)
.addImm(Spill[0].Lane)
.addReg(Spill[0].VGPR, RegState::Undef);
@@ -877,11 +1030,11 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
// s_and_b32 s32, tmp_reg, 0b111...0000
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
.addReg(StackPtrReg)
- .addImm((Alignment - 1) * ST.getWavefrontSize())
+ .addImm((Alignment - 1) * getScratchScaleFactor(ST))
.setMIFlag(MachineInstr::FrameSetup);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
.addReg(ScratchSPReg, RegState::Kill)
- .addImm(-Alignment * ST.getWavefrontSize())
+ .addImm(-Alignment * getScratchScaleFactor(ST))
.setMIFlag(MachineInstr::FrameSetup);
FuncInfo->setIsStackRealigned(true);
} else if ((HasFP = hasFP(MF))) {
@@ -903,7 +1056,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
if (HasFP && RoundedSize != 0) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
.addReg(StackPtrReg)
- .addImm(RoundedSize * ST.getWavefrontSize())
+ .addImm(RoundedSize * getScratchScaleFactor(ST))
.setMIFlag(MachineInstr::FrameSetup);
}
@@ -965,7 +1118,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
if (RoundedSize != 0 && hasFP(MF)) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
.addReg(StackPtrReg)
- .addImm(RoundedSize * ST.getWavefrontSize())
+ .addImm(RoundedSize * getScratchScaleFactor(ST))
.setMIFlag(MachineInstr::FrameDestroy);
}
@@ -991,7 +1144,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
- buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
+ buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TempVGPR,
FuncInfo->getScratchRSrcReg(), StackPtrReg, FI);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg)
.addReg(TempVGPR, RegState::Kill);
@@ -1001,8 +1154,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
FuncInfo->getSGPRToVGPRSpills(FI);
assert(Spill.size() == 1);
- BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
- FramePtrReg)
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), FramePtrReg)
.addReg(Spill[0].VGPR)
.addImm(Spill[0].Lane);
}
@@ -1017,7 +1169,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
- buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
+ buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TempVGPR,
FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg)
.addReg(TempVGPR, RegState::Kill);
@@ -1027,8 +1179,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
assert(Spill.size() == 1);
- BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
- BasePtrReg)
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), BasePtrReg)
.addReg(Spill[0].VGPR)
.addImm(Spill[0].Lane);
}
@@ -1042,7 +1193,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
if (!ScratchExecCopy)
ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
- buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
+ buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, Reg.VGPR,
FuncInfo->getScratchRSrcReg(), StackPtrReg,
Reg.FI.getValue());
}
@@ -1056,28 +1207,16 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
}
}
-// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
-// memory. They should have been removed by now.
-static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
- for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
- I != E; ++I) {
- if (!MFI.isDeadObjectIndex(I))
- return false;
- }
-
- return true;
-}
-
#ifndef NDEBUG
-static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
- Optional<int> FramePointerSaveIndex,
- Optional<int> BasePointerSaveIndex) {
+static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
I != E; ++I) {
if (!MFI.isDeadObjectIndex(I) &&
MFI.getStackID(I) == TargetStackID::SGPRSpill &&
- ((FramePointerSaveIndex && I != FramePointerSaveIndex) ||
- (BasePointerSaveIndex && I != BasePointerSaveIndex))) {
+ (I != FuncInfo->FramePointerSaveIndex &&
+ I != FuncInfo->BasePointerSaveIndex)) {
return false;
}
}
@@ -1086,12 +1225,13 @@ static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
}
#endif
-int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
- Register &FrameReg) const {
+StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+ int FI,
+ Register &FrameReg) const {
const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
FrameReg = RI->getFrameRegister(MF);
- return MF.getFrameInfo().getObjectOffset(FI);
+ return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI));
}
void SIFrameLowering::processFunctionBeforeFrameFinalized(
@@ -1104,7 +1244,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
FuncInfo->removeDeadFrameIndices(MFI);
- assert(allSGPRSpillsAreDead(MFI, None, None) &&
+ assert(allSGPRSpillsAreDead(MF) &&
"SGPR spill should have been removed in SILowerSGPRSpills");
// FIXME: The other checks should be redundant with allStackObjectsAreDead,
@@ -1163,6 +1303,8 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
LiveRegs.init(*TRI);
if (WillHaveFP || hasFP(MF)) {
+ assert(!MFI->SGPRForFPSaveRestoreCopy && !MFI->FramePointerSaveIndex &&
+ "Re-reserving spill slot for FP");
getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForFPSaveRestoreCopy,
MFI->FramePointerSaveIndex, true);
}
@@ -1170,6 +1312,9 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (TRI->hasBasePointer(MF)) {
if (MFI->SGPRForFPSaveRestoreCopy)
LiveRegs.addReg(MFI->SGPRForFPSaveRestoreCopy);
+
+ assert(!MFI->SGPRForBPSaveRestoreCopy &&
+ !MFI->BasePointerSaveIndex && "Re-reserving spill slot for BP");
getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy,
MFI->BasePointerSaveIndex, false);
}
@@ -1188,7 +1333,21 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
// The SP is specifically managed and we don't want extra spills of it.
SavedRegs.reset(MFI->getStackPtrOffsetReg());
+
+ const BitVector AllSavedRegs = SavedRegs;
SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
+
+ // If clearing VGPRs changed the mask, we will have some CSR VGPR spills.
+ const bool HaveAnyCSRVGPR = SavedRegs != AllSavedRegs;
+
+ // We have to anticipate introducing CSR VGPR spills if we don't have any
+ // stack objects already, since we require an FP if there is a call and stack.
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ const bool WillHaveFP = FrameInfo.hasCalls() && HaveAnyCSRVGPR;
+
+ // FP will be specially managed like SP.
+ if (WillHaveFP || hasFP(MF))
+ SavedRegs.reset(MFI->getFrameOffsetReg());
}
bool SIFrameLowering::assignCalleeSavedSpillSlots(
@@ -1253,7 +1412,7 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
BuildMI(MBB, I, DL, TII->get(Op), SPReg)
.addReg(SPReg)
- .addImm(Amount * ST.getWavefrontSize());
+ .addImm(Amount * getScratchScaleFactor(ST));
} else if (CalleePopAmount != 0) {
llvm_unreachable("is this used?");
}
@@ -1261,6 +1420,20 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
return MBB.erase(I);
}
+/// Returns true if the frame will require a reference to the stack pointer.
+///
+/// This is the set of conditions common to setting up the stack pointer in a
+/// kernel, and for using a frame pointer in a callable function.
+///
+/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
+/// references SP.
+static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
+ return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
+}
+
+// The FP for kernels is always known 0, so we never really need to setup an
+// explicit register for it. However, DisableFramePointerElim will force us to
+// use a register for it.
bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -1276,8 +1449,31 @@ bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
return MFI.getStackSize() != 0;
}
- return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
- MFI.hasStackMap() || MFI.hasPatchPoint() ||
+ return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->needsStackRealignment(MF) ||
MF.getTarget().Options.DisableFramePointerElim(MF);
}
+
+// This is essentially a reduced version of hasFP for entry functions. Since the
+// stack pointer is known 0 on entry to kernels, we never really need an FP
+// register. We may need to initialize the stack pointer depending on the frame
+// properties, which logically overlaps many of the cases where an ordinary
+// function would require an FP.
+bool SIFrameLowering::requiresStackPointerReference(
+ const MachineFunction &MF) const {
+ // Callable functions always require a stack pointer reference.
+ assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
+ "only expected to call this for entry points");
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // Entry points ordinarily don't need to initialize SP. We have to set it up
+ // for callees if there are any. Also note tail calls are impossible/don't
+ // make any sense for kernels.
+ if (MFI.hasCalls())
+ return true;
+
+ // We still need to initialize the SP if we're doing anything weird that
+ // references the SP, like variable sized stack objects.
+ return frameTriviallyRequiresSP(MFI);
+}
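A condensed restatement of the SP/FP split this file now encodes, as a standalone sketch (struct fields and function names are invented, and the call-plus-CSR-spill and entry-function stack-size cases handled earlier in hasFP are deliberately omitted): entry functions only initialize SP when the frame forces it, while callable functions fold the same trivial conditions into the FP decision.

struct FrameSummary {
  bool HasCalls;
  bool HasVarSizedObjects;
  bool HasStackMapOrPatchPoint;
  bool FrameAddressTaken;
  bool NeedsRealignment;
  bool FPElimDisabled;
};

// Mirrors frameTriviallyRequiresSP.
static bool trivialStackPointerNeed(const FrameSummary &F) {
  return F.HasVarSizedObjects || F.HasStackMapOrPatchPoint;
}

// Mirrors requiresStackPointerReference for entry points.
static bool entryNeedsSP(const FrameSummary &F) {
  return F.HasCalls || trivialStackPointerNeed(F);
}

// Mirrors the simplified tail of hasFP for callable functions.
static bool callableNeedsFP(const FrameSummary &F) {
  return trivialStackPointerNeed(F) || F.FrameAddressTaken ||
         F.NeedsRealignment || F.FPElimDisabled;
}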
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index e89432040661..951ea79b2809 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -31,8 +31,8 @@ public:
MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const override;
- int getFrameIndexReference(const MachineFunction &MF, int FI,
- Register &FrameReg) const override;
+ StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const override;
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS = nullptr) const override;
@@ -71,6 +71,8 @@ private:
public:
bool hasFP(const MachineFunction &MF) const override;
+
+ bool requiresStackPointerReference(const MachineFunction &MF) const;
};
} // end namespace llvm
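The header change above is the signature side of the StackOffset migration. A minimal sketch of the value shape involved, assuming only that StackOffset carries separate fixed and scalable parts (the struct here is illustrative, not LLVM's class); AMDGPU frames are purely fixed, so the .cpp change simply wraps the old integer offset.

#include <cstdint>

struct SimpleStackOffset {
  int64_t Fixed = 0;
  int64_t Scalable = 0;
  static SimpleStackOffset getFixed(int64_t F) { return {F, 0}; }
};

static SimpleStackOffset frameIndexOffset(int64_t ObjectOffset) {
  // Mirrors StackOffset::getFixed(MFI.getObjectOffset(FI)) in the patch.
  return SimpleStackOffset::getFixed(ObjectOffset);
}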
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d035aa8f72bd..839437b5e3f8 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13,72 +13,21 @@
#include "SIISelLowering.h"
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
+#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIDefines.h"
-#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/Analysis.h"
-#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/CodeGen/DAGCombine.h"
-#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/CodeGen/SelectionDAGNodes.h"
-#include "llvm/CodeGen/TargetCallingConv.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Type.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CodeGen.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
-#include "llvm/Support/MachineValueType.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetOptions.h"
-#include <cassert>
-#include <cmath>
-#include <cstdint>
-#include <iterator>
-#include <tuple>
-#include <utility>
-#include <vector>
using namespace llvm;
@@ -449,6 +398,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->has16BitInsts()) {
setOperationAction(ISD::FPOW, MVT::f16, Promote);
+ setOperationAction(ISD::FPOWI, MVT::f16, Promote);
setOperationAction(ISD::FLOG, MVT::f16, Custom);
setOperationAction(ISD::FEXP, MVT::f16, Custom);
setOperationAction(ISD::FLOG10, MVT::f16, Custom);
@@ -486,6 +436,19 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->hasBFE())
setHasExtractBitsInsn(true);
+ // Clamp modifier on add/sub
+ if (Subtarget->hasIntClamp()) {
+ setOperationAction(ISD::UADDSAT, MVT::i32, Legal);
+ setOperationAction(ISD::USUBSAT, MVT::i32, Legal);
+ }
+
+ if (Subtarget->hasAddNoCarry()) {
+ setOperationAction(ISD::SADDSAT, MVT::i16, Legal);
+ setOperationAction(ISD::SSUBSAT, MVT::i16, Legal);
+ setOperationAction(ISD::SADDSAT, MVT::i32, Legal);
+ setOperationAction(ISD::SSUBSAT, MVT::i32, Legal);
+ }
+
setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
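For reference, a standalone sketch of the saturating semantics behind the ISD::UADDSAT / ISD::USUBSAT nodes marked Legal above, i.e. the clamp-to-range behavior the hardware clamp modifier provides on add/sub (16-bit lanes shown; function names are invented for the sketch):

#include <cstdint>
#include <limits>

static uint16_t uaddsat16(uint16_t A, uint16_t B) {
  uint32_t Sum = static_cast<uint32_t>(A) + B;
  return Sum > std::numeric_limits<uint16_t>::max()
             ? std::numeric_limits<uint16_t>::max()
             : static_cast<uint16_t>(Sum);
}

static uint16_t usubsat16(uint16_t A, uint16_t B) {
  return A > B ? static_cast<uint16_t>(A - B) : 0;
}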
@@ -531,13 +494,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
- setOperationAction(ISD::ROTR, MVT::i16, Promote);
- setOperationAction(ISD::ROTL, MVT::i16, Promote);
+ setOperationAction(ISD::ROTR, MVT::i16, Expand);
+ setOperationAction(ISD::ROTL, MVT::i16, Expand);
setOperationAction(ISD::SDIV, MVT::i16, Promote);
setOperationAction(ISD::UDIV, MVT::i16, Promote);
setOperationAction(ISD::SREM, MVT::i16, Promote);
setOperationAction(ISD::UREM, MVT::i16, Promote);
+ setOperationAction(ISD::UADDSAT, MVT::i16, Legal);
+ setOperationAction(ISD::USUBSAT, MVT::i16, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
@@ -702,6 +667,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
+ setOperationAction(ISD::UADDSAT, MVT::v2i16, Legal);
+ setOperationAction(ISD::USUBSAT, MVT::v2i16, Legal);
+ setOperationAction(ISD::SADDSAT, MVT::v2i16, Legal);
+ setOperationAction(ISD::SSUBSAT, MVT::v2i16, Legal);
+
setOperationAction(ISD::FADD, MVT::v2f16, Legal);
setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
setOperationAction(ISD::FMA, MVT::v2f16, Legal);
@@ -729,6 +699,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
setOperationAction(ISD::UMAX, MVT::v4i16, Custom);
+ setOperationAction(ISD::UADDSAT, MVT::v4i16, Custom);
+ setOperationAction(ISD::SADDSAT, MVT::v4i16, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v4i16, Custom);
+ setOperationAction(ISD::SSUBSAT, MVT::v4i16, Custom);
+
setOperationAction(ISD::FADD, MVT::v4f16, Custom);
setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
setOperationAction(ISD::FMA, MVT::v4f16, Custom);
@@ -779,6 +754,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v3f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v3i16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
@@ -790,6 +767,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::v3i16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::v3f16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v4i16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::f16, Custom);
@@ -844,6 +823,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);
+ setTargetDAGCombine(ISD::INTRINSIC_VOID);
+ setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
// FIXME: In other contexts we pretend this is a per-function property.
setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
@@ -888,15 +869,18 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
if (VT.isVector()) {
EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();
- if (Size == 32)
- return ScalarVT.getSimpleVT();
+ if (Size == 16) {
+ if (Subtarget->has16BitInsts())
+ return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
+ return VT.isInteger() ? MVT::i32 : MVT::f32;
+ }
- if (Size > 32)
- return MVT::i32;
+ if (Size < 16)
+ return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
+ return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
+ }
- if (Size == 16 && Subtarget->has16BitInsts())
- return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
- } else if (VT.getSizeInBits() > 32)
+ if (VT.getSizeInBits() > 32)
return MVT::i32;
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
@@ -913,14 +897,15 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();
- if (Size == 32)
+ // FIXME: Should probably promote 8-bit vectors to i16.
+ if (Size == 16 && Subtarget->has16BitInsts())
+ return (NumElts + 1) / 2;
+
+ if (Size <= 32)
return NumElts;
if (Size > 32)
return NumElts * ((Size + 31) / 32);
-
- if (Size == 16 && Subtarget->has16BitInsts())
- return (NumElts + 1) / 2;
} else if (VT.getSizeInBits() > 32)
return (VT.getSizeInBits() + 31) / 32;
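A small arithmetic sketch of the register-count rule in the hunk above, assuming only the (NumElts + 1) / 2 pairing visible in the patch (names are illustrative): 16-bit elements share v2i16/v2f16 registers in pairs, so odd element counts round up, and targets without 16-bit instructions keep one 32-bit register per element.

static unsigned numRegsFor16BitVector(unsigned NumElts, bool Has16BitInsts) {
  if (Has16BitInsts)
    return (NumElts + 1) / 2; // v3i16 -> 2 registers, v4i16 -> 2 registers
  return NumElts;             // each element widened to a 32-bit register
}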
@@ -935,6 +920,16 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
unsigned NumElts = VT.getVectorNumElements();
EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();
+ // FIXME: We should fix the ABI to be the same on targets without 16-bit
+  // support, but unless we can properly handle 3-vectors, it will still be
+ // inconsistent.
+ if (Size == 16 && Subtarget->has16BitInsts()) {
+ RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
+ IntermediateVT = RegisterVT;
+ NumIntermediates = (NumElts + 1) / 2;
+ return NumIntermediates;
+ }
+
if (Size == 32) {
RegisterVT = ScalarVT.getSimpleVT();
IntermediateVT = RegisterVT;
@@ -942,20 +937,26 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
return NumIntermediates;
}
- if (Size > 32) {
+ if (Size < 16 && Subtarget->has16BitInsts()) {
+ // FIXME: Should probably form v2i16 pieces
+ RegisterVT = MVT::i16;
+ IntermediateVT = ScalarVT;
+ NumIntermediates = NumElts;
+ return NumIntermediates;
+ }
+
+
+ if (Size != 16 && Size <= 32) {
RegisterVT = MVT::i32;
- IntermediateVT = RegisterVT;
- NumIntermediates = NumElts * ((Size + 31) / 32);
+ IntermediateVT = ScalarVT;
+ NumIntermediates = NumElts;
return NumIntermediates;
}
- // FIXME: We should fix the ABI to be the same on targets without 16-bit
- // support, but unless we can properly handle 3-vectors, it will be still be
- // inconsistent.
- if (Size == 16 && Subtarget->has16BitInsts()) {
- RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
+ if (Size > 32) {
+ RegisterVT = MVT::i32;
IntermediateVT = RegisterVT;
- NumIntermediates = (NumElts + 1) / 2;
+ NumIntermediates = NumElts * ((Size + 31) / 32);
return NumIntermediates;
}
}
@@ -1007,14 +1008,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
if (RsrcIntr->IsImage) {
- Info.ptrVal = MFI->getImagePSV(
- *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
- CI.getArgOperand(RsrcIntr->RsrcArg));
+ Info.ptrVal =
+ MFI->getImagePSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
Info.align.reset();
} else {
- Info.ptrVal = MFI->getBufferPSV(
- *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
- CI.getArgOperand(RsrcIntr->RsrcArg));
+ Info.ptrVal =
+ MFI->getBufferPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
}
Info.flags = MachineMemOperand::MODereferenceable;
@@ -1056,8 +1055,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOStore;
} else {
// Atomic
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(CI.getType());
+ Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
+ ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
Info.flags = MachineMemOperand::MOLoad |
MachineMemOperand::MOStore |
MachineMemOperand::MODereferenceable;
@@ -1091,11 +1091,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_buffer_atomic_fadd: {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- Info.opc = ISD::INTRINSIC_VOID;
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
- Info.ptrVal = MFI->getBufferPSV(
- *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
- CI.getArgOperand(1));
+ Info.ptrVal =
+ MFI->getBufferPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
Info.align.reset();
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
@@ -1105,16 +1104,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
- case Intrinsic::amdgcn_global_atomic_fadd: {
- Info.opc = ISD::INTRINSIC_VOID;
- Info.memVT = MVT::getVT(CI.getOperand(0)->getType()
- ->getPointerElementType());
- Info.ptrVal = CI.getOperand(0);
- Info.align.reset();
- Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
-
- return true;
- }
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -1136,10 +1125,31 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.align.reset();
Info.flags = MachineMemOperand::MOLoad |
MachineMemOperand::MOStore |
+ MachineMemOperand::MOVolatile;
+ return true;
+ }
+ case Intrinsic::amdgcn_global_atomic_fadd: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType());
+ Info.ptrVal = CI.getOperand(0);
+ Info.align.reset();
+ Info.flags = MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore |
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOVolatile;
return true;
}
+ case Intrinsic::amdgcn_image_bvh_intersect_ray: {
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
+ Info.ptrVal =
+ MFI->getImagePSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
+ Info.align.reset();
+ Info.flags = MachineMemOperand::MOLoad |
+ MachineMemOperand::MODereferenceable;
+ return true;
+ }
case Intrinsic::amdgcn_ds_gws_init:
case Intrinsic::amdgcn_ds_gws_barrier:
case Intrinsic::amdgcn_ds_gws_sema_v:
@@ -1175,9 +1185,13 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_atomic_dec:
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
+ case Intrinsic::amdgcn_ds_append:
+ case Intrinsic::amdgcn_ds_consume:
case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
- case Intrinsic::amdgcn_ds_fmax: {
+ case Intrinsic::amdgcn_ds_fmax:
+ case Intrinsic::amdgcn_global_atomic_fadd:
+ case Intrinsic::amdgcn_global_atomic_csub: {
Value *Ptr = II->getArgOperand(0);
AccessTy = II->getType();
Ops.push_back(Ptr);
@@ -1234,7 +1248,7 @@ bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
// assume those use MUBUF instructions. Scratch loads / stores are currently
// implemented as mubuf instructions with offen bit set, so slightly
// different than the normal addr64.
- if (!isUInt<12>(AM.BaseOffs))
+ if (!SIInstrInfo::isLegalMUBUFImmOffset(AM.BaseOffs))
return false;
// FIXME: Since we can split immediate into soffset and immediate offset,
@@ -1355,37 +1369,77 @@ bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
}
bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
- unsigned Size, unsigned AddrSpace, unsigned Align,
+ unsigned Size, unsigned AddrSpace, Align Alignment,
MachineMemOperand::Flags Flags, bool *IsFast) const {
if (IsFast)
*IsFast = false;
if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
AddrSpace == AMDGPUAS::REGION_ADDRESS) {
- // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
- // aligned, 8 byte access in a single operation using ds_read2/write2_b32
- // with adjacent offsets.
- bool AlignedBy4 = (Align % 4 == 0);
+ // Check if alignment requirements for ds_read/write instructions are
+ // disabled.
+ if (Subtarget->hasUnalignedDSAccessEnabled() &&
+ !Subtarget->hasLDSMisalignedBug()) {
+ if (IsFast)
+ *IsFast = Alignment != Align(2);
+ return true;
+ }
+
+ if (Size == 64) {
+ // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
+ // aligned, 8 byte access in a single operation using ds_read2/write2_b32
+ // with adjacent offsets.
+ bool AlignedBy4 = Alignment >= Align(4);
+ if (IsFast)
+ *IsFast = AlignedBy4;
+
+ return AlignedBy4;
+ }
+ if (Size == 96) {
+ // ds_read/write_b96 require 16-byte alignment on gfx8 and older.
+ bool Aligned = Alignment >= Align(16);
+ if (IsFast)
+ *IsFast = Aligned;
+
+ return Aligned;
+ }
+ if (Size == 128) {
+ // ds_read/write_b128 require 16-byte alignment on gfx8 and older, but we
+      // can do an 8 byte aligned, 16 byte access in a single operation using
+ // ds_read2/write2_b64.
+ bool Aligned = Alignment >= Align(8);
+ if (IsFast)
+ *IsFast = Aligned;
+
+ return Aligned;
+ }
+ }
+
+ if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
+ bool AlignedBy4 = Alignment >= Align(4);
if (IsFast)
*IsFast = AlignedBy4;
- return AlignedBy4;
+ return AlignedBy4 ||
+ Subtarget->enableFlatScratch() ||
+ Subtarget->hasUnalignedScratchAccess();
}
// FIXME: We have to be conservative here and assume that flat operations
// will access scratch. If we had access to the IR function, then we
// could determine if any private memory was used in the function.
- if (!Subtarget->hasUnalignedScratchAccess() &&
- (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
- AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
- bool AlignedBy4 = Align >= 4;
+ if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
+ !Subtarget->hasUnalignedScratchAccess()) {
+ bool AlignedBy4 = Alignment >= Align(4);
if (IsFast)
*IsFast = AlignedBy4;
return AlignedBy4;
}
- if (Subtarget->hasUnalignedBufferAccess()) {
+ if (Subtarget->hasUnalignedBufferAccessEnabled() &&
+ !(AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+ AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
// If we have an uniform constant load, it still requires using a slow
// buffer instruction if unaligned.
if (IsFast) {
@@ -1393,7 +1447,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
// 2-byte alignment is worse than 1 unless doing a 2-byte access.
*IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
- Align >= 4 : Align != 2;
+ Alignment >= Align(4) : Alignment != Align(2);
}
return true;
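A distilled, standalone restatement of the LDS (ds_read/ds_write) alignment cases added above, ignoring the unaligned-access-enabled and LDS-misaligned-bug escape hatches (the helper name is invented; sizes in bits, alignment in bytes):

static bool ldsWideAccessIsAligned(unsigned SizeBits, unsigned AlignBytes) {
  // Only the wide sizes special-cased in the patch are modeled here.
  switch (SizeBits) {
  case 64:  return AlignBytes >= 4;  // split into ds_read2/write2_b32
  case 96:  return AlignBytes >= 16; // b96 needs 16 bytes on gfx8 and older
  case 128: return AlignBytes >= 8;  // split into ds_read2/write2_b64
  default:  return false;            // other sizes are not covered by this sketch
  }
}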
@@ -1409,12 +1463,12 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
if (IsFast)
*IsFast = true;
- return Size >= 32 && Align >= 4;
+ return Size >= 32 && Alignment >= Align(4);
}
bool SITargetLowering::allowsMisalignedMemoryAccesses(
- EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
- bool *IsFast) const {
+ EVT VT, unsigned AddrSpace, unsigned Alignment,
+ MachineMemOperand::Flags Flags, bool *IsFast) const {
if (IsFast)
*IsFast = false;
@@ -1428,7 +1482,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(
}
return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
- Align, Flags, IsFast);
+ Align(Alignment), Flags, IsFast);
}
EVT SITargetLowering::getOptimalMemOpType(
@@ -1449,11 +1503,6 @@ EVT SITargetLowering::getOptimalMemOpType(
return MVT::Other;
}
-bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
- unsigned DestAS) const {
- return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
-}
-
bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
const MemSDNode *MemNode = cast<MemSDNode>(N);
const Value *Ptr = MemNode->getMemOperand()->getValue();
@@ -1461,6 +1510,11 @@ bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
return I && I->getMetadata("amdgpu.noclobber");
}
+bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
+ return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
+ AS == AMDGPUAS::PRIVATE_ADDRESS;
+}
+
bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
// Flat -> private/local is a simple truncate.
@@ -1468,7 +1522,9 @@ bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
return true;
- return isNoopAddrSpaceCast(SrcAS, DestAS);
+ const GCNTargetMachine &TM =
+ static_cast<const GCNTargetMachine &>(getTargetMachine());
+ return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
}
bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
@@ -1537,7 +1593,7 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
- return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
+ return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Offset));
}
SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
@@ -1597,9 +1653,9 @@ SDValue SITargetLowering::lowerKernargMemParameter(
// TODO: If we passed in the base kernel offset we could have a better
// alignment than 4, but we don't really need it.
SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
- SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
+ SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant);
+ MachineMemOperand::MOInvariant);
SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
@@ -1682,12 +1738,11 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
}
-static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
- CallingConv::ID CallConv,
- ArrayRef<ISD::InputArg> Ins,
- BitVector &Skipped,
- FunctionType *FType,
- SIMachineFunctionInfo *Info) {
+static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
+ CallingConv::ID CallConv,
+ ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
+ FunctionType *FType,
+ SIMachineFunctionInfo *Info) {
for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
const ISD::InputArg *Arg = &Ins[I];
@@ -1895,26 +1950,26 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const {
if (Info.hasImplicitBufferPtr()) {
- unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
+ Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(ImplicitBufferPtrReg);
}
// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
if (Info.hasPrivateSegmentBuffer()) {
- unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
+ Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
CCInfo.AllocateReg(PrivateSegmentBufferReg);
}
if (Info.hasDispatchPtr()) {
- unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
+ Register DispatchPtrReg = Info.addDispatchPtr(TRI);
MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchPtrReg);
}
if (Info.hasQueuePtr()) {
- unsigned QueuePtrReg = Info.addQueuePtr(TRI);
+ Register QueuePtrReg = Info.addQueuePtr(TRI);
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
}
@@ -1929,13 +1984,13 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
}
if (Info.hasDispatchID()) {
- unsigned DispatchIDReg = Info.addDispatchID(TRI);
+ Register DispatchIDReg = Info.addDispatchID(TRI);
MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchIDReg);
}
- if (Info.hasFlatScratchInit()) {
- unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
+ if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
+ Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(FlatScratchInitReg);
}
@@ -1951,25 +2006,25 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
CallingConv::ID CallConv,
bool IsShader) const {
if (Info.hasWorkGroupIDX()) {
- unsigned Reg = Info.addWorkGroupIDX();
+ Register Reg = Info.addWorkGroupIDX();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info.hasWorkGroupIDY()) {
- unsigned Reg = Info.addWorkGroupIDY();
+ Register Reg = Info.addWorkGroupIDY();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info.hasWorkGroupIDZ()) {
- unsigned Reg = Info.addWorkGroupIDZ();
+ Register Reg = Info.addWorkGroupIDZ();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info.hasWorkGroupInfo()) {
- unsigned Reg = Info.addWorkGroupInfo();
+ Register Reg = Info.addWorkGroupInfo();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
@@ -2020,26 +2075,28 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// the scratch registers to pass in.
bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
- if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
- // If we have stack objects, we unquestionably need the private buffer
- // resource. For the Code Object V2 ABI, this will be the first 4 user
- // SGPR inputs. We can reserve those and use them directly.
+ if (!ST.enableFlatScratch()) {
+ if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
+ // If we have stack objects, we unquestionably need the private buffer
+ // resource. For the Code Object V2 ABI, this will be the first 4 user
+ // SGPR inputs. We can reserve those and use them directly.
- Register PrivateSegmentBufferReg =
- Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
- Info.setScratchRSrcReg(PrivateSegmentBufferReg);
- } else {
- unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
- // We tentatively reserve the last registers (skipping the last registers
- // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
- // we'll replace these with the ones immediately after those which were
- // really allocated. In the prologue copies will be inserted from the
- // argument to these reserved registers.
+ Register PrivateSegmentBufferReg =
+ Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
+ Info.setScratchRSrcReg(PrivateSegmentBufferReg);
+ } else {
+ unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
+ // We tentatively reserve the last registers (skipping the last registers
+ // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
+ // we'll replace these with the ones immediately after those which were
+ // really allocated. In the prologue copies will be inserted from the
+ // argument to these reserved registers.
- // Without HSA, relocations are used for the scratch pointer and the
- // buffer resource setup is always inserted in the prologue. Scratch wave
- // offset is still in an input SGPR.
- Info.setScratchRSrcReg(ReservedBufferReg);
+ // Without HSA, relocations are used for the scratch pointer and the
+ // buffer resource setup is always inserted in the prologue. Scratch wave
+ // offset is still in an input SGPR.
+ Info.setScratchRSrcReg(ReservedBufferReg);
+ }
}
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -2139,7 +2196,7 @@ SDValue SITargetLowering::LowerFormalArguments(
FunctionType *FType = MF.getFunction().getFunctionType();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
+ if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
DiagnosticInfoUnsupported NoGraphicsHSA(
Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
DAG.getContext()->diagnose(NoGraphicsHSA);
@@ -2152,12 +2209,21 @@ SDValue SITargetLowering::LowerFormalArguments(
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
- bool IsShader = AMDGPU::isShader(CallConv);
+ bool IsGraphics = AMDGPU::isGraphics(CallConv);
bool IsKernel = AMDGPU::isKernel(CallConv);
bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
- if (IsShader) {
- processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
+ if (IsGraphics) {
+ assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() &&
+ (!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) &&
+ !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
+ !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
+ !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
+ !Info->hasWorkItemIDZ());
+ }
+
+ if (CallConv == CallingConv::AMDGPU_PS) {
+ processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
// At least one interpolation mode must be enabled or else the GPU will
// hang.
@@ -2172,39 +2238,28 @@ SDValue SITargetLowering::LowerFormalArguments(
// - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
// - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
// enabled too.
- if (CallConv == CallingConv::AMDGPU_PS) {
- if ((Info->getPSInputAddr() & 0x7F) == 0 ||
- ((Info->getPSInputAddr() & 0xF) == 0 &&
- Info->isPSInputAllocated(11))) {
- CCInfo.AllocateReg(AMDGPU::VGPR0);
- CCInfo.AllocateReg(AMDGPU::VGPR1);
- Info->markPSInputAllocated(0);
- Info->markPSInputEnabled(0);
- }
- if (Subtarget->isAmdPalOS()) {
- // For isAmdPalOS, the user does not enable some bits after compilation
- // based on run-time states; the register values being generated here are
- // the final ones set in hardware. Therefore we need to apply the
- // workaround to PSInputAddr and PSInputEnable together. (The case where
- // a bit is set in PSInputAddr but not PSInputEnable is where the
- // frontend set up an input arg for a particular interpolation mode, but
- // nothing uses that input arg. Really we should have an earlier pass
- // that removes such an arg.)
- unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
- if ((PsInputBits & 0x7F) == 0 ||
- ((PsInputBits & 0xF) == 0 &&
- (PsInputBits >> 11 & 1)))
- Info->markPSInputEnabled(
- countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
- }
+ if ((Info->getPSInputAddr() & 0x7F) == 0 ||
+ ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
+ CCInfo.AllocateReg(AMDGPU::VGPR0);
+ CCInfo.AllocateReg(AMDGPU::VGPR1);
+ Info->markPSInputAllocated(0);
+ Info->markPSInputEnabled(0);
+ }
+ if (Subtarget->isAmdPalOS()) {
+ // For isAmdPalOS, the user does not enable some bits after compilation
+ // based on run-time states; the register values being generated here are
+ // the final ones set in hardware. Therefore we need to apply the
+ // workaround to PSInputAddr and PSInputEnable together. (The case where
+ // a bit is set in PSInputAddr but not PSInputEnable is where the
+ // frontend set up an input arg for a particular interpolation mode, but
+ // nothing uses that input arg. Really we should have an earlier pass
+ // that removes such an arg.)
+ unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
+ if ((PsInputBits & 0x7F) == 0 ||
+ ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
+ Info->markPSInputEnabled(
+ countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
}
-
- assert(!Info->hasDispatchPtr() &&
- !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
- !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
- !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
- !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
- !Info->hasWorkItemIDZ());
} else if (IsKernel) {
assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
} else {
@@ -2253,9 +2308,23 @@ SDValue SITargetLowering::LowerFormalArguments(
const uint64_t Offset = VA.getLocMemOffset();
Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
- SDValue Arg =
- lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset, Alignment,
- Ins[i].Flags.isSExt(), &Ins[i]);
+ if (Arg.Flags.isByRef()) {
+ SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
+
+ const GCNTargetMachine &TM =
+ static_cast<const GCNTargetMachine &>(getTargetMachine());
+ if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
+ Arg.Flags.getPointerAddrSpace())) {
+ Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
+ Arg.Flags.getPointerAddrSpace());
+ }
+
+ InVals.push_back(Ptr);
+ continue;
+ }
+
+ SDValue Arg = lowerKernargMemParameter(
+ DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
Chains.push_back(Arg.getValue(1));
auto *ParamTy =
@@ -2337,7 +2406,7 @@ SDValue SITargetLowering::LowerFormalArguments(
// Start adding system SGPRs.
if (IsEntryFunc) {
- allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
+ allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
} else {
CCInfo.AllocateReg(Info->getScratchRSrcReg());
allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
@@ -2820,7 +2889,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
report_fatal_error("unsupported libcall legalization");
if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
- !CLI.CB->getCalledFunction()) {
+ !CLI.CB->getCalledFunction() && CallConv != CallingConv::AMDGPU_Gfx) {
return lowerUnhandledCall(CLI, InVals,
"unsupported indirect call to function ");
}
@@ -2830,11 +2899,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
"unsupported required tail call to function ");
}
- if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
- // Note the issue is with the CC of the calling function, not of the call
+ if (AMDGPU::isShader(CallConv)) {
+ // Note the issue is with the CC of the called function, not of the call
// itself.
return lowerUnhandledCall(CLI, InVals,
- "unsupported call from graphics shader of function ");
+ "unsupported call to a shader function ");
+ }
+
+ if (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
+ CallConv != CallingConv::AMDGPU_Gfx) {
+ // Only allow calls with specific calling conventions.
+ return lowerUnhandledCall(CLI, InVals,
+ "unsupported calling convention for call from "
+ "graphics shader of function ");
}
if (IsTailCall) {
@@ -2865,7 +2942,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
- if (AMDGPUTargetMachine::EnableFixedFunctionABI) {
+ if (AMDGPUTargetMachine::EnableFixedFunctionABI &&
+ CallConv != CallingConv::AMDGPU_Gfx) {
// With a fixed ABI, allocate fixed registers before user arguments.
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
}
@@ -2894,14 +2972,16 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
if (!IsSibCall) {
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
- SmallVector<SDValue, 4> CopyFromChains;
+ if (!Subtarget->enableFlatScratch()) {
+ SmallVector<SDValue, 4> CopyFromChains;
- // In the HSA case, this should be an identity copy.
- SDValue ScratchRSrcReg
- = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
- RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
- CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
- Chain = DAG.getTokenFactor(DL, CopyFromChains);
+ // In the HSA case, this should be an identity copy.
+ SDValue ScratchRSrcReg
+ = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
+ RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
+ CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
+ Chain = DAG.getTokenFactor(DL, CopyFromChains);
+ }
}
MVT PtrVT = MVT::i32;
@@ -2992,14 +3072,15 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
MemOpChains.push_back(Cpy);
} else {
- SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo,
- Alignment ? Alignment->value() : 0);
+ SDValue Store =
+ DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
MemOpChains.push_back(Store);
}
}
}
- if (!AMDGPUTargetMachine::EnableFixedFunctionABI) {
+ if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
+ CallConv != CallingConv::AMDGPU_Gfx) {
// Copy special input registers after user input arguments.
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
}
@@ -3223,29 +3304,11 @@ Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
// If kill is not the last instruction, split the block so kill is always a
// proper terminator.
-MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
- MachineBasicBlock *BB) const {
+MachineBasicBlock *
+SITargetLowering::splitKillBlock(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
-
- MachineBasicBlock::iterator SplitPoint(&MI);
- ++SplitPoint;
-
- if (SplitPoint == BB->end()) {
- // Don't bother with a new block.
- MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
- return BB;
- }
-
- MachineFunction *MF = BB->getParent();
- MachineBasicBlock *SplitBB
- = MF->CreateMachineBasicBlock(BB->getBasicBlock());
-
- MF->insert(++MachineFunction::iterator(BB), SplitBB);
- SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
-
- SplitBB->transferSuccessorsAndUpdatePHIs(BB);
- BB->addSuccessor(SplitBB);
-
MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
return SplitBB;
}
@@ -3357,20 +3420,14 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
// will only do one iteration. In the worst case, this will loop 64 times.
//
// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
-static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
- const SIInstrInfo *TII,
- MachineRegisterInfo &MRI,
- MachineBasicBlock &OrigBB,
- MachineBasicBlock &LoopBB,
- const DebugLoc &DL,
- const MachineOperand &IdxReg,
- unsigned InitReg,
- unsigned ResultReg,
- unsigned PhiReg,
- unsigned InitSaveExecReg,
- int Offset,
- bool UseGPRIdxMode,
- bool IsIndirectSrc) {
+static MachineBasicBlock::iterator
+emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
+ MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
+ const DebugLoc &DL, const MachineOperand &Idx,
+ unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
+ unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
+ Register &SGPRIdxReg) {
+
MachineFunction *MF = OrigBB.getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
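A standalone sketch of the waterfall idiom this hunk refactors may help: the readfirstlane / compare / and_saveexec sequence visible in the next hunk processes one unique index value per iteration, so a uniform index finishes in a single pass and a fully divergent wave64 index takes at most 64. The C++ below is an illustrative scalar model, not backend code; the process callback and all names are made up.

// Minimal scalar model of the waterfall loop built by emitLoadM0FromVGPRLoop:
// each iteration picks the index held by the first still-active lane
// (v_readfirstlane), runs the body for every lane holding that same index
// (v_cmp_eq + s_and_saveexec), and then removes those lanes from exec.
#include <cstdint>
#include <functional>
#include <vector>

void waterfall(const std::vector<uint32_t> &laneIdx, uint64_t execMask,
               const std::function<void(uint32_t /*uniformIdx*/,
                                        uint64_t /*activeLanes*/)> &process) {
  while (execMask) {
    // "readfirstlane": index held by the lowest active lane
    // (__builtin_ctzll is a GCC/Clang builtin).
    unsigned firstLane = __builtin_ctzll(execMask);
    uint32_t uniformIdx = laneIdx[firstLane];

    // Lanes whose index matches become active for this iteration.
    uint64_t matching = 0;
    for (unsigned l = 0; l < laneIdx.size(); ++l)
      if (((execMask >> l) & 1) && laneIdx[l] == uniformIdx)
        matching |= 1ull << l;

    process(uniformIdx, matching); // loop body runs with a uniform index
    execMask &= ~matching;         // clear handled lanes (s_xor in the real loop)
  }
}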
@@ -3396,12 +3453,12 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
// Read the next variant <- also loop target.
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
- .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
+ .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
// Compare the just read M0 value to all possible Idx values.
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
- .addReg(CurrentIdxReg)
- .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
+ .addReg(CurrentIdxReg)
+ .addReg(Idx.getReg(), 0, Idx.getSubReg());
// Update EXEC, save the original EXEC value to VCC.
BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
@@ -3412,22 +3469,14 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
MRI.setSimpleHint(NewExec, CondReg);
if (UseGPRIdxMode) {
- unsigned IdxReg;
if (Offset == 0) {
- IdxReg = CurrentIdxReg;
+ SGPRIdxReg = CurrentIdxReg;
} else {
- IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
- .addReg(CurrentIdxReg, RegState::Kill)
- .addImm(Offset);
+ SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
+ .addReg(CurrentIdxReg, RegState::Kill)
+ .addImm(Offset);
}
- unsigned IdxMode = IsIndirectSrc ?
- AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
- MachineInstr *SetOn =
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
- .addReg(IdxReg, RegState::Kill)
- .addImm(IdxMode);
- SetOn->getOperand(3).setIsUndef();
} else {
// Move index from VCC into M0
if (Offset == 0) {
@@ -3463,14 +3512,10 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
// per-workitem, so is kept alive for the whole loop so we end up not re-using a
// subregister from it, using 1 more VGPR than necessary. This was saved when
// this was expanded after register allocation.
-static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
- MachineBasicBlock &MBB,
- MachineInstr &MI,
- unsigned InitResultReg,
- unsigned PhiReg,
- int Offset,
- bool UseGPRIdxMode,
- bool IsIndirectSrc) {
+static MachineBasicBlock::iterator
+loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
+ unsigned InitResultReg, unsigned PhiReg, int Offset,
+ bool UseGPRIdxMode, Register &SGPRIdxReg) {
MachineFunction *MF = MBB.getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -3499,7 +3544,8 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
InitResultReg, DstReg, PhiReg, TmpExec,
- Offset, UseGPRIdxMode, IsIndirectSrc);
+ Offset, UseGPRIdxMode, SGPRIdxReg);
+
MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
MachineFunction::iterator MBBI(LoopBB);
++MBBI;
@@ -3530,64 +3576,45 @@ computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
return std::make_pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
}
-// Return true if the index is an SGPR and was set.
-static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
- MachineRegisterInfo &MRI,
- MachineInstr &MI,
- int Offset,
- bool UseGPRIdxMode,
- bool IsIndirectSrc) {
+static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
+ MachineRegisterInfo &MRI, MachineInstr &MI,
+ int Offset) {
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock::iterator I(&MI);
const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
- const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
assert(Idx->getReg() != AMDGPU::NoRegister);
- if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
- return false;
-
- if (UseGPRIdxMode) {
- unsigned IdxMode = IsIndirectSrc ?
- AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
- if (Offset == 0) {
- MachineInstr *SetOn =
- BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
- .add(*Idx)
- .addImm(IdxMode);
+ if (Offset == 0) {
+ BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
+ } else {
+ BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+ .add(*Idx)
+ .addImm(Offset);
+ }
+}
- SetOn->getOperand(3).setIsUndef();
- } else {
- Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
- .add(*Idx)
- .addImm(Offset);
- MachineInstr *SetOn =
- BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
- .addReg(Tmp, RegState::Kill)
- .addImm(IdxMode);
+static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
+ MachineRegisterInfo &MRI, MachineInstr &MI,
+ int Offset) {
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock::iterator I(&MI);
- SetOn->getOperand(3).setIsUndef();
- }
+ const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
- return true;
- }
+ if (Offset == 0)
+ return Idx->getReg();
- if (Offset == 0) {
- BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .add(*Idx);
- } else {
- BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+ Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
.add(*Idx)
.addImm(Offset);
- }
-
- return true;
+ return Tmp;
}
-// Control flow needs to be inserted if indexing with a VGPR.
static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
MachineBasicBlock &MBB,
const GCNSubtarget &ST) {
@@ -3597,10 +3624,12 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
MachineRegisterInfo &MRI = MF->getRegInfo();
Register Dst = MI.getOperand(0).getReg();
+ const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
+ const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
unsigned SubReg;
std::tie(SubReg, Offset)
@@ -3608,7 +3637,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
const bool UseGPRIdxMode = ST.useVGPRIndexMode();
- if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
+  // Check for an SGPR index.
+ if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
MachineBasicBlock::iterator I(&MI);
const DebugLoc &DL = MI.getDebugLoc();
@@ -3616,14 +3646,19 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
// TODO: Look at the uses to avoid the copy. This may require rescheduling
// to avoid interfering with other uses, so probably requires a new
// optimization pass.
- BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
- .addReg(SrcReg, RegState::Undef, SubReg)
- .addReg(SrcReg, RegState::Implicit)
- .addReg(AMDGPU::M0, RegState::Implicit);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
+ Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
+
+ const MCInstrDesc &GPRIDXDesc =
+ TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
+ BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
+ .addReg(SrcReg)
+ .addReg(Idx)
+ .addImm(SubReg);
} else {
+ setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
+
BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
- .addReg(SrcReg, RegState::Undef, SubReg)
+ .addReg(SrcReg, 0, SubReg)
.addReg(SrcReg, RegState::Implicit);
}
@@ -3632,6 +3667,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
return &MBB;
}
+ // Control flow needs to be inserted if indexing with a VGPR.
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock::iterator I(&MI);
@@ -3640,19 +3676,23 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
- auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
- Offset, UseGPRIdxMode, true);
+ Register SGPRIdxReg;
+ auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
+ UseGPRIdxMode, SGPRIdxReg);
+
MachineBasicBlock *LoopBB = InsPt->getParent();
if (UseGPRIdxMode) {
- BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
- .addReg(SrcReg, RegState::Undef, SubReg)
- .addReg(SrcReg, RegState::Implicit)
- .addReg(AMDGPU::M0, RegState::Implicit);
- BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
+ const MCInstrDesc &GPRIDXDesc =
+ TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
+
+ BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
+ .addReg(SrcReg)
+ .addReg(SGPRIdxReg)
+ .addImm(SubReg);
} else {
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
- .addReg(SrcReg, RegState::Undef, SubReg)
+ .addReg(SrcReg, 0, SubReg)
.addReg(SrcReg, RegState::Implicit);
}
@@ -3675,6 +3715,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
+ const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
// This can be an immediate, but will be folded later.
assert(Val->getReg());
@@ -3700,23 +3741,36 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
return &MBB;
}
- const MCInstrDesc &MovRelDesc
- = TII->getIndirectRegWritePseudo(TRI.getRegSizeInBits(*VecRC), 32, false);
-
- if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
+  // Check for an SGPR index.
+ if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
MachineBasicBlock::iterator I(&MI);
const DebugLoc &DL = MI.getDebugLoc();
- BuildMI(MBB, I, DL, MovRelDesc, Dst)
- .addReg(SrcVec->getReg())
- .add(*Val)
- .addImm(SubReg);
- if (UseGPRIdxMode)
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
+ if (UseGPRIdxMode) {
+ Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
+
+ const MCInstrDesc &GPRIDXDesc =
+ TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
+ BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
+ .addReg(SrcVec->getReg())
+ .add(*Val)
+ .addReg(Idx)
+ .addImm(SubReg);
+ } else {
+ setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
+
+ const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
+ TRI.getRegSizeInBits(*VecRC), 32, false);
+ BuildMI(MBB, I, DL, MovRelDesc, Dst)
+ .addReg(SrcVec->getReg())
+ .add(*Val)
+ .addImm(SubReg);
+ }
MI.eraseFromParent();
return &MBB;
}
+ // Control flow needs to be inserted if indexing with a VGPR.
if (Val->isReg())
MRI.clearKillFlags(Val->getReg());
@@ -3724,16 +3778,28 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
Register PhiReg = MRI.createVirtualRegister(VecRC);
- auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
- Offset, UseGPRIdxMode, false);
+ Register SGPRIdxReg;
+ auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
+ UseGPRIdxMode, SGPRIdxReg);
MachineBasicBlock *LoopBB = InsPt->getParent();
- BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
- .addReg(PhiReg)
- .add(*Val)
- .addImm(AMDGPU::sub0);
- if (UseGPRIdxMode)
- BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
+ if (UseGPRIdxMode) {
+ const MCInstrDesc &GPRIDXDesc =
+ TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
+
+ BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
+ .addReg(PhiReg)
+ .add(*Val)
+ .addReg(SGPRIdxReg)
+ .addImm(AMDGPU::sub0);
+ } else {
+ const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
+ TRI.getRegSizeInBits(*VecRC), 32, false);
+ BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
+ .addReg(PhiReg)
+ .add(*Val)
+ .addImm(AMDGPU::sub0);
+ }
MI.eraseFromParent();
return LoopBB;
@@ -3849,7 +3915,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
- unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+ unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
.addReg(CarryReg, RegState::Define)
.add(SrcReg0Sub0)
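The rename above is mechanical (V_ADD_I32/V_SUB_I32 became the carry-out forms V_ADD_CO_U32/V_SUB_CO_U32), but the expansion around it is easier to see in scalar form: the low 32-bit half defines a carry that the high half consumes (an add-with-carry, V_ADDC_U32 in the backend). A minimal C++ model with made-up helper names:

// Scalar model of the 64-bit add split in this expansion: the low half
// produces a carry-out (V_ADD_CO_U32) that the high half consumes.
#include <cstdint>

uint64_t add64ViaHalves(uint64_t a, uint64_t b) {
  uint32_t aLo = uint32_t(a), aHi = uint32_t(a >> 32);
  uint32_t bLo = uint32_t(b), bHi = uint32_t(b >> 32);

  uint32_t lo = aLo + bLo;            // V_ADD_CO_U32: sum plus carry-out
  uint32_t carry = lo < aLo ? 1 : 0;  // carry bit lands in a condition reg
  uint32_t hi = aHi + bHi + carry;    // add-with-carry for the high half

  return (uint64_t(hi) << 32) | lo;
}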
@@ -3912,10 +3978,29 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
Src2.setReg(RegOp2);
}
- if (TRI->getRegSizeInBits(*MRI.getRegClass(Src2.getReg())) == 64) {
- BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
- .addReg(Src2.getReg())
- .addImm(0);
+ const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
+ if (TRI->getRegSizeInBits(*Src2RC) == 64) {
+ if (ST.hasScalarCompareEq64()) {
+ BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
+ .addReg(Src2.getReg())
+ .addImm(0);
+ } else {
+ const TargetRegisterClass *SubRC =
+ TRI->getSubRegClass(Src2RC, AMDGPU::sub0);
+ MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
+ MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
+ MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
+ MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
+ Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
+ .add(Src2Sub0)
+ .add(Src2Sub1);
+
+ BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
+ .addReg(Src2_32, RegState::Kill)
+ .addImm(0);
+ }
} else {
BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMPK_LG_U32))
.addReg(Src2.getReg())
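The fallback added in this hunk relies on (lo | hi) != 0 being equivalent to a 64-bit compare against zero when the subtarget lacks S_CMP_LG_U64. A tiny standalone check of that identity:

// Model of the added fallback: OR the two 32-bit SGPR halves and compare
// the result against zero (S_OR_B32 + S_CMP_LG_U32 in the lowering).
#include <cassert>
#include <cstdint>

bool isNonZero64(uint32_t lo, uint32_t hi) {
  return (lo | hi) != 0;
}

int main() {
  assert(isNonZero64(0, 0) == false);
  assert(isNonZero64(1, 0) == true);
  assert(isNonZero64(0, 0x80000000u) == true);
  return 0;
}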
@@ -3936,77 +4021,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return BB;
}
- case AMDGPU::SI_INIT_EXEC:
- // This should be before all vector instructions.
- BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
- AMDGPU::EXEC)
- .addImm(MI.getOperand(0).getImm());
- MI.eraseFromParent();
- return BB;
-
- case AMDGPU::SI_INIT_EXEC_LO:
- // This should be before all vector instructions.
- BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
- AMDGPU::EXEC_LO)
- .addImm(MI.getOperand(0).getImm());
- MI.eraseFromParent();
- return BB;
-
- case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
- // Extract the thread count from an SGPR input and set EXEC accordingly.
- // Since BFM can't shift by 64, handle that case with CMP + CMOV.
- //
- // S_BFE_U32 count, input, {shift, 7}
- // S_BFM_B64 exec, count, 0
- // S_CMP_EQ_U32 count, 64
- // S_CMOV_B64 exec, -1
- MachineInstr *FirstMI = &*BB->begin();
- MachineRegisterInfo &MRI = MF->getRegInfo();
- Register InputReg = MI.getOperand(0).getReg();
- Register CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- bool Found = false;
-
- // Move the COPY of the input reg to the beginning, so that we can use it.
- for (auto I = BB->begin(); I != &MI; I++) {
- if (I->getOpcode() != TargetOpcode::COPY ||
- I->getOperand(0).getReg() != InputReg)
- continue;
-
- if (I == FirstMI) {
- FirstMI = &*++BB->begin();
- } else {
- I->removeFromParent();
- BB->insert(FirstMI, &*I);
- }
- Found = true;
- break;
- }
- assert(Found);
- (void)Found;
-
- // This should be before all vector instructions.
- unsigned Mask = (getSubtarget()->getWavefrontSize() << 1) - 1;
- bool isWave32 = getSubtarget()->isWave32();
- unsigned Exec = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
- .addReg(InputReg)
- .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
- BuildMI(*BB, FirstMI, DebugLoc(),
- TII->get(isWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64),
- Exec)
- .addReg(CountReg)
- .addImm(0);
- BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
- .addReg(CountReg, RegState::Kill)
- .addImm(getSubtarget()->getWavefrontSize());
- BuildMI(*BB, FirstMI, DebugLoc(),
- TII->get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
- Exec)
- .addImm(-1);
- MI.eraseFromParent();
- return BB;
- }
-
case AMDGPU::GET_GROUPSTATICSIZE: {
assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
@@ -4086,13 +4100,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::ADJCALLSTACKDOWN: {
const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
MachineInstrBuilder MIB(*MF, &MI);
-
- // Add an implicit use of the frame offset reg to prevent the restore copy
- // inserted after the call from being reorderd after stack operations in the
- // the caller's frame.
MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
- .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
- .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
+ .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
return BB;
}
case AMDGPU::SI_CALL_ISEL: {
@@ -4111,9 +4120,9 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return BB;
}
- case AMDGPU::V_ADD_I32_e32:
- case AMDGPU::V_SUB_I32_e32:
- case AMDGPU::V_SUBREV_I32_e32: {
+ case AMDGPU::V_ADD_CO_U32_e32:
+ case AMDGPU::V_SUB_CO_U32_e32:
+ case AMDGPU::V_SUBREV_CO_U32_e32: {
// TODO: Define distinct V_*_I32_Pseudo instructions instead.
const DebugLoc &DL = MI.getDebugLoc();
unsigned Opc = MI.getOpcode();
@@ -4154,9 +4163,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
return emitGWSMemViolTestLoop(MI, BB);
case AMDGPU::S_SETREG_B32: {
- if (!getSubtarget()->hasDenormModeInst())
- return BB;
-
// Try to optimize cases that only set the denormal mode or rounding mode.
//
// If the s_setreg_b32 fully sets all of the bits in the rounding mode or
@@ -4166,9 +4172,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
// FIXME: This could be predicates on the immediate, but tablegen doesn't
// allow you to have a no side effect instruction in the output of a
// sideeffecting pattern.
-
- // TODO: Should also emit a no side effects pseudo if only FP bits are
- // touched, even if not all of them or to a variable.
unsigned ID, Offset, Width;
AMDGPU::Hwreg::decodeHwreg(MI.getOperand(1).getImm(), ID, Offset, Width);
if (ID != AMDGPU::Hwreg::ID_MODE)
@@ -4176,50 +4179,54 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
const unsigned SetMask = WidthMask << Offset;
- unsigned SetDenormOp = 0;
- unsigned SetRoundOp = 0;
-
- // The dedicated instructions can only set the whole denorm or round mode at
- // once, not a subset of bits in either.
- if (Width == 8 && (SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
- AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask) {
- // If this fully sets both the round and denorm mode, emit the two
- // dedicated instructions for these.
- assert(Offset == 0);
- SetRoundOp = AMDGPU::S_ROUND_MODE;
- SetDenormOp = AMDGPU::S_DENORM_MODE;
- } else if (Width == 4) {
- if ((SetMask & AMDGPU::Hwreg::FP_ROUND_MASK) == SetMask) {
+
+ if (getSubtarget()->hasDenormModeInst()) {
+ unsigned SetDenormOp = 0;
+ unsigned SetRoundOp = 0;
+
+ // The dedicated instructions can only set the whole denorm or round mode
+ // at once, not a subset of bits in either.
+ if (SetMask ==
+ (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
+ // If this fully sets both the round and denorm mode, emit the two
+ // dedicated instructions for these.
SetRoundOp = AMDGPU::S_ROUND_MODE;
- assert(Offset == 0);
- } else if ((SetMask & AMDGPU::Hwreg::FP_DENORM_MASK) == SetMask) {
SetDenormOp = AMDGPU::S_DENORM_MODE;
- assert(Offset == 4);
+ } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
+ SetRoundOp = AMDGPU::S_ROUND_MODE;
+ } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
+ SetDenormOp = AMDGPU::S_DENORM_MODE;
}
- }
- if (SetRoundOp || SetDenormOp) {
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
- if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
- unsigned ImmVal = Def->getOperand(1).getImm();
- if (SetRoundOp) {
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
- .addImm(ImmVal & 0xf);
+ if (SetRoundOp || SetDenormOp) {
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
+ if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
+ unsigned ImmVal = Def->getOperand(1).getImm();
+ if (SetRoundOp) {
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
+ .addImm(ImmVal & 0xf);
+
+ // If we also have the denorm mode, get just the denorm mode bits.
+ ImmVal >>= 4;
+ }
- // If we also have the denorm mode, get just the denorm mode bits.
- ImmVal >>= 4;
- }
+ if (SetDenormOp) {
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
+ .addImm(ImmVal & 0xf);
+ }
- if (SetDenormOp) {
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
- .addImm(ImmVal & 0xf);
+ MI.eraseFromParent();
+ return BB;
}
-
- MI.eraseFromParent();
}
}
+    // If only FP bits are touched, use the no side effects pseudo.
+ if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
+ AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
+ MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
+
return BB;
}
default:
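The rewritten S_SETREG_B32 handling above hinges on SetMask = maskTrailingOnes(Width) << Offset matching the FP rounding field (bits 3:0 of the MODE register) and/or the denorm field (bits 7:4), the layout the removed asserts relied on. A standalone sketch of that mask arithmetic; the constants and names are illustrative, not the backend's enums:

// Model of the SetMask check: a hwreg(HW_REG_MODE, Offset, Width) write
// touches maskTrailingOnes(Width) << Offset, and the lowering compares that
// against the round/denorm field masks to pick S_ROUND_MODE, S_DENORM_MODE,
// or the side-effect-free pseudo.
#include <cstdint>
#include <cstdio>

constexpr uint32_t FpRoundMask  = 0xfu;        // bits 3:0 (assumed layout)
constexpr uint32_t FpDenormMask = 0xfu << 4;   // bits 7:4 (assumed layout)

uint32_t setMask(unsigned Offset, unsigned Width) {
  uint32_t widthMask = (Width >= 32) ? ~0u : ((1u << Width) - 1);
  return widthMask << Offset;
}

int main() {
  printf("round only:  %#x\n", setMask(0, 4)); // == FpRoundMask
  printf("denorm only: %#x\n", setMask(4, 4)); // == FpDenormMask
  printf("both fields: %#x\n", setMask(0, 8)); // == FpRoundMask | FpDenormMask
  return 0;
}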
@@ -4256,6 +4263,12 @@ MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
}
+LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
+ return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
+ ? Ty.changeElementSize(16)
+ : Ty.changeElementSize(32);
+}
+
// Answering this is somewhat tricky and depends on the specific device which
// have different rates for fma or all f64 operations.
//
@@ -4457,6 +4470,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMUL:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
+ case ISD::UADDSAT:
+ case ISD::USUBSAT:
+ case ISD::SADDSAT:
+ case ISD::SSUBSAT:
return splitBinaryVectorOp(Op, DAG);
case ISD::SMULO:
case ISD::UMULO:
@@ -4467,31 +4484,47 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
+// Used for D16: Casts the result of an instruction into the right vector and
+// packs values if loads return unpacked values.
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
const SDLoc &DL,
SelectionDAG &DAG, bool Unpacked) {
if (!LoadVT.isVector())
return Result;
+ // Cast back to the original packed type or to a larger type that is a
+  // multiple of 32 bits for D16. Widening the return type is required for
+  // legalization.
+ EVT FittingLoadVT = LoadVT;
+ if ((LoadVT.getVectorNumElements() % 2) == 1) {
+ FittingLoadVT =
+ EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
+ LoadVT.getVectorNumElements() + 1);
+ }
+
if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
// Truncate to v2i16/v4i16.
- EVT IntLoadVT = LoadVT.changeTypeToInteger();
+ EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
// Workaround legalizer not scalarizing truncate after vector op
- // legalization byt not creating intermediate vector trunc.
+ // legalization but not creating intermediate vector trunc.
SmallVector<SDValue, 4> Elts;
DAG.ExtractVectorElements(Result, Elts);
for (SDValue &Elt : Elts)
Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
+    // Pad illegal v1i16/v3f16 to v4i16
+ if ((LoadVT.getVectorNumElements() % 2) == 1)
+ Elts.push_back(DAG.getUNDEF(MVT::i16));
+
Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
// Bitcast to original type (v2f16/v4f16).
- return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
+ return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
}
// Cast back to the original packed type.
- return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
+ return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
}
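A scalar model of what adjustLoadValueTypeImpl now does for unpacked D16 loads: each returned dword carries one 16-bit element, so the values are truncated, odd element counts (v1/v3) are padded to the next even width, and the result is viewed as packed 16-bit pairs. Standalone illustration with integer types; the pad value stands in for undef:

// Model of the unpacked-D16 repacking: truncate each dword to 16 bits, pad
// odd element counts to a legal even width, and treat the result as packed
// 16-bit pairs (v2f16/v4f16 registers in the real lowering).
#include <cstdint>
#include <vector>

std::vector<uint16_t> repackD16(const std::vector<uint32_t> &unpacked) {
  std::vector<uint16_t> packed;
  for (uint32_t d : unpacked)
    packed.push_back(uint16_t(d));   // truncate each dword to 16 bits
  if (packed.size() % 2 == 1)
    packed.push_back(0);             // pad v1/v3 (0 models the undef element)
  return packed;
}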
SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
@@ -4505,10 +4538,16 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
EVT LoadVT = M->getValueType(0);
EVT EquivLoadVT = LoadVT;
- if (Unpacked && LoadVT.isVector()) {
- EquivLoadVT = LoadVT.isVector() ?
- EVT::getVectorVT(*DAG.getContext(), MVT::i32,
- LoadVT.getVectorNumElements()) : LoadVT;
+ if (LoadVT.isVector()) {
+ if (Unpacked) {
+ EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ LoadVT.getVectorNumElements());
+ } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
+ // Widen v3f16 to legal type
+ EquivLoadVT =
+ EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
+ LoadVT.getVectorNumElements() + 1);
+ }
}
// Change from v4f16/v2f16 to EquivLoadVT.
@@ -4519,8 +4558,6 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
VTList, Ops, M->getMemoryVT(),
M->getMemOperand());
- if (!Unpacked) // Just adjusted the opcode.
- return Load;
SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
@@ -4724,8 +4761,9 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
if (Res.getOpcode() == ISD::MERGE_VALUES) {
// FIXME: Hacky
- Results.push_back(Res.getOperand(0));
- Results.push_back(Res.getOperand(1));
+ for (unsigned I = 0; I < Res.getNumOperands(); I++) {
+ Results.push_back(Res.getOperand(I));
+ }
} else {
Results.push_back(Res);
Results.push_back(Res.getValue(1));
@@ -4967,7 +5005,7 @@ SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
// Get the return address reg and mark it as an implicit live-in
- unsigned Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
+ Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
}
@@ -5063,7 +5101,7 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- unsigned UserSGPR = Info->getQueuePtrUserSGPR();
+ Register UserSGPR = Info->getQueuePtrUserSGPR();
assert(UserSGPR != AMDGPU::NoRegister);
SDValue QueuePtr = CreateLiveInRegister(
DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
@@ -5136,14 +5174,15 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
// private_segment_aperture_base_hi.
uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
- SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
+ SDValue Ptr =
+ DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::Fixed(StructOffset));
// TODO: Use custom target PseudoSourceValue.
// TODO: We should use the value from the IR intrinsic call, but it might not
// be available and how do we get it?
MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
- MinAlign(64, StructOffset),
+ commonAlignment(Align(64), StructOffset),
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
}
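MinAlign and commonAlignment agree here: both return the largest power of two dividing a 64-byte-aligned base plus StructOffset, which is why the PRIVATE aperture load (offset 0x44) ends up 4-byte aligned while the LOCAL one (offset 0x40) keeps 64. A quick standalone model of that computation, assuming the usual lowest-set-bit definition:

// Model of the alignment computed for the aperture load above: the guaranteed
// alignment of (queue_ptr + StructOffset) for a 64-byte-aligned queue pointer.
#include <cassert>
#include <cstdint>

uint64_t guaranteedAlign(uint64_t baseAlign, uint64_t offset) {
  if (offset == 0)
    return baseAlign;
  uint64_t offsetAlign = offset & -offset;  // lowest set bit of the offset
  return offsetAlign < baseAlign ? offsetAlign : baseAlign;
}

int main() {
  assert(guaranteedAlign(64, 0x40) == 64);  // LOCAL aperture at offset 0x40
  assert(guaranteedAlign(64, 0x44) == 4);   // PRIVATE aperture at offset 0x44
  return 0;
}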
@@ -5504,7 +5543,9 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
// variable, but since the encoding of $symbol starts 4 bytes after the start
// of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
// small. This requires us to add 4 to the global variable offset in order to
- // compute the correct address.
+ // compute the correct address. Similarly for the s_addc_u32 instruction, the
+ // encoding of $symbol starts 12 bytes after the start of the s_add_u32
+ // instruction.
SDValue PtrLo =
DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags);
SDValue PtrHi;
@@ -5512,7 +5553,7 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
} else {
PtrHi =
- DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags + 1);
+ DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 12, GAFlags + 1);
}
return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
}
@@ -5521,15 +5562,32 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
+ SDLoc DL(GSD);
+ EVT PtrVT = Op.getValueType();
+
const GlobalValue *GV = GSD->getGlobal();
if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
shouldUseLDSConstAddress(GV)) ||
GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
- GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+ if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+ GV->hasExternalLinkage()) {
+ Type *Ty = GV->getValueType();
+ // HIP uses an unsized array `extern __shared__ T s[]` or similar
+ // zero-sized type in other languages to declare the dynamic shared
+      // memory whose size is not known at compile time. They will be
+      // allocated by the runtime and placed directly after the statically
+      // allocated ones. They all share the same offset.
+ if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
+ assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
+ // Adjust alignment for that dynamic shared memory array.
+ MFI->setDynLDSAlign(DAG.getDataLayout(), *cast<GlobalVariable>(GV));
+ return SDValue(
+ DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
+ }
+ }
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
-
- SDLoc DL(GSD);
- EVT PtrVT = Op.getValueType();
+ }
if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
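For reference, a HIP-style example of the dynamic-LDS pattern handled by the branch added above; the kernel is illustrative only and not part of the patch. The unsized extern __shared__ array gets no static allocation, so its address lowers to GET_GROUPSTATICSIZE, i.e. the end of the kernel's static LDS block:

// Illustrative HIP kernel (assumed example, compiled with hipcc): dynLds is
// the zero-sized external LDS global whose address the new code resolves to
// the total static LDS size of the kernel.
#include <hip/hip_runtime.h>

__global__ void copyScaled(float *out, const float *in, float f) {
  __shared__ float staticLds[64];    // counted in GET_GROUPSTATICSIZE
  extern __shared__ float dynLds[];  // dynamic LDS, sized at launch time

  int i = threadIdx.x;
  staticLds[i % 64] = in[i];
  dynLds[i] = staticLds[i % 64] * f;
  out[i] = dynLds[i];
}
// The third launch-config argument supplies the dynamic LDS size, e.g.
//   copyScaled<<<1, 64, 64 * sizeof(float)>>>(out, in, 2.0f);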
@@ -5713,7 +5771,7 @@ static SDValue constructRetValue(SelectionDAG &DAG,
SDValue Data(Result, 0);
SDValue TexFail;
- if (IsTexFail) {
+ if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
if (MaskPopVT.isVector()) {
Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
@@ -5722,10 +5780,6 @@ static SDValue constructRetValue(SelectionDAG &DAG,
Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
SDValue(Result, 0), ZeroIdx);
}
-
- TexFail = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
- SDValue(Result, 0),
- DAG.getConstant(MaskPopDwords, DL, MVT::i32));
}
if (DataDwordVT.isVector())
@@ -5735,13 +5789,27 @@ static SDValue constructRetValue(SelectionDAG &DAG,
if (IsD16)
Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
- if (!ReqRetVT.isVector())
+ EVT LegalReqRetVT = ReqRetVT;
+ if (!ReqRetVT.isVector()) {
Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
+ } else {
+ // We need to widen the return vector to a legal type
+ if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
+ ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
+ LegalReqRetVT =
+ EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
+ ReqRetVT.getVectorNumElements() + 1);
+ }
+ }
+ Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
- Data = DAG.getNode(ISD::BITCAST, DL, ReqRetVT, Data);
+ if (IsTexFail) {
+ TexFail =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
+ DAG.getConstant(MaskPopDwords, DL, MVT::i32));
- if (TexFail)
return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
+ }
if (Result->getNumValues() == 1)
return Data;
@@ -5798,7 +5866,7 @@ static void packImageA16AddressToDwords(SelectionDAG &DAG, SDValue Op,
SDValue SITargetLowering::lowerImage(SDValue Op,
const AMDGPU::ImageDimIntrinsicInfo *Intr,
- SelectionDAG &DAG) const {
+ SelectionDAG &DAG, bool WithChain) const {
SDLoc DL(Op);
MachineFunction &MF = DAG.getMachineFunction();
const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
@@ -5810,10 +5878,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
unsigned IntrOpcode = Intr->BaseOpcode;
- bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10;
+ bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
- SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
- SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
+ SmallVector<EVT, 3> ResultTypes(Op->values());
+ SmallVector<EVT, 3> OrigResultTypes(Op->values());
bool IsD16 = false;
bool IsG16 = false;
bool IsA16 = false;
@@ -5821,7 +5889,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
int NumVDataDwords;
bool AdjustRetType = false;
- unsigned AddrIdx; // Index of first address argument
+ // Offset of intrinsic arguments
+ const unsigned ArgOffset = WithChain ? 2 : 1;
+
unsigned DMask;
unsigned DMaskLanes = 0;
@@ -5839,15 +5909,13 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
DMask = Is64Bit ? 0xf : 0x3;
NumVDataDwords = Is64Bit ? 4 : 2;
- AddrIdx = 4;
} else {
DMask = Is64Bit ? 0x3 : 0x1;
NumVDataDwords = Is64Bit ? 2 : 1;
- AddrIdx = 3;
}
} else {
- unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1;
- auto DMaskConst = cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
+ auto *DMaskConst =
+ cast<ConstantSDNode>(Op.getOperand(ArgOffset + Intr->DMaskIndex));
DMask = DMaskConst->getZExtValue();
DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
@@ -5860,7 +5928,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
return Op; // D16 is unsupported for this instruction
IsD16 = true;
- VData = handleD16VData(VData, DAG);
+ VData = handleD16VData(VData, DAG, true);
}
NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
@@ -5880,63 +5948,56 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
(!LoadVT.isVector() && DMaskLanes > 1))
return Op;
- if (IsD16 && !Subtarget->hasUnpackedD16VMem())
+    // The sq block of gfx8 and gfx9 does not estimate register use correctly
+ // for d16 image_gather4, image_gather4_l, and image_gather4_lz
+ // instructions.
+ if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
+ !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
NumVDataDwords = (DMaskLanes + 1) / 2;
else
NumVDataDwords = DMaskLanes;
AdjustRetType = true;
}
-
- AddrIdx = DMaskIdx + 1;
}
- unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
- unsigned NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
- unsigned NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
- unsigned NumVAddrs = BaseOpcode->NumExtraArgs + NumGradients +
- NumCoords + NumLCM;
- unsigned NumMIVAddrs = NumVAddrs;
-
+ unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
SmallVector<SDValue, 4> VAddrs;
// Optimize _L to _LZ when _L is zero
if (LZMappingInfo) {
- if (auto ConstantLod =
- dyn_cast<ConstantFPSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
+ if (auto *ConstantLod = dyn_cast<ConstantFPSDNode>(
+ Op.getOperand(ArgOffset + Intr->LodIndex))) {
if (ConstantLod->isZero() || ConstantLod->isNegative()) {
IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
- NumMIVAddrs--; // remove 'lod'
+ VAddrEnd--; // remove 'lod'
}
}
}
// Optimize _mip away, when 'lod' is zero
if (MIPMappingInfo) {
- if (auto ConstantLod =
- dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
+ if (auto *ConstantLod = dyn_cast<ConstantSDNode>(
+ Op.getOperand(ArgOffset + Intr->MipIndex))) {
if (ConstantLod->isNullValue()) {
IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
- NumMIVAddrs--; // remove 'lod'
+ VAddrEnd--; // remove 'mip'
}
}
}
// Push back extra arguments.
- for (unsigned I = 0; I < BaseOpcode->NumExtraArgs; I++)
- VAddrs.push_back(Op.getOperand(AddrIdx + I));
+ for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++)
+ VAddrs.push_back(Op.getOperand(ArgOffset + I));
// Check for 16 bit addresses or derivatives and pack if true.
- unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
- unsigned CoordIdx = DimIdx + NumGradients;
- unsigned CoordsEnd = AddrIdx + NumMIVAddrs;
-
- MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
+ MVT VAddrVT =
+ Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
MVT VAddrScalarVT = VAddrVT.getScalarType();
MVT PackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
- VAddrVT = Op.getOperand(CoordIdx).getSimpleValueType();
+ VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
VAddrScalarVT = VAddrVT.getScalarType();
IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
if (IsA16 || IsG16) {
@@ -5971,17 +6032,18 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
// Don't compress addresses for G16
- const int PackEndIdx = IsA16 ? CoordsEnd : CoordIdx;
- packImageA16AddressToDwords(DAG, Op, PackVectorVT, VAddrs, DimIdx,
- PackEndIdx, NumGradients);
+ const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
+ packImageA16AddressToDwords(DAG, Op, PackVectorVT, VAddrs,
+ ArgOffset + Intr->GradientStart, PackEndIdx,
+ Intr->NumGradients);
if (!IsA16) {
// Add uncompressed address
- for (unsigned I = CoordIdx; I < CoordsEnd; I++)
+ for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
VAddrs.push_back(Op.getOperand(I));
}
} else {
- for (unsigned I = DimIdx; I < CoordsEnd; I++)
+ for (unsigned I = ArgOffset + Intr->GradientStart; I < VAddrEnd; I++)
VAddrs.push_back(Op.getOperand(I));
}
@@ -6004,22 +6066,19 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
- unsigned CtrlIdx; // Index of texfailctrl argument
SDValue Unorm;
if (!BaseOpcode->Sampler) {
Unorm = True;
- CtrlIdx = AddrIdx + NumVAddrs + 1;
} else {
auto UnormConst =
- cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
+ cast<ConstantSDNode>(Op.getOperand(ArgOffset + Intr->UnormIndex));
Unorm = UnormConst->getZExtValue() ? True : False;
- CtrlIdx = AddrIdx + NumVAddrs + 3;
}
SDValue TFE;
SDValue LWE;
- SDValue TexFail = Op.getOperand(CtrlIdx);
+ SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
bool IsTexFail = false;
if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
return Op;
@@ -6066,42 +6125,40 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
SDValue DLC;
if (BaseOpcode->Atomic) {
GLC = True; // TODO no-return optimization
- if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC,
- IsGFX10 ? &DLC : nullptr))
+ if (!parseCachePolicy(Op.getOperand(ArgOffset + Intr->CachePolicyIndex),
+ DAG, nullptr, &SLC, IsGFX10Plus ? &DLC : nullptr))
return Op;
} else {
- if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC,
- IsGFX10 ? &DLC : nullptr))
+ if (!parseCachePolicy(Op.getOperand(ArgOffset + Intr->CachePolicyIndex),
+ DAG, &GLC, &SLC, IsGFX10Plus ? &DLC : nullptr))
return Op;
}
SmallVector<SDValue, 26> Ops;
if (BaseOpcode->Store || BaseOpcode->Atomic)
Ops.push_back(VData); // vdata
- if (UseNSA) {
- for (const SDValue &Addr : VAddrs)
- Ops.push_back(Addr);
- } else {
+ if (UseNSA)
+ append_range(Ops, VAddrs);
+ else
Ops.push_back(VAddr);
- }
- Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
+ Ops.push_back(Op.getOperand(ArgOffset + Intr->RsrcIndex));
if (BaseOpcode->Sampler)
- Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
+ Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex));
Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
- if (IsGFX10)
+ if (IsGFX10Plus)
Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
Ops.push_back(Unorm);
- if (IsGFX10)
+ if (IsGFX10Plus)
Ops.push_back(DLC);
Ops.push_back(GLC);
Ops.push_back(SLC);
Ops.push_back(IsA16 && // r128, a16 for gfx9
ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
- if (IsGFX10)
+ if (IsGFX10Plus)
Ops.push_back(IsA16 ? True : False);
Ops.push_back(TFE);
Ops.push_back(LWE);
- if (!IsGFX10)
+ if (!IsGFX10Plus)
Ops.push_back(DimInfo->DA ? True : False);
if (BaseOpcode->HasD16)
Ops.push_back(IsD16 ? True : False);
@@ -6112,7 +6169,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
int Opcode = -1;
- if (IsGFX10) {
+ if (IsGFX10Plus) {
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
UseNSA ? AMDGPU::MIMGEncGfx10NSA
: AMDGPU::MIMGEncGfx10Default,
@@ -6391,11 +6448,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
SDLoc(Op), MVT::i32);
case Intrinsic::amdgcn_s_buffer_load: {
- bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10;
+ bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
SDValue GLC;
SDValue DLC = DAG.getTargetConstant(0, DL, MVT::i1);
if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr,
- IsGFX10 ? &DLC : nullptr))
+ IsGFX10Plus ? &DLC : nullptr))
return Op;
return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
DAG);
@@ -6417,11 +6474,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
return SDValue();
- DiagnosticInfoUnsupported BadIntrin(
- MF.getFunction(), "intrinsic not supported on subtarget",
- DL.getDebugLoc());
- DAG.getContext()->diagnose(BadIntrin);
- return DAG.getUNDEF(VT);
+ return emitRemovedIntrinsicError(DAG, DL, VT);
}
case Intrinsic::amdgcn_ldexp:
return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
@@ -6567,7 +6620,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
- return lowerImage(Op, ImageDimIntr, DAG);
+ return lowerImage(Op, ImageDimIntr, DAG, false);
return Op;
}
@@ -6597,26 +6650,59 @@ static unsigned getBufferOffsetForMMO(SDValue VOffset,
cast<ConstantSDNode>(Offset)->getSExtValue();
}
-static unsigned getDSShaderTypeValue(const MachineFunction &MF) {
- switch (MF.getFunction().getCallingConv()) {
- case CallingConv::AMDGPU_PS:
- return 1;
- case CallingConv::AMDGPU_VS:
- return 2;
- case CallingConv::AMDGPU_GS:
- return 3;
- case CallingConv::AMDGPU_HS:
- case CallingConv::AMDGPU_LS:
- case CallingConv::AMDGPU_ES:
- report_fatal_error("ds_ordered_count unsupported for this calling conv");
- case CallingConv::AMDGPU_CS:
- case CallingConv::AMDGPU_KERNEL:
- case CallingConv::C:
- case CallingConv::Fast:
- default:
- // Assume other calling conventions are various compute callable functions
- return 0;
- }
+SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
+ SelectionDAG &DAG,
+ unsigned NewOpcode) const {
+ SDLoc DL(Op);
+
+ SDValue VData = Op.getOperand(2);
+ auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ SDValue Ops[] = {
+ Op.getOperand(0), // Chain
+ VData, // vdata
+ Op.getOperand(3), // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(5), // soffset
+ Offsets.second, // offset
+ Op.getOperand(6), // cachepolicy
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
+ };
+
+ auto *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));
+
+ EVT MemVT = VData.getValueType();
+ return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
+ M->getMemOperand());
+}
+
+SDValue
+SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
+ unsigned NewOpcode) const {
+ SDLoc DL(Op);
+
+ SDValue VData = Op.getOperand(2);
+ auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+ SDValue Ops[] = {
+ Op.getOperand(0), // Chain
+ VData, // vdata
+ Op.getOperand(3), // rsrc
+ Op.getOperand(4), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(6), // soffset
+ Offsets.second, // offset
+ Op.getOperand(7), // cachepolicy
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
+ };
+
+ auto *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
+ Ops[3]));
+
+ EVT MemVT = VData.getValueType();
+ return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
+ M->getMemOperand());
}
SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
@@ -6656,7 +6742,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
report_fatal_error("ds_ordered_count: wave_done requires wave_release");
unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
- unsigned ShaderType = getDSShaderTypeValue(DAG.getMachineFunction());
+ unsigned ShaderType =
+ SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
unsigned Offset0 = OrderedCountIndex << 2;
unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
(Instruction << 4);
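The offset0/offset1 fields built in the context above encode the ds_ordered_count configuration as small bitfields. A standalone sketch of that packing, copied from the visible expressions; the struct and function names are mine:

// Model of the ds_ordered_count offset encoding shown above.
#include <cstdint>

struct DsOrderedOffsets {
  unsigned Offset0;
  unsigned Offset1;
};

DsOrderedOffsets encodeDsOrdered(unsigned OrderedCountIndex, bool WaveRelease,
                                 bool WaveDone, unsigned ShaderType,
                                 unsigned Instruction /*0 = add, 1 = swap*/) {
  DsOrderedOffsets R;
  R.Offset0 = OrderedCountIndex << 2;
  R.Offset1 = (WaveRelease ? 1u : 0u) | (unsigned(WaveDone) << 1) |
              (ShaderType << 2) | (Instruction << 4);
  return R;
}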
@@ -6893,7 +6980,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_buffer_atomic_umax:
case Intrinsic::amdgcn_buffer_atomic_and:
case Intrinsic::amdgcn_buffer_atomic_or:
- case Intrinsic::amdgcn_buffer_atomic_xor: {
+ case Intrinsic::amdgcn_buffer_atomic_xor:
+ case Intrinsic::amdgcn_buffer_atomic_fadd: {
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
unsigned IdxEn = 1;
if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
@@ -6953,6 +7041,17 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_buffer_atomic_xor:
Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
break;
+ case Intrinsic::amdgcn_buffer_atomic_fadd:
+ if (!Op.getValue(0).use_empty()) {
+ DiagnosticInfoUnsupported
+ NoFpRet(DAG.getMachineFunction().getFunction(),
+ "return versions of fp atomics not supported",
+ DL.getDebugLoc(), DS_Error);
+ DAG.getContext()->diagnose(NoFpRet);
+ return SDValue();
+ }
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD;
+ break;
default:
llvm_unreachable("unhandled atomic opcode");
}
@@ -6960,155 +7059,64 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
M->getMemOperand());
}
+ case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
+ case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
+ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
case Intrinsic::amdgcn_raw_buffer_atomic_swap:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
case Intrinsic::amdgcn_raw_buffer_atomic_add:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
case Intrinsic::amdgcn_raw_buffer_atomic_and:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
case Intrinsic::amdgcn_raw_buffer_atomic_or:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
case Intrinsic::amdgcn_raw_buffer_atomic_inc:
- case Intrinsic::amdgcn_raw_buffer_atomic_dec: {
- auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
- SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // vdata
- Op.getOperand(3), // rsrc
- DAG.getConstant(0, DL, MVT::i32), // vindex
- Offsets.first, // voffset
- Op.getOperand(5), // soffset
- Offsets.second, // offset
- Op.getOperand(6), // cachepolicy
- DAG.getTargetConstant(0, DL, MVT::i1), // idxen
- };
- EVT VT = Op.getValueType();
-
- auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));
- unsigned Opcode = 0;
-
- switch (IntrID) {
- case Intrinsic::amdgcn_raw_buffer_atomic_swap:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
- break;
- case Intrinsic::amdgcn_raw_buffer_atomic_add:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
- break;
- case Intrinsic::amdgcn_raw_buffer_atomic_sub:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
- break;
- case Intrinsic::amdgcn_raw_buffer_atomic_smin:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
- break;
- case Intrinsic::amdgcn_raw_buffer_atomic_umin:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
- break;
- case Intrinsic::amdgcn_raw_buffer_atomic_smax:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
- break;
- case Intrinsic::amdgcn_raw_buffer_atomic_umax:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
- break;
- case Intrinsic::amdgcn_raw_buffer_atomic_and:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
- break;
- case Intrinsic::amdgcn_raw_buffer_atomic_or:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
- break;
- case Intrinsic::amdgcn_raw_buffer_atomic_xor:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
- break;
- case Intrinsic::amdgcn_raw_buffer_atomic_inc:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_INC;
- break;
- case Intrinsic::amdgcn_raw_buffer_atomic_dec:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC;
- break;
- default:
- llvm_unreachable("unhandled atomic opcode");
- }
-
- return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
- M->getMemOperand());
- }
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
+ case Intrinsic::amdgcn_raw_buffer_atomic_dec:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
case Intrinsic::amdgcn_struct_buffer_atomic_swap:
+ return lowerStructBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_SWAP);
case Intrinsic::amdgcn_struct_buffer_atomic_add:
+ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+ return lowerStructBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_SMIN);
case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+ return lowerStructBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_UMIN);
case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+ return lowerStructBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_SMAX);
case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+ return lowerStructBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_UMAX);
case Intrinsic::amdgcn_struct_buffer_atomic_and:
+ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
case Intrinsic::amdgcn_struct_buffer_atomic_or:
+ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
case Intrinsic::amdgcn_struct_buffer_atomic_inc:
- case Intrinsic::amdgcn_struct_buffer_atomic_dec: {
- auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
- SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // vdata
- Op.getOperand(3), // rsrc
- Op.getOperand(4), // vindex
- Offsets.first, // voffset
- Op.getOperand(6), // soffset
- Offsets.second, // offset
- Op.getOperand(7), // cachepolicy
- DAG.getTargetConstant(1, DL, MVT::i1), // idxen
- };
- EVT VT = Op.getValueType();
-
- auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
- Ops[3]));
- unsigned Opcode = 0;
-
- switch (IntrID) {
- case Intrinsic::amdgcn_struct_buffer_atomic_swap:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
- break;
- case Intrinsic::amdgcn_struct_buffer_atomic_add:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
- break;
- case Intrinsic::amdgcn_struct_buffer_atomic_sub:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
- break;
- case Intrinsic::amdgcn_struct_buffer_atomic_smin:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
- break;
- case Intrinsic::amdgcn_struct_buffer_atomic_umin:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
- break;
- case Intrinsic::amdgcn_struct_buffer_atomic_smax:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
- break;
- case Intrinsic::amdgcn_struct_buffer_atomic_umax:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
- break;
- case Intrinsic::amdgcn_struct_buffer_atomic_and:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
- break;
- case Intrinsic::amdgcn_struct_buffer_atomic_or:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
- break;
- case Intrinsic::amdgcn_struct_buffer_atomic_xor:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
- break;
- case Intrinsic::amdgcn_struct_buffer_atomic_inc:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_INC;
- break;
- case Intrinsic::amdgcn_struct_buffer_atomic_dec:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC;
- break;
- default:
- llvm_unreachable("unhandled atomic opcode");
- }
+ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
+ case Intrinsic::amdgcn_struct_buffer_atomic_dec:
+ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
- return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
- M->getMemOperand());
- }
case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
unsigned IdxEn = 1;
@@ -7180,7 +7188,15 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
}
- case Intrinsic::amdgcn_global_atomic_csub: {
+ case Intrinsic::amdgcn_global_atomic_fadd: {
+ if (!Op.getValue(0).use_empty()) {
+ DiagnosticInfoUnsupported
+ NoFpRet(DAG.getMachineFunction().getFunction(),
+ "return versions of fp atomics not supported",
+ DL.getDebugLoc(), DS_Error);
+ DAG.getContext()->diagnose(NoFpRet);
+ return SDValue();
+ }
MemSDNode *M = cast<MemSDNode>(Op);
SDValue Ops[] = {
M->getOperand(0), // Chain
@@ -7188,15 +7204,85 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
M->getOperand(3) // Value
};
- return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_LOAD_CSUB, SDLoc(Op),
- M->getVTList(), Ops, M->getMemoryVT(),
- M->getMemOperand());
+ EVT VT = Op.getOperand(3).getValueType();
+ return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
+ DAG.getVTList(VT, MVT::Other), Ops,
+ M->getMemOperand());
}
+ case Intrinsic::amdgcn_image_bvh_intersect_ray: {
+ SDLoc DL(Op);
+ MemSDNode *M = cast<MemSDNode>(Op);
+ SDValue NodePtr = M->getOperand(2);
+ SDValue RayExtent = M->getOperand(3);
+ SDValue RayOrigin = M->getOperand(4);
+ SDValue RayDir = M->getOperand(5);
+ SDValue RayInvDir = M->getOperand(6);
+ SDValue TDescr = M->getOperand(7);
+
+ assert(NodePtr.getValueType() == MVT::i32 ||
+ NodePtr.getValueType() == MVT::i64);
+ assert(RayDir.getValueType() == MVT::v4f16 ||
+ RayDir.getValueType() == MVT::v4f32);
+
+ bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
+ bool Is64 = NodePtr.getValueType() == MVT::i64;
+ unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
+ : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
+ : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
+ : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
+
+ SmallVector<SDValue, 16> Ops;
+
+ auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {
+ SmallVector<SDValue, 3> Lanes;
+ DAG.ExtractVectorElements(Op, Lanes, 0, 3);
+ if (Lanes[0].getValueSizeInBits() == 32) {
+ for (unsigned I = 0; I < 3; ++I)
+ Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
+ } else {
+ if (IsAligned) {
+ Ops.push_back(
+ DAG.getBitcast(MVT::i32,
+ DAG.getBuildVector(MVT::v2f16, DL,
+ { Lanes[0], Lanes[1] })));
+ Ops.push_back(Lanes[2]);
+ } else {
+ SDValue Elt0 = Ops.pop_back_val();
+ Ops.push_back(
+ DAG.getBitcast(MVT::i32,
+ DAG.getBuildVector(MVT::v2f16, DL,
+ { Elt0, Lanes[0] })));
+ Ops.push_back(
+ DAG.getBitcast(MVT::i32,
+ DAG.getBuildVector(MVT::v2f16, DL,
+ { Lanes[1], Lanes[2] })));
+ }
+ }
+ };
+ if (Is64)
+ DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0, 2);
+ else
+ Ops.push_back(NodePtr);
+
+ Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
+ packLanes(RayOrigin, true);
+ packLanes(RayDir, true);
+ packLanes(RayInvDir, false);
+ Ops.push_back(TDescr);
+ if (IsA16)
+ Ops.push_back(DAG.getTargetConstant(1, DL, MVT::i1));
+ Ops.push_back(M->getChain());
+
+ auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
+ MachineMemOperand *MemRef = M->getMemOperand();
+ DAG.setNodeMemRefs(NewNode, {MemRef});
+ return SDValue(NewNode, 0);
+ }
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
- return lowerImage(Op, ImageDimIntr, DAG);
+ return lowerImage(Op, ImageDimIntr, DAG, true);
return SDValue();
}
@@ -7234,8 +7320,8 @@ SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
return NewOp;
}
-SDValue SITargetLowering::handleD16VData(SDValue VData,
- SelectionDAG &DAG) const {
+SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
+ bool ImageStore) const {
EVT StoreVT = VData.getValueType();
// No change for f16 and legal vector D16 types.
@@ -7243,19 +7329,70 @@ SDValue SITargetLowering::handleD16VData(SDValue VData,
return VData;
SDLoc DL(VData);
- assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
+ unsigned NumElements = StoreVT.getVectorNumElements();
if (Subtarget->hasUnpackedD16VMem()) {
// We need to unpack the packed data to store.
EVT IntStoreVT = StoreVT.changeTypeToInteger();
SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
- EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
- StoreVT.getVectorNumElements());
+ EVT EquivStoreVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
return DAG.UnrollVectorOp(ZExt.getNode());
}
+ // The sq block of gfx8.1 does not estimate register use correctly for d16
+ // image store instructions. The data operand is computed as if it were not a
+ // d16 image instruction.
+ if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
+ // Bitcast to i16
+ EVT IntStoreVT = StoreVT.changeTypeToInteger();
+ SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
+
+ // Decompose into scalars
+ SmallVector<SDValue, 4> Elts;
+ DAG.ExtractVectorElements(IntVData, Elts);
+
+ // Group pairs of i16 into v2i16 and bitcast to i32
+ SmallVector<SDValue, 4> PackedElts;
+ for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
+ SDValue Pair =
+ DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
+ SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
+ PackedElts.push_back(IntPair);
+ }
+ if ((NumElements % 2) == 1) {
+ // Handle v3i16
+ unsigned I = Elts.size() / 2;
+ SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
+ {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
+ SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
+ PackedElts.push_back(IntPair);
+ }
+
+ // Pad using UNDEF
+ PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
+
+ // Build final vector
+ EVT VecVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
+ return DAG.getBuildVector(VecVT, DL, PackedElts);
+ }
+
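As an illustrative sketch of the packing scheme in the block above (not part of the patch, and assuming the low lane lands in the low 16 bits of each word): pairs of 16-bit lanes are fused into i32 words, an odd trailing lane is padded, and the result is widened back to the original lane count so the register usage matches what the buggy gfx8.1 hardware expects.

#include <cstdint>
#include <vector>

// Model of the d16 image-store workaround; "0" stands in for the undef
// padding that the real DAG code uses.
std::vector<uint32_t> packD16Data(const std::vector<uint16_t> &Elts) {
  std::vector<uint32_t> Packed;
  for (size_t I = 0; I + 1 < Elts.size(); I += 2)
    Packed.push_back(uint32_t(Elts[I]) | (uint32_t(Elts[I + 1]) << 16));
  if (Elts.size() % 2 == 1)
    Packed.push_back(uint32_t(Elts.back())); // high half is undef in the DAG
  Packed.resize(Elts.size(), 0);             // pad to the original lane count
  return Packed;
}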
+ if (NumElements == 3) {
+ EVT IntStoreVT =
+ EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
+ SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
+
+ EVT WidenedStoreVT = EVT::getVectorVT(
+ *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
+ EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
+ WidenedStoreVT.getStoreSizeInBits());
+ SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
+ return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
+ }
+
assert(isTypeLegal(StoreVT));
return VData;
}
@@ -7433,8 +7570,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
EVT VDataVT = VData.getValueType();
EVT EltType = VDataVT.getScalarType();
bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
- if (IsD16)
+ if (IsD16) {
VData = handleD16VData(VData, DAG);
+ VDataVT = VData.getValueType();
+ }
if (!isTypeLegal(VDataVT)) {
VData =
@@ -7478,8 +7617,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
EVT EltType = VDataVT.getScalarType();
bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
- if (IsD16)
+ if (IsD16) {
VData = handleD16VData(VData, DAG);
+ VDataVT = VData.getValueType();
+ }
if (!isTypeLegal(VDataVT)) {
VData =
@@ -7514,57 +7655,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
M->getMemoryVT(), M->getMemOperand());
}
-
- case Intrinsic::amdgcn_buffer_atomic_fadd: {
- unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
- unsigned IdxEn = 1;
- if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
- IdxEn = Idx->getZExtValue() != 0;
- SDValue Ops[] = {
- Chain,
- Op.getOperand(2), // vdata
- Op.getOperand(3), // rsrc
- Op.getOperand(4), // vindex
- SDValue(), // voffset -- will be set by setBufferOffsets
- SDValue(), // soffset -- will be set by setBufferOffsets
- SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
- };
- unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
- // We don't know the offset if vindex is non-zero, so clear it.
- if (IdxEn)
- Offset = 0;
- EVT VT = Op.getOperand(2).getValueType();
-
- auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(Offset);
- unsigned Opcode = VT.isVector() ? AMDGPUISD::BUFFER_ATOMIC_PK_FADD
- : AMDGPUISD::BUFFER_ATOMIC_FADD;
-
- return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
- M->getMemOperand());
- }
-
- case Intrinsic::amdgcn_global_atomic_fadd: {
- SDValue Ops[] = {
- Chain,
- Op.getOperand(2), // ptr
- Op.getOperand(3) // vdata
- };
- EVT VT = Op.getOperand(3).getValueType();
-
- auto *M = cast<MemSDNode>(Op);
- if (VT.isVector()) {
- return DAG.getMemIntrinsicNode(
- AMDGPUISD::ATOMIC_PK_FADD, DL, Op->getVTList(), Ops, VT,
- M->getMemOperand());
- }
-
- return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
- DAG.getVTList(VT, MVT::Other), Ops,
- M->getMemOperand()).getValue(1);
- }
case Intrinsic::amdgcn_end_cf:
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
Op->getOperand(2), Chain), 0);
@@ -7572,7 +7662,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
- return lowerImage(Op, ImageDimIntr, DAG);
+ return lowerImage(Op, ImageDimIntr, DAG, true);
return Op;
}
@@ -7848,13 +7938,6 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
"Custom lowering for non-i32 vectors hasn't been implemented.");
- if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
- MemVT, *Load->getMemOperand())) {
- SDValue Ops[2];
- std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
- return DAG.getMergeValues(Ops, DL);
- }
-
unsigned Alignment = Load->getAlignment();
unsigned AS = Load->getAddressSpace();
if (Subtarget->hasLDSMisalignedBug() &&
@@ -7879,9 +7962,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) {
if (MemVT.isPow2VectorType())
return SDValue();
- if (NumElements == 3)
- return WidenVectorLoad(Op, DAG);
- return SplitVectorLoad(Op, DAG);
+ return WidenOrSplitVectorLoad(Op, DAG);
}
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
@@ -7897,9 +7978,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
Alignment >= 4 && NumElements < 32) {
if (MemVT.isPow2VectorType())
return SDValue();
- if (NumElements == 3)
- return WidenVectorLoad(Op, DAG);
- return SplitVectorLoad(Op, DAG);
+ return WidenOrSplitVectorLoad(Op, DAG);
}
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
@@ -7914,7 +7993,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
return SplitVectorLoad(Op, DAG);
// v3 loads not supported on SI.
if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
- return WidenVectorLoad(Op, DAG);
+ return WidenOrSplitVectorLoad(Op, DAG);
+
// v3 and v4 loads are supported for private and global memory.
return SDValue();
}
@@ -7938,15 +8018,19 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
return SplitVectorLoad(Op, DAG);
// v3 loads not supported on SI.
if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
- return WidenVectorLoad(Op, DAG);
+ return WidenOrSplitVectorLoad(Op, DAG);
+
return SDValue();
default:
llvm_unreachable("unsupported private_element_size");
}
} else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
- // Use ds_read_b128 if possible.
- if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
- MemVT.getStoreSize() == 16)
+ // Use ds_read_b128 or ds_read_b96 when possible.
+ if (Subtarget->hasDS96AndDS128() &&
+ ((Subtarget->useDS128() && MemVT.getStoreSize() == 16) ||
+ MemVT.getStoreSize() == 12) &&
+ allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
+ Load->getAlign()))
return SDValue();
if (NumElements > 2)
@@ -7963,6 +8047,14 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
return SplitVectorLoad(Op, DAG);
}
}
+
+ if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ MemVT, *Load->getMemOperand())) {
+ SDValue Ops[2];
+ std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
+ return DAG.getMergeValues(Ops, DL);
+ }
+
return SDValue();
}
@@ -8003,8 +8095,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
EVT VT = Op.getValueType();
const SDNodeFlags Flags = Op->getFlags();
- bool AllowInaccurateRcp = DAG.getTarget().Options.UnsafeFPMath ||
- Flags.hasApproximateFuncs();
+ bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
// Without !fpmath accuracy information, we can't do more because we don't
// know exactly whether rcp is accurate enough to meet !fpmath requirement.
@@ -8045,6 +8136,33 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
}
+SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue X = Op.getOperand(0);
+ SDValue Y = Op.getOperand(1);
+ EVT VT = Op.getValueType();
+ const SDNodeFlags Flags = Op->getFlags();
+
+ bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
+ DAG.getTarget().Options.UnsafeFPMath;
+ if (!AllowInaccurateDiv)
+ return SDValue();
+
+ SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
+ SDValue One = DAG.getConstantFP(1.0, SL, VT);
+
+ SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
+ SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
+
+ R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
+ SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
+ R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
+ SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
+ SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
+ return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
+}
+
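For reference, a scalar sketch of the refinement sequence built above: the hardware reciprocal estimate (AMDGPUISD::RCP) is improved with two Newton-Raphson steps, each computing r = r + r*(1 - y*r), the quotient x*r is formed, and one final FMA corrects it with the residual x - y*q. This model is illustrative only and uses 1.0/Y in place of the RCP estimate.

#include <cmath>

double fastFDiv64Model(double X, double Y) {
  double R = 1.0 / Y;                          // stands in for AMDGPUISD::RCP
  R = std::fma(std::fma(-Y, R, 1.0), R, R);    // first Newton-Raphson step
  R = std::fma(std::fma(-Y, R, 1.0), R, R);    // second Newton-Raphson step
  double Ret = X * R;                          // initial quotient
  double Residual = std::fma(-Y, Ret, X);      // x - y*q
  return std::fma(Residual, R, Ret);           // corrected quotient
}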
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
EVT VT, SDValue A, SDValue B, SDValue GlueChain,
SDNodeFlags Flags) {
@@ -8273,8 +8391,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
}
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
- if (DAG.getTarget().Options.UnsafeFPMath)
- return lowerFastUnsafeFDIV(Op, DAG);
+ if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
+ return FastLowered;
SDLoc SL(Op);
SDValue X = Op.getOperand(0);
@@ -8368,11 +8486,6 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
assert(VT.isVector() &&
Store->getValue().getValueType().getScalarType() == MVT::i32);
- if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
- VT, *Store->getMemOperand())) {
- return expandUnalignedStore(Store, DAG);
- }
-
unsigned AS = Store->getAddressSpace();
if (Subtarget->hasLDSMisalignedBug() &&
AS == AMDGPUAS::FLAT_ADDRESS &&
@@ -8397,6 +8510,11 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
// v3 stores not supported on SI.
if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
return SplitVectorStore(Op, DAG);
+
+ if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ VT, *Store->getMemOperand()))
+ return expandUnalignedStore(Store, DAG);
+
return SDValue();
} else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
switch (Subtarget->getMaxPrivateElementSize()) {
@@ -8407,16 +8525,20 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return SplitVectorStore(Op, DAG);
return SDValue();
case 16:
- if (NumElements > 4 || NumElements == 3)
+ if (NumElements > 4 ||
+ (NumElements == 3 && !Subtarget->enableFlatScratch()))
return SplitVectorStore(Op, DAG);
return SDValue();
default:
llvm_unreachable("unsupported private_element_size");
}
} else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
- // Use ds_write_b128 if possible.
- if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
- VT.getStoreSize() == 16 && NumElements != 3)
+ // Use ds_write_b128 or ds_write_b96 when possible.
+ if (Subtarget->hasDS96AndDS128() &&
+ ((Subtarget->useDS128() && VT.getStoreSize() == 16) ||
+ (VT.getStoreSize() == 12)) &&
+ allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
+ Store->getAlign()))
return SDValue();
if (NumElements > 2)
@@ -8433,6 +8555,13 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return SplitVectorStore(Op, DAG);
}
+ if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ VT, *Store->getMemOperand())) {
+ if (VT.isVector())
+ return SplitVectorStore(Op, DAG);
+ return expandUnalignedStore(Store, DAG);
+ }
+
return SDValue();
} else {
llvm_unreachable("unhandled address space");
@@ -8474,7 +8603,7 @@ SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) co
unsigned AS = AtomicNode->getAddressSpace();
// No custom lowering required for local address space
- if (!isFlatGlobalAddrSpace(AS))
+ if (!AMDGPU::isFlatGlobalAddrSpace(AS))
return Op;
// Non-local address space requires custom lowering for atomic compare
@@ -8584,7 +8713,7 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
EVT VT = N->getValueType(0);
SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
- SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
+ SDValue COffset = DAG.getConstant(Offset, SL, VT);
SDNodeFlags Flags;
Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
@@ -8594,12 +8723,28 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
}
+/// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
+/// index must be offset past the chain and intrinsic ID. Theoretically we would
+/// also need to check the specific intrinsic, but they all place the pointer
+/// operand first.
+static unsigned getBasePtrIndex(const MemSDNode *N) {
+ switch (N->getOpcode()) {
+ case ISD::STORE:
+ case ISD::INTRINSIC_W_CHAIN:
+ case ISD::INTRINSIC_VOID:
+ return 2;
+ default:
+ return 1;
+ }
+}
+
SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
DAGCombinerInfo &DCI) const {
- SDValue Ptr = N->getBasePtr();
SelectionDAG &DAG = DCI.DAG;
SDLoc SL(N);
+ unsigned PtrIdx = getBasePtrIndex(N);
+ SDValue Ptr = N->getOperand(PtrIdx);
+
// TODO: We could also do this for multiplies.
if (Ptr.getOpcode() == ISD::SHL) {
SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
@@ -8607,7 +8752,7 @@ SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
if (NewPtr) {
SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
- NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
+ NewOps[PtrIdx] = NewPtr;
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
}
}
@@ -8868,7 +9013,7 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
// and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
- N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
+ N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
uint32_t LHSMask = getPermuteMask(DAG, LHS);
uint32_t RHSMask = getPermuteMask(DAG, RHS);
if (LHSMask != ~0u && RHSMask != ~0u) {
@@ -8965,7 +9110,7 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
// or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
- N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
+ N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
uint32_t LHSMask = getPermuteMask(DAG, LHS);
uint32_t RHSMask = getPermuteMask(DAG, RHS);
if (LHSMask != ~0u && RHSMask != ~0u) {
@@ -10509,8 +10654,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
return SDValue();
switch (N->getOpcode()) {
- default:
- return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
case ISD::ADD:
return performAddCombine(N, DCI);
case ISD::SUB:
@@ -10537,35 +10680,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performMinMaxCombine(N, DCI);
case ISD::FMA:
return performFMACombine(N, DCI);
- case ISD::LOAD: {
- if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))
- return Widended;
- LLVM_FALLTHROUGH;
- }
- case ISD::STORE:
- case ISD::ATOMIC_LOAD:
- case ISD::ATOMIC_STORE:
- case ISD::ATOMIC_CMP_SWAP:
- case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
- case ISD::ATOMIC_SWAP:
- case ISD::ATOMIC_LOAD_ADD:
- case ISD::ATOMIC_LOAD_SUB:
- case ISD::ATOMIC_LOAD_AND:
- case ISD::ATOMIC_LOAD_OR:
- case ISD::ATOMIC_LOAD_XOR:
- case ISD::ATOMIC_LOAD_NAND:
- case ISD::ATOMIC_LOAD_MIN:
- case ISD::ATOMIC_LOAD_MAX:
- case ISD::ATOMIC_LOAD_UMIN:
- case ISD::ATOMIC_LOAD_UMAX:
- case ISD::ATOMIC_LOAD_FADD:
- case AMDGPUISD::ATOMIC_INC:
- case AMDGPUISD::ATOMIC_DEC:
- case AMDGPUISD::ATOMIC_LOAD_FMIN:
- case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
- if (DCI.isBeforeLegalize())
- break;
- return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
case ISD::AND:
return performAndCombine(N, DCI);
case ISD::OR:
@@ -10630,14 +10744,28 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performExtractVectorEltCombine(N, DCI);
case ISD::INSERT_VECTOR_ELT:
return performInsertVectorEltCombine(N, DCI);
+ case ISD::LOAD: {
+ if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))
+ return Widended;
+ LLVM_FALLTHROUGH;
+ }
+ default: {
+ if (!DCI.isBeforeLegalize()) {
+ if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
+ return performMemSDNodeCombine(MemNode, DCI);
+ }
+
+ break;
}
+ }
+
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
/// Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
switch (Idx) {
- default: return 0;
+ default: return ~0u;
case AMDGPU::sub0: return 0;
case AMDGPU::sub1: return 1;
case AMDGPU::sub2: return 2;
@@ -10697,6 +10825,8 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
// set, etc.
Lane = SubIdx2Lane(I->getConstantOperandVal(1));
+ if (Lane == ~0u)
+ return Node;
// Check if the use is for the TFE/LWE generated result at VGPRn+1.
if (UsesTFC && Lane == TFCLane) {
@@ -10826,8 +10956,7 @@ SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
// Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
// to try understanding copies to physical registers.
- if (SrcVal.getValueType() == MVT::i1 &&
- Register::isPhysicalRegister(DestReg->getReg())) {
+ if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
SDLoc SL(Node);
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
SDValue VReg = DAG.getRegister(
@@ -10870,7 +10999,8 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
unsigned Opcode = Node->getMachineOpcode();
if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
- !TII->isGather4(Opcode)) {
+ !TII->isGather4(Opcode) &&
+ AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) != -1) {
return adjustWritemask(Node, DAG);
}
@@ -10881,14 +11011,14 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
}
switch (Opcode) {
- case AMDGPU::V_DIV_SCALE_F32:
- case AMDGPU::V_DIV_SCALE_F64: {
+ case AMDGPU::V_DIV_SCALE_F32_e64:
+ case AMDGPU::V_DIV_SCALE_F64_e64: {
// Satisfy the operand register constraint when one of the inputs is
// undefined. Ordinarily each undef value will have its own implicit_def of
// a vreg, so force these to use a single register.
- SDValue Src0 = Node->getOperand(0);
- SDValue Src1 = Node->getOperand(1);
- SDValue Src2 = Node->getOperand(2);
+ SDValue Src0 = Node->getOperand(1);
+ SDValue Src1 = Node->getOperand(3);
+ SDValue Src2 = Node->getOperand(5);
if ((Src0.isMachineOpcode() &&
Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
@@ -10923,10 +11053,10 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
} else
break;
- SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
- for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
- Ops.push_back(Node->getOperand(I));
-
+ SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end());
+ Ops[1] = Src0;
+ Ops[3] = Src1;
+ Ops[5] = Src2;
Ops.push_back(ImpDef.getValue(1));
return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
}
@@ -10962,8 +11092,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
MachineOperand &Op = MI.getOperand(I);
if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID &&
OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) ||
- !Register::isVirtualRegister(Op.getReg()) ||
- !TRI->isAGPR(MRI, Op.getReg()))
+ !Op.getReg().isVirtual() || !TRI->isAGPR(MRI, Op.getReg()))
continue;
auto *Src = MRI.getUniqueVRegDef(Op.getReg());
if (!Src || !Src->isCopy() ||
@@ -10985,8 +11114,12 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
if (NoRetAtomicOp != -1) {
if (!Node->hasAnyUseOfValue(0)) {
- MI.setDesc(TII->get(NoRetAtomicOp));
+ int Glc1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::glc1);
+ if (Glc1Idx != -1)
+ MI.RemoveOperand(Glc1Idx);
MI.RemoveOperand(0);
+ MI.setDesc(TII->get(NoRetAtomicOp));
return;
}
@@ -11341,17 +11474,7 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
Info->limitOccupancy(MF);
if (ST.isWave32() && !MF.empty()) {
- // Add VCC_HI def because many instructions marked as imp-use VCC where
- // we may only define VCC_LO. If nothing defines VCC_HI we may end up
- // having a use of undef.
-
const SIInstrInfo *TII = ST.getInstrInfo();
- DebugLoc DL;
-
- MachineBasicBlock &MBB = MF.front();
- MachineBasicBlock::iterator I = MBB.getFirstNonDebugInstr();
- BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), AMDGPU::VCC_HI);
-
for (auto &MBB : MF) {
for (auto &MI : MBB) {
TII->fixImplicitOperands(MI);
@@ -11379,6 +11502,55 @@ void SITargetLowering::computeKnownBitsForFrameIndex(
Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
}
+static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
+ KnownBits &Known, unsigned Dim) {
+ unsigned MaxValue =
+ ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
+ Known.Zero.setHighBits(countLeadingZeros(MaxValue));
+}
+
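A small illustration of the computation above, with a hypothetical value and C++20 std::countl_zero standing in for LLVM's countLeadingZeros: if the maximum workitem id in a dimension is 1023, the id fits in 10 bits, so the top 22 bits of the 32-bit register are known zero.

#include <bit>
#include <cstdint>

// countLeadingZeros(MaxValue) == number of guaranteed-zero high bits.
unsigned knownZeroHighBits(uint32_t MaxValue) {
  return std::countl_zero(MaxValue); // e.g. MaxValue = 1023 -> 22
}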
+void SITargetLowering::computeKnownBitsForTargetInstr(
+ GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
+ const MachineRegisterInfo &MRI, unsigned Depth) const {
+ const MachineInstr *MI = MRI.getVRegDef(R);
+ switch (MI->getOpcode()) {
+ case AMDGPU::G_INTRINSIC: {
+ switch (MI->getIntrinsicID()) {
+ case Intrinsic::amdgcn_workitem_id_x:
+ knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
+ break;
+ case Intrinsic::amdgcn_workitem_id_y:
+ knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
+ break;
+ case Intrinsic::amdgcn_workitem_id_z:
+ knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
+ break;
+ case Intrinsic::amdgcn_mbcnt_lo:
+ case Intrinsic::amdgcn_mbcnt_hi: {
+ // These return at most the wavefront size - 1.
+ unsigned Size = MRI.getType(R).getSizeInBits();
+ Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
+ break;
+ }
+ case Intrinsic::amdgcn_groupstaticsize: {
+ // We can report everything over the maximum size as 0. We can't report
+ // based on the actual size because we don't know if it's accurate or not
+ // at any given point.
+ Known.Zero.setHighBits(countLeadingZeros(getSubtarget()->getLocalMemorySize()));
+ break;
+ }
+ }
+ break;
+ }
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
+ Known.Zero.setHighBits(24);
+ break;
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
+ Known.Zero.setHighBits(16);
+ break;
+ }
+}
+
Align SITargetLowering::computeKnownAlignForTargetInstr(
GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
unsigned Depth) const {
@@ -11484,46 +11656,40 @@ static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
return false;
}
-bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
- FunctionLoweringInfo * FLI, LegacyDivergenceAnalysis * KDA) const
-{
+bool SITargetLowering::isSDNodeSourceOfDivergence(
+ const SDNode *N, FunctionLoweringInfo *FLI,
+ LegacyDivergenceAnalysis *KDA) const {
switch (N->getOpcode()) {
- case ISD::CopyFromReg:
- {
- const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
- const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
- const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
- Register Reg = R->getReg();
+ case ISD::CopyFromReg: {
+ const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
+ const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ Register Reg = R->getReg();
- // FIXME: Why does this need to consider isLiveIn?
- if (Reg.isPhysical() || MRI.isLiveIn(Reg))
- return !TRI->isSGPRReg(MRI, Reg);
+ // FIXME: Why does this need to consider isLiveIn?
+ if (Reg.isPhysical() || MRI.isLiveIn(Reg))
+ return !TRI->isSGPRReg(MRI, Reg);
- if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
- return KDA->isDivergent(V);
+ if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
+ return KDA->isDivergent(V);
- assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
- return !TRI->isSGPRReg(MRI, Reg);
- }
- break;
- case ISD::LOAD: {
- const LoadSDNode *L = cast<LoadSDNode>(N);
- unsigned AS = L->getAddressSpace();
- // A flat load may access private memory.
- return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
- } break;
- case ISD::CALLSEQ_END:
+ assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
+ return !TRI->isSGPRReg(MRI, Reg);
+ }
+ case ISD::LOAD: {
+ const LoadSDNode *L = cast<LoadSDNode>(N);
+ unsigned AS = L->getAddressSpace();
+ // A flat load may access private memory.
+ return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
+ }
+ case ISD::CALLSEQ_END:
return true;
- break;
- case ISD::INTRINSIC_WO_CHAIN:
- {
-
- }
- return AMDGPU::isIntrinsicSourceOfDivergence(
- cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
- case ISD::INTRINSIC_W_CHAIN:
- return AMDGPU::isIntrinsicSourceOfDivergence(
- cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
+ case ISD::INTRINSIC_WO_CHAIN:
+ return AMDGPU::isIntrinsicSourceOfDivergence(
+ cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
+ case ISD::INTRINSIC_W_CHAIN:
+ return AMDGPU::isIntrinsicSourceOfDivergence(
+ cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
}
return false;
}
@@ -11558,6 +11724,16 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
SNaN, Depth);
}
+// Global FP atomic instructions have a hardcoded FP mode and do not support
+// FP32 denormals, and only support v2f16 denormals.
+static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
+ const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
+ auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
+ if (&Flt == &APFloat::IEEEsingle())
+ return DenormMode == DenormalMode::getPreserveSign();
+ return DenormMode == DenormalMode::getIEEE();
+}
+
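A simplified model of the rule encoded above (not the LLVM API): a global FP atomic can be used directly only when the function's denormal mode for the type matches the hardware's fixed mode, preserve-sign for f32 and IEEE for the 16-bit packed case; otherwise the RMW is expanded to a cmpxchg loop.

enum class DenormMode { PreserveSign, IEEE, Other };

// IsF32 distinguishes the f32 case from the v2f16 case handled above.
bool fpModeMatchesGlobalFPAtomicModeModel(bool IsF32, DenormMode Mode) {
  return IsF32 ? Mode == DenormMode::PreserveSign : Mode == DenormMode::IEEE;
}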
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
switch (RMW->getOperation()) {
@@ -11576,10 +11752,15 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
unsigned AS = RMW->getPointerAddressSpace();
if (AS == AMDGPUAS::GLOBAL_ADDRESS && Subtarget->hasAtomicFaddInsts()) {
+ if (!fpModeMatchesGlobalFPAtomicMode(RMW))
+ return AtomicExpansionKind::CmpXChg;
+
return RMW->use_empty() ? AtomicExpansionKind::None :
AtomicExpansionKind::CmpXChg;
}
+  // DS FP atomics do respect the denormal mode, but the rounding mode is fixed
+ // to round-to-nearest-even.
return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h
index f4c076464057..823d6eca9bf8 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -16,10 +16,17 @@
#include "AMDGPUISelLowering.h"
#include "AMDGPUArgumentUsageInfo.h"
-#include "SIInstrInfo.h"
namespace llvm {
+class GCNSubtarget;
+class SIMachineFunctionInfo;
+class SIRegisterInfo;
+
+namespace AMDGPU {
+struct ImageDimIntrinsicInfo;
+}
+
class SITargetLowering final : public AMDGPUTargetLowering {
private:
const GCNSubtarget *Subtarget;
@@ -59,10 +66,15 @@ private:
SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
MVT VT, unsigned Offset) const;
SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
- SelectionDAG &DAG) const;
+ SelectionDAG &DAG, bool WithChain) const;
SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset,
SDValue CachePolicy, SelectionDAG &DAG) const;
+ SDValue lowerRawBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
+ unsigned NewOpcode) const;
+ SDValue lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
+ unsigned NewOpcode) const;
+
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
@@ -80,12 +92,12 @@ private:
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFastUnsafeFDIV64(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV16(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
@@ -104,7 +116,8 @@ private:
ArrayRef<SDValue> Ops, EVT MemVT,
MachineMemOperand *MMO, SelectionDAG &DAG) const;
- SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const;
+ SDValue handleD16VData(SDValue VData, SelectionDAG &DAG,
+ bool ImageStore = false) const;
/// Converts \p Op, which must be of floating point type, to the
/// floating point type \p VT, by either extending or truncating it.
@@ -255,12 +268,22 @@ public:
const SelectionDAG &DAG) const override;
bool allowsMisalignedMemoryAccessesImpl(
- unsigned Size, unsigned AS, unsigned Align,
+ unsigned Size, unsigned AddrSpace, Align Alignment,
MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
bool *IsFast = nullptr) const;
bool allowsMisalignedMemoryAccesses(
- EVT VT, unsigned AS, unsigned Align,
+ LLT Ty, unsigned AddrSpace, Align Alignment,
+ MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
+ bool *IsFast = nullptr) const override {
+ if (IsFast)
+ *IsFast = false;
+ return allowsMisalignedMemoryAccessesImpl(Ty.getSizeInBits(), AddrSpace,
+ Alignment, Flags, IsFast);
+ }
+
+ bool allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned AS, unsigned Alignment,
MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
bool *IsFast = nullptr) const override;
@@ -270,20 +293,8 @@ public:
bool isMemOpUniform(const SDNode *N) const;
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const;
- static bool isNonGlobalAddrSpace(unsigned AS) {
- return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
- AS == AMDGPUAS::PRIVATE_ADDRESS;
- }
+ static bool isNonGlobalAddrSpace(unsigned AS);
- // FIXME: Missing constant_32bit
- static bool isFlatGlobalAddrSpace(unsigned AS) {
- return AS == AMDGPUAS::GLOBAL_ADDRESS ||
- AS == AMDGPUAS::FLAT_ADDRESS ||
- AS == AMDGPUAS::CONSTANT_ADDRESS ||
- AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
- }
-
- bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
TargetLoweringBase::LegalizeTypeAction
@@ -366,6 +377,8 @@ public:
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
+ LLT getPreferredShiftAmountTy(LLT Ty) const override;
+
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const override;
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override;
@@ -412,6 +425,11 @@ public:
void computeKnownBitsForFrameIndex(int FrameIdx,
KnownBits &Known,
const MachineFunction &MF) const override;
+ void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R,
+ KnownBits &Known,
+ const APInt &DemandedElts,
+ const MachineRegisterInfo &MRI,
+ unsigned Depth = 0) const override;
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R,
const MachineRegisterInfo &MRI,
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index 35c49ae8c0dd..5611c9c5d57e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -31,8 +31,9 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
index 052db5f6ea71..9d31cd5cedc3 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -14,30 +14,11 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetMachine.h"
-#include <cassert>
-#include <cstdint>
-#include <iterator>
using namespace llvm;
@@ -58,16 +39,18 @@ private:
MachineDominatorTree *MDT = nullptr;
MachineBasicBlock *EarlyExitBlock = nullptr;
+ bool EarlyExitClearsExec = false;
bool shouldSkip(const MachineBasicBlock &From,
const MachineBasicBlock &To) const;
bool dominatesAllReachable(MachineBasicBlock &MBB);
- void createEarlyExitBlock(MachineBasicBlock &MBB);
+ void ensureEarlyExitBlock(MachineBasicBlock &MBB, bool ClearExec);
void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
DebugLoc DL);
bool kill(MachineInstr &MI);
+ void earlyTerm(MachineInstr &MI);
bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
@@ -164,31 +147,62 @@ bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) {
return true;
}
-static void generatePsEndPgm(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- const SIInstrInfo *TII) {
- // Generate "null export; s_endpgm".
- BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
- .addImm(0x09) // V_008DFC_SQ_EXP_NULL
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addImm(1) // vm
- .addImm(0) // compr
- .addImm(0); // en
+static void generateEndPgm(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL,
+ const SIInstrInfo *TII, bool IsPS) {
+ // "null export"
+ if (IsPS) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
+ .addImm(AMDGPU::Exp::ET_NULL)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addImm(1) // vm
+ .addImm(0) // compr
+ .addImm(0); // en
+ }
+ // s_endpgm
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
}
-void SIInsertSkips::createEarlyExitBlock(MachineBasicBlock &MBB) {
+void SIInsertSkips::ensureEarlyExitBlock(MachineBasicBlock &MBB,
+ bool ClearExec) {
MachineFunction *MF = MBB.getParent();
DebugLoc DL;
- assert(!EarlyExitBlock);
- EarlyExitBlock = MF->CreateMachineBasicBlock();
- MF->insert(MF->end(), EarlyExitBlock);
+ if (!EarlyExitBlock) {
+ EarlyExitBlock = MF->CreateMachineBasicBlock();
+ MF->insert(MF->end(), EarlyExitBlock);
+ generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII,
+ MF->getFunction().getCallingConv() ==
+ CallingConv::AMDGPU_PS);
+ EarlyExitClearsExec = false;
+ }
- generatePsEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII);
+ if (ClearExec && !EarlyExitClearsExec) {
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ unsigned Mov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ Register Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ auto ExitI = EarlyExitBlock->getFirstNonPHI();
+ BuildMI(*EarlyExitBlock, ExitI, DL, TII->get(Mov), Exec).addImm(0);
+ EarlyExitClearsExec = true;
+ }
+}
+
+static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
+ MachineDominatorTree *MDT) {
+ MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/ true);
+
+ // Update dominator tree
+ using DomTreeT = DomTreeBase<MachineBasicBlock>;
+ SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
+ for (MachineBasicBlock *Succ : SplitBB->successors()) {
+ DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
+ DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
+ }
+ DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
+ MDT->getBase().applyUpdates(DTUpdates);
}
/// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given
@@ -196,6 +210,7 @@ void SIInsertSkips::createEarlyExitBlock(MachineBasicBlock &MBB) {
void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, DebugLoc DL) {
MachineFunction *MF = MBB.getParent();
+ (void)MF;
assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS);
// It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a
@@ -211,45 +226,22 @@ void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
// In this case, we write the "null_export; s_endpgm" skip code in the
// already-existing basic block.
auto NextBBI = std::next(MBB.getIterator());
- bool NoSuccessor = I == MBB.end() &&
- llvm::find(MBB.successors(), &*NextBBI) == MBB.succ_end();
+ bool NoSuccessor =
+ I == MBB.end() && !llvm::is_contained(MBB.successors(), &*NextBBI);
if (NoSuccessor) {
- generatePsEndPgm(MBB, I, DL, TII);
+ generateEndPgm(MBB, I, DL, TII, true);
} else {
- if (!EarlyExitBlock) {
- createEarlyExitBlock(MBB);
- // Update next block pointer to reflect any new blocks
- NextBBI = std::next(MBB.getIterator());
- }
+ ensureEarlyExitBlock(MBB, false);
- auto BranchMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
- .addMBB(EarlyExitBlock);
+ MachineInstr *BranchMI =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
+ .addMBB(EarlyExitBlock);
// Split the block if the branch will not come at the end.
auto Next = std::next(BranchMI->getIterator());
- if (Next != MBB.end() && !Next->isTerminator()) {
- MachineBasicBlock *SplitBB =
- MF->CreateMachineBasicBlock(MBB.getBasicBlock());
- MF->insert(NextBBI, SplitBB);
- SplitBB->splice(SplitBB->begin(), &MBB, I, MBB.end());
- SplitBB->transferSuccessorsAndUpdatePHIs(&MBB);
- // FIXME: the expectation is that this will be used near the beginning
- // of a block so just assume all registers are still live.
- for (auto LiveIn : MBB.liveins())
- SplitBB->addLiveIn(LiveIn);
- MBB.addSuccessor(SplitBB);
-
- // Update dominator tree
- using DomTreeT = DomTreeBase<MachineBasicBlock>;
- SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
- for (MachineBasicBlock *Succ : SplitBB->successors()) {
- DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
- DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
- }
- DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
- MDT->getBase().applyUpdates(DTUpdates);
- }
+ if (Next != MBB.end() && !Next->isTerminator())
+ splitBlock(MBB, *BranchMI, MDT);
MBB.addSuccessor(EarlyExitBlock);
MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
@@ -382,6 +374,23 @@ bool SIInsertSkips::kill(MachineInstr &MI) {
}
}
+void SIInsertSkips::earlyTerm(MachineInstr &MI) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ const DebugLoc DL = MI.getDebugLoc();
+
+ ensureEarlyExitBlock(MBB, true);
+
+ auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0))
+ .addMBB(EarlyExitBlock);
+ auto Next = std::next(MI.getIterator());
+
+ if (Next != MBB.end() && !Next->isTerminator())
+ splitBlock(MBB, *BranchMI, MDT);
+
+ MBB.addSuccessor(EarlyExitBlock);
+ MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
+}
+
// Returns true if a branch over the block was inserted.
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
MachineBasicBlock &SrcMBB) {
@@ -407,6 +416,7 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
SkipThreshold = SkipThresholdFlag;
SmallVector<MachineInstr *, 4> KillInstrs;
+ SmallVector<MachineInstr *, 4> EarlyTermInstrs;
bool MadeChange = false;
for (MachineBasicBlock &MBB : MF) {
@@ -465,18 +475,29 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
}
break;
+ case AMDGPU::SI_EARLY_TERMINATE_SCC0:
+ EarlyTermInstrs.push_back(&MI);
+ break;
+
default:
break;
}
}
}
+ for (MachineInstr *Instr : EarlyTermInstrs) {
+ // Early termination in GS does nothing
+ if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS)
+ earlyTerm(*Instr);
+ Instr->eraseFromParent();
+ }
for (MachineInstr *Kill : KillInstrs) {
skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()),
Kill->getDebugLoc());
Kill->eraseFromParent();
}
KillInstrs.clear();
+ EarlyTermInstrs.clear();
EarlyExitBlock = nullptr;
return MadeChange;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 2a157eb20ab4..c12745586da1 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -24,41 +24,15 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIDefines.h"
-#include "SIInstrInfo.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachinePostDominators.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/InitializePasses.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <cstring>
-#include <memory>
-#include <utility>
-
+#include "llvm/Support/TargetParser.h"
using namespace llvm;
#define DEBUG_TYPE "si-insert-waitcnts"
@@ -458,6 +432,7 @@ public:
#endif // NDEBUG
}
+ bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
@@ -486,7 +461,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
RegInterval Result;
- unsigned Reg = TRI->getEncodingValue(Op.getReg());
+ unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST));
if (TRI->isVGPR(*MRI, Op.getReg())) {
assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
@@ -623,8 +598,9 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
MachineOperand &DefMO = Inst.getOperand(I);
if (DefMO.isReg() && DefMO.isDef() &&
TRI->isVGPR(*MRI, DefMO.getReg())) {
- setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
- CurrScore);
+ setRegScore(
+ TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
+ EXP_CNT, CurrScore);
}
}
}
@@ -855,7 +831,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
setForceEmitWaitcnt();
bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
- if (MI.isDebugInstr())
+ if (MI.isMetaInstruction())
return false;
AMDGPU::Waitcnt Wait;
@@ -876,7 +852,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
- Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
+ Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
}
// Resolve vm waits before gs-done.
else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
@@ -963,26 +939,28 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
int CallAddrOpIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
- RegInterval CallAddrOpInterval =
+
+ if (MI.getOperand(CallAddrOpIdx).isReg()) {
+ RegInterval CallAddrOpInterval =
ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx);
- for (int RegNo = CallAddrOpInterval.first;
- RegNo < CallAddrOpInterval.second; ++RegNo)
- ScoreBrackets.determineWait(
+ for (int RegNo = CallAddrOpInterval.first;
+ RegNo < CallAddrOpInterval.second; ++RegNo)
+ ScoreBrackets.determineWait(
LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
- int RtnAddrOpIdx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
- if (RtnAddrOpIdx != -1) {
- RegInterval RtnAddrOpInterval =
+ int RtnAddrOpIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
+ if (RtnAddrOpIdx != -1) {
+ RegInterval RtnAddrOpInterval =
ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx);
- for (int RegNo = RtnAddrOpInterval.first;
- RegNo < RtnAddrOpInterval.second; ++RegNo)
- ScoreBrackets.determineWait(
+ for (int RegNo = RtnAddrOpInterval.first;
+ RegNo < RtnAddrOpInterval.second; ++RegNo)
+ ScoreBrackets.determineWait(
LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
+ }
}
-
} else {
// FIXME: Should not be relying on memoperands.
// Look at the source operands of every instruction to see if
@@ -1024,8 +1002,10 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
continue;
RegInterval Interval =
ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
+
+ const bool IsVGPR = TRI->isVGPR(*MRI, Op.getReg());
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- if (TRI->isVGPR(*MRI, Op.getReg())) {
+ if (IsVGPR) {
// RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
// previous write and this write are the same type of VMEM
// instruction, in which case they're guaranteed to write their
@@ -1055,7 +1035,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// requiring a WAITCNT beforehand.
if (MI.getOpcode() == AMDGPU::S_BARRIER &&
!ST->hasAutoWaitcntBeforeBarrier()) {
- Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
+ Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
}
// TODO: Remove this work-around, enable the assert for Bug 457939
@@ -1088,8 +1068,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
} else {
assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
- ScoreBrackets.applyWaitcnt(
- AMDGPU::Waitcnt(~0u, ~0u, ~0u, II->getOperand(1).getImm()));
+ auto W = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
+ ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt(~0u, ~0u, ~0u, W));
}
}
}
@@ -1097,7 +1077,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
}
if (ForceEmitZeroWaitcnts)
- Wait = AMDGPU::Waitcnt::allZero(IV);
+ Wait = AMDGPU::Waitcnt::allZero(ST->hasVscnt());
if (ForceEmitWaitcnt[VM_CNT])
Wait.VmCnt = 0;
@@ -1137,12 +1117,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
- unsigned ICnt = II->getOperand(1).getImm();
+ unsigned ICnt = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)
+ ->getImm();
OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt);
if (!TrackedWaitcntSet.count(&*II))
Wait.VsCnt = std::min(Wait.VsCnt, ICnt);
if (Wait.VsCnt != ICnt) {
- II->getOperand(1).setImm(Wait.VsCnt);
+ TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->setImm(Wait.VsCnt);
Modified = true;
}
Wait.VsCnt = ~0u;
@@ -1189,12 +1170,50 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
return Modified;
}
-// This is a flat memory operation. Check to see if it has memory
-// tokens for both LDS and Memory, and if so mark it as a flat.
+// This is a flat memory operation. Check to see if it has memory tokens other
+// than LDS. Other address spaces supported by flat memory operations involve
+// global memory.
+bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
+ assert(TII->isFLAT(MI));
+
+ // All flat instructions use the VMEM counter.
+ assert(TII->usesVM_CNT(MI));
+
+ // If there are no memory operands then conservatively assume the flat
+ // operation may access VMEM.
+ if (MI.memoperands_empty())
+ return true;
+
+ // See if any memory operand specifies an address space that involves VMEM.
+  // Flat operations only support FLAT, LOCAL (LDS), or address spaces
+ // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
+ // (GDS) address space is not supported by flat operations. Therefore, simply
+ // return true unless only the LDS address space is found.
+ for (const MachineMemOperand *Memop : MI.memoperands()) {
+ unsigned AS = Memop->getAddrSpace();
+ assert(AS != AMDGPUAS::REGION_ADDRESS);
+ if (AS != AMDGPUAS::LOCAL_ADDRESS)
+ return true;
+ }
+
+ return false;
+}
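As a rough sketch of this classification and of the LDS check that follows (a model, not the in-tree API): a FLAT access is charged to the VMEM counter unless every memory operand is known to be LDS-only, and, when the instruction uses the LGKM counter, it is also charged to LGKM if any operand may be LDS or generic FLAT.

#include <vector>

enum class AddrSpace { Flat, Global, Constant, Private, Local };

bool mayTouchVMEM(const std::vector<AddrSpace> &MemOps) {
  if (MemOps.empty())
    return true; // no memory operands: conservatively assume VMEM
  for (AddrSpace AS : MemOps)
    if (AS != AddrSpace::Local)
      return true;
  return false;
}

bool mayTouchLDS(const std::vector<AddrSpace> &MemOps) {
  if (MemOps.empty())
    return true; // no memory operands: conservatively assume LDS
  for (AddrSpace AS : MemOps)
    if (AS == AddrSpace::Local || AS == AddrSpace::Flat)
      return true;
  return false;
}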
+
+// This is a flat memory operation. Check to see if it has memory tokens for
+// either LDS or FLAT.
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
+ assert(TII->isFLAT(MI));
+
+  // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
+ if (!TII->usesLGKM_CNT(MI))
+ return false;
+
+ // If there are no memory operands then conservatively assume the flat
+ // operation may access LDS.
if (MI.memoperands_empty())
return true;
+ // See if any memory operand specifies an address space that involves LDS.
for (const MachineMemOperand *Memop : MI.memoperands()) {
unsigned AS = Memop->getAddrSpace();
if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
@@ -1221,7 +1240,10 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
} else if (TII->isFLAT(Inst)) {
assert(Inst.mayLoadOrStore());
- if (TII->usesVM_CNT(Inst)) {
+ int FlatASCount = 0;
+
+ if (mayAccessVMEMThroughFlat(Inst)) {
+ ++FlatASCount;
if (!ST->hasVscnt())
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
else if (Inst.mayLoad() &&
@@ -1231,15 +1253,19 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
}
- if (TII->usesLGKM_CNT(Inst)) {
+ if (mayAccessLDSThroughFlat(Inst)) {
+ ++FlatASCount;
ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
-
- // This is a flat memory operation, so note it - it will require
- // that both the VM and LGKM be flushed to zero if it is pending when
- // a VM or LGKM dependency occurs.
- if (mayAccessLDSThroughFlat(Inst))
- ScoreBrackets->setPendingFlat();
}
+
+ // A Flat memory operation must access at least one address space.
+ assert(FlatASCount);
+
+  // This is a flat memory operation that accesses both VMEM and LDS, so note it
+ // - it will require that both the VM and LGKM be flushed to zero if it is
+ // pending when a VM or LGKM dependency occurs.
+ if (FlatASCount > 1)
+ ScoreBrackets->setPendingFlat();
} else if (SIInstrInfo::isVMEM(Inst) &&
// TODO: get a better carve out.
Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
@@ -1266,34 +1292,29 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
} else if (Inst.isCall()) {
if (callWaitsOnFunctionReturn(Inst)) {
// Act as a wait on everything
- ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(IV));
+ ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
} else {
       // May need to wait for anything.
ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
}
+ } else if (SIInstrInfo::isEXP(Inst)) {
+ unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
+ if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
+ else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
+ else
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
} else {
switch (Inst.getOpcode()) {
case AMDGPU::S_SENDMSG:
case AMDGPU::S_SENDMSGHALT:
ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
break;
- case AMDGPU::EXP:
- case AMDGPU::EXP_DONE: {
- int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
- if (Imm >= 32 && Imm <= 63)
- ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
- else if (Imm >= 12 && Imm <= 15)
- ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
- else
- ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
- break;
- }
case AMDGPU::S_MEMTIME:
case AMDGPU::S_MEMREALTIME:
ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
break;
- default:
- break;
}
}
}
@@ -1381,9 +1402,19 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
ScoreBrackets.dump();
});
- // Assume VCCZ is correct at basic block boundaries, unless and until we need
- // to handle cases where that is not true.
+ // Track the correctness of vccz through this basic block. There are two
+ // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
+ // ST->partialVCCWritesUpdateVCCZ().
bool VCCZCorrect = true;
+ if (ST->hasReadVCCZBug()) {
+ // vccz could be incorrect at a basic block boundary if a predecessor wrote
+ // to vcc and then issued an smem load.
+ VCCZCorrect = false;
+ } else if (!ST->partialVCCWritesUpdateVCCZ()) {
+ // vccz could be incorrect at a basic block boundary if a predecessor wrote
+ // to vcc_lo or vcc_hi.
+ VCCZCorrect = false;
+ }
// Walk over the instructions.
MachineInstr *OldWaitcntInstr = nullptr;
@@ -1404,14 +1435,21 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
continue;
}
- // We might need to restore vccz to its correct value for either of two
- // different reasons; see ST->hasReadVCCZBug() and
- // ST->partialVCCWritesUpdateVCCZ().
- bool RestoreVCCZ = false;
- if (readsVCCZ(Inst)) {
- if (!VCCZCorrect)
- RestoreVCCZ = true;
- else if (ST->hasReadVCCZBug()) {
+ // Generate an s_waitcnt instruction to be placed before Inst, if needed.
+ Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
+ OldWaitcntInstr = nullptr;
+
+ // Restore vccz if it's not known to be correct already.
+ bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
+
+ // Don't examine operands unless we need to track vccz correctness.
+ if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
+ if (Inst.definesRegister(AMDGPU::VCC_LO) ||
+ Inst.definesRegister(AMDGPU::VCC_HI)) {
+ // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
+ if (!ST->partialVCCWritesUpdateVCCZ())
+ VCCZCorrect = false;
+ } else if (Inst.definesRegister(AMDGPU::VCC)) {
// There is a hardware bug on CI/SI where SMRD instruction may corrupt
// vccz bit, so when we detect that an instruction may read from a
// corrupt vccz bit, we need to:
@@ -1419,10 +1457,16 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// operations to complete.
// 2. Restore the correct value of vccz by writing the current value
// of vcc back to vcc.
- if (ScoreBrackets.getScoreLB(LGKM_CNT) <
- ScoreBrackets.getScoreUB(LGKM_CNT) &&
+ if (ST->hasReadVCCZBug() &&
+ ScoreBrackets.getScoreLB(LGKM_CNT) <
+ ScoreBrackets.getScoreUB(LGKM_CNT) &&
ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
- RestoreVCCZ = true;
+ // Writes to vcc while there's an outstanding smem read may get
+ // clobbered as soon as any read completes.
+ VCCZCorrect = false;
+ } else {
+ // Writes to vcc will fix any incorrect value in vccz.
+ VCCZCorrect = true;
}
}
}
@@ -1432,23 +1476,12 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
const Value *Ptr = Memop->getValue();
SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
}
- }
-
- if (!ST->partialVCCWritesUpdateVCCZ()) {
- // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
- // Writes to vcc will fix it.
- if (Inst.definesRegister(AMDGPU::VCC_LO) ||
- Inst.definesRegister(AMDGPU::VCC_HI))
+ if (ST->hasReadVCCZBug()) {
+ // This smem read could complete and clobber vccz at any time.
VCCZCorrect = false;
- else if (Inst.definesRegister(AMDGPU::VCC))
- VCCZCorrect = true;
+ }
}
- // Generate an s_waitcnt instruction to be placed before
- // cur_Inst, if needed.
- Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
- OldWaitcntInstr = nullptr;
-
updateEventWaitcntAfter(Inst, &ScoreBrackets);
#if 0 // TODO: implement resource type check controlled by options with ub = LB.
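The rewritten loop keeps a single VCCZCorrect flag instead of recomputing a restore decision at each vccz read. Roughly: entering a block distrusts vccz on affected subtargets, partial writes to vcc_lo/vcc_hi invalidate it where partial writes do not update vccz, a full write to vcc revalidates it unless the SMEM read bug can still clobber it, and an smem load on buggy subtargets invalidates it again. The sketch below models only that bookkeeping with plain booleans; the member names mirror the subtarget hooks but merely stand in for them:

// Standalone model of the vccz bookkeeping above, not the LLVM pass itself.
struct VCCZTracker {
  bool HasReadVCCZBug;          // SI/CI: an smem load can corrupt vccz
  bool PartialWritesUpdateVCCZ; // whether vcc_lo/vcc_hi writes update vccz
  bool VCCZCorrect = true;

  void enterBlock() {
    // At a block boundary a predecessor may already have invalidated vccz.
    if (HasReadVCCZBug || !PartialWritesUpdateVCCZ)
      VCCZCorrect = false;
  }

  void onPartialVCCWrite() { // instruction defines vcc_lo or vcc_hi only
    if (!PartialWritesUpdateVCCZ)
      VCCZCorrect = false;
  }

  void onFullVCCWrite(bool SMEMLoadOutstanding) {
    // A full vcc write normally fixes vccz, unless the read-vccz bug lets an
    // outstanding smem load clobber it again as soon as that load completes.
    VCCZCorrect = !(HasReadVCCZBug && SMEMLoadOutstanding);
  }

  void onSMEMLoad() {
    if (HasReadVCCZBug)
      VCCZCorrect = false;
  }

  // A vccz reader needs the fix-up (s_waitcnt lgkmcnt(0) plus a vcc-to-vcc
  // copy) only when vccz is not known to be correct.
  bool needsRestoreBeforeVCCZRead() const { return !VCCZCorrect; }
};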
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 428c21c896d5..7ce042b67aba 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -33,6 +33,7 @@ class InstSI <dag outs, dag ins, string asm = "",
field bit VINTRP = 0;
field bit SDWA = 0;
field bit DPP = 0;
+ field bit TRANS = 0;
// Memory instruction formats.
field bit MUBUF = 0;
@@ -110,9 +111,9 @@ class InstSI <dag outs, dag ins, string asm = "",
// This bit indicates that this is a D16 buffer instruction.
field bit D16Buf = 0;
- // This field indicates that FLAT instruction accesses FLAT_GLBL or
- // FLAT_SCRATCH segment. Must be 0 for non-FLAT instructions.
- field bit IsNonFlatSeg = 0;
+ // This field indicates that FLAT instruction accesses FLAT_GLBL segment.
+ // Must be 0 for non-FLAT instructions.
+ field bit IsFlatGlobal = 0;
// Reads the mode register, usually for FP environment.
field bit ReadsModeReg = 0;
@@ -130,6 +131,10 @@ class InstSI <dag outs, dag ins, string asm = "",
// This bit indicates that this is one of DOT instructions.
field bit IsDOT = 0;
+ // This field indicates that FLAT instruction accesses FLAT_SCRATCH segment.
+ // Must be 0 for non-FLAT instructions.
+ field bit IsFlatScratch = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;
let TSFlags{1} = VALU;
@@ -149,17 +154,18 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{13} = VINTRP;
let TSFlags{14} = SDWA;
let TSFlags{15} = DPP;
+ let TSFlags{16} = TRANS;
- let TSFlags{16} = MUBUF;
- let TSFlags{17} = MTBUF;
- let TSFlags{18} = SMRD;
- let TSFlags{19} = MIMG;
- let TSFlags{20} = EXP;
- let TSFlags{21} = FLAT;
- let TSFlags{22} = DS;
+ let TSFlags{17} = MUBUF;
+ let TSFlags{18} = MTBUF;
+ let TSFlags{19} = SMRD;
+ let TSFlags{20} = MIMG;
+ let TSFlags{21} = EXP;
+ let TSFlags{22} = FLAT;
+ let TSFlags{23} = DS;
- let TSFlags{23} = VGPRSpill;
- let TSFlags{24} = SGPRSpill;
+ let TSFlags{24} = VGPRSpill;
+ let TSFlags{25} = SGPRSpill;
let TSFlags{32} = VM_CNT;
let TSFlags{33} = EXP_CNT;
@@ -187,7 +193,7 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{50} = D16Buf;
- let TSFlags{51} = IsNonFlatSeg;
+ let TSFlags{51} = IsFlatGlobal;
let TSFlags{52} = FPDPRounding;
@@ -197,17 +203,14 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{55} = IsDOT;
- let SchedRW = [Write32Bit];
+ let TSFlags{56} = IsFlatScratch;
- field bits<1> DisableSIDecoder = 0;
- field bits<1> DisableVIDecoder = 0;
- field bits<1> DisableDecoder = 0;
+ let SchedRW = [Write32Bit];
- let isAsmParserOnly = !if(!eq(DisableDecoder{0}, {0}), 0, 1);
let AsmVariantName = AMDGPUAsmVariants.Default;
// Avoid changing source registers in a way that violates constant bus read limitations.
- let hasExtraSrcRegAllocReq = !if(VOP1,1,!if(VOP2,1,!if(VOP3,1,!if(VOPC,1,!if(SDWA,1, !if(VALU,1,0))))));
+ let hasExtraSrcRegAllocReq = !or(VOP1, VOP2, VOP3, VOPC, SDWA, VALU);
}
class PseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = "">
@@ -362,15 +365,4 @@ class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> :
let VALU = 1;
}
-class EXPCommon<dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI<outs, ins, asm, pattern> {
- let EXP = 1;
- let EXP_CNT = 1;
- let mayLoad = 0; // Set to 1 if done bit is set.
- let mayStore = 1;
- let UseNamedOperandTable = 1;
- let Uses = [EXEC];
- let SchedRW = [WriteExport];
-}
-
} // End Uses = [EXEC]
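Splicing the new TRANS format bit in at position 16 shifts every format bit from MUBUF through SGPRSpill up by one, while the counter bits that start at position 32 stay put; IsNonFlatSeg at bit 51 is renamed to IsFlatGlobal and the new IsFlatScratch lands at bit 56. A small C++ mirror of the new layout makes the renumbering easy to check against the `let TSFlags{N}` lines above (illustrative only; the authoritative definitions live in SIDefines.h):

#include <cstdint>

// Illustrative mirror of the TSFlags layout after inserting TRANS.
enum SIFormatBits : uint64_t {
  // bits 0-15 (SALU .. DPP) are unchanged
  TRANS         = uint64_t(1) << 16, // new
  MUBUF         = uint64_t(1) << 17, // was 16
  MTBUF         = uint64_t(1) << 18, // was 17
  SMRD          = uint64_t(1) << 19, // was 18
  MIMG          = uint64_t(1) << 20, // was 19
  EXP           = uint64_t(1) << 21, // was 20
  FLAT          = uint64_t(1) << 22, // was 21
  DS            = uint64_t(1) << 23, // was 22
  VGPRSpill     = uint64_t(1) << 24, // was 23
  SGPRSpill     = uint64_t(1) << 25, // was 24
  VM_CNT        = uint64_t(1) << 32, // counters from bit 32 up are unchanged
  EXP_CNT       = uint64_t(1) << 33,
  IsFlatGlobal  = uint64_t(1) << 51, // renamed from IsNonFlatSeg
  IsFlatScratch = uint64_t(1) << 56, // new
};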
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9af8ffedce0f..dfd0075bf03a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -13,53 +13,20 @@
#include "SIInstrInfo.h"
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
+#include "AMDGPUInstrInfo.h"
#include "GCNHazardRecognizer.h"
-#include "SIDefines.h"
-#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/MemoryLocation.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineInstrBundle.h"
-#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
-#include "llvm/CodeGen/SelectionDAGNodes.h"
-#include "llvm/CodeGen/TargetOpcodes.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/Support/Casting.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MachineValueType.h"
-#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <utility>
using namespace llvm;
@@ -69,6 +36,9 @@ using namespace llvm;
#include "AMDGPUGenInstrInfo.inc"
namespace llvm {
+
+class AAResults;
+
namespace AMDGPU {
#define GET_D16ImageDimIntrinsics_IMPL
#define GET_ImageDimIntrinsicTable_IMPL
@@ -136,7 +106,7 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
}
bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AliasAnalysis *AA) const {
+ AAResults *AA) const {
// TODO: The generic check fails for VALU instructions that should be
// rematerializable due to implicit reads of exec. We really want all of the
// generic logic for this except for this.
@@ -144,8 +114,8 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::V_MOV_B32_e64:
case AMDGPU::V_MOV_B64_PSEUDO:
- case AMDGPU::V_ACCVGPR_READ_B32:
- case AMDGPU::V_ACCVGPR_WRITE_B32:
+ case AMDGPU::V_ACCVGPR_READ_B32_e64:
+ case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
// No implicit operands.
return MI.getNumOperands() == MI.getDesc().getNumOperands();
default:
@@ -418,7 +388,7 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
}
if (isFLAT(LdSt)) {
- // Instructions have either vaddr or saddr or both.
+ // Instructions have either vaddr or saddr or both or none.
BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
if (BaseOp)
BaseOps.push_back(BaseOp);
@@ -459,10 +429,8 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
auto Base2 = MO2->getValue();
if (!Base1 || !Base2)
return false;
- const MachineFunction &MF = *MI1.getParent()->getParent();
- const DataLayout &DL = MF.getFunction().getParent()->getDataLayout();
- Base1 = GetUnderlyingObject(Base1, DL);
- Base2 = GetUnderlyingObject(Base2, DL);
+ Base1 = getUnderlyingObject(Base1);
+ Base2 = getUnderlyingObject(Base2);
if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
return false;
@@ -474,27 +442,33 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
ArrayRef<const MachineOperand *> BaseOps2,
unsigned NumLoads,
unsigned NumBytes) const {
- // If current mem ops pair do not have same base pointer, then they cannot be
- // clustered.
- assert(!BaseOps1.empty() && !BaseOps2.empty());
- const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
- const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
- if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
+ // If the mem ops (to be clustered) do not have the same base ptr, then they
+ // should not be clustered
+ if (!BaseOps1.empty() && !BaseOps2.empty()) {
+ const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
+ const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
+ if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
+ return false;
+ } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
+ // If only one base op is empty, they do not have the same base ptr
return false;
-
- // Compute max cluster size based on average number bytes clustered till now,
- // and decide based on it, if current mem ops pair can be clustered or not.
- assert((NumLoads > 0) && (NumBytes > 0) && (NumBytes >= NumLoads) &&
- "Invalid NumLoads/NumBytes values");
- unsigned MaxNumLoads;
- if (NumBytes <= 4 * NumLoads) {
- // Loads are dword or smaller (on average).
- MaxNumLoads = 5;
- } else {
- // Loads are bigger than a dword (on average).
- MaxNumLoads = 4;
}
- return NumLoads <= MaxNumLoads;
+
+ // In order to avoid register pressure, on average, the number of DWORDS
+ // loaded together by all clustered mem ops should not exceed 8. This is an
+ // empirical value based on certain observations and performance related
+ // experiments.
+ // The good thing about this heuristic is that it avoids clustering of too many
+ // sub-word loads, and also avoids clustering of wide loads. Below is the
+ // brief summary of how the heuristic behaves for various `LoadSize`.
+ // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
+ // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
+ // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
+ // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
+ // (5) LoadSize >= 17: do not cluster
+ const unsigned LoadSize = NumBytes / NumLoads;
+ const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads;
+ return NumDWORDs <= 8;
}
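The new shouldClusterMemOps heuristic rounds the average load size up to whole dwords and caps the cluster at 8 dwords total, which reproduces the table in the comment: eight dword loads are accepted, four 16-byte loads are not, and 9-16 byte loads cluster at most in pairs. A standalone sketch of just that arithmetic, with the same boundary behaviour:

#include <cassert>

// Standalone version of the dword-budget heuristic above: NumBytes is the
// running byte total for the prospective cluster, NumLoads the number of ops.
bool shouldCluster(unsigned NumLoads, unsigned NumBytes) {
  assert(NumLoads > 0 && NumBytes >= NumLoads && "invalid cluster stats");
  const unsigned LoadSize = NumBytes / NumLoads;              // average bytes per op
  const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads; // rounded-up dwords
  return NumDWORDs <= 8;
}

int main() {
  assert(shouldCluster(8, 8 * 4));   // eight dword loads: exactly 8 dwords, allowed
  assert(!shouldCluster(4, 4 * 16)); // four 16-byte loads: 16 dwords, rejected
  assert(shouldCluster(2, 2 * 12));  // two 12-byte loads: 6 dwords, allowed
  return 0;
}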
// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
@@ -533,6 +507,157 @@ static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
.addReg(SrcReg, getKillRegState(KillSrc));
}
+/// Handle copying from SGPR to AGPR, or from AGPR to AGPR. It is not possible
+/// to directly copy, so an intermediate VGPR needs to be used.
+static void indirectCopyToAGPR(const SIInstrInfo &TII,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, MCRegister DestReg,
+ MCRegister SrcReg, bool KillSrc,
+ RegScavenger &RS,
+ Register ImpDefSuperReg = Register(),
+ Register ImpUseSuperReg = Register()) {
+ const SIRegisterInfo &RI = TII.getRegisterInfo();
+
+ assert(AMDGPU::SReg_32RegClass.contains(SrcReg) ||
+ AMDGPU::AGPR_32RegClass.contains(SrcReg));
+
+ // First try to find defining accvgpr_write to avoid temporary registers.
+ for (auto Def = MI, E = MBB.begin(); Def != E; ) {
+ --Def;
+ if (!Def->definesRegister(SrcReg, &RI))
+ continue;
+ if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
+ break;
+
+ MachineOperand &DefOp = Def->getOperand(1);
+ assert(DefOp.isReg() || DefOp.isImm());
+
+ if (DefOp.isReg()) {
+ // Check that register source operand if not clobbered before MI.
+ // Immediate operands are always safe to propagate.
+ bool SafeToPropagate = true;
+ for (auto I = Def; I != MI && SafeToPropagate; ++I)
+ if (I->modifiesRegister(DefOp.getReg(), &RI))
+ SafeToPropagate = false;
+
+ if (!SafeToPropagate)
+ break;
+
+ DefOp.setIsKill(false);
+ }
+
+ MachineInstrBuilder Builder =
+ BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
+ .add(DefOp);
+ if (ImpDefSuperReg)
+ Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
+
+ if (ImpUseSuperReg) {
+ Builder.addReg(ImpUseSuperReg,
+ getKillRegState(KillSrc) | RegState::Implicit);
+ }
+
+ return;
+ }
+
+ RS.enterBasicBlock(MBB);
+ RS.forward(MI);
+
+ // Ideally we want to have three registers for a long reg_sequence copy
+ // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
+ unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
+ *MBB.getParent());
+
+ // Registers in the sequence are allocated contiguously so we can just
+ // use register number to pick one of three round-robin temps.
+ unsigned RegNo = DestReg % 3;
+ Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
+ if (!Tmp)
+ report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
+ RS.setRegUsed(Tmp);
+ // Only loop through if there are any free registers left, otherwise
+ // scavenger may report a fatal error without emergency spill slot
+ // or spill with the slot.
+ while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
+ Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
+ if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
+ break;
+ Tmp = Tmp2;
+ RS.setRegUsed(Tmp);
+ }
+
+ // Insert copy to temporary VGPR.
+ unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
+ if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
+ TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
+ } else {
+ assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
+ }
+
+ MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ if (ImpUseSuperReg) {
+ UseBuilder.addReg(ImpUseSuperReg,
+ getKillRegState(KillSrc) | RegState::Implicit);
+ }
+
+ MachineInstrBuilder DefBuilder
+ = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
+ .addReg(Tmp, RegState::Kill);
+
+ if (ImpDefSuperReg)
+ DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
+}
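Because there is no direct SGPR-to-AGPR or AGPR-to-AGPR move, the helper above either reuses a defining v_accvgpr_write or bounces the value through a scavenged VGPR, and for long reg_sequence copies it rotates across up to three temporaries (picked by DestReg % 3) to hide the two wait states between v_mov_b32 and v_accvgpr_write. A loose model of just that round-robin choice, with the scavenger replaced by whatever pool of VGPRs it happened to find (pickTempVGPR is an illustrative name, not LLVM API):

#include <cassert>
#include <vector>

// Registers in a reg_sequence are contiguous, so DestRegNo % 3 spreads
// consecutive lane copies over up to three temporary VGPRs.
unsigned pickTempVGPR(unsigned DestRegNo, const std::vector<unsigned> &Pool) {
  assert(!Pool.empty() && "the scavenger must have found at least one VGPR");
  unsigned Wanted = DestRegNo % 3;
  // Fall back to the last register found when fewer than three are free.
  return Pool[Wanted < Pool.size() ? Wanted : Pool.size() - 1];
}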
+
+static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, const DebugLoc &DL,
+ MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
+ const TargetRegisterClass *RC, bool Forward) {
+ const SIRegisterInfo &RI = TII.getRegisterInfo();
+ ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
+ MachineBasicBlock::iterator I = MI;
+ MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
+
+ for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
+ int16_t SubIdx = BaseIndices[Idx];
+ Register Reg = RI.getSubReg(DestReg, SubIdx);
+ unsigned Opcode = AMDGPU::S_MOV_B32;
+
+ // Is SGPR aligned? If so try to combine with next.
+ Register Src = RI.getSubReg(SrcReg, SubIdx);
+ bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2) == 0;
+ bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2) == 0;
+ if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
+ // Can use SGPR64 copy
+ unsigned Channel = RI.getChannelFromSubReg(SubIdx);
+ SubIdx = RI.getSubRegFromChannel(Channel, 2);
+ Opcode = AMDGPU::S_MOV_B64;
+ Idx++;
+ }
+
+ LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx))
+ .addReg(RI.getSubReg(SrcReg, SubIdx))
+ .addReg(SrcReg, RegState::Implicit);
+
+ if (!FirstMI)
+ FirstMI = LastMI;
+
+ if (!Forward)
+ I--;
+ }
+
+ assert(FirstMI && LastMI);
+ if (!Forward)
+ std::swap(FirstMI, LastMI);
+
+ FirstMI->addOperand(
+ MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
+
+ if (KillSrc)
+ LastMI->addRegisterKilled(SrcReg, &RI);
+}
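expandSGPRCopy walks the 32-bit lanes of the copy and, whenever both the source and destination lanes are even-aligned and another lane follows, merges two lanes into a single s_mov_b64. The pairing rule is easy to isolate; the sketch below works on plain lane indices and returns the (starting lane, width) moves that would be emitted (planSGPRCopy is a made-up name, not LLVM API):

#include <utility>
#include <vector>

// Model of the lane pairing in expandSGPRCopy: lane i of the destination is
// copied from lane i of the source, and an even-aligned pair becomes one
// 64-bit move. DstBase/SrcBase are the first SGPR numbers of the two tuples.
std::vector<std::pair<unsigned, unsigned>> // (starting lane, lanes moved)
planSGPRCopy(unsigned NumLanes, unsigned DstBase, unsigned SrcBase) {
  std::vector<std::pair<unsigned, unsigned>> Moves;
  for (unsigned Idx = 0; Idx < NumLanes; ++Idx) {
    unsigned Width = 1; // s_mov_b32 by default
    bool AlignedDst = (DstBase + Idx) % 2 == 0;
    bool AlignedSrc = (SrcBase + Idx) % 2 == 0;
    if (AlignedDst && AlignedSrc && Idx + 1 < NumLanes)
      Width = 2; // both halves aligned and another lane follows: s_mov_b64
    Moves.emplace_back(Idx, Width);
    Idx += Width - 1; // skip the lane consumed by a 64-bit move
  }
  return Moves;
}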
+
void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const DebugLoc &DL, MCRegister DestReg,
@@ -563,7 +688,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
AMDGPU::SReg_32RegClass.contains(SrcReg) ||
AMDGPU::AGPR_32RegClass.contains(SrcReg));
unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
- AMDGPU::V_ACCVGPR_READ_B32 : AMDGPU::V_MOV_B32_e32;
+ AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
BuildMI(MBB, MI, DL, get(Opc), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
@@ -639,88 +764,36 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (DestReg == AMDGPU::SCC) {
// Copying 64-bit or 32-bit sources to SCC barely makes sense,
// but SelectionDAG emits such copies for i1 sources.
- // TODO: Use S_BITCMP0_B32 instead and only consider the 0th bit.
if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
- SrcReg = RI.getSubReg(SrcReg, AMDGPU::sub0);
+ // This copy can only be produced by patterns
+ // with explicit SCC, which are known to be enabled
+ // only for subtargets with S_CMP_LG_U64 present.
+ assert(ST.hasScalarCompareEq64());
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(0);
+ } else {
+ assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(0);
}
- assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
-
- BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
- .addReg(SrcReg, getKillRegState(KillSrc))
- .addImm(0);
return;
}
- if (RC == &AMDGPU::AGPR_32RegClass) {
- assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
- AMDGPU::SReg_32RegClass.contains(SrcReg) ||
- AMDGPU::AGPR_32RegClass.contains(SrcReg));
- if (!AMDGPU::VGPR_32RegClass.contains(SrcReg)) {
- // First try to find defining accvgpr_write to avoid temporary registers.
- for (auto Def = MI, E = MBB.begin(); Def != E; ) {
- --Def;
- if (!Def->definesRegister(SrcReg, &RI))
- continue;
- if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
- break;
-
- MachineOperand &DefOp = Def->getOperand(1);
- assert(DefOp.isReg() || DefOp.isImm());
-
- if (DefOp.isReg()) {
- // Check that register source operand if not clobbered before MI.
- // Immediate operands are always safe to propagate.
- bool SafeToPropagate = true;
- for (auto I = Def; I != MI && SafeToPropagate; ++I)
- if (I->modifiesRegister(DefOp.getReg(), &RI))
- SafeToPropagate = false;
-
- if (!SafeToPropagate)
- break;
-
- DefOp.setIsKill(false);
- }
- BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
- .add(DefOp);
- return;
- }
-
- RegScavenger RS;
- RS.enterBasicBlock(MBB);
- RS.forward(MI);
-
- // Ideally we want to have three registers for a long reg_sequence copy
- // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
- unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
- *MBB.getParent());
-
- // Registers in the sequence are allocated contiguously so we can just
- // use register number to pick one of three round-robin temps.
- unsigned RegNo = DestReg % 3;
- Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
- if (!Tmp)
- report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
- RS.setRegUsed(Tmp);
- // Only loop through if there are any free registers left, otherwise
- // scavenger may report a fatal error without emergency spill slot
- // or spill with the slot.
- while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
- unsigned Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
- if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
- break;
- Tmp = Tmp2;
- RS.setRegUsed(Tmp);
- }
- copyPhysReg(MBB, MI, DL, Tmp, SrcReg, KillSrc);
- BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
- .addReg(Tmp, RegState::Kill);
+ if (RC == &AMDGPU::AGPR_32RegClass) {
+ if (AMDGPU::VGPR_32RegClass.contains(SrcReg)) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
return;
}
- BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ // FIXME: Pass should maintain scavenger to avoid scanning through the block on
+ // every AGPR spill.
+ RegScavenger RS;
+ indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS);
return;
}
@@ -790,31 +863,38 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- unsigned EltSize = 4;
- unsigned Opcode = AMDGPU::V_MOV_B32_e32;
+ const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
if (RI.isSGPRClass(RC)) {
- // TODO: Copy vec3/vec5 with s_mov_b64s then final s_mov_b32.
- if (!(RI.getRegSizeInBits(*RC) % 64)) {
- Opcode = AMDGPU::S_MOV_B64;
- EltSize = 8;
- } else {
- Opcode = AMDGPU::S_MOV_B32;
- EltSize = 4;
- }
-
if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
- } else if (RI.hasAGPRs(RC)) {
+ expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RC, Forward);
+ return;
+ }
+
+ unsigned Opcode = AMDGPU::V_MOV_B32_e32;
+ if (RI.hasAGPRs(RC)) {
Opcode = RI.hasVGPRs(RI.getPhysRegClass(SrcReg)) ?
- AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
+ AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
} else if (RI.hasVGPRs(RC) && RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) {
- Opcode = AMDGPU::V_ACCVGPR_READ_B32;
+ Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
}
- ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
- bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
+ // For the cases where we need an intermediate instruction/temporary register
+ // (destination is an AGPR), we need a scavenger.
+ //
+ // FIXME: The pass should maintain this for us so we don't have to re-scan the
+ // whole block for every handled copy.
+ std::unique_ptr<RegScavenger> RS;
+ if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
+ RS.reset(new RegScavenger());
+
+ ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, 4);
+
+ // If there is an overlap, we can't kill the super-register on the last
+ // instruction, since it will also kill the components made live by this def.
+ const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
unsigned SubIdx;
@@ -823,22 +903,23 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
else
SubIdx = SubIndices[SubIndices.size() - Idx - 1];
- if (Opcode == TargetOpcode::COPY) {
- copyPhysReg(MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
- RI.getSubReg(SrcReg, SubIdx), KillSrc);
- continue;
- }
+ bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
- MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
- get(Opcode), RI.getSubReg(DestReg, SubIdx));
-
- Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
-
- if (Idx == 0)
- Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
+ if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
+ Register ImpDefSuper = Idx == 0 ? Register(DestReg) : Register();
+ Register ImpUseSuper = SrcReg;
+ indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
+ RI.getSubReg(SrcReg, SubIdx), UseKill, *RS,
+ ImpDefSuper, ImpUseSuper);
+ } else {
+ MachineInstrBuilder Builder =
+ BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx))
+ .addReg(RI.getSubReg(SrcReg, SubIdx));
+ if (Idx == 0)
+ Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
- bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
- Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
+ Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
+ }
}
}
@@ -928,8 +1009,6 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
Register TrueReg,
Register FalseReg) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- MachineFunction *MF = MBB.getParent();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const TargetRegisterClass *BoolXExecRC =
RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
@@ -1089,78 +1168,123 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
return AMDGPU::COPY;
}
-static unsigned getIndirectVGPRWritePseudoOpc(unsigned VecSize) {
+const MCInstrDesc &
+SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
+ bool IsIndirectSrc) const {
+ if (IsIndirectSrc) {
+ if (VecSize <= 32) // 4 bytes
+ return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
+ if (VecSize <= 64) // 8 bytes
+ return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
+ if (VecSize <= 96) // 12 bytes
+ return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
+ if (VecSize <= 128) // 16 bytes
+ return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
+ if (VecSize <= 160) // 20 bytes
+ return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
+ if (VecSize <= 256) // 32 bytes
+ return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
+ if (VecSize <= 512) // 64 bytes
+ return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
+ if (VecSize <= 1024) // 128 bytes
+ return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
+
+ llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
+ }
+
+ if (VecSize <= 32) // 4 bytes
+ return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
+ if (VecSize <= 64) // 8 bytes
+ return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
+ if (VecSize <= 96) // 12 bytes
+ return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
+ if (VecSize <= 128) // 16 bytes
+ return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
+ if (VecSize <= 160) // 20 bytes
+ return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
+ if (VecSize <= 256) // 32 bytes
+ return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
+ if (VecSize <= 512) // 64 bytes
+ return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
+ if (VecSize <= 1024) // 128 bytes
+ return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
+
+ llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
+}
+
+static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
if (VecSize <= 32) // 4 bytes
- return AMDGPU::V_INDIRECT_REG_WRITE_B32_V1;
+ return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
if (VecSize <= 64) // 8 bytes
- return AMDGPU::V_INDIRECT_REG_WRITE_B32_V2;
+ return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
if (VecSize <= 96) // 12 bytes
- return AMDGPU::V_INDIRECT_REG_WRITE_B32_V3;
+ return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
if (VecSize <= 128) // 16 bytes
- return AMDGPU::V_INDIRECT_REG_WRITE_B32_V4;
+ return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
if (VecSize <= 160) // 20 bytes
- return AMDGPU::V_INDIRECT_REG_WRITE_B32_V5;
+ return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
if (VecSize <= 256) // 32 bytes
- return AMDGPU::V_INDIRECT_REG_WRITE_B32_V8;
+ return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
if (VecSize <= 512) // 64 bytes
- return AMDGPU::V_INDIRECT_REG_WRITE_B32_V16;
+ return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
if (VecSize <= 1024) // 128 bytes
- return AMDGPU::V_INDIRECT_REG_WRITE_B32_V32;
+ return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
}
-static unsigned getIndirectSGPRWritePseudo32(unsigned VecSize) {
+static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
if (VecSize <= 32) // 4 bytes
- return AMDGPU::S_INDIRECT_REG_WRITE_B32_V1;
+ return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
if (VecSize <= 64) // 8 bytes
- return AMDGPU::S_INDIRECT_REG_WRITE_B32_V2;
+ return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
if (VecSize <= 96) // 12 bytes
- return AMDGPU::S_INDIRECT_REG_WRITE_B32_V3;
+ return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
if (VecSize <= 128) // 16 bytes
- return AMDGPU::S_INDIRECT_REG_WRITE_B32_V4;
+ return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
if (VecSize <= 160) // 20 bytes
- return AMDGPU::S_INDIRECT_REG_WRITE_B32_V5;
+ return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
if (VecSize <= 256) // 32 bytes
- return AMDGPU::S_INDIRECT_REG_WRITE_B32_V8;
+ return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
if (VecSize <= 512) // 64 bytes
- return AMDGPU::S_INDIRECT_REG_WRITE_B32_V16;
+ return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
if (VecSize <= 1024) // 128 bytes
- return AMDGPU::S_INDIRECT_REG_WRITE_B32_V32;
+ return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
}
-static unsigned getIndirectSGPRWritePseudo64(unsigned VecSize) {
+static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
if (VecSize <= 64) // 8 bytes
- return AMDGPU::S_INDIRECT_REG_WRITE_B64_V1;
+ return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
if (VecSize <= 128) // 16 bytes
- return AMDGPU::S_INDIRECT_REG_WRITE_B64_V2;
+ return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
if (VecSize <= 256) // 32 bytes
- return AMDGPU::S_INDIRECT_REG_WRITE_B64_V4;
+ return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
if (VecSize <= 512) // 64 bytes
- return AMDGPU::S_INDIRECT_REG_WRITE_B64_V8;
+ return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
if (VecSize <= 1024) // 128 bytes
- return AMDGPU::S_INDIRECT_REG_WRITE_B64_V16;
+ return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
}
-const MCInstrDesc &SIInstrInfo::getIndirectRegWritePseudo(
- unsigned VecSize, unsigned EltSize, bool IsSGPR) const {
+const MCInstrDesc &
+SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
+ bool IsSGPR) const {
if (IsSGPR) {
switch (EltSize) {
case 32:
- return get(getIndirectSGPRWritePseudo32(VecSize));
+ return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
case 64:
- return get(getIndirectSGPRWritePseudo64(VecSize));
+ return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
default:
llvm_unreachable("invalid reg indexing elt size");
}
}
assert(EltSize == 32 && "invalid reg indexing elt size");
- return get(getIndirectVGPRWritePseudoOpc(VecSize));
+ return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
}
static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
@@ -1219,8 +1343,16 @@ static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_A32_SAVE;
case 8:
return AMDGPU::SI_SPILL_A64_SAVE;
+ case 12:
+ return AMDGPU::SI_SPILL_A96_SAVE;
case 16:
return AMDGPU::SI_SPILL_A128_SAVE;
+ case 20:
+ return AMDGPU::SI_SPILL_A160_SAVE;
+ case 24:
+ return AMDGPU::SI_SPILL_A192_SAVE;
+ case 32:
+ return AMDGPU::SI_SPILL_A256_SAVE;
case 64:
return AMDGPU::SI_SPILL_A512_SAVE;
case 128:
@@ -1260,7 +1392,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
// The SGPR spill/restore instructions only work on numbered SGPRs, so we need
// to make sure we are using the correct register class.
- if (Register::isVirtualRegister(SrcReg) && SpillSize == 4) {
+ if (SrcReg.isVirtual() && SpillSize == 4) {
MachineRegisterInfo &MRI = MF->getRegInfo();
MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
}
@@ -1269,11 +1401,8 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addReg(SrcReg, getKillRegState(isKill)) // data
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
- .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
.addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
- // Add the scratch resource registers as implicit uses because we may end up
- // needing them, and need to ensure that the reserved registers are
- // correctly handled.
+
if (RI.spillSGPRToVGPR())
FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
return;
@@ -1283,18 +1412,12 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
: getVGPRSpillSaveOpcode(SpillSize);
MFI->setHasSpilledVGPRs();
- auto MIB = BuildMI(MBB, MI, DL, get(Opcode));
- if (RI.hasAGPRs(RC)) {
- MachineRegisterInfo &MRI = MF->getRegInfo();
- Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- MIB.addReg(Tmp, RegState::Define);
- }
- MIB.addReg(SrcReg, getKillRegState(isKill)) // data
- .addFrameIndex(FrameIndex) // addr
- .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
- .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
- .addImm(0) // offset
- .addMemOperand(MMO);
+ BuildMI(MBB, MI, DL, get(Opcode))
+ .addReg(SrcReg, getKillRegState(isKill)) // data
+ .addFrameIndex(FrameIndex) // addr
+ .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
+ .addImm(0) // offset
+ .addMemOperand(MMO);
}
static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
@@ -1353,8 +1476,16 @@ static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_A32_RESTORE;
case 8:
return AMDGPU::SI_SPILL_A64_RESTORE;
+ case 12:
+ return AMDGPU::SI_SPILL_A96_RESTORE;
case 16:
return AMDGPU::SI_SPILL_A128_RESTORE;
+ case 20:
+ return AMDGPU::SI_SPILL_A160_RESTORE;
+ case 24:
+ return AMDGPU::SI_SPILL_A192_RESTORE;
+ case 32:
+ return AMDGPU::SI_SPILL_A256_RESTORE;
case 64:
return AMDGPU::SI_SPILL_A512_RESTORE;
case 128:
@@ -1401,143 +1532,36 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
BuildMI(MBB, MI, DL, OpDesc, DestReg)
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
- .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
.addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
+
return;
}
unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
: getVGPRSpillRestoreOpcode(SpillSize);
- auto MIB = BuildMI(MBB, MI, DL, get(Opcode), DestReg);
- if (RI.hasAGPRs(RC)) {
- MachineRegisterInfo &MRI = MF->getRegInfo();
- Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- MIB.addReg(Tmp, RegState::Define);
- }
- MIB.addFrameIndex(FrameIndex) // vaddr
- .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
- .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
- .addImm(0) // offset
- .addMemOperand(MMO);
+ BuildMI(MBB, MI, DL, get(Opcode), DestReg)
+ .addFrameIndex(FrameIndex) // vaddr
+ .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
+ .addImm(0) // offset
+ .addMemOperand(MMO);
}
-/// \param @Offset Offset in bytes of the FrameIndex being spilled
-unsigned SIInstrInfo::calculateLDSSpillAddress(
- MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
- unsigned FrameOffset, unsigned Size) const {
- MachineFunction *MF = MBB.getParent();
- SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- const DebugLoc &DL = MBB.findDebugLoc(MI);
- unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
- unsigned WavefrontSize = ST.getWavefrontSize();
-
- Register TIDReg = MFI->getTIDReg();
- if (!MFI->hasCalculatedTID()) {
- MachineBasicBlock &Entry = MBB.getParent()->front();
- MachineBasicBlock::iterator Insert = Entry.front();
- const DebugLoc &DL = Insert->getDebugLoc();
-
- TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
- *MF);
- if (TIDReg == AMDGPU::NoRegister)
- return TIDReg;
-
- if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) &&
- WorkGroupSize > WavefrontSize) {
- Register TIDIGXReg =
- MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
- Register TIDIGYReg =
- MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
- Register TIDIGZReg =
- MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
- Register InputPtrReg =
- MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
- for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
- if (!Entry.isLiveIn(Reg))
- Entry.addLiveIn(Reg);
- }
-
- RS->enterBasicBlock(Entry);
- // FIXME: Can we scavenge an SReg_64 and access the subregs?
- Register STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
- Register STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
- BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
- .addReg(InputPtrReg)
- .addImm(SI::KernelInputOffsets::NGROUPS_Z);
- BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
- .addReg(InputPtrReg)
- .addImm(SI::KernelInputOffsets::NGROUPS_Y);
-
- // NGROUPS.X * NGROUPS.Y
- BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
- .addReg(STmp1)
- .addReg(STmp0);
- // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
- BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
- .addReg(STmp1)
- .addReg(TIDIGXReg);
- // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)
- BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
- .addReg(STmp0)
- .addReg(TIDIGYReg)
- .addReg(TIDReg);
- // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)) + TIDIG.Z
- getAddNoCarry(Entry, Insert, DL, TIDReg)
- .addReg(TIDReg)
- .addReg(TIDIGZReg)
- .addImm(0); // clamp bit
- } else {
- // Get the wave id
- BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
- TIDReg)
- .addImm(-1)
- .addImm(0);
-
- BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
- TIDReg)
- .addImm(-1)
- .addReg(TIDReg);
- }
-
- BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
- TIDReg)
- .addImm(2)
- .addReg(TIDReg);
- MFI->setTIDReg(TIDReg);
- }
-
- // Add FrameIndex to LDS offset
- unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
- getAddNoCarry(MBB, MI, DL, TmpReg)
- .addImm(LDSOffset)
- .addReg(TIDReg)
- .addImm(0); // clamp bit
-
- return TmpReg;
+void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const {
+ insertNoops(MBB, MI, 1);
}
-void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- int Count) const {
+void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned Quantity) const {
DebugLoc DL = MBB.findDebugLoc(MI);
- while (Count > 0) {
- int Arg;
- if (Count >= 8)
- Arg = 7;
- else
- Arg = Count - 1;
- Count -= 8;
- BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
- .addImm(Arg);
+ while (Quantity > 0) {
+ unsigned Arg = std::min(Quantity, 8u);
+ Quantity -= Arg;
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
}
}
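insertNoops splits the requested count into s_nop instructions, each of which encodes its immediate as count minus one and covers at most 8 wait states. The chunking is small enough to model directly; planNoops below returns the immediates that would be emitted:

#include <algorithm>
#include <cassert>
#include <vector>

// Chunk a requested number of no-op wait states into s_nop immediates.
// Each s_nop provides (immediate + 1) wait states, up to 8 per instruction.
std::vector<unsigned> planNoops(unsigned Quantity) {
  std::vector<unsigned> Imms;
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    Imms.push_back(Arg - 1); // s_nop 7 == 8 wait states, s_nop 0 == 1
  }
  return Imms;
}

int main() {
  assert(planNoops(1) == std::vector<unsigned>({0}));
  assert(planNoops(8) == std::vector<unsigned>({7}));
  assert(planNoops(10) == std::vector<unsigned>({7, 1}));
  return 0;
}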
-void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI) const {
- insertWaitStates(MBB, MI, 1);
-}
-
void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
auto MF = MBB.getParent();
SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
@@ -1593,7 +1617,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
// register allocation.
MI.setDesc(get(AMDGPU::S_XOR_B32));
break;
-
+ case AMDGPU::S_OR_B64_term:
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_OR_B64));
+ break;
case AMDGPU::S_OR_B32_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
@@ -1670,36 +1698,35 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
- case AMDGPU::V_INDIRECT_REG_WRITE_B32_V1:
- case AMDGPU::V_INDIRECT_REG_WRITE_B32_V2:
- case AMDGPU::V_INDIRECT_REG_WRITE_B32_V3:
- case AMDGPU::V_INDIRECT_REG_WRITE_B32_V4:
- case AMDGPU::V_INDIRECT_REG_WRITE_B32_V5:
- case AMDGPU::V_INDIRECT_REG_WRITE_B32_V8:
- case AMDGPU::V_INDIRECT_REG_WRITE_B32_V16:
- case AMDGPU::V_INDIRECT_REG_WRITE_B32_V32:
- case AMDGPU::S_INDIRECT_REG_WRITE_B32_V1:
- case AMDGPU::S_INDIRECT_REG_WRITE_B32_V2:
- case AMDGPU::S_INDIRECT_REG_WRITE_B32_V3:
- case AMDGPU::S_INDIRECT_REG_WRITE_B32_V4:
- case AMDGPU::S_INDIRECT_REG_WRITE_B32_V5:
- case AMDGPU::S_INDIRECT_REG_WRITE_B32_V8:
- case AMDGPU::S_INDIRECT_REG_WRITE_B32_V16:
- case AMDGPU::S_INDIRECT_REG_WRITE_B32_V32:
- case AMDGPU::S_INDIRECT_REG_WRITE_B64_V1:
- case AMDGPU::S_INDIRECT_REG_WRITE_B64_V2:
- case AMDGPU::S_INDIRECT_REG_WRITE_B64_V4:
- case AMDGPU::S_INDIRECT_REG_WRITE_B64_V8:
- case AMDGPU::S_INDIRECT_REG_WRITE_B64_V16: {
+ case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
+ case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
+ case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
+ case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
+ case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
+ case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
+ case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
+ case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
+ case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
+ case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
+ case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
+ case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
+ case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
+ case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
+ case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
+ case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
+ case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
+ case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
+ case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
+ case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
+ case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
unsigned Opc;
if (RI.hasVGPRs(EltRC)) {
- Opc = ST.useVGPRIndexMode() ?
- AMDGPU::V_MOV_B32_indirect : AMDGPU::V_MOVRELD_B32_e32;
+ Opc = AMDGPU::V_MOVRELD_B32_e32;
} else {
- Opc = RI.getRegSizeInBits(*EltRC) == 64 ?
- AMDGPU::S_MOVRELD_B64 : AMDGPU::S_MOVRELD_B32;
+ Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
+ : AMDGPU::S_MOVRELD_B32;
}
const MCInstrDesc &OpDesc = get(Opc);
@@ -1722,6 +1749,78 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
+ case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
+ case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
+ case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
+ case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
+ case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
+ case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
+ case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
+ case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
+ assert(ST.useVGPRIndexMode());
+ Register VecReg = MI.getOperand(0).getReg();
+ bool IsUndef = MI.getOperand(1).isUndef();
+ Register Idx = MI.getOperand(3).getReg();
+ Register SubReg = MI.getOperand(4).getImm();
+
+ MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
+ .addReg(Idx)
+ .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
+ SetOn->getOperand(3).setIsUndef();
+
+ const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect);
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MI, DL, OpDesc)
+ .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
+ .add(MI.getOperand(2))
+ .addReg(VecReg, RegState::ImplicitDefine)
+ .addReg(VecReg,
+ RegState::Implicit | (IsUndef ? RegState::Undef : 0));
+
+ const int ImpDefIdx = OpDesc.getNumOperands() + OpDesc.getNumImplicitUses();
+ const int ImpUseIdx = ImpDefIdx + 1;
+ MIB->tieOperands(ImpDefIdx, ImpUseIdx);
+
+ MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
+
+ finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
+
+ MI.eraseFromParent();
+ break;
+ }
+ case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
+ case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
+ case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
+ case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
+ case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
+ case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
+ case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
+ case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
+ assert(ST.useVGPRIndexMode());
+ Register Dst = MI.getOperand(0).getReg();
+ Register VecReg = MI.getOperand(1).getReg();
+ bool IsUndef = MI.getOperand(1).isUndef();
+ Register Idx = MI.getOperand(2).getReg();
+ Register SubReg = MI.getOperand(3).getImm();
+
+ MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
+ .addReg(Idx)
+ .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
+ SetOn->getOperand(3).setIsUndef();
+
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32))
+ .addDef(Dst)
+ .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
+ .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0))
+ .addReg(AMDGPU::M0, RegState::Implicit);
+
+ MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
+
+ finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
+
+ MI.eraseFromParent();
+ break;
+ }
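Both new GPR_IDX pseudo expansions above bundle three instructions: s_set_gpr_idx_on with the index register and the DST or SRC0 enable, the indexed v_mov_b32, and s_set_gpr_idx_off. A textual sketch of that shape (the operand spellings are placeholders, not the pass's actual output):

#include <string>
#include <vector>

// Shape of the V_INDIRECT_REG_{WRITE,READ}_GPR_IDX_B32 expansion as a list of
// textual instructions; IsRead selects SRC0 indexing, otherwise DST indexing.
std::vector<std::string> expandGPRIdxPseudo(const std::string &Vec,
                                            const std::string &Val,
                                            const std::string &Idx,
                                            bool IsRead) {
  return {
      "s_set_gpr_idx_on " + Idx +
          (IsRead ? ", gpr_idx(SRC0)" : ", gpr_idx(DST)"),
      IsRead ? "v_mov_b32 " + Val + ", " + Vec  // read lane Idx of Vec
             : "v_mov_b32 " + Vec + ", " + Val, // write lane Idx of Vec
      "s_set_gpr_idx_off",
  };
}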
case AMDGPU::SI_PC_ADD_REL_OFFSET: {
MachineFunction &MF = *MBB.getParent();
Register Reg = MI.getOperand(0).getReg();
@@ -2062,7 +2161,7 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
// buzz;
RS->enterBasicBlockEnd(MBB);
- unsigned Scav = RS->scavengeRegisterBackwards(
+ Register Scav = RS->scavengeRegisterBackwards(
AMDGPU::SReg_64RegClass,
MachineBasicBlock::iterator(GetPC), false, 0);
MRI.replaceRegWith(PCReg, Scav);
@@ -2170,6 +2269,7 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
case AMDGPU::SI_MASK_BRANCH:
case AMDGPU::S_MOV_B64_term:
case AMDGPU::S_XOR_B64_term:
+ case AMDGPU::S_OR_B64_term:
case AMDGPU::S_ANDN2_B64_term:
case AMDGPU::S_MOV_B32_term:
case AMDGPU::S_XOR_B32_term:
@@ -2264,7 +2364,7 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
.addMBB(TBB);
if (BytesAdded)
- *BytesAdded = 4;
+ *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
return 1;
}
@@ -2291,7 +2391,7 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
fixImplicitOperands(*CondBr);
if (BytesAdded)
- *BytesAdded = 4;
+ *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
return 1;
}
@@ -2308,7 +2408,7 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
CondReg.setIsKill(Cond[1].isKill());
if (BytesAdded)
- *BytesAdded = 8;
+ *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
return 2;
}
@@ -2337,7 +2437,8 @@ bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
case VCCZ: {
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
- assert(MRI.getRegClass(FalseReg) == RC);
+ if (MRI.getRegClass(FalseReg) != RC)
+ return false;
int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
@@ -2351,7 +2452,8 @@ bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
// with a vector one.
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
- assert(MRI.getRegClass(FalseReg) == RC);
+ if (MRI.getRegClass(FalseReg) != RC)
+ return false;
int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
@@ -2489,8 +2591,8 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
case AMDGPU::S_MOV_B32:
case AMDGPU::S_MOV_B64:
case AMDGPU::COPY:
- case AMDGPU::V_ACCVGPR_WRITE_B32:
- case AMDGPU::V_ACCVGPR_READ_B32:
+ case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
+ case AMDGPU::V_ACCVGPR_READ_B32_e64:
return true;
default:
return false;
@@ -2543,7 +2645,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::S_MOV_B32:
- case AMDGPU::V_ACCVGPR_WRITE_B32:
+ case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
break;
}
@@ -2567,7 +2669,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (RI.isAGPR(*MRI, DstReg)) {
if (!isInlineConstant(Imm))
return false;
- NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32;
+ NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
}
if (Is16Bit) {
@@ -2588,15 +2690,14 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
UseMI.setDesc(get(NewOpc));
UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
- UseMI.getOperand(1).setTargetFlags(0);
UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
return true;
}
- if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
- Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64 ||
- Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
- Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64) {
+ if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
+ Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
+ Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) {
// Don't fold if we are using source or output modifiers. The new VOP2
// instructions don't have them.
if (hasAnyModifiersSet(UseMI))
@@ -2611,10 +2712,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (isInlineConstant(UseMI, *Src0, *ImmOp))
return false;
- bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
- Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64;
- bool IsFMA = Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
- Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64;
+ bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
+ Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
+ bool IsFMA = Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64;
MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
@@ -2686,10 +2787,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
MRI->hasOneUse(Src0->getReg())) {
Src0->ChangeToImmediate(Def->getOperand(1).getImm());
Src0Inlined = true;
- } else if ((Register::isPhysicalRegister(Src0->getReg()) &&
+ } else if ((Src0->getReg().isPhysical() &&
(ST.getConstantBusLimit(Opc) <= 1 &&
RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) ||
- (Register::isVirtualRegister(Src0->getReg()) &&
+ (Src0->getReg().isVirtual() &&
(ST.getConstantBusLimit(Opc) <= 1 &&
RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
return false;
@@ -2704,9 +2805,9 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
MRI->hasOneUse(Src1->getReg()) &&
commuteInstruction(UseMI)) {
Src0->ChangeToImmediate(Def->getOperand(1).getImm());
- } else if ((Register::isPhysicalRegister(Src1->getReg()) &&
+ } else if ((Src1->getReg().isPhysical() &&
RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
- (Register::isVirtualRegister(Src1->getReg()) &&
+ (Src1->getReg().isVirtual() &&
RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
return false;
// VGPR is okay as Src1 - fallthrough
@@ -2864,6 +2965,18 @@ static int64_t getFoldableImm(const MachineOperand* MO) {
return AMDGPU::NoRegister;
}
+static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
+ MachineInstr &NewMI) {
+ if (LV) {
+ unsigned NumOps = MI.getNumOperands();
+ for (unsigned I = 1; I < NumOps; ++I) {
+ MachineOperand &Op = MI.getOperand(I);
+ if (Op.isReg() && Op.isKill())
+ LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
+ }
+ }
+}
+
MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
MachineInstr &MI,
LiveVariables *LV) const {
@@ -2911,61 +3024,73 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
+ MachineInstrBuilder MIB;
if (!Src0Mods && !Src1Mods && !Clamp && !Omod &&
// If we have an SGPR input, we will violate the constant bus restriction.
- (ST.getConstantBusLimit(Opc) > 1 ||
- !Src0->isReg() ||
+ (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
!RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
if (auto Imm = getFoldableImm(Src2)) {
unsigned NewOpc =
- IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32)
- : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
- if (pseudoToMCOpcode(NewOpc) != -1)
- return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
- .add(*Dst)
- .add(*Src0)
- .add(*Src1)
- .addImm(Imm);
- }
- unsigned NewOpc =
- IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32)
- : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
+ IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32)
+ : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
+ if (pseudoToMCOpcode(NewOpc) != -1) {
+ MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
+ .add(*Dst)
+ .add(*Src0)
+ .add(*Src1)
+ .addImm(Imm);
+ updateLiveVariables(LV, MI, *MIB);
+ return MIB;
+ }
+ }
+ unsigned NewOpc = IsFMA
+ ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32)
+ : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
if (auto Imm = getFoldableImm(Src1)) {
- if (pseudoToMCOpcode(NewOpc) != -1)
- return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
- .add(*Dst)
- .add(*Src0)
- .addImm(Imm)
- .add(*Src2);
+ if (pseudoToMCOpcode(NewOpc) != -1) {
+ MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
+ .add(*Dst)
+ .add(*Src0)
+ .addImm(Imm)
+ .add(*Src2);
+ updateLiveVariables(LV, MI, *MIB);
+ return MIB;
+ }
}
if (auto Imm = getFoldableImm(Src0)) {
if (pseudoToMCOpcode(NewOpc) != -1 &&
- isOperandLegal(MI, AMDGPU::getNamedOperandIdx(NewOpc,
- AMDGPU::OpName::src0), Src1))
- return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
- .add(*Dst)
- .add(*Src1)
- .addImm(Imm)
- .add(*Src2);
+ isOperandLegal(
+ MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
+ Src1)) {
+ MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
+ .add(*Dst)
+ .add(*Src1)
+ .addImm(Imm)
+ .add(*Src2);
+ updateLiveVariables(LV, MI, *MIB);
+ return MIB;
+ }
}
}
- unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16 : AMDGPU::V_FMA_F32)
- : (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
+ unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64 : AMDGPU::V_FMA_F32_e64)
+ : (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64);
if (pseudoToMCOpcode(NewOpc) == -1)
return nullptr;
- return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
- .add(*Dst)
- .addImm(Src0Mods ? Src0Mods->getImm() : 0)
- .add(*Src0)
- .addImm(Src1Mods ? Src1Mods->getImm() : 0)
- .add(*Src1)
- .addImm(0) // Src mods
- .add(*Src2)
- .addImm(Clamp ? Clamp->getImm() : 0)
- .addImm(Omod ? Omod->getImm() : 0);
+ MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
+ .add(*Dst)
+ .addImm(Src0Mods ? Src0Mods->getImm() : 0)
+ .add(*Src0)
+ .addImm(Src1Mods ? Src1Mods->getImm() : 0)
+ .add(*Src1)
+ .addImm(0) // Src mods
+ .add(*Src2)
+ .addImm(Clamp ? Clamp->getImm() : 0)
+ .addImm(Omod ? Omod->getImm() : 0);
+ updateLiveVariables(LV, MI, *MIB);
+ return MIB;
}
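
The fold above turns the two-address mul-add into V_MADAK/V_FMAAK when the addend is a foldable literal and into V_MADMK/V_FMAMK when a multiplicand is, with every early return now routed through updateLiveVariables. A minimal standalone sketch of just that selection order (hypothetical names such as pickFusedForm; the pseudoToMCOpcode, modifier and constant-bus checks are omitted):

// Standalone sketch, not the LLVM builder API: which fused form the fold
// above prefers when exactly one source of "dst = src0 * src1 + src2" is a
// literal constant.
#include <cstdio>
#include <optional>

enum class FusedForm {
  MadAK, // dst = src0 * src1 + K
  MadMK, // dst = src0 * K    + src2
  Mad    // no literal folded; full three-address V_MAD/V_FMA
};

// Mirrors the order used above: try the addend first (MADAK), then either
// multiplicand (MADMK), otherwise keep the three-address form.
FusedForm pickFusedForm(std::optional<float> Src0Imm,
                        std::optional<float> Src1Imm,
                        std::optional<float> Src2Imm) {
  if (Src2Imm)
    return FusedForm::MadAK;
  if (Src1Imm || Src0Imm)
    return FusedForm::MadMK; // the non-literal multiplicand stays a register
  return FusedForm::Mad;
}

int main() {
  // dst = v0 * v1 + 4.0  -> MADAK with K = 4.0
  std::printf("%d\n", (int)pickFusedForm({}, {}, 4.0f));
  // dst = v0 * 2.0 + v2  -> MADMK with K = 2.0
  std::printf("%d\n", (int)pickFusedForm({}, 2.0f, {}));
  // all-register operands -> plain V_MAD / V_FMA
  std::printf("%d\n", (int)pickFusedForm({}, {}, {}));
}
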
// It's not generally safe to move VALU instructions across these since it will
@@ -3003,9 +3128,6 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
// Target-independent instructions do not have an implicit-use of EXEC, even
// when they operate on VGPRs. Treating EXEC modifications as scheduling
// boundaries prevents incorrect movements of such instructions.
-
- // TODO: Don't treat setreg with known constant that only changes MODE as
- // barrier.
return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
@@ -3053,7 +3175,7 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
// EXEC = 0, but checking for that case here seems not worth it
// given the typical code patterns.
if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
- Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE ||
+ isEXP(Opcode) ||
Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
return true;
@@ -3070,7 +3192,8 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
//
// However, executing them with EXEC = 0 causes them to operate on undefined
// data, which we avoid by returning true here.
- if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || Opcode == AMDGPU::V_READLANE_B32)
+ if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
+ Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32)
return true;
return false;
@@ -3241,9 +3364,6 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
if (OpInfo.RegClass < 0)
return false;
- const MachineFunction *MF = MI.getParent()->getParent();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-
if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
OpNo == (unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
@@ -3396,8 +3516,11 @@ MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
Inst32.add(*Src2);
} else {
// In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
- // replaced with an implicit read of vcc. This was already added
- // during the initial BuildMI, so find it to preserve the flags.
+ // replaced with an implicit read of vcc or vcc_lo. The implicit read
+ // of vcc was already added during the initial BuildMI, but we
+ // 1) may need to change vcc to vcc_lo to preserve the original register
+ // 2) have to preserve the original flags.
+ fixImplicitOperands(*Inst32);
copyFlagsToImplicitVCC(*Inst32, *Src2);
}
}
@@ -3420,7 +3543,7 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
if (!MO.isUse())
return false;
- if (Register::isVirtualRegister(MO.getReg()))
+ if (MO.getReg().isVirtual())
return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
// Null is free
@@ -3464,13 +3587,7 @@ static bool shouldReadExec(const MachineInstr &MI) {
if (SIInstrInfo::isVALU(MI)) {
switch (MI.getOpcode()) {
case AMDGPU::V_READLANE_B32:
- case AMDGPU::V_READLANE_B32_gfx6_gfx7:
- case AMDGPU::V_READLANE_B32_gfx10:
- case AMDGPU::V_READLANE_B32_vi:
case AMDGPU::V_WRITELANE_B32:
- case AMDGPU::V_WRITELANE_B32_gfx6_gfx7:
- case AMDGPU::V_WRITELANE_B32_gfx10:
- case AMDGPU::V_WRITELANE_B32_vi:
return false;
}
@@ -3489,7 +3606,7 @@ static bool shouldReadExec(const MachineInstr &MI) {
static bool isSubRegOf(const SIRegisterInfo &TRI,
const MachineOperand &SuperVec,
const MachineOperand &SubReg) {
- if (Register::isPhysicalRegister(SubReg.getReg()))
+ if (SubReg.getReg().isPhysical())
return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
@@ -3530,7 +3647,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
continue;
Register Reg = Op.getReg();
- if (!Register::isVirtualRegister(Reg) && !RC->contains(Reg)) {
+ if (!Reg.isVirtual() && !RC->contains(Reg)) {
ErrInfo = "inlineasm operand has incorrect register class.";
return false;
}
@@ -3600,7 +3717,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
if (RegClass != -1) {
Register Reg = MI.getOperand(i).getReg();
- if (Reg == AMDGPU::NoRegister || Register::isVirtualRegister(Reg))
+ if (Reg == AMDGPU::NoRegister || Reg.isVirtual())
continue;
const TargetRegisterClass *RC = RI.getRegClass(RegClass);
@@ -3636,7 +3753,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
} else {
// No immediates on GFX9
if (!MO.isReg()) {
- ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9";
+ ErrInfo =
+ "Only reg allowed as operands in SDWA instructions on GFX9+";
return false;
}
}
@@ -3693,7 +3811,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
ErrInfo =
"Dst register should be tied to implicit use of preserved register";
return false;
- } else if (Register::isPhysicalRegister(TiedMO.getReg()) &&
+ } else if (TiedMO.getReg().isPhysical() &&
Dst.getReg() != TiedMO.getReg()) {
ErrInfo = "Dst register should use same physical register as preserved";
return false;
@@ -3752,11 +3870,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
++ConstantBusCount;
SmallVector<Register, 2> SGPRsUsed;
- Register SGPRUsed = findImplicitSGPRRead(MI);
- if (SGPRUsed != AMDGPU::NoRegister) {
- ++ConstantBusCount;
- SGPRsUsed.push_back(SGPRUsed);
- }
+ Register SGPRUsed;
for (int OpIdx : OpIndices) {
if (OpIdx == -1)
@@ -3765,8 +3879,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
if (MO.isReg()) {
SGPRUsed = MO.getReg();
- if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
- return !RI.regsOverlap(SGPRUsed, SGPR);
+ if (llvm::all_of(SGPRsUsed, [SGPRUsed](unsigned SGPR) {
+ return SGPRUsed != SGPR;
})) {
++ConstantBusCount;
SGPRsUsed.push_back(SGPRUsed);
@@ -3777,7 +3891,18 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
}
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+
+ SGPRUsed = findImplicitSGPRRead(MI);
+ if (SGPRUsed != AMDGPU::NoRegister) {
+ // Implicit uses may safely overlap true operands
+ if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
+ return !RI.regsOverlap(SGPRUsed, SGPR);
+ })) {
+ ++ConstantBusCount;
+ SGPRsUsed.push_back(SGPRUsed);
+ }
+ }
+
// v_writelane_b32 is an exception from constant bus restriction:
// vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
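
In the reordered check above, explicit operands on the constant bus are de-duplicated by exact register equality, and the implicit SGPR read is charged only when it does not overlap an SGPR that was already counted. A small standalone sketch of that counting rule over a hypothetical string-based register model (overlaps and countConstantBusUses are made-up helpers):

// Standalone sketch of the constant-bus accounting above; registers are plain
// strings here, and a prefix relation stands in for regsOverlap.
#include <cstdio>
#include <string>
#include <vector>

// Treats e.g. "vcc" and "vcc_lo" as overlapping, distinct names otherwise.
static bool overlaps(const std::string &A, const std::string &B) {
  return A.rfind(B, 0) == 0 || B.rfind(A, 0) == 0;
}

static int countConstantBusUses(const std::vector<std::string> &ExplicitSGPRs,
                                const std::string &ImplicitSGPR) {
  std::vector<std::string> Used;
  int Count = 0;
  for (const std::string &S : ExplicitSGPRs) {
    bool Seen = false;
    for (const std::string &U : Used)
      Seen |= (U == S); // explicit operands: exact-equality de-duplication
    if (!Seen) {
      ++Count;
      Used.push_back(S);
    }
  }
  if (!ImplicitSGPR.empty()) {
    bool Overlap = false;
    for (const std::string &U : Used)
      Overlap |= overlaps(U, ImplicitSGPR); // implicit read may overlap safely
    if (!Overlap)
      ++Count;
  }
  return Count;
}

int main() {
  // s0 and s1 through the constant bus plus an implicit read of vcc_lo that
  // overlaps neither: three constant-bus uses, to be compared against the
  // subtarget's limit.
  std::printf("%d\n", countConstantBusUses({"s0", "s1", "s0"}, "vcc_lo"));
}
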
@@ -3825,8 +3950,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
// Verify misc. restrictions on specific instructions.
- if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
- Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
+ if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
+ Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
const MachineOperand &Src0 = MI.getOperand(Src0Idx);
const MachineOperand &Src1 = MI.getOperand(Src1Idx);
const MachineOperand &Src2 = MI.getOperand(Src2Idx);
@@ -3837,6 +3962,15 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
return false;
}
}
+ if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
+ SISrcMods::ABS) ||
+ (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
+ SISrcMods::ABS) ||
+ (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
+ SISrcMods::ABS)) {
+ ErrInfo = "ABS not allowed in VOP3B instructions";
+ return false;
+ }
}
if (isSOP2(MI) || isSOPC(MI)) {
@@ -3945,7 +4079,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
- if (isFLAT(MI) && !MF->getSubtarget<GCNSubtarget>().hasFlatInstOffsets()) {
+ if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
if (Offset->getImm() != 0) {
ErrInfo = "subtarget does not support offsets in flat instructions";
@@ -4079,21 +4213,21 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
}
case AMDGPU::S_ADD_I32:
- return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32;
+ return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
case AMDGPU::S_ADDC_U32:
return AMDGPU::V_ADDC_U32_e32;
case AMDGPU::S_SUB_I32:
- return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
+ return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
// FIXME: These are not consistently handled, and selected when the carry is
// used.
case AMDGPU::S_ADD_U32:
- return AMDGPU::V_ADD_I32_e32;
+ return AMDGPU::V_ADD_CO_U32_e32;
case AMDGPU::S_SUB_U32:
- return AMDGPU::V_SUB_I32_e32;
+ return AMDGPU::V_SUB_CO_U32_e32;
case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
- case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32;
- case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32;
- case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32;
+ case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
+ case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
+ case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
@@ -4104,15 +4238,15 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
- case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
+ case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
- case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
+ case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
- case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
- case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
- case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
- case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
- case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
+ case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
+ case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
+ case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
+ case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
+ case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
@@ -4150,7 +4284,7 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
Desc.OpInfo[OpNo].RegClass == -1) {
Register Reg = MI.getOperand(OpNo).getReg();
- if (Register::isVirtualRegister(Reg))
+ if (Reg.isVirtual())
return MRI.getRegClass(Reg);
return RI.getPhysRegClass(Reg);
}
@@ -4164,11 +4298,9 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
MachineBasicBlock *MBB = MI.getParent();
MachineOperand &MO = MI.getOperand(OpIdx);
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
const TargetRegisterClass *RC = RI.getRegClass(RCID);
- unsigned Size = TRI->getRegSizeInBits(*RC);
+ unsigned Size = RI.getRegSizeInBits(*RC);
unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
if (MO.isReg())
Opcode = AMDGPU::COPY;
@@ -4255,11 +4387,13 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
return false;
Register Reg = MO.getReg();
- const TargetRegisterClass *RC = Register::isVirtualRegister(Reg)
- ? MRI.getRegClass(Reg)
- : RI.getPhysRegClass(Reg);
const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
+ if (Reg.isPhysical())
+ return DRC->contains(Reg);
+
+ const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+
if (MO.getSubReg()) {
const MachineFunction *MF = MO.getParent()->getParent()->getParent();
const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
@@ -4290,7 +4424,6 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
const MachineRegisterInfo &MRI = MF.getRegInfo();
const MCInstrDesc &InstDesc = MI.getDesc();
const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const TargetRegisterClass *DefinedRC =
OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
if (!MO)
@@ -4469,8 +4602,8 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
};
- if (Opc == AMDGPU::V_PERMLANE16_B32 ||
- Opc == AMDGPU::V_PERMLANEX16_B32) {
+ if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
+ Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
// src1 and src2 must be scalar
MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
@@ -4493,7 +4626,7 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
int ConstantBusLimit = ST.getConstantBusLimit(Opc);
int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
SmallDenseSet<unsigned> SGPRsUsed;
- unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
+ Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
if (SGPRReg != AMDGPU::NoRegister) {
SGPRsUsed.insert(SGPRReg);
--ConstantBusLimit;
@@ -4597,16 +4730,32 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
// pointer value is uniform.
MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
- unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
+ Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
SBase->setReg(SGPR);
}
MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
- unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
+ Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
SOff->setReg(SGPR);
}
}
+// FIXME: Remove this when SelectionDAG is obsoleted.
+void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
+ MachineInstr &MI) const {
+ if (!isSegmentSpecificFLAT(MI))
+ return;
+
+ // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
+ // analysis thinks they are uniform, so a readfirstlane should be valid.
+ MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
+ if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
+ return;
+
+ Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI);
+ SAddr->setReg(ToSGPR);
+}
+
void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
MachineBasicBlock::iterator I,
const TargetRegisterClass *DstRC,
@@ -4671,59 +4820,82 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
MachineBasicBlock::iterator I = LoopBB.begin();
+ SmallVector<Register, 8> ReadlanePieces;
+ Register CondReg = AMDGPU::NoRegister;
+
Register VRsrc = Rsrc.getReg();
unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
- Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
- Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
- Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
- Register AndCond = MRI.createVirtualRegister(BoolXExecRC);
- Register SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- Register SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- Register SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- Register SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- Register SRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
-
- // Beginning of the loop, read the next Rsrc variant.
- BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0)
- .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0);
- BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1)
- .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1);
- BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2)
- .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2);
- BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3)
- .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3);
-
- BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc)
- .addReg(SRsrcSub0)
- .addImm(AMDGPU::sub0)
- .addReg(SRsrcSub1)
- .addImm(AMDGPU::sub1)
- .addReg(SRsrcSub2)
- .addImm(AMDGPU::sub2)
- .addReg(SRsrcSub3)
- .addImm(AMDGPU::sub3);
+ unsigned RegSize = TRI->getRegSizeInBits(Rsrc.getReg(), MRI);
+ unsigned NumSubRegs = RegSize / 32;
+ assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && "Unhandled register size");
+
+ for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
+
+ Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+
+ // Read the next variant <- also loop target.
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
+ .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx));
+
+ // Read the high half of the next variant.
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
+ .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx + 1));
+
+ ReadlanePieces.push_back(CurRegLo);
+ ReadlanePieces.push_back(CurRegHi);
+
+ // Comparison is to be done as 64-bit.
+ Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
+ .addReg(CurRegLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(CurRegHi)
+ .addImm(AMDGPU::sub1);
+
+ Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
+ auto Cmp =
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), NewCondReg)
+ .addReg(CurReg);
+ if (NumSubRegs <= 2)
+ Cmp.addReg(VRsrc);
+ else
+ Cmp.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2));
+
+ // Combine the comparison results with AND.
+ if (CondReg == AMDGPU::NoRegister) // First.
+ CondReg = NewCondReg;
+ else { // If not the first, we create an AND.
+ Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
+ BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
+ .addReg(CondReg)
+ .addReg(NewCondReg);
+ CondReg = AndReg;
+ }
+ } // End for loop.
+
+ auto SRsrcRC = TRI->getEquivalentSGPRClass(MRI.getRegClass(VRsrc));
+ Register SRsrc = MRI.createVirtualRegister(SRsrcRC);
+
+ // Build scalar Rsrc.
+ auto Merge = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc);
+ unsigned Channel = 0;
+ for (Register Piece : ReadlanePieces) {
+ Merge.addReg(Piece)
+ .addImm(TRI->getSubRegFromChannel(Channel++));
+ }
// Update Rsrc operand to use the SGPR Rsrc.
Rsrc.setReg(SRsrc);
Rsrc.setIsKill(true);
- // Identify all lanes with identical Rsrc operands in their VGPRs.
- BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0)
- .addReg(SRsrc, 0, AMDGPU::sub0_sub1)
- .addReg(VRsrc, 0, AMDGPU::sub0_sub1);
- BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1)
- .addReg(SRsrc, 0, AMDGPU::sub2_sub3)
- .addReg(VRsrc, 0, AMDGPU::sub2_sub3);
- BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndCond)
- .addReg(CondReg0)
- .addReg(CondReg1);
-
- MRI.setSimpleHint(SaveExec, AndCond);
+ Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
+ MRI.setSimpleHint(SaveExec, CondReg);
// Update EXEC to matching lanes, saving original to SaveExec.
BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
- .addReg(AndCond, RegState::Kill);
+ .addReg(CondReg, RegState::Kill);
// The original instruction is here; we insert the terminators after it.
I = LoopBB.end();
@@ -4732,19 +4904,29 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec)
.addReg(Exec)
.addReg(SaveExec);
+
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
}
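
The rewritten loop body handles descriptors of any even sub-register count: it readfirstlanes the value 64 bits at a time, compares every lane's copy of each piece with V_CMP_EQ_U64, and ANDs the per-piece results into one mask of matching lanes before that mask is moved into EXEC. A standalone simulation of a single trip through that uniformization step, using plain data and hypothetical helpers instead of MachineIR:

// Standalone simulation of one waterfall-loop iteration: take the first
// active lane's descriptor, compare it 64 bits at a time against every lane,
// and AND the per-piece comparisons into the set of lanes allowed to run.
#include <array>
#include <cstdint>
#include <cstdio>
#include <vector>

using Descriptor = std::array<uint32_t, 4>; // e.g. a 128-bit resource descriptor

static uint64_t piece(const Descriptor &D, unsigned Idx) { // 64-bit chunk
  return (uint64_t(D[Idx + 1]) << 32) | D[Idx];
}

// Returns the EXEC-like mask of lanes whose descriptor equals lane First's.
static uint32_t matchingLanes(const std::vector<Descriptor> &Lanes,
                              unsigned First) {
  uint32_t Cond = ~0u; // running AND of the per-piece comparison results
  for (unsigned Idx = 0; Idx < 4; Idx += 2) {
    uint64_t Scalar = piece(Lanes[First], Idx); // "readfirstlane" of the piece
    uint32_t PieceCond = 0;
    for (unsigned L = 0; L < Lanes.size(); ++L)
      if (piece(Lanes[L], Idx) == Scalar)       // V_CMP_EQ_U64 across lanes
        PieceCond |= 1u << L;
    Cond &= PieceCond;                          // S_AND with the running mask
  }
  return Cond;
}

int main() {
  std::vector<Descriptor> Lanes = {{1, 2, 3, 4}, {1, 2, 3, 4}, {9, 2, 3, 4}};
  std::printf("0x%x\n", matchingLanes(Lanes, 0)); // lanes 0 and 1 match: 0x3
}
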
// Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
// with SGPRs by iterating over all unique values across all lanes.
-static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
- MachineOperand &Rsrc, MachineDominatorTree *MDT) {
+// Returns the loop basic block that now contains \p MI.
+static MachineBasicBlock *
+loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
+ MachineOperand &Rsrc, MachineDominatorTree *MDT,
+ MachineBasicBlock::iterator Begin = nullptr,
+ MachineBasicBlock::iterator End = nullptr) {
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
- MachineBasicBlock::iterator I(&MI);
+ if (!Begin.isValid())
+ Begin = &MI;
+ if (!End.isValid()) {
+ End = &MI;
+ ++End;
+ }
const DebugLoc &DL = MI.getDebugLoc();
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
@@ -4753,13 +4935,17 @@ static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
// Save the EXEC mask
- BuildMI(MBB, I, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
+ BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
// Killed uses in the instruction we are waterfalling around will be
// incorrect due to the added control-flow.
- for (auto &MO : MI.uses()) {
- if (MO.isReg() && MO.isUse()) {
- MRI.clearKillFlags(MO.getReg());
+ MachineBasicBlock::iterator AfterMI = MI;
+ ++AfterMI;
+ for (auto I = Begin; I != AfterMI; I++) {
+ for (auto &MO : I->uses()) {
+ if (MO.isReg() && MO.isUse()) {
+ MRI.clearKillFlags(MO.getReg());
+ }
}
}
@@ -4776,11 +4962,11 @@ static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
LoopBB->addSuccessor(LoopBB);
LoopBB->addSuccessor(RemainderBB);
- // Move MI to the LoopBB, and the remainder of the block to RemainderBB.
- MachineBasicBlock::iterator J = I++;
+ // Move the instructions from Begin up to End into LoopBB, and the remainder
+ // of the block into RemainderBB.
RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
- RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
- LoopBB->splice(LoopBB->begin(), &MBB, J);
+ RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
+ LoopBB->splice(LoopBB->begin(), &MBB, Begin, MBB.end());
MBB.addSuccessor(LoopBB);
@@ -4803,6 +4989,7 @@ static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
// Restore the EXEC mask
MachineBasicBlock::iterator First = RemainderBB->begin();
BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
+ return LoopBB;
}
// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
@@ -4848,27 +5035,35 @@ extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
return std::make_tuple(RsrcPtr, NewSRsrc);
}
-void SIInstrInfo::legalizeOperands(MachineInstr &MI,
- MachineDominatorTree *MDT) const {
+MachineBasicBlock *
+SIInstrInfo::legalizeOperands(MachineInstr &MI,
+ MachineDominatorTree *MDT) const {
MachineFunction &MF = *MI.getParent()->getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineBasicBlock *CreatedBB = nullptr;
// Legalize VOP2
if (isVOP2(MI) || isVOPC(MI)) {
legalizeOperandsVOP2(MRI, MI);
- return;
+ return CreatedBB;
}
// Legalize VOP3
if (isVOP3(MI)) {
legalizeOperandsVOP3(MRI, MI);
- return;
+ return CreatedBB;
}
// Legalize SMRD
if (isSMRD(MI)) {
legalizeOperandsSMRD(MRI, MI);
- return;
+ return CreatedBB;
+ }
+
+ // Legalize FLAT
+ if (isFLAT(MI)) {
+ legalizeOperandsFLAT(MRI, MI);
+ return CreatedBB;
}
// Legalize REG_SEQUENCE and PHI
@@ -4877,8 +5072,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
if (MI.getOpcode() == AMDGPU::PHI) {
const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
- if (!MI.getOperand(i).isReg() ||
- !Register::isVirtualRegister(MI.getOperand(i).getReg()))
+ if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
continue;
const TargetRegisterClass *OpRC =
MRI.getRegClass(MI.getOperand(i).getReg());
@@ -4914,7 +5108,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
// Update all the operands so they have the same type.
for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
MachineOperand &Op = MI.getOperand(I);
- if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg()))
+ if (!Op.isReg() || !Op.getReg().isVirtual())
continue;
// MI is a PHI instruction.
@@ -4939,7 +5133,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
// subregister index types e.g. sub0_sub1 + sub2 + sub3
for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
MachineOperand &Op = MI.getOperand(I);
- if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg()))
+ if (!Op.isReg() || !Op.getReg().isVirtual())
continue;
const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
@@ -4952,7 +5146,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
}
}
- return;
+ return CreatedBB;
}
// Legalize INSERT_SUBREG
@@ -4967,7 +5161,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
MachineOperand &Op = MI.getOperand(1);
legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
}
- return;
+ return CreatedBB;
}
// Legalize SI_INIT_M0
@@ -4975,7 +5169,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
MachineOperand &Src = MI.getOperand(0);
if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
- return;
+ return CreatedBB;
}
// Legalize MIMG and MUBUF/MTBUF for shaders.
@@ -4983,21 +5177,44 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
// Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
// scratch memory access. In both cases, the legalization never involves
// conversion to the addr64 form.
- if (isMIMG(MI) ||
- (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
- (isMUBUF(MI) || isMTBUF(MI)))) {
+ if (isMIMG(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
+ (isMUBUF(MI) || isMTBUF(MI)))) {
MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
- if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
- unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
- SRsrc->setReg(SGPR);
- }
+ if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
+ CreatedBB = loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT);
MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
- if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
- unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
- SSamp->setReg(SGPR);
+ if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
+ CreatedBB = loadSRsrcFromVGPR(*this, MI, *SSamp, MDT);
+
+ return CreatedBB;
+ }
+
+ // Legalize SI_CALL
+ if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
+ MachineOperand *Dest = &MI.getOperand(0);
+ if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
+ // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, along with
+ // the following copies; copies from and to physical registers also need to
+ // be moved into the loop block.
+ unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
+ unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
+
+ // Also move the copies to physical registers into the loop block
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineBasicBlock::iterator Start(&MI);
+ while (Start->getOpcode() != FrameSetupOpcode)
+ --Start;
+ MachineBasicBlock::iterator End(&MI);
+ while (End->getOpcode() != FrameDestroyOpcode)
+ ++End;
+ // Also include following copies of the return value
+ ++End;
+ while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
+ MI.definesRegister(End->getOperand(1).getReg()))
+ ++End;
+ CreatedBB = loadSRsrcFromVGPR(*this, MI, *Dest, MDT, Start, End);
}
- return;
}
// Legalize MUBUF* instructions.
@@ -5011,7 +5228,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
RI.getRegClass(RsrcRC))) {
// The operands are legal.
// FIXME: We may need to legalize operands besides srsrc.
- return;
+ return CreatedBB;
}
// Legalize a VGPR Rsrc.
@@ -5046,7 +5263,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
// NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
const DebugLoc &DL = MI.getDebugLoc();
- BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e64), NewVAddrLo)
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
.addDef(CondReg0)
.addReg(RsrcPtr, 0, AMDGPU::sub0)
.addReg(VAddr->getReg(), 0, AMDGPU::sub0)
@@ -5072,8 +5289,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
} else if (!VAddr && ST.hasAddr64()) {
// This instructions is the _OFFSET variant, so we need to convert it to
// ADDR64.
- assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
- < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+ assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
"FIXME: Need to emit flat atomics here");
unsigned RsrcPtr, NewSRsrc;
@@ -5146,15 +5362,19 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
} else {
// This is another variant; legalize Rsrc with waterfall loop from VGPRs
// to SGPRs.
- loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
+ CreatedBB = loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
+ return CreatedBB;
}
}
+ return CreatedBB;
}
-void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
- MachineDominatorTree *MDT) const {
+MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
+ MachineDominatorTree *MDT) const {
SetVectorType Worklist;
Worklist.insert(&TopInst);
+ MachineBasicBlock *CreatedBB = nullptr;
+ MachineBasicBlock *CreatedBBTmp = nullptr;
while (!Worklist.empty()) {
MachineInstr &Inst = *Worklist.pop_back_val();
@@ -5174,13 +5394,18 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
Inst.eraseFromParent();
continue;
case AMDGPU::S_ADD_I32:
- case AMDGPU::S_SUB_I32:
+ case AMDGPU::S_SUB_I32: {
// FIXME: The u32 versions currently selected use the carry.
- if (moveScalarAddSub(Worklist, Inst, MDT))
+ bool Changed;
+ std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
+ if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
+ CreatedBB = CreatedBBTmp;
+ if (Changed)
continue;
// Default handling
break;
+ }
case AMDGPU::S_AND_B64:
splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
Inst.eraseFromParent();
@@ -5259,19 +5484,19 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
break;
case AMDGPU::S_LSHL_B64:
if (ST.hasOnlyRevVALUShifts()) {
- NewOpcode = AMDGPU::V_LSHLREV_B64;
+ NewOpcode = AMDGPU::V_LSHLREV_B64_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_ASHR_I64:
if (ST.hasOnlyRevVALUShifts()) {
- NewOpcode = AMDGPU::V_ASHRREV_I64;
+ NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHR_B64:
if (ST.hasOnlyRevVALUShifts()) {
- NewOpcode = AMDGPU::V_LSHRREV_B64;
+ NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
swapOperands(Inst);
}
break;
@@ -5361,7 +5586,9 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
.add(Inst.getOperand(3))
.addReg(CarryInReg)
.addImm(0);
- legalizeOperands(*CarryOp);
+ CreatedBBTmp = legalizeOperands(*CarryOp);
+ if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
+ CreatedBB = CreatedBBTmp;
MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
Inst.eraseFromParent();
@@ -5376,8 +5603,8 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
MachineOperand &Src1 = Inst.getOperand(3);
unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
- ? AMDGPU::V_ADD_I32_e64
- : AMDGPU::V_SUB_I32_e64;
+ ? AMDGPU::V_ADD_CO_U32_e64
+ : AMDGPU::V_SUB_CO_U32_e64;
const TargetRegisterClass *NewRC =
RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
Register DestReg = MRI.createVirtualRegister(NewRC);
@@ -5387,7 +5614,9 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
.add(Src1)
.addImm(0); // clamp bit
- legalizeOperands(*NewInstr, MDT);
+ CreatedBBTmp = legalizeOperands(*NewInstr, MDT);
+ if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
+ CreatedBB = CreatedBBTmp;
MRI.replaceRegWith(Dest0.getReg(), DestReg);
addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
@@ -5406,7 +5635,9 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
// We cannot move this instruction to the VALU, so we should try to
// legalize its operands instead.
- legalizeOperands(Inst, MDT);
+ CreatedBBTmp = legalizeOperands(Inst, MDT);
+ if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
+ CreatedBB = CreatedBBTmp;
continue;
}
@@ -5462,7 +5693,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
unsigned NewDstReg = AMDGPU::NoRegister;
if (HasDst) {
Register DstReg = Inst.getOperand(0).getReg();
- if (Register::isPhysicalRegister(DstReg))
+ if (DstReg.isPhysical())
continue;
// Update the destination register class.
@@ -5470,8 +5701,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
if (!NewDstRC)
continue;
- if (Inst.isCopy() &&
- Register::isVirtualRegister(Inst.getOperand(1).getReg()) &&
+ if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
// Instead of creating a copy where src and dst are the same register
// class, we just replace all uses of dst with src. These kinds of
@@ -5498,16 +5728,20 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
}
// Legalize the operands
- legalizeOperands(Inst, MDT);
+ CreatedBBTmp = legalizeOperands(Inst, MDT);
+ if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
+ CreatedBB = CreatedBBTmp;
if (HasDst)
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
}
+ return CreatedBB;
}
// Add/sub require special handling to deal with carry outs.
-bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
- MachineDominatorTree *MDT) const {
+std::pair<bool, MachineBasicBlock *>
+SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT) const {
if (ST.hasAddNoCarry()) {
// Assume there is no user of scc since we don't select this in that case.
// Since scc isn't used, it doesn't really matter if the i32 or u32 variant
@@ -5532,13 +5766,13 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
Inst.addImplicitDefUseOperands(*MBB.getParent());
MRI.replaceRegWith(OldDstReg, ResultReg);
- legalizeOperands(Inst, MDT);
+ MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
- return true;
+ return std::make_pair(true, NewBB);
}
- return false;
+ return std::make_pair(false, nullptr);
}
void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
@@ -5626,7 +5860,7 @@ void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned SubOp = ST.hasAddNoCarry() ?
- AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32;
+ AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
.addImm(0)
@@ -5855,7 +6089,7 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
AMDGPU::sub1, Src1SubRC);
- unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+ unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
MachineInstr *LoHalf =
BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
.addReg(CarryReg, RegState::Define)
@@ -6055,7 +6289,7 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
- BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
.addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
.addImm(0)
.addImm(BitWidth);
@@ -6152,7 +6386,7 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
.addReg(ImmReg, RegState::Kill)
.add(Src0);
- BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
.add(Src1)
.addImm(16)
.addReg(TmpReg, RegState::Kill);
@@ -6162,7 +6396,7 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
.addImm(0xffff);
- BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
.addReg(ImmReg, RegState::Kill)
.add(Src0)
.add(Src1);
@@ -6176,7 +6410,7 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
.add(Src0);
BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
.addImm(0xffff0000);
- BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
.add(Src1)
.addReg(ImmReg, RegState::Kill)
.addReg(TmpReg, RegState::Kill);
@@ -6209,7 +6443,7 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) {
if (MI.isCopy()) {
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
- unsigned DestReg = MI.getOperand(0).getReg();
+ Register DestReg = MI.getOperand(0).getReg();
for (auto &User : MRI.use_nodbg_instructions(DestReg)) {
if ((User.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) ||
@@ -6407,7 +6641,7 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
// GFX9 doesn't have ELEMENT_SIZE.
if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
+ uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
}
@@ -6503,8 +6737,16 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
// If we have a definitive size, we can use it. Otherwise we need to inspect
// the operands to know the size.
- if (isFixedSize(MI))
- return DescSize;
+ if (isFixedSize(MI)) {
+ unsigned Size = DescSize;
+
+ // If we hit the buggy offset, an extra nop will be inserted in MC, so
+ // estimate the worst case.
+ if (MI.isBranch() && ST.hasOffset3fBug())
+ Size += 4;
+
+ return Size;
+ }
// 4-byte instructions may have a 32-bit literal encoded after them. Check
// operands that could ever be literals.
@@ -6555,8 +6797,7 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
case TargetOpcode::INLINEASM_BR: {
const MachineFunction *MF = MI.getParent()->getParent();
const char *AsmStr = MI.getOperand(0).getSymbolName();
- return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(),
- &MF->getSubtarget());
+ return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
}
default:
return DescSize;
@@ -6716,7 +6957,7 @@ SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
- return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
+ return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
.addReg(UnusedCarry, RegState::Define | RegState::Dead);
}
@@ -6737,7 +6978,7 @@ MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
if (!UnusedCarry.isValid())
return MachineInstrBuilder();
- return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
+ return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
.addReg(UnusedCarry, RegState::Define | RegState::Dead);
}
@@ -6763,10 +7004,6 @@ const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) con
}
void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
- MachineBasicBlock *MBB = MI.getParent();
- MachineFunction *MF = MBB->getParent();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-
if (!ST.isWave32())
return;
@@ -6789,20 +7026,6 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
}
-unsigned SIInstrInfo::getNumFlatOffsetBits(unsigned AddrSpace,
- bool Signed) const {
- if (!ST.hasFlatInstOffsets())
- return 0;
-
- if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS)
- return 0;
-
- if (ST.getGeneration() >= AMDGPUSubtarget::GFX10)
- return Signed ? 12 : 11;
-
- return Signed ? 13 : 12;
-}
-
bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
bool Signed) const {
// TODO: Should 0 be special cased?
@@ -6812,16 +7035,31 @@ bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS)
return false;
- if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
- return (Signed && isInt<12>(Offset)) ||
- (!Signed && isUInt<11>(Offset));
+ unsigned N = AMDGPU::getNumFlatOffsetBits(ST, Signed);
+ return Signed ? isIntN(N, Offset) : isUIntN(N, Offset);
+}
+
+std::pair<int64_t, int64_t> SIInstrInfo::splitFlatOffset(int64_t COffsetVal,
+ unsigned AddrSpace,
+ bool IsSigned) const {
+ int64_t RemainderOffset = COffsetVal;
+ int64_t ImmField = 0;
+ const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST, IsSigned);
+ if (IsSigned) {
+ // Use signed division by a power of two to truncate towards 0.
+ int64_t D = 1LL << (NumBits - 1);
+ RemainderOffset = (COffsetVal / D) * D;
+ ImmField = COffsetVal - RemainderOffset;
+ } else if (COffsetVal >= 0) {
+ ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
+ RemainderOffset = COffsetVal - ImmField;
}
- return (Signed && isInt<13>(Offset)) ||
- (!Signed && isUInt<12>(Offset));
+ assert(isLegalFLATOffset(ImmField, AddrSpace, IsSigned));
+ assert(RemainderOffset + ImmField == COffsetVal);
+ return {ImmField, RemainderOffset};
}
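
splitFlatOffset peels off the part of a constant offset that fits in the FLAT immediate field and leaves the rest to be added separately: for signed fields it truncates toward zero by dividing by 2^(NumBits-1), for unsigned fields it masks off the low NumBits (only when the offset is non-negative). A worked standalone sketch of just that arithmetic, using an illustrative bit width rather than whatever getNumFlatOffsetBits would actually return:

// Standalone sketch of the split above: imm + remainder == Offset, with imm
// guaranteed to fit in a NumBits-wide (signed or unsigned) immediate field.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <utility>

static std::pair<int64_t, int64_t> splitOffset(int64_t Offset, unsigned NumBits,
                                               bool IsSigned) {
  int64_t Remainder = Offset, Imm = 0;
  if (IsSigned) {
    // Signed division by a power of two truncates toward zero, so the
    // immediate lands in (-2^(NumBits-1), 2^(NumBits-1)).
    int64_t D = int64_t(1) << (NumBits - 1);
    Remainder = (Offset / D) * D;
    Imm = Offset - Remainder;
  } else if (Offset >= 0) {
    Imm = Offset & ((int64_t(1) << NumBits) - 1); // keep the low NumBits bits
    Remainder = Offset - Imm;
  }
  assert(Imm + Remainder == Offset);
  return {Imm, Remainder};
}

int main() {
  // With a hypothetical 12-bit signed field: 5000 = 4096 + 904, and 904 fits.
  auto Split = splitOffset(/*Offset=*/5000, /*NumBits=*/12, /*IsSigned=*/true);
  std::printf("imm=%lld remainder=%lld\n", (long long)Split.first,
              (long long)Split.second);
}
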
-
// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
enum SIEncodingFamily {
SI = 0,
@@ -6962,7 +7200,7 @@ static bool followSubRegDef(MachineInstr &MI,
MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
MachineRegisterInfo &MRI) {
assert(MRI.isSSA());
- if (!Register::isVirtualRegister(P.Reg))
+ if (!P.Reg.isVirtual())
return nullptr;
auto RSR = P;
@@ -6973,7 +7211,7 @@ MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
case AMDGPU::COPY:
case AMDGPU::V_MOV_B32_e32: {
auto &Op1 = MI->getOperand(1);
- if (Op1.isReg() && Register::isVirtualRegister(Op1.getReg())) {
+ if (Op1.isReg() && Op1.getReg().isVirtual()) {
if (Op1.isUndef())
return nullptr;
RSR = getRegSubRegPair(Op1);
@@ -7035,36 +7273,51 @@ bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
auto *TRI = MRI.getTargetRegisterInfo();
auto *DefBB = DefMI.getParent();
- const int MaxUseInstScan = 10;
- int NumUseInst = 0;
+ const int MaxUseScan = 10;
+ int NumUse = 0;
- for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) {
+ for (auto &Use : MRI.use_nodbg_operands(VReg)) {
+ auto &UseInst = *Use.getParent();
// Don't bother searching between blocks, although it is possible this block
// doesn't modify exec.
if (UseInst.getParent() != DefBB)
return true;
- if (++NumUseInst > MaxUseInstScan)
+ if (++NumUse > MaxUseScan)
return true;
}
+ if (NumUse == 0)
+ return false;
+
const int MaxInstScan = 20;
int NumInst = 0;
// Stop scan when we have seen all the uses.
for (auto I = std::next(DefMI.getIterator()); ; ++I) {
+ assert(I != DefBB->end());
+
if (I->isDebugInstr())
continue;
if (++NumInst > MaxInstScan)
return true;
- if (I->readsRegister(VReg))
- if (--NumUseInst == 0)
- return false;
+ for (const MachineOperand &Op : I->operands()) {
+ // We don't check reg masks here as they're used only on calls:
+ // 1. EXEC is only considered const within one BB
+ // 2. Call should be a terminator instruction if present in a BB
- if (I->modifiesRegister(AMDGPU::EXEC, TRI))
- return true;
+ if (!Op.isReg())
+ continue;
+
+ Register Reg = Op.getReg();
+ if (Op.isUse()) {
+ if (Reg == VReg && --NumUse == 0)
+ return false;
+ } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
+ return true;
+ }
}
}
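
The rewritten scan above walks forward from the defining instruction and answers "could EXEC change before the last in-block use?": it gives up conservatively after 20 scanned instructions, returns true as soon as any operand writes a register overlapping EXEC, and returns false once every counted use of the value has been seen first. A standalone sketch of the same bounded scan over a toy instruction model (hypothetical Inst type, not MachineIR):

// Standalone sketch of the bounded forward scan above; instructions are just
// lists of read and written register names.
#include <cstdio>
#include <string>
#include <vector>

struct Inst {
  std::vector<std::string> Uses; // registers read
  std::vector<std::string> Defs; // registers written
};

static bool isExecLike(const std::string &R) {
  return R == "exec" || R == "exec_lo" || R == "exec_hi";
}

static bool execMayChangeBeforeUses(const std::vector<Inst> &Block,
                                    unsigned DefIdx, const std::string &VReg,
                                    int NumUses) {
  const int MaxInstScan = 20;
  int Scanned = 0;
  for (unsigned I = DefIdx + 1; I < Block.size(); ++I) {
    if (++Scanned > MaxInstScan)
      return true; // scan budget exhausted: assume the worst
    for (const std::string &D : Block[I].Defs)
      if (isExecLike(D))
        return true; // EXEC written before all uses were reached
    for (const std::string &U : Block[I].Uses)
      if (U == VReg && --NumUses == 0)
        return false; // every use seen with EXEC untouched
  }
  return true; // conservative fallback
}

int main() {
  std::vector<Inst> Block = {
      {{}, {"v0"}},      // def of v0
      {{"v0"}, {"v1"}},  // the only use of v0; EXEC still unmodified
      {{}, {"exec_lo"}}, // EXEC changes only afterwards
  };
  std::printf("%d\n", execMayChangeBeforeUses(Block, 0, "v0", 1)); // prints 0
}
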
@@ -7158,3 +7411,25 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
return SchedModel.computeInstrLatency(&MI);
}
+
+unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
+ switch (MF.getFunction().getCallingConv()) {
+ case CallingConv::AMDGPU_PS:
+ return 1;
+ case CallingConv::AMDGPU_VS:
+ return 2;
+ case CallingConv::AMDGPU_GS:
+ return 3;
+ case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_LS:
+ case CallingConv::AMDGPU_ES:
+ report_fatal_error("ds_ordered_count unsupported for this calling conv");
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::C:
+ case CallingConv::Fast:
+ default:
+ // Assume other calling conventions are various compute callable functions
+ return 0;
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 53e2ffba0f65..ce59fe86c688 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -14,22 +14,12 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H
#define LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H
-#include "AMDGPUInstrInfo.h"
-#include "SIDefines.h"
+#include "AMDGPUMIRFormatter.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SetVector.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/Support/Compiler.h"
-#include <cassert>
-#include <cstdint>
#define GET_INSTRINFO_HEADER
#include "AMDGPUGenInstrInfo.inc"
@@ -37,17 +27,20 @@
namespace llvm {
class APInt;
+class GCNSubtarget;
+class LiveVariables;
class MachineDominatorTree;
class MachineRegisterInfo;
class RegScavenger;
-class GCNSubtarget;
class TargetRegisterClass;
+class ScheduleHazardRecognizer;
class SIInstrInfo final : public AMDGPUGenInstrInfo {
private:
const SIRegisterInfo RI;
const GCNSubtarget &ST;
TargetSchedModel SchedModel;
+ mutable std::unique_ptr<AMDGPUMIRFormatter> Formatter;
// The inverse predicate should have the negative value.
enum BranchPredicate {
@@ -81,8 +74,9 @@ public:
private:
void swapOperands(MachineInstr &Inst) const;
- bool moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
- MachineDominatorTree *MDT = nullptr) const;
+ std::pair<bool, MachineBasicBlock *>
+ moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT = nullptr) const;
void lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
MachineDominatorTree *MDT = nullptr) const;
@@ -201,10 +195,6 @@ public:
const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg,
bool KillSrc) const override;
- unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB, MachineInstr &MI,
- RegScavenger *RS, unsigned TmpReg,
- unsigned Offset, unsigned Size) const;
-
void materializeImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const DebugLoc &DL,
@@ -248,9 +238,12 @@ public:
// DstRC, then AMDGPU::COPY is returned.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;
- const MCInstrDesc &getIndirectRegWritePseudo(
- unsigned VecSize, unsigned EltSize, bool IsSGPR) const;
+ const MCInstrDesc &getIndirectRegWriteMovRelPseudo(unsigned VecSize,
+ unsigned EltSize,
+ bool IsSGPR) const;
+ const MCInstrDesc &getIndirectGPRIDXPseudo(unsigned VecSize,
+ bool IsIndirectSrc) const;
LLVM_READONLY
int commuteOpcode(unsigned Opc) const;
@@ -508,12 +501,28 @@ public:
// i.e. global_* or scratch_*.
static bool isSegmentSpecificFLAT(const MachineInstr &MI) {
auto Flags = MI.getDesc().TSFlags;
- return (Flags & SIInstrFlags::FLAT) && !(Flags & SIInstrFlags::LGKM_CNT);
+ return Flags & (SIInstrFlags::IsFlatGlobal | SIInstrFlags::IsFlatScratch);
+ }
+
+ bool isSegmentSpecificFLAT(uint16_t Opcode) const {
+ auto Flags = get(Opcode).TSFlags;
+ return Flags & (SIInstrFlags::IsFlatGlobal | SIInstrFlags::IsFlatScratch);
+ }
+
+ static bool isFLATGlobal(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::IsFlatGlobal;
+ }
+
+ bool isFLATGlobal(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::IsFlatGlobal;
}
- // FIXME: Make this more precise
static bool isFLATScratch(const MachineInstr &MI) {
- return isSegmentSpecificFLAT(MI);
+ return MI.getDesc().TSFlags & SIInstrFlags::IsFlatScratch;
+ }
+
+ bool isFLATScratch(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::IsFlatScratch;
}
// Any FLAT encoded instruction, including global_* and scratch_*.
@@ -569,6 +578,14 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::DPP;
}
+ static bool isTRANS(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::TRANS;
+ }
+
+ bool isTRANS(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::TRANS;
+ }
+
static bool isVOP3P(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::VOP3P;
}
@@ -677,7 +694,7 @@ public:
bool isVGPRCopy(const MachineInstr &MI) const {
assert(MI.isCopy());
- unsigned Dest = MI.getOperand(0).getReg();
+ Register Dest = MI.getOperand(0).getReg();
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
return !RI.isSGPRReg(MRI, Dest);
@@ -883,6 +900,7 @@ public:
MachineRegisterInfo &MRI) const;
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const;
+ void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const;
void legalizeGenericOperand(MachineBasicBlock &InsertMBB,
MachineBasicBlock::iterator I,
@@ -893,20 +911,22 @@ public:
/// Legalize all operands in this instruction. This function may create new
/// instructions and control-flow around \p MI. If present, \p MDT is
/// updated.
- void legalizeOperands(MachineInstr &MI,
- MachineDominatorTree *MDT = nullptr) const;
+ /// \returns A new basic block that contains \p MI if new blocks were created.
+ MachineBasicBlock *
+ legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT = nullptr) const;
/// Replace this instruction's opcode with the equivalent VALU
/// opcode. This function will also move the users of \p MI to the
/// VALU if necessary. If present, \p MDT is updated.
- void moveToVALU(MachineInstr &MI, MachineDominatorTree *MDT = nullptr) const;
-
- void insertWaitStates(MachineBasicBlock &MBB,MachineBasicBlock::iterator MI,
- int Count) const;
+ MachineBasicBlock *moveToVALU(MachineInstr &MI,
+ MachineDominatorTree *MDT = nullptr) const;
void insertNoop(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;
+ void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ unsigned Quantity) const override;
+
void insertReturn(MachineBasicBlock &MBB) const;
/// Return the number of wait states that result from executing this
/// instruction.
@@ -1015,14 +1035,18 @@ public:
return isUInt<12>(Imm);
}
- unsigned getNumFlatOffsetBits(unsigned AddrSpace, bool Signed) const;
-
/// Returns if \p Offset is legal for the subtarget as the offset to a FLAT
/// encoded instruction. If \p Signed, this is for an instruction that
/// interprets the offset as signed.
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
bool Signed) const;
+ /// Split \p COffsetVal into {immediate offset field, remainder offset}
+ /// values.
+ std::pair<int64_t, int64_t> splitFlatOffset(int64_t COffsetVal,
+ unsigned AddrSpace,
+ bool IsSigned) const;
+
/// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
/// Return -1 if the target-specific opcode for the pseudo instruction does
/// not exist. If Opcode is not a pseudo instruction, this is identity.
@@ -1053,6 +1077,14 @@ public:
unsigned getInstrLatency(const InstrItineraryData *ItinData,
const MachineInstr &MI,
unsigned *PredCost = nullptr) const override;
+
+ const MIRFormatter *getMIRFormatter() const override {
+ if (!Formatter.get())
+ Formatter = std::make_unique<AMDGPUMIRFormatter>();
+ return Formatter.get();
+ }
+
+ static unsigned getDSShaderTypeValue(const MachineFunction &MF);
};
/// \brief Returns true if a reg:subreg pair P has a TRC class
@@ -1148,6 +1180,12 @@ namespace AMDGPU {
LLVM_READONLY
int getVCMPXNoSDstOp(uint16_t Opcode);
+ LLVM_READONLY
+ int getFlatScratchInstSTfromSS(uint16_t Opcode);
+
+ LLVM_READONLY
+ int getFlatScratchInstSSfromSV(uint16_t Opcode);
+
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 7aee52f91360..5adc9e817d41 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -28,7 +28,6 @@ def SIEncodingFamily {
int GFX9 = 5;
int GFX10 = 6;
int SDWA10 = 7;
- int GFX10_B = 8;
}
//===----------------------------------------------------------------------===//
@@ -55,10 +54,6 @@ def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2,
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
-def SIatomic_csub : SDNode<"AMDGPUISD::ATOMIC_LOAD_CSUB", SDTAtomic2,
- [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
->;
-
def SDTAtomic2_f32 : SDTypeProfile<1, 2, [
SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1>
]>;
@@ -177,19 +172,6 @@ class SDBufferAtomic<string opcode> : SDNode <opcode,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
>;
-class SDBufferAtomicNoRtn<string opcode, ValueType ty> : SDNode <opcode,
- SDTypeProfile<0, 8,
- [SDTCisVT<0, ty>, // vdata
- SDTCisVT<1, v4i32>, // rsrc
- SDTCisVT<2, i32>, // vindex(VGPR)
- SDTCisVT<3, i32>, // voffset(VGPR)
- SDTCisVT<4, i32>, // soffset(SGPR)
- SDTCisVT<5, i32>, // offset(imm)
- SDTCisVT<6, i32>, // cachepolicy(imm)
- SDTCisVT<7, i1>]>, // idxen(imm)
- [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
->;
-
def SIbuffer_atomic_swap : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SWAP">;
def SIbuffer_atomic_add : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_ADD">;
def SIbuffer_atomic_sub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SUB">;
@@ -203,8 +185,7 @@ def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">;
def SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">;
def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">;
def SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">;
-def SIbuffer_atomic_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_FADD", f32>;
-def SIbuffer_atomic_pk_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_PK_FADD", v2f16>;
+def SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">;
def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
SDTypeProfile<1, 9,
@@ -228,8 +209,6 @@ class SDGlobalAtomicNoRtn<string opcode, ValueType ty> : SDNode <opcode,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
>;
-def SIglobal_atomic_pk_fadd : SDGlobalAtomicNoRtn <"AMDGPUISD::ATOMIC_PK_FADD", v2f16>;
-
def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",
SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>
>;
@@ -280,41 +259,31 @@ def SIdenorm_mode : SDNode<"AMDGPUISD::DENORM_MODE",
// Returns 1 if the source arguments have modifiers, 0 if they do not.
// XXX - do f16 instructions?
class isFloatType<ValueType SrcVT> {
- bit ret =
- !if(!eq(SrcVT.Value, f16.Value), 1,
- !if(!eq(SrcVT.Value, f32.Value), 1,
- !if(!eq(SrcVT.Value, f64.Value), 1,
- !if(!eq(SrcVT.Value, v2f16.Value), 1,
- !if(!eq(SrcVT.Value, v4f16.Value), 1,
- !if(!eq(SrcVT.Value, v2f32.Value), 1,
- !if(!eq(SrcVT.Value, v2f64.Value), 1,
- 0)))))));
+ bit ret = !or(!eq(SrcVT.Value, f16.Value),
+ !eq(SrcVT.Value, f32.Value),
+ !eq(SrcVT.Value, f64.Value),
+ !eq(SrcVT.Value, v2f16.Value),
+ !eq(SrcVT.Value, v4f16.Value),
+ !eq(SrcVT.Value, v2f32.Value),
+ !eq(SrcVT.Value, v2f64.Value));
}
class isIntType<ValueType SrcVT> {
- bit ret =
- !if(!eq(SrcVT.Value, i16.Value), 1,
- !if(!eq(SrcVT.Value, i32.Value), 1,
- !if(!eq(SrcVT.Value, i64.Value), 1,
- 0)));
+ bit ret = !or(!eq(SrcVT.Value, i16.Value),
+ !eq(SrcVT.Value, i32.Value),
+ !eq(SrcVT.Value, i64.Value));
}
class isPackedType<ValueType SrcVT> {
- bit ret =
- !if(!eq(SrcVT.Value, v2i16.Value), 1,
- !if(!eq(SrcVT.Value, v2f16.Value), 1,
- !if(!eq(SrcVT.Value, v4f16.Value), 1, 0)
- ));
+ bit ret = !or(!eq(SrcVT.Value, v2i16.Value),
+ !eq(SrcVT.Value, v2f16.Value),
+ !eq(SrcVT.Value, v4f16.Value));
}
//===----------------------------------------------------------------------===//
// PatFrags for global memory operations
//===----------------------------------------------------------------------===//
-let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_global").AddrSpaces in {
-defm atomic_csub_global : binary_atomic_op<SIatomic_csub>;
-}
-
foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
@@ -328,23 +297,6 @@ defm atomic_load_fmax_#as : binary_atomic_op<SIatomic_fmax, 0>;
} // End let AddressSpaces = ...
} // End foreach AddrSpace
-def atomic_fadd_global_noret : PatFrag<
- (ops node:$ptr, node:$value),
- (atomic_load_fadd node:$ptr, node:$value)> {
- // FIXME: Move this
- let MemoryVT = f32;
- let IsAtomic = 1;
- let AddressSpaces = StoreAddress_global.AddrSpaces;
-}
-
-def atomic_pk_fadd_global_noret : PatFrag<
- (ops node:$ptr, node:$value),
- (SIglobal_atomic_pk_fadd node:$ptr, node:$value)> {
- // FIXME: Move this
- let MemoryVT = v2f16;
- let IsAtomic = 1;
- let AddressSpaces = StoreAddress_global.AddrSpaces;
-}
//===----------------------------------------------------------------------===//
// SDNodes PatFrags for loads/stores with a glue input.
@@ -450,16 +402,15 @@ def zextloadi16_local_m0 : PatFrag<(ops node:$ptr), (zextloadi16_glue node:$ptr)
}
def load_align8_local_m0 : PatFrag<(ops node:$ptr),
- (load_local_m0 node:$ptr)> {
+ (load_local_m0 node:$ptr)>, Aligned<8> {
let IsLoad = 1;
let IsNonExtLoad = 1;
- let MinAlignment = 8;
}
+
def load_align16_local_m0 : PatFrag<(ops node:$ptr),
- (load_local_m0 node:$ptr)> {
+ (load_local_m0 node:$ptr)>, Aligned<16> {
let IsLoad = 1;
let IsNonExtLoad = 1;
- let MinAlignment = 16;
}
} // End IsLoad = 1
@@ -535,20 +486,18 @@ def truncstorei16_local_m0 : PatFrag<(ops node:$val, node:$ptr),
}
}
-def store_align16_local_m0 : PatFrag <
- (ops node:$value, node:$ptr),
- (store_local_m0 node:$value, node:$ptr)> {
+def store_align8_local_m0 : PatFrag <(ops node:$value, node:$ptr),
+ (store_local_m0 node:$value, node:$ptr)>,
+ Aligned<8> {
let IsStore = 1;
let IsTruncStore = 0;
- let MinAlignment = 16;
}
-def store_align8_local_m0 : PatFrag <
- (ops node:$value, node:$ptr),
- (store_local_m0 node:$value, node:$ptr)> {
+def store_align16_local_m0 : PatFrag <(ops node:$value, node:$ptr),
+ (store_local_m0 node:$value, node:$ptr)>,
+ Aligned<16> {
let IsStore = 1;
let IsTruncStore = 0;
- let MinAlignment = 8;
}
let AddressSpaces = StoreAddress_local.AddrSpaces in {
@@ -583,6 +532,48 @@ def si_setcc_uniform : PatFrag <
}]>;
//===----------------------------------------------------------------------===//
+// SDNodes PatFrags for a16 loads and stores with 3 components.
+// v3f16/v3i16 is widened to v4f16/v4i16, so we need to match on the memory
+// load/store size.
+//===----------------------------------------------------------------------===//
+
+class mubuf_intrinsic_load<SDPatternOperator name, ValueType vt> : PatFrag <
+ (ops node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset,
+ node:$auxiliary, node:$idxen),
+ (name node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset,
+ node:$auxiliary, node:$idxen)> {
+ let IsLoad = 1;
+ let MemoryVT = vt;
+}
+
+class mubuf_intrinsic_store<SDPatternOperator name, ValueType vt> : PatFrag <
+ (ops node:$vdata, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset,
+ node:$auxiliary, node:$idxen),
+ (name node:$vdata, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset,
+ node:$auxiliary, node:$idxen)> {
+ let IsStore = 1;
+ let MemoryVT = vt;
+}
+
+class mtbuf_intrinsic_load<SDPatternOperator name, ValueType vt> : PatFrag <
+ (ops node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset,
+ node:$format, node:$auxiliary, node:$idxen),
+ (name node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset,
+ node:$format, node:$auxiliary, node:$idxen)> {
+ let IsLoad = 1;
+ let MemoryVT = vt;
+}
+
+class mtbuf_intrinsic_store<SDPatternOperator name, ValueType vt> : PatFrag <
+ (ops node:$vdata, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset,
+ node:$format, node:$auxiliary, node:$idxen),
+ (name node:$vdata, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset,
+ node:$format, node:$auxiliary, node:$idxen)> {
+ let IsStore = 1;
+ let MemoryVT = vt;
+}
+
+//===----------------------------------------------------------------------===//
// SDNodes PatFrags for d16 loads
//===----------------------------------------------------------------------===//
@@ -668,7 +659,6 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
-defm atomic_load_csub : SIAtomicM0Glue2 <"LOAD_CSUB", 1>;
defm atomic_inc : SIAtomicM0Glue2 <"INC", 1>;
defm atomic_dec : SIAtomicM0Glue2 <"DEC", 1>;
defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
@@ -1051,6 +1041,12 @@ class NamedOperandBit_0<string Name, AsmOperandClass MatchClass> :
let ParserMatchClass = MatchClass;
}
+class NamedOperandBit_1<string Name, AsmOperandClass MatchClass> :
+ OperandWithDefaultOps<i1, (ops (i1 1))> {
+ let PrintMethod = "print"#Name;
+ let ParserMatchClass = MatchClass;
+}
+
class NamedOperandU8<string Name, AsmOperandClass MatchClass> : Operand<i8> {
let PrintMethod = "print"#Name;
let ParserMatchClass = MatchClass;
@@ -1102,8 +1098,15 @@ def clampmod0 : NamedOperandBit_0<"ClampSI", NamedMatchClass<"ClampSI">>;
def highmod : NamedOperandBit<"High", NamedMatchClass<"High">>;
def DLC : NamedOperandBit<"DLC", NamedMatchClass<"DLC">>;
+def DLC_0 : NamedOperandBit_0<"DLC", NamedMatchClass<"DLC">>;
+
def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>;
+def GLC_0 : NamedOperandBit_0<"GLC", NamedMatchClass<"GLC">>;
+def GLC_1 : NamedOperandBit_1<"GLC", NamedMatchClass<"GLC_1">>;
+
def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>;
+def SLC_0 : NamedOperandBit_0<"SLC", NamedMatchClass<"SLC">>;
+
def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>;
def SWZ : NamedOperandBit<"SWZ", NamedMatchClass<"SWZ">>;
def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>;
@@ -1115,7 +1118,7 @@ def LWE : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>;
def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>;
def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>;
-def FORMAT : NamedOperandU8<"FORMAT", NamedMatchClass<"FORMAT">>;
+def FORMAT : NamedOperandU8<"FORMAT", NamedMatchClass<"FORMAT", 0>>;
def DMask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
def Dim : NamedOperandU8<"Dim", NamedMatchClass<"Dim", 0>>;
@@ -1133,10 +1136,10 @@ def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>;
def src1_sel : NamedOperandU32<"SDWASrc1Sel", NamedMatchClass<"SDWASrc1Sel">>;
def dst_unused : NamedOperandU32<"SDWADstUnused", NamedMatchClass<"SDWADstUnused">>;
-def op_sel : NamedOperandU32Default0<"OpSel", NamedMatchClass<"OpSel">>;
-def op_sel_hi : NamedOperandU32Default0<"OpSelHi", NamedMatchClass<"OpSelHi">>;
-def neg_lo : NamedOperandU32Default0<"NegLo", NamedMatchClass<"NegLo">>;
-def neg_hi : NamedOperandU32Default0<"NegHi", NamedMatchClass<"NegHi">>;
+def op_sel0 : NamedOperandU32Default0<"OpSel", NamedMatchClass<"OpSel">>;
+def op_sel_hi0 : NamedOperandU32Default0<"OpSelHi", NamedMatchClass<"OpSelHi">>;
+def neg_lo0 : NamedOperandU32Default0<"NegLo", NamedMatchClass<"NegLo">>;
+def neg_hi0 : NamedOperandU32Default0<"NegHi", NamedMatchClass<"NegHi">>;
def blgp : NamedOperandU32<"BLGP", NamedMatchClass<"BLGP">>;
def cbsz : NamedOperandU32<"CBSZ", NamedMatchClass<"CBSZ">>;
@@ -1308,11 +1311,11 @@ def PackedI16InputMods : PackedIntInputMods<PackedI16InputModsMatchClass>;
def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">;
def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">;
+def DS128Bit8ByteAligned : ComplexPattern<i64, 3, "SelectDS128Bit8ByteAligned">;
def MOVRELOffset : ComplexPattern<i32, 2, "SelectMOVRELOffset">;
def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
-def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;
def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
def VOP3NoMods : ComplexPattern<untyped, 1, "SelectVOP3NoMods">;
// VOP3Mods, but the input source is known to never be NaN.
@@ -1328,9 +1331,6 @@ def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">;
def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
-
-def Hi16Elt : ComplexPattern<untyped, 1, "SelectHi16Elt">;
-
//===----------------------------------------------------------------------===//
// SI assembler operands
//===----------------------------------------------------------------------===//
@@ -1389,9 +1389,9 @@ def HWREG {
}
class getHwRegImm<int Reg, int Offset = 0, int Size = 32> {
- int ret = !or(Reg,
- !or(!shl(Offset, 6),
- !shl(!add(Size, -1), 11)));
+ int ret = !and(!or(Reg,
+ !shl(Offset, 6),
+ !shl(!add(Size, -1), 11)), 65535);
}
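
The new getHwRegImm body is the same Reg | (Offset << 6) | ((Size - 1) << 11) packing as before, now additionally masked to 16 bits by the !and with 65535. A minimal C++ sketch of that arithmetic, with hypothetical field values, for readers who want to check the encoding by hand:

  #include <cassert>
  #include <cstdint>

  // Mirrors getHwRegImm: Reg in the low bits, Offset shifted by 6,
  // Size-1 shifted by 11, truncated to 16 bits.
  constexpr uint32_t hwRegImm(uint32_t Reg, uint32_t Offset, uint32_t Size) {
    return (Reg | (Offset << 6) | ((Size - 1) << 11)) & 0xffff;
  }

  int main() {
    // Hypothetical example: register id 6, offset 0, full 32-bit width.
    assert(hwRegImm(6, 0, 32) == (6u | (31u << 11)));
    return 0;
  }
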
//===----------------------------------------------------------------------===//
@@ -1416,56 +1416,6 @@ class SIMCInstr <string pseudo, int subtarget> {
}
//===----------------------------------------------------------------------===//
-// EXP classes
-//===----------------------------------------------------------------------===//
-
-class EXP_Helper<bit done> : EXPCommon<
- (outs),
- (ins exp_tgt:$tgt,
- ExpSrc0:$src0, ExpSrc1:$src1, ExpSrc2:$src2, ExpSrc3:$src3,
- exp_vm:$vm, exp_compr:$compr, i32imm:$en),
- "exp$tgt $src0, $src1, $src2, $src3"#!if(done, " done", "")#"$compr$vm", []> {
- let AsmMatchConverter = "cvtExp";
-}
-
-// Split EXP instruction into EXP and EXP_DONE so we can set
-// mayLoad for done=1.
-multiclass EXP_m<bit done> {
- let mayLoad = done, DisableWQM = 1 in {
- let isPseudo = 1, isCodeGenOnly = 1 in {
- def "" : EXP_Helper<done>,
- SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.NONE>;
- }
-
- let done = done in {
- def _si : EXP_Helper<done>,
- SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.SI>,
- EXPe {
- let AssemblerPredicate = isGFX6GFX7;
- let DecoderNamespace = "GFX6GFX7";
- let DisableDecoder = DisableSIDecoder;
- }
-
- def _vi : EXP_Helper<done>,
- SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.VI>,
- EXPe_vi {
- let AssemblerPredicate = isGFX8GFX9;
- let DecoderNamespace = "GFX8";
- let DisableDecoder = DisableVIDecoder;
- }
-
- def _gfx10 : EXP_Helper<done>,
- SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.GFX10>,
- EXPe {
- let AssemblerPredicate = isGFX10Plus;
- let DecoderNamespace = "GFX10";
- let DisableDecoder = DisableSIDecoder;
- }
- }
- }
-}
-
-//===----------------------------------------------------------------------===//
// Vector ALU classes
//===----------------------------------------------------------------------===//
@@ -1528,6 +1478,10 @@ class getVOPSrc0ForVT<ValueType VT> {
);
}
+class getSOPSrcForVT<ValueType VT> {
+ RegisterOperand ret = !if(!eq(VT.Size, 64), SSrc_b64, SSrc_b32);
+}
+
// Returns the vreg register class to use for source operand given VT
class getVregSrcForVT<ValueType VT> {
RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128,
@@ -1583,13 +1537,11 @@ class getVOP3SrcForVT<ValueType VT> {
// Float or packed int
class isModifierType<ValueType SrcVT> {
- bit ret =
- !if(!eq(SrcVT.Value, f16.Value), 1,
- !if(!eq(SrcVT.Value, f32.Value), 1,
- !if(!eq(SrcVT.Value, f64.Value), 1,
- !if(!eq(SrcVT.Value, v2f16.Value), 1,
- !if(!eq(SrcVT.Value, v2i16.Value), 1,
- 0)))));
+ bit ret = !or(!eq(SrcVT.Value, f16.Value),
+ !eq(SrcVT.Value, f32.Value),
+ !eq(SrcVT.Value, f64.Value),
+ !eq(SrcVT.Value, v2f16.Value),
+ !eq(SrcVT.Value, v2i16.Value));
}
// Return type of input modifiers operand for specified input operand
@@ -1612,7 +1564,7 @@ class getOpSelMod <ValueType VT> {
}
// Return type of input modifiers operand specified input operand for DPP
-class getSrcModExt <ValueType VT> {
+class getSrcModDPP <ValueType VT> {
bit isFP = isFloatType<VT>.ret;
Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods);
}
@@ -1635,7 +1587,7 @@ class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
// Returns the input arguments for VOP3 instructions for the given SrcVT.
class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
RegisterOperand Src2RC, int NumSrcArgs,
- bit HasIntClamp, bit HasModifiers, bit HasSrc2Mods, bit HasOMod,
+ bit HasClamp, bit HasModifiers, bit HasSrc2Mods, bit HasOMod,
Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
dag ret =
@@ -1644,20 +1596,20 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
(ins),
/* else */
!if (!eq(NumSrcArgs, 1),
- !if (!eq(HasModifiers, 1),
+ !if (HasModifiers,
// VOP1 with modifiers
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
clampmod0:$clamp, omod0:$omod)
/* else */,
// VOP1 without modifiers
- !if (!eq(HasIntClamp, 1),
+ !if (HasClamp,
(ins Src0RC:$src0, clampmod0:$clamp),
(ins Src0RC:$src0))
/* endif */ ),
!if (!eq(NumSrcArgs, 2),
- !if (!eq(HasModifiers, 1),
+ !if (HasModifiers,
// VOP 2 with modifiers
- !if( !eq(HasOMod, 1),
+ !if(HasOMod,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
clampmod0:$clamp, omod0:$omod),
@@ -1666,21 +1618,21 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
clampmod0:$clamp))
/* else */,
// VOP2 without modifiers
- !if (!eq(HasIntClamp, 1),
+ !if (HasClamp,
(ins Src0RC:$src0, Src1RC:$src1, clampmod0:$clamp),
(ins Src0RC:$src0, Src1RC:$src1))
/* endif */ )
/* NumSrcArgs == 3 */,
- !if (!eq(HasModifiers, 1),
- !if (!eq(HasSrc2Mods, 1),
+ !if (HasModifiers,
+ !if (HasSrc2Mods,
// VOP3 with modifiers
- !if (!eq(HasOMod, 1),
+ !if (HasOMod,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
Src2Mod:$src2_modifiers, Src2RC:$src2,
clampmod0:$clamp, omod0:$omod),
- !if (!eq(HasIntClamp, 1),
+ !if (HasClamp,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
Src2Mod:$src2_modifiers, Src2RC:$src2,
@@ -1689,11 +1641,11 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
Src1Mod:$src1_modifiers, Src1RC:$src1,
Src2Mod:$src2_modifiers, Src2RC:$src2))),
// VOP3 with modifiers except src2
- !if (!eq(HasOMod, 1),
+ !if (HasOMod,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
Src2RC:$src2, clampmod0:$clamp, omod0:$omod),
- !if (!eq(HasIntClamp, 1),
+ !if (HasClamp,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
Src2RC:$src2, clampmod0:$clamp),
@@ -1702,119 +1654,87 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
Src2RC:$src2))))
/* else */,
// VOP3 without modifiers
- !if (!eq(HasIntClamp, 1),
+ !if (HasClamp,
(ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2, clampmod0:$clamp),
(ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2))
/* endif */ ))));
}
-/// XXX - src1 may only allow VGPRs?
+class getInsVOP3Base<RegisterOperand Src0RC, RegisterOperand Src1RC,
+ RegisterOperand Src2RC, int NumSrcArgs,
+ bit HasClamp, bit HasModifiers, bit HasSrc2Mods, bit HasOMod,
+ Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOpSel,
+ bit IsVOP3P> {
+  // getIns64 handles clamp and omod. Implicit mutex between vop3p and omod.
+ dag base = getIns64 <Src0RC, Src1RC, Src2RC, NumSrcArgs,
+ HasClamp, HasModifiers, HasSrc2Mods, HasOMod,
+ Src0Mod, Src1Mod, Src2Mod>.ret;
+ dag opsel = (ins op_sel0:$op_sel);
+ dag vop3pFields = (ins op_sel_hi0:$op_sel_hi, neg_lo0:$neg_lo, neg_hi0:$neg_hi);
+ dag ret = !con(base,
+ !if(HasOpSel, opsel,(ins)),
+ !if(IsVOP3P, vop3pFields,(ins)));
+}
-// The modifiers (except clamp) are dummy operands for the benefit of
-// printing and parsing. They defer their values to looking at the
-// srcN_modifiers for what to print.
class getInsVOP3P <RegisterOperand Src0RC, RegisterOperand Src1RC,
- RegisterOperand Src2RC, int NumSrcArgs,
- bit HasClamp,
+ RegisterOperand Src2RC, int NumSrcArgs, bit HasClamp,
Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
- dag ret = !if (!eq(NumSrcArgs, 2),
- !if (HasClamp,
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- clampmod0:$clamp,
- op_sel:$op_sel, op_sel_hi:$op_sel_hi,
- neg_lo:$neg_lo, neg_hi:$neg_hi),
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- op_sel:$op_sel, op_sel_hi:$op_sel_hi,
- neg_lo:$neg_lo, neg_hi:$neg_hi)),
- // else NumSrcArgs == 3
- !if (HasClamp,
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- Src2Mod:$src2_modifiers, Src2RC:$src2,
- clampmod0:$clamp,
- op_sel:$op_sel, op_sel_hi:$op_sel_hi,
- neg_lo:$neg_lo, neg_hi:$neg_hi),
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- Src2Mod:$src2_modifiers, Src2RC:$src2,
- op_sel:$op_sel, op_sel_hi:$op_sel_hi,
- neg_lo:$neg_lo, neg_hi:$neg_hi))
- );
+ dag ret = getInsVOP3Base<Src0RC, Src1RC, Src2RC, NumSrcArgs,
+ HasClamp, 1/*HasModifiers*/, 1/*HasSrc2Mods*/,
+ 0/*HasOMod*/, Src0Mod, Src1Mod, Src2Mod,
+ 1/*HasOpSel*/, 1/*IsVOP3P*/>.ret;
}
-class getInsVOP3OpSel <RegisterOperand Src0RC,
- RegisterOperand Src1RC,
- RegisterOperand Src2RC,
- int NumSrcArgs,
- bit HasClamp,
- Operand Src0Mod,
- Operand Src1Mod,
- Operand Src2Mod> {
- dag ret = !if (!eq(NumSrcArgs, 2),
- !if (HasClamp,
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- clampmod0:$clamp,
- op_sel:$op_sel),
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- op_sel:$op_sel)),
- // else NumSrcArgs == 3
- !if (HasClamp,
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- Src2Mod:$src2_modifiers, Src2RC:$src2,
- clampmod0:$clamp,
- op_sel:$op_sel),
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- Src2Mod:$src2_modifiers, Src2RC:$src2,
- op_sel:$op_sel))
- );
+class getInsVOP3OpSel <RegisterOperand Src0RC, RegisterOperand Src1RC,
+ RegisterOperand Src2RC, int NumSrcArgs,
+ bit HasClamp, bit HasOMod,
+ Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
+ dag ret = getInsVOP3Base<Src0RC, Src1RC,
+ Src2RC, NumSrcArgs,
+ HasClamp, 1/*HasModifiers*/, 1/*HasSrc2Mods*/, HasOMod,
+ Src0Mod, Src1Mod, Src2Mod, 1/*HasOpSel*/, 0>.ret;
}
-class getInsDPP <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC,
+class getInsDPPBase <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC,
int NumSrcArgs, bit HasModifiers,
Operand Src0Mod, Operand Src1Mod> {
dag ret = !if (!eq(NumSrcArgs, 0),
// VOP1 without input operands (V_NOP)
- (ins dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
- bank_mask:$bank_mask, bound_ctrl:$bound_ctrl),
+ (ins ),
!if (!eq(NumSrcArgs, 1),
- !if (!eq(HasModifiers, 1),
+ !if (HasModifiers,
// VOP1_DPP with modifiers
(ins DstRC:$old, Src0Mod:$src0_modifiers,
- Src0RC:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
- bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
+ Src0RC:$src0)
/* else */,
// VOP1_DPP without modifiers
- (ins DstRC:$old, Src0RC:$src0,
- dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
- bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
- /* endif */)
- /* NumSrcArgs == 2 */,
- !if (!eq(HasModifiers, 1),
+ (ins DstRC:$old, Src0RC:$src0)
+ /* endif */),
+ !if (HasModifiers,
// VOP2_DPP with modifiers
(ins DstRC:$old,
Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
- bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
+ Src1Mod:$src1_modifiers, Src1RC:$src1)
/* else */,
// VOP2_DPP without modifiers
(ins DstRC:$old,
- Src0RC:$src0, Src1RC:$src1, dpp_ctrl:$dpp_ctrl,
- row_mask:$row_mask, bank_mask:$bank_mask,
- bound_ctrl:$bound_ctrl)
- /* endif */)));
+ Src0RC:$src0, Src1RC:$src1)
+ )));
+}
+
+class getInsDPP <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC,
+ int NumSrcArgs, bit HasModifiers,
+ Operand Src0Mod, Operand Src1Mod> {
+ dag ret = !con(getInsDPPBase<DstRC, Src0RC, Src1RC, NumSrcArgs,
+ HasModifiers, Src0Mod, Src1Mod>.ret,
+ (ins dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+ bank_mask:$bank_mask, bound_ctrl:$bound_ctrl));
}
class getInsDPP16 <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC,
- int NumSrcArgs, bit HasModifiers,
- Operand Src0Mod, Operand Src1Mod> {
+ int NumSrcArgs, bit HasModifiers,
+ Operand Src0Mod, Operand Src1Mod> {
dag ret = !con(getInsDPP<DstRC, Src0RC, Src1RC, NumSrcArgs,
HasModifiers, Src0Mod, Src1Mod>.ret,
(ins FI:$fi));
@@ -1823,30 +1743,9 @@ class getInsDPP16 <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Sr
class getInsDPP8 <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC,
int NumSrcArgs, bit HasModifiers,
Operand Src0Mod, Operand Src1Mod> {
- dag ret = !if (!eq(NumSrcArgs, 0),
- // VOP1 without input operands (V_NOP)
- (ins dpp8:$dpp8, FI:$fi),
- !if (!eq(NumSrcArgs, 1),
- !if (!eq(HasModifiers, 1),
- // VOP1_DPP with modifiers
- (ins DstRC:$old, Src0Mod:$src0_modifiers,
- Src0RC:$src0, dpp8:$dpp8, FI:$fi)
- /* else */,
- // VOP1_DPP without modifiers
- (ins DstRC:$old, Src0RC:$src0, dpp8:$dpp8, FI:$fi)
- /* endif */)
- /* NumSrcArgs == 2 */,
- !if (!eq(HasModifiers, 1),
- // VOP2_DPP with modifiers
- (ins DstRC:$old,
- Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- dpp8:$dpp8, FI:$fi)
- /* else */,
- // VOP2_DPP without modifiers
- (ins DstRC:$old,
- Src0RC:$src0, Src1RC:$src1, dpp8:$dpp8, FI:$fi)
- /* endif */)));
+ dag ret = !con(getInsDPPBase<DstRC, Src0RC, Src1RC, NumSrcArgs,
+ HasModifiers, Src0Mod, Src1Mod>.ret,
+ (ins dpp8:$dpp8, FI:$fi));
}
@@ -1860,7 +1759,7 @@ class getInsSDWA <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs
(ins),
!if(!eq(NumSrcArgs, 1),
// VOP1
- !if(!eq(HasSDWAOMod, 0),
+ !if(!not(HasSDWAOMod),
// VOP1_SDWA without omod
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
clampmod:$clamp,
@@ -1878,7 +1777,7 @@ class getInsSDWA <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs
Src1Mod:$src1_modifiers, Src1RC:$src1,
clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel),
// VOP2_SDWA
- !if(!eq(HasSDWAOMod, 0),
+ !if(!not(HasSDWAOMod),
// VOP2_SDWA without omod
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
@@ -1894,12 +1793,12 @@ class getInsSDWA <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs
(ins)/* endif */)));
}
-// Outs for DPP and SDWA
-class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCExt> {
+// Outs for DPP
+class getOutsDPP <bit HasDst, ValueType DstVT, RegisterOperand DstRCDPP> {
dag ret = !if(HasDst,
!if(!eq(DstVT.Size, 1),
(outs), // no dst for VOPC, we use "vcc"-token as dst in SDWA VOPC instructions
- (outs DstRCExt:$vdst)),
+ (outs DstRCDPP:$vdst)),
(outs)); // V_NOP
}
@@ -1938,7 +1837,7 @@ class getAsm64 <bit HasDst, int NumSrcArgs, bit HasIntClamp, bit HasModifiers,
string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", "");
string iclamp = !if(HasIntClamp, "$clamp", "");
string ret =
- !if(!eq(HasModifiers, 0),
+ !if(!not(HasModifiers),
getAsm32<HasDst, NumSrcArgs, DstVT>.ret # iclamp,
dst#", "#src0#src1#src2#"$clamp"#!if(HasOMod, "$omod", ""));
}
@@ -1964,6 +1863,7 @@ class getAsmVOP3P <bit HasDst, int NumSrcArgs, bit HasModifiers,
class getAsmVOP3OpSel <int NumSrcArgs,
bit HasClamp,
+ bit HasOMod,
bit Src0HasMods,
bit Src1HasMods,
bit Src2HasMods> {
@@ -2000,7 +1900,7 @@ class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT =
string src1 = !if(!eq(NumSrcArgs, 1), "",
!if(!eq(NumSrcArgs, 2), " $src1_modifiers",
" $src1_modifiers,"));
- string args = !if(!eq(HasModifiers, 0),
+ string args = !if(!not(HasModifiers),
getAsm32<0, NumSrcArgs, DstVT>.ret,
", "#src0#src1);
string ret = dst#args#" $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
@@ -2010,22 +1910,12 @@ class getAsmDPP16 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT
string ret = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret#"$fi";
}
-class getAsmDPP8 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
- string dst = !if(HasDst,
- !if(!eq(DstVT.Size, 1),
- "$sdst",
- "$vdst"),
- ""); // use $sdst for VOPC
- string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
- string src1 = !if(!eq(NumSrcArgs, 1), "",
- !if(!eq(NumSrcArgs, 2), " $src1_modifiers",
- " $src1_modifiers,"));
- string args = !if(!eq(HasModifiers, 0),
- getAsm32<0, NumSrcArgs, DstVT>.ret,
- ", "#src0#src1);
- string ret = dst#args#"$dpp8$fi";
+class getAsmDPP8 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32>
+ : getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT> {
+ let ret = dst#args#" $dpp8$fi";
}
+
class getAsmSDWA <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> {
string dst = !if(HasDst,
!if(!eq(DstVT.Size, 1),
@@ -2063,7 +1953,7 @@ class getAsmSDWA9 <bit HasDst, bit HasOMod, int NumSrcArgs,
"");
string src0 = "$src0_modifiers";
string src1 = "$src1_modifiers";
- string out_mods = !if(!eq(HasOMod, 0), "$clamp", "$clamp$omod");
+ string out_mods = !if(!not(HasOMod), "$clamp", "$clamp$omod");
string args = !if(!eq(NumSrcArgs, 0), "",
!if(!eq(NumSrcArgs, 1),
", "#src0,
@@ -2107,14 +1997,6 @@ class getHasDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
}
-class BitOr<bit a, bit b> {
- bit ret = !if(a, 1, !if(b, 1, 0));
-}
-
-class BitAnd<bit a, bit b> {
- bit ret = !if(a, !if(b, 1, 0), 0);
-}
-
def PatGenMode {
int NoPattern = 0;
int Pattern = 1;
@@ -2146,24 +2028,19 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
field Operand Src0Mod = getSrcMod<Src0VT, EnableF32SrcMods>.ret;
field Operand Src1Mod = getSrcMod<Src1VT, EnableF32SrcMods>.ret;
field Operand Src2Mod = getSrcMod<Src2VT, EnableF32SrcMods>.ret;
- field Operand Src0ModDPP = getSrcModExt<Src0VT>.ret;
- field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret;
+ field Operand Src0ModDPP = getSrcModDPP<Src0VT>.ret;
+ field Operand Src1ModDPP = getSrcModDPP<Src1VT>.ret;
field Operand Src0ModSDWA = getSrcModSDWA<Src0VT>.ret;
field Operand Src1ModSDWA = getSrcModSDWA<Src1VT>.ret;
- field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1);
+ field bit HasDst = !ne(DstVT.Value, untyped.Value);
field bit HasDst32 = HasDst;
field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case
field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret;
- field bit HasSrc0 = !if(!eq(Src0VT.Value, untyped.Value), 0, 1);
- field bit HasSrc1 = !if(!eq(Src1VT.Value, untyped.Value), 0, 1);
- field bit HasSrc2 = !if(!eq(Src2VT.Value, untyped.Value), 0, 1);
-
- // TODO: Modifiers logic is somewhat adhoc here, to be refined later
- // HasModifiers affects the normal and DPP encodings. We take note of EnableF32SrcMods, which
- // enables modifiers for i32 type.
- field bit HasModifiers = BitOr<isModifierType<Src0VT>.ret, EnableF32SrcMods>.ret;
+ field bit HasSrc0 = !ne(Src0VT.Value, untyped.Value);
+ field bit HasSrc1 = !ne(Src1VT.Value, untyped.Value);
+ field bit HasSrc2 = !ne(Src2VT.Value, untyped.Value);
// HasSrc*FloatMods affects the SDWA encoding. We ignore EnableF32SrcMods.
field bit HasSrc0FloatMods = isFloatType<Src0VT>.ret;
@@ -2175,16 +2052,12 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
field bit HasSrc1IntMods = isIntType<Src1VT>.ret;
field bit HasSrc2IntMods = isIntType<Src2VT>.ret;
- field bit HasSrc0Mods = HasModifiers;
- field bit HasSrc1Mods = !if(HasModifiers, BitOr<HasSrc1FloatMods, HasSrc1IntMods>.ret, 0);
- field bit HasSrc2Mods = !if(HasModifiers, BitOr<HasSrc2FloatMods, HasSrc2IntMods>.ret, 0);
-
- field bit HasClamp = BitOr<isModifierType<Src0VT>.ret, EnableClamp>.ret;
+ field bit HasClamp = !or(isModifierType<Src0VT>.ret, EnableClamp);
field bit HasSDWAClamp = EmitDst;
- field bit HasFPClamp = BitAnd<isFloatType<DstVT>.ret, HasClamp>.ret;
+ field bit HasFPClamp = !and(isFloatType<DstVT>.ret, HasClamp);
field bit HasIntClamp = !if(isFloatType<DstVT>.ret, 0, HasClamp);
field bit HasClampLo = HasClamp;
- field bit HasClampHi = BitAnd<isPackedType<DstVT>.ret, HasClamp>.ret;
+ field bit HasClampHi = !and(isPackedType<DstVT>.ret, HasClamp);
field bit HasHigh = 0;
field bit IsPacked = isPackedType<Src0VT>.ret;
@@ -2192,6 +2065,16 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
field bit HasOMod = !if(HasOpSel, 0, isFloatType<DstVT>.ret);
field bit HasSDWAOMod = isFloatType<DstVT>.ret;
+ field bit HasModifiers = !or(isModifierType<Src0VT>.ret,
+ isModifierType<Src1VT>.ret,
+ isModifierType<Src2VT>.ret,
+ HasOMod,
+ EnableF32SrcMods);
+
+ field bit HasSrc0Mods = HasModifiers;
+ field bit HasSrc1Mods = !if(HasModifiers, !or(HasSrc1FloatMods, HasSrc1IntMods), 0);
+ field bit HasSrc2Mods = !if(HasModifiers, !or(HasSrc2FloatMods, HasSrc2IntMods), 0);
+
field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
field bit HasExtDPP = getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
field bit HasExtSDWA = HasExt;
@@ -2211,8 +2094,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
// output. This is manually overridden for them.
field dag Outs32 = Outs;
field dag Outs64 = Outs;
- field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
- field dag OutsDPP8 = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
+ field dag OutsDPP = getOutsDPP<HasDst, DstVT, DstRCDPP>.ret;
+ field dag OutsDPP8 = getOutsDPP<HasDst, DstVT, DstRCDPP>.ret;
field dag OutsSDWA = getOutsSDWA<HasDst, DstVT, DstRCSDWA>.ret;
field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;
@@ -2223,11 +2106,10 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
NumSrcArgs, HasClamp,
Src0PackedMod, Src1PackedMod, Src2PackedMod>.ret;
field dag InsVOP3OpSel = getInsVOP3OpSel<Src0RC64, Src1RC64, Src2RC64,
- NumSrcArgs,
- HasClamp,
- getOpSelMod<Src0VT>.ret,
- getOpSelMod<Src1VT>.ret,
- getOpSelMod<Src2VT>.ret>.ret;
+ NumSrcArgs, HasClamp, HasOMod,
+ getOpSelMod<Src0VT>.ret,
+ getOpSelMod<Src1VT>.ret,
+ getOpSelMod<Src2VT>.ret>.ret;
field dag InsDPP = !if(HasExtDPP,
getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
HasModifiers, Src0ModDPP, Src1ModDPP>.ret,
@@ -2245,7 +2127,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasIntClamp, HasModifiers, HasOMod, DstVT>.ret;
field string AsmVOP3P = getAsmVOP3P<HasDst, NumSrcArgs, HasModifiers, HasClamp, DstVT>.ret;
field string AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs,
- HasClamp,
+ HasClamp, HasOMod,
HasSrc0FloatMods,
HasSrc1FloatMods,
HasSrc2FloatMods>.ret;
@@ -2381,7 +2263,6 @@ class VINTRP_Real_si <bits <2> op, string opName, dag outs, dag ins,
VINTRPCommon <outs, ins, asm, []>,
VINTRPe <op>,
SIMCInstr<opName, encodingFamily> {
- let DisableDecoder = DisableSIDecoder;
}
class VINTRP_Real_vi <bits <2> op, string opName, dag outs, dag ins,
@@ -2391,7 +2272,6 @@ class VINTRP_Real_vi <bits <2> op, string opName, dag outs, dag ins,
SIMCInstr<opName, SIEncodingFamily.VI> {
let AssemblerPredicate = VIAssemblerPredicate;
let DecoderNamespace = "GFX8";
- let DisableDecoder = DisableVIDecoder;
}
// FIXME-GFX10: WIP.
@@ -2492,8 +2372,7 @@ def getMCOpcodeGen : InstrMapping {
[!cast<string>(SIEncodingFamily.GFX80)],
[!cast<string>(SIEncodingFamily.GFX9)],
[!cast<string>(SIEncodingFamily.GFX10)],
- [!cast<string>(SIEncodingFamily.SDWA10)],
- [!cast<string>(SIEncodingFamily.GFX10_B)]];
+ [!cast<string>(SIEncodingFamily.SDWA10)]];
}
// Get equivalent SOPK instruction.
@@ -2567,11 +2446,28 @@ def getVCMPXNoSDstOp : InstrMapping {
// Maps a SOPP to a SOPP with S_NOP
def getSOPPWithRelaxation : InstrMapping {
- let FilterClass = "Base_SOPP";
- let RowFields = ["AsmString"];
- let ColFields = ["Size"];
- let KeyCol = ["4"];
- let ValueCols = [["8"]];
+ let FilterClass = "SOPPRelaxTable";
+ let RowFields = ["KeyName"];
+ let ColFields = ["IsRelaxed"];
+ let KeyCol = ["0"];
+ let ValueCols = [["1"]];
+}
+
+// Maps flat scratch opcodes by addressing modes
+def getFlatScratchInstSTfromSS : InstrMapping {
+ let FilterClass = "FlatScratchInst";
+ let RowFields = ["SVOp"];
+ let ColFields = ["Mode"];
+ let KeyCol = ["SS"];
+ let ValueCols = [["ST"]];
+}
+
+def getFlatScratchInstSSfromSV : InstrMapping {
+ let FilterClass = "FlatScratchInst";
+ let RowFields = ["SVOp"];
+ let ColFields = ["Mode"];
+ let KeyCol = ["SV"];
+ let ValueCols = [["SS"]];
}
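
These two InstrMapping tables back the getFlatScratchInstSTfromSS / getFlatScratchInstSSfromSV accessors declared earlier in this patch. A hedged C++ sketch of how such a generated mapping is typically consumed; the llvm::AMDGPU namespace, the header providing the declaration, and the -1 "no mapping" return follow the usual TableGen InstrMapping conventions rather than anything shown verbatim in this commit:

  // Sketch only: remap a flat scratch instruction from the SS addressing
  // mode to ST, falling back to the original opcode when no mapping exists.
  // Assumes the AMDGPUBaseInfo-style header declaring the accessor is included.
  static unsigned remapToSTMode(unsigned Opcode) {
    int NewOpc = llvm::AMDGPU::getFlatScratchInstSTfromSS(Opcode);
    return NewOpc == -1 ? Opcode : static_cast<unsigned>(NewOpc);
  }
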
include "SIInstructions.td"
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
index 0c4c9e0e9df2..7c1cbd67c993 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -19,43 +19,7 @@ include "VOPInstructions.td"
include "SMInstructions.td"
include "FLATInstructions.td"
include "BUFInstructions.td"
-
-//===----------------------------------------------------------------------===//
-// EXP Instructions
-//===----------------------------------------------------------------------===//
-
-defm EXP : EXP_m<0>;
-defm EXP_DONE : EXP_m<1>;
-
-class ExpPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
- (int_amdgcn_exp timm:$tgt, timm:$en,
- (vt ExpSrc0:$src0), (vt ExpSrc1:$src1),
- (vt ExpSrc2:$src2), (vt ExpSrc3:$src3),
- done_val, timm:$vm),
- (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1,
- ExpSrc2:$src2, ExpSrc3:$src3, timm:$vm, 0, timm:$en)
->;
-
-class ExpComprPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
- (int_amdgcn_exp_compr timm:$tgt, timm:$en,
- (vt ExpSrc0:$src0), (vt ExpSrc1:$src1),
- done_val, timm:$vm),
- (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1,
- (IMPLICIT_DEF), (IMPLICIT_DEF), timm:$vm, 1, timm:$en)
->;
-
-// FIXME: The generated DAG matcher seems to have strange behavior
-// with a 1-bit literal to match, so use a -1 for checking a true
-// 1-bit value.
-def : ExpPattern<i32, EXP, 0>;
-def : ExpPattern<i32, EXP_DONE, -1>;
-def : ExpPattern<f32, EXP, 0>;
-def : ExpPattern<f32, EXP_DONE, -1>;
-
-def : ExpComprPattern<v2i16, EXP, 0>;
-def : ExpComprPattern<v2i16, EXP_DONE, -1>;
-def : ExpComprPattern<v2f16, EXP, 0>;
-def : ExpComprPattern<v2f16, EXP_DONE, -1>;
+include "EXPInstructions.td"
//===----------------------------------------------------------------------===//
// VINTRP Instructions
@@ -264,6 +228,7 @@ class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI<
let WaveSizePredicate = isWave64 in {
def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
+def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
}
@@ -324,7 +289,7 @@ def SI_IF: CFPseudoInstSI <
def SI_ELSE : CFPseudoInstSI <
(outs SReg_1:$dst),
- (ins SReg_1:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
+ (ins SReg_1:$src, brtarget:$target), [], 1, 1> {
let Size = 12;
let hasSideEffects = 1;
}
@@ -356,6 +321,14 @@ def SI_IF_BREAK : CFPseudoInstSI <
let isReMaterializable = 1;
}
+// Branch to the early termination block of the shader if SCC is 0.
+// This uses SCC from a previous SALU operation, i.e. the update of
+// a mask of live lanes after a kill/demote operation.
+// Only valid in pixel shaders.
+def SI_EARLY_TERMINATE_SCC0 : SPseudoInstSI <(outs), (ins)> {
+ let Uses = [EXEC,SCC];
+}
+
let Uses = [EXEC] in {
multiclass PseudoInstKill <dag ins> {
@@ -426,32 +399,13 @@ def SI_INIT_EXEC : SPseudoInstSI <
(outs), (ins i64imm:$src),
[(int_amdgcn_init_exec (i64 timm:$src))]> {
let Defs = [EXEC];
- let usesCustomInserter = 1;
- let isAsCheapAsAMove = 1;
- let WaveSizePredicate = isWave64;
-}
-
-// FIXME: Intrinsic should be mangled for wave size.
-def SI_INIT_EXEC_LO : SPseudoInstSI <
- (outs), (ins i32imm:$src), []> {
- let Defs = [EXEC_LO];
- let usesCustomInserter = 1;
let isAsCheapAsAMove = 1;
- let WaveSizePredicate = isWave32;
}
-// FIXME: Wave32 version
def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
(outs), (ins SSrc_b32:$input, i32imm:$shift),
[(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> {
let Defs = [EXEC];
- let usesCustomInserter = 1;
-}
-
-def : GCNPat <
- (int_amdgcn_init_exec timm:$src),
- (SI_INIT_EXEC_LO (as_i32timm timm:$src))> {
- let WaveSizePredicate = isWave32;
}
// Return for returning shaders to a shader variant epilog.
@@ -580,64 +534,97 @@ def SI_INDIRECT_DST_V32 : SI_INDIRECT_DST<VReg_1024>;
} // End Uses = [EXEC], Defs = [M0, EXEC]
-
-// This is a pseudo variant of the v_movreld_b32 (or v_mov_b32
-// expecting to be executed with gpr indexing mode enabled)
-// instruction in which the vector operand appears only twice, once as
-// def and once as use. Using this pseudo avoids problems with the Two
-// Address instructions pass.
-class INDIRECT_REG_WRITE_pseudo<RegisterClass rc,
+// This is a pseudo variant of the v_movreld_b32 instruction in which the
+// vector operand appears only twice, once as def and once as use. Using this
+// pseudo avoids problems with the Two Address instructions pass.
+class INDIRECT_REG_WRITE_MOVREL_pseudo<RegisterClass rc,
RegisterOperand val_ty> : PseudoInstSI <
(outs rc:$vdst), (ins rc:$vsrc, val_ty:$val, i32imm:$subreg)> {
let Constraints = "$vsrc = $vdst";
let Uses = [M0];
}
-class V_INDIRECT_REG_WRITE_B32_pseudo<RegisterClass rc> :
- INDIRECT_REG_WRITE_pseudo<rc, VSrc_b32> {
+class V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<RegisterClass rc> :
+ INDIRECT_REG_WRITE_MOVREL_pseudo<rc, VSrc_b32> {
let VALU = 1;
let VOP1 = 1;
let Uses = [M0, EXEC];
}
-class S_INDIRECT_REG_WRITE_pseudo<RegisterClass rc,
+class S_INDIRECT_REG_WRITE_MOVREL_pseudo<RegisterClass rc,
RegisterOperand val_ty> :
- INDIRECT_REG_WRITE_pseudo<rc, val_ty> {
+ INDIRECT_REG_WRITE_MOVREL_pseudo<rc, val_ty> {
let SALU = 1;
let SOP1 = 1;
let Uses = [M0];
}
-class S_INDIRECT_REG_WRITE_B32_pseudo<RegisterClass rc> :
- S_INDIRECT_REG_WRITE_pseudo<rc, SSrc_b32>;
-class S_INDIRECT_REG_WRITE_B64_pseudo<RegisterClass rc> :
- S_INDIRECT_REG_WRITE_pseudo<rc, SSrc_b64>;
-
-
-def V_INDIRECT_REG_WRITE_B32_V1 : V_INDIRECT_REG_WRITE_B32_pseudo<VGPR_32>;
-def V_INDIRECT_REG_WRITE_B32_V2 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_64>;
-def V_INDIRECT_REG_WRITE_B32_V3 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_96>;
-def V_INDIRECT_REG_WRITE_B32_V4 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_128>;
-def V_INDIRECT_REG_WRITE_B32_V5 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_160>;
-def V_INDIRECT_REG_WRITE_B32_V8 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_256>;
-def V_INDIRECT_REG_WRITE_B32_V16 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_512>;
-def V_INDIRECT_REG_WRITE_B32_V32 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_1024>;
+class S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<RegisterClass rc> :
+ S_INDIRECT_REG_WRITE_MOVREL_pseudo<rc, SSrc_b32>;
+class S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<RegisterClass rc> :
+ S_INDIRECT_REG_WRITE_MOVREL_pseudo<rc, SSrc_b64>;
+
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V1 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VGPR_32>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V2 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_64>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V3 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_96>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V4 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_128>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V5 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_160>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V8 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_256>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V16 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_512>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V32 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_1024>;
+
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V1 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_32>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V2 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_64>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V3 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_96>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V4 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_128>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V5 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_160>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V8 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_256>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V16 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_512>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V32 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_1024>;
+
+def S_INDIRECT_REG_WRITE_MOVREL_B64_V1 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_64>;
+def S_INDIRECT_REG_WRITE_MOVREL_B64_V2 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_128>;
+def S_INDIRECT_REG_WRITE_MOVREL_B64_V4 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_256>;
+def S_INDIRECT_REG_WRITE_MOVREL_B64_V8 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_512>;
+def S_INDIRECT_REG_WRITE_MOVREL_B64_V16 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_1024>;
+
+// These variants of V_INDIRECT_REG_READ/WRITE use VGPR indexing. By using these
+// pseudos we avoid spills or copies being inserted within indirect sequences
+// that switch the VGPR indexing mode. Spills to accvgprs could be affected by
+// this mode switching.
+
+class V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<RegisterClass rc> : PseudoInstSI <
+ (outs rc:$vdst), (ins rc:$vsrc, VSrc_b32:$val, SSrc_b32:$idx, i32imm:$subreg)> {
+ let Constraints = "$vsrc = $vdst";
+ let VALU = 1;
+ let Uses = [M0, EXEC];
+ let Defs = [M0];
+}
-def S_INDIRECT_REG_WRITE_B32_V1 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_32>;
-def S_INDIRECT_REG_WRITE_B32_V2 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_64>;
-def S_INDIRECT_REG_WRITE_B32_V3 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_96>;
-def S_INDIRECT_REG_WRITE_B32_V4 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_128>;
-def S_INDIRECT_REG_WRITE_B32_V5 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_160>;
-def S_INDIRECT_REG_WRITE_B32_V8 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_256>;
-def S_INDIRECT_REG_WRITE_B32_V16 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_512>;
-def S_INDIRECT_REG_WRITE_B32_V32 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_1024>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VGPR_32>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_64>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_96>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_128>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_160>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_256>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_512>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_1024>;
-def S_INDIRECT_REG_WRITE_B64_V1 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_64>;
-def S_INDIRECT_REG_WRITE_B64_V2 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_128>;
-def S_INDIRECT_REG_WRITE_B64_V4 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_256>;
-def S_INDIRECT_REG_WRITE_B64_V8 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_512>;
-def S_INDIRECT_REG_WRITE_B64_V16 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_1024>;
+class V_INDIRECT_REG_READ_GPR_IDX_pseudo<RegisterClass rc> : PseudoInstSI <
+ (outs VGPR_32:$vdst), (ins rc:$vsrc, SSrc_b32:$idx, i32imm:$subreg)> {
+ let VALU = 1;
+ let Uses = [M0, EXEC];
+ let Defs = [M0];
+}
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V1 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VGPR_32>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V2 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_64>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V3 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_96>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V4 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_128>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V5 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_160>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V8 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_256>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V16 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_512>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V32 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_1024>;
multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in {
@@ -671,30 +658,33 @@ defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;
-multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
+// VGPR or AGPR spill instructions. In the AGPR case a temporary VGPR and an
+// extra instruction to move between VGPR and AGPR are needed, so UsesTmp
+// adds to the total size of an expanded spill.
+multiclass SI_SPILL_VGPR <RegisterClass vgpr_class, bit UsesTmp = 0> {
let UseNamedOperandTable = 1, VGPRSpill = 1,
SchedRW = [WriteVMEM] in {
def _SAVE : VPseudoInstSI <
(outs),
- (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
+ (ins vgpr_class:$vdata, i32imm:$vaddr,
SReg_32:$soffset, i32imm:$offset)> {
let mayStore = 1;
let mayLoad = 0;
// (2 * 4) + (8 * num_subregs) bytes maximum
- int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+ int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), !add(UsesTmp, 3)), 8);
// Size field is unsigned char and cannot fit more.
let Size = !if(!le(MaxSize, 256), MaxSize, 252);
}
def _RESTORE : VPseudoInstSI <
(outs vgpr_class:$vdata),
- (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
- i32imm:$offset)> {
+ (ins i32imm:$vaddr,
+ SReg_32:$soffset, i32imm:$offset)> {
let mayStore = 0;
let mayLoad = 1;
// (2 * 4) + (8 * num_subregs) bytes maximum
- int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+ int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), !add(UsesTmp, 3)), 8);
// Size field is unsigned char and cannot fit more.
let Size = !if(!le(MaxSize, 256), MaxSize, 252);
}
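
The MaxSize formula is the "(2 * 4) + (8 * num_subregs) bytes maximum" from the comment, with the per-subreg cost doubled to 16 bytes when UsesTmp is set for the AGPR spills defined below. A small C++ restatement of the same arithmetic, assuming (as the >> 5 subreg count suggests) that vgpr_class.Size is the register-class width in bits:

  #include <cassert>

  // Same computation as the MaxSize/Size fields above:
  //   num_subregs = SizeInBits / 32
  //   8 bytes per subreg, or 16 when a temp VGPR is used (AGPR spill),
  //   plus 8 bytes fixed; anything above 256 is clamped to 252 because
  //   the Size field is an unsigned char.
  static unsigned spillPseudoSize(unsigned SizeInBits, bool UsesTmp) {
    unsigned MaxSize = ((SizeInBits >> 5) << (UsesTmp ? 4 : 3)) + 8;
    return MaxSize <= 256 ? MaxSize : 252;
  }

  int main() {
    assert(spillPseudoSize(64, /*UsesTmp=*/true) == 40);     // 2 * 16 + 8
    assert(spillPseudoSize(1024, /*UsesTmp=*/false) == 252); // clamped
    return 0;
  }
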
@@ -711,42 +701,15 @@ defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
-multiclass SI_SPILL_AGPR <RegisterClass vgpr_class> {
- let UseNamedOperandTable = 1, VGPRSpill = 1,
- Constraints = "@earlyclobber $tmp",
- SchedRW = [WriteVMEM] in {
- def _SAVE : VPseudoInstSI <
- (outs VGPR_32:$tmp),
- (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
- SReg_32:$soffset, i32imm:$offset)> {
- let mayStore = 1;
- let mayLoad = 0;
- // (2 * 4) + (16 * num_subregs) bytes maximum
- int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8);
- // Size field is unsigned char and cannot fit more.
- let Size = !if(!le(MaxSize, 256), MaxSize, 252);
- }
-
- def _RESTORE : VPseudoInstSI <
- (outs vgpr_class:$vdata, VGPR_32:$tmp),
- (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
- i32imm:$offset)> {
- let mayStore = 0;
- let mayLoad = 1;
-
- // (2 * 4) + (16 * num_subregs) bytes maximum
- int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8);
- // Size field is unsigned char and cannot fit more.
- let Size = !if(!le(MaxSize, 256), MaxSize, 252);
- }
- } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
-}
-
-defm SI_SPILL_A32 : SI_SPILL_AGPR <AGPR_32>;
-defm SI_SPILL_A64 : SI_SPILL_AGPR <AReg_64>;
-defm SI_SPILL_A128 : SI_SPILL_AGPR <AReg_128>;
-defm SI_SPILL_A512 : SI_SPILL_AGPR <AReg_512>;
-defm SI_SPILL_A1024 : SI_SPILL_AGPR <AReg_1024>;
+defm SI_SPILL_A32 : SI_SPILL_VGPR <AGPR_32, 1>;
+defm SI_SPILL_A64 : SI_SPILL_VGPR <AReg_64, 1>;
+defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96, 1>;
+defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128, 1>;
+defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>;
+defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>;
+defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>;
+defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>;
+defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>;
def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
(outs SReg_64:$dst),
@@ -768,7 +731,7 @@ def : GCNPat<
def : GCNPat<
(AMDGPUelse i1:$src, bb:$target),
- (SI_ELSE $src, $target, 0)
+ (SI_ELSE $src, $target)
>;
def : Pat <
@@ -804,12 +767,9 @@ def : Pat <
let OtherPredicates = [UnsafeFPMath] in {
-//def : RcpPat<V_RCP_F64_e32, f64>;
-//defm : RsqPat<V_RSQ_F64_e32, f64>;
//defm : RsqPat<V_RSQ_F32_e32, f32>;
def : RsqPat<V_RSQ_F32_e32, f32>;
-def : RsqPat<V_RSQ_F64_e32, f64>;
// Convert (x - floor(x)) to fract(x)
def : GCNPat <
@@ -889,7 +849,8 @@ def : GCNPat <
// VOP2 Patterns
//===----------------------------------------------------------------------===//
-// TODO: Check only no src2 mods?
+// NoMods pattern used for mac. If there are any source modifiers then it's
+// better to select mad instead of mac.
class FMADPat <ValueType vt, Instruction inst, SDPatternOperator node>
: GCNPat <(vt (node (vt (VOP3NoMods vt:$src0)),
(vt (VOP3NoMods vt:$src1)),
@@ -898,18 +859,41 @@ class FMADPat <ValueType vt, Instruction inst, SDPatternOperator node>
SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-
// Prefer mac form when there are no modifiers.
let AddedComplexity = 9 in {
+let OtherPredicates = [HasMadMacF32Insts] in {
def : FMADPat <f32, V_MAC_F32_e64, fmad>;
def : FMADPat <f32, V_MAC_F32_e64, AMDGPUfmad_ftz>;
+} // OtherPredicates = [HasMadMacF32Insts]
+
+// Don't allow source modifiers. If there are any source modifiers then it's
+// better to select mad instead of mac.
+let SubtargetPredicate = isGFX6GFX7GFX10,
+ OtherPredicates = [HasMadMacF32Insts, NoFP32Denormals] in
+def : GCNPat <
+ (f32 (fadd (AMDGPUfmul_legacy (VOP3NoMods f32:$src0),
+ (VOP3NoMods f32:$src1)),
+ (VOP3NoMods f32:$src2))),
+ (V_MAC_LEGACY_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
+ SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+// Don't allow source modifiers. If there are any source modifiers then it's
+// better to select fma instead of fmac.
+let SubtargetPredicate = HasFmaLegacy32 in
+def : GCNPat <
+ (f32 (int_amdgcn_fma_legacy (VOP3NoMods f32:$src0),
+ (VOP3NoMods f32:$src1),
+ (VOP3NoMods f32:$src2))),
+ (V_FMAC_LEGACY_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
+ SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
let SubtargetPredicate = Has16BitInsts in {
def : FMADPat <f16, V_MAC_F16_e64, fmad>;
def : FMADPat <f16, V_MAC_F16_e64, AMDGPUfmad_ftz>;
-}
-
-}
+} // SubtargetPredicate = Has16BitInsts
+} // AddedComplexity = 9
class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr>
: GCNPat<
@@ -920,11 +904,20 @@ class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr>
$src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-let SubtargetPredicate = HasMadMacF32Insts in
-def : FMADModsPat<f32, V_MAD_F32, AMDGPUfmad_ftz>;
-def : FMADModsPat<f16, V_MAD_F16, AMDGPUfmad_ftz> {
- let SubtargetPredicate = Has16BitInsts;
-}
+let OtherPredicates = [HasMadMacF32Insts] in
+def : FMADModsPat<f32, V_MAD_F32_e64, AMDGPUfmad_ftz>;
+
+let OtherPredicates = [HasMadMacF32Insts, NoFP32Denormals] in
+def : GCNPat <
+ (f32 (fadd (AMDGPUfmul_legacy (VOP3Mods f32:$src0, i32:$src0_mod),
+ (VOP3Mods f32:$src1, i32:$src1_mod)),
+ (VOP3Mods f32:$src2, i32:$src2_mod))),
+ (V_MAD_LEGACY_F32_e64 $src0_mod, $src0, $src1_mod, $src1,
+ $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+let SubtargetPredicate = Has16BitInsts in
+def : FMADModsPat<f16, V_MAD_F16_e64, AMDGPUfmad_ftz>;
class VOPSelectModsPat <ValueType vt> : GCNPat <
(vt (select i1:$src0, (VOP3Mods vt:$src1, i32:$src1_mods),
@@ -1241,7 +1234,7 @@ class ClampPat<Instruction inst, ValueType vt> : GCNPat <
>;
def : ClampPat<V_MAX_F32_e64, f32>;
-def : ClampPat<V_MAX_F64, f64>;
+def : ClampPat<V_MAX_F64_e64, f64>;
def : ClampPat<V_MAX_F16_e64, f16>;
let SubtargetPredicate = HasVOP3PInsts in {
@@ -1422,12 +1415,12 @@ def : GCNPat <
def : GCNPat <
(fcopysign f16:$src0, f16:$src1),
- (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
>;
def : GCNPat <
(fcopysign f32:$src0, f16:$src1),
- (V_BFI_B32 (S_MOV_B32 (i32 0x7fffffff)), $src0,
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
(V_LSHLREV_B32_e64 (i32 16), $src1))
>;
@@ -1435,19 +1428,19 @@ def : GCNPat <
(fcopysign f64:$src0, f16:$src1),
(REG_SEQUENCE SReg_64,
(i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
- (V_BFI_B32 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
(V_LSHLREV_B32_e64 (i32 16), $src1)), sub1)
>;
def : GCNPat <
(fcopysign f16:$src0, f32:$src1),
- (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0,
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
(V_LSHRREV_B32_e64 (i32 16), $src1))
>;
def : GCNPat <
(fcopysign f16:$src0, f64:$src1),
- (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0,
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
(V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
>;
@@ -1499,8 +1492,13 @@ def : GCNPat <
>;
def : GCNPat <
- (i32 frameindex:$fi),
- (V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi)))
+ (p5 frameindex:$fi),
+ (V_MOV_B32_e32 (p5 (frameindex_to_targetframeindex $fi)))
+>;
+
+def : GCNPat <
+ (p5 frameindex:$fi),
+ (S_MOV_B32 (p5 (frameindex_to_targetframeindex $fi)))
>;
def : GCNPat <
@@ -1565,19 +1563,103 @@ def : GCNPat <
// VOP3 Patterns
//===----------------------------------------------------------------------===//
-def : IMad24Pat<V_MAD_I32_I24, 1>;
-def : UMad24Pat<V_MAD_U32_U24, 1>;
+def : IMad24Pat<V_MAD_I32_I24_e64, 1>;
+def : UMad24Pat<V_MAD_U32_U24_e64, 1>;
+
+// BFI patterns
+
+def BFIImm32 : PatFrag<
+ (ops node:$x, node:$y, node:$z),
+ (i32 (DivergentBinFrag<or> (and node:$y, node:$x), (and node:$z, imm))),
+ [{
+ auto *X = dyn_cast<ConstantSDNode>(N->getOperand(0)->getOperand(1));
+ auto *NotX = dyn_cast<ConstantSDNode>(N->getOperand(1)->getOperand(1));
+ return X && NotX &&
+ ~(unsigned)X->getZExtValue() == (unsigned)NotX->getZExtValue();
+ }]
+>;
+
+// Definition from ISA doc:
+// (y & x) | (z & ~x)
+def : AMDGPUPat <
+ (DivergentBinFrag<or> (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))),
+ (V_BFI_B32_e64 $x, $y, $z)
+>;
+
+// (y & C) | (z & ~C)
+def : AMDGPUPat <
+ (BFIImm32 i32:$x, i32:$y, i32:$z),
+ (V_BFI_B32_e64 $x, $y, $z)
+>;
+
+// 64-bit version
+def : AMDGPUPat <
+ (DivergentBinFrag<or> (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))),
+ (REG_SEQUENCE SReg_64,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)),
+ (i32 (EXTRACT_SUBREG SReg_64:$y, sub0)),
+ (i32 (EXTRACT_SUBREG SReg_64:$z, sub0))), sub0,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)),
+ (i32 (EXTRACT_SUBREG SReg_64:$y, sub1)),
+ (i32 (EXTRACT_SUBREG SReg_64:$z, sub1))), sub1)
+>;
+
+// SHA-256 Ch function
+// z ^ (x & (y ^ z))
+def : AMDGPUPat <
+ (DivergentBinFrag<xor> i32:$z, (and i32:$x, (xor i32:$y, i32:$z))),
+ (V_BFI_B32_e64 $x, $y, $z)
+>;
-// FIXME: This should only be done for VALU inputs
-defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
-def : ROTRPattern <V_ALIGNBIT_B32>;
+// 64-bit version
+def : AMDGPUPat <
+ (DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
+ (REG_SEQUENCE SReg_64,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)),
+ (i32 (EXTRACT_SUBREG SReg_64:$y, sub0)),
+ (i32 (EXTRACT_SUBREG SReg_64:$z, sub0))), sub0,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)),
+ (i32 (EXTRACT_SUBREG SReg_64:$y, sub1)),
+ (i32 (EXTRACT_SUBREG SReg_64:$z, sub1))), sub1)
+>;
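
Both the ISA-doc form (y & x) | (z & ~x) and the SHA-256 Ch form z ^ (x & (y ^ z)) compute the same bitfield insert, which is why both shapes map to V_BFI_B32_e64. A short C++ check of that identity, plain bit math independent of the target:

  #include <cassert>
  #include <cstdint>

  // Bitfield insert as in the ISA-doc comment: take bits of y where the
  // mask x is 1 and bits of z where it is 0.
  constexpr uint32_t bfi(uint32_t x, uint32_t y, uint32_t z) {
    return (y & x) | (z & ~x);
  }

  // SHA-256 "Ch" written as in the pattern above.
  constexpr uint32_t ch(uint32_t x, uint32_t y, uint32_t z) {
    return z ^ (x & (y ^ z));
  }

  int main() {
    const uint32_t x = 0x0f0f0f0f, y = 0x12345678, z = 0x9abcdef0;
    assert(bfi(x, y, z) == ch(x, y, z)); // the two forms agree bit-for-bit
    return 0;
  }
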
+
+def : AMDGPUPat <
+ (fcopysign f32:$src0, f32:$src1),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, $src1)
+>;
+
+def : AMDGPUPat <
+ (fcopysign f32:$src0, f64:$src1),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
+ (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1)))
+>;
+
+def : AMDGPUPat <
+ (fcopysign f64:$src0, f64:$src1),
+ (REG_SEQUENCE SReg_64,
+ (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)),
+ (i32 (EXTRACT_SUBREG SReg_64:$src0, sub1)),
+ (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1))), sub1)
+>;
+
+def : AMDGPUPat <
+ (fcopysign f64:$src0, f32:$src1),
+ (REG_SEQUENCE SReg_64,
+ (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)),
+ (i32 (EXTRACT_SUBREG SReg_64:$src0, sub1)),
+ $src1), sub1)
+>;
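
The fcopysign selections above are the same BFI idea with the magnitude mask: with the mask 0x7fffffff the result keeps the exponent and mantissa of src0 and takes only the sign bit from src1. A quick C++ illustration on raw f32 bit patterns:

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  static uint32_t bitsOf(float F) {
    uint32_t U;
    std::memcpy(&U, &F, sizeof(U));
    return U;
  }

  int main() {
    // (bits(src0) & 0x7fffffff) | (bits(src1) & 0x80000000):
    // magnitude from src0, sign from src1, i.e. copysignf.
    const uint32_t Mag = bitsOf(2.5f), Sign = bitsOf(-1.0f);
    const uint32_t R = (Mag & 0x7fffffffu) | (Sign & 0x80000000u);
    assert(R == bitsOf(-2.5f));
    return 0;
  }
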
+
+def : ROTRPattern <V_ALIGNBIT_B32_e64>;
def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
- (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
+ (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
- (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
+ (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
/********** ====================== **********/
@@ -1618,7 +1700,7 @@ def : GCNPat <
(add (sub_oneuse (umax i32:$src0, i32:$src1),
(umin i32:$src0, i32:$src1)),
i32:$src2),
- (V_SAD_U32 $src0, $src1, $src2, (i1 0))
+ (V_SAD_U32_e64 $src0, $src1, $src2, (i1 0))
>;
def : GCNPat <
@@ -1626,7 +1708,7 @@ def : GCNPat <
(sub i32:$src0, i32:$src1),
(sub i32:$src1, i32:$src0)),
i32:$src2),
- (V_SAD_U32 $src0, $src1, $src2, (i1 0))
+ (V_SAD_U32_e64 $src0, $src1, $src2, (i1 0))
>;
//===----------------------------------------------------------------------===//
@@ -1877,9 +1959,9 @@ def : GCNPat <
def : GCNPat <
(i32 (bswap i32:$a)),
- (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
- (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, (i32 24)),
- (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, (i32 8)))
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
+ (V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 24)),
+ (V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 8)))
>;
// FIXME: This should have been narrowed to i32 during legalization.
@@ -1887,19 +1969,19 @@ def : GCNPat <
def : GCNPat <
(i64 (bswap i64:$a)),
(REG_SEQUENCE VReg_64,
- (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
- (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
+ (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
(i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
(i32 24)),
- (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
+ (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
(i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
(i32 8))),
sub0,
- (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
- (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
+ (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
(i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
(i32 24)),
- (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
+ (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
(i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
(i32 8))),
sub1)
@@ -1914,7 +1996,7 @@ let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in {
// register value, but this is what seems to work.
def : GCNPat <
(i32 (bswap i32:$a)),
- (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x00010203)))
+ (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x00010203)))
>;
// FIXME: This should have been narrowed to i32 during legalization.
@@ -1922,10 +2004,10 @@ def : GCNPat <
def : GCNPat <
(i64 (bswap i64:$a)),
(REG_SEQUENCE VReg_64,
- (V_PERM_B32 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub1),
+ (V_PERM_B32_e64 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub1),
(S_MOV_B32 (i32 0x00010203))),
sub0,
- (V_PERM_B32 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub0),
+ (V_PERM_B32_e64 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub0),
(S_MOV_B32 (i32 0x00010203))),
sub1)
>;
@@ -1934,18 +2016,18 @@ def : GCNPat <
// The 12s emit 0s.
def : GCNPat <
(i16 (bswap i16:$a)),
- (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
+ (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
>;
def : GCNPat <
(i32 (zext (bswap i16:$a))),
- (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
+ (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
>;
// Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24)
def : GCNPat <
(v2i16 (bswap v2i16:$a)),
- (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x02030001)))
+ (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x02030001)))
>;
}
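
These V_PERM_B32_e64 patterns encode byte shuffles in the selector operand. Going by the comments above, selector bytes 0-3 pick the corresponding byte of $a (the second source; the first source is a constant 0 here) and selector byte 0x0c produces a zero byte. A standalone sketch of that selection model, covering only the selector values used in these patterns (the full V_PERM semantics are in the ISA manual):

// v_perm_selectors.cpp -- standalone sketch of the byte selection these
// patterns rely on; only the selector values used above are modeled.
#include <cassert>
#include <cstdint>

// Destination byte i is src byte sel_i for sel_i in 0..3, and 0x00 for
// sel_i == 0x0c (the "12s emit 0s" case from the comment above).
static uint32_t perm(uint32_t sel, uint32_t src) {
  uint32_t out = 0;
  for (int i = 0; i < 4; ++i) {
    uint32_t s = (sel >> (8 * i)) & 0xff;
    uint32_t byte = (s < 4) ? (src >> (8 * s)) & 0xff : 0;
    out |= byte << (8 * i);
  }
  return out;
}

int main() {
  assert(perm(0x00010203u, 0x11223344u) == 0x44332211u); // i32 bswap
  assert(perm(0x0c0c0001u, 0x11223344u) == 0x00004433u); // zext(bswap i16)
  assert(perm(0x02030001u, 0x11223344u) == 0x22114433u); // v2i16 bswap
  return 0;
}
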
@@ -1981,7 +2063,7 @@ def : GCNPat<
// TODO: Handle fneg like other types.
def : GCNPat<
(fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
- (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src)
+ (V_MUL_F64_e64 0, CONST.FP64_ONE, $src_mods, $src)
>;
} // End AddedComplexity = -5
@@ -1997,7 +2079,7 @@ multiclass SelectCanonicalizeAsMax<
def : GCNPat<
(fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
- (V_MAX_F64 $src_mods, $src, $src_mods, $src)> {
+ (V_MAX_F64_e64 $src_mods, $src, $src_mods, $src)> {
let OtherPredicates = f64_preds;
}
@@ -2059,14 +2141,22 @@ def : GCNPat <
SRCMODS.NONE, $src2)
>;
-// COPY is workaround tablegen bug from multiple outputs
-// from S_LSHL_B32's multiple outputs from implicit scc def.
def : GCNPat <
(v2i16 (build_vector (i16 0), (i16 SReg_32:$src1))),
(S_LSHL_B32 SReg_32:$src1, (i16 16))
>;
def : GCNPat <
+ (v2i16 (build_vector (i16 SReg_32:$src1), (i16 0))),
+ (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
+>;
+
+def : GCNPat <
+ (v2f16 (build_vector (f16 SReg_32:$src1), (f16 FP_ZERO))),
+ (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
+>;
+
+def : GCNPat <
(v2i16 (build_vector (i16 SReg_32:$src0), (i16 undef))),
(COPY_TO_REGCLASS SReg_32:$src0, SReg_32)
>;
@@ -2177,12 +2267,12 @@ let SubtargetPredicate = isGFX6 in {
// FIXME: DAG should also custom lower this.
def : GCNPat <
(f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
- (V_ADD_F64
+ (V_ADD_F64_e64
$mods,
$x,
SRCMODS.NEG,
(V_CNDMASK_B64_PSEUDO
- (V_MIN_F64
+ (V_MIN_F64_e64
SRCMODS.NONE,
(V_FRACT_F64_e64 $mods, $x),
SRCMODS.NONE,
@@ -2213,7 +2303,7 @@ def : GCNPat<
def : GCNPat<
(add i32:$src0, (i32 NegSubInlineConst32:$src1)),
- (V_SUB_I32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
+ (V_SUB_CO_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
let SubtargetPredicate = NotHasAddNoCarryInsts;
}
@@ -2241,8 +2331,77 @@ multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
-defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>;
-defm : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64, SReg_64>;
+// Bitfield extract patterns
+
+def IMMZeroBasedBitfieldMask : ImmLeaf <i32, [{
+ return isMask_32(Imm);
+}]>;
+
+def IMMPopCount : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(countPopulation(N->getZExtValue()), SDLoc(N),
+ MVT::i32);
+}]>;
+
+def : AMDGPUPat <
+ (DivergentBinFrag<and> (i32 (srl i32:$src, i32:$rshift)),
+ IMMZeroBasedBitfieldMask:$mask),
+ (V_BFE_U32_e64 $src, $rshift, (i32 (IMMPopCount $mask)))
+>;
+
+// x & ((1 << y) - 1)
+def : AMDGPUPat <
+ (DivergentBinFrag<and> i32:$src, (add_oneuse (shl_oneuse 1, i32:$width), -1)),
+ (V_BFE_U32_e64 $src, (i32 0), $width)
+>;
+
+// x & ~(-1 << y)
+def : AMDGPUPat <
+ (DivergentBinFrag<and> i32:$src,
+ (xor_oneuse (shl_oneuse -1, i32:$width), -1)),
+ (V_BFE_U32_e64 $src, (i32 0), $width)
+>;
+
+// x & (-1 >> (bitwidth - y))
+def : AMDGPUPat <
+ (DivergentBinFrag<and> i32:$src, (srl_oneuse -1, (sub 32, i32:$width))),
+ (V_BFE_U32_e64 $src, (i32 0), $width)
+>;
+
+// x << (bitwidth - y) >> (bitwidth - y)
+def : AMDGPUPat <
+ (DivergentBinFrag<srl> (shl_oneuse i32:$src, (sub 32, i32:$width)),
+ (sub 32, i32:$width)),
+ (V_BFE_U32_e64 $src, (i32 0), $width)
+>;
+
+def : AMDGPUPat <
+ (DivergentBinFrag<sra> (shl_oneuse i32:$src, (sub 32, i32:$width)),
+ (sub 32, i32:$width)),
+ (V_BFE_I32_e64 $src, (i32 0), $width)
+>;
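
The V_BFE_U32_e64 patterns above recognize four spellings of "extract the low $width bits of $src": masking with (1 << w) - 1, with ~(-1 << w), with -1 >> (32 - w), and the shl/srl pair by 32 - w; the sra variant sign-extends instead. A standalone C++ check that these forms agree for widths 1..31 (the edge widths hit shift-amount corner cases and are skipped here; two's-complement narrowing and arithmetic right shift are assumed, as guaranteed since C++20):

// bfe_forms.cpp -- standalone check that the masked forms above agree.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t x = 0xdeadbeefu;
  for (uint32_t w = 1; w < 32; ++w) {
    uint32_t ref = x & ((1u << w) - 1);           // x & ((1 << y) - 1)
    assert((x & ~(~0u << w)) == ref);             // x & ~(-1 << y)
    assert((x & (~0u >> (32 - w))) == ref);       // x & (-1 >> (32 - y))
    assert(((x << (32 - w)) >> (32 - w)) == ref); // unsigned shl/srl pair
    // Signed variant: shl then sra sign-extends bit (w - 1).
    int32_t sext = (int32_t)(x << (32 - w)) >> (32 - w);
    uint32_t sign = 1u << (w - 1);
    assert((uint32_t)sext == ((ref ^ sign) - sign));
  }
  return 0;
}
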
+
+// SHA-256 Ma patterns
+
+// ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y
+def : AMDGPUPat <
+ (DivergentBinFrag<or> (and i32:$x, i32:$z),
+ (and i32:$y, (or i32:$x, i32:$z))),
+ (V_BFI_B32_e64 (V_XOR_B32_e64 i32:$x, i32:$y), i32:$z, i32:$y)
+>;
+
+def : AMDGPUPat <
+ (DivergentBinFrag<or> (and i64:$x, i64:$z),
+ (and i64:$y, (or i64:$x, i64:$z))),
+ (REG_SEQUENCE SReg_64,
+ (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)),
+ (i32 (EXTRACT_SUBREG SReg_64:$y, sub0))),
+ (i32 (EXTRACT_SUBREG SReg_64:$z, sub0)),
+ (i32 (EXTRACT_SUBREG SReg_64:$y, sub0))), sub0,
+ (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)),
+ (i32 (EXTRACT_SUBREG SReg_64:$y, sub1))),
+ (i32 (EXTRACT_SUBREG SReg_64:$z, sub1)),
+ (i32 (EXTRACT_SUBREG SReg_64:$y, sub1))), sub1)
+>;
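
The Ma (majority) rewrite relies on BFI(x ^ y, z, y) selecting z wherever x and y differ and y wherever they agree, which is exactly (x & z) | (y & (x | z)). A standalone per-bit check (illustrative only, not part of this diff):

// sha256_ma_identity.cpp -- standalone check, not part of the patch.
#include <cassert>
#include <cstdint>

static uint32_t bfi(uint32_t mask, uint32_t a, uint32_t b) {
  return (a & mask) | (b & ~mask);
}

int main() {
  // Three 1-bit inputs -> 8 combinations cover the identity per bit.
  for (uint32_t bits = 0; bits < 8; ++bits) {
    uint32_t x = bits & 1, y = (bits >> 1) & 1, z = (bits >> 2) & 1;
    uint32_t ma = (x & z) | (y & (x | z));   // SHA-256 Ma / majority
    assert(ma == bfi(x ^ y, z, y));          // BFI (XOR x, y), z, y
  }
  return 0;
}
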
multiclass IntMed3Pat<Instruction med3Inst,
SDPatternOperator min,
@@ -2267,8 +2426,8 @@ multiclass IntMed3Pat<Instruction med3Inst,
>;
}
-defm : IntMed3Pat<V_MED3_I32, smin, smax, smin_oneuse, smax_oneuse>;
-defm : IntMed3Pat<V_MED3_U32, umin, umax, umin_oneuse, umax_oneuse>;
+defm : IntMed3Pat<V_MED3_I32_e64, smin, smax, smin_oneuse, smax_oneuse>;
+defm : IntMed3Pat<V_MED3_U32_e64, umin, umax, umin_oneuse, umax_oneuse>;
// This matches 16 permutations of
// max(min(x, y), min(max(x, y), z))
@@ -2315,12 +2474,12 @@ multiclass Int16Med3Pat<Instruction med3Inst,
>;
}
-def : FPMed3Pat<f32, V_MED3_F32>;
+def : FPMed3Pat<f32, V_MED3_F32_e64>;
let OtherPredicates = [isGFX9Plus] in {
-def : FP16Med3Pat<f16, V_MED3_F16>;
-defm : Int16Med3Pat<V_MED3_I16, smin, smax, smax_oneuse, smin_oneuse>;
-defm : Int16Med3Pat<V_MED3_U16, umin, umax, umax_oneuse, umin_oneuse>;
+def : FP16Med3Pat<f16, V_MED3_F16_e64>;
+defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax, smax_oneuse, smin_oneuse>;
+defm : Int16Med3Pat<V_MED3_U16_e64, umin, umax, umax_oneuse, umin_oneuse>;
} // End Predicates = [isGFX9Plus]
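
IntMed3Pat and FPMed3Pat fold the max(min(x, y), min(max(x, y), z)) expression mentioned above into a single V_MED3 instruction, which returns the middle of the three operands. A standalone sketch confirming that the expression really is the median (names are illustrative, not from the source):

// med3_check.cpp -- standalone sketch, not part of the patch.
#include <algorithm>
#include <cassert>

static int med3_expr(int x, int y, int z) {
  // The expression the patterns match (one of its 16 permutations).
  return std::max(std::min(x, y), std::min(std::max(x, y), z));
}

int main() {
  int vals[3] = {-2, 5, 7};   // start sorted so next_permutation visits all 6
  do {
    int x = vals[0], y = vals[1], z = vals[2];
    int sorted[3] = {x, y, z};
    std::sort(sorted, sorted + 3);
    assert(med3_expr(x, y, z) == sorted[1]);  // always the middle element
  } while (std::next_permutation(vals, vals + 3));
  return 0;
}
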
class AMDGPUGenericInstruction : GenericInstruction {
@@ -2428,10 +2587,12 @@ def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction {
let Namespace = "AMDGPU" in {
def G_AMDGPU_ATOMIC_INC : G_ATOMICRMW_OP;
def G_AMDGPU_ATOMIC_DEC : G_ATOMICRMW_OP;
+def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP;
+def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP;
}
-class BufferAtomicGenericInstruction : AMDGPUGenericInstruction {
- let OutOperandList = (outs type0:$dst);
+class BufferAtomicGenericInstruction<bit NoRtn = 0> : AMDGPUGenericInstruction {
+ let OutOperandList = !if(NoRtn, (outs), (outs type0:$dst));
let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
type2:$soffset, untyped_imm_0:$offset,
untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
@@ -2452,6 +2613,7 @@ def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
@@ -2494,3 +2656,11 @@ def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction {
let hasSideEffects = 0;
let mayStore = 1;
}
+
+def G_AMDGPU_INTRIN_BVH_INTERSECT_RAY : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins unknown:$intrin, variable_ops);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+ let mayStore = 0;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 2eb1c52f1b59..b39420f3c7db 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -58,33 +58,11 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdlib>
-#include <iterator>
-#include <utility>
using namespace llvm;
@@ -171,7 +149,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
return false;
    // TODO: We should be able to merge physical reg addresses.
- if (Register::isPhysicalRegister(AddrOp->getReg()))
+ if (AddrOp->getReg().isPhysical())
return false;
    // If an address has only one use then there will be no other
@@ -393,6 +371,15 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::DS_WRITE_B64:
case AMDGPU::DS_WRITE_B64_gfx9:
return DS_WRITE;
+ case AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa:
+ case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa:
+ case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa:
+ case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa:
+ case AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa:
+ case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa:
+ case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa:
+ case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa:
+ return UNKNOWN;
}
}
@@ -604,7 +591,7 @@ static void addDefsUsesToList(const MachineInstr &MI,
if (Op.isReg()) {
if (Op.isDef())
RegDefs.insert(Op.getReg());
- else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg()))
+ else if (Op.readsReg() && Op.getReg().isPhysical())
PhysRegUses.insert(Op.getReg());
}
}
@@ -633,11 +620,10 @@ static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
// be moved for merging, then we need to move the def-instruction as well.
// This can only happen for physical registers such as M0; virtual
// registers are in SSA form.
- if (Use.isReg() &&
- ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
- (Use.isDef() && RegDefs.count(Use.getReg())) ||
- (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) &&
- PhysRegUses.count(Use.getReg())))) {
+ if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
+ (Use.isDef() && RegDefs.count(Use.getReg())) ||
+ (Use.isDef() && Use.getReg().isPhysical() &&
+ PhysRegUses.count(Use.getReg())))) {
Insts.push_back(&MI);
addDefsUsesToList(MI, RegDefs, PhysRegUses);
return true;
@@ -1667,7 +1653,7 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MachineInstr *LoHalf =
- BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
+ BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
.addReg(CarryReg, RegState::Define)
.addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
.add(OffsetLo)
@@ -1730,7 +1716,7 @@ SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
// Expecting base computation as:
// %OFFSET0:sgpr_32 = S_MOV_B32 8000
// %LO:vgpr_32, %c:sreg_64_xexec =
-// V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
+// V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
// %Base:vreg_64 =
// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
@@ -1752,7 +1738,7 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
- if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
+ if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
!BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
return;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 36d52ac3ee89..5839e59b4d7f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -48,28 +48,11 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/SlotIndexes.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/Pass.h"
-#include <cassert>
-#include <iterator>
using namespace llvm;
@@ -99,6 +82,7 @@ private:
unsigned MovTermOpc;
unsigned Andn2TermOpc;
unsigned XorTermrOpc;
+ unsigned OrTermrOpc;
unsigned OrSaveExecOpc;
unsigned Exec;
@@ -106,14 +90,19 @@ private:
void emitElse(MachineInstr &MI);
void emitIfBreak(MachineInstr &MI);
void emitLoop(MachineInstr &MI);
- void emitEndCf(MachineInstr &MI);
+
+ MachineBasicBlock *emitEndCf(MachineInstr &MI);
+
+ void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);
void findMaskOperands(MachineInstr &MI, unsigned OpNo,
SmallVectorImpl<MachineOperand> &Src) const;
void combineMasks(MachineInstr &MI);
- void process(MachineInstr &MI);
+ bool removeMBBifRedundant(MachineBasicBlock &MBB);
+
+ MachineBasicBlock *process(MachineInstr &MI);
// Skip to the next instruction, ignoring debug instructions, and trivial
// block boundaries (blocks that have one (typically fallthrough) successor,
@@ -122,6 +111,19 @@ private:
skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB,
MachineBasicBlock::iterator It) const;
+ /// Find the insertion point for a new conditional branch.
+ MachineBasicBlock::iterator
+ skipToUncondBrOrEnd(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ assert(I->isTerminator());
+
+ // FIXME: What if we had multiple pre-existing conditional branches?
+ MachineBasicBlock::iterator End = MBB.end();
+ while (I != End && !I->isUnconditionalBranch())
+ ++I;
+ return I;
+ }
+
// Remove redundant SI_END_CF instructions.
void optimizeEndCf();
@@ -141,9 +143,6 @@ public:
AU.addPreserved<SlotIndexes>();
AU.addPreserved<LiveIntervals>();
AU.addPreservedID(LiveVariablesID);
- AU.addPreservedID(MachineLoopInfoID);
- AU.addPreservedID(MachineDominatorsID);
- AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
@@ -167,8 +166,7 @@ char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
static bool hasKill(const MachineBasicBlock *Begin,
const MachineBasicBlock *End, const SIInstrInfo *TII) {
DenseSet<const MachineBasicBlock*> Visited;
- SmallVector<MachineBasicBlock *, 4> Worklist(Begin->succ_begin(),
- Begin->succ_end());
+ SmallVector<MachineBasicBlock *, 4> Worklist(Begin->successors());
while (!Worklist.empty()) {
MachineBasicBlock *MBB = Worklist.pop_back_val();
@@ -275,6 +273,10 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec)
.addReg(Tmp, RegState::Kill);
+ // Skip ahead to the unconditional branch in case there are other terminators
+ // present.
+ I = skipToUncondBrOrEnd(MBB, I);
+
// Insert the S_CBRANCH_EXECZ instruction which will be optimized later
// during SIRemoveShortExecBranches.
MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
@@ -315,44 +317,37 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
Register DstReg = MI.getOperand(0).getReg();
- bool ExecModified = MI.getOperand(3).getImm() != 0;
MachineBasicBlock::iterator Start = MBB.begin();
- // We are running before TwoAddressInstructions, and si_else's operands are
- // tied. In order to correctly tie the registers, split this into a copy of
- // the src like it does.
- Register CopyReg = MRI->createVirtualRegister(BoolRC);
- MachineInstr *CopyExec =
- BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg)
- .add(MI.getOperand(1)); // Saved EXEC
-
// This must be inserted before phis and any spill code inserted before the
// else.
- Register SaveReg = ExecModified ?
- MRI->createVirtualRegister(BoolRC) : DstReg;
+ Register SaveReg = MRI->createVirtualRegister(BoolRC);
MachineInstr *OrSaveExec =
BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg)
- .addReg(CopyReg);
+ .add(MI.getOperand(1)); // Saved EXEC
MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
MachineBasicBlock::iterator ElsePt(MI);
- if (ExecModified) {
- MachineInstr *And =
- BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg)
- .addReg(Exec)
- .addReg(SaveReg);
+ // This accounts for any modification of the EXEC mask within the block and
+ // can be optimized out pre-RA when not required.
+ MachineInstr *And = BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg)
+ .addReg(Exec)
+ .addReg(SaveReg);
- if (LIS)
- LIS->InsertMachineInstrInMaps(*And);
- }
+ if (LIS)
+ LIS->InsertMachineInstrInMaps(*And);
MachineInstr *Xor =
BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec)
.addReg(Exec)
.addReg(DstReg);
+ // Skip ahead to the unconditional branch in case there are other terminators
+ // present.
+ ElsePt = skipToUncondBrOrEnd(MBB, ElsePt);
+
MachineInstr *Branch =
BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
.addMBB(DestBB);
@@ -365,18 +360,14 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
LIS->RemoveMachineInstrFromMaps(MI);
MI.eraseFromParent();
- LIS->InsertMachineInstrInMaps(*CopyExec);
LIS->InsertMachineInstrInMaps(*OrSaveExec);
LIS->InsertMachineInstrInMaps(*Xor);
LIS->InsertMachineInstrInMaps(*Branch);
- // src reg is tied to dst reg.
LIS->removeInterval(DstReg);
LIS->createAndComputeVirtRegInterval(DstReg);
- LIS->createAndComputeVirtRegInterval(CopyReg);
- if (ExecModified)
- LIS->createAndComputeVirtRegInterval(SaveReg);
+ LIS->createAndComputeVirtRegInterval(SaveReg);
// Let this be recomputed.
LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
@@ -435,8 +426,9 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
.addReg(Exec)
.add(MI.getOperand(0));
+ auto BranchPt = skipToUncondBrOrEnd(MBB, MI.getIterator());
MachineInstr *Branch =
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ BuildMI(MBB, BranchPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
.add(MI.getOperand(1));
if (LIS) {
@@ -479,19 +471,37 @@ SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
} while (true);
}
-void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
+MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- unsigned CFMask = MI.getOperand(0).getReg();
- MachineInstr *Def = MRI.getUniqueVRegDef(CFMask);
const DebugLoc &DL = MI.getDebugLoc();
- MachineBasicBlock::iterator InsPt =
- Def && Def->getParent() == &MBB ? std::next(MachineBasicBlock::iterator(Def))
- : MBB.begin();
- MachineInstr *NewMI = BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec)
- .addReg(Exec)
- .add(MI.getOperand(0));
+ MachineBasicBlock::iterator InsPt = MBB.begin();
+
+ // If we have instructions that aren't prolog instructions, split the block
+ // and emit a terminator instruction. This ensures correct spill placement.
+ // FIXME: We should unconditionally split the block here.
+ bool NeedBlockSplit = false;
+ Register DataReg = MI.getOperand(0).getReg();
+ for (MachineBasicBlock::iterator I = InsPt, E = MI.getIterator();
+ I != E; ++I) {
+ if (I->modifiesRegister(DataReg, TRI)) {
+ NeedBlockSplit = true;
+ break;
+ }
+ }
+
+ unsigned Opcode = OrOpc;
+ MachineBasicBlock *SplitBB = &MBB;
+ if (NeedBlockSplit) {
+ SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/true, LIS);
+ Opcode = OrTermrOpc;
+ InsPt = MI;
+ }
+
+ MachineInstr *NewMI =
+ BuildMI(MBB, InsPt, DL, TII->get(Opcode), Exec)
+ .addReg(Exec)
+ .add(MI.getOperand(0));
LoweredEndCf.insert(NewMI);
@@ -512,6 +522,7 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
if (LIS)
LIS->handleMove(*NewMI);
+ return SplitBB;
}
// Returns replace operands for a logical operation, either single result
@@ -519,7 +530,7 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
SmallVectorImpl<MachineOperand> &Src) const {
MachineOperand &Op = MI.getOperand(OpNo);
- if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg())) {
+ if (!Op.isReg() || !Op.getReg().isVirtual()) {
Src.push_back(Op);
return;
}
@@ -539,7 +550,7 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
for (const auto &SrcOp : Def->explicit_operands())
if (SrcOp.isReg() && SrcOp.isUse() &&
- (Register::isVirtualRegister(SrcOp.getReg()) || SrcOp.getReg() == Exec))
+ (SrcOp.getReg().isVirtual() || SrcOp.getReg() == Exec))
Src.push_back(SrcOp);
}
@@ -593,15 +604,18 @@ void SILowerControlFlow::optimizeEndCf() {
if (LIS)
LIS->RemoveMachineInstrFromMaps(*MI);
MI->eraseFromParent();
+ removeMBBifRedundant(MBB);
}
}
}
-void SILowerControlFlow::process(MachineInstr &MI) {
+MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
MachineBasicBlock::iterator I(MI);
MachineInstr *Prev = (I != MBB.begin()) ? &*(std::prev(I)) : nullptr;
+ MachineBasicBlock *SplitBB = &MBB;
+
switch (MI.getOpcode()) {
case AMDGPU::SI_IF:
emitIf(MI);
@@ -620,7 +634,7 @@ void SILowerControlFlow::process(MachineInstr &MI) {
break;
case AMDGPU::SI_END_CF:
- emitEndCf(MI);
+ SplitBB = emitEndCf(MI);
break;
default:
@@ -645,6 +659,147 @@ void SILowerControlFlow::process(MachineInstr &MI) {
break;
}
}
+
+ return SplitBB;
+}
+
+void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
+ MachineInstr &MI) {
+ MachineFunction &MF = *MBB->getParent();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ bool IsWave32 = ST.isWave32();
+
+ if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
+ // This should be before all vector instructions.
+ BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
+ TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec)
+ .addImm(MI.getOperand(0).getImm());
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(MI);
+ MI.eraseFromParent();
+ return;
+ }
+
+ // Extract the thread count from an SGPR input and set EXEC accordingly.
+ // Since BFM can't shift by 64, handle that case with CMP + CMOV.
+ //
+ // S_BFE_U32 count, input, {shift, 7}
+ // S_BFM_B64 exec, count, 0
+ // S_CMP_EQ_U32 count, 64
+ // S_CMOV_B64 exec, -1
+ Register InputReg = MI.getOperand(0).getReg();
+ MachineInstr *FirstMI = &*MBB->begin();
+ if (InputReg.isVirtual()) {
+ MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
+ assert(DefInstr && DefInstr->isCopy());
+ if (DefInstr->getParent() == MBB) {
+ if (DefInstr != FirstMI) {
+ // If the `InputReg` is defined in current block, we also need to
+ // move that instruction to the beginning of the block.
+ DefInstr->removeFromParent();
+ MBB->insert(FirstMI, DefInstr);
+ if (LIS)
+ LIS->handleMove(*DefInstr);
+ } else {
+ // If first instruction is definition then move pointer after it.
+ FirstMI = &*std::next(FirstMI->getIterator());
+ }
+ }
+ }
+
+ // Insert instruction sequence at block beginning (before vector operations).
+ const DebugLoc DL = MI.getDebugLoc();
+ const unsigned WavefrontSize = ST.getWavefrontSize();
+ const unsigned Mask = (WavefrontSize << 1) - 1;
+ Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
+ .addReg(InputReg)
+ .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
+ auto BfmMI =
+ BuildMI(*MBB, FirstMI, DL,
+ TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
+ .addReg(CountReg)
+ .addImm(0);
+ auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
+ .addReg(CountReg, RegState::Kill)
+ .addImm(WavefrontSize);
+ auto CmovMI =
+ BuildMI(*MBB, FirstMI, DL,
+ TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
+ Exec)
+ .addImm(-1);
+
+ if (!LIS) {
+ MI.eraseFromParent();
+ return;
+ }
+
+ LIS->RemoveMachineInstrFromMaps(MI);
+ MI.eraseFromParent();
+
+ LIS->InsertMachineInstrInMaps(*BfeMI);
+ LIS->InsertMachineInstrInMaps(*BfmMI);
+ LIS->InsertMachineInstrInMaps(*CmpMI);
+ LIS->InsertMachineInstrInMaps(*CmovMI);
+
+ LIS->removeInterval(InputReg);
+ LIS->createAndComputeVirtRegInterval(InputReg);
+ LIS->createAndComputeVirtRegInterval(CountReg);
+}
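
The lowered sequence sets EXEC to a mask of the low `count` lanes: S_BFM_B64 builds (1 << count) - 1, and the S_CMP_EQ_U32 / S_CMOV_B64 pair handles count equal to the wavefront size, which BFM cannot express. A standalone sketch of the resulting mask (function and file names are illustrative, not from the source):

// init_exec_mask.cpp -- standalone sketch of the EXEC value the lowered
// sequence produces; not part of the patch.
#include <cassert>
#include <cstdint>

// S_BFM_B64 exec, count, 0 yields (1 << count) - 1, which cannot cover a
// shift by 64, so the lowering adds S_CMP_EQ_U32 count, 64 + S_CMOV_B64 -1.
static uint64_t initExecMask(unsigned count, unsigned wavefrontSize = 64) {
  if (count == wavefrontSize)
    return ~0ull;                      // S_CMOV_B64 exec, -1 path
  return (1ull << count) - 1;          // S_BFM_B64 exec, count, 0 path
}

int main() {
  assert(initExecMask(0) == 0);
  assert(initExecMask(1) == 1);
  assert(initExecMask(63) == 0x7fffffffffffffffull);
  assert(initExecMask(64) == ~0ull);   // the case BFM alone cannot produce
  return 0;
}
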
+
+bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
+ auto GetFallThroughSucc = [=](MachineBasicBlock *B) -> MachineBasicBlock * {
+ auto *S = B->getNextNode();
+ if (!S)
+ return nullptr;
+ if (B->isSuccessor(S)) {
+ // The only fallthrough candidate
+ MachineBasicBlock::iterator I(B->getFirstInstrTerminator());
+ MachineBasicBlock::iterator E = B->end();
+ for (; I != E; I++) {
+ if (I->isBranch() && TII->getBranchDestBlock(*I) == S)
+        // We have an unoptimized branch to the layout successor
+ return nullptr;
+ }
+ }
+ return S;
+ };
+
+ for (auto &I : MBB.instrs()) {
+ if (!I.isDebugInstr() && !I.isUnconditionalBranch())
+ return false;
+ }
+
+ assert(MBB.succ_size() == 1 && "MBB has more than one successor");
+
+ MachineBasicBlock *Succ = *MBB.succ_begin();
+ MachineBasicBlock *FallThrough = nullptr;
+
+ while (!MBB.predecessors().empty()) {
+ MachineBasicBlock *P = *MBB.pred_begin();
+ if (GetFallThroughSucc(P) == &MBB)
+ FallThrough = P;
+ P->ReplaceUsesOfBlockWith(&MBB, Succ);
+ }
+ MBB.removeSuccessor(Succ);
+ if (LIS) {
+ for (auto &I : MBB.instrs())
+ LIS->RemoveMachineInstrFromMaps(I);
+ }
+ MBB.clear();
+ MBB.eraseFromParent();
+ if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) {
+ if (!GetFallThroughSucc(Succ)) {
+ MachineFunction *MF = FallThrough->getParent();
+ MachineFunction::iterator FallThroughPos(FallThrough);
+ MF->splice(std::next(FallThroughPos), Succ);
+ } else
+ BuildMI(*FallThrough, FallThrough->end(),
+ FallThrough->findBranchDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+ .addMBB(Succ);
+ }
+
+ return true;
}
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
@@ -666,6 +821,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
MovTermOpc = AMDGPU::S_MOV_B32_term;
Andn2TermOpc = AMDGPU::S_ANDN2_B32_term;
XorTermrOpc = AMDGPU::S_XOR_B32_term;
+ OrTermrOpc = AMDGPU::S_OR_B32_term;
OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
Exec = AMDGPU::EXEC_LO;
} else {
@@ -675,6 +831,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
MovTermOpc = AMDGPU::S_MOV_B64_term;
Andn2TermOpc = AMDGPU::S_ANDN2_B64_term;
XorTermrOpc = AMDGPU::S_XOR_B64_term;
+ OrTermrOpc = AMDGPU::S_OR_B64_term;
OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
Exec = AMDGPU::EXEC;
}
@@ -682,19 +839,21 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
SmallVector<MachineInstr *, 32> Worklist;
MachineFunction::iterator NextBB;
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; BI = NextBB) {
+ for (MachineFunction::iterator BI = MF.begin();
+ BI != MF.end(); BI = NextBB) {
NextBB = std::next(BI);
- MachineBasicBlock &MBB = *BI;
+ MachineBasicBlock *MBB = &*BI;
- MachineBasicBlock::iterator I, Next;
- for (I = MBB.begin(); I != MBB.end(); I = Next) {
+ MachineBasicBlock::iterator I, E, Next;
+ E = MBB->end();
+ for (I = MBB->begin(); I != E; I = Next) {
Next = std::next(I);
MachineInstr &MI = *I;
+ MachineBasicBlock *SplitMBB = MBB;
switch (MI.getOpcode()) {
case AMDGPU::SI_IF:
- process(MI);
+ SplitMBB = process(MI);
break;
case AMDGPU::SI_ELSE:
@@ -705,12 +864,25 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
if (InsertKillCleanups)
Worklist.push_back(&MI);
else
- process(MI);
+ SplitMBB = process(MI);
+ break;
+
+ // FIXME: find a better place for this
+ case AMDGPU::SI_INIT_EXEC:
+ case AMDGPU::SI_INIT_EXEC_FROM_INPUT:
+ lowerInitExec(MBB, MI);
+ if (LIS)
+ LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
break;
default:
break;
}
+
+ if (SplitMBB != MBB) {
+ MBB = Next->getParent();
+ E = MBB->end();
+ }
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 236a24a02ece..9570680ad9cb 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -22,20 +22,13 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachinePostDominators.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
#include "llvm/InitializePasses.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "si-i1-copies"
@@ -89,16 +82,15 @@ private:
void lowerCopiesFromI1();
void lowerPhis();
void lowerCopiesToI1();
- bool isConstantLaneMask(unsigned Reg, bool &Val) const;
+ bool isConstantLaneMask(Register Reg, bool &Val) const;
void buildMergeLaneMasks(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
unsigned DstReg, unsigned PrevReg, unsigned CurReg);
MachineBasicBlock::iterator
getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;
- bool isVreg1(unsigned Reg) const {
- return Register::isVirtualRegister(Reg) &&
- MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
+ bool isVreg1(Register Reg) const {
+ return Reg.isVirtual() && MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
}
bool isLaneMaskReg(unsigned Reg) const {
@@ -185,10 +177,8 @@ public:
}
}
- if (Divergent && PDT.dominates(&DefBlock, MBB)) {
- for (MachineBasicBlock *Succ : MBB->successors())
- Stack.push_back(Succ);
- }
+ if (Divergent && PDT.dominates(&DefBlock, MBB))
+ append_range(Stack, MBB->successors());
}
while (!Stack.empty()) {
@@ -197,8 +187,7 @@ public:
continue;
ReachableOrdered.push_back(MBB);
- for (MachineBasicBlock *Succ : MBB->successors())
- Stack.push_back(Succ);
+ append_range(Stack, MBB->successors());
}
for (MachineBasicBlock *MBB : ReachableOrdered) {
@@ -214,7 +203,7 @@ public:
ReachableMap[MBB] = true;
if (HaveReachablePred) {
for (MachineBasicBlock *UnreachablePred : Stack) {
- if (llvm::find(Predecessors, UnreachablePred) == Predecessors.end())
+ if (!llvm::is_contained(Predecessors, UnreachablePred))
Predecessors.push_back(UnreachablePred);
}
}
@@ -348,7 +337,7 @@ private:
if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
return true;
- if (llvm::find(Blocks, &MBB) != Blocks.end())
+ if (llvm::is_contained(Blocks, &MBB))
return true;
return false;
@@ -658,7 +647,7 @@ void SILowerI1Copies::lowerPhis() {
}
}
- unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
+ Register NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
if (NewReg != DstReg) {
MRI->replaceRegWith(NewReg, DstReg);
MI->eraseFromParent();
@@ -703,8 +692,7 @@ void SILowerI1Copies::lowerCopiesToI1() {
Register SrcReg = MI.getOperand(1).getReg();
assert(!MI.getOperand(1).getSubReg());
- if (!Register::isVirtualRegister(SrcReg) ||
- (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
+ if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
unsigned TmpReg = createLaneMaskReg(*MF);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
@@ -740,7 +728,7 @@ void SILowerI1Copies::lowerCopiesToI1() {
}
}
-bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const {
+bool SILowerI1Copies::isConstantLaneMask(Register Reg, bool &Val) const {
const MachineInstr *MI;
for (;;) {
MI = MRI->getUniqueVRegDef(Reg);
@@ -748,7 +736,7 @@ bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const {
break;
Reg = MI->getOperand(1).getReg();
- if (!Register::isVirtualRegister(Reg))
+ if (!Reg.isVirtual())
return false;
if (!isLaneMaskReg(Reg))
return false;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 1349d3b6bf3f..30405059530e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -16,19 +16,12 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/InitializePasses.h"
-#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -97,7 +90,7 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) {
for (const CalleeSavedInfo &CS : CSI) {
// Insert the spill to the stack frame.
- unsigned Reg = CS.getReg();
+ MCRegister Reg = CS.getReg();
MachineInstrSpan MIS(I, &SaveBlock);
const TargetRegisterClass *RC =
@@ -184,6 +177,16 @@ void SILowerSGPRSpills::calculateSaveRestoreBlocks(MachineFunction &MF) {
}
}
+// TODO: To support shrink wrapping, this would need to copy
+// PrologEpilogInserter's updateLiveness.
+static void updateLiveness(MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI) {
+ MachineBasicBlock &EntryBB = MF.front();
+
+ for (const CalleeSavedInfo &CSIReg : CSI)
+ EntryBB.addLiveIn(CSIReg.getReg());
+ EntryBB.sortUniqueLiveIns();
+}
+
bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
const Function &F = MF.getFunction();
@@ -206,7 +209,8 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
for (unsigned I = 0; CSRegs[I]; ++I) {
- unsigned Reg = CSRegs[I];
+ MCRegister Reg = CSRegs[I];
+
if (SavedRegs.test(Reg)) {
const TargetRegisterClass *RC =
TRI->getMinimalPhysRegClass(Reg, MVT::i32);
@@ -221,6 +225,10 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
for (MachineBasicBlock *SaveBlock : SaveBlocks)
insertCSRSaves(*SaveBlock, CSI, LIS);
+ // Add live ins to save blocks.
+ assert(SaveBlocks.size() == 1 && "shrink wrapping not fully implemented");
+ updateLiveness(MF, CSI);
+
for (MachineBasicBlock *RestoreBlock : RestoreBlocks)
insertCSRRestores(*RestoreBlock, CSI, LIS);
return true;
@@ -233,38 +241,44 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
// Find lowest available VGPR and use it as VGPR reserved for SGPR spills.
static bool lowerShiftReservedVGPR(MachineFunction &MF,
const GCNSubtarget &ST) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
- MachineFrameInfo &FrameInfo = MF.getFrameInfo();
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
- Register LowestAvailableVGPR, ReservedVGPR;
- ArrayRef<MCPhysReg> AllVGPR32s = ST.getRegisterInfo()->getAllVGPR32(MF);
- for (MCPhysReg Reg : AllVGPR32s) {
- if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) {
- LowestAvailableVGPR = Reg;
- break;
- }
- }
+ const Register PreReservedVGPR = FuncInfo->VGPRReservedForSGPRSpill;
+ // Early out if pre-reservation of a VGPR for SGPR spilling is disabled.
+ if (!PreReservedVGPR)
+ return false;
+ // If there are no free lower VGPRs available, default to using the
+ // pre-reserved register instead.
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ Register LowestAvailableVGPR =
+ TRI->findUnusedRegister(MF.getRegInfo(), &AMDGPU::VGPR_32RegClass, MF);
if (!LowestAvailableVGPR)
- return false;
+ LowestAvailableVGPR = PreReservedVGPR;
- ReservedVGPR = FuncInfo->VGPRReservedForSGPRSpill;
const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
- int i = 0;
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ Optional<int> FI;
+ // Check if we are reserving a CSR. Create a stack object for a possible spill
+ // in the function prologue.
+ if (FuncInfo->isCalleeSavedReg(CSRegs, LowestAvailableVGPR))
+ FI = FrameInfo.CreateSpillStackObject(4, Align(4));
+
+ // Find saved info about the pre-reserved register.
+ const auto *ReservedVGPRInfoItr =
+ llvm::find_if(FuncInfo->getSGPRSpillVGPRs(),
+ [PreReservedVGPR](const auto &SpillRegInfo) {
+ return SpillRegInfo.VGPR == PreReservedVGPR;
+ });
+
+ assert(ReservedVGPRInfoItr != FuncInfo->getSGPRSpillVGPRs().end());
+ auto Index =
+ std::distance(FuncInfo->getSGPRSpillVGPRs().begin(), ReservedVGPRInfoItr);
+
+ FuncInfo->setSGPRSpillVGPRs(LowestAvailableVGPR, FI, Index);
for (MachineBasicBlock &MBB : MF) {
- for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) {
- if (Reg.VGPR == ReservedVGPR) {
- MBB.removeLiveIn(ReservedVGPR);
- MBB.addLiveIn(LowestAvailableVGPR);
- Optional<int> FI;
- if (FuncInfo->isCalleeSavedReg(CSRegs, LowestAvailableVGPR))
- FI = FrameInfo.CreateSpillStackObject(4, Align(4));
-
- FuncInfo->setSGPRSpillVGPRs(LowestAvailableVGPR, FI, i);
- }
- ++i;
- }
+ assert(LowestAvailableVGPR.isValid() && "Did not find an available VGPR");
+ MBB.addLiveIn(LowestAvailableVGPR);
MBB.sortUniqueLiveIns();
}
@@ -300,11 +314,15 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
bool MadeChange = false;
const bool SpillToAGPR = EnableSpillVGPRToAGPR && ST.hasMAIInsts();
+ std::unique_ptr<RegScavenger> RS;
+
+ bool NewReservedRegs = false;
// TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be
// handled as SpilledToReg in regular PrologEpilogInserter.
- if ((TRI->spillSGPRToVGPR() && (HasCSRs || FuncInfo->hasSpilledSGPRs())) ||
- SpillVGPRToAGPR) {
+ const bool HasSGPRSpillToVGPR = TRI->spillSGPRToVGPR() &&
+ (HasCSRs || FuncInfo->hasSpilledSGPRs());
+ if (HasSGPRSpillToVGPR || SpillVGPRToAGPR) {
// Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
// are spilled to VGPRs, in which case we can eliminate the stack usage.
//
@@ -329,7 +347,13 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
TRI->isAGPR(MRI, VReg))) {
- TRI->eliminateFrameIndex(MI, 0, FIOp, nullptr);
+ NewReservedRegs = true;
+ if (!RS)
+ RS.reset(new RegScavenger());
+
+ // FIXME: change to enterBasicBlockEnd()
+ RS->enterBasicBlock(MBB);
+ TRI->eliminateFrameIndex(MI, 0, FIOp, RS.get());
continue;
}
}
@@ -340,6 +364,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
+ NewReservedRegs = true;
bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr);
(void)Spilled;
assert(Spilled && "failed to spill SGPR to VGPR when allocated");
@@ -368,5 +393,9 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
SaveBlocks.clear();
RestoreBlocks.clear();
+  // Update the reserved registers with any VGPRs added for SGPR spills.
+ if (NewReservedRegs)
+ MRI.freezeReservedRegs(MF);
+
return MadeChange;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 788e9873f780..9a0cdc7b1f4d 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -7,21 +7,7 @@
//===----------------------------------------------------------------------===//
#include "SIMachineFunctionInfo.h"
-#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUTargetMachine.h"
-#include "AMDGPUSubtarget.h"
-#include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/Function.h"
-#include <cassert>
-#include <vector>
#define MAX_LANES 64
@@ -75,16 +61,18 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
}
if (!isEntryFunction()) {
- // Non-entry functions have no special inputs for now, other registers
- // required for scratch access.
- ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
-
// TODO: Pick a high register, and shift down, similar to a kernel.
FrameOffsetReg = AMDGPU::SGPR33;
StackPtrOffsetReg = AMDGPU::SGPR32;
- ArgInfo.PrivateSegmentBuffer =
- ArgDescriptor::createRegister(ScratchRSrcReg);
+ if (!ST.enableFlatScratch()) {
+ // Non-entry functions have no special inputs for now, other registers
+ // required for scratch access.
+ ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
+
+ ArgInfo.PrivateSegmentBuffer =
+ ArgDescriptor::createRegister(ScratchRSrcReg);
+ }
if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
ImplicitArgPtr = true;
@@ -142,7 +130,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
if (isAmdHsaOrMesa) {
- PrivateSegmentBuffer = true;
+ if (!ST.enableFlatScratch())
+ PrivateSegmentBuffer = true;
if (UseFixedABI) {
DispatchPtr = true;
@@ -167,11 +156,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (UseFixedABI || F.hasFnAttribute("amdgpu-kernarg-segment-ptr"))
KernargSegmentPtr = true;
- if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) {
+ if (ST.hasFlatAddressSpace() && isEntryFunction() &&
+ (isAmdHsaOrMesa || ST.enableFlatScratch())) {
// TODO: This could be refined a lot. The attribute is a poor way of
// detecting calls or stack objects that may require it before argument
// lowering.
- if (HasCalls || HasStackObjects)
+ if (HasCalls || HasStackObjects || ST.enableFlatScratch())
FlatScratchInit = true;
}
@@ -352,6 +342,8 @@ bool SIMachineFunctionInfo::reserveVGPRforSGPRSpills(MachineFunction &MF) {
Register LaneVGPR = TRI->findUnusedRegister(
MF.getRegInfo(), &AMDGPU::VGPR_32RegClass, MF, true);
+ if (LaneVGPR == Register())
+ return false;
SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, None));
FuncInfo->VGPRReservedForSGPRSpill = LaneVGPR;
return true;
@@ -537,21 +529,21 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
}
yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
- const llvm::SIMachineFunctionInfo& MFI,
- const TargetRegisterInfo &TRI)
- : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
- MaxKernArgAlign(MFI.getMaxKernArgAlign()),
- LDSSize(MFI.getLDSSize()),
- IsEntryFunction(MFI.isEntryFunction()),
- NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
- MemoryBound(MFI.isMemoryBound()),
- WaveLimiter(MFI.needsWaveLimiter()),
- HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
- ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
- FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
- StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
- ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
- Mode(MFI.getMode()) {}
+ const llvm::SIMachineFunctionInfo &MFI, const TargetRegisterInfo &TRI)
+ : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
+ MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
+ DynLDSAlign(MFI.getDynLDSAlign()), IsEntryFunction(MFI.isEntryFunction()),
+ NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
+ MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
+ HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
+ HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
+ HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
+ Occupancy(MFI.getOccupancy()),
+ ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
+ FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
+ StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
+ ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)), Mode(MFI.getMode()) {
+}
void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
MappingTraits<SIMachineFunctionInfo>::mapping(YamlIO, *this);
@@ -562,11 +554,15 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
MaxKernArgAlign = assumeAligned(YamlMFI.MaxKernArgAlign);
LDSSize = YamlMFI.LDSSize;
+ DynLDSAlign = YamlMFI.DynLDSAlign;
HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
+ Occupancy = YamlMFI.Occupancy;
IsEntryFunction = YamlMFI.IsEntryFunction;
NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath;
MemoryBound = YamlMFI.MemoryBound;
WaveLimiter = YamlMFI.WaveLimiter;
+ HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
+ HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
return false;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index cf1629fda0af..35fb43162199 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -17,28 +17,17 @@
#include "AMDGPUMachineFunction.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/SparseBitVector.h"
#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/Support/ErrorHandling.h"
-#include <array>
-#include <cassert>
-#include <utility>
-#include <vector>
+#include "llvm/Support/raw_ostream.h"
namespace llvm {
class MachineFrameInfo;
class MachineFunction;
class TargetRegisterClass;
+class SIMachineFunctionInfo;
+class SIRegisterInfo;
class AMDGPUPseudoSourceValue : public PseudoSourceValue {
public:
@@ -76,6 +65,8 @@ public:
static bool classof(const PseudoSourceValue *V) {
return V->kind() == PSVBuffer;
}
+
+ void printCustom(raw_ostream &OS) const override { OS << "BufferResource"; }
};
class AMDGPUImagePseudoSourceValue final : public AMDGPUPseudoSourceValue {
@@ -87,6 +78,8 @@ public:
static bool classof(const PseudoSourceValue *V) {
return V->kind() == PSVImage;
}
+
+ void printCustom(raw_ostream &OS) const override { OS << "ImageResource"; }
};
class AMDGPUGWSResourcePseudoSourceValue final : public AMDGPUPseudoSourceValue {
@@ -277,12 +270,18 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
uint64_t ExplicitKernArgSize = 0;
unsigned MaxKernArgAlign = 0;
unsigned LDSSize = 0;
+ Align DynLDSAlign;
bool IsEntryFunction = false;
bool NoSignedZerosFPMath = false;
bool MemoryBound = false;
bool WaveLimiter = false;
+ bool HasSpilledSGPRs = false;
+ bool HasSpilledVGPRs = false;
uint32_t HighBitsOf32BitAddress = 0;
+ // TODO: 10 may be a better default since it's the maximum.
+ unsigned Occupancy = 0;
+
StringValue ScratchRSrcReg = "$private_rsrc_reg";
StringValue FrameOffsetReg = "$fp_reg";
StringValue StackPtrOffsetReg = "$sp_reg";
@@ -304,10 +303,13 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
UINT64_C(0));
YamlIO.mapOptional("maxKernArgAlign", MFI.MaxKernArgAlign, 0u);
YamlIO.mapOptional("ldsSize", MFI.LDSSize, 0u);
+ YamlIO.mapOptional("dynLDSAlign", MFI.DynLDSAlign, Align());
YamlIO.mapOptional("isEntryFunction", MFI.IsEntryFunction, false);
YamlIO.mapOptional("noSignedZerosFPMath", MFI.NoSignedZerosFPMath, false);
YamlIO.mapOptional("memoryBound", MFI.MemoryBound, false);
YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false);
+ YamlIO.mapOptional("hasSpilledSGPRs", MFI.HasSpilledSGPRs, false);
+ YamlIO.mapOptional("hasSpilledVGPRs", MFI.HasSpilledVGPRs, false);
YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg,
StringValue("$private_rsrc_reg"));
YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg,
@@ -318,6 +320,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("mode", MFI.Mode, SIMode());
YamlIO.mapOptional("highBitsOf32BitAddress",
MFI.HighBitsOf32BitAddress, 0u);
+ YamlIO.mapOptional("occupancy", MFI.Occupancy, 0);
}
};
@@ -370,10 +373,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
// unit. Minimum - first, maximum - second.
std::pair<unsigned, unsigned> WavesPerEU = {0, 0};
- DenseMap<const Value *,
- std::unique_ptr<const AMDGPUBufferPseudoSourceValue>> BufferPSVs;
- DenseMap<const Value *,
- std::unique_ptr<const AMDGPUImagePseudoSourceValue>> ImagePSVs;
+ std::unique_ptr<const AMDGPUBufferPseudoSourceValue> BufferPSV;
+ std::unique_ptr<const AMDGPUImagePseudoSourceValue> ImagePSV;
std::unique_ptr<const AMDGPUGWSResourcePseudoSourceValue> GWSResourcePSV;
private:
@@ -684,9 +685,9 @@ public:
return ArgInfo.getPreloadedValue(Value);
}
- Register getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const {
+ MCRegister getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const {
auto Arg = std::get<0>(ArgInfo.getPreloadedValue(Value));
- return Arg ? Arg->getRegister() : Register();
+ return Arg ? Arg->getRegister() : MCRegister();
}
unsigned getGITPtrHigh() const {
@@ -884,22 +885,18 @@ public:
return LDSWaveSpillSize;
}
- const AMDGPUBufferPseudoSourceValue *getBufferPSV(const SIInstrInfo &TII,
- const Value *BufferRsrc) {
- assert(BufferRsrc);
- auto PSV = BufferPSVs.try_emplace(
- BufferRsrc,
- std::make_unique<AMDGPUBufferPseudoSourceValue>(TII));
- return PSV.first->second.get();
+ const AMDGPUBufferPseudoSourceValue *getBufferPSV(const SIInstrInfo &TII) {
+ if (!BufferPSV)
+ BufferPSV = std::make_unique<AMDGPUBufferPseudoSourceValue>(TII);
+
+ return BufferPSV.get();
}
- const AMDGPUImagePseudoSourceValue *getImagePSV(const SIInstrInfo &TII,
- const Value *ImgRsrc) {
- assert(ImgRsrc);
- auto PSV = ImagePSVs.try_emplace(
- ImgRsrc,
- std::make_unique<AMDGPUImagePseudoSourceValue>(TII));
- return PSV.first->second.get();
+ const AMDGPUImagePseudoSourceValue *getImagePSV(const SIInstrInfo &TII) {
+ if (!ImagePSV)
+ ImagePSV = std::make_unique<AMDGPUImagePseudoSourceValue>(TII);
+
+ return ImagePSV.get();
}
const AMDGPUGWSResourcePseudoSourceValue *getGWSPSV(const SIInstrInfo &TII) {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
index 3ba05aadbbbe..278dd05b049c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -12,29 +12,10 @@
//===----------------------------------------------------------------------===//
#include "SIMachineScheduler.h"
-#include "AMDGPU.h"
#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineScheduler.h"
-#include "llvm/CodeGen/RegisterPressure.h"
-#include "llvm/CodeGen/SlotIndexes.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-#include <map>
-#include <set>
-#include <utility>
-#include <vector>
using namespace llvm;
@@ -375,8 +356,8 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7
// The use of findDefBetween removes the case 4.
for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
- unsigned Reg = RegMaskPair.RegUnit;
- if (Register::isVirtualRegister(Reg) &&
+ Register Reg = RegMaskPair.RegUnit;
+ if (Reg.isVirtual() &&
isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
LIS)) {
@@ -763,8 +744,7 @@ void SIScheduleBlockCreator::colorHighLatenciesGroups() {
// depend (order dependency) on one of the
// instruction in the block, and are required for the
// high latency instruction we add.
- AdditionalElements.insert(AdditionalElements.end(),
- SubGraph.begin(), SubGraph.end());
+ llvm::append_range(AdditionalElements, SubGraph);
}
}
if (CompatibleGroup) {
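llvm::append_range (llvm/ADT/STLExtras.h) is shorthand for the container insert-at-end idiom it replaces above; a minimal sketch:

    #include "llvm/ADT/STLExtras.h"
    #include <vector>

    void collect(std::vector<int> &AdditionalElements, const std::vector<int> &SubGraph) {
      // Equivalent to:
      //   AdditionalElements.insert(AdditionalElements.end(),
      //                             SubGraph.begin(), SubGraph.end());
      llvm::append_range(AdditionalElements, SubGraph);
    }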
@@ -1682,9 +1662,9 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
// Tracking of currently alive registers to determine VGPR Usage.
void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
- for (unsigned Reg : Regs) {
+ for (Register Reg : Regs) {
// For now only track virtual registers.
- if (!Register::isVirtualRegister(Reg))
+ if (!Reg.isVirtual())
continue;
// If not already in the live set, then add it.
(void) LiveRegs.insert(Reg);
@@ -1742,9 +1722,9 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
std::vector<int> DiffSetPressure;
DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
- for (unsigned Reg : InRegs) {
+ for (Register Reg : InRegs) {
// For now only track virtual registers.
- if (!Register::isVirtualRegister(Reg))
+ if (!Reg.isVirtual())
continue;
if (LiveRegsConsumers[Reg] > 1)
continue;
@@ -1754,9 +1734,9 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
}
}
- for (unsigned Reg : OutRegs) {
+ for (Register Reg : OutRegs) {
// For now only track virtual registers.
- if (!Register::isVirtualRegister(Reg))
+ if (!Reg.isVirtual())
continue;
PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
for (; PSetI.isValid(); ++PSetI) {
@@ -1902,9 +1882,9 @@ SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
VgprUsage = 0;
SgprUsage = 0;
for (_Iterator RegI = First; RegI != End; ++RegI) {
- unsigned Reg = *RegI;
+ Register Reg = *RegI;
// For now only track virtual registers
- if (!Register::isVirtualRegister(Reg))
+ if (!Reg.isVirtual())
continue;
PSetIterator PSetI = MRI.getPressureSets(Reg);
for (; PSetI.isValid(); ++PSetI) {
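The recurring change in these hunks replaces the static helper with the member predicate on the Register wrapper; a minimal illustration (only llvm/CodeGen/Register.h assumed):

    #include "llvm/CodeGen/Register.h"

    // True for virtual registers; replaces Register::isVirtualRegister(Reg)
    // on a raw unsigned in the old code.
    bool isTrackedReg(llvm::Register Reg) {
      return Reg.isVirtual();
    }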
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
index 02e0a3fe1b61..a2f5a1453d6a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
@@ -14,20 +14,18 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
#define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
-#include "SIInstrInfo.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
-#include <cassert>
#include <cstdint>
-#include <map>
-#include <memory>
#include <set>
#include <vector>
namespace llvm {
+class SIInstrInfo;
+class SIRegisterInfo;
+
enum SIScheduleCandReason {
NoCand,
RegUsage,
@@ -455,7 +453,7 @@ public:
MachineRegisterInfo *getMRI() { return &MRI; }
const TargetRegisterInfo *getTRI() { return TRI; }
ScheduleDAGTopologicalSort *GetTopo() { return &Topo; }
- SUnit& getEntrySU() { return EntrySU; }
+ SUnit &getEntrySU() { return EntrySU; }
SUnit& getExitSU() { return ExitSU; }
void restoreSULinksLeft();
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 4e6c72ca20e2..3caa75e4d958 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -15,31 +15,13 @@
#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
-#include "AMDGPUSubtarget.h"
-#include "SIDefines.h"
-#include "SIInstrInfo.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
-#include "llvm/Support/MathExtras.h"
-#include <cassert>
-#include <list>
+#include "llvm/Support/TargetParser.h"
using namespace llvm;
using namespace llvm::AMDGPU;
@@ -47,6 +29,10 @@ using namespace llvm::AMDGPU;
#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"
+static cl::opt<bool> AmdgcnSkipCacheInvalidations(
+ "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
+ cl::desc("Use this to skip inserting cache invalidating instructions."));
+
namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
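The new flag follows the usual cl::opt pattern: declared once at file scope and read like a bool wherever the pass needs it. A hedged sketch with an illustrative flag name, not the one added above:

    #include "llvm/Support/CommandLine.h"

    // Illustrative hidden boolean option.
    static llvm::cl::opt<bool> SkipSomething(
        "example-skip-something", llvm::cl::init(false), llvm::cl::Hidden,
        llvm::cl::desc("Example of a hidden boolean option."));

    bool shouldDoSomething() { return !SkipSomething; }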
@@ -125,6 +111,7 @@ private:
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
bool IsCrossAddressSpaceOrdering = false;
+ bool IsVolatile = false;
bool IsNonTemporal = false;
SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
@@ -134,11 +121,13 @@ private:
bool IsCrossAddressSpaceOrdering = true,
AtomicOrdering FailureOrdering =
AtomicOrdering::SequentiallyConsistent,
+ bool IsVolatile = false,
bool IsNonTemporal = false)
: Ordering(Ordering), FailureOrdering(FailureOrdering),
Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
InstrAddrSpace(InstrAddrSpace),
IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
+ IsVolatile(IsVolatile),
IsNonTemporal(IsNonTemporal) {
// There is also no cross address space ordering if the ordering
// address space is the same as the instruction address space and
@@ -186,7 +175,13 @@ public:
}
/// \returns True if memory access of the machine instruction used to
- /// create this SIMemOpInfo is non-temporal, false otherwise.
+ /// create this SIMemOpInfo is volatile, false otherwise.
+ bool isVolatile() const {
+ return IsVolatile;
+ }
+
+ /// \returns True if memory access of the machine instruction used to
+ /// create this SIMemOpInfo is nontemporal, false otherwise.
bool isNonTemporal() const {
return IsNonTemporal;
}
@@ -249,12 +244,15 @@ public:
class SICacheControl {
protected:
+ /// AMDGPU subtarget info.
+ const GCNSubtarget &ST;
+
/// Instruction info.
const SIInstrInfo *TII = nullptr;
IsaVersion IV;
- /// Whether to insert cache invalidation instructions.
+ /// Whether to insert cache invalidating instructions.
bool InsertCacheInv;
SICacheControl(const GCNSubtarget &ST);
@@ -271,28 +269,21 @@ public:
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const = 0;
- /// Update \p MI memory instruction to indicate it is
- /// nontemporal. Return true iff the instruction was modified.
- virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
- const = 0;
+ /// Update \p MI memory instruction of kind \p Op associated with address
+ /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
+ /// true iff the instruction was modified.
+ virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
+ SIAtomicAddrSpace AddrSpace,
+ SIMemOp Op, bool IsVolatile,
+ bool IsNonTemporal) const = 0;
/// Inserts any necessary instructions at position \p Pos relative
- /// to instruction \p MI to ensure any caches associated with
- /// address spaces \p AddrSpace for memory scopes up to memory scope
- /// \p Scope are invalidated. Returns true iff any instructions
- /// inserted.
- virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const = 0;
-
- /// Inserts any necessary instructions at position \p Pos relative
- /// to instruction \p MI to ensure memory instructions of kind \p Op
- /// associated with address spaces \p AddrSpace have completed as
- /// observed by other memory instructions executing in memory scope
- /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory
- /// ordering is between address spaces. Returns true iff any
- /// instructions inserted.
+ /// to instruction \p MI to ensure memory instructions before \p Pos of kind
+ /// \p Op associated with address spaces \p AddrSpace have completed. Used
+ /// between memory instructions to enforce the order they become visible as
+ /// observed by other memory instructions executing in memory scope \p Scope.
+ /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
+ /// address spaces. Returns true iff any instructions inserted.
virtual bool insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
@@ -300,6 +291,28 @@ public:
bool IsCrossAddrSpaceOrdering,
Position Pos) const = 0;
+ /// Inserts any necessary instructions at position \p Pos relative to
+ /// instruction \p MI to ensure any subsequent memory instructions of this
+ /// thread with address spaces \p AddrSpace will observe the previous memory
+ /// operations by any thread for memory scopes up to memory scope \p Scope.
+ /// Returns true iff any instructions inserted.
+ virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const = 0;
+
+ /// Inserts any necessary instructions at position \p Pos relative to
+ /// instruction \p MI to ensure previous memory instructions by this thread
+ /// with address spaces \p AddrSpace have completed and can be observed by
+ /// subsequent memory instructions by any thread executing in memory scope \p
+ /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
+ /// between address spaces. Returns true iff any instructions inserted.
+ virtual bool insertRelease(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const = 0;
+
/// Virtual destructor to allow derivations to be deleted.
virtual ~SICacheControl() = default;
@@ -328,12 +341,10 @@ public:
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const override;
- bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
-
- bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const override;
+ bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
+ SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile,
+ bool IsNonTemporal) const override;
bool insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -341,6 +352,17 @@ public:
SIMemOp Op,
bool IsCrossAddrSpaceOrdering,
Position Pos) const override;
+
+ bool insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const override;
+
+ bool insertRelease(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const override;
};
class SIGfx7CacheControl : public SIGfx6CacheControl {
@@ -348,16 +370,15 @@ public:
SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};
- bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const override;
+ bool insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const override;
};
class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:
- bool CuMode = false;
/// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
/// is modified, false otherwise.
@@ -367,19 +388,16 @@ protected:
public:
- SIGfx10CacheControl(const GCNSubtarget &ST, bool CuMode) :
- SIGfx7CacheControl(ST), CuMode(CuMode) {};
+ SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const override;
- bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
-
- bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const override;
+ bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
+ SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile,
+ bool IsNonTemporal) const override;
bool insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -387,6 +405,11 @@ public:
SIMemOp Op,
bool IsCrossAddrSpaceOrdering,
Position Pos) const override;
+
+ bool insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const override;
};
class SIMemoryLegalizer final : public MachineFunctionPass {
@@ -525,11 +548,13 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
bool IsNonTemporal = true;
+ bool IsVolatile = false;
// Validator should check whether or not MMOs cover the entire set of
// locations accessed by the memory instruction.
for (const auto &MMO : MI->memoperands()) {
IsNonTemporal &= MMO->isNonTemporal();
+ IsVolatile |= MMO->isVolatile();
InstrAddrSpace |=
toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
AtomicOrdering OpOrdering = MMO->getOrdering();
@@ -572,7 +597,8 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
}
}
return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
- IsCrossAddressSpaceOrdering, FailureOrdering, IsNonTemporal);
+ IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
+ IsNonTemporal);
}
Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
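The loop above folds the flags of every memory operand conservatively: the access is nontemporal only if all MMOs are nontemporal, and volatile if any MMO is volatile. A minimal sketch of the same rule:

    #include <vector>

    struct MMOFlags { bool IsNonTemporal; bool IsVolatile; };

    void combineFlags(const std::vector<MMOFlags> &MMOs,
                      bool &IsNonTemporal, bool &IsVolatile) {
      IsNonTemporal = true;   // cleared by any MMO that is not nontemporal
      IsVolatile = false;     // set by any MMO that is volatile
      for (const MMOFlags &MMO : MMOs) {
        IsNonTemporal &= MMO.IsNonTemporal;
        IsVolatile |= MMO.IsVolatile;
      }
    }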
@@ -650,10 +676,10 @@ Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
return constructFromMIWithMMO(MI);
}
-SICacheControl::SICacheControl(const GCNSubtarget &ST) {
+SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
TII = ST.getInstrInfo();
IV = getIsaVersion(ST.getCPU());
- InsertCacheInv = !ST.isAmdPalOS();
+ InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}
/* static */
@@ -663,7 +689,7 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
return std::make_unique<SIGfx6CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX10)
return std::make_unique<SIGfx7CacheControl>(ST);
- return std::make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled());
+ return std::make_unique<SIGfx10CacheControl>(ST);
}
bool SIGfx6CacheControl::enableLoadCacheBypass(
@@ -674,9 +700,6 @@ bool SIGfx6CacheControl::enableLoadCacheBypass(
bool Changed = false;
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
- /// TODO: Do not set glc for rmw atomic operations as they
- /// implicitly bypass the L1 cache.
-
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
@@ -697,64 +720,48 @@ bool SIGfx6CacheControl::enableLoadCacheBypass(
/// sequentially consistent, and no other thread can access scratch
/// memory.
- /// Other address spaces do not hava a cache.
+ /// Other address spaces do not have a cache.
return Changed;
}
-bool SIGfx6CacheControl::enableNonTemporal(
- const MachineBasicBlock::iterator &MI) const {
+bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
+ MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile, bool IsNonTemporal) const {
+ // Only handle load and store, not atomic read-modify-write instructions. The
+ // latter use glc to indicate if the atomic returns a result and so must not
+ // be used for cache control.
assert(MI->mayLoad() ^ MI->mayStore());
- bool Changed = false;
-
- /// TODO: Do not enableGLCBit if rmw atomic.
- Changed |= enableGLCBit(MI);
- Changed |= enableSLCBit(MI);
- return Changed;
-}
-
-bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
- if (!InsertCacheInv)
- return false;
+ // Only update load and store, not LLVM IR atomic read-modify-write
+ // instructions. The latter are always marked as volatile, which cannot
+ // sensibly be handled here without pessimizing all atomics. They also do not
+ // support the nontemporal attribute.
+ assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
bool Changed = false;
- MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
+ if (IsVolatile) {
+ if (Op == SIMemOp::LOAD)
+ Changed |= enableGLCBit(MI);
- if (Pos == Position::AFTER)
- ++MI;
+ // Ensure operation has completed at system scope to cause all volatile
+ // operations to be visible outside the program in a global order. Do not
+ // request cross address space as only the global address space can be
+ // observable outside the program, so no need to cause a waitcnt for LDS
+ // address space operations.
+ Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
+ Position::AFTER);
- if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- case SIAtomicScope::AGENT:
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
- Changed = true;
- break;
- case SIAtomicScope::WORKGROUP:
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to invalidate.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
- }
+ return Changed;
}
- /// The scratch address space does not need the global memory cache
- /// to be flushed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
-
- /// Other address spaces do not hava a cache.
-
- if (Pos == Position::AFTER)
- --MI;
+ if (IsNonTemporal) {
+ // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
+ Changed |= enableGLCBit(MI);
+ Changed |= enableSLCBit(MI);
+ return Changed;
+ }
return Changed;
}
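A condensed view of the GFX6 policy implemented above for plain (non-atomic) loads and stores; the enum and function are illustrative, not LLVM code:

    enum class CacheBits { None, GLC, GLC_SLC };

    CacheBits gfx6CachePolicy(bool IsLoad, bool IsVolatile, bool IsNonTemporal) {
      if (IsVolatile)                      // also followed by an s_waitcnt after the access
        return IsLoad ? CacheBits::GLC : CacheBits::None;
      if (IsNonTemporal)                   // L1 MISS_EVICT, L2 STREAM
        return CacheBits::GLC_SLC;
      return CacheBits::None;
    }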
@@ -776,7 +783,8 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
bool VMCnt = false;
bool LGKMCnt = false;
- if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
+ SIAtomicAddrSpace::NONE) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
@@ -798,12 +806,12 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
case SIAtomicScope::WORKGROUP:
- // If no cross address space ordering then an LDS waitcnt is not
- // needed as LDS operations for all waves are executed in a
- // total global ordering as observed by all waves. Required if
- // also synchronizing with global/GDS memory as LDS operations
- // could be reordered with respect to later global/GDS memory
- // operations of the same wave.
+ // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
+ // not needed as LDS operations for all waves are executed in a total
+ // global ordering as observed by all waves. Required if also
+ // synchronizing with global/GDS memory as LDS operations could be
+ // reordered with respect to later global/GDS memory operations of the
+ // same wave.
LGKMCnt |= IsCrossAddrSpaceOrdering;
break;
case SIAtomicScope::WAVEFRONT:
@@ -820,12 +828,12 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
- // If no cross address space ordering then an GDS waitcnt is not
- // needed as GDS operations for all waves are executed in a
- // total global ordering as observed by all waves. Required if
- // also synchronizing with global/LDS memory as GDS operations
- // could be reordered with respect to later global/LDS memory
- // operations of the same wave.
+ // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
+ // is not needed as GDS operations for all waves are executed in a total
+ // global ordering as observed by all waves. Required if also
+ // synchronizing with global/LDS memory as GDS operations could be
+ // reordered with respect to later global/LDS memory operations of the
+ // same wave.
LGKMCnt |= IsCrossAddrSpaceOrdering;
break;
case SIAtomicScope::WORKGROUP:
@@ -855,10 +863,64 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
return Changed;
}
-bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
+bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
+ if (!InsertCacheInv)
+ return false;
+
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
+ Changed = true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to invalidate.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory cache
+ /// to be flushed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ return Changed;
+}
+
+bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const {
+ return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
+ IsCrossAddrSpaceOrdering, Pos);
+}
+
+bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
if (!InsertCacheInv)
return false;
@@ -869,9 +931,9 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
- const unsigned Flush = STM.isAmdPalOS() || STM.isMesa3DOS()
- ? AMDGPU::BUFFER_WBINVL1
- : AMDGPU::BUFFER_WBINVL1_VOL;
+ const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
+ ? AMDGPU::BUFFER_WBINVL1
+ : AMDGPU::BUFFER_WBINVL1_VOL;
if (Pos == Position::AFTER)
++MI;
@@ -880,7 +942,7 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
- BuildMI(MBB, MI, DL, TII->get(Flush));
+ BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
Changed = true;
break;
case SIAtomicScope::WORKGROUP:
@@ -898,7 +960,7 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
/// sequentially consistent, and no other thread can access scratch
/// memory.
- /// Other address spaces do not hava a cache.
+ /// Other address spaces do not have a cache.
if (Pos == Position::AFTER)
--MI;
@@ -926,9 +988,9 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
case SIAtomicScope::WORKGROUP:
// In WGP mode the waves of a work-group can be executing on either CU of
// the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
- // CU mode and all waves of a work-group are on the same CU, and so the
- // L0 does not need to be bypassed.
- if (!CuMode) Changed |= enableGLCBit(MI);
+ // CU mode all waves of a work-group are on the same CU, and so the L0
+ // does not need to be bypassed.
+ if (!ST.isCuModeEnabled()) Changed |= enableGLCBit(MI);
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
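The cached CuMode flag is dropped in favour of querying the subtarget directly; the underlying rule is that in WGP mode the two CUs of a workgroup processor each have their own L0, so workgroup-scope loads must bypass it, while in CU mode one L0 is shared. A condensed sketch:

    // Whether a workgroup-scope GFX10 load needs glc to bypass the per-CU L0.
    bool needsL0BypassForWorkgroup(bool CuModeEnabled) {
      // WGP mode: waves of one work-group may run on either CU (separate L0s).
      // CU mode: all waves of the work-group share a single CU and its L0.
      return !CuModeEnabled;
    }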
@@ -944,73 +1006,50 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
/// sequentially consistent, and no other thread can access scratch
/// memory.
- /// Other address spaces do not hava a cache.
+ /// Other address spaces do not have a cache.
return Changed;
}
-bool SIGfx10CacheControl::enableNonTemporal(
- const MachineBasicBlock::iterator &MI) const {
+bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
+ MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile, bool IsNonTemporal) const {
+
+ // Only handle load and store, not atomic read-modify-write instructions. The
+ // latter use glc to indicate if the atomic returns a result and so must not
+ // be used for cache control.
assert(MI->mayLoad() ^ MI->mayStore());
- bool Changed = false;
- Changed |= enableSLCBit(MI);
- /// TODO for store (non-rmw atomic) instructions also enableGLCBit(MI)
-
- return Changed;
-}
-
-bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
- if (!InsertCacheInv)
- return false;
+ // Only update load and store, not LLVM IR atomic read-modify-write
+ // instructions. The latter are always marked as volatile, which cannot
+ // sensibly be handled here without pessimizing all atomics. They also do not
+ // support the nontemporal attribute.
+ assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
bool Changed = false;
- MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
-
- if (Pos == Position::AFTER)
- ++MI;
+ if (IsVolatile) {
- if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
- switch (Scope) {
- case SIAtomicScope::SYSTEM:
- case SIAtomicScope::AGENT:
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
- Changed = true;
- break;
- case SIAtomicScope::WORKGROUP:
- // In WGP mode the waves of a work-group can be executing on either CU of
- // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
- // in CU mode and all waves of a work-group are on the same CU, and so the
- // L0 does not need to be invalidated.
- if (!CuMode) {
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
- Changed = true;
- }
- break;
- case SIAtomicScope::WAVEFRONT:
- case SIAtomicScope::SINGLETHREAD:
- // No cache to invalidate.
- break;
- default:
- llvm_unreachable("Unsupported synchronization scope");
+ if (Op == SIMemOp::LOAD) {
+ Changed |= enableGLCBit(MI);
+ Changed |= enableDLCBit(MI);
}
- }
-
- /// The scratch address space does not need the global memory cache
- /// to be flushed as all memory operations by the same thread are
- /// sequentially consistent, and no other thread can access scratch
- /// memory.
- /// Other address spaces do not hava a cache.
+ // Ensure operation has completed at system scope to cause all volatile
+ // operations to be visible outside the program in a global order. Do not
+ // request cross address space as only the global address space can be
+ // observable outside the program, so no need to cause a waitcnt for LDS
+ // address space operations.
+ Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
+ Position::AFTER);
+ return Changed;
+ }
- if (Pos == Position::AFTER)
- --MI;
+ if (IsNonTemporal) {
+ // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
+ Changed |= enableSLCBit(MI);
+ return Changed;
+ }
return Changed;
}
@@ -1033,7 +1072,8 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
bool VSCnt = false;
bool LGKMCnt = false;
- if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
+ SIAtomicAddrSpace::NONE) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
@@ -1048,7 +1088,7 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
// they are visible to waves in the other CU as the L0 is per CU.
// Otherwise in CU mode and all waves of a work-group are on the same CU
// which shares the same L0.
- if (!CuMode) {
+ if (!ST.isCuModeEnabled()) {
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
VMCnt |= true;
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -1070,12 +1110,12 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
case SIAtomicScope::WORKGROUP:
- // If no cross address space ordering then an LDS waitcnt is not
- // needed as LDS operations for all waves are executed in a
- // total global ordering as observed by all waves. Required if
- // also synchronizing with global/GDS memory as LDS operations
- // could be reordered with respect to later global/GDS memory
- // operations of the same wave.
+ // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
+ // not needed as LDS operations for all waves are executed in a total
+ // global ordering as observed by all waves. Required if also
+ // synchronizing with global/GDS memory as LDS operations could be
+ // reordered with respect to later global/GDS memory operations of the
+ // same wave.
LGKMCnt |= IsCrossAddrSpaceOrdering;
break;
case SIAtomicScope::WAVEFRONT:
@@ -1092,12 +1132,12 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
- // If no cross address space ordering then an GDS waitcnt is not
- // needed as GDS operations for all waves are executed in a
- // total global ordering as observed by all waves. Required if
- // also synchronizing with global/LDS memory as GDS operations
- // could be reordered with respect to later global/LDS memory
- // operations of the same wave.
+ // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
+ // is not needed as GDS operations for all waves are executed in a total
+ // global ordering as observed by all waves. Required if also
+ // synchronizing with global/LDS memory as GDS operations could be
+ // reordered with respect to later global/LDS memory operations of the
+ // same wave.
LGKMCnt |= IsCrossAddrSpaceOrdering;
break;
case SIAtomicScope::WORKGROUP:
@@ -1134,6 +1174,61 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
return Changed;
}
+bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
+ if (!InsertCacheInv)
+ return false;
+
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
+ Changed = true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In WGP mode the waves of a work-group can be executing on either CU of
+ // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
+ // in CU mode all waves of a work-group are on the same CU, and so the
+ // L0 does not need to be invalidated.
+ if (!ST.isCuModeEnabled()) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
+ Changed = true;
+ }
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to invalidate.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory cache
+ /// to be flushed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ return Changed;
+}
+
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
if (AtomicPseudoMIs.empty())
return false;
@@ -1173,20 +1268,20 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
SIMemOp::LOAD,
MOI.getIsCrossAddressSpaceOrdering(),
Position::AFTER);
- Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
- MOI.getOrderingAddrSpace(),
- Position::AFTER);
+ Changed |= CC->insertAcquire(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ Position::AFTER);
}
return Changed;
}
- // Atomic instructions do not have the nontemporal attribute.
- if (MOI.isNonTemporal()) {
- Changed |= CC->enableNonTemporal(MI);
- return Changed;
- }
-
+ // Atomic instructions already bypass caches to the scope specified by the
+ // SyncScope operand. Only non-atomic volatile and nontemporal instructions
+ // need additional treatment.
+ Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
+ SIMemOp::LOAD, MOI.isVolatile(),
+ MOI.isNonTemporal());
return Changed;
}
@@ -1199,21 +1294,20 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
if (MOI.isAtomic()) {
if (MOI.getOrdering() == AtomicOrdering::Release ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= CC->insertWait(MI, MOI.getScope(),
- MOI.getOrderingAddrSpace(),
- SIMemOp::LOAD | SIMemOp::STORE,
- MOI.getIsCrossAddressSpaceOrdering(),
- Position::BEFORE);
+ Changed |= CC->insertRelease(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE);
return Changed;
}
- // Atomic instructions do not have the nontemporal attribute.
- if (MOI.isNonTemporal()) {
- Changed |= CC->enableNonTemporal(MI);
- return Changed;
- }
-
+ // Atomic instructions already bypass caches to the scope specified by the
+ // SyncScope operand. Only non-atomic volatile and nontemporal instructions
+ // need additional treatment.
+ Changed |= CC->enableVolatileAndOrNonTemporal(
+ MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
+ MOI.isNonTemporal());
return Changed;
}
@@ -1235,19 +1329,23 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
/// ordering and memory scope, then library does not need to
/// generate a fence. Could add support in this file for
/// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
- /// adding waitcnt before a S_BARRIER.
- Changed |= CC->insertWait(MI, MOI.getScope(),
- MOI.getOrderingAddrSpace(),
- SIMemOp::LOAD | SIMemOp::STORE,
- MOI.getIsCrossAddressSpaceOrdering(),
- Position::BEFORE);
+ /// adding S_WAITCNT before a S_BARRIER.
+ Changed |= CC->insertRelease(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE);
+
+ // TODO: If both release and invalidate are happening they could be combined
+ // to use the single "BUFFER_WBL2" instruction. This could be done by
+ // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
+ // track cache invalidate and write back instructions.
if (MOI.getOrdering() == AtomicOrdering::Acquire ||
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
- MOI.getOrderingAddrSpace(),
- Position::BEFORE);
+ Changed |= CC->insertAcquire(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ Position::BEFORE);
return Changed;
}
@@ -1266,11 +1364,10 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
- Changed |= CC->insertWait(MI, MOI.getScope(),
- MOI.getOrderingAddrSpace(),
- SIMemOp::LOAD | SIMemOp::STORE,
- MOI.getIsCrossAddressSpaceOrdering(),
- Position::BEFORE);
+ Changed |= CC->insertRelease(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE);
if (MOI.getOrdering() == AtomicOrdering::Acquire ||
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
@@ -1283,9 +1380,9 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
SIMemOp::STORE,
MOI.getIsCrossAddressSpaceOrdering(),
Position::AFTER);
- Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
- MOI.getOrderingAddrSpace(),
- Position::AFTER);
+ Changed |= CC->insertAcquire(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace(),
+ Position::AFTER);
}
return Changed;
@@ -1303,7 +1400,8 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
for (auto &MBB : MF) {
for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
- if (MI->getOpcode() == TargetOpcode::BUNDLE && MI->mayLoadOrStore()) {
+ // Unbundle instructions after the post-RA scheduler.
+ if (MI->isBundle()) {
MachineBasicBlock::instr_iterator II(MI->getIterator());
for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
I != E && I->isBundledWithPred(); ++I) {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
index 0e162ac42c11..3d659eca47db 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -14,20 +14,9 @@
//===----------------------------------------------------------------------===//
//
#include "AMDGPU.h"
-#include "AMDGPUInstrInfo.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
#include <queue>
#define DEBUG_TYPE "si-mode-register"
@@ -242,8 +231,10 @@ void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB,
Status IPChange;
for (MachineInstr &MI : MBB) {
Status InstrMode = getInstructionMode(MI, TII);
- if ((MI.getOpcode() == AMDGPU::S_SETREG_B32) ||
- (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32)) {
+ if (MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
+ MI.getOpcode() == AMDGPU::S_SETREG_B32_mode ||
+ MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
+ MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32_mode) {
// We preserve any explicit mode register setreg instruction we encounter,
// as we assume it has been inserted by a higher authority (this is
// likely to be a very rare occurrence).
@@ -267,7 +258,8 @@ void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB,
// If this is an immediate then we know the value being set, but if it is
// not an immediate then we treat the modified bits of the mode register
// as unknown.
- if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32) {
+ if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
+ MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32_mode) {
unsigned Val = TII->getNamedOperand(MI, AMDGPU::OpName::imm)->getImm();
unsigned Mode = (Val << Offset) & Mask;
Status Setreg = Status(Mask, Mode);
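Only the immediate forms let the pass know the exact mode bits being written; the shift-and-mask above computes them. A worked sketch with illustrative numbers:

    #include <cstdint>

    // Mode bits written by an S_SETREG*_IMM32* of Val into a field that starts
    // at bit Offset and is covered by Mask (Mask is already shifted into place).
    uint32_t setregModeBits(uint32_t Val, uint32_t Offset, uint32_t Mask) {
      return (Val << Offset) & Mask;
    }
    // e.g. Val = 0x3, Offset = 4, Mask = 0xF0  ->  0x30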
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index a9717c6ffb70..54f20912d0a9 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -7,15 +7,10 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIInstrInfo.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/InitializePasses.h"
-#include "llvm/Support/Debug.h"
using namespace llvm;
@@ -176,13 +171,17 @@ static unsigned getSaveExecOp(unsigned Opc) {
}
// These are only terminators to get correct spill code placement during
-// register allocation, so turn them back into normal instructions. Only one of
-// these is expected per block.
+// register allocation, so turn them back into normal instructions.
static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
switch (MI.getOpcode()) {
- case AMDGPU::S_MOV_B64_term:
case AMDGPU::S_MOV_B32_term: {
- MI.setDesc(TII.get(AMDGPU::COPY));
+ bool RegSrc = MI.getOperand(1).isReg();
+ MI.setDesc(TII.get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
+ return true;
+ }
+ case AMDGPU::S_MOV_B64_term: {
+ bool RegSrc = MI.getOperand(1).isReg();
+ MI.setDesc(TII.get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
return true;
}
case AMDGPU::S_XOR_B64_term: {
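The S_MOV_*_term pseudos are now lowered according to their source operand: a register source can become a plain COPY, while an immediate source must stay a real S_MOV. A minimal sketch of that choice (enumerators are illustrative):

    enum LoweredOpc { Copy, SMovB32 };

    LoweredOpc lowerMovTerm(bool SrcIsRegister) {
      // An immediate cannot be expressed as a COPY, so keep the move for it.
      return SrcIsRegister ? Copy : SMovB32;
    }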
@@ -197,6 +196,12 @@ static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
MI.setDesc(TII.get(AMDGPU::S_XOR_B32));
return true;
}
+ case AMDGPU::S_OR_B64_term: {
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(TII.get(AMDGPU::S_OR_B64));
+ return true;
+ }
case AMDGPU::S_OR_B32_term: {
// This is only a terminator to get the correct spill code placement during
// register allocation.
@@ -220,19 +225,29 @@ static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
}
}
+// Turn all pseudoterminators in the block into their equivalent non-terminator
+// instructions. Returns the reverse iterator to the first non-terminator
+// instruction in the block.
static MachineBasicBlock::reverse_iterator fixTerminators(
const SIInstrInfo &TII,
MachineBasicBlock &MBB) {
MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
+
+ bool Seen = false;
+ MachineBasicBlock::reverse_iterator FirstNonTerm = I;
for (; I != E; ++I) {
if (!I->isTerminator())
- return I;
+ return Seen ? FirstNonTerm : I;
- if (removeTerminatorBit(TII, *I))
- return I;
+ if (removeTerminatorBit(TII, *I)) {
+ if (!Seen) {
+ FirstNonTerm = I;
+ Seen = true;
+ }
+ }
}
- return E;
+ return FirstNonTerm;
}
static MachineBasicBlock::reverse_iterator findExecCopy(
@@ -291,8 +306,20 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (I == E)
continue;
- Register CopyToExec = isCopyToExec(*I, ST);
- if (!CopyToExec.isValid())
+ // It's possible to see other terminator copies after the exec copy. This
+ // can happen if control flow pseudos had their outputs used by phis.
+ Register CopyToExec;
+
+ unsigned SearchCount = 0;
+ const unsigned SearchLimit = 5;
+ while (I != E && SearchCount++ < SearchLimit) {
+ CopyToExec = isCopyToExec(*I, ST);
+ if (CopyToExec)
+ break;
+ ++I;
+ }
+
+ if (!CopyToExec)
continue;
// Scan backwards to find the def.
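Instead of requiring the copy-to-exec to be the first non-terminator, the code above now scans a bounded window of trailing instructions. A generic sketch of the bounded-scan idiom:

    #include <vector>

    // Return the first of at most Limit items satisfying Pred, or nullptr.
    template <typename T, typename PredT>
    const T *findWithinLimit(const std::vector<T> &Items, unsigned Limit, PredT Pred) {
      unsigned Count = 0;
      for (const T &Item : Items) {
        if (Count++ >= Limit)
          break;
        if (Pred(Item))
          return &Item;
      }
      return nullptr;
    }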
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index 8af00fcf62a8..162e96655df2 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -13,9 +13,8 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIInstrInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
@@ -31,6 +30,17 @@ private:
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
MachineRegisterInfo *MRI;
+ LiveIntervals *LIS;
+
+ unsigned AndOpc;
+ unsigned Andn2Opc;
+ unsigned OrSaveExecOpc;
+ unsigned XorTermrOpc;
+ MCRegister CondReg;
+ MCRegister ExecReg;
+
+ Register optimizeVcndVcmpPair(MachineBasicBlock &MBB);
+ bool optimizeElseBranch(MachineBasicBlock &MBB);
public:
static char ID;
@@ -68,11 +78,28 @@ FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() {
return new SIOptimizeExecMaskingPreRA();
}
-static bool isFullExecCopy(const MachineInstr& MI, const GCNSubtarget& ST) {
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+// See if there is a def between \p AndIdx and \p SelIdx that needs to live
+// beyond \p AndIdx.
+static bool isDefBetween(const LiveRange &LR, SlotIndex AndIdx,
+ SlotIndex SelIdx) {
+ LiveQueryResult AndLRQ = LR.Query(AndIdx);
+ return (!AndLRQ.isKill() && AndLRQ.valueIn() != LR.Query(SelIdx).valueOut());
+}
+
+// FIXME: Why do we bother trying to handle physical registers here?
+static bool isDefBetween(const SIRegisterInfo &TRI,
+ LiveIntervals *LIS, Register Reg,
+ const MachineInstr &Sel, const MachineInstr &And) {
+ SlotIndex AndIdx = LIS->getInstructionIndex(And);
+ SlotIndex SelIdx = LIS->getInstructionIndex(Sel);
+
+ if (Reg.isVirtual())
+ return isDefBetween(LIS->getInterval(Reg), AndIdx, SelIdx);
- if (MI.isFullCopy() && MI.getOperand(1).getReg() == Exec)
- return true;
+ for (MCRegUnitIterator UI(Reg.asMCReg(), &TRI); UI.isValid(); ++UI) {
+ if (isDefBetween(LIS->getRegUnit(*UI), AndIdx, SelIdx))
+ return true;
+ }
return false;
}
@@ -93,75 +120,71 @@ static bool isFullExecCopy(const MachineInstr& MI, const GCNSubtarget& ST) {
// lanes.
//
// Returns %cc register on success.
-static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
- const GCNSubtarget &ST,
- MachineRegisterInfo &MRI,
- LiveIntervals *LIS) {
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- const SIInstrInfo *TII = ST.getInstrInfo();
- bool Wave32 = ST.isWave32();
- const unsigned AndOpc = Wave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
- const unsigned Andn2Opc = Wave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
- const unsigned CondReg = Wave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
- const unsigned ExecReg = Wave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-
+Register
+SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
auto I = llvm::find_if(MBB.terminators(), [](const MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
return Opc == AMDGPU::S_CBRANCH_VCCZ ||
Opc == AMDGPU::S_CBRANCH_VCCNZ; });
if (I == MBB.terminators().end())
- return AMDGPU::NoRegister;
+ return Register();
- auto *And = TRI->findReachingDef(CondReg, AMDGPU::NoSubRegister,
- *I, MRI, LIS);
+ auto *And =
+ TRI->findReachingDef(CondReg, AMDGPU::NoSubRegister, *I, *MRI, LIS);
if (!And || And->getOpcode() != AndOpc ||
!And->getOperand(1).isReg() || !And->getOperand(2).isReg())
- return AMDGPU::NoRegister;
+ return Register();
MachineOperand *AndCC = &And->getOperand(1);
Register CmpReg = AndCC->getReg();
unsigned CmpSubReg = AndCC->getSubReg();
- if (CmpReg == ExecReg) {
+ if (CmpReg == Register(ExecReg)) {
AndCC = &And->getOperand(2);
CmpReg = AndCC->getReg();
CmpSubReg = AndCC->getSubReg();
- } else if (And->getOperand(2).getReg() != ExecReg) {
- return AMDGPU::NoRegister;
+ } else if (And->getOperand(2).getReg() != Register(ExecReg)) {
+ return Register();
}
- auto *Cmp = TRI->findReachingDef(CmpReg, CmpSubReg, *And, MRI, LIS);
+ auto *Cmp = TRI->findReachingDef(CmpReg, CmpSubReg, *And, *MRI, LIS);
if (!Cmp || !(Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e32 ||
Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e64) ||
Cmp->getParent() != And->getParent())
- return AMDGPU::NoRegister;
+ return Register();
MachineOperand *Op1 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src0);
MachineOperand *Op2 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src1);
if (Op1->isImm() && Op2->isReg())
std::swap(Op1, Op2);
if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1)
- return AMDGPU::NoRegister;
+ return Register();
Register SelReg = Op1->getReg();
- auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, MRI, LIS);
+ auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, *MRI, LIS);
if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
- return AMDGPU::NoRegister;
+ return Register();
if (TII->hasModifiersSet(*Sel, AMDGPU::OpName::src0_modifiers) ||
TII->hasModifiersSet(*Sel, AMDGPU::OpName::src1_modifiers))
- return AMDGPU::NoRegister;
+ return Register();
Op1 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src0);
Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1);
MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2);
if (!Op1->isImm() || !Op2->isImm() || !CC->isReg() ||
Op1->getImm() != 0 || Op2->getImm() != 1)
- return AMDGPU::NoRegister;
+ return Register();
+
+ Register CCReg = CC->getReg();
+
+ // If there was a def between the select and the and, we would need to move it
+ // to fold this.
+ if (isDefBetween(*TRI, LIS, CCReg, *Sel, *And))
+ return Register();
LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' << *Cmp << '\t'
<< *And);
- Register CCReg = CC->getReg();
LIS->RemoveMachineInstrFromMaps(*And);
MachineInstr *Andn2 =
BuildMI(MBB, *And, And->getDebugLoc(), TII->get(Andn2Opc),
@@ -180,8 +203,8 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
  // Try to remove the compare. The cmp value should not be used between the cmp
  // and s_and_b64 if it is VCC, or should simply be unused for any other register.
- if ((Register::isVirtualRegister(CmpReg) && MRI.use_nodbg_empty(CmpReg)) ||
- (CmpReg == CondReg &&
+ if ((CmpReg.isVirtual() && MRI->use_nodbg_empty(CmpReg)) ||
+ (CmpReg == Register(CondReg) &&
std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(),
[&](const MachineInstr &MI) {
return MI.readsRegister(CondReg, TRI);
@@ -192,7 +215,7 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
Cmp->eraseFromParent();
// Try to remove v_cndmask_b32.
- if (Register::isVirtualRegister(SelReg) && MRI.use_nodbg_empty(SelReg)) {
+ if (SelReg.isVirtual() && MRI->use_nodbg_empty(SelReg)) {
LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
LIS->RemoveMachineInstrFromMaps(*Sel);
@@ -203,6 +226,81 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
return CCReg;
}
+// Optimize sequence
+// %dst = S_OR_SAVEEXEC %src
+// ... instructions not modifying exec ...
+// %tmp = S_AND $exec, %dst
+// $exec = S_XOR_term $exec, %tmp
+// =>
+// %dst = S_OR_SAVEEXEC %src
+// ... instructions not modifying exec ...
+// $exec = S_XOR_term $exec, %dst
+//
+// Clean up potentially unnecessary code added for safety during
+// control flow lowering.
+//
+// Return whether any changes were made to MBB.
+bool SIOptimizeExecMaskingPreRA::optimizeElseBranch(MachineBasicBlock &MBB) {
+ if (MBB.empty())
+ return false;
+
+ // Check this is an else block.
+ auto First = MBB.begin();
+ MachineInstr &SaveExecMI = *First;
+ if (SaveExecMI.getOpcode() != OrSaveExecOpc)
+ return false;
+
+ auto I = llvm::find_if(MBB.terminators(), [this](const MachineInstr &MI) {
+ return MI.getOpcode() == XorTermrOpc;
+ });
+ if (I == MBB.terminators().end())
+ return false;
+
+ MachineInstr &XorTermMI = *I;
+ if (XorTermMI.getOperand(1).getReg() != Register(ExecReg))
+ return false;
+
+ Register SavedExecReg = SaveExecMI.getOperand(0).getReg();
+ Register DstReg = XorTermMI.getOperand(2).getReg();
+
+ // Find potentially unnecessary S_AND
+ MachineInstr *AndExecMI = nullptr;
+ I--;
+ while (I != First && !AndExecMI) {
+ if (I->getOpcode() == AndOpc && I->getOperand(0).getReg() == DstReg &&
+ I->getOperand(1).getReg() == Register(ExecReg))
+ AndExecMI = &*I;
+ I--;
+ }
+ if (!AndExecMI)
+ return false;
+
+ // Check for exec modifying instructions.
+ // Note: exec defs do not create live ranges beyond the
+ // instruction so isDefBetween cannot be used.
+ // Instead just check that the def segments are adjacent.
+ SlotIndex StartIdx = LIS->getInstructionIndex(SaveExecMI);
+ SlotIndex EndIdx = LIS->getInstructionIndex(*AndExecMI);
+ for (MCRegUnitIterator UI(ExecReg, TRI); UI.isValid(); ++UI) {
+ LiveRange &RegUnit = LIS->getRegUnit(*UI);
+ if (RegUnit.find(StartIdx) != std::prev(RegUnit.find(EndIdx)))
+ return false;
+ }
+
+ // Remove unnecessary S_AND
+ LIS->removeInterval(SavedExecReg);
+ LIS->removeInterval(DstReg);
+
+ SaveExecMI.getOperand(0).setReg(DstReg);
+
+ LIS->RemoveMachineInstrFromMaps(*AndExecMI);
+ AndExecMI->eraseFromParent();
+
+ LIS->createAndComputeVirtRegInterval(DstReg);
+
+ return true;
+}
+
bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -211,16 +309,28 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
TRI = ST.getRegisterInfo();
TII = ST.getInstrInfo();
MRI = &MF.getRegInfo();
-
- MachineRegisterInfo &MRI = MF.getRegInfo();
- LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
- DenseSet<unsigned> RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI});
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ LIS = &getAnalysis<LiveIntervals>();
+
+ const bool Wave32 = ST.isWave32();
+ AndOpc = Wave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+ Andn2Opc = Wave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
+ OrSaveExecOpc =
+ Wave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
+ XorTermrOpc = Wave32 ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
+ CondReg = MCRegister::from(Wave32 ? AMDGPU::VCC_LO : AMDGPU::VCC);
+ ExecReg = MCRegister::from(Wave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
+
+ DenseSet<Register> RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI});
bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
- if (unsigned Reg = optimizeVcndVcmpPair(MBB, ST, MRI, LIS)) {
+ if (optimizeElseBranch(MBB)) {
+ RecalcRegs.insert(AMDGPU::SCC);
+ Changed = true;
+ }
+
+ if (Register Reg = optimizeVcndVcmpPair(MBB)) {
RecalcRegs.insert(Reg);
RecalcRegs.insert(AMDGPU::VCC_LO);
RecalcRegs.insert(AMDGPU::VCC_HI);
@@ -301,16 +411,18 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
unsigned ScanThreshold = 10;
for (auto I = MBB.rbegin(), E = MBB.rend(); I != E
&& ScanThreshold--; ++I) {
- if (!isFullExecCopy(*I, ST))
+ // Continue scanning if this is not a full exec copy
+ if (!(I->isFullCopy() && I->getOperand(1).getReg() == Register(ExecReg)))
continue;
Register SavedExec = I->getOperand(0).getReg();
- if (SavedExec.isVirtual() && MRI.hasOneNonDBGUse(SavedExec) &&
- MRI.use_instr_nodbg_begin(SavedExec)->getParent() == I->getParent()) {
+ if (SavedExec.isVirtual() && MRI->hasOneNonDBGUse(SavedExec) &&
+ MRI->use_instr_nodbg_begin(SavedExec)->getParent() ==
+ I->getParent()) {
LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *I << '\n');
LIS->RemoveMachineInstrFromMaps(*I);
I->eraseFromParent();
- MRI.replaceRegWith(SavedExec, Exec);
+ MRI->replaceRegWith(SavedExec, ExecReg);
LIS->removeInterval(SavedExec);
Changed = true;
}
@@ -320,9 +432,9 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
if (Changed) {
for (auto Reg : RecalcRegs) {
- if (Register::isVirtualRegister(Reg)) {
+ if (Reg.isVirtual()) {
LIS->removeInterval(Reg);
- if (!MRI.reg_empty(Reg))
+ if (!MRI->reg_empty(Reg))
LIS->createAndComputeVirtRegInterval(Reg);
} else {
LIS->removeAllRegUnitsForPhysReg(Reg);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 9a1855c3458b..7d7a753bb333 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -10,47 +10,21 @@
///
/// E.g. original:
/// V_LSHRREV_B32_e32 %0, 16, %1
-/// V_ADD_I32_e32 %2, %0, %3
+/// V_ADD_CO_U32_e32 %2, %0, %3
/// V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replace:
-/// V_ADD_I32_sdwa %4, %1, %3
+/// V_ADD_CO_U32_sdwa %4, %1, %3
/// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIDefines.h"
-#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/Config/llvm-config.h"
-#include "llvm/MC/LaneBitmask.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <memory>
-#include <unordered_map>
using namespace llvm;
@@ -570,8 +544,7 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
- if (Register::isPhysicalRegister(Src1->getReg()) ||
- Register::isPhysicalRegister(Dst->getReg()))
+ if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical())
break;
if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
@@ -609,8 +582,7 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
- if (Register::isPhysicalRegister(Src1->getReg()) ||
- Register::isPhysicalRegister(Dst->getReg()))
+ if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical())
break;
if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
@@ -625,8 +597,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
break;
}
- case AMDGPU::V_BFE_I32:
- case AMDGPU::V_BFE_U32: {
+ case AMDGPU::V_BFE_I32_e64:
+ case AMDGPU::V_BFE_U32_e64: {
// e.g.:
// from: v_bfe_u32 v1, v0, 8, 8
// to SDWA src:v0 src_sel:BYTE_1
@@ -673,12 +645,11 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
- if (Register::isPhysicalRegister(Src0->getReg()) ||
- Register::isPhysicalRegister(Dst->getReg()))
+ if (Src0->getReg().isPhysical() || Dst->getReg().isPhysical())
break;
return std::make_unique<SDWASrcOperand>(
- Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32);
+ Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
}
case AMDGPU::V_AND_B32_e32:
@@ -702,8 +673,7 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
- if (Register::isPhysicalRegister(ValSrc->getReg()) ||
- Register::isPhysicalRegister(Dst->getReg()))
+ if (ValSrc->getReg().isPhysical() || Dst->getReg().isPhysical())
break;
return std::make_unique<SDWASrcOperand>(
@@ -863,19 +833,19 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
}
// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and
-// V_ADD_I32_e64 into V_ADD_I32_e32. This allows isConvertibleToSDWA
-// to perform its transformation on V_ADD_I32_e32 into V_ADD_I32_sdwa.
+// V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows isConvertibleToSDWA
+// to perform its transformation on V_ADD_CO_U32_e32 into V_ADD_CO_U32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
// %19:vgpr_32 = V_AND_B32_e32 255,
// killed %16:vgpr_32, implicit $exec
-// %47:vgpr_32, %49:sreg_64_xexec = V_ADD_I32_e64
+// %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
// %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
// %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
// %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//
// becomes
-// %47:vgpr_32 = V_ADD_I32_sdwa
+// %47:vgpr_32 = V_ADD_CO_U32_sdwa
// 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
// implicit-def $vcc, implicit $exec
// %48:vgpr_32 = V_ADDC_U32_e32
@@ -883,8 +853,8 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
const GCNSubtarget &ST) const {
int Opc = MI.getOpcode();
- assert((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64) &&
- "Currently only handles V_ADD_I32_e64 or V_SUB_I32_e64");
+ assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
+ "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");
// Can the candidate MI be shrunk?
if (!TII->canShrink(MI, *MRI))
@@ -992,6 +962,16 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
if (Opc == AMDGPU::V_CNDMASK_B32_e32)
return false;
+ if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
+ if (!Src0->isReg() && !Src0->isImm())
+ return false;
+ }
+
+ if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
+ if (!Src1->isReg() && !Src1->isImm())
+ return false;
+ }
+
return true;
}
@@ -1235,8 +1215,8 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
const auto &Operand = OperandPair.second;
MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
if (PotentialMI &&
- (PotentialMI->getOpcode() == AMDGPU::V_ADD_I32_e64 ||
- PotentialMI->getOpcode() == AMDGPU::V_SUB_I32_e64))
+ (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
+ PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
pseudoOpConvertToVOP2(*PotentialMI, ST);
}
SDWAOperands.clear();
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index 4c72fa235975..ab05081e55d5 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -13,13 +13,9 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIDefines.h"
-#include "SIInstrInfo.h"
+#include "GCNSubtarget.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBundle.h"
-#include "llvm/InitializePasses.h"
using namespace llvm;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index 09dfe8753792..dc08d9dcb9bb 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -12,19 +12,13 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
-#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/RegisterClassInfo.h"
-#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"
using namespace llvm;
@@ -92,11 +86,10 @@ bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) {
return false;
Register Reg = MO.getReg();
-
- if (!TRI->isVGPR(*MRI, Reg))
+ if (Reg.isPhysical())
return false;
- if (Register::isPhysicalRegister(Reg))
+ if (!TRI->isVGPR(*MRI, Reg))
return false;
if (VRM->hasPhys(Reg))
@@ -104,7 +97,7 @@ bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) {
LiveInterval &LI = LIS->getInterval(Reg);
- for (unsigned PhysReg : RegClassInfo.getOrder(MRI->getRegClass(Reg))) {
+ for (MCRegister PhysReg : RegClassInfo.getOrder(MRI->getRegClass(Reg))) {
if (!MRI->isPhysRegUsed(PhysReg) &&
Matrix->checkInterference(LI, PhysReg) == LiveRegMatrix::IK_Free) {
Matrix->assign(LI, PhysReg);
@@ -126,7 +119,7 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
continue;
const Register VirtReg = MO.getReg();
- if (Register::isPhysicalRegister(VirtReg))
+ if (VirtReg.isPhysical())
continue;
if (!VRM->hasPhys(VirtReg))
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 442be886a8ac..9ca43512cd91 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -12,12 +12,10 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/Support/CommandLine.h"
using namespace llvm;
@@ -70,6 +68,7 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
+ const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
E = MBB.rend();
@@ -136,9 +135,20 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
if (A->getOpcode() == AndN2)
MaskValue = ~MaskValue;
- if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
- MI.killsRegister(CondReg, TRI))
+ if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC)) {
+ if (!MI.killsRegister(CondReg, TRI)) {
+ // Replace AND with MOV
+ if (MaskValue == 0) {
+ BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
+ .addImm(0);
+ } else {
+ BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
+ .addReg(ExecReg);
+ }
+ }
+ // Remove AND instruction
A->eraseFromParent();
+ }
bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
if (SReg == ExecReg) {
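A condensed model of the new rewrite decision in optimizeVccBranch may be useful here. It is a sketch only, not LLVM code, and it assumes the surrounding checks have already established that SCC is dead, the condition is not read in between, and the mask is a known constant (zero or the full exec mask):

#include <cstdint>
#include <cstdio>

enum class Rewrite {
  EraseOnly,        // branch kills the condition: just drop the AND (old path)
  MovZeroThenErase, // s_mov_b32/b64 cond, 0    before erasing the AND
  MovExecThenErase  // s_mov_b32/b64 cond, exec before erasing the AND
};

static Rewrite pickRewrite(bool BranchKillsCond, uint64_t MaskValue) {
  if (BranchKillsCond)
    return Rewrite::EraseOnly;
  // New: the condition register stays live, so materialize its value first.
  return MaskValue == 0 ? Rewrite::MovZeroThenErase : Rewrite::MovExecThenErase;
}

int main() {
  std::printf("%d %d %d\n", (int)pickRewrite(true, 0),
              (int)pickRewrite(false, 0), (int)pickRewrite(false, ~0ull));
  return 0;
}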
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
new file mode 100644
index 000000000000..877c8b81b2c0
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -0,0 +1,56 @@
+//===-- SIProgramInfo.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// The SIProgramInfo tracks resource usage and hardware flags for kernels and
+/// entry functions.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "SIProgramInfo.h"
+#include "SIDefines.h"
+#include "Utils/AMDGPUBaseInfo.h"
+
+using namespace llvm;
+
+uint64_t SIProgramInfo::getComputePGMRSrc1() const {
+ return S_00B848_VGPRS(VGPRBlocks) | S_00B848_SGPRS(SGPRBlocks) |
+ S_00B848_PRIORITY(Priority) | S_00B848_FLOAT_MODE(FloatMode) |
+ S_00B848_PRIV(Priv) | S_00B848_DX10_CLAMP(DX10Clamp) |
+ S_00B848_DEBUG_MODE(DebugMode) | S_00B848_IEEE_MODE(IEEEMode) |
+ S_00B848_WGP_MODE(WgpMode) | S_00B848_MEM_ORDERED(MemOrdered);
+}
+
+uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC) const {
+ if (AMDGPU::isCompute(CC)) {
+ return getComputePGMRSrc1();
+ }
+ uint64_t Reg = S_00B848_VGPRS(VGPRBlocks) | S_00B848_SGPRS(SGPRBlocks) |
+ S_00B848_PRIORITY(Priority) | S_00B848_FLOAT_MODE(FloatMode) |
+ S_00B848_PRIV(Priv) | S_00B848_DX10_CLAMP(DX10Clamp) |
+ S_00B848_DEBUG_MODE(DebugMode) | S_00B848_IEEE_MODE(IEEEMode);
+ switch (CC) {
+ case CallingConv::AMDGPU_PS:
+ Reg |= S_00B028_MEM_ORDERED(MemOrdered);
+ break;
+ case CallingConv::AMDGPU_VS:
+ Reg |= S_00B128_MEM_ORDERED(MemOrdered);
+ break;
+ case CallingConv::AMDGPU_GS:
+ Reg |= S_00B228_WGP_MODE(WgpMode) | S_00B228_MEM_ORDERED(MemOrdered);
+ break;
+ case CallingConv::AMDGPU_HS:
+ Reg |= S_00B428_WGP_MODE(WgpMode) | S_00B428_MEM_ORDERED(MemOrdered);
+ break;
+ default:
+ break;
+ }
+ return Reg;
+}
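To illustrate what the S_00B848_* macros used above do, here is a standalone sketch of the same kind of field packing. The bit offsets and widths below are placeholders chosen for the example, not the real COMPUTE_PGM_RSRC1 layout, and only a few fields are modeled; getPGMRSrc1 additionally ORs in WGP_MODE/MEM_ORDERED through the calling-convention-specific S_00B028_/S_00B128_/S_00B228_/S_00B428_ variants shown in the switch:

#include <cstdint>
#include <cstdio>

struct Fields { uint32_t VGPRBlocks, SGPRBlocks, Priority, FloatMode; };

// Mirrors the shape of an S_00B848_*(x) macro: mask a field to its width
// and place it at a fixed bit offset inside the packed register value.
static uint64_t field(uint64_t v, unsigned shift, unsigned width) {
  return (v & ((1ull << width) - 1)) << shift;
}

static uint64_t packRsrc1(const Fields &F) {
  // Placeholder offsets/widths for the sketch only.
  return field(F.VGPRBlocks, 0, 6) | field(F.SGPRBlocks, 6, 4) |
         field(F.Priority, 10, 2) | field(F.FloatMode, 12, 8);
}

int main() {
  std::printf("0x%llx\n", (unsigned long long)packRsrc1({1, 2, 0, 0xC0}));
  return 0;
}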
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index 7c039a54b57f..9b72d0829d80 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -7,7 +7,8 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// Defines struct to track resource usage for kernels and entry functions.
+/// Defines struct to track resource usage and hardware flags for kernels and
+/// entry functions.
///
//
//===----------------------------------------------------------------------===//
@@ -15,6 +16,9 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_SIPROGRAMINFO_H
#define LLVM_LIB_TARGET_AMDGPU_SIPROGRAMINFO_H
+#include "llvm/IR/CallingConv.h"
+#include <cstdint>
+
namespace llvm {
/// Track resource usage for kernels / entry functions.
@@ -32,8 +36,6 @@ struct SIProgramInfo {
uint32_t MemOrdered = 0; // GFX10+
uint64_t ScratchSize = 0;
- uint64_t ComputePGMRSrc1 = 0;
-
// Fields set in PGM_RSRC2 pm4 packet.
uint32_t LDSBlocks = 0;
uint32_t ScratchBlocks = 0;
@@ -64,6 +66,10 @@ struct SIProgramInfo {
bool VCCUsed = false;
SIProgramInfo() = default;
+
+ /// Compute the value of the ComputePGMRsrc1 register.
+ uint64_t getComputePGMRSrc1() const;
+ uint64_t getPGMRSrc1(CallingConv::ID CC) const;
};
} // namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 5d6009ebf384..7a45d8c54f9a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -12,21 +12,15 @@
//===----------------------------------------------------------------------===//
#include "SIRegisterInfo.h"
+#include "AMDGPU.h"
#include "AMDGPURegisterBankInfo.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/CodeGen/SlotIndexes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
-#include <vector>
using namespace llvm;
@@ -40,6 +34,14 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(
cl::init(true));
std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
+std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
+
+// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
+// Valid indexes are shifted 1, such that a 0 mapping means unsupported.
+// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
+// meaning index 7 in SubRegFromChannelTable.
+static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
: AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
@@ -53,7 +55,8 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
"getNumCoveredRegs() will not work with generated subreg masks!");
RegPressureIgnoredUnits.resize(getNumRegUnits());
- RegPressureIgnoredUnits.set(*MCRegUnitIterator(AMDGPU::M0, this));
+ RegPressureIgnoredUnits.set(
+ *MCRegUnitIterator(MCRegister::from(AMDGPU::M0), this));
for (auto Reg : AMDGPU::VGPR_HI16RegClass)
RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this));
@@ -78,8 +81,28 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
}
};
+ static llvm::once_flag InitializeSubRegFromChannelTableFlag;
+
+ static auto InitializeSubRegFromChannelTableOnce = [this]() {
+ for (auto &Row : SubRegFromChannelTable)
+ Row.fill(AMDGPU::NoSubRegister);
+ for (uint16_t Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
+ unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32;
+ unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32;
+ assert(Width < SubRegFromChannelTableWidthMap.size());
+ Width = SubRegFromChannelTableWidthMap[Width];
+ if (Width == 0)
+ continue;
+ unsigned TableIdx = Width - 1;
+ assert(TableIdx < SubRegFromChannelTable.size());
+ assert(Offset < SubRegFromChannelTable[TableIdx].size());
+ SubRegFromChannelTable[TableIdx][Offset] = Idx;
+ }
+ };
llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
+ llvm::call_once(InitializeSubRegFromChannelTableFlag,
+ InitializeSubRegFromChannelTableOnce);
}
void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
@@ -98,6 +121,7 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::Cold:
+ case CallingConv::AMDGPU_Gfx:
return CSR_AMDGPU_HighRegs_SaveList;
default: {
// Dummy to not crash RegisterClassInfo.
@@ -118,12 +142,17 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::Cold:
+ case CallingConv::AMDGPU_Gfx:
return CSR_AMDGPU_HighRegs_RegMask;
default:
return nullptr;
}
}
+const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
+ return CSR_AMDGPU_NoRegs_RegMask;
+}
+
Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const SIFrameLowering *TFI =
MF.getSubtarget<GCNSubtarget>().getFrameLowering();
@@ -156,71 +185,13 @@ const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
return CSR_AMDGPU_AllAllocatableSRegs_RegMask;
}
-// FIXME: TableGen should generate something to make this manageable for all
-// register classes. At a minimum we could use the opposite of
-// composeSubRegIndices and go up from the base 32-bit subreg.
unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
unsigned NumRegs) {
- // Table of NumRegs sized pieces at every 32-bit offset.
- static const uint16_t SubRegFromChannelTable[][32] = {
- {AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
- AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
- AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
- AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
- AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
- AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
- AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
- AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31},
- {AMDGPU::sub0_sub1, AMDGPU::sub1_sub2, AMDGPU::sub2_sub3,
- AMDGPU::sub3_sub4, AMDGPU::sub4_sub5, AMDGPU::sub5_sub6,
- AMDGPU::sub6_sub7, AMDGPU::sub7_sub8, AMDGPU::sub8_sub9,
- AMDGPU::sub9_sub10, AMDGPU::sub10_sub11, AMDGPU::sub11_sub12,
- AMDGPU::sub12_sub13, AMDGPU::sub13_sub14, AMDGPU::sub14_sub15,
- AMDGPU::sub15_sub16, AMDGPU::sub16_sub17, AMDGPU::sub17_sub18,
- AMDGPU::sub18_sub19, AMDGPU::sub19_sub20, AMDGPU::sub20_sub21,
- AMDGPU::sub21_sub22, AMDGPU::sub22_sub23, AMDGPU::sub23_sub24,
- AMDGPU::sub24_sub25, AMDGPU::sub25_sub26, AMDGPU::sub26_sub27,
- AMDGPU::sub27_sub28, AMDGPU::sub28_sub29, AMDGPU::sub29_sub30,
- AMDGPU::sub30_sub31, AMDGPU::NoSubRegister},
- {AMDGPU::sub0_sub1_sub2, AMDGPU::sub1_sub2_sub3,
- AMDGPU::sub2_sub3_sub4, AMDGPU::sub3_sub4_sub5,
- AMDGPU::sub4_sub5_sub6, AMDGPU::sub5_sub6_sub7,
- AMDGPU::sub6_sub7_sub8, AMDGPU::sub7_sub8_sub9,
- AMDGPU::sub8_sub9_sub10, AMDGPU::sub9_sub10_sub11,
- AMDGPU::sub10_sub11_sub12, AMDGPU::sub11_sub12_sub13,
- AMDGPU::sub12_sub13_sub14, AMDGPU::sub13_sub14_sub15,
- AMDGPU::sub14_sub15_sub16, AMDGPU::sub15_sub16_sub17,
- AMDGPU::sub16_sub17_sub18, AMDGPU::sub17_sub18_sub19,
- AMDGPU::sub18_sub19_sub20, AMDGPU::sub19_sub20_sub21,
- AMDGPU::sub20_sub21_sub22, AMDGPU::sub21_sub22_sub23,
- AMDGPU::sub22_sub23_sub24, AMDGPU::sub23_sub24_sub25,
- AMDGPU::sub24_sub25_sub26, AMDGPU::sub25_sub26_sub27,
- AMDGPU::sub26_sub27_sub28, AMDGPU::sub27_sub28_sub29,
- AMDGPU::sub28_sub29_sub30, AMDGPU::sub29_sub30_sub31,
- AMDGPU::NoSubRegister, AMDGPU::NoSubRegister},
- {AMDGPU::sub0_sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4,
- AMDGPU::sub2_sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6,
- AMDGPU::sub4_sub5_sub6_sub7, AMDGPU::sub5_sub6_sub7_sub8,
- AMDGPU::sub6_sub7_sub8_sub9, AMDGPU::sub7_sub8_sub9_sub10,
- AMDGPU::sub8_sub9_sub10_sub11, AMDGPU::sub9_sub10_sub11_sub12,
- AMDGPU::sub10_sub11_sub12_sub13, AMDGPU::sub11_sub12_sub13_sub14,
- AMDGPU::sub12_sub13_sub14_sub15, AMDGPU::sub13_sub14_sub15_sub16,
- AMDGPU::sub14_sub15_sub16_sub17, AMDGPU::sub15_sub16_sub17_sub18,
- AMDGPU::sub16_sub17_sub18_sub19, AMDGPU::sub17_sub18_sub19_sub20,
- AMDGPU::sub18_sub19_sub20_sub21, AMDGPU::sub19_sub20_sub21_sub22,
- AMDGPU::sub20_sub21_sub22_sub23, AMDGPU::sub21_sub22_sub23_sub24,
- AMDGPU::sub22_sub23_sub24_sub25, AMDGPU::sub23_sub24_sub25_sub26,
- AMDGPU::sub24_sub25_sub26_sub27, AMDGPU::sub25_sub26_sub27_sub28,
- AMDGPU::sub26_sub27_sub28_sub29, AMDGPU::sub27_sub28_sub29_sub30,
- AMDGPU::sub28_sub29_sub30_sub31, AMDGPU::NoSubRegister,
- AMDGPU::NoSubRegister, AMDGPU::NoSubRegister}};
-
- const unsigned NumRegIndex = NumRegs - 1;
-
- assert(NumRegIndex < array_lengthof(SubRegFromChannelTable) &&
- "Not implemented");
- assert(Channel < array_lengthof(SubRegFromChannelTable[0]));
- return SubRegFromChannelTable[NumRegIndex][Channel];
+ assert(NumRegs < SubRegFromChannelTableWidthMap.size());
+ unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
+ assert(NumRegIndex && "Not implemented");
+ assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
+ return SubRegFromChannelTable[NumRegIndex - 1][Channel];
}
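The replacement of the hand-written tables with SubRegFromChannelTable plus SubRegFromChannelTableWidthMap can be summarized with a small standalone model. It is not LLVM code; the table contents below are fake, whereas LLVM fills the real table from the tablegen-provided AMDGPUSubRegIdxRanges in the constructor above:

#include <array>
#include <cassert>
#include <cstdint>

// Width in dwords -> row index, shifted by one so 0 can mean "unsupported".
static const std::array<unsigned, 17> WidthMap = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
static std::array<std::array<uint16_t, 32>, 9> Table; // zero == NoSubRegister

static uint16_t subRegFromChannel(unsigned Channel, unsigned NumRegs) {
  assert(NumRegs < WidthMap.size());
  unsigned Row = WidthMap[NumRegs];
  assert(Row && "width not supported");
  assert(Channel < Table[Row - 1].size());
  return Table[Row - 1][Channel];
}

int main() {
  Table[0][3] = 42;                     // pretend "sub3" has encoding 42
  assert(subRegFromChannel(3, 1) == 42);
  assert(subRegFromChannel(0, 2) == 0); // unfilled slot == NoSubRegister
  return 0;
}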
MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
@@ -322,7 +293,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
+ Register ScratchRSrcReg = MFI->getScratchRSrcReg();
if (ScratchRSrcReg != AMDGPU::NoRegister) {
// Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
// to spill.
@@ -363,9 +334,8 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
reserveRegisterTuples(Reserved, Reg);
- if (MFI->VGPRReservedForSGPRSpill)
- for (auto SSpill : MFI->getSGPRSpillVGPRs())
- reserveRegisterTuples(Reserved, SSpill.VGPR);
+ for (auto SSpill : MFI->getSGPRSpillVGPRs())
+ reserveRegisterTuples(Reserved, SSpill.VGPR);
return Reserved;
}
@@ -415,8 +385,8 @@ bool SIRegisterInfo::requiresVirtualBaseRegisters(
return true;
}
-int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const {
- assert(SIInstrInfo::isMUBUF(*MI));
+int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
+ assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));
int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
AMDGPU::OpName::offset);
@@ -425,29 +395,34 @@ int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const {
int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
int Idx) const {
- if (!SIInstrInfo::isMUBUF(*MI))
+ if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
return 0;
- assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::vaddr) &&
+ assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::vaddr) ||
+ (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::saddr))) &&
"Should never see frame index on non-address operand");
- return getMUBUFInstrOffset(MI);
+ return getScratchInstrOffset(MI);
}
bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
if (!MI->mayLoadOrStore())
return false;
- int64_t FullOffset = Offset + getMUBUFInstrOffset(MI);
+ int64_t FullOffset = Offset + getScratchInstrOffset(MI);
- return !isUInt<12>(FullOffset);
+ if (SIInstrInfo::isMUBUF(*MI))
+ return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset);
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS, true);
}
-void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
- Register BaseReg,
- int FrameIdx,
- int64_t Offset) const {
+Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
+ int FrameIdx,
+ int64_t Offset) const {
MachineBasicBlock::iterator Ins = MBB->begin();
DebugLoc DL; // Defaults to "unknown"
@@ -456,32 +431,50 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
MachineFunction *MF = MBB->getParent();
const SIInstrInfo *TII = ST.getInstrInfo();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
+ : AMDGPU::V_MOV_B32_e32;
+
+ Register BaseReg = MRI.createVirtualRegister(
+ ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
+ : &AMDGPU::VGPR_32RegClass);
if (Offset == 0) {
- BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
+ BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
.addFrameIndex(FrameIdx);
- return;
+ return BaseReg;
}
- MachineRegisterInfo &MRI = MF->getRegInfo();
Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- Register FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register FIReg = MRI.createVirtualRegister(
+ ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
+ : &AMDGPU::VGPR_32RegClass);
BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
.addImm(Offset);
- BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg)
+ BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
.addFrameIndex(FrameIdx);
+ if (ST.enableFlatScratch()) {
+ BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_U32), BaseReg)
+ .addReg(OffsetReg, RegState::Kill)
+ .addReg(FIReg);
+ return BaseReg;
+ }
+
TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
.addReg(OffsetReg, RegState::Kill)
.addReg(FIReg)
.addImm(0); // clamp bit
+
+ return BaseReg;
}
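In short, the rewritten materializeFrameBaseRegister now creates the base register itself and picks a scalar or vector flavour depending on whether flat scratch is enabled. A hedged sketch of that choice follows, with strings standing in for register classes and opcodes (the non-flat add is whatever getAddNoCarry selects for the subtarget):

#include <cstdio>

struct BasePlan {
  const char *RegClass;  // class of the newly created base register
  const char *AddOpcode; // how offset and frame index are combined (offset != 0)
};

static BasePlan pickPlan(bool EnableFlatScratch) {
  if (EnableFlatScratch)
    return {"SReg_32_XEXEC_HI", "s_add_u32"};  // scalar base, scalar add
  return {"VGPR_32", "v_add_{co,nc}_u32"};     // vector base, add-no-carry
}

int main() {
  BasePlan P = pickPlan(true);
  std::printf("%s / %s\n", P.RegClass, P.AddOpcode);
  return 0;
}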
void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const {
const SIInstrInfo *TII = ST.getInstrInfo();
+ bool IsFlat = TII->isFLATScratch(MI);
#ifndef NDEBUG
// FIXME: Is it possible to be storing a frame index to itself?
@@ -496,20 +489,31 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
}
#endif
- MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
-#ifndef NDEBUG
- MachineBasicBlock *MBB = MI.getParent();
- MachineFunction *MF = MBB->getParent();
-#endif
- assert(FIOp && FIOp->isFI() && "frame index must be address operand");
- assert(TII->isMUBUF(MI));
- assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() ==
- MF->getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg() &&
- "should only be seeing stack pointer offset relative FrameIndex");
+ MachineOperand *FIOp =
+ TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
+ : AMDGPU::OpName::vaddr);
MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
int64_t NewOffset = OffsetOp->getImm() + Offset;
- assert(isUInt<12>(NewOffset) && "offset should be legal");
+
+ assert(FIOp && FIOp->isFI() && "frame index must be address operand");
+ assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
+
+ if (IsFlat) {
+ assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true) &&
+ "offset should be legal");
+ FIOp->ChangeToRegister(BaseReg, false);
+ OffsetOp->setImm(NewOffset);
+ return;
+ }
+
+#ifndef NDEBUG
+ MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
+ assert(SOffset->isImm() && SOffset->getImm() == 0);
+#endif
+
+ assert(SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
+ "offset should be legal");
FIOp->ChangeToRegister(BaseReg, false);
OffsetOp->setImm(NewOffset);
@@ -518,12 +522,16 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
Register BaseReg,
int64_t Offset) const {
- if (!SIInstrInfo::isMUBUF(*MI))
+ if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
return false;
- int64_t NewOffset = Offset + getMUBUFInstrOffset(MI);
+ int64_t NewOffset = Offset + getScratchInstrOffset(MI);
+
+ if (SIInstrInfo::isMUBUF(*MI))
+ return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset);
- return isUInt<12>(NewOffset);
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true);
}
const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
@@ -555,16 +563,22 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_S256_RESTORE:
case AMDGPU::SI_SPILL_V256_SAVE:
case AMDGPU::SI_SPILL_V256_RESTORE:
+ case AMDGPU::SI_SPILL_A256_SAVE:
+ case AMDGPU::SI_SPILL_A256_RESTORE:
return 8;
case AMDGPU::SI_SPILL_S192_SAVE:
case AMDGPU::SI_SPILL_S192_RESTORE:
case AMDGPU::SI_SPILL_V192_SAVE:
case AMDGPU::SI_SPILL_V192_RESTORE:
+ case AMDGPU::SI_SPILL_A192_SAVE:
+ case AMDGPU::SI_SPILL_A192_RESTORE:
return 6;
case AMDGPU::SI_SPILL_S160_SAVE:
case AMDGPU::SI_SPILL_S160_RESTORE:
case AMDGPU::SI_SPILL_V160_SAVE:
case AMDGPU::SI_SPILL_V160_RESTORE:
+ case AMDGPU::SI_SPILL_A160_SAVE:
+ case AMDGPU::SI_SPILL_A160_RESTORE:
return 5;
case AMDGPU::SI_SPILL_S128_SAVE:
case AMDGPU::SI_SPILL_S128_RESTORE:
@@ -577,6 +591,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_S96_RESTORE:
case AMDGPU::SI_SPILL_V96_SAVE:
case AMDGPU::SI_SPILL_V96_RESTORE:
+ case AMDGPU::SI_SPILL_A96_SAVE:
+ case AMDGPU::SI_SPILL_A96_RESTORE:
return 3;
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S64_RESTORE:
@@ -672,11 +688,13 @@ static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
unsigned Dst = IsStore ? Reg : ValueReg;
unsigned Src = IsStore ? ValueReg : Reg;
- unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32
- : AMDGPU::V_ACCVGPR_READ_B32;
+ unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
+ : AMDGPU::V_ACCVGPR_READ_B32_e64;
- return BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
- .addReg(Src, getKillRegState(IsKill));
+ auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
+ .addReg(Src, getKillRegState(IsKill));
+ MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
+ return MIB;
}
// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
@@ -721,12 +739,46 @@ static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
return true;
}
+static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
+ unsigned LoadStoreOp,
+ unsigned EltSize) {
+ bool IsStore = TII->get(LoadStoreOp).mayStore();
+ bool UseST =
+ AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 &&
+ AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::saddr) < 0;
+
+ switch (EltSize) {
+ case 4:
+ LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
+ : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
+ break;
+ case 8:
+ LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
+ : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
+ break;
+ case 12:
+ LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
+ : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
+ break;
+ case 16:
+ LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
+ : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
+ break;
+ default:
+ llvm_unreachable("Unexpected spill load/store size!");
+ }
+
+ if (UseST)
+ LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
+
+ return LoadStoreOp;
+}
+
void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
unsigned LoadStoreOp,
int Index,
Register ValueReg,
bool IsKill,
- MCRegister ScratchRsrcReg,
MCRegister ScratchOffsetReg,
int64_t InstOffset,
MachineMemOperand *MMO,
@@ -737,36 +789,51 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
const MachineFrameInfo &MFI = MF->getFrameInfo();
const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
- const MCInstrDesc &Desc = TII->get(LoadStoreOp);
+ const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
const DebugLoc &DL = MI->getDebugLoc();
- bool IsStore = Desc.mayStore();
+ bool IsStore = Desc->mayStore();
+ bool IsFlat = TII->isFLATScratch(LoadStoreOp);
bool Scavenged = false;
MCRegister SOffset = ScratchOffsetReg;
- const unsigned EltSize = 4;
const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
- unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT);
+ const bool IsAGPR = hasAGPRs(RC);
+ const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8;
+
+ // Always use 4 byte operations for AGPRs because we need to scavenge
+ // a temporary VGPR.
+ unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
+ unsigned NumSubRegs = RegWidth / EltSize;
unsigned Size = NumSubRegs * EltSize;
+ unsigned RemSize = RegWidth - Size;
+ unsigned NumRemSubRegs = RemSize ? 1 : 0;
int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
+ int64_t MaxOffset = Offset + Size + RemSize - EltSize;
int64_t ScratchOffsetRegDelta = 0;
+ if (IsFlat && EltSize > 4) {
+ LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
+ Desc = &TII->get(LoadStoreOp);
+ }
+
Align Alignment = MFI.getObjectAlign(Index);
const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
- Register TmpReg =
- hasAGPRs(RC) ? TII->getNamedOperand(*MI, AMDGPU::OpName::tmp)->getReg()
- : Register();
+ assert((IsFlat || ((Offset % EltSize) == 0)) &&
+ "unexpected VGPR spill offset");
- assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset");
-
- if (!isUInt<12>(Offset + Size - EltSize)) {
+ bool IsOffsetLegal = IsFlat
+ ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, true)
+ : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset);
+ if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
SOffset = MCRegister();
// We currently only support spilling VGPRs to EltSize boundaries, meaning
// we can simplify the adjustment of Offset here to just scale with
// WavefrontSize.
- Offset *= ST.getWavefrontSize();
+ if (!IsFlat)
+ Offset *= ST.getWavefrontSize();
// We don't have access to the register scavenger if this function is called
// during PEI::scavengeFrameVirtualRegs().
@@ -804,10 +871,30 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
Offset = 0;
}
- for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
- Register SubReg = NumSubRegs == 1
- ? Register(ValueReg)
- : getSubReg(ValueReg, getSubRegFromChannel(i));
+ if (IsFlat && SOffset == AMDGPU::NoRegister) {
+ assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
+ && "Unexpected vaddr for flat scratch with a FI operand");
+
+ assert(ST.hasFlatScratchSTMode());
+ LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
+ Desc = &TII->get(LoadStoreOp);
+ }
+
+ Register TmpReg;
+
+ for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
+ ++i, RegOffset += EltSize) {
+ if (i == NumSubRegs) {
+ EltSize = RemSize;
+ LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
+ }
+ Desc = &TII->get(LoadStoreOp);
+
+ unsigned NumRegs = EltSize / 4;
+ Register SubReg = e == 1
+ ? ValueReg
+ : Register(getSubReg(ValueReg,
+ getSubRegFromChannel(RegOffset / 4, NumRegs)));
unsigned SOffsetRegState = 0;
unsigned SrcDstRegState = getDefRegState(!IsStore);
@@ -817,46 +904,111 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
SrcDstRegState |= getKillRegState(IsKill);
}
- auto MIB = spillVGPRtoAGPR(ST, MI, Index, i, SubReg, IsKill);
-
- if (!MIB.getInstr()) {
- unsigned FinalReg = SubReg;
- if (TmpReg != AMDGPU::NoRegister) {
- if (IsStore)
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg)
- .addReg(SubReg, getKillRegState(IsKill));
- SubReg = TmpReg;
+ // Make sure the whole register is defined if there are undef components by
+ // adding an implicit def of the super-reg on the first instruction.
+ bool NeedSuperRegDef = e > 1 && IsStore && i == 0;
+ bool NeedSuperRegImpOperand = e > 1;
+
+ unsigned Lane = RegOffset / 4;
+ unsigned LaneE = (RegOffset + EltSize) / 4;
+ for ( ; Lane != LaneE; ++Lane) {
+ bool IsSubReg = e > 1 || EltSize > 4;
+ Register Sub = IsSubReg
+ ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
+ : ValueReg;
+ auto MIB = spillVGPRtoAGPR(ST, MI, Index, Lane, Sub, IsKill);
+ if (!MIB.getInstr())
+ break;
+ if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == 0)) {
+ MIB.addReg(ValueReg, RegState::ImplicitDefine);
+ NeedSuperRegDef = false;
+ }
+ if (IsSubReg || NeedSuperRegImpOperand) {
+ NeedSuperRegImpOperand = true;
+ unsigned State = SrcDstRegState;
+ if (Lane + 1 != LaneE)
+ State &= ~RegState::Kill;
+ MIB.addReg(ValueReg, RegState::Implicit | State);
}
+ }
- MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
- MachineMemOperand *NewMMO =
- MF->getMachineMemOperand(PInfo, MMO->getFlags(), EltSize,
- commonAlignment(Alignment, EltSize * i));
+ if (Lane == LaneE) // Fully spilled into AGPRs.
+ continue;
+
+ // Offset in bytes from the beginning of the ValueReg to its portion we
+ // still need to spill. It may differ from RegOffset if a portion of the
+ // current SubReg has already been spilled into AGPRs by the loop above.
+ unsigned RemRegOffset = Lane * 4;
+ unsigned RemEltSize = EltSize - (RemRegOffset - RegOffset);
+ if (RemEltSize != EltSize) { // Partially spilled to AGPRs
+ assert(IsFlat && EltSize > 4);
+
+ unsigned NumRegs = RemEltSize / 4;
+ SubReg = Register(getSubReg(ValueReg,
+ getSubRegFromChannel(RemRegOffset / 4, NumRegs)));
+ unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
+ Desc = &TII->get(Opc);
+ }
- MIB = BuildMI(*MBB, MI, DL, Desc)
- .addReg(SubReg,
- getDefRegState(!IsStore) | getKillRegState(IsKill))
- .addReg(ScratchRsrcReg);
- if (SOffset == AMDGPU::NoRegister) {
- MIB.addImm(0);
- } else {
- MIB.addReg(SOffset, SOffsetRegState);
+ unsigned FinalReg = SubReg;
+
+ if (IsAGPR) {
+ assert(EltSize == 4);
+
+ if (!TmpReg) {
+ assert(RS && "Needs to have RegScavenger to spill an AGPR!");
+ // FIXME: change to scavengeRegisterBackwards()
+ TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+ RS->setRegUsed(TmpReg);
}
- MIB.addImm(Offset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .addImm(0) // dlc
- .addImm(0) // swz
- .addMemOperand(NewMMO);
+ if (IsStore) {
+ auto AccRead = BuildMI(*MBB, MI, DL,
+ TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TmpReg)
+ .addReg(SubReg, getKillRegState(IsKill));
+ if (NeedSuperRegDef)
+ AccRead.addReg(ValueReg, RegState::ImplicitDefine);
+ AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
+ }
+ SubReg = TmpReg;
+ }
+
+ MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RemRegOffset);
+ MachineMemOperand *NewMMO =
+ MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
+ commonAlignment(Alignment, RemRegOffset));
+
+ auto MIB = BuildMI(*MBB, MI, DL, *Desc)
+ .addReg(SubReg,
+ getDefRegState(!IsStore) | getKillRegState(IsKill));
+ if (!IsFlat)
+ MIB.addReg(FuncInfo->getScratchRSrcReg());
- if (!IsStore && TmpReg != AMDGPU::NoRegister)
- MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
- FinalReg)
- .addReg(TmpReg, RegState::Kill);
+ if (SOffset == AMDGPU::NoRegister) {
+ if (!IsFlat)
+ MIB.addImm(0);
+ } else {
+ MIB.addReg(SOffset, SOffsetRegState);
+ }
+ MIB.addImm(Offset + RemRegOffset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0); // tfe for MUBUF or dlc for FLAT
+ if (!IsFlat)
+ MIB.addImm(0) // dlc
+ .addImm(0); // swz
+ MIB.addMemOperand(NewMMO);
+
+ if (!IsAGPR && NeedSuperRegDef)
+ MIB.addReg(ValueReg, RegState::ImplicitDefine);
+
+ if (!IsStore && TmpReg != AMDGPU::NoRegister) {
+ MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
+ FinalReg)
+ .addReg(TmpReg, RegState::Kill);
+ MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
}
- if (NumSubRegs > 1)
+ if (NeedSuperRegImpOperand)
MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
}
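The new element-size bookkeeping at the top of this loop is easiest to see with a worked example. The following standalone check (not LLVM code) reproduces the same arithmetic for a 192-bit spill with flat scratch enabled, which splits into one 16-byte access plus one 8-byte remainder access:

#include <algorithm>
#include <cassert>

int main() {
  unsigned RegWidth = 24;                    // bytes, e.g. a v[0:5] tuple
  bool IsFlat = true, IsAGPR = false;
  unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
  unsigned NumSubRegs = RegWidth / EltSize;  // full-size pieces
  unsigned Size = NumSubRegs * EltSize;
  unsigned RemSize = RegWidth - Size;        // leftover piece
  unsigned NumRemSubRegs = RemSize ? 1 : 0;
  assert(EltSize == 16 && NumSubRegs == 1 && RemSize == 8 && NumRemSubRegs == 1);
  return 0;
}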
@@ -907,9 +1059,10 @@ void SIRegisterInfo::buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI,
// Backup EXEC
if (OnlyExecLo) {
- SavedExecReg = NumSubRegs == 1
- ? SuperReg
- : getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]);
+ SavedExecReg =
+ NumSubRegs == 1
+ ? SuperReg
+ : Register(getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]));
} else {
// If src/dst is an odd size it is possible subreg0 is not aligned.
for (; ExecLane < (NumSubRegs - 1); ++ExecLane) {
@@ -942,15 +1095,19 @@ void SIRegisterInfo::buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI,
EltSize, Alignment);
if (IsLoad) {
- buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
+ unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
+ : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
+ buildSpillLoadStore(MI, Opc,
Index,
VGPR, false,
- MFI->getScratchRSrcReg(), FrameReg,
+ FrameReg,
Offset * EltSize, MMO,
RS);
} else {
- buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, Index, VGPR,
- IsKill, MFI->getScratchRSrcReg(), FrameReg,
+ unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
+ : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
+ buildSpillLoadStore(MI, Opc, Index, VGPR,
+ IsKill, FrameReg,
Offset * EltSize, MMO, RS);
// This only ever adds one VGPR spill
MFI->addToSpilledVGPRs(1);
@@ -966,15 +1123,15 @@ void SIRegisterInfo::buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI,
} else if (!IsKill) {
// Restore SGPRs from appropriate VGPR lanes
if (!OnlyExecLo) {
- BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
getSubReg(SuperReg, SplitParts[FirstPart + ExecLane + 1]))
.addReg(VGPR)
.addImm(ExecLane + 1);
}
- BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
- NumSubRegs == 1
- ? SavedExecReg
- : getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]))
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
+ NumSubRegs == 1 ? SavedExecReg
+ : Register(getSubReg(
+ SuperReg, SplitParts[FirstPart + ExecLane])))
.addReg(VGPR, RegState::Kill)
.addImm(ExecLane);
}
@@ -987,7 +1144,6 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MBB->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- DenseSet<Register> SGPRSpillVGPRDefinedSet;
ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
= MFI->getSGPRToVGPRSpills(Index);
@@ -1016,25 +1172,29 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
if (SpillToVGPR) {
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
- Register SubReg =
- NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
+ Register SubReg = NumSubRegs == 1
+ ? SuperReg
+ : Register(getSubReg(SuperReg, SplitParts[i]));
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
- // During SGPR spilling to VGPR, determine if the VGPR is defined. The
- // only circumstance in which we say it is undefined is when it is the
- // first spill to this VGPR in the first basic block.
- bool VGPRDefined = true;
- if (MBB == &MF->front())
- VGPRDefined = !SGPRSpillVGPRDefinedSet.insert(Spill.VGPR).second;
+ bool UseKill = IsKill && i == NumSubRegs - 1;
// Mark the "old value of vgpr" input undef only if this is the first sgpr
// spill to this specific vgpr in the first basic block.
- BuildMI(*MBB, MI, DL,
- TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
- Spill.VGPR)
- .addReg(SubReg, getKillRegState(IsKill))
- .addImm(Spill.Lane)
- .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef);
+ auto MIB =
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill.VGPR)
+ .addReg(SubReg, getKillRegState(UseKill))
+ .addImm(Spill.Lane)
+ .addReg(Spill.VGPR);
+
+ if (i == 0 && NumSubRegs > 1) {
+ // We may be spilling a super-register which is only partially defined,
+ // and need to ensure later spills think the value is defined.
+ MIB.addReg(SuperReg, RegState::ImplicitDefine);
+ }
+
+ if (NumSubRegs > 1)
+ MIB.addReg(SuperReg, getKillRegState(UseKill) | RegState::Implicit);
// FIXME: Since this spills to another register instead of an actual
// frame index, we should delete the frame index when all references to
@@ -1060,13 +1220,12 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
for (unsigned i = Offset * PerVGPR,
e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
i < e; ++i) {
- Register SubReg =
- NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
+ Register SubReg = NumSubRegs == 1
+ ? SuperReg
+ : Register(getSubReg(SuperReg, SplitParts[i]));
MachineInstrBuilder WriteLane =
- BuildMI(*MBB, MI, DL,
- TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
- TmpVGPR)
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), TmpVGPR)
.addReg(SubReg, SubKillState)
.addImm(i % PerVGPR)
.addReg(TmpVGPR, TmpVGPRFlags);
@@ -1126,15 +1285,14 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
if (SpillToVGPR) {
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
- Register SubReg =
- NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
+ Register SubReg = NumSubRegs == 1
+ ? SuperReg
+ : Register(getSubReg(SuperReg, SplitParts[i]));
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
- auto MIB =
- BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
- SubReg)
- .addReg(Spill.VGPR)
- .addImm(Spill.Lane);
+ auto MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
+ .addReg(Spill.VGPR)
+ .addImm(Spill.Lane);
if (NumSubRegs > 1 && i == 0)
MIB.addReg(SuperReg, RegState::ImplicitDefine);
}
@@ -1155,13 +1313,13 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
for (unsigned i = Offset * PerVGPR,
e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
i < e; ++i) {
- Register SubReg =
- NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
+ Register SubReg = NumSubRegs == 1
+ ? SuperReg
+ : Register(getSubReg(SuperReg, SplitParts[i]));
bool LastSubReg = (i + 1 == e);
auto MIB =
- BuildMI(*MBB, MI, DL,
- TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), SubReg)
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
.addReg(TmpVGPR, getKillRegState(LastSubReg))
.addImm(i);
if (NumSubRegs > 1 && i == 0)
@@ -1259,6 +1417,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_V1024_SAVE:
case AMDGPU::SI_SPILL_V512_SAVE:
case AMDGPU::SI_SPILL_V256_SAVE:
+ case AMDGPU::SI_SPILL_V192_SAVE:
case AMDGPU::SI_SPILL_V160_SAVE:
case AMDGPU::SI_SPILL_V128_SAVE:
case AMDGPU::SI_SPILL_V96_SAVE:
@@ -1266,7 +1425,11 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_V32_SAVE:
case AMDGPU::SI_SPILL_A1024_SAVE:
case AMDGPU::SI_SPILL_A512_SAVE:
+ case AMDGPU::SI_SPILL_A256_SAVE:
+ case AMDGPU::SI_SPILL_A192_SAVE:
+ case AMDGPU::SI_SPILL_A160_SAVE:
case AMDGPU::SI_SPILL_A128_SAVE:
+ case AMDGPU::SI_SPILL_A96_SAVE:
case AMDGPU::SI_SPILL_A64_SAVE:
case AMDGPU::SI_SPILL_A32_SAVE: {
const MachineOperand *VData = TII->getNamedOperand(*MI,
@@ -1274,10 +1437,11 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
MFI->getStackPtrOffsetReg());
- buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
+ unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
+ : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
+ buildSpillLoadStore(MI, Opc,
Index,
VData->getReg(), VData->isKill(),
- TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
FrameReg,
TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
*MI->memoperands_begin(),
@@ -1291,12 +1455,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_V96_RESTORE:
case AMDGPU::SI_SPILL_V128_RESTORE:
case AMDGPU::SI_SPILL_V160_RESTORE:
+ case AMDGPU::SI_SPILL_V192_RESTORE:
case AMDGPU::SI_SPILL_V256_RESTORE:
case AMDGPU::SI_SPILL_V512_RESTORE:
case AMDGPU::SI_SPILL_V1024_RESTORE:
case AMDGPU::SI_SPILL_A32_RESTORE:
case AMDGPU::SI_SPILL_A64_RESTORE:
+ case AMDGPU::SI_SPILL_A96_RESTORE:
case AMDGPU::SI_SPILL_A128_RESTORE:
+ case AMDGPU::SI_SPILL_A160_RESTORE:
+ case AMDGPU::SI_SPILL_A192_RESTORE:
+ case AMDGPU::SI_SPILL_A256_RESTORE:
case AMDGPU::SI_SPILL_A512_RESTORE:
case AMDGPU::SI_SPILL_A1024_RESTORE: {
const MachineOperand *VData = TII->getNamedOperand(*MI,
@@ -1304,10 +1473,11 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
MFI->getStackPtrOffsetReg());
- buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
+ unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
+ : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
+ buildSpillLoadStore(MI, Opc,
Index,
VData->getReg(), VData->isKill(),
- TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
FrameReg,
TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
*MI->memoperands_begin(),
@@ -1318,6 +1488,117 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
default: {
const DebugLoc &DL = MI->getDebugLoc();
+
+ int64_t Offset = FrameInfo.getObjectOffset(Index);
+ if (ST.enableFlatScratch()) {
+ if (TII->isFLATScratch(*MI)) {
+ assert((int16_t)FIOperandNum ==
+ AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::saddr));
+
+ // The offset is always swizzled; just replace it
+ if (FrameReg)
+ FIOp.ChangeToRegister(FrameReg, false);
+
+ if (!Offset)
+ return;
+
+ MachineOperand *OffsetOp =
+ TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
+ int64_t NewOffset = Offset + OffsetOp->getImm();
+ if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
+ true)) {
+ OffsetOp->setImm(NewOffset);
+ if (FrameReg)
+ return;
+ Offset = 0;
+ }
+
+ assert(!TII->getNamedOperand(*MI, AMDGPU::OpName::vaddr) &&
+ "Unexpected vaddr for flat scratch with a FI operand");
+
+ // On GFX10 we have ST mode to use no registers for an address.
+ // Otherwise we need to materialize 0 into an SGPR.
+ if (!Offset && ST.hasFlatScratchSTMode()) {
+ unsigned Opc = MI->getOpcode();
+ unsigned NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
+ MI->RemoveOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
+ MI->setDesc(TII->get(NewOpc));
+ return;
+ }
+ }
+
+ if (!FrameReg) {
+ FIOp.ChangeToImmediate(Offset);
+ if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
+ return;
+ }
+
+ // We need to use a register here. Check if we can use an SGPR or need
+ // a VGPR.
+ FIOp.ChangeToRegister(AMDGPU::M0, false);
+ bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
+
+ if (!Offset && FrameReg && UseSGPR) {
+ FIOp.setReg(FrameReg);
+ return;
+ }
+
+ const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
+ : &AMDGPU::VGPR_32RegClass;
+
+ Register TmpReg = RS->scavengeRegister(RC, MI, 0, !UseSGPR);
+ FIOp.setReg(TmpReg);
+ FIOp.setIsKill(true);
+
+ if ((!FrameReg || !Offset) && TmpReg) {
+ unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
+ auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
+ if (FrameReg)
+ MIB.addReg(FrameReg);
+ else
+ MIB.addImm(Offset);
+
+ return;
+ }
+
+ Register TmpSReg =
+ UseSGPR ? TmpReg
+ : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0,
+ !UseSGPR);
+
+ // TODO: for flat scratch another attempt can be made with a VGPR index
+ // if no SGPRs can be scavenged.
+ if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
+ report_fatal_error("Cannot scavenge register in FI elimination!");
+
+ if (!TmpSReg) {
+ // Use frame register and restore it after.
+ TmpSReg = FrameReg;
+ FIOp.setReg(FrameReg);
+ FIOp.setIsKill(false);
+ }
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), TmpSReg)
+ .addReg(FrameReg)
+ .addImm(Offset);
+
+ if (!UseSGPR)
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+ .addReg(TmpSReg, RegState::Kill);
+
+ if (TmpSReg == FrameReg) {
+ // Undo frame register modification.
+ BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_SUB_U32),
+ FrameReg)
+ .addReg(FrameReg)
+ .addImm(Offset);
+ }
+
+ return;
+ }
+
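The flat-scratch frame-index lowering added above tries several strategies in order. The sketch below compresses that control flow into one function; it is illustrative only (not LLVM code), and the boolean parameters stand in for the legality and scavenging checks the real implementation performs with isLegalFLATOffset, isImmOperandLegal, isOperandLegal and scavengeRegister:

#include <cstdio>

static const char *lowerFrameIndex(bool IsFlatScratchInst, bool OffsetFitsFlat,
                                   bool HaveFrameReg, bool ImmLegal,
                                   bool SGPROperandLegal, bool GotSGPR,
                                   bool GotVGPR) {
  if (IsFlatScratchInst && HaveFrameReg && OffsetFitsFlat)
    return "rewrite saddr to the frame register and fold the offset";
  if (!HaveFrameReg && ImmLegal)
    return "turn the frame index into a plain immediate";
  if (SGPROperandLegal && GotSGPR)
    return "materialize the address in a scavenged SGPR (s_mov/s_add_u32)";
  if (!SGPROperandLegal && GotVGPR)
    return "materialize the address in a scavenged VGPR (v_mov_b32)";
  if (HaveFrameReg)
    return "temporarily add the offset to the frame register, undo it after";
  return "report_fatal_error: cannot scavenge register in FI elimination";
}

int main() {
  std::puts(lowerFrameIndex(true, true, true, false, true, true, true));
  return 0;
}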
bool IsMUBUF = TII->isMUBUF(*MI);
if (!IsMUBUF && !MFI->isEntryFunction()) {
@@ -1356,7 +1637,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (!IsVOP2)
MIB.addImm(0); // clamp bit
} else {
- assert(MIB->getOpcode() == AMDGPU::V_ADD_I32_e64 &&
+ assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
"Need to reuse carry out register");
// Use scavenged unused carry out as offset register.
@@ -1419,23 +1700,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
AMDGPU::OpName::vaddr));
auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
- assert((SOffset.isReg() &&
- SOffset.getReg() == MFI->getStackPtrOffsetReg()) ||
- (SOffset.isImm() && SOffset.getImm() == 0));
- if (SOffset.isReg()) {
- if (FrameReg == AMDGPU::NoRegister) {
- SOffset.ChangeToImmediate(0);
- } else {
- SOffset.setReg(FrameReg);
- }
- }
+ assert((SOffset.isImm() && SOffset.getImm() == 0));
+
+ if (FrameReg != AMDGPU::NoRegister)
+ SOffset.ChangeToRegister(FrameReg, false);
int64_t Offset = FrameInfo.getObjectOffset(Index);
int64_t OldImm
= TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
int64_t NewOffset = OldImm + Offset;
- if (isUInt<12>(NewOffset) &&
+ if (SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
MI->eraseFromParent();
return;
@@ -1445,7 +1720,6 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// If the offset is simply too big, don't convert to a scratch wave offset
// relative index.
- int64_t Offset = FrameInfo.getObjectOffset(Index);
FIOp.ChangeToImmediate(Offset);
if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
@@ -1590,6 +1864,16 @@ SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
return nullptr;
}
+bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
+ Register Reg) const {
+ const TargetRegisterClass *RC;
+ if (Reg.isVirtual())
+ RC = MRI.getRegClass(Reg);
+ else
+ RC = getPhysRegClass(Reg);
+ return isSGPRClass(RC);
+}
+
// TODO: It might be helpful to have some target specific flags in
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
@@ -1698,6 +1982,12 @@ bool SIRegisterInfo::shouldRewriteCopySrc(
return getCommonSubClass(DefRC, SrcRC) != nullptr;
}
+bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
+ // TODO: 64-bit operands have extending behavior from 32-bit literal.
+ return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
+ OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
+}
+
/// Returns a lowest register that is not used at any point in the function.
/// If all registers are used, then this function will return
/// AMDGPU::NoRegister. If \p ReserveHighestVGPR = true, then return
@@ -1903,7 +2193,8 @@ MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
DefIdx = V->def;
} else {
// Find last def.
- for (MCRegUnitIterator Units(Reg, this); Units.isValid(); ++Units) {
+ for (MCRegUnitIterator Units(Reg.asMCReg(), this); Units.isValid();
+ ++Units) {
LiveRange &LR = LIS->getRegUnit(*Units);
if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
if (!DefIdx.isValid() ||
@@ -1963,11 +2254,12 @@ SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
}
ArrayRef<MCPhysReg>
-SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
- return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
+SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
+ return makeArrayRef(AMDGPU::SGPR_64RegClass.begin(),
+ ST.getMaxNumSGPRs(MF) / 2);
}
ArrayRef<MCPhysReg>
-SIRegisterInfo::getAllVGPR32(const MachineFunction &MF) const {
- return makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), ST.getMaxNumVGPRs(MF));
+SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
+ return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 62d9f1174337..963da9b3536b 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -17,13 +17,11 @@
#define GET_REGINFO_HEADER
#include "AMDGPUGenRegisterInfo.inc"
-#include "SIDefines.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-
namespace llvm {
class GCNSubtarget;
class LiveIntervals;
+class RegisterBank;
class SIMachineFunctionInfo;
class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
@@ -40,6 +38,11 @@ private:
/// all elements of the inner vector combined give a full lane mask.
static std::array<std::vector<int16_t>, 16> RegSplitParts;
+ // Table representing sub reg of given width and offset.
+ // First index is subreg size: 32, 64, 96, 128, 160, 192, 224, 256, 512.
+ // Second index is 32 different dword offsets.
+ static std::array<std::array<uint16_t, 32>, 9> SubRegFromChannelTable;
+
void reserveRegisterTuples(BitVector &, MCRegister Reg) const;
public:
@@ -63,6 +66,7 @@ public:
const MCPhysReg *getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID) const override;
+ const uint32_t *getNoPreservedMask() const override;
// Stack access is very expensive. CSRs are also the high registers, and we
// want to minimize the number of used registers.
@@ -83,16 +87,15 @@ public:
const MachineFunction &MF) const override;
bool requiresVirtualBaseRegisters(const MachineFunction &Fn) const override;
- int64_t getMUBUFInstrOffset(const MachineInstr *MI) const;
+ int64_t getScratchInstrOffset(const MachineInstr *MI) const;
int64_t getFrameIndexInstrOffset(const MachineInstr *MI,
int Idx) const override;
bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
- void materializeFrameBaseRegister(MachineBasicBlock *MBB, Register BaseReg,
- int FrameIdx,
- int64_t Offset) const override;
+ Register materializeFrameBaseRegister(MachineBasicBlock *MBB, int FrameIdx,
+ int64_t Offset) const override;
void resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const override;
@@ -126,6 +129,7 @@ public:
StringRef getRegAsmName(MCRegister Reg) const override;
+ // Pseudo regs are not allowed
unsigned getHWRegIndex(MCRegister Reg) const {
return getEncodingValue(Reg) & 0xff;
}
@@ -148,14 +152,7 @@ public:
return isSGPRClass(getRegClass(RCID));
}
- bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const {
- const TargetRegisterClass *RC;
- if (Reg.isVirtual())
- RC = MRI.getRegClass(Reg);
- else
- RC = getPhysRegClass(Reg);
- return isSGPRClass(RC);
- }
+ bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const;
/// \returns true if this class contains only AGPR registers
bool isAGPRClass(const TargetRegisterClass *RC) const {
@@ -198,11 +195,7 @@ public:
/// \returns True if operands defined with this operand type can accept
/// a literal constant (i.e. any 32-bit immediate).
- bool opCanUseLiteralConstant(unsigned OpType) const {
- // TODO: 64-bit operands have extending behavior from 32-bit literal.
- return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
- OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
- }
+ bool opCanUseLiteralConstant(unsigned OpType) const;
/// \returns True if operands defined with this operand type can accept
/// an inline constant. i.e. An integer value in the range (-16, 64) or
@@ -317,13 +310,13 @@ public:
/// of the subtarget.
ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF) const;
- /// Return all SGPR32 which satisfy the waves per execution unit requirement
+ /// Return all SGPR64 which satisfy the waves per execution unit requirement
/// of the subtarget.
- ArrayRef<MCPhysReg> getAllSGPR32(const MachineFunction &MF) const;
+ ArrayRef<MCPhysReg> getAllSGPR64(const MachineFunction &MF) const;
- /// Return all VGPR32 which satisfy the waves per execution unit requirement
+ /// Return all SGPR32 which satisfy the waves per execution unit requirement
/// of the subtarget.
- ArrayRef<MCPhysReg> getAllVGPR32(const MachineFunction &MF) const;
+ ArrayRef<MCPhysReg> getAllSGPR32(const MachineFunction &MF) const;
private:
void buildSpillLoadStore(MachineBasicBlock::iterator MI,
@@ -331,7 +324,6 @@ private:
int Index,
Register ValueReg,
bool ValueIsKill,
- MCRegister ScratchRsrcReg,
MCRegister ScratchOffsetReg,
int64_t InstrOffset,
MachineMemOperand *MMO,
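
The SubRegFromChannelTable declared earlier in this header's diff is indexed first by subregister width class (32, 64, 96, 128, 160, 192, 224, 256, 512 bits) and then by one of 32 dword offsets. A minimal sketch of that two-level lookup is shown below; the table contents and the rowForNumChannels helper are hypothetical placeholders, only the indexing scheme mirrors the comment.

#include <array>
#include <cstdint>
#include <iostream>

// Map a subregister width, given in 32-bit channels (1..8 or 16), to a row
// of the table; mirrors the "32, 64, 96, ..., 256, 512" width classes.
static int rowForNumChannels(unsigned NumChannels) {
  return NumChannels <= 8 ? int(NumChannels) - 1 : 8;
}

int main() {
  // 9 width classes x 32 dword offsets, filled with a placeholder encoding;
  // in the real table each entry would be a generated subregister index.
  std::array<std::array<uint16_t, 32>, 9> Table{};
  for (int Row = 0; Row < 9; ++Row)
    for (int Off = 0; Off < 32; ++Off)
      Table[Row][Off] = uint16_t(Row * 32 + Off);

  // Look up a 64-bit (2-channel) subregister starting at dword offset 4.
  std::cout << Table[rowForNumChannels(2)][4] << '\n';   // prints 36
}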
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index ff1f5c4bc49b..92390f1f3297 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -17,9 +17,7 @@ class Indexes<int N> {
24, 25, 26, 27, 28, 29, 30, 31];
// Returns list of indexes [0..N)
- list<int> slice =
- !foldl([]<int>, all, acc, cur,
- !listconcat(acc, !if(!lt(cur, N), [cur], [])));
+ list<int> slice = !filter(i, all, !lt(i, N));
}
let Namespace = "AMDGPU" in {
@@ -27,17 +25,17 @@ let Namespace = "AMDGPU" in {
def lo16 : SubRegIndex<16, 0>;
def hi16 : SubRegIndex<16, 16>;
-foreach Index = 0-31 in {
+foreach Index = 0...31 in {
def sub#Index : SubRegIndex<32, !shl(Index, 5)>;
}
-foreach Index = 1-31 in {
+foreach Index = 1...31 in {
def sub#Index#_lo16 : ComposedSubRegIndex<!cast<SubRegIndex>(sub#Index), lo16>;
def sub#Index#_hi16 : ComposedSubRegIndex<!cast<SubRegIndex>(sub#Index), hi16>;
}
-foreach Size = {2-6,8,16} in {
- foreach Index = Indexes<!add(33, !mul(Size, -1))>.slice in {
+foreach Size = {2...6,8,16} in {
+ foreach Index = Indexes<!sub(33, Size)>.slice in {
def !foldl("", Indexes<Size>.slice, acc, cur,
!strconcat(acc#!if(!eq(acc,""),"","_"), "sub"#!add(cur, Index))) :
SubRegIndex<!mul(Size, 32), !shl(Index, 5)> {
@@ -89,7 +87,7 @@ class getSubRegs<int size> {
class RegSeqNames<int last_reg, int stride, int size, string prefix,
int start = 0> {
int next = !add(start, stride);
- int end_reg = !add(!add(start, size), -1);
+ int end_reg = !add(start, size, -1);
list<string> ret =
!if(!le(end_reg, last_reg),
!listconcat([prefix # "[" # start # ":" # end_reg # "]"],
@@ -102,7 +100,7 @@ class RegSeqDags<RegisterClass RC, int last_reg, int stride, int size,
int start = 0> {
dag trunc_rc = (trunc RC,
!if(!and(!eq(stride, 1), !eq(start, 0)),
- !add(!add(last_reg, 2), !mul(size, -1)),
+ !sub(!add(last_reg, 2), size),
!add(last_reg, 1)));
list<dag> ret =
!if(!lt(start, size),
@@ -149,7 +147,7 @@ multiclass SIRegLoHi16 <string n, bits<16> regIdx, bit ArtificialHigh = 1,
!cast<Register>(NAME#"_HI16")]> {
let Namespace = "AMDGPU";
let SubRegIndices = [lo16, hi16];
- let CoveredBySubRegs = !if(ArtificialHigh,0,1);
+ let CoveredBySubRegs = !not(ArtificialHigh);
let HWEncoding = regIdx;
let HWEncoding{8} = HWEncodingHigh;
}
@@ -247,10 +245,10 @@ def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]> {
let HWEncoding = 110;
}
-foreach Index = 0-15 in {
- defm TTMP#Index#_vi : SIRegLoHi16<"ttmp"#Index, !add(112, Index)>;
- defm TTMP#Index#_gfx9_gfx10 : SIRegLoHi16<"ttmp"#Index, !add(108, Index)>;
- defm TTMP#Index : SIRegLoHi16<"ttmp"#Index, 0>;
+foreach Index = 0...15 in {
+ defm TTMP#Index#_vi : SIRegLoHi16<"ttmp"#Index, !add(112, Index)>;
+ defm TTMP#Index#_gfx9plus : SIRegLoHi16<"ttmp"#Index, !add(108, Index)>;
+ defm TTMP#Index : SIRegLoHi16<"ttmp"#Index, 0>;
}
multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> {
@@ -274,7 +272,7 @@ def FLAT_SCR_vi : FlatReg<FLAT_SCR_LO_vi, FLAT_SCR_HI_vi, 102>;
def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>;
// SGPR registers
-foreach Index = 0-105 in {
+foreach Index = 0...105 in {
defm SGPR#Index :
SIRegLoHi16 <"s"#Index, Index>,
DwarfRegNum<[!if(!le(Index, 63), !add(Index, 32), !add(Index, 1024)),
@@ -282,14 +280,14 @@ foreach Index = 0-105 in {
}
// VGPR registers
-foreach Index = 0-255 in {
+foreach Index = 0...255 in {
defm VGPR#Index :
SIRegLoHi16 <"v"#Index, Index, 0, 1>,
DwarfRegNum<[!add(Index, 2560), !add(Index, 1536)]>;
}
// AccVGPR registers
-foreach Index = 0-255 in {
+foreach Index = 0...255 in {
defm AGPR#Index :
SIRegLoHi16 <"a"#Index, Index, 1, 1>,
DwarfRegNum<[!add(Index, 3072), !add(Index, 2048)]>;
@@ -389,7 +387,7 @@ def TTMP_512Regs : SIRegisterTuples<getSubRegs<16>.ret, TTMP_32, 15, 4, 16, "ttm
class TmpRegTuplesBase<int index, int size,
list<Register> subRegs,
list<SubRegIndex> indices = getSubRegs<size>.ret,
- int index1 = !add(index, !add(size, -1)),
+ int index1 = !add(index, size, -1),
string name = "ttmp["#index#":"#index1#"]"> :
RegisterWithSubRegs<name, subRegs> {
let HWEncoding = subRegs[0].HWEncoding;
@@ -421,8 +419,8 @@ class TmpRegTuples<string tgt,
getSubRegs<size>.ret>;
foreach Index = {0, 2, 4, 6, 8, 10, 12, 14} in {
- def TTMP#Index#_TTMP#!add(Index,1)#_vi : TmpRegTuples<"_vi", 2, Index>;
- def TTMP#Index#_TTMP#!add(Index,1)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 2, Index>;
+ def TTMP#Index#_TTMP#!add(Index,1)#_vi : TmpRegTuples<"_vi", 2, Index>;
+ def TTMP#Index#_TTMP#!add(Index,1)#_gfx9plus : TmpRegTuples<"_gfx9plus", 2, Index>;
}
foreach Index = {0, 4, 8, 12} in {
@@ -431,7 +429,7 @@ foreach Index = {0, 4, 8, 12} in {
_TTMP#!add(Index,3)#_vi : TmpRegTuples<"_vi", 4, Index>;
def TTMP#Index#_TTMP#!add(Index,1)#
_TTMP#!add(Index,2)#
- _TTMP#!add(Index,3)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 4, Index>;
+ _TTMP#!add(Index,3)#_gfx9plus : TmpRegTuples<"_gfx9plus", 4, Index>;
}
foreach Index = {0, 4, 8} in {
@@ -448,7 +446,7 @@ foreach Index = {0, 4, 8} in {
_TTMP#!add(Index,4)#
_TTMP#!add(Index,5)#
_TTMP#!add(Index,6)#
- _TTMP#!add(Index,7)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 8, Index>;
+ _TTMP#!add(Index,7)#_gfx9plus : TmpRegTuples<"_gfx9plus", 8, Index>;
}
def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_vi :
@@ -458,12 +456,12 @@ def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TT
TTMP8_vi, TTMP9_vi, TTMP10_vi, TTMP11_vi,
TTMP12_vi, TTMP13_vi, TTMP14_vi, TTMP15_vi]>;
-def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_gfx9_gfx10 :
+def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_gfx9plus :
TmpRegTuplesBase<0, 16,
- [TTMP0_gfx9_gfx10, TTMP1_gfx9_gfx10, TTMP2_gfx9_gfx10, TTMP3_gfx9_gfx10,
- TTMP4_gfx9_gfx10, TTMP5_gfx9_gfx10, TTMP6_gfx9_gfx10, TTMP7_gfx9_gfx10,
- TTMP8_gfx9_gfx10, TTMP9_gfx9_gfx10, TTMP10_gfx9_gfx10, TTMP11_gfx9_gfx10,
- TTMP12_gfx9_gfx10, TTMP13_gfx9_gfx10, TTMP14_gfx9_gfx10, TTMP15_gfx9_gfx10]>;
+ [TTMP0_gfx9plus, TTMP1_gfx9plus, TTMP2_gfx9plus, TTMP3_gfx9plus,
+ TTMP4_gfx9plus, TTMP5_gfx9plus, TTMP6_gfx9plus, TTMP7_gfx9plus,
+ TTMP8_gfx9plus, TTMP9_gfx9plus, TTMP10_gfx9plus, TTMP11_gfx9plus,
+ TTMP12_gfx9plus, TTMP13_gfx9plus, TTMP14_gfx9plus, TTMP15_gfx9plus]>;
class RegisterTypes<list<ValueType> reg_types> {
list<ValueType> types = reg_types;
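
The Indexes<N>.slice change above replaces a !foldl accumulation with !filter(i, all, !lt(i, N)), i.e. keep every index below N. A plain C++ analogue of that operation, for illustration only (the sliceBelow name is made up):

#include <algorithm>
#include <iostream>
#include <iterator>
#include <vector>

// Keep every index below N, i.e. the slice [0..N) of an index list.
std::vector<int> sliceBelow(const std::vector<int> &All, int N) {
  std::vector<int> Out;
  std::copy_if(All.begin(), All.end(), std::back_inserter(Out),
               [N](int I) { return I < N; });
  return Out;
}

int main() {
  std::vector<int> All(32);
  for (int I = 0; I < 32; ++I)
    All[I] = I;
  for (int I : sliceBelow(All, 5))
    std::cout << I << ' ';                               // prints 0 1 2 3 4
  std::cout << '\n';
}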
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
index 64fca0b46797..d30ff4a3fd15 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
@@ -14,9 +14,8 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/Support/CommandLine.h"
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td
index 932381c99e0b..db4a009e08d7 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -104,6 +104,9 @@ def HWVALU : ProcResource<1> {
def HWRC : ProcResource<1> { // Register destination cache
let BufferSize = 1;
}
+def HWXDL : ProcResource<1> { // MFMA CU
+ let BufferSize = 0;
+}
class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,
int latency> : WriteRes<write, resources> {
@@ -138,12 +141,16 @@ multiclass SICommonWriteRes {
def : HWVALUWriteRes<WriteFloatCvt, 4>;
def : HWVALUWriteRes<WriteTrans32, 4>;
def : HWVALUWriteRes<WriteQuarterRate32, 4>;
- def : HWVALUWriteRes<Write2PassMAI, 2>;
- def : HWVALUWriteRes<Write8PassMAI, 8>;
- def : HWVALUWriteRes<Write16PassMAI, 16>;
+
+ let ResourceCycles = [2] in
+ def : HWWriteRes<Write2PassMAI, [HWXDL], 2>;
+ let ResourceCycles = [8] in
+ def : HWWriteRes<Write8PassMAI, [HWXDL], 8>;
+ let ResourceCycles = [16] in
+ def : HWWriteRes<Write16PassMAI, [HWXDL], 16>;
def : ReadAdvance<MIVGPRRead, -2>;
- def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32$")>;
+ def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
// Technically mfma reads can be from 0 to 4 cycles but that does not make
// sense to model because its register setup is huge. In particular if we
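
The HWXDL entries above give the MFMA writes BufferSize = 0 and ResourceCycles equal to the pass count, which, as far as the scheduling model goes, keeps the unit busy for that many cycles per instruction. A small illustrative calculation of the resulting issue cycles, not derived from the scheduler itself:

#include <iostream>
#include <vector>

// Issue cycle of each MFMA when the XDL unit is occupied for its pass count.
std::vector<unsigned> issueCycles(const std::vector<unsigned> &PassCounts) {
  std::vector<unsigned> Start;
  unsigned FreeAt = 0;                  // first cycle the XDL unit is free
  for (unsigned Passes : PassCounts) {
    Start.push_back(FreeAt);
    FreeAt += Passes;                   // unit busy for `Passes` cycles
  }
  return Start;
}

int main() {
  for (unsigned C : issueCycles({2, 8, 16})) // 2-, 8- and 16-pass MFMAs
    std::cout << C << ' ';              // prints 0 2 10
  std::cout << '\n';
}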
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 9c6833a7dab6..2628070f219c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -9,19 +9,10 @@
//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "si-shrink-instructions"
@@ -78,7 +69,7 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
MachineOperand &Src0 = MI.getOperand(Src0Idx);
if (Src0.isReg()) {
Register Reg = Src0.getReg();
- if (Register::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) {
+ if (Reg.isVirtual() && MRI.hasOneUse(Reg)) {
MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
if (Def && Def->isMoveImmediate()) {
MachineOperand &MovSrc = Def->getOperand(1);
@@ -86,13 +77,9 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
isUInt<32>(MovSrc.getImm()))) {
- // It's possible to have only one component of a super-reg defined by
- // a single mov, so we need to clear any subregister flag.
- Src0.setSubReg(0);
Src0.ChangeToImmediate(MovSrc.getImm());
ConstantFolded = true;
} else if (MovSrc.isFI()) {
- Src0.setSubReg(0);
Src0.ChangeToFrameIndex(MovSrc.getIndex());
ConstantFolded = true;
} else if (MovSrc.isGlobal()) {
@@ -276,8 +263,8 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
// enabled
int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
- unsigned TFEVal = MI.getOperand(TFEIdx).getImm();
- unsigned LWEVal = MI.getOperand(LWEIdx).getImm();
+ unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
+ unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
int ToUntie = -1;
if (TFEVal || LWEVal) {
// TFE/LWE is enabled so we need to deal with an implicit tied operand
@@ -367,19 +354,23 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
}
if (NewImm != 0) {
- if (Register::isVirtualRegister(Dest->getReg()) && SrcReg->isReg()) {
+ if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
return true;
}
if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
+ const bool IsUndef = SrcReg->isUndef();
+ const bool IsKill = SrcReg->isKill();
MI.setDesc(TII->get(Opc));
if (Opc == AMDGPU::S_BITSET0_B32 ||
Opc == AMDGPU::S_BITSET1_B32) {
Src0->ChangeToImmediate(NewImm);
// Remove the immediate and add the tied input.
- MI.getOperand(2).ChangeToRegister(Dest->getReg(), false);
+ MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
+ /*isImp*/ false, IsKill,
+ /*isDead*/ false, IsUndef);
MI.tieOperands(0, 2);
} else {
SrcImm->setImm(NewImm);
@@ -393,17 +384,16 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
- unsigned Reg, unsigned SubReg,
+ Register Reg, unsigned SubReg,
const SIRegisterInfo &TRI) {
for (const MachineOperand &MO : R) {
if (!MO.isReg())
continue;
- if (Register::isPhysicalRegister(Reg) &&
- Register::isPhysicalRegister(MO.getReg())) {
+ if (Reg.isPhysical() && MO.getReg().isPhysical()) {
if (TRI.regsOverlap(Reg, MO.getReg()))
return true;
- } else if (MO.getReg() == Reg && Register::isVirtualRegister(Reg)) {
+ } else if (MO.getReg() == Reg && Reg.isVirtual()) {
LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) &
TRI.getSubRegIndexLaneMask(MO.getSubReg());
if (Overlap.any())
@@ -426,10 +416,10 @@ static bool instModifiesReg(const MachineInstr *MI,
}
static TargetInstrInfo::RegSubRegPair
-getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I,
+getSubRegForIndex(Register Reg, unsigned Sub, unsigned I,
const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) {
if (TRI.getRegSizeInBits(Reg, MRI) != 32) {
- if (Register::isPhysicalRegister(Reg)) {
+ if (Reg.isPhysical()) {
Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I));
} else {
Sub = TRI.getSubRegFromChannel(I + TRI.getChannelFromSubReg(Sub));
@@ -438,6 +428,22 @@ getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I,
return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}
+static void dropInstructionKeepingImpDefs(MachineInstr &MI,
+ const SIInstrInfo *TII) {
+ for (unsigned i = MI.getDesc().getNumOperands() +
+ MI.getDesc().getNumImplicitUses() +
+ MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
+ i != e; ++i) {
+ const MachineOperand &Op = MI.getOperand(i);
+ if (!Op.isDef())
+ continue;
+ BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg());
+ }
+
+ MI.eraseFromParent();
+}
+
// Match:
// mov t, x
// mov x, y
@@ -477,18 +483,25 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
if (!TRI.isVGPR(MRI, X))
return nullptr;
+ if (MovT.hasRegisterImplicitUseOperand(AMDGPU::M0))
+ return nullptr;
+
const unsigned SearchLimit = 16;
unsigned Count = 0;
+ bool KilledT = false;
for (auto Iter = std::next(MovT.getIterator()),
E = MovT.getParent()->instr_end();
- Iter != E && Count < SearchLimit; ++Iter, ++Count) {
+ Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {
MachineInstr *MovY = &*Iter;
+ KilledT = MovY->killsRegister(T, &TRI);
+
if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
MovY->getOpcode() != AMDGPU::COPY) ||
!MovY->getOperand(1).isReg() ||
MovY->getOperand(1).getReg() != T ||
- MovY->getOperand(1).getSubReg() != Tsub)
+ MovY->getOperand(1).getSubReg() != Tsub ||
+ MovY->hasRegisterImplicitUseOperand(AMDGPU::M0))
continue;
Register Y = MovY->getOperand(0).getReg();
@@ -522,32 +535,53 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
MovX = nullptr;
break;
}
+ // Implicit use of M0 is an indirect move.
+ if (I->hasRegisterImplicitUseOperand(AMDGPU::M0))
+ continue;
+
+ if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
+ continue;
+
MovX = &*I;
}
if (!MovX)
continue;
- LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << MovY);
+ LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY);
for (unsigned I = 0; I < Size; ++I) {
TargetInstrInfo::RegSubRegPair X1, Y1;
X1 = getSubRegForIndex(X, Xsub, I, TRI, MRI);
Y1 = getSubRegForIndex(Y, Ysub, I, TRI, MRI);
- BuildMI(*MovT.getParent(), MovX->getIterator(), MovT.getDebugLoc(),
- TII->get(AMDGPU::V_SWAP_B32))
+ MachineBasicBlock &MBB = *MovT.getParent();
+ auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
+ TII->get(AMDGPU::V_SWAP_B32))
.addDef(X1.Reg, 0, X1.SubReg)
.addDef(Y1.Reg, 0, Y1.SubReg)
.addReg(Y1.Reg, 0, Y1.SubReg)
.addReg(X1.Reg, 0, X1.SubReg).getInstr();
+ if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
+ // Drop implicit EXEC.
+ MIB->RemoveOperand(MIB->getNumExplicitOperands());
+ MIB->copyImplicitOps(*MBB.getParent(), *MovX);
+ }
}
MovX->eraseFromParent();
- MovY->eraseFromParent();
+ dropInstructionKeepingImpDefs(*MovY, TII);
MachineInstr *Next = &*std::next(MovT.getIterator());
- if (MRI.use_nodbg_empty(T))
- MovT.eraseFromParent();
- else
+
+ if (MRI.use_nodbg_empty(T)) {
+ dropInstructionKeepingImpDefs(MovT, TII);
+ } else {
Xop.setIsKill(false);
+ for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I ) {
+ unsigned OpNo = MovT.getNumExplicitOperands() + I;
+ const MachineOperand &Op = MovT.getOperand(OpNo);
+ if (Op.isKill() && TRI.regsOverlap(X, Op.getReg()))
+ MovT.RemoveOperand(OpNo);
+ }
+ }
return Next;
}
@@ -585,8 +619,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// optimizations happen because this will confuse them.
// XXX - not exactly a check for post-regalloc run.
MachineOperand &Src = MI.getOperand(1);
- if (Src.isImm() &&
- Register::isPhysicalRegister(MI.getOperand(0).getReg())) {
+ if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
int32_t ReverseImm;
if (isReverseInlineImm(TII, Src, ReverseImm)) {
MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
@@ -604,35 +637,6 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
}
}
- // Combine adjacent s_nops to use the immediate operand encoding how long
- // to wait.
- //
- // s_nop N
- // s_nop M
- // =>
- // s_nop (N + M)
- if (MI.getOpcode() == AMDGPU::S_NOP &&
- MI.getNumOperands() == 1 && // Don't merge with implicit operands
- Next != MBB.end() &&
- (*Next).getOpcode() == AMDGPU::S_NOP &&
- (*Next).getNumOperands() == 1) {
-
- MachineInstr &NextMI = *Next;
- // The instruction encodes the amount to wait with an offset of 1,
- // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
- // after adding.
- uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
- uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;
-
- // Make sure we don't overflow the bounds.
- if (Nop0 + Nop1 <= 8) {
- NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
- MI.eraseFromParent();
- }
-
- continue;
- }
-
// FIXME: We also need to consider movs of constant operands since
// immediate operands are not folded if they have more than one use, and
// the operand folding pass is unaware if the immediate will be free since
@@ -652,7 +656,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// FIXME: This could work better if hints worked with subregisters. If
// we have a vector add of a constant, we usually don't get the correct
// allocation due to the subregister usage.
- if (Register::isVirtualRegister(Dest->getReg()) && Src0->isReg()) {
+ if (Dest->getReg().isVirtual() && Src0->isReg()) {
MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
continue;
@@ -680,7 +684,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
const MachineOperand &Dst = MI.getOperand(0);
MachineOperand &Src = MI.getOperand(1);
- if (Src.isImm() && Register::isPhysicalRegister(Dst.getReg())) {
+ if (Src.isImm() && Dst.getReg().isPhysical()) {
int32_t ReverseImm;
if (isKImmOperand(TII, Src))
MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
@@ -729,7 +733,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
if (TII->isVOPC(Op32)) {
Register DstReg = MI.getOperand(0).getReg();
- if (Register::isVirtualRegister(DstReg)) {
+ if (DstReg.isVirtual()) {
// VOPC instructions can only write to the VCC register. We can't
// force them to use VCC here, because this is only one register and
// cannot deal with sequences which would require multiple copies of
@@ -753,7 +757,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
if (!Src2->isReg())
continue;
Register SReg = Src2->getReg();
- if (Register::isVirtualRegister(SReg)) {
+ if (SReg.isVirtual()) {
MRI.setRegAllocationHint(SReg, 0, VCCReg);
continue;
}
@@ -773,7 +777,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
bool Next = false;
if (SDst->getReg() != VCCReg) {
- if (Register::isVirtualRegister(SDst->getReg()))
+ if (SDst->getReg().isVirtual())
MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg);
Next = true;
}
@@ -781,7 +785,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// All of the instructions with carry outs also have an SGPR input in
// src2.
if (Src2 && Src2->getReg() != VCCReg) {
- if (Register::isVirtualRegister(Src2->getReg()))
+ if (Src2->getReg().isVirtual())
MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg);
Next = true;
}
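
The deleted block above documents the s_nop immediate encoding: the operand is the wait in cycles minus one, and two adjacent nops can only be merged while the combined wait stays within eight cycles. A standalone sketch of that arithmetic, assuming nothing beyond what the removed comments state (mergeSNopImms is an illustrative name, not part of the pass):

#include <cstdint>
#include <iostream>
#include <optional>

// Returns the merged immediate, or nullopt if the combined wait overflows.
std::optional<uint8_t> mergeSNopImms(uint8_t Imm0, uint8_t Imm1) {
  unsigned Cycles0 = Imm0 + 1;           // imm encodes cycles - 1
  unsigned Cycles1 = Imm1 + 1;
  if (Cycles0 + Cycles1 > 8)
    return std::nullopt;                 // would not fit a single s_nop
  return uint8_t(Cycles0 + Cycles1 - 1); // convert back to the imm encoding
}

int main() {
  if (auto Imm = mergeSNopImms(2, 3))    // 3 cycles + 4 cycles
    std::cout << "merged imm = " << unsigned(*Imm) << '\n'; // prints 6
}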
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index b1c73df269fb..0640e24b37ec 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -56,35 +56,17 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SlotIndexes.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-#include <vector>
using namespace llvm;
@@ -154,6 +136,11 @@ private:
MachineRegisterInfo *MRI;
LiveIntervals *LIS;
+ unsigned AndOpc;
+ unsigned XorTermrOpc;
+ unsigned OrSaveExecOpc;
+ unsigned Exec;
+
DenseMap<const MachineInstr *, InstrInfo> Instructions;
MapVector<MachineBasicBlock *, BlockInfo> Blocks;
SmallVector<MachineInstr *, 1> LiveMaskQueries;
@@ -164,6 +151,8 @@ private:
void markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist);
+ void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
+ unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
void markInstructionUses(const MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist);
char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
@@ -252,6 +241,8 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
assert(!(Flag & StateExact) && Flag != 0);
+ LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
+
// Remove any disabled states from the flag. The user that required it gets
// an undefined value in the helper lanes. For example, this can happen if
// the result of an atomic is used by instruction that requires WQM, where
@@ -267,9 +258,70 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
Worklist.push_back(&MI);
}
+/// Mark all relevant definitions of register \p Reg in usage \p UseMI.
+void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
+ Register Reg, unsigned SubReg, char Flag,
+ std::vector<WorkItem> &Worklist) {
+ assert(!MRI->isSSA());
+
+ LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
+
+ LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
+ if (!UseLRQ.valueIn())
+ return;
+
+ SmallPtrSet<const VNInfo *, 4> Visited;
+ SmallVector<const VNInfo *, 4> ToProcess;
+ ToProcess.push_back(UseLRQ.valueIn());
+ do {
+ const VNInfo *Value = ToProcess.pop_back_val();
+ Visited.insert(Value);
+
+ if (Value->isPHIDef()) {
+ // Need to mark all defs used in the PHI node
+ const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
+ assert(MBB && "Phi-def has no defining MBB");
+ for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
+ PE = MBB->pred_end();
+ PI != PE; ++PI) {
+ if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
+ if (!Visited.count(VN))
+ ToProcess.push_back(VN);
+ }
+ }
+ } else {
+ MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
+ assert(MI && "Def has no defining instruction");
+ markInstruction(*MI, Flag, Worklist);
+
+ // Iterate over all operands to find relevant definitions
+ for (const MachineOperand &Op : MI->operands()) {
+ if (!(Op.isReg() && Op.getReg() == Reg))
+ continue;
+
+ // Does this def cover whole register?
+ bool DefinesFullReg =
+ Op.isUndef() || !Op.getSubReg() || Op.getSubReg() == SubReg;
+ if (!DefinesFullReg) {
+ // Partial definition; need to follow and mark input value
+ LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
+ if (const VNInfo *VN = LRQ.valueIn()) {
+ if (!Visited.count(VN))
+ ToProcess.push_back(VN);
+ }
+ }
+ }
+ }
+ } while (!ToProcess.empty());
+}
+
/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist) {
+
+ LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
+ << MI);
+
for (const MachineOperand &Use : MI.uses()) {
if (!Use.isReg() || !Use.isUse())
continue;
@@ -279,30 +331,39 @@ void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
// Handle physical registers that we need to track; this is mostly relevant
// for VCC, which can appear as the (implicit) input of a uniform branch,
// e.g. when a loop counter is stored in a VGPR.
- if (!Register::isVirtualRegister(Reg)) {
+ if (!Reg.isVirtual()) {
if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
continue;
- for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
+ for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
+ ++RegUnit) {
LiveRange &LR = LIS->getRegUnit(*RegUnit);
const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
if (!Value)
continue;
- // Since we're in machine SSA, we do not need to track physical
- // registers across basic blocks.
- if (Value->isPHIDef())
- continue;
-
- markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
- Worklist);
+ if (MRI->isSSA()) {
+ // Since we're in machine SSA, we do not need to track physical
+ // registers across basic blocks.
+ if (Value->isPHIDef())
+ continue;
+ markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
+ Worklist);
+ } else {
+ markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
+ }
}
continue;
}
- for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
- markInstruction(DefMI, Flag, Worklist);
+ if (MRI->isSSA()) {
+ for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
+ markInstruction(DefMI, Flag, Worklist);
+ } else {
+ LiveRange &LR = LIS->getInterval(Reg);
+ markDefs(MI, LR, Reg, Use.getSubReg(), Flag, Worklist);
+ }
}
}
@@ -363,7 +424,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
LowerToCopyInstrs.push_back(&MI);
} else {
Register Reg = Inactive.getReg();
- if (Register::isVirtualRegister(Reg)) {
+ if (Reg.isVirtual()) {
for (MachineInstr &DefMI : MRI->def_instructions(Reg))
markInstruction(DefMI, StateWWM, Worklist);
}
@@ -393,7 +454,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
Register Reg = MO.getReg();
- if (!Register::isVirtualRegister(Reg) &&
+ if (!Reg.isVirtual() &&
TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
Flags = StateWQM;
break;
@@ -552,7 +613,8 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
if (!SaveSCC)
return PreferLast ? Last : First;
- LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
+ LiveRange &LR =
+ LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
auto MBBE = MBB.end();
SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
: LIS->getMBBEndIdx(&MBB);
@@ -572,7 +634,12 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
break;
Idx = Next;
} else {
- SlotIndex Next = S->end.getNextIndex().getBaseIndex();
+ MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
+ assert(EndMI && "Segment does not end on valid instruction");
+ auto NextI = std::next(EndMI->getIterator());
+ if (NextI == MBB.end())
+ break;
+ SlotIndex Next = LIS->getInstructionIndex(*NextI);
if (Next > LastIdx)
break;
Idx = Next;
@@ -588,6 +655,23 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
MBBI = MBB.end();
}
+ // Move insertion point past any operations modifying EXEC.
+ // This assumes that the value of SCC defined by any of these operations
+ // does not need to be preserved.
+ while (MBBI != Last) {
+ bool IsExecDef = false;
+ for (const MachineOperand &MO : MBBI->operands()) {
+ if (MO.isReg() && MO.isDef()) {
+ IsExecDef |=
+ MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
+ }
+ }
+ if (!IsExecDef)
+ break;
+ MBBI++;
+ S = nullptr;
+ }
+
if (S)
MBBI = saveSCC(MBB, MBBI);
@@ -682,8 +766,11 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
auto II = MBB.getFirstNonPHI(), IE = MBB.end();
- if (isEntry)
- ++II; // Skip the instruction that saves LiveMask
+ if (isEntry) {
+ // Skip the instruction that saves LiveMask
+ if (II != IE && II->getOpcode() == AMDGPU::COPY)
+ ++II;
+ }
// This stores the first instruction where it's safe to switch from WQM to
// Exact or vice versa.
@@ -694,6 +781,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
// FirstWQM since if it's safe to switch to/from WWM, it must be safe to
// switch to/from WQM as well.
MachineBasicBlock::iterator FirstWWM = IE;
+
for (;;) {
MachineBasicBlock::iterator Next = II;
char Needs = StateExact | StateWQM; // WWM is disabled by default
@@ -730,9 +818,6 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (MI.isTerminator() && OutNeeds == StateExact)
Needs = StateExact;
- if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
- MI.getOperand(3).setImm(1);
-
++Next;
} else {
// End of basic block
@@ -809,6 +894,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (II == IE)
break;
+
II = Next;
}
assert(!SavedWQMReg);
@@ -819,6 +905,7 @@ void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
for (MachineInstr *MI : LiveMaskQueries) {
const DebugLoc &DL = MI->getDebugLoc();
Register Dest = MI->getOperand(0).getReg();
+
MachineInstr *Copy =
BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
.addReg(LiveMaskReg);
@@ -833,19 +920,35 @@ void SIWholeQuadMode::lowerCopyInstrs() {
assert(MI->getNumExplicitOperands() == 2);
const Register Reg = MI->getOperand(0).getReg();
+ const unsigned SubReg = MI->getOperand(0).getSubReg();
if (TRI->isVGPR(*MRI, Reg)) {
- const TargetRegisterClass *regClass = Register::isVirtualRegister(Reg)
- ? MRI->getRegClass(Reg)
- : TRI->getPhysRegClass(Reg);
+ const TargetRegisterClass *regClass =
+ Reg.isVirtual() ? MRI->getRegClass(Reg) : TRI->getPhysRegClass(Reg);
+ if (SubReg)
+ regClass = TRI->getSubRegClass(regClass, SubReg);
const unsigned MovOp = TII->getMovOpcode(regClass);
MI->setDesc(TII->get(MovOp));
// And make it implicitly depend on exec (like all VALU movs should do).
MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
- } else {
+ } else if (!MRI->isSSA()) {
+ // Remove early-clobber and exec dependency from simple SGPR copies.
+ // This allows some to be eliminated during/post RA.
+ LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
+ if (MI->getOperand(0).isEarlyClobber()) {
+ LIS->removeInterval(Reg);
+ MI->getOperand(0).setIsEarlyClobber(false);
+ LIS->createAndComputeVirtRegInterval(Reg);
+ }
+ int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
+ while (Index >= 0) {
+ MI->RemoveOperand(Index);
+ Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
+ }
MI->setDesc(TII->get(AMDGPU::COPY));
+ LLVM_DEBUG(dbgs() << " -> " << *MI);
}
}
for (MachineInstr *MI : LowerToCopyInstrs) {
@@ -881,9 +984,20 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();
+ if (ST->isWave32()) {
+ AndOpc = AMDGPU::S_AND_B32;
+ XorTermrOpc = AMDGPU::S_XOR_B32_term;
+ OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
+ Exec = AMDGPU::EXEC_LO;
+ } else {
+ AndOpc = AMDGPU::S_AND_B64;
+ XorTermrOpc = AMDGPU::S_XOR_B64_term;
+ OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
+ Exec = AMDGPU::EXEC;
+ }
+
char GlobalFlags = analyzeFunction(MF);
unsigned LiveMaskReg = 0;
- unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
if (!(GlobalFlags & StateWQM)) {
lowerLiveMaskQueries(Exec);
if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty() && LowerToMovInstrs.empty())
@@ -932,7 +1046,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
// Physical registers like SCC aren't tracked by default anyway, so just
// removing the ranges we computed is the simplest option for maintaining
// the analysis results.
- LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
+ LIS->removeRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
return true;
}
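
markDefs(), added above, walks live-range value numbers with a visited set: a PHI-defined value fans out to the values live out of every predecessor block, and a partial (subregister) definition keeps following its incoming value. The traversal skeleton is sketched below over a hypothetical graph; ValueNode and walkReachingValues are placeholders, not LLVM's VNInfo/LiveRange types.

#include <functional>
#include <iostream>
#include <unordered_set>
#include <vector>

// Placeholder for a live-range value number: its Incoming edges stand for the
// predecessor values of a PHI-def and the incoming value of a partial def.
struct ValueNode {
  std::vector<const ValueNode *> Incoming;
};

// Visit Start and everything reachable through Incoming, each node once.
void walkReachingValues(const ValueNode *Start,
                        const std::function<void(const ValueNode &)> &Visit) {
  std::unordered_set<const ValueNode *> Visited;
  std::vector<const ValueNode *> Worklist{Start};
  while (!Worklist.empty()) {
    const ValueNode *V = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(V).second)
      continue;                          // already processed
    Visit(*V);
    for (const ValueNode *In : V->Incoming)
      if (!Visited.count(In))
        Worklist.push_back(In);
  }
}

int main() {
  ValueNode A, B, C, Phi;                // Phi merges B and C, both fed by A
  B.Incoming = {&A};
  C.Incoming = {&A};
  Phi.Incoming = {&B, &C};
  int Count = 0;
  walkReachingValues(&Phi, [&](const ValueNode &) { ++Count; });
  std::cout << Count << '\n';            // prints 4: each value visited once
}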
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td
index 70bf215c03f3..5b8896c21832 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -332,7 +332,6 @@ let OtherPredicates = [HasScalarStores] in {
def S_DCACHE_WB : SM_Inval_Pseudo <"s_dcache_wb", int_amdgcn_s_dcache_wb>;
def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>;
} // End OtherPredicates = [HasScalarStores]
-def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>;
defm S_ATC_PROBE : SM_Pseudo_Probe <"s_atc_probe", SReg_64>;
let is_buffer = 1 in {
@@ -340,6 +339,9 @@ defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <"s_atc_probe_buffer", SReg_128>;
}
} // SubtargetPredicate = isGFX8Plus
+let SubtargetPredicate = HasSMemRealTime in
+def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>;
+
let SubtargetPredicate = isGFX10Plus in
def S_GL1_INV : SM_Inval_Pseudo<"s_gl1_inv">;
let SubtargetPredicate = HasGetWaveIdInst in
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 9d7b25d55217..7426af931a62 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -54,9 +54,9 @@ class SOP1_Pseudo <string opName, dag outs, dag ins,
bits<1> has_sdst = 1;
}
-class SOP1_Real<bits<8> op, SOP1_Pseudo ps> :
+class SOP1_Real<bits<8> op, SOP1_Pseudo ps, string real_name = ps.Mnemonic> :
InstSI <ps.OutOperandList, ps.InOperandList,
- ps.Mnemonic # " " # ps.AsmOperands, []>,
+ real_name # " " # ps.AsmOperands, []>,
Enc32 {
let isPseudo = 0;
@@ -288,13 +288,11 @@ def S_MOVRELD_B64 : SOP1_64_movreld <"s_movreld_b64">;
let SubtargetPredicate = isGFX6GFX7GFX8GFX9 in {
def S_CBRANCH_JOIN : SOP1_0_32R <"s_cbranch_join">;
-def S_MOV_REGRD_B32 : SOP1_32 <"s_mov_regrd_b32">;
} // End SubtargetPredicate = isGFX6GFX7GFX8GFX9
let Defs = [SCC] in {
def S_ABS_I32 : SOP1_32 <"s_abs_i32">;
} // End Defs = [SCC]
-def S_MOV_FED_B32 : SOP1_32 <"s_mov_fed_b32">;
let SubtargetPredicate = HasVGPRIndexMode in {
def S_SET_GPR_IDX_IDX : SOP1_0_32<"s_set_gpr_idx_idx"> {
@@ -361,9 +359,9 @@ class SOP2_Pseudo<string opName, dag outs, dag ins,
// let Size = 4; // Do we need size here?
}
-class SOP2_Real<bits<7> op, SOP_Pseudo ps> :
+class SOP2_Real<bits<7> op, SOP_Pseudo ps, string real_name = ps.Mnemonic> :
InstSI <ps.OutOperandList, ps.InOperandList,
- ps.Mnemonic # " " # ps.AsmOperands, []>,
+ real_name # " " # ps.AsmOperands, []>,
Enc32 {
let isPseudo = 0;
let isCodeGenOnly = 0;
@@ -410,8 +408,14 @@ class SOP2_64_32_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <
class UniformUnaryFrag<SDPatternOperator Op> : PatFrag <
(ops node:$src0),
(Op $src0),
- [{ return !N->isDivergent(); }]
->;
+ [{ return !N->isDivergent(); }]> {
+ // This check is unnecessary as it's captured by the result register
+ // bank constraint.
+ //
+ // FIXME: Should add a way for the emitter to recognize this is a
+ // trivially true predicate to eliminate the check.
+ let GISelPredicateCode = [{return true;}];
+}
class UniformBinFrag<SDPatternOperator Op> : PatFrag <
(ops node:$src0, node:$src1),
@@ -425,6 +429,18 @@ class UniformBinFrag<SDPatternOperator Op> : PatFrag <
let GISelPredicateCode = [{return true;}];
}
+class DivergentBinFrag<SDPatternOperator Op> : PatFrag <
+ (ops node:$src0, node:$src1),
+ (Op $src0, $src1),
+ [{ return N->isDivergent(); }]> {
+ // This check is unnecessary as it's captured by the result register
+ // bank constraint.
+ //
+ // FIXME: Should add a way for the emitter to recognize this is a
+ // trivially true predicate to eliminate the check.
+ let GISelPredicateCode = [{return true;}];
+}
+
let Defs = [SCC] in { // Carry out goes to SCC
let isCommutable = 1 in {
def S_ADD_U32 : SOP2_32 <"s_add_u32">;
@@ -465,10 +481,15 @@ def S_MAX_U32 : SOP2_32 <"s_max_u32",
} // End isCommutable = 1
} // End Defs = [SCC]
+// This pattern is restricted to certain subtargets (practically GFX8Plus)
+// because isel sometimes produces an sreg_64 copy to SCC as a by-product
+// of this pattern, and only for subtargets with hasScalarCompareEq64
+// is it possible to map such copy to a single instruction (S_CMP_LG_U64).
class SelectPat<SDPatternOperator select> : PatFrag <
(ops node:$src1, node:$src2),
(select SCC, $src1, $src2),
- [{ return N->getOperand(0)->hasOneUse() && !N->isDivergent(); }]
+ [{ return Subtarget->hasScalarCompareEq64() &&
+ N->getOperand(0)->hasOneUse() && !N->isDivergent(); }]
>;
let Uses = [SCC] in {
@@ -532,6 +553,7 @@ def S_NOR_B64 : SOP2_64 <"s_nor_b64",
>;
} // End isCommutable = 1
+// There are also separate patterns for types other than i32
def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32",
[(set i32:$sdst, (UniformBinFrag<and> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
>;
@@ -803,48 +825,65 @@ def S_CBRANCH_I_FORK : SOPK_Pseudo <
"$sdst, $simm16"
>;
-let hasSideEffects = 1 in {
-
let mayLoad = 1 in {
// s_getreg_b32 should use hasSideEffects = 1 for tablegen to allow
// its use in the readcyclecounter selection.
+// FIXME: Need to truncate immediate to 16-bits.
def S_GETREG_B32 : SOPK_Pseudo <
"s_getreg_b32",
(outs SReg_32:$sdst), (ins hwreg:$simm16),
- "$sdst, $simm16"
->;
+ "$sdst, $simm16",
+ [(set i32:$sdst, (int_amdgcn_s_getreg (i32 timm:$simm16)))]> {
+ let SOPKZext = 1;
+ let hasSideEffects = 1;
}
+} // End mayLoad = 1
-let mayLoad = 0, mayStore =0 in {
+let mayLoad = 0, mayStore = 0, Defs = [MODE], Uses = [MODE] in {
-def S_SETREG_B32 : SOPK_Pseudo <
+// FIXME: Need to truncate immediate to 16-bits.
+class S_SETREG_B32_Pseudo <list<dag> pattern=[]> : SOPK_Pseudo <
"s_setreg_b32",
(outs), (ins SReg_32:$sdst, hwreg:$simm16),
"$simm16, $sdst",
- [(int_amdgcn_s_setreg (i32 timm:$simm16), i32:$sdst)]> {
+ pattern>;
+def S_SETREG_B32 : S_SETREG_B32_Pseudo <
+ [(int_amdgcn_s_setreg (i32 timm:$simm16), i32:$sdst)]> {
// Use custom inserter to optimize some cases to
- // S_DENORM_MODE/S_ROUND_MODE.
+ // S_DENORM_MODE/S_ROUND_MODE/S_SETREG_B32_mode.
let usesCustomInserter = 1;
- let Defs = [MODE];
- let Uses = [MODE];
+ let hasSideEffects = 1;
+}
+
+// Variant of SETREG that is guaranteed to only touch FP bits in the MODE
+// register, so doesn't have unmodeled side effects.
+def S_SETREG_B32_mode : S_SETREG_B32_Pseudo {
+ let hasSideEffects = 0;
}
// FIXME: Not on SI?
//def S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32">;
-def S_SETREG_IMM32_B32 : SOPK_Pseudo <
+class S_SETREG_IMM32_B32_Pseudo : SOPK_Pseudo <
"s_setreg_imm32_b32",
(outs), (ins i32imm:$imm, hwreg:$simm16),
"$simm16, $imm"> {
let Size = 8; // Unlike every other SOPK instruction.
let has_sdst = 0;
- let Defs = [MODE];
- let Uses = [MODE];
}
+def S_SETREG_IMM32_B32 : S_SETREG_IMM32_B32_Pseudo {
+ let hasSideEffects = 1;
+}
+
+// Variant of SETREG_IMM32 that is guaranteed to only touch FP bits in the MODE
+// register, so doesn't have unmodeled side effects.
+def S_SETREG_IMM32_B32_mode : S_SETREG_IMM32_B32_Pseudo {
+ let hasSideEffects = 0;
}
-} // End hasSideEffects = 1
+
+} // End mayLoad = 0, mayStore = 0, Defs = [MODE], Uses = [MODE]
class SOPK_WAITCNT<string opName, list<dag> pat=[]> :
SOPK_Pseudo<
@@ -891,88 +930,101 @@ let SubtargetPredicate = isGFX10Plus in {
// SOPC Instructions
//===----------------------------------------------------------------------===//
-class SOPCe <bits<7> op> : Enc32 {
- bits<8> src0;
- bits<8> src1;
-
- let Inst{7-0} = src0;
- let Inst{15-8} = src1;
- let Inst{22-16} = op;
- let Inst{31-23} = 0x17e;
-}
-
-class SOPC <bits<7> op, dag outs, dag ins, string asm,
- list<dag> pattern = []> :
- InstSI<outs, ins, asm, pattern>, SOPCe <op> {
+class SOPC_Pseudo<string opName, dag outs, dag ins,
+ string asmOps, list<dag> pattern=[]> :
+ SOP_Pseudo<opName, outs, ins, asmOps, pattern> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
let SALU = 1;
let SOPC = 1;
- let isCodeGenOnly = 0;
let Defs = [SCC];
let SchedRW = [WriteSALU];
let UseNamedOperandTable = 1;
}
-class SOPC_Base <bits<7> op, RegisterOperand rc0, RegisterOperand rc1,
- string opName, list<dag> pattern = []> : SOPC <
- op, (outs), (ins rc0:$src0, rc1:$src1),
- opName#" $src0, $src1", pattern > {
- let Defs = [SCC];
+class SOPC_Real<bits<7> op, SOPC_Pseudo ps, string real_name = ps.Mnemonic> :
+ InstSI <ps.OutOperandList, ps.InOperandList,
+ real_name # " " # ps.AsmOperands, []>,
+ Enc32 {
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let OtherPredicates = ps.OtherPredicates;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let TSFlags = ps.TSFlags;
+
+ // encoding
+ bits<8> src0;
+ bits<8> src1;
+
+ let Inst{7-0} = src0;
+ let Inst{15-8} = src1;
+ let Inst{22-16} = op;
+ let Inst{31-23} = 0x17e;
}
-class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt,
+
+class SOPC_Base <RegisterOperand rc0, RegisterOperand rc1,
+ string opName, list<dag> pattern = []> : SOPC_Pseudo <
+ opName, (outs), (ins rc0:$src0, rc1:$src1),
+ "$src0, $src1", pattern > {
+}
+
+class SOPC_Helper <RegisterOperand rc, ValueType vt,
string opName, SDPatternOperator cond> : SOPC_Base <
- op, rc, rc, opName,
+ rc, rc, opName,
[(set SCC, (si_setcc_uniform vt:$src0, vt:$src1, cond))] > {
}
-class SOPC_CMP_32<bits<7> op, string opName,
+class SOPC_CMP_32<string opName,
SDPatternOperator cond = COND_NULL, string revOp = opName>
- : SOPC_Helper<op, SSrc_b32, i32, opName, cond>,
+ : SOPC_Helper<SSrc_b32, i32, opName, cond>,
Commutable_REV<revOp, !eq(revOp, opName)>,
SOPKInstTable<0, opName> {
let isCompare = 1;
let isCommutable = 1;
}
-class SOPC_CMP_64<bits<7> op, string opName,
+class SOPC_CMP_64<string opName,
SDPatternOperator cond = COND_NULL, string revOp = opName>
- : SOPC_Helper<op, SSrc_b64, i64, opName, cond>,
+ : SOPC_Helper<SSrc_b64, i64, opName, cond>,
Commutable_REV<revOp, !eq(revOp, opName)> {
let isCompare = 1;
let isCommutable = 1;
}
-class SOPC_32<bits<7> op, string opName, list<dag> pattern = []>
- : SOPC_Base<op, SSrc_b32, SSrc_b32, opName, pattern>;
-
-class SOPC_64_32<bits<7> op, string opName, list<dag> pattern = []>
- : SOPC_Base<op, SSrc_b64, SSrc_b32, opName, pattern>;
-
-def S_CMP_EQ_I32 : SOPC_CMP_32 <0x00, "s_cmp_eq_i32">;
-def S_CMP_LG_I32 : SOPC_CMP_32 <0x01, "s_cmp_lg_i32">;
-def S_CMP_GT_I32 : SOPC_CMP_32 <0x02, "s_cmp_gt_i32", COND_SGT>;
-def S_CMP_GE_I32 : SOPC_CMP_32 <0x03, "s_cmp_ge_i32", COND_SGE>;
-def S_CMP_LT_I32 : SOPC_CMP_32 <0x04, "s_cmp_lt_i32", COND_SLT, "s_cmp_gt_i32">;
-def S_CMP_LE_I32 : SOPC_CMP_32 <0x05, "s_cmp_le_i32", COND_SLE, "s_cmp_ge_i32">;
-def S_CMP_EQ_U32 : SOPC_CMP_32 <0x06, "s_cmp_eq_u32", COND_EQ>;
-def S_CMP_LG_U32 : SOPC_CMP_32 <0x07, "s_cmp_lg_u32", COND_NE>;
-def S_CMP_GT_U32 : SOPC_CMP_32 <0x08, "s_cmp_gt_u32", COND_UGT>;
-def S_CMP_GE_U32 : SOPC_CMP_32 <0x09, "s_cmp_ge_u32", COND_UGE>;
-def S_CMP_LT_U32 : SOPC_CMP_32 <0x0a, "s_cmp_lt_u32", COND_ULT, "s_cmp_gt_u32">;
-def S_CMP_LE_U32 : SOPC_CMP_32 <0x0b, "s_cmp_le_u32", COND_ULE, "s_cmp_ge_u32">;
-
-def S_BITCMP0_B32 : SOPC_32 <0x0c, "s_bitcmp0_b32">;
-def S_BITCMP1_B32 : SOPC_32 <0x0d, "s_bitcmp1_b32">;
-def S_BITCMP0_B64 : SOPC_64_32 <0x0e, "s_bitcmp0_b64">;
-def S_BITCMP1_B64 : SOPC_64_32 <0x0f, "s_bitcmp1_b64">;
+class SOPC_32<string opName, list<dag> pattern = []>
+ : SOPC_Base<SSrc_b32, SSrc_b32, opName, pattern>;
+
+class SOPC_64_32<string opName, list<dag> pattern = []>
+ : SOPC_Base<SSrc_b64, SSrc_b32, opName, pattern>;
+
+def S_CMP_EQ_I32 : SOPC_CMP_32 <"s_cmp_eq_i32">;
+def S_CMP_LG_I32 : SOPC_CMP_32 <"s_cmp_lg_i32">;
+def S_CMP_GT_I32 : SOPC_CMP_32 <"s_cmp_gt_i32", COND_SGT>;
+def S_CMP_GE_I32 : SOPC_CMP_32 <"s_cmp_ge_i32", COND_SGE>;
+def S_CMP_LT_I32 : SOPC_CMP_32 <"s_cmp_lt_i32", COND_SLT, "s_cmp_gt_i32">;
+def S_CMP_LE_I32 : SOPC_CMP_32 <"s_cmp_le_i32", COND_SLE, "s_cmp_ge_i32">;
+def S_CMP_EQ_U32 : SOPC_CMP_32 <"s_cmp_eq_u32", COND_EQ>;
+def S_CMP_LG_U32 : SOPC_CMP_32 <"s_cmp_lg_u32", COND_NE>;
+def S_CMP_GT_U32 : SOPC_CMP_32 <"s_cmp_gt_u32", COND_UGT>;
+def S_CMP_GE_U32 : SOPC_CMP_32 <"s_cmp_ge_u32", COND_UGE>;
+def S_CMP_LT_U32 : SOPC_CMP_32 <"s_cmp_lt_u32", COND_ULT, "s_cmp_gt_u32">;
+def S_CMP_LE_U32 : SOPC_CMP_32 <"s_cmp_le_u32", COND_ULE, "s_cmp_ge_u32">;
+
+def S_BITCMP0_B32 : SOPC_32 <"s_bitcmp0_b32">;
+def S_BITCMP1_B32 : SOPC_32 <"s_bitcmp1_b32">;
+def S_BITCMP0_B64 : SOPC_64_32 <"s_bitcmp0_b64">;
+def S_BITCMP1_B64 : SOPC_64_32 <"s_bitcmp1_b64">;
let SubtargetPredicate = isGFX6GFX7GFX8GFX9 in
-def S_SETVSKIP : SOPC_32 <0x10, "s_setvskip">;
+def S_SETVSKIP : SOPC_32 <"s_setvskip">;
let SubtargetPredicate = isGFX8Plus in {
-def S_CMP_EQ_U64 : SOPC_CMP_64 <0x12, "s_cmp_eq_u64", COND_EQ>;
-def S_CMP_LG_U64 : SOPC_CMP_64 <0x13, "s_cmp_lg_u64", COND_NE>;
+def S_CMP_EQ_U64 : SOPC_CMP_64 <"s_cmp_eq_u64", COND_EQ>;
+def S_CMP_LG_U64 : SOPC_CMP_64 <"s_cmp_lg_u64", COND_NE>;
} // End SubtargetPredicate = isGFX8Plus
let SubtargetPredicate = HasVGPRIndexMode in {
@@ -980,10 +1032,11 @@ let SubtargetPredicate = HasVGPRIndexMode in {
// register. We don't want to add mode register uses to every
// instruction, and it's too complicated to deal with anyway. This is
// modeled just as a side effect.
-def S_SET_GPR_IDX_ON : SOPC <0x11,
+def S_SET_GPR_IDX_ON : SOPC_Pseudo <
+ "s_set_gpr_idx_on" ,
(outs),
(ins SSrc_b32:$src0, GPRIdxMode:$src1),
- "s_set_gpr_idx_on $src0,$src1"> {
+ "$src0, $src1"> {
let Defs = [M0, MODE]; // No scc def
let Uses = [M0, MODE]; // Other bits of mode, m0 unmodified.
let hasSideEffects = 1; // Sets mode.gpr_idx_en
@@ -995,225 +1048,239 @@ def S_SET_GPR_IDX_ON : SOPC <0x11,
// SOPP Instructions
//===----------------------------------------------------------------------===//
-class Base_SOPP <string asm> {
- string AsmString = asm;
-}
-
-class SOPPe <bits<7> op> : Enc32 {
- bits <16> simm16;
-
- let Inst{15-0} = simm16;
- let Inst{22-16} = op;
- let Inst{31-23} = 0x17f; // encoding
-}
-
-class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> :
- InstSI <(outs), ins, asm, pattern >, SOPPe <op>, Base_SOPP <asm> {
-
+class SOPP_Pseudo<string opName, dag ins,
+ string asmOps = "", list<dag> pattern=[], string keyName = opName> :
+ SOP_Pseudo<opName, (outs), ins, asmOps, pattern> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
let SALU = 1;
let SOPP = 1;
- let Size = 4;
+ let FixedSize = 1;
let SchedRW = [WriteSALU];
-
let UseNamedOperandTable = 1;
+ bits <16> simm16;
+ bits <1> fixed_imm = 0;
+ string KeyName = keyName;
}
-def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">;
+class SOPPRelaxTable <bit isRelaxed, string keyName, string gfxip> {
+ bit IsRelaxed = isRelaxed;
+ string KeyName = keyName # gfxip;
+}
+
+//spaces inserted in realname on instantiation of this record to allow s_endpgm to omit whitespace
+class SOPP_Real<bits<7> op, SOPP_Pseudo ps, string real_name = ps.Mnemonic> :
+ InstSI <ps.OutOperandList, ps.InOperandList,
+ real_name # ps.AsmOperands, []> {
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
-class SOPP_w_nop_e <bits<7> op> : Enc64 {
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let OtherPredicates = ps.OtherPredicates;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let TSFlags = ps.TSFlags;
bits <16> simm16;
+}
- let Inst{15-0} = simm16;
+class SOPP_Real_32 <bits<7> op, SOPP_Pseudo ps, string real_name = ps.Mnemonic> : SOPP_Real<op, ps, real_name>,
+Enc32 {
+ let Inst{15-0} = !if(ps.fixed_imm, ps.simm16, simm16);
let Inst{22-16} = op;
- let Inst{31-23} = 0x17f; // encoding
- let Inst{47-32} = 0x0;
- let Inst{54-48} = S_NOP.Inst{22-16}; // opcode
- let Inst{63-55} = S_NOP.Inst{31-23}; // encoding
+ let Inst{31-23} = 0x17f;
}
-class SOPP_w_nop <bits<7> op, dag ins, string asm, list<dag> pattern = []> :
- InstSI <(outs), ins, asm, pattern >, SOPP_w_nop_e <op>, Base_SOPP <asm> {
-
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let SALU = 1;
- let SOPP = 1;
- let Size = 8;
- let SchedRW = [WriteSALU];
-
- let UseNamedOperandTable = 1;
+class SOPP_Real_64 <bits<7> op, SOPP_Pseudo ps, string real_name = ps.Mnemonic> : SOPP_Real<op, ps, real_name>,
+Enc64 {
+ // encoding
+ let Inst{15-0} = !if(ps.fixed_imm, ps.simm16, simm16);
+ let Inst{22-16} = op;
+ let Inst{31-23} = 0x17f;
+ //effectively a nop
+ let Inst{47-32} = 0x0;
+ let Inst{54-48} = 0x0;
+ let Inst{63-55} = 0x17f;
}
-multiclass SOPP_With_Relaxation <bits<7> op, dag ins, string asm, list<dag> pattern = []> {
- def "" : SOPP <op, ins, asm, pattern>;
- def _pad_s_nop : SOPP_w_nop <op, ins, asm, pattern>;
+multiclass SOPP_With_Relaxation <string opName, dag ins,
+ string asmOps, list<dag> pattern=[]> {
+ def "" : SOPP_Pseudo <opName, ins, asmOps, pattern>;
+ def _pad_s_nop : SOPP_Pseudo <opName # "_pad_s_nop", ins, asmOps, pattern, opName>;
}
-let isTerminator = 1 in {
+def S_NOP : SOPP_Pseudo<"s_nop" , (ins i16imm:$simm16), "$simm16">;
-def S_ENDPGM : SOPP <0x00000001, (ins EndpgmImm:$simm16), "s_endpgm$simm16"> {
+let isTerminator = 1 in {
+def S_ENDPGM : SOPP_Pseudo<"s_endpgm", (ins EndpgmImm:$simm16), "$simm16"> {
let isBarrier = 1;
let isReturn = 1;
+ let hasSideEffects = 1;
}
-def S_ENDPGM_SAVED : SOPP <0x0000001B, (ins), "s_endpgm_saved"> {
+def S_ENDPGM_SAVED : SOPP_Pseudo<"s_endpgm_saved", (ins)> {
let SubtargetPredicate = isGFX8Plus;
let simm16 = 0;
+ let fixed_imm = 1;
let isBarrier = 1;
let isReturn = 1;
}
let SubtargetPredicate = isGFX9Plus in {
- let isBarrier = 1, isReturn = 1, simm16 = 0 in {
+ let isBarrier = 1, isReturn = 1, simm16 = 0, fixed_imm = 1 in {
def S_ENDPGM_ORDERED_PS_DONE :
- SOPP<0x01e, (ins), "s_endpgm_ordered_ps_done">;
- } // End isBarrier = 1, isReturn = 1, simm16 = 0
+ SOPP_Pseudo<"s_endpgm_ordered_ps_done", (ins)>;
+ } // End isBarrier = 1, isReturn = 1, simm16 = 0, fixed_imm = 1
} // End SubtargetPredicate = isGFX9Plus
let SubtargetPredicate = isGFX10Plus in {
- let isBarrier = 1, isReturn = 1, simm16 = 0 in {
+ let isBarrier = 1, isReturn = 1, simm16 = 0, fixed_imm = 1 in {
def S_CODE_END :
- SOPP<0x01f, (ins), "s_code_end">;
- } // End isBarrier = 1, isReturn = 1, simm16 = 0
+ SOPP_Pseudo<"s_code_end", (ins)>;
+ } // End isBarrier = 1, isReturn = 1, simm16 = 0, fixed_imm = 1
} // End SubtargetPredicate = isGFX10Plus
let isBranch = 1, SchedRW = [WriteBranch] in {
let isBarrier = 1 in {
-defm S_BRANCH : SOPP_With_Relaxation <
- 0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16",
+defm S_BRANCH : SOPP_With_Relaxation<
+ "s_branch" , (ins sopp_brtarget:$simm16), "$simm16",
[(br bb:$simm16)]>;
}
let Uses = [SCC] in {
-defm S_CBRANCH_SCC0 : SOPP_With_Relaxation <
- 0x00000004, (ins sopp_brtarget:$simm16),
- "s_cbranch_scc0 $simm16"
+defm S_CBRANCH_SCC0 : SOPP_With_Relaxation<
+ "s_cbranch_scc0" , (ins sopp_brtarget:$simm16),
+ "$simm16"
>;
defm S_CBRANCH_SCC1 : SOPP_With_Relaxation <
- 0x00000005, (ins sopp_brtarget:$simm16),
- "s_cbranch_scc1 $simm16"
+ "s_cbranch_scc1" , (ins sopp_brtarget:$simm16),
+ "$simm16"
>;
} // End Uses = [SCC]
let Uses = [VCC] in {
defm S_CBRANCH_VCCZ : SOPP_With_Relaxation <
- 0x00000006, (ins sopp_brtarget:$simm16),
- "s_cbranch_vccz $simm16"
+ "s_cbranch_vccz" , (ins sopp_brtarget:$simm16),
+ "$simm16"
>;
defm S_CBRANCH_VCCNZ : SOPP_With_Relaxation <
- 0x00000007, (ins sopp_brtarget:$simm16),
- "s_cbranch_vccnz $simm16"
+ "s_cbranch_vccnz" , (ins sopp_brtarget:$simm16),
+ "$simm16"
>;
} // End Uses = [VCC]
let Uses = [EXEC] in {
defm S_CBRANCH_EXECZ : SOPP_With_Relaxation <
- 0x00000008, (ins sopp_brtarget:$simm16),
- "s_cbranch_execz $simm16"
+ "s_cbranch_execz" , (ins sopp_brtarget:$simm16),
+ "$simm16"
>;
defm S_CBRANCH_EXECNZ : SOPP_With_Relaxation <
- 0x00000009, (ins sopp_brtarget:$simm16),
- "s_cbranch_execnz $simm16"
+ "s_cbranch_execnz" , (ins sopp_brtarget:$simm16),
+ "$simm16"
>;
} // End Uses = [EXEC]
defm S_CBRANCH_CDBGSYS : SOPP_With_Relaxation <
- 0x00000017, (ins sopp_brtarget:$simm16),
- "s_cbranch_cdbgsys $simm16"
+ "s_cbranch_cdbgsys" , (ins sopp_brtarget:$simm16),
+ "$simm16"
>;
defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_With_Relaxation <
- 0x0000001A, (ins sopp_brtarget:$simm16),
- "s_cbranch_cdbgsys_and_user $simm16"
+ "s_cbranch_cdbgsys_and_user" , (ins sopp_brtarget:$simm16),
+ "$simm16"
>;
defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_With_Relaxation <
- 0x00000019, (ins sopp_brtarget:$simm16),
- "s_cbranch_cdbgsys_or_user $simm16"
+ "s_cbranch_cdbgsys_or_user" , (ins sopp_brtarget:$simm16),
+ "$simm16"
>;
defm S_CBRANCH_CDBGUSER : SOPP_With_Relaxation <
- 0x00000018, (ins sopp_brtarget:$simm16),
- "s_cbranch_cdbguser $simm16"
+ "s_cbranch_cdbguser" , (ins sopp_brtarget:$simm16),
+ "$simm16"
>;
} // End isBranch = 1
} // End isTerminator = 1
let hasSideEffects = 1 in {
-def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
+def S_BARRIER : SOPP_Pseudo <"s_barrier", (ins), "",
[(int_amdgcn_s_barrier)]> {
let SchedRW = [WriteBarrier];
let simm16 = 0;
+ let fixed_imm = 1;
let isConvergent = 1;
}
-def S_WAKEUP : SOPP <0x00000003, (ins), "s_wakeup"> {
+def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > {
let SubtargetPredicate = isGFX8Plus;
let simm16 = 0;
+ let fixed_imm = 1;
let mayLoad = 1;
let mayStore = 1;
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
-def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16",
+def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins WAIT_FLAG:$simm16), "$simm16",
[(int_amdgcn_s_waitcnt timm:$simm16)]>;
-def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
-def S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">;
+def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i16imm:$simm16), "$simm16">;
+def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;
// On SI the documentation says sleep for approximately 64 * low 2
// bits, consistent with the reported maximum of 448. On VI the
// maximum reported is 960 cycles, so 960 / 64 = 15; is the
// maximum really 15 on VI?
-def S_SLEEP : SOPP <0x0000000e, (ins i32imm:$simm16),
- "s_sleep $simm16", [(int_amdgcn_s_sleep timm:$simm16)]> {
+def S_SLEEP : SOPP_Pseudo <"s_sleep", (ins i32imm:$simm16),
+ "$simm16", [(int_amdgcn_s_sleep timm:$simm16)]> {
let hasSideEffects = 1;
let mayLoad = 0;
let mayStore = 0;
}
-def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">;
+def S_SETPRIO : SOPP_Pseudo <"s_setprio" , (ins i16imm:$simm16), "$simm16">;
let Uses = [EXEC, M0] in {
// FIXME: Should this be mayLoad+mayStore?
-def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16",
+def S_SENDMSG : SOPP_Pseudo <"s_sendmsg" , (ins SendMsgImm:$simm16), "$simm16",
[(int_amdgcn_s_sendmsg (i32 timm:$simm16), M0)]>;
-def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $simm16",
+def S_SENDMSGHALT : SOPP_Pseudo <"s_sendmsghalt" , (ins SendMsgImm:$simm16), "$simm16",
[(int_amdgcn_s_sendmsghalt (i32 timm:$simm16), M0)]>;
} // End Uses = [EXEC, M0]
-def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16"> {
+def S_TRAP : SOPP_Pseudo <"s_trap" , (ins i16imm:$simm16), "$simm16"> {
let isTrap = 1;
}
-def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> {
+def S_ICACHE_INV : SOPP_Pseudo <"s_icache_inv", (ins)> {
let simm16 = 0;
+ let fixed_imm = 1;
}
-def S_INCPERFLEVEL : SOPP <0x00000014, (ins i32imm:$simm16), "s_incperflevel $simm16",
+def S_INCPERFLEVEL : SOPP_Pseudo <"s_incperflevel", (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_incperflevel timm:$simm16)]> {
let hasSideEffects = 1;
let mayLoad = 0;
let mayStore = 0;
}
-def S_DECPERFLEVEL : SOPP <0x00000015, (ins i32imm:$simm16), "s_decperflevel $simm16",
+def S_DECPERFLEVEL : SOPP_Pseudo <"s_decperflevel", (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_decperflevel timm:$simm16)]> {
let hasSideEffects = 1;
let mayLoad = 0;
let mayStore = 0;
}
-def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> {
+def S_TTRACEDATA : SOPP_Pseudo <"s_ttracedata", (ins)> {
let simm16 = 0;
+ let fixed_imm = 1;
}
let SubtargetPredicate = HasVGPRIndexMode in {
-def S_SET_GPR_IDX_OFF : SOPP<0x1c, (ins), "s_set_gpr_idx_off"> {
+def S_SET_GPR_IDX_OFF : SOPP_Pseudo<"s_set_gpr_idx_off", (ins) > {
let simm16 = 0;
+ let fixed_imm = 1;
let Defs = [MODE];
let Uses = [MODE];
}
@@ -1221,8 +1288,8 @@ def S_SET_GPR_IDX_OFF : SOPP<0x1c, (ins), "s_set_gpr_idx_off"> {
} // End hasSideEffects
let SubtargetPredicate = HasVGPRIndexMode in {
-def S_SET_GPR_IDX_MODE : SOPP<0x1d, (ins GPRIdxMode:$simm16),
- "s_set_gpr_idx_mode$simm16"> {
+def S_SET_GPR_IDX_MODE : SOPP_Pseudo<"s_set_gpr_idx_mode", (ins GPRIdxMode:$simm16),
+ "$simm16"> {
let Defs = [M0, MODE];
let Uses = [MODE];
}
@@ -1230,37 +1297,30 @@ def S_SET_GPR_IDX_MODE : SOPP<0x1d, (ins GPRIdxMode:$simm16),
let SubtargetPredicate = isGFX10Plus in {
def S_INST_PREFETCH :
- SOPP<0x020, (ins s16imm:$simm16), "s_inst_prefetch $simm16">;
+ SOPP_Pseudo<"s_inst_prefetch", (ins s16imm:$simm16), "$simm16">;
def S_CLAUSE :
- SOPP<0x021, (ins s16imm:$simm16), "s_clause $simm16">;
- def S_WAITCNT_IDLE :
- SOPP <0x022, (ins), "s_wait_idle"> {
+ SOPP_Pseudo<"s_clause", (ins s16imm:$simm16), "$simm16">;
+ def S_WAIT_IDLE :
+ SOPP_Pseudo <"s_wait_idle", (ins), ""> {
let simm16 = 0;
+ let fixed_imm = 1;
}
def S_WAITCNT_DEPCTR :
- SOPP <0x023, (ins s16imm:$simm16), "s_waitcnt_depctr $simm16">;
+ SOPP_Pseudo <"s_waitcnt_depctr" , (ins s16imm:$simm16), "$simm16">;
let hasSideEffects = 0, Uses = [MODE], Defs = [MODE] in {
def S_ROUND_MODE :
- SOPP<0x024, (ins s16imm:$simm16), "s_round_mode $simm16">;
+ SOPP_Pseudo<"s_round_mode", (ins s16imm:$simm16), "$simm16">;
def S_DENORM_MODE :
- SOPP<0x025, (ins i32imm:$simm16), "s_denorm_mode $simm16",
+ SOPP_Pseudo<"s_denorm_mode", (ins i32imm:$simm16), "$simm16",
[(SIdenorm_mode (i32 timm:$simm16))]>;
}
def S_TTRACEDATA_IMM :
- SOPP<0x028, (ins s16imm:$simm16), "s_ttracedata_imm $simm16">;
+ SOPP_Pseudo<"s_ttracedata_imm", (ins s16imm:$simm16), "$simm16">;
} // End SubtargetPredicate = isGFX10Plus
//===----------------------------------------------------------------------===//
-// S_GETREG_B32 Intrinsic Pattern.
-//===----------------------------------------------------------------------===//
-def : GCNPat <
- (int_amdgcn_s_getreg timm:$simm16),
- (S_GETREG_B32 (as_i16imm $simm16))
->;
-
-//===----------------------------------------------------------------------===//
// SOP1 Patterns
//===----------------------------------------------------------------------===//
@@ -1270,6 +1330,11 @@ def : GCNPat <
>;
def : GCNPat <
+ (int_amdgcn_endpgm),
+ (S_ENDPGM (i16 0))
+>;
+
+def : GCNPat <
(i64 (ctpop i64:$src)),
(i64 (REG_SEQUENCE SReg_64,
(i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0,
@@ -1325,13 +1390,27 @@ def : GCNPat<
(S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src)
>;
+// FIXME: ValueType should have isVector field
+class ScalarNot2Pat<Instruction inst, SDPatternOperator op, ValueType vt,
+ bit isVector = 1> : GCNPat<
+ (UniformBinFrag<op> vt:$src0, (UniformUnaryFrag<!if(isVector, vnot, not)> vt:$src1)),
+ (inst getSOPSrcForVT<vt>.ret:$src0, getSOPSrcForVT<vt>.ret:$src1)
+>;
-//===----------------------------------------------------------------------===//
-// Target-specific instruction encodings.
-//===----------------------------------------------------------------------===//
+// Match these for some more types
+// TODO: i1
+def : ScalarNot2Pat<S_ANDN2_B32, and, i16, 0>;
+def : ScalarNot2Pat<S_ANDN2_B32, and, v2i16>;
+def : ScalarNot2Pat<S_ANDN2_B64, and, v4i16>;
+def : ScalarNot2Pat<S_ANDN2_B64, and, v2i32>;
+
+def : ScalarNot2Pat<S_ORN2_B32, or, i16, 0>;
+def : ScalarNot2Pat<S_ORN2_B32, or, v2i16>;
+def : ScalarNot2Pat<S_ORN2_B64, or, v4i16>;
+def : ScalarNot2Pat<S_ORN2_B64, or, v2i32>;
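
The ScalarNot2Pat definitions above extend the and-not / or-not folds to a few more uniform types: a source of the form src0 & ~src1 (or src0 | ~src1) is selected to a single S_ANDN2 / S_ORN2. A plain-integer illustration of the identity being matched; values are arbitrary and not AMDGPU-specific:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t a = 0x00ff00ffu, b = 0x0000ffffu;
  assert((a & ~b) == 0x00ff0000u);  // the "and-not" S_ANDN2_B32 computes in one op
  assert((a | ~b) == 0xffff00ffu);  // the "or-not" S_ORN2_B32 computes in one op
}
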
//===----------------------------------------------------------------------===//
-// SOP1 - GFX10.
+// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
class Select_gfx10<string opName> : SIMCInstr<opName, SIEncodingFamily.GFX10> {
@@ -1339,6 +1418,20 @@ class Select_gfx10<string opName> : SIMCInstr<opName, SIEncodingFamily.GFX10> {
string DecoderNamespace = "GFX10";
}
+class Select_vi<string opName> : SIMCInstr<opName, SIEncodingFamily.VI> {
+ Predicate AssemblerPredicate = isGFX8GFX9;
+ string DecoderNamespace = "GFX8";
+}
+
+class Select_gfx6_gfx7<string opName> : SIMCInstr<opName, SIEncodingFamily.SI> {
+ Predicate AssemblerPredicate = isGFX6GFX7;
+ string DecoderNamespace = "GFX6GFX7";
+}
+
+//===----------------------------------------------------------------------===//
+// SOP1 - GFX10.
+//===----------------------------------------------------------------------===//
+
multiclass SOP1_Real_gfx10<bits<8> op> {
def _gfx10 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>,
Select_gfx10<!cast<SOP1_Pseudo>(NAME).Mnemonic>;
@@ -1367,10 +1460,6 @@ defm S_MOVRELSD_2_B32 : SOP1_Real_gfx10<0x049>;
// SOP1 - GFX6, GFX7.
//===----------------------------------------------------------------------===//
-class Select_gfx6_gfx7<string opName> : SIMCInstr<opName, SIEncodingFamily.SI> {
- Predicate AssemblerPredicate = isGFX6GFX7;
- string DecoderNamespace = "GFX6GFX7";
-}
multiclass SOP1_Real_gfx6_gfx7<bits<8> op> {
def _gfx6_gfx7 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>,
@@ -1381,7 +1470,6 @@ multiclass SOP1_Real_gfx6_gfx7_gfx10<bits<8> op> :
SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10<op>;
defm S_CBRANCH_JOIN : SOP1_Real_gfx6_gfx7<0x032>;
-defm S_MOV_REGRD_B32 : SOP1_Real_gfx6_gfx7<0x033>;
defm S_MOV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x003>;
defm S_MOV_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x004>;
@@ -1430,7 +1518,6 @@ defm S_MOVRELS_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02f>;
defm S_MOVRELD_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x030>;
defm S_MOVRELD_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x031>;
defm S_ABS_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x034>;
-defm S_MOV_FED_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x035>;
//===----------------------------------------------------------------------===//
// SOP2 - GFX10.
@@ -1574,15 +1661,163 @@ defm S_SETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x013>;
defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx6_gfx7_gfx10<0x015>;
//===----------------------------------------------------------------------===//
-// GFX8, GFX9 (VI).
+// SOPP - GFX6, GFX7, GFX8, GFX9, GFX10
//===----------------------------------------------------------------------===//
-class Select_vi<string opName> :
- SIMCInstr<opName, SIEncodingFamily.VI> {
- Predicate AssemblerPredicate = isGFX8GFX9;
- string DecoderNamespace = "GFX8";
+multiclass SOPP_Real_32_gfx6_gfx7<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic> {
+ def _gfx6_gfx7 : SOPP_Real_32<op, !cast<SOPP_Pseudo>(NAME), real_name>,
+ Select_gfx6_gfx7<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
+ SOPPRelaxTable<0, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx6_gfx7">;
+}
+
+multiclass SOPP_Real_32_gfx8_gfx9<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> {
+ def _vi : SOPP_Real_32<op, !cast<SOPP_Pseudo>(NAME), real_name>,
+ Select_vi<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
+ SOPPRelaxTable<0, !cast<SOPP_Pseudo>(NAME).KeyName, "_vi">;
+}
+
+multiclass SOPP_Real_32_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> {
+ def _gfx10 : SOPP_Real_32<op, !cast<SOPP_Pseudo>(NAME), real_name>,
+ Select_gfx10<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
+ SOPPRelaxTable<0, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx10">;
+}
+
+multiclass SOPP_Real_32_gfx8_gfx9_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> :
+ SOPP_Real_32_gfx8_gfx9<op, real_name>, SOPP_Real_32_gfx10<op, real_name>;
+
+multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> :
+ SOPP_Real_32_gfx6_gfx7<op, real_name>, SOPP_Real_32_gfx8_gfx9<op, real_name>;
+
+multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> :
+ SOPP_Real_32_gfx6_gfx7_gfx8_gfx9<op, real_name>, SOPP_Real_32_gfx10<op, real_name>;
+
+// 64-bit encodings, for relaxation.
+multiclass SOPP_Real_64_gfx6_gfx7<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> {
+ def _gfx6_gfx7 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), real_name>,
+ Select_gfx6_gfx7<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
+ SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx6_gfx7">;
+}
+
+multiclass SOPP_Real_64_gfx8_gfx9<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> {
+ def _vi : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), real_name>,
+ Select_vi<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
+ SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_vi">;
+}
+
+multiclass SOPP_Real_64_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> {
+ def _gfx10 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), real_name>,
+ Select_gfx10<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
+ SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx10">;
+}
+
+multiclass SOPP_Real_64_gfx8_gfx9_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> :
+ SOPP_Real_64_gfx8_gfx9<op, real_name>, SOPP_Real_64_gfx10<op, real_name>;
+
+multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> :
+ SOPP_Real_64_gfx6_gfx7<op, real_name>, SOPP_Real_64_gfx8_gfx9<op, real_name>;
+
+multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> :
+ SOPP_Real_64_gfx6_gfx7_gfx8_gfx9<op, real_name>, SOPP_Real_64_gfx10<op, real_name>;
+
+// Relaxation for instructions with no operands is not implemented.
+multiclass SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> {
+ defm "" : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<op>;
+ defm _pad_s_nop : SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<op>;
+}
+
+defm S_NOP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x000>;
+defm S_ENDPGM : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x001, "s_endpgm">;
+defm S_BRANCH : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x002>;
+defm S_WAKEUP : SOPP_Real_32_gfx8_gfx9_gfx10<0x003>;
+defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x004>;
+defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x005>;
+defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x006>;
+defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x007>;
+defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x008>;
+defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x009>;
+defm S_CBRANCH_CDBGSYS : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x017>;
+defm S_CBRANCH_CDBGUSER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x018>;
+defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x019>;
+defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x01A>;
+defm S_BARRIER : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00a>;
+defm S_WAITCNT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00c>;
+defm S_SETHALT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00d>;
+defm S_SETKILL : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00b>;
+defm S_SLEEP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00e>;
+defm S_SETPRIO : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00f>;
+defm S_SENDMSG : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x010>;
+defm S_SENDMSGHALT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x011>;
+defm S_TRAP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x012>;
+defm S_ICACHE_INV : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x013>;
+defm S_INCPERFLEVEL : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x014>;
+defm S_DECPERFLEVEL : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x015>;
+defm S_TTRACEDATA : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x016>;
+defm S_ENDPGM_SAVED : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x01B>;
+defm S_SET_GPR_IDX_OFF : SOPP_Real_32_gfx8_gfx9<0x01c>;
+defm S_SET_GPR_IDX_MODE : SOPP_Real_32_gfx8_gfx9<0x01d>;
+defm S_ENDPGM_ORDERED_PS_DONE : SOPP_Real_32_gfx8_gfx9_gfx10<0x01e>;
+defm S_CODE_END : SOPP_Real_32_gfx10<0x01f>;
+defm S_INST_PREFETCH : SOPP_Real_32_gfx10<0x020>;
+defm S_CLAUSE : SOPP_Real_32_gfx10<0x021>;
+defm S_WAIT_IDLE : SOPP_Real_32_gfx10<0x022>;
+defm S_WAITCNT_DEPCTR : SOPP_Real_32_gfx10<0x023>;
+defm S_ROUND_MODE : SOPP_Real_32_gfx10<0x024>;
+defm S_DENORM_MODE : SOPP_Real_32_gfx10<0x025>;
+defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx10<0x028>;
+
+//===----------------------------------------------------------------------===//
+// SOPC - GFX6, GFX7, GFX8, GFX9, GFX10
+//===----------------------------------------------------------------------===//
+
+multiclass SOPC_Real_gfx6_gfx7<bits<7> op> {
+ def _gfx6_gfx7 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>,
+ Select_gfx6_gfx7<!cast<SOPC_Pseudo>(NAME).Mnemonic>;
+}
+
+multiclass SOPC_Real_gfx8_gfx9<bits<7> op> {
+ def _vi : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>,
+ Select_vi<!cast<SOPC_Pseudo>(NAME).Mnemonic>;
+}
+
+multiclass SOPC_Real_gfx10<bits<7> op> {
+ def _gfx10 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>,
+ Select_gfx10<!cast<SOPC_Pseudo>(NAME).Mnemonic>;
}
+multiclass SOPC_Real_gfx8_gfx9_gfx10<bits<7> op> :
+ SOPC_Real_gfx8_gfx9<op>, SOPC_Real_gfx10<op>;
+
+multiclass SOPC_Real_gfx6_gfx7_gfx8_gfx9<bits<7> op> :
+ SOPC_Real_gfx6_gfx7<op>, SOPC_Real_gfx8_gfx9<op>;
+
+multiclass SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> :
+ SOPC_Real_gfx6_gfx7_gfx8_gfx9<op>, SOPC_Real_gfx10<op>;
+
+defm S_CMP_EQ_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10<0x00>;
+defm S_CMP_LG_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10<0x01>;
+defm S_CMP_GT_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10<0x02>;
+defm S_CMP_GE_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10<0x03>;
+defm S_CMP_LT_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10<0x04>;
+defm S_CMP_LE_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10<0x05>;
+defm S_CMP_EQ_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10<0x06>;
+defm S_CMP_LG_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10<0x07>;
+defm S_CMP_GT_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10<0x08>;
+defm S_CMP_GE_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10<0x09>;
+defm S_CMP_LT_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10<0x0a>;
+defm S_CMP_LE_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10<0x0b>;
+defm S_BITCMP0_B32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10<0x0c>;
+defm S_BITCMP1_B32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10<0x0d>;
+defm S_BITCMP0_B64 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10<0x0e>;
+defm S_BITCMP1_B64 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10<0x0f>;
+defm S_SETVSKIP : SOPC_Real_gfx6_gfx7_gfx8_gfx9<0x10>;
+defm S_SET_GPR_IDX_ON : SOPC_Real_gfx8_gfx9<0x11>;
+defm S_CMP_EQ_U64 : SOPC_Real_gfx8_gfx9_gfx10<0x12>;
+defm S_CMP_LG_U64 : SOPC_Real_gfx8_gfx9_gfx10<0x13>;
+
+//===----------------------------------------------------------------------===//
+// GFX8 (VI), GFX9.
+//===----------------------------------------------------------------------===//
+
class SOP1_Real_vi<bits<8> op, SOP1_Pseudo ps> :
SOP1_Real<op, ps>,
Select_vi<ps.Mnemonic>;
@@ -1643,9 +1878,7 @@ def S_MOVRELS_B64_vi : SOP1_Real_vi <0x2b, S_MOVRELS_B64>;
def S_MOVRELD_B32_vi : SOP1_Real_vi <0x2c, S_MOVRELD_B32>;
def S_MOVRELD_B64_vi : SOP1_Real_vi <0x2d, S_MOVRELD_B64>;
def S_CBRANCH_JOIN_vi : SOP1_Real_vi <0x2e, S_CBRANCH_JOIN>;
-def S_MOV_REGRD_B32_vi : SOP1_Real_vi <0x2f, S_MOV_REGRD_B32>;
def S_ABS_I32_vi : SOP1_Real_vi <0x30, S_ABS_I32>;
-def S_MOV_FED_B32_vi : SOP1_Real_vi <0x31, S_MOV_FED_B32>;
def S_SET_GPR_IDX_IDX_vi : SOP1_Real_vi <0x32, S_SET_GPR_IDX_IDX>;
def S_ADD_U32_vi : SOP2_Real_vi <0x00, S_ADD_U32>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 5819a621f55d..c8a85d76a55b 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -6,6 +6,9 @@
//
//===----------------------------------------------------------------------===//
#include "AMDGPUAsmUtils.h"
+#include "SIDefines.h"
+
+#include "llvm/ADT/StringRef.h"
namespace llvm {
namespace AMDGPU {
@@ -87,6 +90,250 @@ const char* const IdSymbolic[] = {
} // namespace Hwreg
+namespace MTBUFFormat {
+
+StringLiteral const DfmtSymbolic[] = {
+ "BUF_DATA_FORMAT_INVALID",
+ "BUF_DATA_FORMAT_8",
+ "BUF_DATA_FORMAT_16",
+ "BUF_DATA_FORMAT_8_8",
+ "BUF_DATA_FORMAT_32",
+ "BUF_DATA_FORMAT_16_16",
+ "BUF_DATA_FORMAT_10_11_11",
+ "BUF_DATA_FORMAT_11_11_10",
+ "BUF_DATA_FORMAT_10_10_10_2",
+ "BUF_DATA_FORMAT_2_10_10_10",
+ "BUF_DATA_FORMAT_8_8_8_8",
+ "BUF_DATA_FORMAT_32_32",
+ "BUF_DATA_FORMAT_16_16_16_16",
+ "BUF_DATA_FORMAT_32_32_32",
+ "BUF_DATA_FORMAT_32_32_32_32",
+ "BUF_DATA_FORMAT_RESERVED_15"
+};
+
+StringLiteral const NfmtSymbolicGFX10[] = {
+ "BUF_NUM_FORMAT_UNORM",
+ "BUF_NUM_FORMAT_SNORM",
+ "BUF_NUM_FORMAT_USCALED",
+ "BUF_NUM_FORMAT_SSCALED",
+ "BUF_NUM_FORMAT_UINT",
+ "BUF_NUM_FORMAT_SINT",
+ "",
+ "BUF_NUM_FORMAT_FLOAT"
+};
+
+StringLiteral const NfmtSymbolicSICI[] = {
+ "BUF_NUM_FORMAT_UNORM",
+ "BUF_NUM_FORMAT_SNORM",
+ "BUF_NUM_FORMAT_USCALED",
+ "BUF_NUM_FORMAT_SSCALED",
+ "BUF_NUM_FORMAT_UINT",
+ "BUF_NUM_FORMAT_SINT",
+ "BUF_NUM_FORMAT_SNORM_OGL",
+ "BUF_NUM_FORMAT_FLOAT"
+};
+
+StringLiteral const NfmtSymbolicVI[] = { // VI and GFX9
+ "BUF_NUM_FORMAT_UNORM",
+ "BUF_NUM_FORMAT_SNORM",
+ "BUF_NUM_FORMAT_USCALED",
+ "BUF_NUM_FORMAT_SSCALED",
+ "BUF_NUM_FORMAT_UINT",
+ "BUF_NUM_FORMAT_SINT",
+ "BUF_NUM_FORMAT_RESERVED_6",
+ "BUF_NUM_FORMAT_FLOAT"
+};
+
+StringLiteral const UfmtSymbolic[] = {
+ "BUF_FMT_INVALID",
+
+ "BUF_FMT_8_UNORM",
+ "BUF_FMT_8_SNORM",
+ "BUF_FMT_8_USCALED",
+ "BUF_FMT_8_SSCALED",
+ "BUF_FMT_8_UINT",
+ "BUF_FMT_8_SINT",
+
+ "BUF_FMT_16_UNORM",
+ "BUF_FMT_16_SNORM",
+ "BUF_FMT_16_USCALED",
+ "BUF_FMT_16_SSCALED",
+ "BUF_FMT_16_UINT",
+ "BUF_FMT_16_SINT",
+ "BUF_FMT_16_FLOAT",
+
+ "BUF_FMT_8_8_UNORM",
+ "BUF_FMT_8_8_SNORM",
+ "BUF_FMT_8_8_USCALED",
+ "BUF_FMT_8_8_SSCALED",
+ "BUF_FMT_8_8_UINT",
+ "BUF_FMT_8_8_SINT",
+
+ "BUF_FMT_32_UINT",
+ "BUF_FMT_32_SINT",
+ "BUF_FMT_32_FLOAT",
+
+ "BUF_FMT_16_16_UNORM",
+ "BUF_FMT_16_16_SNORM",
+ "BUF_FMT_16_16_USCALED",
+ "BUF_FMT_16_16_SSCALED",
+ "BUF_FMT_16_16_UINT",
+ "BUF_FMT_16_16_SINT",
+ "BUF_FMT_16_16_FLOAT",
+
+ "BUF_FMT_10_11_11_UNORM",
+ "BUF_FMT_10_11_11_SNORM",
+ "BUF_FMT_10_11_11_USCALED",
+ "BUF_FMT_10_11_11_SSCALED",
+ "BUF_FMT_10_11_11_UINT",
+ "BUF_FMT_10_11_11_SINT",
+ "BUF_FMT_10_11_11_FLOAT",
+
+ "BUF_FMT_11_11_10_UNORM",
+ "BUF_FMT_11_11_10_SNORM",
+ "BUF_FMT_11_11_10_USCALED",
+ "BUF_FMT_11_11_10_SSCALED",
+ "BUF_FMT_11_11_10_UINT",
+ "BUF_FMT_11_11_10_SINT",
+ "BUF_FMT_11_11_10_FLOAT",
+
+ "BUF_FMT_10_10_10_2_UNORM",
+ "BUF_FMT_10_10_10_2_SNORM",
+ "BUF_FMT_10_10_10_2_USCALED",
+ "BUF_FMT_10_10_10_2_SSCALED",
+ "BUF_FMT_10_10_10_2_UINT",
+ "BUF_FMT_10_10_10_2_SINT",
+
+ "BUF_FMT_2_10_10_10_UNORM",
+ "BUF_FMT_2_10_10_10_SNORM",
+ "BUF_FMT_2_10_10_10_USCALED",
+ "BUF_FMT_2_10_10_10_SSCALED",
+ "BUF_FMT_2_10_10_10_UINT",
+ "BUF_FMT_2_10_10_10_SINT",
+
+ "BUF_FMT_8_8_8_8_UNORM",
+ "BUF_FMT_8_8_8_8_SNORM",
+ "BUF_FMT_8_8_8_8_USCALED",
+ "BUF_FMT_8_8_8_8_SSCALED",
+ "BUF_FMT_8_8_8_8_UINT",
+ "BUF_FMT_8_8_8_8_SINT",
+
+ "BUF_FMT_32_32_UINT",
+ "BUF_FMT_32_32_SINT",
+ "BUF_FMT_32_32_FLOAT",
+
+ "BUF_FMT_16_16_16_16_UNORM",
+ "BUF_FMT_16_16_16_16_SNORM",
+ "BUF_FMT_16_16_16_16_USCALED",
+ "BUF_FMT_16_16_16_16_SSCALED",
+ "BUF_FMT_16_16_16_16_UINT",
+ "BUF_FMT_16_16_16_16_SINT",
+ "BUF_FMT_16_16_16_16_FLOAT",
+
+ "BUF_FMT_32_32_32_UINT",
+ "BUF_FMT_32_32_32_SINT",
+ "BUF_FMT_32_32_32_FLOAT",
+ "BUF_FMT_32_32_32_32_UINT",
+ "BUF_FMT_32_32_32_32_SINT",
+ "BUF_FMT_32_32_32_32_FLOAT"
+};
+
+unsigned const DfmtNfmt2UFmt[] = {
+ DFMT_INVALID | (NFMT_UNORM << NFMT_SHIFT),
+
+ DFMT_8 | (NFMT_UNORM << NFMT_SHIFT),
+ DFMT_8 | (NFMT_SNORM << NFMT_SHIFT),
+ DFMT_8 | (NFMT_USCALED << NFMT_SHIFT),
+ DFMT_8 | (NFMT_SSCALED << NFMT_SHIFT),
+ DFMT_8 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_8 | (NFMT_SINT << NFMT_SHIFT),
+
+ DFMT_16 | (NFMT_UNORM << NFMT_SHIFT),
+ DFMT_16 | (NFMT_SNORM << NFMT_SHIFT),
+ DFMT_16 | (NFMT_USCALED << NFMT_SHIFT),
+ DFMT_16 | (NFMT_SSCALED << NFMT_SHIFT),
+ DFMT_16 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_16 | (NFMT_SINT << NFMT_SHIFT),
+ DFMT_16 | (NFMT_FLOAT << NFMT_SHIFT),
+
+ DFMT_8_8 | (NFMT_UNORM << NFMT_SHIFT),
+ DFMT_8_8 | (NFMT_SNORM << NFMT_SHIFT),
+ DFMT_8_8 | (NFMT_USCALED << NFMT_SHIFT),
+ DFMT_8_8 | (NFMT_SSCALED << NFMT_SHIFT),
+ DFMT_8_8 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_8_8 | (NFMT_SINT << NFMT_SHIFT),
+
+ DFMT_32 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_32 | (NFMT_SINT << NFMT_SHIFT),
+ DFMT_32 | (NFMT_FLOAT << NFMT_SHIFT),
+
+ DFMT_16_16 | (NFMT_UNORM << NFMT_SHIFT),
+ DFMT_16_16 | (NFMT_SNORM << NFMT_SHIFT),
+ DFMT_16_16 | (NFMT_USCALED << NFMT_SHIFT),
+ DFMT_16_16 | (NFMT_SSCALED << NFMT_SHIFT),
+ DFMT_16_16 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_16_16 | (NFMT_SINT << NFMT_SHIFT),
+ DFMT_16_16 | (NFMT_FLOAT << NFMT_SHIFT),
+
+ DFMT_10_11_11 | (NFMT_UNORM << NFMT_SHIFT),
+ DFMT_10_11_11 | (NFMT_SNORM << NFMT_SHIFT),
+ DFMT_10_11_11 | (NFMT_USCALED << NFMT_SHIFT),
+ DFMT_10_11_11 | (NFMT_SSCALED << NFMT_SHIFT),
+ DFMT_10_11_11 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_10_11_11 | (NFMT_SINT << NFMT_SHIFT),
+ DFMT_10_11_11 | (NFMT_FLOAT << NFMT_SHIFT),
+
+ DFMT_11_11_10 | (NFMT_UNORM << NFMT_SHIFT),
+ DFMT_11_11_10 | (NFMT_SNORM << NFMT_SHIFT),
+ DFMT_11_11_10 | (NFMT_USCALED << NFMT_SHIFT),
+ DFMT_11_11_10 | (NFMT_SSCALED << NFMT_SHIFT),
+ DFMT_11_11_10 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_11_11_10 | (NFMT_SINT << NFMT_SHIFT),
+ DFMT_11_11_10 | (NFMT_FLOAT << NFMT_SHIFT),
+
+ DFMT_10_10_10_2 | (NFMT_UNORM << NFMT_SHIFT),
+ DFMT_10_10_10_2 | (NFMT_SNORM << NFMT_SHIFT),
+ DFMT_10_10_10_2 | (NFMT_USCALED << NFMT_SHIFT),
+ DFMT_10_10_10_2 | (NFMT_SSCALED << NFMT_SHIFT),
+ DFMT_10_10_10_2 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_10_10_10_2 | (NFMT_SINT << NFMT_SHIFT),
+
+ DFMT_2_10_10_10 | (NFMT_UNORM << NFMT_SHIFT),
+ DFMT_2_10_10_10 | (NFMT_SNORM << NFMT_SHIFT),
+ DFMT_2_10_10_10 | (NFMT_USCALED << NFMT_SHIFT),
+ DFMT_2_10_10_10 | (NFMT_SSCALED << NFMT_SHIFT),
+ DFMT_2_10_10_10 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_2_10_10_10 | (NFMT_SINT << NFMT_SHIFT),
+
+ DFMT_8_8_8_8 | (NFMT_UNORM << NFMT_SHIFT),
+ DFMT_8_8_8_8 | (NFMT_SNORM << NFMT_SHIFT),
+ DFMT_8_8_8_8 | (NFMT_USCALED << NFMT_SHIFT),
+ DFMT_8_8_8_8 | (NFMT_SSCALED << NFMT_SHIFT),
+ DFMT_8_8_8_8 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_8_8_8_8 | (NFMT_SINT << NFMT_SHIFT),
+
+ DFMT_32_32 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_32_32 | (NFMT_SINT << NFMT_SHIFT),
+ DFMT_32_32 | (NFMT_FLOAT << NFMT_SHIFT),
+
+ DFMT_16_16_16_16 | (NFMT_UNORM << NFMT_SHIFT),
+ DFMT_16_16_16_16 | (NFMT_SNORM << NFMT_SHIFT),
+ DFMT_16_16_16_16 | (NFMT_USCALED << NFMT_SHIFT),
+ DFMT_16_16_16_16 | (NFMT_SSCALED << NFMT_SHIFT),
+ DFMT_16_16_16_16 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_16_16_16_16 | (NFMT_SINT << NFMT_SHIFT),
+ DFMT_16_16_16_16 | (NFMT_FLOAT << NFMT_SHIFT),
+
+ DFMT_32_32_32 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_32_32_32 | (NFMT_SINT << NFMT_SHIFT),
+ DFMT_32_32_32 | (NFMT_FLOAT << NFMT_SHIFT),
+ DFMT_32_32_32_32 | (NFMT_UINT << NFMT_SHIFT),
+ DFMT_32_32_32_32 | (NFMT_SINT << NFMT_SHIFT),
+ DFMT_32_32_32_32 | (NFMT_FLOAT << NFMT_SHIFT)
+};
+
+} // namespace MTBUFFormat
+
namespace Swizzle {
// This must be in sync with llvm::AMDGPU::Swizzle::Id enum members, see SIDefines.h.
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
index cd91c5f6edd5..3eb27c5e5f42 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
@@ -10,7 +10,11 @@
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H
namespace llvm {
+
+class StringLiteral;
+
namespace AMDGPU {
+
namespace SendMsg { // Symbolic names for the sendmsg(...) syntax.
extern const char* const IdSymbolic[];
@@ -25,6 +29,17 @@ extern const char* const IdSymbolic[];
} // namespace Hwreg
+namespace MTBUFFormat {
+
+extern StringLiteral const DfmtSymbolic[];
+extern StringLiteral const NfmtSymbolicGFX10[];
+extern StringLiteral const NfmtSymbolicSICI[];
+extern StringLiteral const NfmtSymbolicVI[];
+extern StringLiteral const UfmtSymbolic[];
+extern unsigned const DfmtNfmt2UFmt[];
+
+} // namespace MTBUFFormat
+
namespace Swizzle { // Symbolic names for the swizzle(...) syntax.
extern const char* const IdSymbolic[];
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 3df2157fc402..4c1e4dec7ecb 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -9,44 +9,28 @@
#include "AMDGPUBaseInfo.h"
#include "AMDGPU.h"
#include "AMDGPUAsmUtils.h"
-#include "AMDGPUTargetTransformInfo.h"
-#include "SIDefines.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
+#include "AMDKernelCodeT.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/IR/Attributes.h"
-#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/Instruction.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/SubtargetFeature.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <cstring>
-#include <utility>
-
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/Support/AMDHSAKernelDescriptor.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/TargetParser.h"
#define GET_INSTRINFO_NAMED_OPS
#define GET_INSTRMAP_INFO
#include "AMDGPUGenInstrInfo.inc"
-#undef GET_INSTRMAP_INFO
-#undef GET_INSTRINFO_NAMED_OPS
+
+static llvm::cl::opt<unsigned> AmdhsaCodeObjectVersion(
+ "amdhsa-code-object-version", llvm::cl::Hidden,
+ llvm::cl::desc("AMDHSA Code Object Version"), llvm::cl::init(3));
namespace {
@@ -103,6 +87,32 @@ namespace llvm {
namespace AMDGPU {
+Optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI) {
+ if (STI && STI->getTargetTriple().getOS() != Triple::AMDHSA)
+ return None;
+
+ switch (AmdhsaCodeObjectVersion) {
+ case 2:
+ return ELF::ELFABIVERSION_AMDGPU_HSA_V2;
+ case 3:
+ return ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+ default:
+ return ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+ }
+}
+
+bool isHsaAbiVersion2(const MCSubtargetInfo *STI) {
+ if (const auto &&HsaAbiVer = getHsaAbiVersion(STI))
+ return HsaAbiVer.getValue() == ELF::ELFABIVERSION_AMDGPU_HSA_V2;
+ return false;
+}
+
+bool isHsaAbiVersion3(const MCSubtargetInfo *STI) {
+ if (const auto &&HsaAbiVer = getHsaAbiVersion(STI))
+ return HsaAbiVer.getValue() == ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+ return false;
+}
+
#define GET_MIMGBaseOpcodesTable_IMPL
#define GET_MIMGDimInfoTable_IMPL
#define GET_MIMGInfoTable_IMPL
@@ -236,6 +246,94 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen) {
namespace IsaInfo {
+AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI)
+ : XnackSetting(TargetIDSetting::Any), SramEccSetting(TargetIDSetting::Any) {
+ if (!STI.getFeatureBits().test(FeatureSupportsXNACK))
+ XnackSetting = TargetIDSetting::Unsupported;
+ if (!STI.getFeatureBits().test(FeatureSupportsSRAMECC))
+ SramEccSetting = TargetIDSetting::Unsupported;
+}
+
+void AMDGPUTargetID::setTargetIDFromFeaturesString(StringRef FS) {
+ // Check if xnack or sramecc is explicitly enabled or disabled. In the
+ // absence of the target features we assume we must generate code that can run
+ // in any environment.
+ SubtargetFeatures Features(FS);
+ Optional<bool> XnackRequested;
+ Optional<bool> SramEccRequested;
+
+ for (const std::string &Feature : Features.getFeatures()) {
+ if (Feature == "+xnack")
+ XnackRequested = true;
+ else if (Feature == "-xnack")
+ XnackRequested = false;
+ else if (Feature == "+sramecc")
+ SramEccRequested = true;
+ else if (Feature == "-sramecc")
+ SramEccRequested = false;
+ }
+
+ bool XnackSupported = isXnackSupported();
+ bool SramEccSupported = isSramEccSupported();
+
+ if (XnackRequested) {
+ if (XnackSupported) {
+ XnackSetting =
+ *XnackRequested ? TargetIDSetting::On : TargetIDSetting::Off;
+ } else {
+      // If a specific xnack setting was requested and this GPU does not
+      // support xnack, emit a warning. Setting will remain set to "Unsupported".
+ if (*XnackRequested) {
+ errs() << "warning: xnack 'On' was requested for a processor that does "
+ "not support it!\n";
+ } else {
+ errs() << "warning: xnack 'Off' was requested for a processor that "
+ "does not support it!\n";
+ }
+ }
+ }
+
+ if (SramEccRequested) {
+ if (SramEccSupported) {
+ SramEccSetting =
+ *SramEccRequested ? TargetIDSetting::On : TargetIDSetting::Off;
+ } else {
+ // If a specific sramecc setting was requested and this GPU does not
+      // support sramecc, emit a warning. Setting will remain set to
+ // "Unsupported".
+ if (*SramEccRequested) {
+ errs() << "warning: sramecc 'On' was requested for a processor that "
+ "does not support it!\n";
+ } else {
+ errs() << "warning: sramecc 'Off' was requested for a processor that "
+ "does not support it!\n";
+ }
+ }
+ }
+}
+
+static TargetIDSetting
+getTargetIDSettingFromFeatureString(StringRef FeatureString) {
+ if (FeatureString.endswith("-"))
+ return TargetIDSetting::Off;
+ if (FeatureString.endswith("+"))
+ return TargetIDSetting::On;
+
+ llvm_unreachable("Malformed feature string");
+}
+
+void AMDGPUTargetID::setTargetIDFromTargetIDStream(StringRef TargetID) {
+ SmallVector<StringRef, 3> TargetIDSplit;
+ TargetID.split(TargetIDSplit, ':');
+
+ for (const auto &FeatureString : TargetIDSplit) {
+ if (FeatureString.startswith("xnack"))
+ XnackSetting = getTargetIDSettingFromFeatureString(FeatureString);
+ if (FeatureString.startswith("sramecc"))
+ SramEccSetting = getTargetIDSettingFromFeatureString(FeatureString);
+ }
+}
+
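
setTargetIDFromFeaturesString and setTargetIDFromTargetIDStream above accept the same information in two spellings: a subtarget feature string such as "+xnack,-sramecc" and a target-ID stream such as "xnack+:sramecc-". A standalone sketch of the feature-string rule, not the LLVM API and with invented names, in which the last explicit +xnack/-xnack wins and an absent feature leaves the setting at "Any":

#include <iostream>
#include <optional>
#include <sstream>
#include <string>

// Invented helper: returns true/false for an explicit +xnack/-xnack request,
// or an empty optional when the feature string does not mention xnack.
std::optional<bool> parseXnackRequest(const std::string &FS) {
  std::optional<bool> Req;
  std::stringstream SS(FS);
  std::string Tok;
  while (std::getline(SS, Tok, ',')) {
    if (Tok == "+xnack")
      Req = true;
    else if (Tok == "-xnack")
      Req = false;
  }
  return Req;
}

int main() {
  std::cout << *parseXnackRequest("+wavefrontsize64,+xnack") << "\n"; // 1 -> "On"
  std::cout << parseXnackRequest("-sramecc").has_value() << "\n";     // 0 -> stays "Any"
}
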
void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) {
auto TargetTriple = STI->getTargetTriple();
auto Version = getIsaVersion(STI->getCPU());
@@ -252,16 +350,11 @@ void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) {
if (hasXNACK(*STI))
Stream << "+xnack";
if (hasSRAMECC(*STI))
- Stream << "+sram-ecc";
+ Stream << "+sramecc";
Stream.flush();
}
-bool hasCodeObjectV3(const MCSubtargetInfo *STI) {
- return STI->getTargetTriple().getOS() == Triple::AMDHSA &&
- STI->getFeatureBits().test(FeatureCodeObjectV3);
-}
-
unsigned getWavefrontSize(const MCSubtargetInfo *STI) {
if (STI->getFeatureBits().test(FeatureWavefrontSize16))
return 16;
@@ -284,7 +377,7 @@ unsigned getEUsPerCU(const MCSubtargetInfo *STI) {
// "Per CU" really means "per whatever functional block the waves of a
// workgroup must share". For gfx10 in CU mode this is the CU, which contains
// two SIMDs.
- if (isGFX10(*STI) && STI->getFeatureBits().test(FeatureCuMode))
+ if (isGFX10Plus(*STI) && STI->getFeatureBits().test(FeatureCuMode))
return 2;
// Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP contains
// two CUs, so a total of four SIMDs.
@@ -309,7 +402,7 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) {
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI) {
// FIXME: Need to take scratch memory into account.
- if (!isGFX10(*STI))
+ if (!isGFX10Plus(*STI))
return 10;
return hasGFX10_3Insts(*STI) ? 16 : 20;
}
@@ -459,7 +552,7 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
}
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
- if (!isGFX10(*STI))
+ if (!isGFX10Plus(*STI))
return 256;
return STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1024 : 512;
}
@@ -578,7 +671,7 @@ bool isReadOnlySegment(const GlobalValue *GV) {
}
bool shouldEmitConstantsToTextSection(const Triple &TT) {
- return TT.getOS() == Triple::AMDPAL || TT.getArch() == Triple::r600;
+ return TT.getArch() == Triple::r600;
}
int getIntegerAttribute(const Function &F, StringRef Name, int Default) {
@@ -784,6 +877,165 @@ void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width)
} // namespace Hwreg
//===----------------------------------------------------------------------===//
+// exp tgt
+//===----------------------------------------------------------------------===//
+
+namespace Exp {
+
+struct ExpTgt {
+ StringLiteral Name;
+ unsigned Tgt;
+ unsigned MaxIndex;
+};
+
+static constexpr ExpTgt ExpTgtInfo[] = {
+ {{"null"}, ET_NULL, ET_NULL_MAX_IDX},
+ {{"mrtz"}, ET_MRTZ, ET_MRTZ_MAX_IDX},
+ {{"prim"}, ET_PRIM, ET_PRIM_MAX_IDX},
+ {{"mrt"}, ET_MRT0, ET_MRT_MAX_IDX},
+ {{"pos"}, ET_POS0, ET_POS_MAX_IDX},
+ {{"param"}, ET_PARAM0, ET_PARAM_MAX_IDX},
+};
+
+bool getTgtName(unsigned Id, StringRef &Name, int &Index) {
+ for (const ExpTgt &Val : ExpTgtInfo) {
+ if (Val.Tgt <= Id && Id <= Val.Tgt + Val.MaxIndex) {
+ Index = (Val.MaxIndex == 0) ? -1 : (Id - Val.Tgt);
+ Name = Val.Name;
+ return true;
+ }
+ }
+ return false;
+}
+
+unsigned getTgtId(const StringRef Name) {
+
+ for (const ExpTgt &Val : ExpTgtInfo) {
+ if (Val.MaxIndex == 0 && Name == Val.Name)
+ return Val.Tgt;
+
+ if (Val.MaxIndex > 0 && Name.startswith(Val.Name)) {
+ StringRef Suffix = Name.drop_front(Val.Name.size());
+
+ unsigned Id;
+ if (Suffix.getAsInteger(10, Id) || Id > Val.MaxIndex)
+ return ET_INVALID;
+
+      // Disallow leading zeroes.
+ if (Suffix.size() > 1 && Suffix[0] == '0')
+ return ET_INVALID;
+
+ return Val.Tgt + Id;
+ }
+ }
+ return ET_INVALID;
+}
+
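
getTgtId above splits an export target such as "mrt0", "pos3", or "param23" into a known base name plus a decimal index, rejecting leading zeroes and indices past the entry's MaxIndex. A self-contained sketch of just that suffix rule; the helper and its -1 error convention are made up:

#include <cctype>
#include <string>

// Parse the numeric suffix of an export target name; -1 means malformed.
int parseTgtIndex(const std::string &Suffix, unsigned MaxIndex) {
  if (Suffix.empty())
    return -1;
  if (Suffix.size() > 1 && Suffix[0] == '0')    // e.g. "pos03" is rejected
    return -1;
  unsigned Id = 0;
  for (char C : Suffix) {
    if (!std::isdigit(static_cast<unsigned char>(C)))
      return -1;
    Id = Id * 10 + unsigned(C - '0');
  }
  return Id <= MaxIndex ? int(Id) : -1;
}

int main() {
  return parseTgtIndex("3", 7) == 3 ? 0 : 1;    // an "mrt3"-style suffix parses to 3
}
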
+bool isSupportedTgtId(unsigned Id, const MCSubtargetInfo &STI) {
+ return (Id != ET_POS4 && Id != ET_PRIM) || isGFX10Plus(STI);
+}
+
+} // namespace Exp
+
+//===----------------------------------------------------------------------===//
+// MTBUF Format
+//===----------------------------------------------------------------------===//
+
+namespace MTBUFFormat {
+
+int64_t getDfmt(const StringRef Name) {
+ for (int Id = DFMT_MIN; Id <= DFMT_MAX; ++Id) {
+ if (Name == DfmtSymbolic[Id])
+ return Id;
+ }
+ return DFMT_UNDEF;
+}
+
+StringRef getDfmtName(unsigned Id) {
+ assert(Id <= DFMT_MAX);
+ return DfmtSymbolic[Id];
+}
+
+static StringLiteral const *getNfmtLookupTable(const MCSubtargetInfo &STI) {
+ if (isSI(STI) || isCI(STI))
+ return NfmtSymbolicSICI;
+ if (isVI(STI) || isGFX9(STI))
+ return NfmtSymbolicVI;
+ return NfmtSymbolicGFX10;
+}
+
+int64_t getNfmt(const StringRef Name, const MCSubtargetInfo &STI) {
+ auto lookupTable = getNfmtLookupTable(STI);
+ for (int Id = NFMT_MIN; Id <= NFMT_MAX; ++Id) {
+ if (Name == lookupTable[Id])
+ return Id;
+ }
+ return NFMT_UNDEF;
+}
+
+StringRef getNfmtName(unsigned Id, const MCSubtargetInfo &STI) {
+ assert(Id <= NFMT_MAX);
+ return getNfmtLookupTable(STI)[Id];
+}
+
+bool isValidDfmtNfmt(unsigned Id, const MCSubtargetInfo &STI) {
+ unsigned Dfmt;
+ unsigned Nfmt;
+ decodeDfmtNfmt(Id, Dfmt, Nfmt);
+ return isValidNfmt(Nfmt, STI);
+}
+
+bool isValidNfmt(unsigned Id, const MCSubtargetInfo &STI) {
+ return !getNfmtName(Id, STI).empty();
+}
+
+int64_t encodeDfmtNfmt(unsigned Dfmt, unsigned Nfmt) {
+ return (Dfmt << DFMT_SHIFT) | (Nfmt << NFMT_SHIFT);
+}
+
+void decodeDfmtNfmt(unsigned Format, unsigned &Dfmt, unsigned &Nfmt) {
+ Dfmt = (Format >> DFMT_SHIFT) & DFMT_MASK;
+ Nfmt = (Format >> NFMT_SHIFT) & NFMT_MASK;
+}
+
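
encodeDfmtNfmt / decodeDfmtNfmt above are a straightforward pack and unpack of two small fields. A round-trip sketch follows; the shift and mask constants are assumptions mirroring SIDefines.h (dfmt in the low nibble, nfmt just above it), and only the pattern matters here:

#include <cassert>
#include <cstdint>

// Assumed field layout (see SIDefines.h in the real tree).
constexpr unsigned DFMT_SHIFT = 0, DFMT_MASK = 0xf;
constexpr unsigned NFMT_SHIFT = 4, NFMT_MASK = 0x7;

constexpr int64_t encodeDfmtNfmt(unsigned Dfmt, unsigned Nfmt) {
  return (Dfmt << DFMT_SHIFT) | (Nfmt << NFMT_SHIFT);
}

int main() {
  int64_t Fmt = encodeDfmtNfmt(14, 7);          // e.g. DFMT_32_32_32_32, NFMT_FLOAT
  unsigned Dfmt = (Fmt >> DFMT_SHIFT) & DFMT_MASK;
  unsigned Nfmt = (Fmt >> NFMT_SHIFT) & NFMT_MASK;
  assert(Dfmt == 14 && Nfmt == 7);
}
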
+int64_t getUnifiedFormat(const StringRef Name) {
+ for (int Id = UFMT_FIRST; Id <= UFMT_LAST; ++Id) {
+ if (Name == UfmtSymbolic[Id])
+ return Id;
+ }
+ return UFMT_UNDEF;
+}
+
+StringRef getUnifiedFormatName(unsigned Id) {
+ return isValidUnifiedFormat(Id) ? UfmtSymbolic[Id] : "";
+}
+
+bool isValidUnifiedFormat(unsigned Id) {
+ return Id <= UFMT_LAST;
+}
+
+int64_t convertDfmtNfmt2Ufmt(unsigned Dfmt, unsigned Nfmt) {
+ int64_t Fmt = encodeDfmtNfmt(Dfmt, Nfmt);
+ for (int Id = UFMT_FIRST; Id <= UFMT_LAST; ++Id) {
+ if (Fmt == DfmtNfmt2UFmt[Id])
+ return Id;
+ }
+ return UFMT_UNDEF;
+}
+
+bool isValidFormatEncoding(unsigned Val, const MCSubtargetInfo &STI) {
+ return isGFX10Plus(STI) ? (Val <= UFMT_MAX) : (Val <= DFMT_NFMT_MAX);
+}
+
+unsigned getDefaultFormatEncoding(const MCSubtargetInfo &STI) {
+ if (isGFX10Plus(STI))
+ return UFMT_DEFAULT;
+ return DFMT_NFMT_DEFAULT;
+}
+
+} // namespace MTBUFFormat
+
+//===----------------------------------------------------------------------===//
// SendMsg
//===----------------------------------------------------------------------===//
@@ -804,7 +1056,7 @@ static bool isValidMsgId(int64_t MsgId) {
bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict) {
if (Strict) {
if (MsgId == ID_GS_ALLOC_REQ || MsgId == ID_GET_DOORBELL)
- return isGFX9(STI) || isGFX10(STI);
+ return isGFX9Plus(STI);
else
return isValidMsgId(MsgId);
} else {
@@ -919,8 +1171,12 @@ bool isShader(CallingConv::ID cc) {
}
}
+bool isGraphics(CallingConv::ID cc) {
+ return isShader(cc) || cc == CallingConv::AMDGPU_Gfx;
+}
+
bool isCompute(CallingConv::ID cc) {
- return !isShader(cc) || cc == CallingConv::AMDGPU_CS;
+ return !isGraphics(cc) || cc == CallingConv::AMDGPU_CS;
}
bool isEntryFunctionCC(CallingConv::ID CC) {
@@ -940,6 +1196,15 @@ bool isEntryFunctionCC(CallingConv::ID CC) {
}
}
+bool isModuleEntryFunctionCC(CallingConv::ID CC) {
+ switch (CC) {
+ case CallingConv::AMDGPU_Gfx:
+ return true;
+ default:
+ return isEntryFunctionCC(CC);
+ }
+}
+
bool hasXNACK(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureXNACK];
}
@@ -980,10 +1245,16 @@ bool isGFX9(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX9];
}
+bool isGFX9Plus(const MCSubtargetInfo &STI) {
+ return isGFX9(STI) || isGFX10Plus(STI);
+}
+
bool isGFX10(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX10];
}
+bool isGFX10Plus(const MCSubtargetInfo &STI) { return isGFX10(STI); }
+
bool isGCN3Encoding(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding];
}
@@ -1017,46 +1288,46 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) {
CASE_CI_VI(FLAT_SCR) \
CASE_CI_VI(FLAT_SCR_LO) \
CASE_CI_VI(FLAT_SCR_HI) \
- CASE_VI_GFX9_GFX10(TTMP0) \
- CASE_VI_GFX9_GFX10(TTMP1) \
- CASE_VI_GFX9_GFX10(TTMP2) \
- CASE_VI_GFX9_GFX10(TTMP3) \
- CASE_VI_GFX9_GFX10(TTMP4) \
- CASE_VI_GFX9_GFX10(TTMP5) \
- CASE_VI_GFX9_GFX10(TTMP6) \
- CASE_VI_GFX9_GFX10(TTMP7) \
- CASE_VI_GFX9_GFX10(TTMP8) \
- CASE_VI_GFX9_GFX10(TTMP9) \
- CASE_VI_GFX9_GFX10(TTMP10) \
- CASE_VI_GFX9_GFX10(TTMP11) \
- CASE_VI_GFX9_GFX10(TTMP12) \
- CASE_VI_GFX9_GFX10(TTMP13) \
- CASE_VI_GFX9_GFX10(TTMP14) \
- CASE_VI_GFX9_GFX10(TTMP15) \
- CASE_VI_GFX9_GFX10(TTMP0_TTMP1) \
- CASE_VI_GFX9_GFX10(TTMP2_TTMP3) \
- CASE_VI_GFX9_GFX10(TTMP4_TTMP5) \
- CASE_VI_GFX9_GFX10(TTMP6_TTMP7) \
- CASE_VI_GFX9_GFX10(TTMP8_TTMP9) \
- CASE_VI_GFX9_GFX10(TTMP10_TTMP11) \
- CASE_VI_GFX9_GFX10(TTMP12_TTMP13) \
- CASE_VI_GFX9_GFX10(TTMP14_TTMP15) \
- CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3) \
- CASE_VI_GFX9_GFX10(TTMP4_TTMP5_TTMP6_TTMP7) \
- CASE_VI_GFX9_GFX10(TTMP8_TTMP9_TTMP10_TTMP11) \
- CASE_VI_GFX9_GFX10(TTMP12_TTMP13_TTMP14_TTMP15) \
- CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \
- CASE_VI_GFX9_GFX10(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \
- CASE_VI_GFX9_GFX10(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
- CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
+ CASE_VI_GFX9PLUS(TTMP0) \
+ CASE_VI_GFX9PLUS(TTMP1) \
+ CASE_VI_GFX9PLUS(TTMP2) \
+ CASE_VI_GFX9PLUS(TTMP3) \
+ CASE_VI_GFX9PLUS(TTMP4) \
+ CASE_VI_GFX9PLUS(TTMP5) \
+ CASE_VI_GFX9PLUS(TTMP6) \
+ CASE_VI_GFX9PLUS(TTMP7) \
+ CASE_VI_GFX9PLUS(TTMP8) \
+ CASE_VI_GFX9PLUS(TTMP9) \
+ CASE_VI_GFX9PLUS(TTMP10) \
+ CASE_VI_GFX9PLUS(TTMP11) \
+ CASE_VI_GFX9PLUS(TTMP12) \
+ CASE_VI_GFX9PLUS(TTMP13) \
+ CASE_VI_GFX9PLUS(TTMP14) \
+ CASE_VI_GFX9PLUS(TTMP15) \
+ CASE_VI_GFX9PLUS(TTMP0_TTMP1) \
+ CASE_VI_GFX9PLUS(TTMP2_TTMP3) \
+ CASE_VI_GFX9PLUS(TTMP4_TTMP5) \
+ CASE_VI_GFX9PLUS(TTMP6_TTMP7) \
+ CASE_VI_GFX9PLUS(TTMP8_TTMP9) \
+ CASE_VI_GFX9PLUS(TTMP10_TTMP11) \
+ CASE_VI_GFX9PLUS(TTMP12_TTMP13) \
+ CASE_VI_GFX9PLUS(TTMP14_TTMP15) \
+ CASE_VI_GFX9PLUS(TTMP0_TTMP1_TTMP2_TTMP3) \
+ CASE_VI_GFX9PLUS(TTMP4_TTMP5_TTMP6_TTMP7) \
+ CASE_VI_GFX9PLUS(TTMP8_TTMP9_TTMP10_TTMP11) \
+ CASE_VI_GFX9PLUS(TTMP12_TTMP13_TTMP14_TTMP15) \
+ CASE_VI_GFX9PLUS(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \
+ CASE_VI_GFX9PLUS(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \
+ CASE_VI_GFX9PLUS(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
+ CASE_VI_GFX9PLUS(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
}
#define CASE_CI_VI(node) \
assert(!isSI(STI)); \
case node: return isCI(STI) ? node##_ci : node##_vi;
-#define CASE_VI_GFX9_GFX10(node) \
- case node: return (isGFX9(STI) || isGFX10(STI)) ? node##_gfx9_gfx10 : node##_vi;
+#define CASE_VI_GFX9PLUS(node) \
+ case node: return isGFX9Plus(STI) ? node##_gfx9plus : node##_vi;
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
if (STI.getTargetTriple().getArch() == Triple::r600)
@@ -1065,17 +1336,17 @@ unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
}
#undef CASE_CI_VI
-#undef CASE_VI_GFX9_GFX10
+#undef CASE_VI_GFX9PLUS
#define CASE_CI_VI(node) case node##_ci: case node##_vi: return node;
-#define CASE_VI_GFX9_GFX10(node) case node##_vi: case node##_gfx9_gfx10: return node;
+#define CASE_VI_GFX9PLUS(node) case node##_vi: case node##_gfx9plus: return node;
unsigned mc2PseudoReg(unsigned Reg) {
MAP_REG2REG
}
#undef CASE_CI_VI
-#undef CASE_VI_GFX9_GFX10
+#undef CASE_VI_GFX9PLUS
#undef MAP_REG2REG
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) {
@@ -1311,6 +1582,7 @@ bool isArgPassedInSGPR(const Argument *A) {
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_Gfx:
// For non-compute shaders, SGPR inputs are marked with either inreg or byval.
// Everything else is in VGPRs.
return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
@@ -1322,11 +1594,11 @@ bool isArgPassedInSGPR(const Argument *A) {
}
static bool hasSMEMByteOffset(const MCSubtargetInfo &ST) {
- return isGCN3Encoding(ST) || isGFX10(ST);
+ return isGCN3Encoding(ST) || isGFX10Plus(ST);
}
static bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) {
- return isGFX9(ST) || isGFX10(ST);
+ return isGFX9Plus(ST);
}
bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
@@ -1382,6 +1654,14 @@ Optional<int64_t> getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST,
return isUInt<32>(EncodedOffset) ? Optional<int64_t>(EncodedOffset) : None;
}
+unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST, bool Signed) {
+ // Address offset is 12-bit signed for GFX10, 13-bit for GFX9.
+ if (AMDGPU::isGFX10(ST))
+ return Signed ? 12 : 11;
+
+ return Signed ? 13 : 12;
+}
+
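
A quick sanity check on the bit counts returned above, assuming an ordinary two's-complement range: 13 signed bits (GFX9) give flat offsets in [-4096, 4095], 12 signed bits (GFX10) give [-2048, 2047], and the unsigned variants have one bit less.

#include <cassert>
#include <cstdint>

int64_t maxFlatOffset(unsigned NumBits, bool Signed) {
  return Signed ? (int64_t(1) << (NumBits - 1)) - 1
                : (int64_t(1) << NumBits) - 1;
}

int main() {
  assert(maxFlatOffset(13, true) == 4095);      // GFX9, signed
  assert(maxFlatOffset(12, true) == 2047);      // GFX10, signed
  assert(maxFlatOffset(11, false) == 2047);     // GFX10, unsigned (11 bits)
}
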
// Given Imm, split it into the values to put into the SOffset and ImmOffset
// fields in an MUBUF instruction. Return false if it is not possible (due to a
// hardware bug needing a workaround).
@@ -1483,7 +1763,7 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
uint8_t NumComponents,
uint8_t NumFormat,
const MCSubtargetInfo &STI) {
- return isGFX10(STI)
+ return isGFX10Plus(STI)
? getGfx10PlusBufferFormatInfo(BitsPerComp, NumComponents,
NumFormat)
: getGfx9BufferFormatInfo(BitsPerComp, NumComponents, NumFormat);
@@ -1491,9 +1771,29 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format,
const MCSubtargetInfo &STI) {
- return isGFX10(STI) ? getGfx10PlusBufferFormatInfo(Format)
- : getGfx9BufferFormatInfo(Format);
+ return isGFX10Plus(STI) ? getGfx10PlusBufferFormatInfo(Format)
+ : getGfx9BufferFormatInfo(Format);
}
} // namespace AMDGPU
+
+raw_ostream &operator<<(raw_ostream &OS,
+ const AMDGPU::IsaInfo::TargetIDSetting S) {
+ switch (S) {
+ case (AMDGPU::IsaInfo::TargetIDSetting::Unsupported):
+ OS << "Unsupported";
+ break;
+ case (AMDGPU::IsaInfo::TargetIDSetting::Any):
+ OS << "Any";
+ break;
+ case (AMDGPU::IsaInfo::TargetIDSetting::Off):
+ OS << "Off";
+ break;
+ case (AMDGPU::IsaInfo::TargetIDSetting::On):
+ OS << "On";
+ break;
+ }
+ return OS;
+}
+
} // namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 26bb77f4b4c7..f9378693cf48 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -9,22 +9,15 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
-#include "AMDGPU.h"
-#include "AMDKernelCodeT.h"
#include "SIDefines.h"
#include "llvm/IR/CallingConv.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/Alignment.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetParser.h"
-#include <cstdint>
-#include <string>
-#include <utility>
+
+struct amd_kernel_code_t;
namespace llvm {
+struct Align;
class Argument;
class Function;
class GCNSubtarget;
@@ -35,8 +28,23 @@ class MCSubtargetInfo;
class StringRef;
class Triple;
+namespace amdhsa {
+struct kernel_descriptor_t;
+}
+
namespace AMDGPU {
+struct IsaVersion;
+
+/// \returns HSA OS ABI Version identification.
+Optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI);
+/// \returns True if HSA OS ABI Version identification is 2,
+/// false otherwise.
+bool isHsaAbiVersion2(const MCSubtargetInfo *STI);
+/// \returns True if HSA OS ABI Version identification is 3,
+/// false otherwise.
+bool isHsaAbiVersion3(const MCSubtargetInfo *STI);
+
struct GcnBufferFormatInfo {
unsigned Format;
unsigned BitsPerComp;
@@ -61,13 +69,87 @@ enum {
TRAP_NUM_SGPRS = 16
};
+enum class TargetIDSetting {
+ Unsupported,
+ Any,
+ Off,
+ On
+};
+
+class AMDGPUTargetID {
+private:
+ TargetIDSetting XnackSetting;
+ TargetIDSetting SramEccSetting;
+
+public:
+ explicit AMDGPUTargetID(const MCSubtargetInfo &STI);
+ ~AMDGPUTargetID() = default;
+
+ /// \return True if the current xnack setting is not "Unsupported".
+ bool isXnackSupported() const {
+ return XnackSetting != TargetIDSetting::Unsupported;
+ }
+
+ /// \returns True if the current xnack setting is "On" or "Any".
+ bool isXnackOnOrAny() const {
+ return XnackSetting == TargetIDSetting::On ||
+ XnackSetting == TargetIDSetting::Any;
+ }
+
+ /// \returns True if current xnack setting is "On" or "Off",
+ /// false otherwise.
+ bool isXnackOnOrOff() const {
+ return getXnackSetting() == TargetIDSetting::On ||
+ getXnackSetting() == TargetIDSetting::Off;
+ }
+
+ /// \returns The current xnack TargetIDSetting, possible options are
+ /// "Unsupported", "Any", "Off", and "On".
+ TargetIDSetting getXnackSetting() const {
+ return XnackSetting;
+ }
+
+ /// Sets xnack setting to \p NewXnackSetting.
+ void setXnackSetting(TargetIDSetting NewXnackSetting) {
+ XnackSetting = NewXnackSetting;
+ }
+
+ /// \return True if the current sramecc setting is not "Unsupported".
+ bool isSramEccSupported() const {
+ return SramEccSetting != TargetIDSetting::Unsupported;
+ }
+
+ /// \returns True if the current sramecc setting is "On" or "Any".
+ bool isSramEccOnOrAny() const {
+ return SramEccSetting == TargetIDSetting::On ||
+ SramEccSetting == TargetIDSetting::Any;
+ }
+
+ /// \returns True if current sramecc setting is "On" or "Off",
+ /// false otherwise.
+ bool isSramEccOnOrOff() const {
+ return getSramEccSetting() == TargetIDSetting::On ||
+ getSramEccSetting() == TargetIDSetting::Off;
+ }
+
+ /// \returns The current sramecc TargetIDSetting, possible options are
+ /// "Unsupported", "Any", "Off", and "On".
+ TargetIDSetting getSramEccSetting() const {
+ return SramEccSetting;
+ }
+
+ /// Sets sramecc setting to \p NewSramEccSetting.
+ void setSramEccSetting(TargetIDSetting NewSramEccSetting) {
+ SramEccSetting = NewSramEccSetting;
+ }
+
+ void setTargetIDFromFeaturesString(StringRef FS);
+ void setTargetIDFromTargetIDStream(StringRef TargetID);
+};
+
/// Streams isa version string for given subtarget \p STI into \p Stream.
void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream);
-/// \returns True if given subtarget \p STI supports code object version 3,
-/// false otherwise.
-bool hasCodeObjectV3(const MCSubtargetInfo *STI);
-
/// \returns Wavefront size for given subtarget \p STI.
unsigned getWavefrontSize(const MCSubtargetInfo *STI);
@@ -368,8 +450,8 @@ struct Waitcnt {
Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt, unsigned VsCnt)
: VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt), VsCnt(VsCnt) {}
- static Waitcnt allZero(const IsaVersion &Version) {
- return Waitcnt(0, 0, 0, Version.Major >= 10 ? 0 : ~0u);
+ static Waitcnt allZero(bool HasVscnt) {
+ return Waitcnt(0, 0, 0, HasVscnt ? 0 : ~0u);
}
static Waitcnt allZeroExceptVsCnt() { return Waitcnt(0, 0, 0, ~0u); }
@@ -482,6 +564,51 @@ void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width);
} // namespace Hwreg
+namespace Exp {
+
+bool getTgtName(unsigned Id, StringRef &Name, int &Index);
+
+LLVM_READONLY
+unsigned getTgtId(const StringRef Name);
+
+LLVM_READNONE
+bool isSupportedTgtId(unsigned Id, const MCSubtargetInfo &STI);
+
+} // namespace Exp
+
+namespace MTBUFFormat {
+
+LLVM_READNONE
+int64_t encodeDfmtNfmt(unsigned Dfmt, unsigned Nfmt);
+
+void decodeDfmtNfmt(unsigned Format, unsigned &Dfmt, unsigned &Nfmt);
+
+int64_t getDfmt(const StringRef Name);
+
+StringRef getDfmtName(unsigned Id);
+
+int64_t getNfmt(const StringRef Name, const MCSubtargetInfo &STI);
+
+StringRef getNfmtName(unsigned Id, const MCSubtargetInfo &STI);
+
+bool isValidDfmtNfmt(unsigned Val, const MCSubtargetInfo &STI);
+
+bool isValidNfmt(unsigned Val, const MCSubtargetInfo &STI);
+
+int64_t getUnifiedFormat(const StringRef Name);
+
+StringRef getUnifiedFormatName(unsigned Id);
+
+bool isValidUnifiedFormat(unsigned Val);
+
+int64_t convertDfmtNfmt2Ufmt(unsigned Dfmt, unsigned Nfmt);
+
+bool isValidFormatEncoding(unsigned Val, const MCSubtargetInfo &STI);
+
+unsigned getDefaultFormatEncoding(const MCSubtargetInfo &STI);
+
+} // namespace MTBUFFormat
+
namespace SendMsg {
LLVM_READONLY
@@ -530,11 +657,23 @@ LLVM_READNONE
bool isShader(CallingConv::ID CC);
LLVM_READNONE
+bool isGraphics(CallingConv::ID CC);
+
+LLVM_READNONE
bool isCompute(CallingConv::ID CC);
LLVM_READNONE
bool isEntryFunctionCC(CallingConv::ID CC);
+// These functions are considered entry points into the current module, i.e. they
+// are allowed to be called from outside the current module. This is different
+// from isEntryFunctionCC, which is only true for functions that are entered by
+// the hardware. Module entry points include all entry functions but also
+// include functions that can be called from other functions inside or outside
+// the current module. Module entry functions are allowed to allocate LDS.
+LLVM_READNONE
+bool isModuleEntryFunctionCC(CallingConv::ID CC);
+
// FIXME: Remove this when calling conventions cleaned up
LLVM_READNONE
inline bool isKernel(CallingConv::ID CC) {
@@ -558,7 +697,9 @@ bool isSI(const MCSubtargetInfo &STI);
bool isCI(const MCSubtargetInfo &STI);
bool isVI(const MCSubtargetInfo &STI);
bool isGFX9(const MCSubtargetInfo &STI);
+bool isGFX9Plus(const MCSubtargetInfo &STI);
bool isGFX10(const MCSubtargetInfo &STI);
+bool isGFX10Plus(const MCSubtargetInfo &STI);
bool isGCN3Encoding(const MCSubtargetInfo &STI);
bool isGFX10_BEncoding(const MCSubtargetInfo &STI);
bool hasGFX10_3Insts(const MCSubtargetInfo &STI);
@@ -690,6 +831,13 @@ Optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST,
Optional<int64_t> getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST,
int64_t ByteOffset);
+/// For a FLAT segment the offset must be positive;
+/// the MSB is ignored and forced to zero.
+///
+/// \return The number of bits available for the offset field in flat
+/// instructions.
+unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST, bool Signed);
+
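A hedged helper built only on the bit count returned by getNumFlatOffsetBits(); how the AMDGPU encoding actually uses those bits (the positivity/MSB rules noted above) is defined by the backend and not modelled here:

#include <cstdint>

// Sketch only: the widest unsigned byte offset an N-bit immediate field
// can hold.
static int64_t maxUnsignedFlatOffset(unsigned NumBits) {
  return (int64_t(1) << NumBits) - 1;
}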
/// \returns true if this offset is small enough to fit in the SMRD
/// offset field. \p ByteOffset should be the offset in bytes and
/// not the encoded offset.
@@ -735,10 +883,8 @@ struct SIModeRegisterDefaults {
SIModeRegisterDefaults(const Function &F);
static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) {
- const bool IsCompute = AMDGPU::isCompute(CC);
-
SIModeRegisterDefaults Mode;
- Mode.IEEE = IsCompute;
+ Mode.IEEE = !AMDGPU::isShader(CC);
return Mode;
}
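The one-line change above makes IEEE mode the default for every calling convention that is not a shader, rather than only for compute. A hedged restatement with illustrative example conventions:

#include "AMDGPUBaseInfo.h"      // illustrative include; provides isShader()
#include "llvm/IR/CallingConv.h"

// Sketch only: with the new rule, e.g. CallingConv::C defaults to IEEE mode
// while CallingConv::AMDGPU_PS does not.
static bool defaultIEEEFor(llvm::CallingConv::ID CC) {
  return !llvm::AMDGPU::isShader(CC);
}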
@@ -805,6 +951,10 @@ struct SIModeRegisterDefaults {
};
} // end namespace AMDGPU
+
+raw_ostream &operator<<(raw_ostream &OS,
+ const AMDGPU::IsaInfo::TargetIDSetting S);
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index ef010a7ac157..b7dd757a8af3 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -15,12 +15,10 @@
//
#include "AMDGPUPALMetadata.h"
-#include "AMDGPU.h"
-#include "AMDGPUAsmPrinter.h"
-#include "MCTargetDesc/AMDGPUTargetStreamer.h"
+#include "AMDGPUPTNote.h"
#include "SIDefines.h"
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/IR/CallingConv.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/AMDGPUMetadata.h"
@@ -45,8 +43,11 @@ void AMDGPUPALMetadata::readFromIR(Module &M) {
}
BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA;
NamedMD = M.getNamedMetadata("amdgpu.pal.metadata");
- if (!NamedMD || !NamedMD->getNumOperands())
+ if (!NamedMD || !NamedMD->getNumOperands()) {
+ // Emit msgpack metadata by default
+ BlobType = ELF::NT_AMDGPU_METADATA;
return;
+ }
// This is the old reg=value pair format for metadata. It is a NamedMD
// containing an MDTuple containing a number of MDNodes each of which is an
// integer value, and each two integer values forms a key=value pair that we
@@ -235,6 +236,13 @@ void AMDGPUPALMetadata::setScratchSize(CallingConv::ID CC, unsigned Val) {
getHwStage(CC)[".scratch_memory_size"] = MsgPackDoc.getNode(Val);
}
+// Set the stack frame size of a function in the metadata.
+void AMDGPUPALMetadata::setFunctionScratchSize(const MachineFunction &MF,
+ unsigned Val) {
+ auto Node = getShaderFunction(MF.getFunction().getName());
+ Node[".stack_frame_size_in_bytes"] = MsgPackDoc.getNode(Val);
+}
+
// Set the hardware register bit in PAL metadata to enable wave32 on the
// shader of the given calling convention.
void AMDGPUPALMetadata::setWave32(unsigned CC) {
@@ -718,6 +726,30 @@ msgpack::MapDocNode AMDGPUPALMetadata::getRegisters() {
return Registers.getMap();
}
+// Reference (create if necessary) the node for the shader functions map.
+msgpack::DocNode &AMDGPUPALMetadata::refShaderFunctions() {
+ auto &N =
+ MsgPackDoc.getRoot()
+ .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")]
+ .getArray(/*Convert=*/true)[0]
+ .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".shader_functions")];
+ N.getMap(/*Convert=*/true);
+ return N;
+}
+
+// Get (create if necessary) the shader functions map.
+msgpack::MapDocNode AMDGPUPALMetadata::getShaderFunctions() {
+ if (ShaderFunctions.isEmpty())
+ ShaderFunctions = refShaderFunctions();
+ return ShaderFunctions.getMap();
+}
+
+// Get (create if necessary) a function in the shader functions map.
+msgpack::MapDocNode AMDGPUPALMetadata::getShaderFunction(StringRef Name) {
+ auto Functions = getShaderFunctions();
+ return Functions[Name].getMap(/*Convert=*/true);
+}
+
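The helpers above walk a fixed path in the msgpack document. A hedged sketch of the same walk, showing where setFunctionScratchSize() stores its value; it uses only Document/DocNode calls that already appear in this hunk:

#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/MsgPackDocument.h"

// Sketch only: the value lands at
//   amdpal.pipelines[0][".shader_functions"][<fn name>][".stack_frame_size_in_bytes"]
static void recordStackFrameSize(llvm::msgpack::Document &Doc,
                                 llvm::StringRef FnName, unsigned Bytes) {
  auto Fns = Doc.getRoot()
                 .getMap(/*Convert=*/true)[Doc.getNode("amdpal.pipelines")]
                 .getArray(/*Convert=*/true)[0]
                 .getMap(/*Convert=*/true)[Doc.getNode(".shader_functions")]
                 .getMap(/*Convert=*/true);
  auto Fn = Fns[FnName].getMap(/*Convert=*/true);
  Fn[".stack_frame_size_in_bytes"] = Doc.getNode(Bytes);
}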
// Return the PAL metadata hardware shader stage name.
static const char *getStageName(CallingConv::ID CC) {
switch (CC) {
@@ -733,6 +765,8 @@ static const char *getStageName(CallingConv::ID CC) {
return ".hs";
case CallingConv::AMDGPU_LS:
return ".ls";
+ case CallingConv::AMDGPU_Gfx:
+ llvm_unreachable("Callable shader has no hardware stage");
default:
return ".cs";
}
@@ -773,3 +807,9 @@ void AMDGPUPALMetadata::setLegacy() {
BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA;
}
+// Erase all PAL metadata.
+void AMDGPUPALMetadata::reset() {
+ MsgPackDoc.clear();
+ Registers = MsgPackDoc.getEmptyNode();
+ HwStages = MsgPackDoc.getEmptyNode();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
index 544ab669d9ae..8fa1f738487c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -13,11 +13,11 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H
-
#include "llvm/BinaryFormat/MsgPackDocument.h"
namespace llvm {
+class MachineFunction;
class Module;
class StringRef;
@@ -26,6 +26,7 @@ class AMDGPUPALMetadata {
msgpack::Document MsgPackDoc;
msgpack::DocNode Registers;
msgpack::DocNode HwStages;
+ msgpack::DocNode ShaderFunctions;
public:
// Read the amdgpu.pal.metadata supplied by the frontend, ready for
@@ -76,6 +77,9 @@ public:
// Set the scratch size in the metadata.
void setScratchSize(unsigned CC, unsigned Val);
+ // Set the stack frame size of a function in the metadata.
+ void setFunctionScratchSize(const MachineFunction &MF, unsigned Val);
+
// Set the hardware register bit in PAL metadata to enable wave32 on the
// shader of the given calling convention.
void setWave32(unsigned CC);
@@ -106,6 +110,9 @@ public:
// Set legacy PAL metadata format.
void setLegacy();
+ // Erase all PAL metadata.
+ void reset();
+
private:
// Return whether the blob type is legacy PAL metadata.
bool isLegacy() const;
@@ -116,6 +123,15 @@ private:
// Get (create if necessary) the registers map.
msgpack::MapDocNode getRegisters();
+ // Reference (create if necessary) the node for the shader functions map.
+ msgpack::DocNode &refShaderFunctions();
+
+ // Get (create if necessary) the shader functions map.
+ msgpack::MapDocNode getShaderFunctions();
+
+ // Get (create if necessary) a function in the shader functions map.
+ msgpack::MapDocNode getShaderFunction(StringRef Name);
+
// Get (create if necessary) the .hardware_stages entry for the given calling
// convention.
msgpack::MapDocNode getHwStage(unsigned CC);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
index 443e2cc45ac0..45eb6c321476 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "AMDKernelCodeTUtils.h"
+#include "AMDKernelCodeT.h"
#include "SIDefines.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringMap.h"
@@ -18,9 +19,6 @@
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-#include <cstdint>
-#include <utility>
using namespace llvm;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
index a87325a78df3..41d0e0d745e5 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
@@ -13,7 +13,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDKERNELCODETUTILS_H
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDKERNELCODETUTILS_H
-#include "AMDKernelCodeT.h"
+struct amd_kernel_code_t;
namespace llvm {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 17f334f62a30..f1e470031982 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -138,7 +138,6 @@ class VOPProfileI2F<ValueType dstVt, ValueType srcVt> :
let HasModifiers = 0;
let HasClamp = 1;
- let HasOMod = 1;
}
def VOP1_F64_I32 : VOPProfileI2F <f64, i32>;
@@ -242,25 +241,25 @@ defm V_CEIL_F32 : VOP1Inst <"v_ceil_f32", VOP_F32_F32, fceil>;
defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, frint>;
defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>;
-let SchedRW = [WriteTrans32] in {
+let TRANS = 1, SchedRW = [WriteTrans32] in {
defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>;
defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, flog2>;
defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>;
defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32, AMDGPUrcp_iflag>;
defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>;
defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, any_amdgcn_sqrt>;
-} // End SchedRW = [WriteTrans32]
+} // End TRANS = 1, SchedRW = [WriteTrans32]
-let SchedRW = [WriteTrans64] in {
+let TRANS = 1, SchedRW = [WriteTrans64] in {
defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>;
defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>;
defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, any_amdgcn_sqrt>;
-} // End SchedRW = [WriteTrans64]
+} // End TRANS = 1, SchedRW = [WriteTrans64]
-let SchedRW = [WriteTrans32] in {
+let TRANS = 1, SchedRW = [WriteTrans32] in {
defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>;
defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>;
-} // End SchedRW = [WriteTrans32]
+} // End TRANS = 1, SchedRW = [WriteTrans32]
defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>;
defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32, bitreverse>;
@@ -338,10 +337,8 @@ defm V_MOVRELS_B32 : VOP1Inst <"v_movrels_b32", VOP_MOVRELS>;
defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_MOVRELSD>;
} // End Uses = [M0, EXEC]
-defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>;
-
let SubtargetPredicate = isGFX6GFX7 in {
- let SchedRW = [WriteTrans32] in {
+ let TRANS = 1, SchedRW = [WriteTrans32] in {
defm V_LOG_CLAMP_F32 :
VOP1Inst<"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>;
defm V_RCP_CLAMP_F32 :
@@ -352,7 +349,7 @@ let SubtargetPredicate = isGFX6GFX7 in {
VOP1Inst<"v_rsq_clamp_f32", VOP_F32_F32, AMDGPUrsq_clamp>;
defm V_RSQ_LEGACY_F32 :
VOP1Inst<"v_rsq_legacy_f32", VOP_F32_F32, int_amdgcn_rsq_legacy>;
- } // End SchedRW = [WriteTrans32]
+ } // End TRANS = 1, SchedRW = [WriteTrans32]
let SchedRW = [WriteDouble] in {
defm V_RCP_CLAMP_F64 :
@@ -363,10 +360,10 @@ let SubtargetPredicate = isGFX6GFX7 in {
} // End SubtargetPredicate = isGFX6GFX7
let SubtargetPredicate = isGFX7GFX8GFX9 in {
- let SchedRW = [WriteTrans32] in {
+ let TRANS = 1, SchedRW = [WriteTrans32] in {
defm V_LOG_LEGACY_F32 : VOP1Inst<"v_log_legacy_f32", VOP_F32_F32>;
defm V_EXP_LEGACY_F32 : VOP1Inst<"v_exp_legacy_f32", VOP_F32_F32>;
- } // End SchedRW = [WriteTrans32]
+ } // End TRANS = 1, SchedRW = [WriteTrans32]
} // End SubtargetPredicate = isGFX7GFX8GFX9
let SubtargetPredicate = isGFX7Plus in {
@@ -386,7 +383,7 @@ defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
} // End FPDPRounding = 1
defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
-let SchedRW = [WriteTrans32] in {
+let TRANS = 1, SchedRW = [WriteTrans32] in {
defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>;
defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>;
@@ -394,7 +391,7 @@ defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16, flog2>;
defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16, fexp2>;
defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
-} // End SchedRW = [WriteTrans32]
+} // End TRANS = 1, SchedRW = [WriteTrans32]
defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16, int_amdgcn_frexp_exp>;
defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>;
@@ -650,7 +647,6 @@ defm V_CVT_F32_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x005>;
defm V_CVT_F32_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x006>;
defm V_CVT_U32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x007>;
defm V_CVT_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x008>;
-defm V_MOV_FED_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x009>;
defm V_CVT_F16_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00a>;
defm V_CVT_F32_F16 : VOP1_Real_gfx6_gfx7_gfx10<0x00b>;
defm V_CVT_RPI_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00c>;
@@ -754,7 +750,6 @@ defm V_CVT_F32_I32 : VOP1_Real_vi <0x5>;
defm V_CVT_F32_U32 : VOP1_Real_vi <0x6>;
defm V_CVT_U32_F32 : VOP1_Real_vi <0x7>;
defm V_CVT_I32_F32 : VOP1_Real_vi <0x8>;
-defm V_MOV_FED_B32 : VOP1_Real_vi <0x9>;
defm V_CVT_F16_F32 : VOP1_Real_vi <0xa>;
defm V_CVT_F32_F16 : VOP1_Real_vi <0xb>;
defm V_CVT_RPI_I32_F32 : VOP1_Real_vi <0xc>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index aa37dbf1418f..7a334eaadaed 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -92,6 +92,7 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> :
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
+ let OtherPredicates = ps.OtherPredicates;
let AsmMatchConverter = ps.AsmMatchConverter;
let AsmVariantName = ps.AsmVariantName;
let Constraints = ps.Constraints;
@@ -240,12 +241,16 @@ multiclass VOP2eInst <string opName,
}
}
-class VOP2eInstAlias <VOP2_Pseudo ps, Instruction inst, string opnd> :
+class VOP2eInstAlias <VOP2_Pseudo ps, Instruction inst, string opnd = ""> :
InstAlias <ps.OpName#" "#ps.Pfl.Asm32#", "#opnd,
(inst ps.Pfl.DstRC:$vdst, ps.Pfl.Src0RC32:$src0,
- ps.Pfl.Src1RC32:$src1)>,
- PredicateControl {
-}
+ ps.Pfl.Src1RC32:$src1)>, PredicateControl;
+
+class VOP2e64InstAlias <VOP3_Pseudo ps, Instruction inst> :
+ InstAlias <ps.OpName#" "#ps.Pfl.Asm64,
+ (inst ps.Pfl.DstRC:$vdst, VOPDstS64orS32:$sdst,
+ ps.Pfl.Src0RC32:$src0, ps.Pfl.Src1RC32:$src1, clampmod:$clamp)>,
+ PredicateControl;
multiclass VOP2eInstAliases<VOP2_Pseudo ps, VOP2_Real inst> {
let WaveSizePredicate = isWave32 in {
@@ -328,11 +333,12 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v
def VOP_MAC_F16 : VOP_MAC <f16>;
def VOP_MAC_F32 : VOP_MAC <f32>;
+let HasExtDPP = 0 in
+def VOP_MAC_LEGACY_F32 : VOP_MAC <f32>;
class VOP_DOT_ACC<ValueType vt0, ValueType vt1> : VOP_MAC<vt0, vt1> {
let HasClamp = 0;
let HasExtSDWA = 0;
- let HasModifiers = 1;
let HasOpSel = 0;
let IsPacked = 0;
}
@@ -341,7 +347,11 @@ def VOP_DOT_ACC_F32_V2F16 : VOP_DOT_ACC<f32, v2f16> {
let Src0ModDPP = FPVRegInputMods;
let Src1ModDPP = FPVRegInputMods;
}
-def VOP_DOT_ACC_I32_I32 : VOP_DOT_ACC<i32, i32>;
+
+def VOP_DOT_ACC_I32_I32 : VOP_DOT_ACC<i32, i32> {
+ let HasSrc0Mods = 1;
+ let HasSrc1Mods = 1;
+}
// Write out to vcc or arbitrary SGPR.
def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped], 0, /*EnableClamp=*/1> {
@@ -361,8 +371,8 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped], 0, /*EnableClamp
def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=*/1> {
let Asm32 = "$vdst, vcc, $src0, $src1, vcc";
let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp";
- let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
- let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+ let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+ let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
let AsmDPP = "$vdst, vcc, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
let AsmDPP8 = "$vdst, vcc, $src0, $src1, vcc $dpp8$fi";
let AsmDPP16 = AsmDPP#"$fi";
@@ -396,8 +406,8 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=*
def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], /*EnableF32SrcMods=*/1> {
let Asm32 = "$vdst, $src0, $src1";
let Asm64 = "$vdst, $src0_modifiers, $src1_modifiers, $src2";
- let AsmSDWA = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
- let AsmSDWA9 = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+ let AsmSDWA = "$vdst, $src0_modifiers, $src1_modifiers, vcc$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+ let AsmSDWA9 = "$vdst, $src0_modifiers, $src1_modifiers, vcc$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
let AsmDPP = "$vdst, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
let AsmDPP8 = "$vdst, $src0, $src1, vcc $dpp8$fi";
let AsmDPP16 = AsmDPP#"$fi";
@@ -468,7 +478,7 @@ def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>;
let isCommutable = 1 in {
defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, any_fadd>;
-defm V_SUB_F32 : VOP2Inst <"v_sub_f32", VOP_F32_F32_F32, fsub>;
+defm V_SUB_F32 : VOP2Inst <"v_sub_f32", VOP_F32_F32_F32, any_fsub>;
defm V_SUBREV_F32 : VOP2Inst <"v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub_f32">;
defm V_MUL_LEGACY_F32 : VOP2Inst <"v_mul_legacy_f32", VOP_F32_F32_F32, AMDGPUfmul_legacy>;
defm V_MUL_F32 : VOP2Inst <"v_mul_f32", VOP_F32_F32_F32, any_fmul>;
@@ -490,24 +500,25 @@ defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, or>;
defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>;
let mayRaiseFPException = 0 in {
-let SubtargetPredicate = HasMadMacF32Insts in {
+let OtherPredicates = [HasMadMacF32Insts] in {
let Constraints = "$vdst = $src2", DisableEncoding="$src2",
isConvertibleToThreeAddress = 1 in {
defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>;
-}
+
+let SubtargetPredicate = isGFX6GFX7GFX10 in
+defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_MAC_LEGACY_F32>;
+} // End Constraints = "$vdst = $src2", DisableEncoding="$src2",
+ // isConvertibleToThreeAddress = 1
def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, []>;
-} // End SubtargetPredicate = HasMadMacF32Insts
-}
+} // End OtherPredicates = [HasMadMacF32Insts]
+} // End mayRaiseFPException = 0
// No patterns so that the scalar instructions are always selected.
// The scalar versions will be replaced with vector when needed later.
-
-// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 where renamed to *_U32 in VI,
-// but the VI instructions behave the same as the SI versions.
-defm V_ADD_I32 : VOP2bInst <"v_add_i32", VOP2b_I32_I1_I32_I32, null_frag, "v_add_i32", 1>;
-defm V_SUB_I32 : VOP2bInst <"v_sub_i32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32", 1>;
-defm V_SUBREV_I32 : VOP2bInst <"v_subrev_i32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32", 1>;
+defm V_ADD_CO_U32 : VOP2bInst <"v_add_co_u32", VOP2b_I32_I1_I32_I32, null_frag, "v_add_co_u32", 1>;
+defm V_SUB_CO_U32 : VOP2bInst <"v_sub_co_u32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_co_u32", 1>;
+defm V_SUBREV_CO_U32 : VOP2bInst <"v_subrev_co_u32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_co_u32", 1>;
defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_addc_u32", 1>;
defm V_SUBB_U32 : VOP2bInst <"v_subb_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32", 1>;
defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32", 1>;
@@ -555,10 +566,6 @@ defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfma
} // End SubtargetPredicate = isGFX6GFX7
let isCommutable = 1 in {
-let SubtargetPredicate = isGFX6GFX7GFX10 in {
-let OtherPredicates = [HasMadMacF32Insts] in
-defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>;
-} // End SubtargetPredicate = isGFX6GFX7GFX10
let SubtargetPredicate = isGFX6GFX7 in {
defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, srl>;
defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, sra>;
@@ -595,8 +602,8 @@ let SubtargetPredicate = HasAddNoCarryInsts in {
}
let SubtargetPredicate = isGFX6GFX7GFX8GFX9, Predicates = [isGFX6GFX7GFX8GFX9] in {
-def : DivergentClampingBinOp<add, V_ADD_I32_e64>;
-def : DivergentClampingBinOp<sub, V_SUB_I32_e64>;
+def : DivergentClampingBinOp<add, V_ADD_CO_U32_e64>;
+def : DivergentClampingBinOp<sub, V_SUB_CO_U32_e64>;
}
def : DivergentBinOp<adde, V_ADDC_U32_e32>;
@@ -635,7 +642,7 @@ defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16, ashr_rev>;
let isCommutable = 1 in {
let FPDPRounding = 1 in {
defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, any_fadd>;
-defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>;
+defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, any_fsub>;
defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">;
defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, any_fmul>;
@@ -668,14 +675,23 @@ let SubtargetPredicate = HasDLInsts in {
defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32>;
let Constraints = "$vdst = $src2",
- DisableEncoding="$src2",
+ DisableEncoding = "$src2",
isConvertibleToThreeAddress = 1,
- isCommutable = 1 in {
+ isCommutable = 1 in
defm V_FMAC_F32 : VOP2Inst <"v_fmac_f32", VOP_MAC_F32>;
-}
} // End SubtargetPredicate = HasDLInsts
+let SubtargetPredicate = HasFmaLegacy32 in {
+
+let Constraints = "$vdst = $src2",
+ DisableEncoding = "$src2",
+ isConvertibleToThreeAddress = 1,
+ isCommutable = 1 in
+defm V_FMAC_LEGACY_F32 : VOP2Inst <"v_fmac_legacy_f32", VOP_MAC_LEGACY_F32>;
+
+} // End SubtargetPredicate = HasFmaLegacy32
+
let Constraints = "$vdst = $src2",
DisableEncoding="$src2",
isConvertibleToThreeAddress = 1,
@@ -827,6 +843,24 @@ def : GCNPat <
} // End Predicates = [Has16BitInsts]
+let SubtargetPredicate = HasIntClamp in {
+// Set clamp bit for saturation.
+def : VOPBinOpClampPat<uaddsat, V_ADD_CO_U32_e64, i32>;
+def : VOPBinOpClampPat<usubsat, V_SUB_CO_U32_e64, i32>;
+}
+
+let SubtargetPredicate = HasAddNoCarryInsts, OtherPredicates = [HasIntClamp] in {
+let AddedComplexity = 1 in { // Prefer over form with carry-out.
+def : VOPBinOpClampPat<uaddsat, V_ADD_U32_e64, i32>;
+def : VOPBinOpClampPat<usubsat, V_SUB_U32_e64, i32>;
+}
+}
+
+let SubtargetPredicate = Has16BitInsts, OtherPredicates = [HasIntClamp] in {
+def : VOPBinOpClampPat<uaddsat, V_ADD_U16_e64, i16>;
+def : VOPBinOpClampPat<usubsat, V_SUB_U16_e64, i16>;
+}
+
//===----------------------------------------------------------------------===//
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
@@ -854,6 +888,7 @@ class Base_VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps,
VOP2_DPP<op, ps, opName, p, 1> {
let AssemblerPredicate = HasDPP16;
let SubtargetPredicate = HasDPP16;
+ let OtherPredicates = ps.OtherPredicates;
}
class VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps,
@@ -880,6 +915,7 @@ class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps,
let AssemblerPredicate = HasDPP8;
let SubtargetPredicate = HasDPP8;
+ let OtherPredicates = ps.OtherPredicates;
}
//===----------------------------------------------------------------------===//
@@ -1090,13 +1126,10 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
}
//===---------------------------- VOP3beOnly ----------------------------===//
- multiclass VOP3beOnly_Real_gfx10<bits<10> op, string opName, string asmName> {
+ multiclass VOP3beOnly_Real_gfx10<bits<10> op> {
def _e64_gfx10 :
- VOP3_Real<!cast<VOP3_Pseudo>(opName#"_e64"), SIEncodingFamily.GFX10>,
- VOP3be_gfx10<op, !cast<VOP3_Pseudo>(opName#"_e64").Pfl> {
- VOP3_Pseudo Ps = !cast<VOP3_Pseudo>(opName#"_e64");
- let AsmString = asmName # Ps.AsmOperands;
- }
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
+ VOP3be_gfx10<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
@@ -1126,21 +1159,25 @@ multiclass VOP2_Real_gfx10_with_name<bits<6> op, string opName,
VOP2_Real_dpp_gfx10_with_name<op, opName, asmName>,
VOP2_Real_dpp8_gfx10_with_name<op, opName, asmName>;
-defm V_XNOR_B32 : VOP2_Real_gfx10<0x01e>;
-defm V_FMAC_F32 : VOP2_Real_gfx10<0x02b>;
-defm V_FMAMK_F32 : VOP2Only_Real_MADK_gfx10<0x02c>;
-defm V_FMAAK_F32 : VOP2Only_Real_MADK_gfx10<0x02d>;
-defm V_ADD_F16 : VOP2_Real_gfx10<0x032>;
-defm V_SUB_F16 : VOP2_Real_gfx10<0x033>;
-defm V_SUBREV_F16 : VOP2_Real_gfx10<0x034>;
-defm V_MUL_F16 : VOP2_Real_gfx10<0x035>;
-defm V_FMAC_F16 : VOP2_Real_gfx10<0x036>;
-defm V_FMAMK_F16 : VOP2Only_Real_MADK_gfx10<0x037>;
-defm V_FMAAK_F16 : VOP2Only_Real_MADK_gfx10<0x038>;
-defm V_MAX_F16 : VOP2_Real_gfx10<0x039>;
-defm V_MIN_F16 : VOP2_Real_gfx10<0x03a>;
-defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>;
-defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>;
+// NB: Same opcode as v_mac_legacy_f32
+let DecoderNamespace = "GFX10_B" in
+defm V_FMAC_LEGACY_F32 : VOP2_Real_gfx10<0x006>;
+
+defm V_XNOR_B32 : VOP2_Real_gfx10<0x01e>;
+defm V_FMAC_F32 : VOP2_Real_gfx10<0x02b>;
+defm V_FMAMK_F32 : VOP2Only_Real_MADK_gfx10<0x02c>;
+defm V_FMAAK_F32 : VOP2Only_Real_MADK_gfx10<0x02d>;
+defm V_ADD_F16 : VOP2_Real_gfx10<0x032>;
+defm V_SUB_F16 : VOP2_Real_gfx10<0x033>;
+defm V_SUBREV_F16 : VOP2_Real_gfx10<0x034>;
+defm V_MUL_F16 : VOP2_Real_gfx10<0x035>;
+defm V_FMAC_F16 : VOP2_Real_gfx10<0x036>;
+defm V_FMAMK_F16 : VOP2Only_Real_MADK_gfx10<0x037>;
+defm V_FMAAK_F16 : VOP2Only_Real_MADK_gfx10<0x038>;
+defm V_MAX_F16 : VOP2_Real_gfx10<0x039>;
+defm V_MIN_F16 : VOP2_Real_gfx10<0x03a>;
+defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>;
+defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>;
// VOP2 no carry-in, carry-out.
defm V_ADD_NC_U32 :
@@ -1172,13 +1209,10 @@ defm V_CVT_PKNORM_U16_F32 : VOP3Only_Real_gfx10<0x369>;
defm V_CVT_PK_U16_U32 : VOP3Only_Real_gfx10<0x36a>;
defm V_CVT_PK_I16_I32 : VOP3Only_Real_gfx10<0x36b>;
-// VOP3 carry-in, carry-out.
-defm V_ADD_CO_U32 :
- VOP3beOnly_Real_gfx10<0x30f, "V_ADD_I32", "v_add_co_u32">;
-defm V_SUB_CO_U32 :
- VOP3beOnly_Real_gfx10<0x310, "V_SUB_I32", "v_sub_co_u32">;
-defm V_SUBREV_CO_U32 :
- VOP3beOnly_Real_gfx10<0x319, "V_SUBREV_I32", "v_subrev_co_u32">;
+// VOP3 carry-out.
+defm V_ADD_CO_U32 : VOP3beOnly_Real_gfx10<0x30f>;
+defm V_SUB_CO_U32 : VOP3beOnly_Real_gfx10<0x310>;
+defm V_SUBREV_CO_U32 : VOP3beOnly_Real_gfx10<0x319>;
let SubtargetPredicate = isGFX10Plus in {
defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx10>;
@@ -1207,7 +1241,7 @@ class VOP2_DPPe <bits<6> op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
}
let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
- multiclass VOP2Only_Real_gfx6_gfx7<bits<6> op> {
+ multiclass VOP2_Lane_Real_gfx6_gfx7<bits<6> op> {
def _gfx6_gfx7 :
VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>,
VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
@@ -1217,20 +1251,20 @@ let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>,
VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
}
- multiclass VOP2_Real_e32_gfx6_gfx7<bits<6> op> {
+ multiclass VOP2_Real_e32_gfx6_gfx7<bits<6> op, string PseudoName = NAME> {
def _e32_gfx6_gfx7 :
- VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
- VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
+ VOP2_Real<!cast<VOP2_Pseudo>(PseudoName#"_e32"), SIEncodingFamily.SI>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(PseudoName#"_e32").Pfl>;
}
- multiclass VOP2_Real_e64_gfx6_gfx7<bits<6> op> {
+ multiclass VOP2_Real_e64_gfx6_gfx7<bits<6> op, string PseudoName = NAME> {
def _e64_gfx6_gfx7 :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
- VOP3e_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ VOP3_Real<!cast<VOP3_Pseudo>(PseudoName#"_e64"), SIEncodingFamily.SI>,
+ VOP3e_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(PseudoName#"_e64").Pfl>;
}
- multiclass VOP2be_Real_e64_gfx6_gfx7<bits<6> op> {
+ multiclass VOP2be_Real_e64_gfx6_gfx7<bits<6> op, string PseudoName = NAME> {
def _e64_gfx6_gfx7 :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
- VOP3be_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ VOP3_Real<!cast<VOP3_Pseudo>(PseudoName#"_e64"), SIEncodingFamily.SI>,
+ VOP3be_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(PseudoName#"_e64").Pfl>;
}
} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
@@ -1246,6 +1280,20 @@ multiclass VOP2_Real_gfx6_gfx7_gfx10<bits<6> op> :
multiclass VOP2be_Real_gfx6_gfx7<bits<6> op> :
VOP2_Real_e32_gfx6_gfx7<op>, VOP2be_Real_e64_gfx6_gfx7<op>;
+multiclass VOP2be_Real_gfx6_gfx7_with_name<bits<6> op,
+ string PseudoName, string asmName> {
+ defvar ps32 = !cast<VOP2_Pseudo>(PseudoName#"_e32");
+ defvar ps64 = !cast<VOP3_Pseudo>(PseudoName#"_e64");
+
+ let AsmString = asmName # ps32.AsmOperands in {
+ defm "" : VOP2_Real_e32_gfx6_gfx7<op, PseudoName>;
+ }
+
+ let AsmString = asmName # ps64.AsmOperands in {
+ defm "" : VOP2be_Real_e64_gfx6_gfx7<op, PseudoName>;
+ }
+}
+
defm V_CNDMASK_B32 : VOP2_Real_gfx6_gfx7<0x000>;
defm V_MIN_LEGACY_F32 : VOP2_Real_gfx6_gfx7<0x00d>;
defm V_MAX_LEGACY_F32 : VOP2_Real_gfx6_gfx7<0x00e>;
@@ -1262,27 +1310,36 @@ defm V_CVT_PKNORM_I16_F32 : VOP2_Real_gfx6_gfx7<0x02d>;
defm V_CVT_PKNORM_U16_F32 : VOP2_Real_gfx6_gfx7<0x02e>;
defm V_CVT_PK_U16_U32 : VOP2_Real_gfx6_gfx7<0x030>;
defm V_CVT_PK_I16_I32 : VOP2_Real_gfx6_gfx7<0x031>;
-defm V_ADD_I32 : VOP2be_Real_gfx6_gfx7<0x025>;
-defm V_SUB_I32 : VOP2be_Real_gfx6_gfx7<0x026>;
-defm V_SUBREV_I32 : VOP2be_Real_gfx6_gfx7<0x027>;
+
+// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 were renamed to *_U32 in
+// VI, but the VI instructions behave the same as the SI versions.
+defm V_ADD_I32 : VOP2be_Real_gfx6_gfx7_with_name<0x025, "V_ADD_CO_U32", "v_add_i32">;
+defm V_SUB_I32 : VOP2be_Real_gfx6_gfx7_with_name<0x026, "V_SUB_CO_U32", "v_sub_i32">;
+defm V_SUBREV_I32 : VOP2be_Real_gfx6_gfx7_with_name<0x027, "V_SUBREV_CO_U32", "v_subrev_i32">;
defm V_ADDC_U32 : VOP2be_Real_gfx6_gfx7<0x028>;
defm V_SUBB_U32 : VOP2be_Real_gfx6_gfx7<0x029>;
defm V_SUBBREV_U32 : VOP2be_Real_gfx6_gfx7<0x02a>;
-defm V_READLANE_B32 : VOP2Only_Real_gfx6_gfx7<0x001>;
+defm V_READLANE_B32 : VOP2_Lane_Real_gfx6_gfx7<0x001>;
let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in {
- defm V_WRITELANE_B32 : VOP2Only_Real_gfx6_gfx7<0x002>;
+ defm V_WRITELANE_B32 : VOP2_Lane_Real_gfx6_gfx7<0x002>;
} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in)
let SubtargetPredicate = isGFX6GFX7 in {
defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx6_gfx7>;
+ defm : VOP2eInstAliases<V_ADD_CO_U32_e32, V_ADD_I32_e32_gfx6_gfx7>;
+ defm : VOP2eInstAliases<V_SUB_CO_U32_e32, V_SUB_I32_e32_gfx6_gfx7>;
+ defm : VOP2eInstAliases<V_SUBREV_CO_U32_e32, V_SUBREV_I32_e32_gfx6_gfx7>;
+
+ def : VOP2e64InstAlias<V_ADD_CO_U32_e64, V_ADD_I32_e64_gfx6_gfx7>;
+ def : VOP2e64InstAlias<V_SUB_CO_U32_e64, V_SUB_I32_e64_gfx6_gfx7>;
+ def : VOP2e64InstAlias<V_SUBREV_CO_U32_e64, V_SUBREV_I32_e64_gfx6_gfx7>;
} // End SubtargetPredicate = isGFX6GFX7
defm V_ADD_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x003>;
defm V_SUB_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x004>;
defm V_SUBREV_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x005>;
-let OtherPredicates = [HasMadMacF32Insts] in
defm V_MAC_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x006>;
defm V_MUL_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x007>;
defm V_MUL_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x008>;
@@ -1490,16 +1547,16 @@ defm V_MAC_F32 : VOP2_Real_e32e64_vi <0x16>;
defm V_MADMK_F32 : VOP2_Real_MADK_vi <0x17>;
defm V_MADAK_F32 : VOP2_Real_MADK_vi <0x18>;
-defm V_ADD_U32 : VOP2be_Real_e32e64_vi_only <0x19, "V_ADD_I32", "v_add_u32">;
-defm V_SUB_U32 : VOP2be_Real_e32e64_vi_only <0x1a, "V_SUB_I32", "v_sub_u32">;
-defm V_SUBREV_U32 : VOP2be_Real_e32e64_vi_only <0x1b, "V_SUBREV_I32", "v_subrev_u32">;
+defm V_ADD_U32 : VOP2be_Real_e32e64_vi_only <0x19, "V_ADD_CO_U32", "v_add_u32">;
+defm V_SUB_U32 : VOP2be_Real_e32e64_vi_only <0x1a, "V_SUB_CO_U32", "v_sub_u32">;
+defm V_SUBREV_U32 : VOP2be_Real_e32e64_vi_only <0x1b, "V_SUBREV_CO_U32", "v_subrev_u32">;
defm V_ADDC_U32 : VOP2be_Real_e32e64_vi_only <0x1c, "V_ADDC_U32", "v_addc_u32">;
defm V_SUBB_U32 : VOP2be_Real_e32e64_vi_only <0x1d, "V_SUBB_U32", "v_subb_u32">;
defm V_SUBBREV_U32 : VOP2be_Real_e32e64_vi_only <0x1e, "V_SUBBREV_U32", "v_subbrev_u32">;
-defm V_ADD_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x19, "V_ADD_I32", "v_add_co_u32">;
-defm V_SUB_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1a, "V_SUB_I32", "v_sub_co_u32">;
-defm V_SUBREV_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1b, "V_SUBREV_I32", "v_subrev_co_u32">;
+defm V_ADD_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x19, "V_ADD_CO_U32", "v_add_co_u32">;
+defm V_SUB_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1a, "V_SUB_CO_U32", "v_sub_co_u32">;
+defm V_SUBREV_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1b, "V_SUBREV_CO_U32", "v_subrev_co_u32">;
defm V_ADDC_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1c, "V_ADDC_U32", "v_addc_co_u32">;
defm V_SUBB_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1d, "V_SUBB_U32", "v_subb_co_u32">;
defm V_SUBBREV_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1e, "V_SUBBREV_U32", "v_subbrev_co_u32">;
@@ -1568,11 +1625,11 @@ defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_vi>;
let SubtargetPredicate = isGFX9Only in {
-defm : VOP2bInstAliases<V_ADD_I32_e32, V_ADD_CO_U32_e32_gfx9, "v_add_co_u32">;
+defm : VOP2bInstAliases<V_ADD_U32_e32, V_ADD_CO_U32_e32_gfx9, "v_add_co_u32">;
defm : VOP2bInstAliases<V_ADDC_U32_e32, V_ADDC_CO_U32_e32_gfx9, "v_addc_co_u32">;
-defm : VOP2bInstAliases<V_SUB_I32_e32, V_SUB_CO_U32_e32_gfx9, "v_sub_co_u32">;
+defm : VOP2bInstAliases<V_SUB_U32_e32, V_SUB_CO_U32_e32_gfx9, "v_sub_co_u32">;
defm : VOP2bInstAliases<V_SUBB_U32_e32, V_SUBB_CO_U32_e32_gfx9, "v_subb_co_u32">;
-defm : VOP2bInstAliases<V_SUBREV_I32_e32, V_SUBREV_CO_U32_e32_gfx9, "v_subrev_co_u32">;
+defm : VOP2bInstAliases<V_SUBREV_U32_e32, V_SUBREV_CO_U32_e32_gfx9, "v_subrev_co_u32">;
defm : VOP2bInstAliases<V_SUBBREV_U32_e32, V_SUBBREV_CO_U32_e32_gfx9, "v_subbrev_co_u32">;
} // End SubtargetPredicate = isGFX9Only
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 169949f2171a..42dc995609f0 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -119,28 +119,37 @@ class getVOP3MAIPat<VOPProfile P, SDPatternOperator node> {
timm:$cbsz, timm:$abid, timm:$blgp))];
}
-class VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit VOP3Only = 0> :
+// Consistently gives instructions a _e64 suffix.
+multiclass VOP3Inst_Pseudo_Wrapper<string opName, VOPProfile P, list<dag> pattern = [], bit VOP3Only = 0> {
+ def _e64 : VOP3_Pseudo<opName, P, pattern, VOP3Only>;
+}
+
+class VOP3InstBase<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit VOP3Only = 0> :
VOP3_Pseudo<OpName, P,
- !if(P.HasOpSel,
- !if(P.HasModifiers,
- getVOP3OpSelModPat<P, node>.ret,
- getVOP3OpSelPat<P, node>.ret),
- !if(P.HasModifiers,
- getVOP3ModPat<P, node>.ret,
- !if(P.HasIntClamp,
- getVOP3ClampPat<P, node>.ret,
- !if (P.IsMAI,
- getVOP3MAIPat<P, node>.ret,
- getVOP3Pat<P, node>.ret)))),
- VOP3Only, 0, P.HasOpSel> {
+ !if(P.HasOpSel,
+ !if(P.HasModifiers,
+ getVOP3OpSelModPat<P, node>.ret,
+ getVOP3OpSelPat<P, node>.ret),
+ !if(P.HasModifiers,
+ getVOP3ModPat<P, node>.ret,
+ !if(P.HasIntClamp,
+ getVOP3ClampPat<P, node>.ret,
+ !if (P.IsMAI,
+ getVOP3MAIPat<P, node>.ret,
+ getVOP3Pat<P, node>.ret)))),
+ VOP3Only, 0, P.HasOpSel> {
let IntClamp = P.HasIntClamp;
let AsmMatchConverter =
- !if(P.HasOpSel,
- "cvtVOP3OpSel",
- !if(!or(P.HasModifiers, !or(P.HasOMod, P.HasIntClamp)),
- "cvtVOP3",
- ""));
+ !if(P.HasOpSel,
+ "cvtVOP3OpSel",
+ !if(!or(P.HasModifiers, P.HasOMod, P.HasIntClamp),
+ "cvtVOP3",
+ ""));
+}
+
+multiclass VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit VOP3Only = 0> {
+ def _e64 : VOP3InstBase<OpName, P, node, VOP3Only>;
}
// Special case for v_div_fmas_{f32|f64}, since it seems to be the
@@ -174,7 +183,7 @@ class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProf
let IsMAI = !if(Features.IsMAI, 1, P.IsMAI);
let IsPacked = !if(Features.IsPacked, 1, P.IsPacked);
- let HasModifiers = !if(Features.IsPacked, !if(Features.IsMAI, 0, 1), P.HasModifiers);
+ let HasModifiers = !if(Features.IsMAI, 0, !or(Features.IsPacked, P.HasModifiers));
// FIXME: Hack to stop printing _e64
let Outs64 = (outs DstRC.RegClass:$vdst);
@@ -182,6 +191,7 @@ class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProf
" " # !if(Features.HasOpSel,
getAsmVOP3OpSel<NumSrcArgs,
HasIntClamp,
+ P.HasOMod,
HasSrc0FloatMods,
HasSrc1FloatMods,
HasSrc2FloatMods>.ret,
@@ -193,12 +203,8 @@ class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProf
}
class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
- // v_div_scale_{f32|f64} do not support input modifiers.
- let HasModifiers = 0;
- let HasClamp = 0;
- let HasOMod = 0;
let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
- let Asm64 = " $vdst, $sdst, $src0, $src1, $src2";
+ let Asm64 = " $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod";
}
def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32> {
@@ -247,6 +253,7 @@ def VOP3_INTERP_MOV : VOPProfile<[f32, i32, i32, untyped]> {
let Asm64 = "$vdst, $src0, $attr$attrchan$clamp$omod";
let HasClamp = 1;
+ let HasSrc0Mods = 0;
}
class getInterp16Asm <bit HasSrc2, bit HasOMod> {
@@ -277,7 +284,7 @@ class getInterp16Ins <bit HasSrc2, bit HasOMod,
class VOP3_INTERP16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> {
- let HasOMod = !if(!eq(DstVT.Value, f16.Value), 0, 1);
+ let HasOMod = !ne(DstVT.Value, f16.Value);
let HasHigh = 1;
let Outs64 = (outs VGPR_32:$vdst);
@@ -293,34 +300,36 @@ let isCommutable = 1 in {
let mayRaiseFPException = 0 in {
let SubtargetPredicate = HasMadMacF32Insts in {
-def V_MAD_LEGACY_F32 : VOP3Inst <"v_mad_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
-def V_MAD_F32 : VOP3Inst <"v_mad_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fmad>;
+defm V_MAD_LEGACY_F32 : VOP3Inst <"v_mad_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
+defm V_MAD_F32 : VOP3Inst <"v_mad_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fmad>;
} // End SubtargetPredicate = HasMadMacInsts
-let SubtargetPredicate = HasNoMadMacF32Insts in
-def V_FMA_LEGACY_F32 : VOP3Inst <"v_fma_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
+let SubtargetPredicate = HasFmaLegacy32 in
+defm V_FMA_LEGACY_F32 : VOP3Inst <"v_fma_legacy_f32",
+ VOP3_Profile<VOP_F32_F32_F32_F32>,
+ int_amdgcn_fma_legacy>;
}
-def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-def V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, any_fma>;
-def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
+defm V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+defm V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+defm V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, any_fma>;
+defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
let SchedRW = [WriteDoubleAdd] in {
let FPDPRounding = 1 in {
-def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, any_fma>;
-def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, any_fadd, 1>;
-def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>;
+defm V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, any_fma>;
+defm V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, any_fadd, 1>;
+defm V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>;
} // End FPDPRounding = 1
-def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_like, 1>;
-def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_like, 1>;
+defm V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_like, 1>;
+defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_like, 1>;
} // End SchedRW = [WriteDoubleAdd]
let SchedRW = [WriteQuarterRate32] in {
-def V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>, mul>;
-def V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile<VOP_I32_I32_I32>, mulhu>;
-def V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>;
-def V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>;
+defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>, mul>;
+defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile<VOP_I32_I32_I32>, mulhu>;
+defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>;
+defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>;
} // End SchedRW = [WriteQuarterRate32]
let Uses = [MODE, VCC, EXEC] in {
@@ -329,191 +338,165 @@ let Uses = [MODE, VCC, EXEC] in {
// if (vcc)
// result *= 2^32
//
-def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC, []> {
- let SchedRW = [WriteFloatFMA];
-}
+let SchedRW = [WriteFloatFMA] in
+defm V_DIV_FMAS_F32 : VOP3Inst_Pseudo_Wrapper <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC, []>;
// v_div_fmas_f64:
// result = src0 * src1 + src2
// if (vcc)
// result *= 2^64
//
-def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC, []> {
- let SchedRW = [WriteDouble];
- let FPDPRounding = 1;
-}
-} // End Uses = [VCC, EXEC]
+let SchedRW = [WriteDouble], FPDPRounding = 1 in
+defm V_DIV_FMAS_F64 : VOP3Inst_Pseudo_Wrapper <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC, []>;
+} // End Uses = [MODE, VCC, EXEC]
} // End isCommutable = 1
let mayRaiseFPException = 0 in {
-def V_CUBEID_F32 : VOP3Inst <"v_cubeid_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubeid>;
-def V_CUBESC_F32 : VOP3Inst <"v_cubesc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubesc>;
-def V_CUBETC_F32 : VOP3Inst <"v_cubetc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubetc>;
-def V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubema>;
+defm V_CUBEID_F32 : VOP3Inst <"v_cubeid_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubeid>;
+defm V_CUBESC_F32 : VOP3Inst <"v_cubesc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubesc>;
+defm V_CUBETC_F32 : VOP3Inst <"v_cubetc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubetc>;
+defm V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubema>;
} // End mayRaiseFPException
-def V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_u32>;
-def V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_i32>;
-def V_BFI_B32 : VOP3Inst <"v_bfi_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfi>;
-def V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, fshr>;
-def V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>;
+defm V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_u32>;
+defm V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_i32>;
+defm V_BFI_B32 : VOP3Inst <"v_bfi_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfi>;
+defm V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, fshr>;
+defm V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>;
let mayRaiseFPException = 0 in { // XXX - Seems suspect but manual doesn't say it does
-def V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmin3>;
-def V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmin3>;
-def V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumin3>;
-def V_MAX3_F32 : VOP3Inst <"v_max3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmax3>;
-def V_MAX3_I32 : VOP3Inst <"v_max3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmax3>;
-def V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumax3>;
-def V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>;
-def V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmed3>;
-def V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumed3>;
+defm V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmin3>;
+defm V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmin3>;
+defm V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumin3>;
+defm V_MAX3_F32 : VOP3Inst <"v_max3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmax3>;
+defm V_MAX3_I32 : VOP3Inst <"v_max3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmax3>;
+defm V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumax3>;
+defm V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>;
+defm V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmed3>;
+defm V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumed3>;
} // End mayRaiseFPException = 0
-def V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-def V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-def V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-def V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-def V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>;
-def V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUdiv_fixup>;
+defm V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+defm V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+defm V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+defm V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+defm V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>;
+
+defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUdiv_fixup>;
let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in {
-def V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, AMDGPUdiv_fixup>;
-def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp, 1>;
+ defm V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, AMDGPUdiv_fixup>;
+ defm V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp, 1>;
} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it does.
-def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> {
- let SchedRW = [WriteFloatFMA, WriteSALU];
- let AsmMatchConverter = "";
-}
+ let SchedRW = [WriteFloatFMA, WriteSALU] in
+ defm V_DIV_SCALE_F32 : VOP3Inst_Pseudo_Wrapper <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> ;
-// Double precision division pre-scale.
-def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> {
- let SchedRW = [WriteDouble, WriteSALU];
- let AsmMatchConverter = "";
- let FPDPRounding = 1;
-}
+ // Double precision division pre-scale.
+ let SchedRW = [WriteDouble, WriteSALU], FPDPRounding = 1 in
+ defm V_DIV_SCALE_F64 : VOP3Inst_Pseudo_Wrapper <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1>;
} // End mayRaiseFPException = 0
-def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+defm V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
let Constraints = "@earlyclobber $vdst" in {
-def V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
+defm V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
} // End Constraints = "@earlyclobber $vdst"
-def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, int_amdgcn_trig_preop> {
- let SchedRW = [WriteDouble];
-}
-let SchedRW = [Write64Bit] in {
-let SubtargetPredicate = isGFX6GFX7 in {
-def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>, shl>;
-def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>, srl>;
-def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, sra>;
-} // End SubtargetPredicate = isGFX6GFX7
+let SchedRW = [WriteDouble] in {
+defm V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, int_amdgcn_trig_preop>;
+} // End SchedRW = [WriteDouble]
-let SubtargetPredicate = isGFX8Plus in {
-def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, lshl_rev>;
-def V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>, lshr_rev>;
-def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, ashr_rev>;
-} // End SubtargetPredicate = isGFX8Plus
+let SchedRW = [Write64Bit] in {
+ let SubtargetPredicate = isGFX6GFX7 in {
+ defm V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>, shl>;
+ defm V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>, srl>;
+ defm V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, sra>;
+ } // End SubtargetPredicate = isGFX6GFX7
+
+ let SubtargetPredicate = isGFX8Plus in {
+ defm V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, lshl_rev>;
+ defm V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>, lshr_rev>;
+ defm V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, ashr_rev>;
+ } // End SubtargetPredicate = isGFX8Plus
} // End SchedRW = [Write64Bit]
def : GCNPat<
- (i64 (getDivergentFrag<sext>.ret i16:$src)),
- (REG_SEQUENCE VReg_64,
- (i32 (V_BFE_I32 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10)))), sub0,
- (i32 (COPY_TO_REGCLASS
- (V_ASHRREV_I32_e32 (S_MOV_B32 (i32 0x1f)), (i32 (V_BFE_I32 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10))))
- ), VGPR_32)), sub1)
->;
-
-def : GCNPat<
(i32 (getDivergentFrag<sext>.ret i16:$src)),
- (i32 (V_BFE_I32 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10))))
+ (i32 (V_BFE_I32_e64 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10))))
>;
let SubtargetPredicate = isGFX6GFX7GFX10 in {
-def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
+defm V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
} // End SubtargetPredicate = isGFX6GFX7GFX10
let SchedRW = [Write32Bit] in {
let SubtargetPredicate = isGFX8Plus in {
-def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUperm>;
+defm V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUperm>;
} // End SubtargetPredicate = isGFX8Plus
} // End SchedRW = [Write32Bit]
let SubtargetPredicate = isGFX7Plus in {
let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in {
-def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
-def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP>>;
+defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
+defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP>>;
} // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32]
let isCommutable = 1 in {
let SchedRW = [WriteQuarterRate32, WriteSALU] in {
-def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
-def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
+defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
+defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
} // End SchedRW = [WriteQuarterRate32, WriteSALU]
} // End isCommutable = 1
} // End SubtargetPredicate = isGFX7Plus
-
-def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup> {
- let Predicates = [Has16BitInsts, isGFX8Only];
- let FPDPRounding = 1;
-}
-def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9",
- VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUdiv_fixup> {
- let renamedInGFX9 = 1;
- let Predicates = [Has16BitInsts, isGFX9Plus];
- let FPDPRounding = 1;
-}
-
-def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, any_fma> {
- let Predicates = [Has16BitInsts, isGFX8Only];
- let FPDPRounding = 1;
-}
-def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, any_fma> {
- let renamedInGFX9 = 1;
- let Predicates = [Has16BitInsts, isGFX9Plus];
- let FPDPRounding = 1;
-}
+let FPDPRounding = 1 in {
+ let Predicates = [Has16BitInsts, isGFX8Only] in {
+ defm V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>;
+ defm V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, any_fma>;
+ } // End Predicates = [Has16BitInsts, isGFX8Only]
+
+ let renamedInGFX9 = 1, Predicates = [Has16BitInsts, isGFX9Plus] in {
+ defm V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9",
+ VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUdiv_fixup>;
+ defm V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, any_fma>;
+ } // End renamedInGFX9 = 1, Predicates = [Has16BitInsts, isGFX9Plus]
+} // End FPDPRounding = 1
let SubtargetPredicate = Has16BitInsts, isCommutable = 1 in {
let renamedInGFX9 = 1 in {
-def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
-def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
-let FPDPRounding = 1 in {
-def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
-let Uses = [MODE, M0, EXEC] in {
-// For some reason the intrinsic operands are in a different order
-// from the instruction operands.
-def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>,
- [(set f16:$vdst,
- (int_amdgcn_interp_p2_f16 (VOP3Mods f32:$src2, i32:$src2_modifiers),
- (VOP3Mods f32:$src0, i32:$src0_modifiers),
- (i32 timm:$attrchan),
- (i32 timm:$attr),
- (i1 timm:$high),
- M0))]>;
-} // End Uses = [M0, MODE, EXEC]
-} // End FPDPRounding = 1
+ defm V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
+ defm V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
+ let FPDPRounding = 1 in {
+ defm V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
+ let Uses = [MODE, M0, EXEC] in {
+ // For some reason the intrinsic operands are in a different order
+ // from the instruction operands.
+ def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>,
+ [(set f16:$vdst,
+ (int_amdgcn_interp_p2_f16 (VOP3Mods f32:$src2, i32:$src2_modifiers),
+ (VOP3Mods f32:$src0, i32:$src0_modifiers),
+ (i32 timm:$attrchan),
+ (i32 timm:$attr),
+ (i1 timm:$high),
+ M0))]>;
+ } // End Uses = [M0, MODE, EXEC]
+ } // End FPDPRounding = 1
} // End renamedInGFX9 = 1
-let SubtargetPredicate = isGFX9Only in {
-def V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>> {
- let FPDPRounding = 1;
-}
-} // End SubtargetPredicate = isGFX9Only
+let SubtargetPredicate = isGFX9Only, FPDPRounding = 1 in {
+ defm V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>> ;
+} // End SubtargetPredicate = isGFX9Only, FPDPRounding = 1
let SubtargetPredicate = isGFX9Plus in {
-def V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
-def V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
+defm V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
+defm V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
} // End SubtargetPredicate = isGFX9Plus
@@ -535,6 +518,15 @@ def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32
} // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
+def : GCNPat<
+ (i64 (getDivergentFrag<sext>.ret i16:$src)),
+ (REG_SEQUENCE VReg_64,
+ (i32 (V_BFE_I32_e64 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10)))), sub0,
+ (i32 (COPY_TO_REGCLASS
+ (V_ASHRREV_I32_e32 (S_MOV_B32 (i32 0x1f)), (i32 (V_BFE_I32_e64 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10))))
+ ), VGPR_32)), sub1)
+>;
+
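The GCNPat added above lowers a divergent sext from i16 to i64 as a signed bitfield extract for the low half plus an arithmetic shift for the high half. A minimal C++ sketch of the value it computes (illustrative only, not LLVM code):

#include <cstdint>

// Low 32 bits: V_BFE_I32 src, 0, 16 (sign-extend the low 16 bits to 32).
// High 32 bits: V_ASHRREV_I32 31, lo (replicate the sign bit).
uint64_t sextI16ToI64(uint16_t src) {
  int32_t lo = (int16_t)src;     // signed bitfield extract, offset 0, width 16
  int32_t hi = lo >> 31;         // all copies of the sign bit
  return ((uint64_t)(uint32_t)hi << 32) | (uint32_t)lo;
}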
let SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC] in {
def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
@@ -552,8 +544,8 @@ def : GCNPat <
}
-defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>;
-defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>;
+defm: Ternary_i16_Pats<mul, add, V_MAD_U16_e64, zext>;
+defm: Ternary_i16_Pats<mul, add, V_MAD_I16_e64, sext>;
} // End Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9]
@@ -568,8 +560,8 @@ def : GCNPat <
}
-defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9, zext>;
-defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_I16_gfx9, sext>;
+defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_e64, zext>;
+defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_I16_gfx9_e64, sext>;
} // End Predicates = [Has16BitInsts, isGFX10Plus]
@@ -593,9 +585,9 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
if (!Operands[i]->isDivergent() &&
!isInlineImmediate(Operands[i].getNode())) {
ConstantBusUses++;
- // This uses AMDGPU::V_ADD3_U32, but all three operand instructions
+ // This uses AMDGPU::V_ADD3_U32_e64, but all three operand instructions
// have the same constant bus limit.
- if (ConstantBusUses > Subtarget->getConstantBusLimit(AMDGPU::V_ADD3_U32))
+ if (ConstantBusUses > Subtarget->getConstantBusLimit(AMDGPU::V_ADD3_U32_e64))
return false;
}
}
@@ -605,52 +597,60 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
let PredicateCodeUsesOperands = 1;
// The divergence predicate is irrelevant in GlobalISel, as we have
- // proper register bank checks. We also force all VOP instruction
- // operands to VGPR, so we should not need to check the constant bus
- // restriction.
+ // proper register bank checks. We just need to verify the constant
+ // bus restriction when all the sources are considered.
//
// FIXME: With unlucky SGPR operands, we could penalize code by
// blocking folding SGPR->VGPR copies later.
// FIXME: There's no register bank verifier
- // FIXME: Should add a way for the emitter to recognize this is a
- // trivially true predicate to eliminate the check.
- let GISelPredicateCode = [{return true;}];
+ let GISelPredicateCode = [{
+ const int ConstantBusLimit = Subtarget->getConstantBusLimit(AMDGPU::V_ADD3_U32_e64);
+ int ConstantBusUses = 0;
+ for (unsigned i = 0; i < 3; ++i) {
+ const RegisterBank *RegBank = RBI.getRegBank(Operands[i]->getReg(), MRI, TRI);
+ if (RegBank->getID() == AMDGPU::SGPRRegBankID) {
+ if (++ConstantBusUses > ConstantBusLimit)
+ return false;
+ }
+ }
+ return true;
+ }];
}
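For context, the constant bus limit the new GISelPredicateCode queries is the number of scalar-bank reads a single VALU instruction may make (roughly one before GFX10 and two from GFX10 on). A rough sketch of the shape of that check, with illustrative names only rather than LLVM API:

#include <array>

// Returns true if a three-source VALU op stays within the constant bus limit,
// given which of its operands live on the scalar (SGPR) register bank.
bool fitsConstantBus(const std::array<bool, 3> &onSgprBank, int limit) {
  int uses = 0;
  for (bool sgpr : onSgprBank)
    if (sgpr && ++uses > limit)
      return false;
  return true;
}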
let SubtargetPredicate = isGFX9Plus in {
-def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
-def V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-def V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-def V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-def V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-def V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-def V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+defm V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
+defm V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+defm V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+defm V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+defm V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-def V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+defm V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-def V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmed3>;
-def V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmed3>;
-def V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumed3>;
+defm V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmed3>;
+defm V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmed3>;
+defm V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumed3>;
-def V_MIN3_F16 : VOP3Inst <"v_min3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmin3>;
-def V_MIN3_I16 : VOP3Inst <"v_min3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmin3>;
-def V_MIN3_U16 : VOP3Inst <"v_min3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumin3>;
+defm V_MIN3_F16 : VOP3Inst <"v_min3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmin3>;
+defm V_MIN3_I16 : VOP3Inst <"v_min3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmin3>;
+defm V_MIN3_U16 : VOP3Inst <"v_min3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumin3>;
-def V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmax3>;
-def V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmax3>;
-def V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumax3>;
+defm V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmax3>;
+defm V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmax3>;
+defm V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumax3>;
-def V_ADD_I16 : VOP3Inst <"v_add_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>;
-def V_SUB_I16 : VOP3Inst <"v_sub_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>;
+defm V_ADD_I16 : VOP3Inst <"v_add_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>;
+defm V_SUB_I16 : VOP3Inst <"v_sub_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>;
-def V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
-def V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
+defm V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
+defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
-def V_CVT_PKNORM_I16_F16 : VOP3Inst <"v_cvt_pknorm_i16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
-def V_CVT_PKNORM_U16_F16 : VOP3Inst <"v_cvt_pknorm_u16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
+defm V_CVT_PKNORM_I16_F16 : VOP3Inst <"v_cvt_pknorm_i16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
+defm V_CVT_PKNORM_U16_F16 : VOP3Inst <"v_cvt_pknorm_u16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
-def V_ADD_I32_gfx9 : VOP3Inst <"v_add_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32_ARITH>>;
-def V_SUB_I32_gfx9 : VOP3Inst <"v_sub_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32_ARITH>>;
+defm V_ADD_I32 : VOP3Inst <"v_add_i32", VOP3_Profile<VOP_I32_I32_I32_ARITH>>;
+defm V_SUB_I32 : VOP3Inst <"v_sub_i32", VOP3_Profile<VOP_I32_I32_I32_ARITH>>;
class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat <
@@ -659,14 +659,28 @@ class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instructio
(inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2)
>;
-def : ThreeOp_i32_Pats<shl, add, V_LSHL_ADD_U32>;
-def : ThreeOp_i32_Pats<add, shl, V_ADD_LSHL_U32>;
-def : ThreeOp_i32_Pats<add, add, V_ADD3_U32>;
-def : ThreeOp_i32_Pats<shl, or, V_LSHL_OR_B32>;
-def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32>;
-def : ThreeOp_i32_Pats<or, or, V_OR3_B32>;
-def : ThreeOp_i32_Pats<xor, add, V_XAD_U32>;
+def : ThreeOp_i32_Pats<shl, add, V_LSHL_ADD_U32_e64>;
+def : ThreeOp_i32_Pats<add, shl, V_ADD_LSHL_U32_e64>;
+def : ThreeOp_i32_Pats<add, add, V_ADD3_U32_e64>;
+def : ThreeOp_i32_Pats<shl, or, V_LSHL_OR_B32_e64>;
+def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
+def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
+def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
+
+def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>;
+def : VOPBinOpClampPat<ssubsat, V_SUB_I32_e64, i32>;
+
+// FIXME: Probably should hardcode clamp bit in pseudo and avoid this.
+class OpSelBinOpClampPat<SDPatternOperator node,
+ Instruction inst> : GCNPat<
+ (node (i16 (VOP3OpSel i16:$src0, i32:$src0_modifiers)),
+ (i16 (VOP3OpSel i16:$src1, i32:$src1_modifiers))),
+ (inst $src0_modifiers, $src0, $src1_modifiers, $src1, DSTCLAMP.ENABLE, 0)
+>;
+
+def : OpSelBinOpClampPat<saddsat, V_ADD_I16_e64>;
+def : OpSelBinOpClampPat<ssubsat, V_SUB_I16_e64>;
} // End SubtargetPredicate = isGFX9Plus
def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {
@@ -676,9 +690,8 @@ def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3
let InsVOP3OpSel = (ins IntOpSelMods:$src0_modifiers, VRegSrc_32:$src0,
IntOpSelMods:$src1_modifiers, SCSrc_b32:$src1,
IntOpSelMods:$src2_modifiers, SCSrc_b32:$src2,
- VGPR_32:$vdst_in, op_sel:$op_sel);
+ VGPR_32:$vdst_in, op_sel0:$op_sel);
let HasClamp = 0;
- let HasOMod = 0;
}
class PermlanePat<SDPatternOperator permlane,
@@ -716,23 +729,23 @@ class PermlaneDiscardVDstIn<SDPatternOperator permlane,
let SubtargetPredicate = isGFX10Plus in {
- def V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
- def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32>;
+ defm V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+ def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32_e64>;
let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
- def V_PERMLANE16_B32 : VOP3Inst <"v_permlane16_b32", VOP3_PERMLANE_Profile>;
- def V_PERMLANEX16_B32 : VOP3Inst <"v_permlanex16_b32", VOP3_PERMLANE_Profile>;
+ defm V_PERMLANE16_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile>;
+ defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>;
} // End $vdst = $vdst_in, DisableEncoding $vdst_in
- def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32>;
- def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32>;
+ def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64>;
+ def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64>;
def : PermlaneDiscardVDstIn<
BoundControlOrFetchInvalidPermlane<int_amdgcn_permlane16>,
- V_PERMLANE16_B32>;
+ V_PERMLANE16_B32_e64>;
def : PermlaneDiscardVDstIn<
BoundControlOrFetchInvalidPermlane<int_amdgcn_permlanex16>,
- V_PERMLANEX16_B32>;
+ V_PERMLANEX16_B32_e64>;
} // End SubtargetPredicate = isGFX10Plus
class DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat<
@@ -744,13 +757,13 @@ class DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat<
>;
let WaveSizePredicate = isWave64 in {
-def : DivFmasPat<f32, V_DIV_FMAS_F32, VCC>;
-def : DivFmasPat<f64, V_DIV_FMAS_F64, VCC>;
+def : DivFmasPat<f32, V_DIV_FMAS_F32_e64, VCC>;
+def : DivFmasPat<f64, V_DIV_FMAS_F64_e64, VCC>;
}
let WaveSizePredicate = isWave32 in {
-def : DivFmasPat<f32, V_DIV_FMAS_F32, VCC_LO>;
-def : DivFmasPat<f64, V_DIV_FMAS_F64, VCC_LO>;
+def : DivFmasPat<f32, V_DIV_FMAS_F32_e64, VCC_LO>;
+def : DivFmasPat<f64, V_DIV_FMAS_F64_e64, VCC_LO>;
}
//===----------------------------------------------------------------------===//
@@ -775,23 +788,23 @@ class getClampRes<VOPProfile P, Instruction inst> {
ret1));
}
-class IntClampPat<VOP3Inst inst, SDPatternOperator node> : GCNPat<
+class IntClampPat<VOP3InstBase inst, SDPatternOperator node> : GCNPat<
getClampPat<inst.Pfl, node>.ret,
getClampRes<inst.Pfl, inst>.ret
>;
-def : IntClampPat<V_MAD_I32_I24, AMDGPUmad_i24>;
-def : IntClampPat<V_MAD_U32_U24, AMDGPUmad_u24>;
+def : IntClampPat<V_MAD_I32_I24_e64, AMDGPUmad_i24>;
+def : IntClampPat<V_MAD_U32_U24_e64, AMDGPUmad_u24>;
-def : IntClampPat<V_SAD_U8, int_amdgcn_sad_u8>;
-def : IntClampPat<V_SAD_HI_U8, int_amdgcn_sad_hi_u8>;
-def : IntClampPat<V_SAD_U16, int_amdgcn_sad_u16>;
+def : IntClampPat<V_SAD_U8_e64, int_amdgcn_sad_u8>;
+def : IntClampPat<V_SAD_HI_U8_e64, int_amdgcn_sad_hi_u8>;
+def : IntClampPat<V_SAD_U16_e64, int_amdgcn_sad_u16>;
-def : IntClampPat<V_MSAD_U8, int_amdgcn_msad_u8>;
-def : IntClampPat<V_MQSAD_PK_U16_U8, int_amdgcn_mqsad_pk_u16_u8>;
+def : IntClampPat<V_MSAD_U8_e64, int_amdgcn_msad_u8>;
+def : IntClampPat<V_MQSAD_PK_U16_U8_e64, int_amdgcn_mqsad_pk_u16_u8>;
-def : IntClampPat<V_QSAD_PK_U16_U8, int_amdgcn_qsad_pk_u16_u8>;
-def : IntClampPat<V_MQSAD_U32_U8, int_amdgcn_mqsad_u32_u8>;
+def : IntClampPat<V_QSAD_PK_U16_U8_e64, int_amdgcn_qsad_pk_u16_u8>;
+def : IntClampPat<V_MQSAD_U32_U8_e64, int_amdgcn_mqsad_u32_u8>;
//===----------------------------------------------------------------------===//
@@ -805,22 +818,27 @@ def : IntClampPat<V_MQSAD_U32_U8, int_amdgcn_mqsad_u32_u8>;
let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
multiclass VOP3_Real_gfx10<bits<10> op> {
def _gfx10 :
+ VOP3_Real<!cast<VOP_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
+ VOP3e_gfx10<op, !cast<VOP_Pseudo>(NAME#"_e64").Pfl>;
+ }
+ multiclass VOP3_Real_No_Suffix_gfx10<bits<10> op> {
+ def _gfx10 :
VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.GFX10>,
VOP3e_gfx10<op, !cast<VOP_Pseudo>(NAME).Pfl>;
}
multiclass VOP3_Real_gfx10_with_name<bits<10> op, string opName,
string asmName> {
def _gfx10 :
- VOP3_Real<!cast<VOP3_Pseudo>(opName), SIEncodingFamily.GFX10>,
- VOP3e_gfx10<op, !cast<VOP3_Pseudo>(opName).Pfl> {
- VOP3_Pseudo ps = !cast<VOP3_Pseudo>(opName);
+ VOP3_Real<!cast<VOP3_Pseudo>(opName#"_e64"), SIEncodingFamily.GFX10>,
+ VOP3e_gfx10<op, !cast<VOP3_Pseudo>(opName#"_e64").Pfl> {
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(opName#"_e64");
let AsmString = asmName # ps.AsmOperands;
}
}
multiclass VOP3be_Real_gfx10<bits<10> op> {
def _gfx10 :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX10>,
- VOP3be_gfx10<op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
+ VOP3be_gfx10<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
multiclass VOP3Interp_Real_gfx10<bits<10> op> {
def _gfx10 :
@@ -829,26 +847,30 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
}
multiclass VOP3OpSel_Real_gfx10<bits<10> op> {
def _gfx10 :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX10>,
- VOP3OpSel_gfx10<op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
+ VOP3OpSel_gfx10<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
multiclass VOP3OpSel_Real_gfx10_with_name<bits<10> op, string opName,
string asmName> {
def _gfx10 :
- VOP3_Real<!cast<VOP3_Pseudo>(opName), SIEncodingFamily.GFX10>,
- VOP3OpSel_gfx10<op, !cast<VOP3_Pseudo>(opName).Pfl> {
- VOP3_Pseudo ps = !cast<VOP3_Pseudo>(opName);
+ VOP3_Real<!cast<VOP3_Pseudo>(opName#"_e64"), SIEncodingFamily.GFX10>,
+ VOP3OpSel_gfx10<op, !cast<VOP3_Pseudo>(opName#"_e64").Pfl> {
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(opName#"_e64");
let AsmString = asmName # ps.AsmOperands;
}
}
} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
-defm V_READLANE_B32 : VOP3_Real_gfx10<0x360>;
+defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>;
let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in {
- defm V_WRITELANE_B32 : VOP3_Real_gfx10<0x361>;
+ defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx10<0x361>;
} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in)
+let SubtargetPredicate = isGFX10Before1030 in {
+ defm V_MUL_LO_I32 : VOP3_Real_gfx10<0x16b>;
+}
+
defm V_XOR3_B32 : VOP3_Real_gfx10<0x178>;
defm V_LSHLREV_B64 : VOP3_Real_gfx10<0x2ff>;
defm V_LSHRREV_B64 : VOP3_Real_gfx10<0x300>;
@@ -868,9 +890,9 @@ defm V_ADD_NC_I16 :
defm V_SUB_NC_I16 :
VOP3OpSel_Real_gfx10_with_name<0x30e, "V_SUB_I16", "v_sub_nc_i16">;
defm V_SUB_NC_I32 :
- VOP3_Real_gfx10_with_name<0x376, "V_SUB_I32_gfx9", "v_sub_nc_i32">;
+ VOP3_Real_gfx10_with_name<0x376, "V_SUB_I32", "v_sub_nc_i32">;
defm V_ADD_NC_I32 :
- VOP3_Real_gfx10_with_name<0x37f, "V_ADD_I32_gfx9", "v_add_nc_i32">;
+ VOP3_Real_gfx10_with_name<0x37f, "V_ADD_I32", "v_add_nc_i32">;
defm V_INTERP_P1_F32_e64 : VOP3Interp_Real_gfx10<0x200>;
defm V_INTERP_P2_F32_e64 : VOP3Interp_Real_gfx10<0x201>;
@@ -907,16 +929,16 @@ defm V_DIV_FIXUP_F16 :
// FIXME-GFX10-OPSEL: Need to add "selective" opsel support to some of these
// (they do not support SDWA or DPP).
-defm V_ADD_NC_U16 : VOP3_Real_gfx10_with_name<0x303, "V_ADD_U16_e64", "v_add_nc_u16">;
-defm V_SUB_NC_U16 : VOP3_Real_gfx10_with_name<0x304, "V_SUB_U16_e64", "v_sub_nc_u16">;
-defm V_MUL_LO_U16 : VOP3_Real_gfx10_with_name<0x305, "V_MUL_LO_U16_e64", "v_mul_lo_u16">;
-defm V_LSHRREV_B16 : VOP3_Real_gfx10_with_name<0x307, "V_LSHRREV_B16_e64", "v_lshrrev_b16">;
-defm V_ASHRREV_I16 : VOP3_Real_gfx10_with_name<0x308, "V_ASHRREV_I16_e64", "v_ashrrev_i16">;
-defm V_MAX_U16 : VOP3_Real_gfx10_with_name<0x309, "V_MAX_U16_e64", "v_max_u16">;
-defm V_MAX_I16 : VOP3_Real_gfx10_with_name<0x30a, "V_MAX_I16_e64", "v_max_i16">;
-defm V_MIN_U16 : VOP3_Real_gfx10_with_name<0x30b, "V_MIN_U16_e64", "v_min_u16">;
-defm V_MIN_I16 : VOP3_Real_gfx10_with_name<0x30c, "V_MIN_I16_e64", "v_min_i16">;
-defm V_LSHLREV_B16 : VOP3_Real_gfx10_with_name<0x314, "V_LSHLREV_B16_e64", "v_lshlrev_b16">;
+defm V_ADD_NC_U16 : VOP3_Real_gfx10_with_name<0x303, "V_ADD_U16", "v_add_nc_u16">;
+defm V_SUB_NC_U16 : VOP3_Real_gfx10_with_name<0x304, "V_SUB_U16", "v_sub_nc_u16">;
+defm V_MUL_LO_U16 : VOP3_Real_gfx10_with_name<0x305, "V_MUL_LO_U16", "v_mul_lo_u16">;
+defm V_LSHRREV_B16 : VOP3_Real_gfx10_with_name<0x307, "V_LSHRREV_B16", "v_lshrrev_b16">;
+defm V_ASHRREV_I16 : VOP3_Real_gfx10_with_name<0x308, "V_ASHRREV_I16", "v_ashrrev_i16">;
+defm V_MAX_U16 : VOP3_Real_gfx10_with_name<0x309, "V_MAX_U16", "v_max_u16">;
+defm V_MAX_I16 : VOP3_Real_gfx10_with_name<0x30a, "V_MAX_I16", "v_max_i16">;
+defm V_MIN_U16 : VOP3_Real_gfx10_with_name<0x30b, "V_MIN_U16", "v_min_u16">;
+defm V_MIN_I16 : VOP3_Real_gfx10_with_name<0x30c, "V_MIN_I16", "v_min_i16">;
+defm V_LSHLREV_B16 : VOP3_Real_gfx10_with_name<0x314, "V_LSHLREV_B16", "v_lshlrev_b16">;
defm V_PERMLANE16_B32 : VOP3OpSel_Real_gfx10<0x377>;
defm V_PERMLANEX16_B32 : VOP3OpSel_Real_gfx10<0x378>;
@@ -927,13 +949,13 @@ defm V_PERMLANEX16_B32 : VOP3OpSel_Real_gfx10<0x378>;
let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in {
multiclass VOP3_Real_gfx7<bits<10> op> {
def _gfx7 :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
- VOP3e_gfx6_gfx7<op{8-0}, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+ VOP3e_gfx6_gfx7<op{8-0}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
multiclass VOP3be_Real_gfx7<bits<10> op> {
def _gfx7 :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
- VOP3be_gfx6_gfx7<op{8-0}, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+ VOP3be_gfx6_gfx7<op{8-0}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
} // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7"
@@ -955,13 +977,13 @@ defm V_MAD_I64_I32 : VOP3be_Real_gfx7_gfx10<0x177>;
let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
multiclass VOP3_Real_gfx6_gfx7<bits<10> op> {
def _gfx6_gfx7 :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
- VOP3e_gfx6_gfx7<op{8-0}, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+ VOP3e_gfx6_gfx7<op{8-0}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
multiclass VOP3be_Real_gfx6_gfx7<bits<10> op> {
def _gfx6_gfx7 :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
- VOP3be_gfx6_gfx7<op{8-0}, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+ VOP3be_gfx6_gfx7<op{8-0}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
@@ -974,6 +996,7 @@ multiclass VOP3be_Real_gfx6_gfx7_gfx10<bits<10> op> :
defm V_LSHL_B64 : VOP3_Real_gfx6_gfx7<0x161>;
defm V_LSHR_B64 : VOP3_Real_gfx6_gfx7<0x162>;
defm V_ASHR_I64 : VOP3_Real_gfx6_gfx7<0x163>;
+defm V_MUL_LO_I32 : VOP3_Real_gfx6_gfx7<0x16b>;
defm V_MAD_LEGACY_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x140>;
defm V_MAD_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x141>;
@@ -1015,7 +1038,6 @@ defm V_MAX_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x167>;
defm V_LDEXP_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x168>;
defm V_MUL_LO_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x169>;
defm V_MUL_HI_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x16a>;
-defm V_MUL_LO_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x16b>;
defm V_MUL_HI_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x16c>;
defm V_DIV_FMAS_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x16f>;
defm V_DIV_FMAS_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x170>;
@@ -1036,18 +1058,22 @@ defm V_FMA_LEGACY_F32 : VOP3_Real_gfx10<0x140>;
let AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" in {
multiclass VOP3_Real_vi<bits<10> op> {
+ def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+ VOP3e_vi <op, !cast<VOP_Pseudo>(NAME#"_e64").Pfl>;
+}
+multiclass VOP3_Real_No_Suffix_vi<bits<10> op> {
def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>,
VOP3e_vi <op, !cast<VOP_Pseudo>(NAME).Pfl>;
}
multiclass VOP3be_Real_vi<bits<10> op> {
- def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>,
- VOP3be_vi <op, !cast<VOP_Pseudo>(NAME).Pfl>;
+ def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+ VOP3be_vi <op, !cast<VOP_Pseudo>(NAME#"_e64").Pfl>;
}
multiclass VOP3OpSel_Real_gfx9<bits<10> op> {
- def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>,
- VOP3OpSel_gfx9 <op, !cast<VOP_Pseudo>(NAME).Pfl>;
+ def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+ VOP3OpSel_gfx9 <op, !cast<VOP_Pseudo>(NAME#"_e64").Pfl>;
}
multiclass VOP3Interp_Real_vi<bits<10> op> {
@@ -1060,8 +1086,8 @@ multiclass VOP3Interp_Real_vi<bits<10> op> {
let AssemblerPredicate = isGFX8Only, DecoderNamespace = "GFX8" in {
multiclass VOP3_F16_Real_vi<bits<10> op> {
- def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
- VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+ VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
multiclass VOP3Interp_F16_Real_vi<bits<10> op> {
@@ -1074,17 +1100,17 @@ multiclass VOP3Interp_F16_Real_vi<bits<10> op> {
let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in {
multiclass VOP3_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> {
- def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
- VOP3e_vi <op, !cast<VOP3_Pseudo>(OpName).Pfl> {
- VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName);
+ def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName#"_e64"), SIEncodingFamily.GFX9>,
+ VOP3e_vi <op, !cast<VOP3_Pseudo>(OpName#"_e64").Pfl> {
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName#"_e64");
let AsmString = AsmName # ps.AsmOperands;
}
}
multiclass VOP3OpSel_F16_Real_gfx9<bits<10> op, string AsmName> {
- def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX9>,
- VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
- VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME);
+ def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX9>,
+ VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME#"_e64");
let AsmString = AsmName # ps.AsmOperands;
}
}
@@ -1098,9 +1124,9 @@ multiclass VOP3Interp_F16_Real_gfx9<bits<10> op, string OpName, string AsmName>
}
multiclass VOP3_Real_gfx9<bits<10> op, string AsmName> {
- def _gfx9 : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.GFX9>,
- VOP3e_vi <op, !cast<VOP_Pseudo>(NAME).Pfl> {
- VOP_Pseudo ps = !cast<VOP_Pseudo>(NAME);
+ def _gfx9 : VOP3_Real<!cast<VOP_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX9>,
+ VOP3e_vi <op, !cast<VOP_Pseudo>(NAME#"_e64").Pfl> {
+ VOP_Pseudo ps = !cast<VOP_Pseudo>(NAME#"_e64");
let AsmString = AsmName # ps.AsmOperands;
}
}
@@ -1177,8 +1203,8 @@ defm V_FMA_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x206, "v_fma_f16">;
defm V_DIV_FIXUP_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x207, "v_div_fixup_f16">;
defm V_INTERP_P2_F16_gfx9 : VOP3Interp_F16_Real_gfx9 <0x277, "V_INTERP_P2_F16_gfx9", "v_interp_p2_f16">;
-defm V_ADD_I32_gfx9 : VOP3_Real_gfx9 <0x29c, "v_add_i32">;
-defm V_SUB_I32_gfx9 : VOP3_Real_gfx9 <0x29d, "v_sub_i32">;
+defm V_ADD_I32 : VOP3_Real_vi <0x29c>;
+defm V_SUB_I32 : VOP3_Real_vi <0x29d>;
defm V_INTERP_P1_F32_e64 : VOP3Interp_Real_vi <0x270>;
defm V_INTERP_P2_F32_e64 : VOP3Interp_Real_vi <0x271>;
@@ -1201,8 +1227,8 @@ defm V_MUL_LO_I32 : VOP3_Real_vi <0x285>;
defm V_MUL_HI_U32 : VOP3_Real_vi <0x286>;
defm V_MUL_HI_I32 : VOP3_Real_vi <0x287>;
-defm V_READLANE_B32 : VOP3_Real_vi <0x289>;
-defm V_WRITELANE_B32 : VOP3_Real_vi <0x28a>;
+defm V_READLANE_B32 : VOP3_Real_No_Suffix_vi <0x289>;
+defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_vi <0x28a>;
defm V_LSHLREV_B64 : VOP3_Real_vi <0x28f>;
defm V_LSHRREV_B64 : VOP3_Real_vi <0x290>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index fc457ad212d4..64e70b8f64b0 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -39,7 +39,7 @@ class VOP3_VOP3PInst<string OpName, VOPProfile P, bit UseTiedOutput = 0,
// class constraints.
!if(UseTiedOutput, (ins clampmod:$clamp, VGPR_32:$vdst_in),
(ins clampmod0:$clamp))),
- (ins op_sel:$op_sel, op_sel_hi:$op_sel_hi));
+ (ins op_sel0:$op_sel, op_sel_hi0:$op_sel_hi));
let Constraints = !if(UseTiedOutput, "$vdst = $vdst_in", "");
let DisableEncoding = !if(UseTiedOutput, "$vdst_in", "");
@@ -77,6 +77,8 @@ def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I1
def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
+let SubtargetPredicate = HasVOP3PInsts in {
+
// Undo sub x, c -> add x, -c canonicalization since c is more likely
// an inline immediate than -c.
// The constant will be emitted as a mov, and folded later.
@@ -86,6 +88,19 @@ def : GCNPat<
(V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1)
>;
+// Integer operations with clamp bit set.
+class VOP3PSatPat<SDPatternOperator pat, Instruction inst> : GCNPat<
+ (pat (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)),
+ (v2i16 (VOP3PMods v2i16:$src1, i32:$src1_modifiers))),
+ (inst $src0_modifiers, $src0, $src1_modifiers, $src1, DSTCLAMP.ENABLE)
+>;
+
+def : VOP3PSatPat<uaddsat, V_PK_ADD_U16>;
+def : VOP3PSatPat<saddsat, V_PK_ADD_I16>;
+def : VOP3PSatPat<usubsat, V_PK_SUB_U16>;
+def : VOP3PSatPat<ssubsat, V_PK_SUB_I16>;
+} // End SubtargetPredicate = HasVOP3PInsts
+
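The VOP3PSatPat patterns above select the saturating ISD nodes onto the packed 16-bit ops with the clamp bit forced on; with clamp set, each integer lane saturates at its type bounds instead of wrapping. A hedged sketch of the per-lane behaviour for the uaddsat case (plain C++, not LLVM code):

#include <algorithm>
#include <cstdint>

// v_pk_add_u16 with DSTCLAMP.ENABLE: each 16-bit lane clamps to 0xFFFF.
uint32_t pkAddU16Clamp(uint32_t a, uint32_t b) {
  uint32_t r = 0;
  for (int lane = 0; lane < 2; ++lane) {
    uint32_t la = (a >> (16 * lane)) & 0xFFFFu;
    uint32_t lb = (b >> (16 * lane)) & 0xFFFFu;
    r |= std::min<uint32_t>(la + lb, 0xFFFFu) << (16 * lane);
  }
  return r;
}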
multiclass MadFmaMixPats<SDPatternOperator fma_like,
Instruction mix_inst,
Instruction mixlo_inst,
@@ -211,7 +226,7 @@ foreach Type = ["I", "U"] in
foreach Index = 0-3 in {
// Defines patterns that extract each Index'ed 8bit from an unsigned
// 32bit scalar value;
- def Type#Index#"_8bit" : Extract<!shl(Index, 3), 255, !if (!eq (Type, "U"), 1, 0)>;
+ def Type#Index#"_8bit" : Extract<!shl(Index, 3), 255, !eq (Type, "U")>;
// Defines multiplication patterns where the multiplication is happening on each
// Index'ed 8bit of a 32bit scalar value.
@@ -239,7 +254,7 @@ foreach Type = ["I", "U"] in
foreach Index = 0-7 in {
// Defines patterns that extract each Index'ed 4bit from an unsigned
// 32bit scalar value;
- def Type#Index#"_4bit" : Extract<!shl(Index, 2), 15, !if (!eq (Type, "U"), 1, 0)>;
+ def Type#Index#"_4bit" : Extract<!shl(Index, 2), 15, !eq (Type, "U")>;
// Defines multiplication patterns where the multiplication is happening on each
// Index'ed 4bit of a 32bit scalar value.
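The Extract helpers above encode the field position as a shift (Index*8 for the 8-bit case, Index*4 for the 4-bit case) plus a mask (255 or 15), with the third parameter, now simply !eq(Type, "U"), selecting zero- versus sign-extension. A small illustrative C++ equivalent (not the TableGen expansion itself):

#include <cstdint>

// bits = 8 with mask 255, or bits = 4 with mask 15; isUnsigned mirrors !eq(Type, "U").
int32_t extractField(uint32_t v, unsigned index, unsigned bits, bool isUnsigned) {
  uint32_t mask = (1u << bits) - 1u;
  uint32_t field = (v >> (index * bits)) & mask;    // !shl(Index, 3) == Index * 8
  if (!isUnsigned && (field & (1u << (bits - 1))))  // "I" variants sign-extend
    return (int32_t)field - (int32_t)(1u << bits);
  return (int32_t)field;
}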
@@ -347,7 +362,6 @@ class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC
let Src2RC64 = _SrcRC;
let HasOpSel = 0;
let HasClamp = 0;
- let HasModifiers = 0;
let Asm64 = " $vdst, $src0, $src1, $src2$cbsz$abid$blgp";
let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp);
}
@@ -368,34 +382,34 @@ def VOPProfileMAI_F32_V4F16_X32 : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, A
let Predicates = [HasMAIInsts] in {
let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
-def V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>;
-def V_ACCVGPR_WRITE_B32 : VOP3Inst<"v_accvgpr_write_b32", VOPProfileAccWrite> {
- let isMoveImm = 1;
-}
-}
+ defm V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>;
+ let isMoveImm = 1 in {
+ defm V_ACCVGPR_WRITE_B32 : VOP3Inst<"v_accvgpr_write_b32", VOPProfileAccWrite>;
+ } // End isMoveImm = 1
+} // End isAsCheapAsAMove = 1, isReMaterializable = 1
// FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
-def V_MFMA_F32_4X4X1F32 : VOP3Inst<"v_mfma_f32_4x4x1f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_4x4x1f32>;
-def V_MFMA_F32_4X4X4F16 : VOP3Inst<"v_mfma_f32_4x4x4f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_4x4x4f16>;
-def V_MFMA_I32_4X4X4I8 : VOP3Inst<"v_mfma_i32_4x4x4i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_4x4x4i8>;
-def V_MFMA_F32_4X4X2BF16 : VOP3Inst<"v_mfma_f32_4x4x2bf16", VOPProfileMAI_F32_V2I16_X4, int_amdgcn_mfma_f32_4x4x2bf16>;
-def V_MFMA_F32_16X16X1F32 : VOP3Inst<"v_mfma_f32_16x16x1f32", VOPProfileMAI_F32_F32_X16, int_amdgcn_mfma_f32_16x16x1f32>;
-def V_MFMA_F32_16X16X4F32 : VOP3Inst<"v_mfma_f32_16x16x4f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_16x16x4f32>;
-def V_MFMA_F32_16X16X4F16 : VOP3Inst<"v_mfma_f32_16x16x4f16", VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_16x16x4f16>;
-def V_MFMA_F32_16X16X16F16 : VOP3Inst<"v_mfma_f32_16x16x16f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_16x16x16f16>;
-def V_MFMA_I32_16X16X4I8 : VOP3Inst<"v_mfma_i32_16x16x4i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_16x16x4i8>;
-def V_MFMA_I32_16X16X16I8 : VOP3Inst<"v_mfma_i32_16x16x16i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_16x16x16i8>;
-def V_MFMA_F32_16X16X2BF16 : VOP3Inst<"v_mfma_f32_16x16x2bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_16x16x2bf16>;
-def V_MFMA_F32_16X16X8BF16 : VOP3Inst<"v_mfma_f32_16x16x8bf16", VOPProfileMAI_F32_V2I16_X4, int_amdgcn_mfma_f32_16x16x8bf16>;
-def V_MFMA_F32_32X32X1F32 : VOP3Inst<"v_mfma_f32_32x32x1f32", VOPProfileMAI_F32_F32_X32, int_amdgcn_mfma_f32_32x32x1f32>;
-def V_MFMA_F32_32X32X2F32 : VOP3Inst<"v_mfma_f32_32x32x2f32", VOPProfileMAI_F32_F32_X16, int_amdgcn_mfma_f32_32x32x2f32>;
-def V_MFMA_F32_32X32X4F16 : VOP3Inst<"v_mfma_f32_32x32x4f16", VOPProfileMAI_F32_V4F16_X32, int_amdgcn_mfma_f32_32x32x4f16>;
-def V_MFMA_F32_32X32X8F16 : VOP3Inst<"v_mfma_f32_32x32x8f16", VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_32x32x8f16>;
-def V_MFMA_I32_32X32X4I8 : VOP3Inst<"v_mfma_i32_32x32x4i8", VOPProfileMAI_I32_I32_X32, int_amdgcn_mfma_i32_32x32x4i8>;
-def V_MFMA_I32_32X32X8I8 : VOP3Inst<"v_mfma_i32_32x32x8i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_32x32x8i8>;
-def V_MFMA_F32_32X32X2BF16 : VOP3Inst<"v_mfma_f32_32x32x2bf16", VOPProfileMAI_F32_V2I16_X32, int_amdgcn_mfma_f32_32x32x2bf16>;
-def V_MFMA_F32_32X32X4BF16 : VOP3Inst<"v_mfma_f32_32x32x4bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_32x32x4bf16>;
+defm V_MFMA_F32_4X4X1F32 : VOP3Inst<"v_mfma_f32_4x4x1f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_4x4x1f32>;
+defm V_MFMA_F32_4X4X4F16 : VOP3Inst<"v_mfma_f32_4x4x4f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_4x4x4f16>;
+defm V_MFMA_I32_4X4X4I8 : VOP3Inst<"v_mfma_i32_4x4x4i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_4x4x4i8>;
+defm V_MFMA_F32_4X4X2BF16 : VOP3Inst<"v_mfma_f32_4x4x2bf16", VOPProfileMAI_F32_V2I16_X4, int_amdgcn_mfma_f32_4x4x2bf16>;
+defm V_MFMA_F32_16X16X1F32 : VOP3Inst<"v_mfma_f32_16x16x1f32", VOPProfileMAI_F32_F32_X16, int_amdgcn_mfma_f32_16x16x1f32>;
+defm V_MFMA_F32_16X16X4F32 : VOP3Inst<"v_mfma_f32_16x16x4f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_16x16x4f32>;
+defm V_MFMA_F32_16X16X4F16 : VOP3Inst<"v_mfma_f32_16x16x4f16", VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_16x16x4f16>;
+defm V_MFMA_F32_16X16X16F16 : VOP3Inst<"v_mfma_f32_16x16x16f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_16x16x16f16>;
+defm V_MFMA_I32_16X16X4I8 : VOP3Inst<"v_mfma_i32_16x16x4i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_16x16x4i8>;
+defm V_MFMA_I32_16X16X16I8 : VOP3Inst<"v_mfma_i32_16x16x16i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_16x16x16i8>;
+defm V_MFMA_F32_16X16X2BF16 : VOP3Inst<"v_mfma_f32_16x16x2bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_16x16x2bf16>;
+defm V_MFMA_F32_16X16X8BF16 : VOP3Inst<"v_mfma_f32_16x16x8bf16", VOPProfileMAI_F32_V2I16_X4, int_amdgcn_mfma_f32_16x16x8bf16>;
+defm V_MFMA_F32_32X32X1F32 : VOP3Inst<"v_mfma_f32_32x32x1f32", VOPProfileMAI_F32_F32_X32, int_amdgcn_mfma_f32_32x32x1f32>;
+defm V_MFMA_F32_32X32X2F32 : VOP3Inst<"v_mfma_f32_32x32x2f32", VOPProfileMAI_F32_F32_X16, int_amdgcn_mfma_f32_32x32x2f32>;
+defm V_MFMA_F32_32X32X4F16 : VOP3Inst<"v_mfma_f32_32x32x4f16", VOPProfileMAI_F32_V4F16_X32, int_amdgcn_mfma_f32_32x32x4f16>;
+defm V_MFMA_F32_32X32X8F16 : VOP3Inst<"v_mfma_f32_32x32x8f16", VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_32x32x8f16>;
+defm V_MFMA_I32_32X32X4I8 : VOP3Inst<"v_mfma_i32_32x32x4i8", VOPProfileMAI_I32_I32_X32, int_amdgcn_mfma_i32_32x32x4i8>;
+defm V_MFMA_I32_32X32X8I8 : VOP3Inst<"v_mfma_i32_32x32x8i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_32x32x8i8>;
+defm V_MFMA_F32_32X32X2BF16 : VOP3Inst<"v_mfma_f32_32x32x2bf16", VOPProfileMAI_F32_V2I16_X32, int_amdgcn_mfma_f32_32x32x2bf16>;
+defm V_MFMA_F32_32X32X4BF16 : VOP3Inst<"v_mfma_f32_32x32x4bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_32x32x4bf16>;
} // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1
} // End SubtargetPredicate = HasMAIInsts
@@ -403,7 +417,15 @@ def V_MFMA_F32_32X32X4BF16 : VOP3Inst<"v_mfma_f32_32x32x4bf16", VOPProfileMAI_F3
def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;
def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">;
-multiclass VOP3P_Real_vi<bits<10> op> {
+//===----------------------------------------------------------------------===//
+// Begin Real Encodings
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// GFX8 (VI)
+//===----------------------------------------------------------------------===//
+
+multiclass VOP3P_Real_vi<bits<7> op> {
def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
VOP3Pe <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
let AssemblerPredicate = HasVOP3PInsts;
@@ -411,40 +433,51 @@ multiclass VOP3P_Real_vi<bits<10> op> {
}
}
-multiclass VOP3P_Real_MAI<bits<10> op> {
- def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
- VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
+multiclass VOP3P_Real_MAI<bits<7> op> {
+ def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+ VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ let AssemblerPredicate = HasMAIInsts;
+ let DecoderNamespace = "GFX8";
+ let Inst{14} = 1; // op_sel_hi(2) default value
+ let Inst{59} = 1; // op_sel_hi(0) default value
+ let Inst{60} = 1; // op_sel_hi(1) default value
+ }
+}
+
+multiclass VOP3P_Real_MFMA<bits<7> op> {
+ def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+ VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
let AssemblerPredicate = HasMAIInsts;
let DecoderNamespace = "GFX8";
}
}
-defm V_PK_MAD_I16 : VOP3P_Real_vi <0x380>;
-defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x381>;
-defm V_PK_ADD_I16 : VOP3P_Real_vi <0x382>;
-defm V_PK_SUB_I16 : VOP3P_Real_vi <0x383>;
-defm V_PK_LSHLREV_B16 : VOP3P_Real_vi <0x384>;
-defm V_PK_LSHRREV_B16 : VOP3P_Real_vi <0x385>;
-defm V_PK_ASHRREV_I16 : VOP3P_Real_vi <0x386>;
-defm V_PK_MAX_I16 : VOP3P_Real_vi <0x387>;
-defm V_PK_MIN_I16 : VOP3P_Real_vi <0x388>;
-defm V_PK_MAD_U16 : VOP3P_Real_vi <0x389>;
-
-defm V_PK_ADD_U16 : VOP3P_Real_vi <0x38a>;
-defm V_PK_SUB_U16 : VOP3P_Real_vi <0x38b>;
-defm V_PK_MAX_U16 : VOP3P_Real_vi <0x38c>;
-defm V_PK_MIN_U16 : VOP3P_Real_vi <0x38d>;
-defm V_PK_FMA_F16 : VOP3P_Real_vi <0x38e>;
-defm V_PK_ADD_F16 : VOP3P_Real_vi <0x38f>;
-defm V_PK_MUL_F16 : VOP3P_Real_vi <0x390>;
-defm V_PK_MIN_F16 : VOP3P_Real_vi <0x391>;
-defm V_PK_MAX_F16 : VOP3P_Real_vi <0x392>;
+defm V_PK_MAD_I16 : VOP3P_Real_vi <0x00>;
+defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x01>;
+defm V_PK_ADD_I16 : VOP3P_Real_vi <0x02>;
+defm V_PK_SUB_I16 : VOP3P_Real_vi <0x03>;
+defm V_PK_LSHLREV_B16 : VOP3P_Real_vi <0x04>;
+defm V_PK_LSHRREV_B16 : VOP3P_Real_vi <0x05>;
+defm V_PK_ASHRREV_I16 : VOP3P_Real_vi <0x06>;
+defm V_PK_MAX_I16 : VOP3P_Real_vi <0x07>;
+defm V_PK_MIN_I16 : VOP3P_Real_vi <0x08>;
+defm V_PK_MAD_U16 : VOP3P_Real_vi <0x09>;
+
+defm V_PK_ADD_U16 : VOP3P_Real_vi <0x0a>;
+defm V_PK_SUB_U16 : VOP3P_Real_vi <0x0b>;
+defm V_PK_MAX_U16 : VOP3P_Real_vi <0x0c>;
+defm V_PK_MIN_U16 : VOP3P_Real_vi <0x0d>;
+defm V_PK_FMA_F16 : VOP3P_Real_vi <0x0e>;
+defm V_PK_ADD_F16 : VOP3P_Real_vi <0x0f>;
+defm V_PK_MUL_F16 : VOP3P_Real_vi <0x10>;
+defm V_PK_MIN_F16 : VOP3P_Real_vi <0x11>;
+defm V_PK_MAX_F16 : VOP3P_Real_vi <0x12>;
let SubtargetPredicate = HasMadMixInsts in {
-defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x3a0>;
-defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
-defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
+defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x20>;
+defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x21>;
+defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x22>;
}
let SubtargetPredicate = HasFmaMixInsts in {
@@ -452,54 +485,54 @@ let DecoderNamespace = "GFX9_DL" in {
// The mad_mix instructions were renamed and their behaviors changed,
// but the opcode stayed the same so we need to put these in a
// different DecoderNamespace to avoid the ambiguity.
-defm V_FMA_MIX_F32 : VOP3P_Real_vi <0x3a0>;
-defm V_FMA_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
-defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
+defm V_FMA_MIX_F32 : VOP3P_Real_vi <0x20>;
+defm V_FMA_MIXLO_F16 : VOP3P_Real_vi <0x21>;
+defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x22>;
}
}
let SubtargetPredicate = HasDot2Insts in {
-defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x3a3>;
-defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x3a6>;
-defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x3a7>;
-defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x3a9>;
-defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x3ab>;
+defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x23>;
+defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x26>;
+defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x27>;
+defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x29>;
+defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x2b>;
} // End SubtargetPredicate = HasDot2Insts
let SubtargetPredicate = HasDot1Insts in {
-defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x3a8>;
-defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x3aa>;
+defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x28>;
+defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x2a>;
} // End SubtargetPredicate = HasDot1Insts
let SubtargetPredicate = HasMAIInsts in {
-defm V_ACCVGPR_READ_B32 : VOP3P_Real_MAI <0x3d8>;
-defm V_ACCVGPR_WRITE_B32 : VOP3P_Real_MAI <0x3d9>;
-defm V_MFMA_F32_32X32X1F32 : VOP3P_Real_MAI <0x3c0>;
-defm V_MFMA_F32_16X16X1F32 : VOP3P_Real_MAI <0x3c1>;
-defm V_MFMA_F32_4X4X1F32 : VOP3P_Real_MAI <0x3c2>;
-defm V_MFMA_F32_32X32X2F32 : VOP3P_Real_MAI <0x3c4>;
-defm V_MFMA_F32_16X16X4F32 : VOP3P_Real_MAI <0x3c5>;
-defm V_MFMA_F32_32X32X4F16 : VOP3P_Real_MAI <0x3c8>;
-defm V_MFMA_F32_16X16X4F16 : VOP3P_Real_MAI <0x3c9>;
-defm V_MFMA_F32_4X4X4F16 : VOP3P_Real_MAI <0x3ca>;
-defm V_MFMA_F32_32X32X8F16 : VOP3P_Real_MAI <0x3cc>;
-defm V_MFMA_F32_16X16X16F16 : VOP3P_Real_MAI <0x3cd>;
-defm V_MFMA_I32_32X32X4I8 : VOP3P_Real_MAI <0x3d0>;
-defm V_MFMA_I32_16X16X4I8 : VOP3P_Real_MAI <0x3d1>;
-defm V_MFMA_I32_4X4X4I8 : VOP3P_Real_MAI <0x3d2>;
-defm V_MFMA_I32_32X32X8I8 : VOP3P_Real_MAI <0x3d4>;
-defm V_MFMA_I32_16X16X16I8 : VOP3P_Real_MAI <0x3d5>;
-defm V_MFMA_F32_32X32X2BF16 : VOP3P_Real_MAI <0x3e8>;
-defm V_MFMA_F32_16X16X2BF16 : VOP3P_Real_MAI <0x3e9>;
-defm V_MFMA_F32_4X4X2BF16 : VOP3P_Real_MAI <0x3eb>;
-defm V_MFMA_F32_32X32X4BF16 : VOP3P_Real_MAI <0x3ec>;
-defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MAI <0x3ed>;
+defm V_ACCVGPR_READ_B32 : VOP3P_Real_MAI <0x58>;
+defm V_ACCVGPR_WRITE_B32 : VOP3P_Real_MAI <0x59>;
+defm V_MFMA_F32_32X32X1F32 : VOP3P_Real_MFMA <0x40>;
+defm V_MFMA_F32_16X16X1F32 : VOP3P_Real_MFMA <0x41>;
+defm V_MFMA_F32_4X4X1F32 : VOP3P_Real_MFMA <0x42>;
+defm V_MFMA_F32_32X32X2F32 : VOP3P_Real_MFMA <0x44>;
+defm V_MFMA_F32_16X16X4F32 : VOP3P_Real_MFMA <0x45>;
+defm V_MFMA_F32_32X32X4F16 : VOP3P_Real_MFMA <0x48>;
+defm V_MFMA_F32_16X16X4F16 : VOP3P_Real_MFMA <0x49>;
+defm V_MFMA_F32_4X4X4F16 : VOP3P_Real_MFMA <0x4a>;
+defm V_MFMA_F32_32X32X8F16 : VOP3P_Real_MFMA <0x4c>;
+defm V_MFMA_F32_16X16X16F16 : VOP3P_Real_MFMA <0x4d>;
+defm V_MFMA_I32_32X32X4I8 : VOP3P_Real_MFMA <0x50>;
+defm V_MFMA_I32_16X16X4I8 : VOP3P_Real_MFMA <0x51>;
+defm V_MFMA_I32_4X4X4I8 : VOP3P_Real_MFMA <0x52>;
+defm V_MFMA_I32_16X16X16I8 : VOP3P_Real_MFMA <0x55>;
+defm V_MFMA_I32_32X32X8I8 : VOP3P_Real_MFMA <0x54>;
+defm V_MFMA_F32_32X32X2BF16 : VOP3P_Real_MFMA <0x68>;
+defm V_MFMA_F32_16X16X2BF16 : VOP3P_Real_MFMA <0x69>;
+defm V_MFMA_F32_4X4X2BF16 : VOP3P_Real_MFMA <0x6b>;
+defm V_MFMA_F32_32X32X4BF16 : VOP3P_Real_MFMA <0x6c>;
+defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MFMA <0x6d>;
} // End SubtargetPredicate = HasMAIInsts
@@ -508,48 +541,48 @@ defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MAI <0x3ed>;
//===----------------------------------------------------------------------===//
let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
- multiclass VOP3P_Real_gfx10<bits<10> op> {
+ multiclass VOP3P_Real_gfx10<bits<7> op> {
def _gfx10 : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.GFX10>,
VOP3Pe_gfx10 <op, !cast<VOP3P_Pseudo>(NAME).Pfl>;
}
} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
-defm V_PK_MAD_I16 : VOP3P_Real_gfx10<0x000>;
-defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10<0x001>;
-defm V_PK_ADD_I16 : VOP3P_Real_gfx10<0x002>;
-defm V_PK_SUB_I16 : VOP3P_Real_gfx10<0x003>;
-defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10<0x004>;
-defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10<0x005>;
-defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10<0x006>;
-defm V_PK_MAX_I16 : VOP3P_Real_gfx10<0x007>;
-defm V_PK_MIN_I16 : VOP3P_Real_gfx10<0x008>;
-defm V_PK_MAD_U16 : VOP3P_Real_gfx10<0x009>;
-defm V_PK_ADD_U16 : VOP3P_Real_gfx10<0x00a>;
-defm V_PK_SUB_U16 : VOP3P_Real_gfx10<0x00b>;
-defm V_PK_MAX_U16 : VOP3P_Real_gfx10<0x00c>;
-defm V_PK_MIN_U16 : VOP3P_Real_gfx10<0x00d>;
-defm V_PK_FMA_F16 : VOP3P_Real_gfx10<0x00e>;
-defm V_PK_ADD_F16 : VOP3P_Real_gfx10<0x00f>;
-defm V_PK_MUL_F16 : VOP3P_Real_gfx10<0x010>;
-defm V_PK_MIN_F16 : VOP3P_Real_gfx10<0x011>;
-defm V_PK_MAX_F16 : VOP3P_Real_gfx10<0x012>;
-defm V_FMA_MIX_F32 : VOP3P_Real_gfx10<0x020>;
-defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10<0x021>;
-defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10<0x022>;
+defm V_PK_MAD_I16 : VOP3P_Real_gfx10<0x00>;
+defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10<0x01>;
+defm V_PK_ADD_I16 : VOP3P_Real_gfx10<0x02>;
+defm V_PK_SUB_I16 : VOP3P_Real_gfx10<0x03>;
+defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10<0x04>;
+defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10<0x05>;
+defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10<0x06>;
+defm V_PK_MAX_I16 : VOP3P_Real_gfx10<0x07>;
+defm V_PK_MIN_I16 : VOP3P_Real_gfx10<0x08>;
+defm V_PK_MAD_U16 : VOP3P_Real_gfx10<0x09>;
+defm V_PK_ADD_U16 : VOP3P_Real_gfx10<0x0a>;
+defm V_PK_SUB_U16 : VOP3P_Real_gfx10<0x0b>;
+defm V_PK_MAX_U16 : VOP3P_Real_gfx10<0x0c>;
+defm V_PK_MIN_U16 : VOP3P_Real_gfx10<0x0d>;
+defm V_PK_FMA_F16 : VOP3P_Real_gfx10<0x0e>;
+defm V_PK_ADD_F16 : VOP3P_Real_gfx10<0x0f>;
+defm V_PK_MUL_F16 : VOP3P_Real_gfx10<0x10>;
+defm V_PK_MIN_F16 : VOP3P_Real_gfx10<0x11>;
+defm V_PK_MAX_F16 : VOP3P_Real_gfx10<0x12>;
+defm V_FMA_MIX_F32 : VOP3P_Real_gfx10<0x20>;
+defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10<0x21>;
+defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10<0x22>;
let SubtargetPredicate = HasDot2Insts in {
-defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x013>;
-defm V_DOT2_I32_I16 : VOP3P_Real_gfx10 <0x014>;
-defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x015>;
-defm V_DOT4_U32_U8 : VOP3P_Real_gfx10 <0x017>;
-defm V_DOT8_U32_U4 : VOP3P_Real_gfx10 <0x019>;
+defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x13>;
+defm V_DOT2_I32_I16 : VOP3P_Real_gfx10 <0x14>;
+defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x15>;
+defm V_DOT4_U32_U8 : VOP3P_Real_gfx10 <0x17>;
+defm V_DOT8_U32_U4 : VOP3P_Real_gfx10 <0x19>;
} // End SubtargetPredicate = HasDot2Insts
let SubtargetPredicate = HasDot1Insts in {
-defm V_DOT4_I32_I8 : VOP3P_Real_gfx10 <0x016>;
-defm V_DOT8_I32_I4 : VOP3P_Real_gfx10 <0x018>;
+defm V_DOT4_I32_I8 : VOP3P_Real_gfx10 <0x16>;
+defm V_DOT8_I32_I4 : VOP3P_Real_gfx10 <0x18>;
} // End SubtargetPredicate = HasDot1Insts
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index aa2fa260e7b5..99599c5cd667 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -229,7 +229,7 @@ multiclass VOPC_Pseudos <string opName,
foreach _ = BoolToList<P.HasExtSDWA>.ret in
def _sdwa : VOPC_SDWA_Pseudo <opName, P> {
- let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+ let Defs = !if(DefExec, [EXEC], []);
let SchedRW = P.Schedule;
let isConvergent = DefExec;
let isCompare = 1;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td
index f8a83e5f74c0..282c1002d3c9 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -8,6 +8,7 @@
// dummies for outer let
class LetDummies {
+ bit TRANS;
bit ReadsModeReg;
bit mayRaiseFPException;
bit isCommutable;
@@ -69,7 +70,7 @@ class VOP3Common <dag outs, dag ins, string asm = "",
let VOP3 = 1;
let AsmVariantName = AMDGPUAsmVariants.VOP3;
- let AsmMatchConverter = !if(!eq(HasMods,1), "cvtVOP3", "");
+ let AsmMatchConverter = !if(HasMods, "cvtVOP3", "");
let isCodeGenOnly = 0;
@@ -129,7 +130,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
let AsmMatchConverter =
!if(isVOP3P,
"cvtVOP3P",
- !if(!or(P.HasModifiers, !or(P.HasOMod, P.HasIntClamp)),
+ !if(!or(P.HasModifiers, P.HasOMod, P.HasIntClamp),
"cvtVOP3",
""));
}
@@ -296,7 +297,7 @@ class VOP3be <VOPProfile P> : Enc64 {
let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0);
}
-class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 {
+class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 {
bits<8> vdst;
// neg, neg_hi, op_sel put in srcN_modifiers
bits<4> src0_modifiers;
@@ -320,8 +321,8 @@ class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 {
let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
- let Inst{25-16} = op;
- let Inst{31-26} = 0x34; //encoding
+ let Inst{22-16} = op;
+ let Inst{31-23} = 0x1a7; //encoding
let Inst{40-32} = !if(P.HasSrc0, src0, 0);
let Inst{49-41} = !if(P.HasSrc1, src1, 0);
let Inst{58-50} = !if(P.HasSrc2, src2, 0);
@@ -332,7 +333,7 @@ class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 {
let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo)
}
-class VOP3Pe_MAI <bits<10> op, VOPProfile P> : Enc64 {
+class VOP3Pe_MAI <bits<7> op, VOPProfile P> : Enc64 {
bits<8> vdst;
bits<10> src0;
bits<10> src1;
@@ -349,8 +350,8 @@ class VOP3Pe_MAI <bits<10> op, VOPProfile P> : Enc64 {
let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
- let Inst{25-16} = op;
- let Inst{31-26} = 0x34; //encoding
+ let Inst{22-16} = op;
+ let Inst{31-23} = 0x1a7; //encoding
let Inst{40-32} = !if(P.HasSrc0, src0{8-0}, 0);
let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0);
let Inst{58-50} = !if(P.HasSrc2, src2, 0);
@@ -362,8 +363,8 @@ class VOP3Pe_MAI <bits<10> op, VOPProfile P> : Enc64 {
}
-class VOP3Pe_gfx10 <bits<10> op, VOPProfile P> : VOP3Pe<op, P> {
- let Inst{31-26} = 0x33; //encoding
+class VOP3Pe_gfx10 <bits<7> op, VOPProfile P> : VOP3Pe<op, P> {
+ let Inst{31-23} = 0x198; //encoding
}
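The VOP3Pe changes above shrink the opcode field from ten bits to seven and widen the fixed encoding field to compensate; the three high opcode bits were constant across the VOP3P opcodes, so folding them into the encoding constant leaves every instruction word unchanged, which is also why the opcodes in VOP3PInstructions.td above were renumbered (0x380 becomes 0x00, 0x3a0 becomes 0x20, and so on). A small compile-time sketch of that equivalence, modelling only the low instruction word with the bit positions shown above (assumed values, not LLVM code):

#include <cstdint>

constexpr uint32_t oldWordVI(uint32_t op10)  { return (0x34u << 26) | (op10 << 16); }
constexpr uint32_t newWordVI(uint32_t op7)   { return (0x1A7u << 23) | (op7 << 16); }
constexpr uint32_t oldWordG10(uint32_t op10) { return (0x33u << 26) | (op10 << 16); }
constexpr uint32_t newWordG10(uint32_t op7)  { return (0x198u << 23) | (op7 << 16); }

static_assert(oldWordVI(0x380) == newWordVI(0x00), "v_pk_mad_i16 (gfx8/9)");
static_assert(oldWordVI(0x3a0) == newWordVI(0x20), "v_mad_mix_f32 (gfx8/9)");
static_assert(oldWordVI(0x3d8) == newWordVI(0x58), "v_accvgpr_read_b32");
static_assert(oldWordG10(0x013) == newWordG10(0x13), "v_dot2_f32_f16 (gfx10)");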
class VOP3be_gfx6_gfx7<bits<9> op, VOPProfile p> : VOP3be<p> {
@@ -626,7 +627,7 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
string Mnemonic = OpName;
string AsmOperands = P.AsmDPP;
- let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", "");
+ let AsmMatchConverter = !if(P.HasModifiers, "cvtDPP", "");
let SubtargetPredicate = HasDPP;
let AssemblerPredicate = HasDPP;
let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
@@ -681,7 +682,7 @@ class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16,
let DPP = 1;
let Size = 8;
- let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", "");
+ let AsmMatchConverter = !if(P.HasModifiers, "cvtDPP", "");
let SubtargetPredicate = HasDPP;
let AssemblerPredicate = HasDPP;
let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
@@ -776,6 +777,19 @@ class DivergentFragOrOp<SDPatternOperator Op, VOPProfile P> {
!if(!isa<SDNode>(Op), getDivergentFrag<Op>.ret, Op), Op);
}
+class getVSrcOp<ValueType vt> {
+ RegisterOperand ret = !if(!eq(vt.Size, 32), VSrc_b32, VSrc_b16);
+}
+
+// Class for binary integer operations with the clamp bit set for saturation
+// TODO: Add sub with negated inline constant pattern.
+class VOPBinOpClampPat<SDPatternOperator node, Instruction inst, ValueType vt> :
+ GCNPat<(node vt:$src0, vt:$src1),
+ (inst getVSrcOp<vt>.ret:$src0, getVSrcOp<vt>.ret:$src1,
+ DSTCLAMP.ENABLE)
+>;
+
+
include "VOPCInstructions.td"
include "VOP1Instructions.td"
include "VOP2Instructions.td"
diff --git a/contrib/llvm-project/llvm/lib/Target/ARC/ARCISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARC/ARCISelLowering.cpp
index 4a6510f10eeb..ca33f5297471 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARC/ARCISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARC/ARCISelLowering.cpp
@@ -22,7 +22,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Intrinsics.h"
diff --git a/contrib/llvm-project/llvm/lib/Target/ARC/ARCSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/ARC/ARCSubtarget.cpp
index bce2dbd2eaa6..409dd2a98ab4 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARC/ARCSubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARC/ARCSubtarget.cpp
@@ -26,5 +26,5 @@ void ARCSubtarget::anchor() {}
ARCSubtarget::ARCSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const TargetMachine &TM)
- : ARCGenSubtargetInfo(TT, CPU, FS), FrameLowering(*this),
+ : ARCGenSubtargetInfo(TT, CPU, /*TuneCPU=*/CPU, FS), FrameLowering(*this),
TLInfo(TM, *this) {}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARC/ARCSubtarget.h b/contrib/llvm-project/llvm/lib/Target/ARC/ARCSubtarget.h
index 0be797f753d5..1f1b27f13f68 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARC/ARCSubtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARC/ARCSubtarget.h
@@ -43,7 +43,7 @@ public:
/// Parses features string setting specified subtarget options.
/// Definition of function is auto generated by tblgen.
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
const ARCInstrInfo *getInstrInfo() const override { return &InstrInfo; }
const ARCFrameLowering *getFrameLowering() const override {
diff --git a/contrib/llvm-project/llvm/lib/Target/ARC/ARCTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/ARC/ARCTargetMachine.cpp
index 4a5b6fd4d5bf..b8c8949e18dd 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARC/ARCTargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARC/ARCTargetMachine.cpp
@@ -21,9 +21,7 @@
using namespace llvm;
static Reloc::Model getRelocModel(Optional<Reloc::Model> RM) {
- if (!RM.hasValue())
- return Reloc::Static;
- return *RM;
+ return RM.getValueOr(Reloc::Static);
}
/// ARCTargetMachine ctor - Create an ILP32 architecture model
diff --git a/contrib/llvm-project/llvm/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h
index 266f2de08772..f6f8f9d089df 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h
@@ -26,6 +26,7 @@ public:
: MCInstPrinter(MAI, MII, MRI) {}
// Autogenerated by tblgen.
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
diff --git a/contrib/llvm-project/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp
index 3e3613ccb90f..358ee6002f80 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp
@@ -48,7 +48,7 @@ static MCRegisterInfo *createARCMCRegisterInfo(const Triple &TT) {
static MCSubtargetInfo *createARCMCSubtargetInfo(const Triple &TT,
StringRef CPU, StringRef FS) {
- return createARCMCSubtargetInfoImpl(TT, CPU, FS);
+ return createARCMCSubtargetInfoImpl(TT, CPU, /*TuneCPU=*/CPU, FS);
}
static MCAsmInfo *createARCMCAsmInfo(const MCRegisterInfo &MRI,
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/A15SDOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/A15SDOptimizer.cpp
index f8a86a70c077..bb81233cf803 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/A15SDOptimizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/A15SDOptimizer.cpp
@@ -359,8 +359,7 @@ void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI,
SmallVector<MachineInstr *, 8> Front;
Front.push_back(MI);
while (Front.size() != 0) {
- MI = Front.back();
- Front.pop_back();
+ MI = Front.pop_back_val();
// If we have already explored this MachineInstr, ignore it.
if (Reached.find(MI) != Reached.end())
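The A15SDOptimizer change above folds back() plus pop_back() into SmallVector::pop_back_val(), which removes and returns the last element in one call. A hedged sketch of the general worklist idiom it belongs to (illustrative, not the pass's actual code):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"

template <typename NodeT, typename VisitFn>
void visitReachable(NodeT *Root, VisitFn Visit) {
  llvm::SmallVector<NodeT *, 8> Worklist;
  llvm::SmallPtrSet<NodeT *, 8> Reached;
  Worklist.push_back(Root);
  while (!Worklist.empty()) {
    NodeT *N = Worklist.pop_back_val();      // back() + pop_back() in one step
    if (!Reached.insert(N).second)
      continue;                              // already explored, ignore it
    Visit(N, Worklist);                      // visitor pushes successors
  }
}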
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h
index 7398968bb24a..f4fdc9803728 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h
@@ -37,6 +37,7 @@ class PassRegistry;
Pass *createMVETailPredicationPass();
FunctionPass *createARMLowOverheadLoopsPass();
+FunctionPass *createARMBlockPlacementPass();
Pass *createARMParallelDSPPass();
FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
CodeGenOpt::Level OptLevel);
@@ -55,6 +56,8 @@ InstructionSelector *
createARMInstructionSelector(const ARMBaseTargetMachine &TM, const ARMSubtarget &STI,
const ARMRegisterBankInfo &RBI);
Pass *createMVEGatherScatterLoweringPass();
+FunctionPass *createARMSLSHardeningPass();
+FunctionPass *createARMIndirectThunks();
void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
ARMAsmPrinter &AP);
@@ -69,8 +72,10 @@ void initializeThumb2ITBlockPass(PassRegistry &);
void initializeMVEVPTBlockPass(PassRegistry &);
void initializeMVEVPTOptimisationsPass(PassRegistry &);
void initializeARMLowOverheadLoopsPass(PassRegistry &);
+void initializeARMBlockPlacementPass(PassRegistry &);
void initializeMVETailPredicationPass(PassRegistry &);
void initializeMVEGatherScatterLoweringPass(PassRegistry &);
+void initializeARMSLSHardeningPass(PassRegistry &);
} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td
index 0468f7f1cf8e..3d0a0bf7f8c3 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td
@@ -535,6 +535,10 @@ def HasV8_6aOps : SubtargetFeature<"v8.6a", "HasV8_6aOps", "true",
[HasV8_5aOps, FeatureBF16,
FeatureMatMulInt8]>;
+def HasV8_7aOps : SubtargetFeature<"v8.7a", "HasV8_7aOps", "true",
+ "Support ARM v8.7a instructions",
+ [HasV8_6aOps]>;
+
def HasV8_1MMainlineOps : SubtargetFeature<
"v8.1m.main", "HasV8_1MMainlineOps", "true",
"Support ARM v8-1M Mainline instructions",
@@ -559,6 +563,20 @@ foreach i = {0-7} in
[HasCDEOps]>;
//===----------------------------------------------------------------------===//
+// Control codegen mitigation against Straight Line Speculation vulnerability.
+//===----------------------------------------------------------------------===//
+
+def FeatureHardenSlsRetBr : SubtargetFeature<"harden-sls-retbr",
+ "HardenSlsRetBr", "true",
+ "Harden against straight line speculation across RETurn and BranchRegister "
+ "instructions">;
+def FeatureHardenSlsBlr : SubtargetFeature<"harden-sls-blr",
+ "HardenSlsBlr", "true",
+ "Harden against straight line speculation across indirect calls">;
+
+
+
+//===----------------------------------------------------------------------===//
// ARM Processor subtarget features.
//
@@ -598,9 +616,14 @@ def ProcA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77",
"Cortex-A77 ARM processors", []>;
def ProcA78 : SubtargetFeature<"cortex-a78", "ARMProcFamily", "CortexA78",
"Cortex-A78 ARM processors", []>;
+def ProcA78C : SubtargetFeature<"a78c", "ARMProcFamily", "CortexA78C",
+ "Cortex-A78C ARM processors", []>;
def ProcX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1",
"Cortex-X1 ARM processors", []>;
+def ProcV1 : SubtargetFeature<"neoverse-v1", "ARMProcFamily",
+ "NeoverseV1", "Neoverse-V1 ARM processors", []>;
+
def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait",
"Qualcomm Krait processors", []>;
def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
@@ -639,7 +662,8 @@ def ProcR52 : SubtargetFeature<"r52", "ARMProcFamily", "CortexR52",
def ProcM3 : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3",
"Cortex-M3 ARM processors", []>;
-
+def ProcM7 : SubtargetFeature<"m7", "ARMProcFamily", "CortexM7",
+ "Cortex-M7 ARM processors", []>;
//===----------------------------------------------------------------------===//
// ARM Helper classes.
@@ -828,6 +852,19 @@ def ARMv86a : Architecture<"armv8.6-a", "ARMv86a", [HasV8_6aOps,
FeatureCRC,
FeatureRAS,
FeatureDotProd]>;
+def ARMv87a : Architecture<"armv8.7-a", "ARMv86a", [HasV8_7aOps,
+ FeatureAClass,
+ FeatureDB,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeatureDSP,
+ FeatureTrustZone,
+ FeatureMP,
+ FeatureVirtualization,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureRAS,
+ FeatureDotProd]>;
def ARMv8r : Architecture<"armv8-r", "ARMv8r", [HasV8Ops,
FeatureRClass,
@@ -882,6 +919,13 @@ def ARMv6j : Architecture<"armv6j", "ARMv7a", [ARMv6]>;
def ARMv7k : Architecture<"armv7k", "ARMv7a", [ARMv7a]>;
def ARMv7s : Architecture<"armv7s", "ARMv7a", [ARMv7a]>;
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "ARMRegisterInfo.td"
+include "ARMRegisterBanks.td"
+include "ARMCallingConv.td"
//===----------------------------------------------------------------------===//
// ARM schedules.
@@ -891,9 +935,27 @@ include "ARMPredicates.td"
include "ARMSchedule.td"
//===----------------------------------------------------------------------===//
-// ARM processors
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "ARMInstrInfo.td"
+def ARMInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// ARM schedules
//
+include "ARMScheduleV6.td"
+include "ARMScheduleA8.td"
+include "ARMScheduleA9.td"
+include "ARMScheduleSwift.td"
+include "ARMScheduleR52.td"
+include "ARMScheduleA57.td"
+include "ARMScheduleM4.td"
+include "ARMScheduleM7.td"
+//===----------------------------------------------------------------------===//
+// ARM processors
+//
// Dummy CPU, used to target architectures
def : ProcessorModel<"generic", CortexA8Model, []>;
@@ -1131,8 +1193,10 @@ def : ProcessorModel<"cortex-m4", CortexM4Model, [ARMv7em,
FeatureUseMISched,
FeatureHasNoBranchPredictor]>;
-def : ProcNoItin<"cortex-m7", [ARMv7em,
- FeatureFPARMv8_D16]>;
+def : ProcessorModel<"cortex-m7", CortexM7Model, [ARMv7em,
+ ProcM7,
+ FeatureFPARMv8_D16,
+ FeatureUseMISched]>;
def : ProcNoItin<"cortex-m23", [ARMv8mBaseline,
FeatureNoMovt]>;
@@ -1246,6 +1310,14 @@ def : ProcNoItin<"cortex-a78", [ARMv82a, ProcA78,
FeatureFullFP16,
FeatureDotProd]>;
+def : ProcNoItin<"cortex-a78c", [ARMv82a, ProcA78C,
+ FeatureHWDivThumb,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureDotProd,
+ FeatureFullFP16]>;
+
def : ProcNoItin<"cortex-x1", [ARMv82a, ProcX1,
FeatureHWDivThumb,
FeatureHWDivARM,
@@ -1254,6 +1326,15 @@ def : ProcNoItin<"cortex-x1", [ARMv82a, ProcX1,
FeatureFullFP16,
FeatureDotProd]>;
+def : ProcNoItin<"neoverse-v1", [ARMv84a,
+ FeatureHWDivThumb,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureFullFP16,
+ FeatureBF16,
+ FeatureMatMulInt8]>;
+
def : ProcNoItin<"neoverse-n1", [ARMv82a,
FeatureHWDivThumb,
FeatureHWDivARM,
@@ -1261,6 +1342,11 @@ def : ProcNoItin<"neoverse-n1", [ARMv82a,
FeatureCRC,
FeatureDotProd]>;
+def : ProcNoItin<"neoverse-n2", [ARMv85a,
+ FeatureBF16,
+ FeatureMatMulInt8,
+ FeaturePerfMon]>;
+
def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift,
FeatureHasRetAddrStack,
FeatureNEONForFP,
@@ -1296,21 +1382,6 @@ def : ProcessorModel<"cortex-r52", CortexR52Model, [ARMv8r, ProcR52,
FeatureFPAO]>;
//===----------------------------------------------------------------------===//
-// Register File Description
-//===----------------------------------------------------------------------===//
-
-include "ARMRegisterInfo.td"
-include "ARMRegisterBanks.td"
-include "ARMCallingConv.td"
-
-//===----------------------------------------------------------------------===//
-// Instruction Descriptions
-//===----------------------------------------------------------------------===//
-
-include "ARMInstrInfo.td"
-def ARMInstrInfo : InstrInfo;
-
-//===----------------------------------------------------------------------===//
// Declare the target which we are implementing
//===----------------------------------------------------------------------===//
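The two harden-sls-* subtarget features added above surface in C++ as ARMSubtarget predicates; later hunks in this commit query them as ST.hardenSlsRetBr() and ST.hardenSlsBlr(). A hedged sketch of that query pattern (the helper name is a placeholder):

    #include "ARMSubtarget.h"
    #include "llvm/CodeGen/MachineFunction.h"

    // True if either SLS mitigation is requested for this function's
    // subtarget; mirrors the checks used in ARMBaseInstrInfo below.
    static bool needsSlsMitigation(const llvm::MachineFunction &MF) {
      const auto &ST = MF.getSubtarget<llvm::ARMSubtarget>();
      return ST.hardenSlsRetBr() || ST.hardenSlsBlr();
    }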
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index d6c1efa6327c..04e21867d571 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -285,7 +285,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
return false;
case 'y': // Print a VFP single precision register as indexed double.
if (MI->getOperand(OpNum).isReg()) {
- Register Reg = MI->getOperand(OpNum).getReg();
+ MCRegister Reg = MI->getOperand(OpNum).getReg().asMCReg();
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
// Find the 'd' register that has this 's' register as a sub-register,
// and determine the lane number.
@@ -903,7 +903,7 @@ void ARMAsmPrinter::emitMachineConstantPoolValue(
MCSymbol *MCSym;
if (ACPV->isLSDA()) {
- MCSym = getCurExceptionSym();
+ MCSym = getMBBExceptionSym(MF->front());
} else if (ACPV->isBlockAddress()) {
const BlockAddress *BA =
cast<ARMConstantPoolConstant>(ACPV)->getBlockAddress();
@@ -1897,7 +1897,7 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
// LSJLJEH:
Register SrcReg = MI->getOperand(0).getReg();
Register ValReg = MI->getOperand(1).getReg();
- MCSymbol *Label = OutContext.createTempSymbol("SJLJEH", false, true);
+ MCSymbol *Label = OutContext.createTempSymbol("SJLJEH");
OutStreamer->AddComment("eh_setjmp begin");
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr)
.addReg(ValReg)
@@ -2180,6 +2180,48 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
case ARM::PATCHABLE_TAIL_CALL:
LowerPATCHABLE_TAIL_CALL(*MI);
return;
+ case ARM::SpeculationBarrierISBDSBEndBB: {
+ // Print DSB SYS + ISB
+ MCInst TmpInstDSB;
+ TmpInstDSB.setOpcode(ARM::DSB);
+ TmpInstDSB.addOperand(MCOperand::createImm(0xf));
+ EmitToStreamer(*OutStreamer, TmpInstDSB);
+ MCInst TmpInstISB;
+ TmpInstISB.setOpcode(ARM::ISB);
+ TmpInstISB.addOperand(MCOperand::createImm(0xf));
+ EmitToStreamer(*OutStreamer, TmpInstISB);
+ return;
+ }
+ case ARM::t2SpeculationBarrierISBDSBEndBB: {
+ // Print DSB SYS + ISB
+ MCInst TmpInstDSB;
+ TmpInstDSB.setOpcode(ARM::t2DSB);
+ TmpInstDSB.addOperand(MCOperand::createImm(0xf));
+ TmpInstDSB.addOperand(MCOperand::createImm(ARMCC::AL));
+ TmpInstDSB.addOperand(MCOperand::createReg(0));
+ EmitToStreamer(*OutStreamer, TmpInstDSB);
+ MCInst TmpInstISB;
+ TmpInstISB.setOpcode(ARM::t2ISB);
+ TmpInstISB.addOperand(MCOperand::createImm(0xf));
+ TmpInstISB.addOperand(MCOperand::createImm(ARMCC::AL));
+ TmpInstISB.addOperand(MCOperand::createReg(0));
+ EmitToStreamer(*OutStreamer, TmpInstISB);
+ return;
+ }
+ case ARM::SpeculationBarrierSBEndBB: {
+ // Print SB
+ MCInst TmpInstSB;
+ TmpInstSB.setOpcode(ARM::SB);
+ EmitToStreamer(*OutStreamer, TmpInstSB);
+ return;
+ }
+ case ARM::t2SpeculationBarrierSBEndBB: {
+ // Print SB
+ MCInst TmpInstSB;
+ TmpInstSB.setOpcode(ARM::t2SB);
+ EmitToStreamer(*OutStreamer, TmpInstSB);
+ return;
+ }
}
MCInst TmpInst;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 4cc2b6bf7e7e..112eb59e173d 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -19,6 +19,7 @@
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MVETailPredUtils.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
@@ -35,6 +36,8 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/MultiHazardRecognizer.h"
#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
@@ -131,12 +134,43 @@ ARMBaseInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
return TargetInstrInfo::CreateTargetHazardRecognizer(STI, DAG);
}
+// Called during:
+// - pre-RA scheduling
+// - post-RA scheduling when FeatureUseMISched is set
+ScheduleHazardRecognizer *ARMBaseInstrInfo::CreateTargetMIHazardRecognizer(
+ const InstrItineraryData *II, const ScheduleDAGMI *DAG) const {
+ MultiHazardRecognizer *MHR = new MultiHazardRecognizer();
+
+ // We would like to restrict this hazard recognizer to only
+ // post-RA scheduling; we can tell that we're post-RA because we don't
+ // track VRegLiveness.
+ // Cortex-M7: TRM indicates that there is a single ITCM bank and two DTCM
+ // banks banked on bit 2. Assume that TCMs are in use.
+ if (Subtarget.isCortexM7() && !DAG->hasVRegLiveness())
+ MHR->AddHazardRecognizer(
+ std::make_unique<ARMBankConflictHazardRecognizer>(DAG, 0x4, true));
+
+ // Not inserting ARMHazardRecognizerFPMLx because that would change
+ // legacy behavior
+
+ auto BHR = TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
+ MHR->AddHazardRecognizer(std::unique_ptr<ScheduleHazardRecognizer>(BHR));
+ return MHR;
+}
+
+// Called during post-RA scheduling when FeatureUseMISched is not set
ScheduleHazardRecognizer *ARMBaseInstrInfo::
CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
const ScheduleDAG *DAG) const {
+ MultiHazardRecognizer *MHR = new MultiHazardRecognizer();
+
if (Subtarget.isThumb2() || Subtarget.hasVFP2Base())
- return new ARMHazardRecognizer(II, DAG);
- return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG);
+ MHR->AddHazardRecognizer(std::make_unique<ARMHazardRecognizerFPMLx>());
+
+ auto BHR = TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG);
+ if (BHR)
+ MHR->AddHazardRecognizer(std::unique_ptr<ScheduleHazardRecognizer>(BHR));
+ return MHR;
}
MachineInstr *ARMBaseInstrInfo::convertToThreeAddress(
@@ -317,8 +351,8 @@ bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
TBB = nullptr;
FBB = nullptr;
- MachineBasicBlock::iterator I = MBB.end();
- if (I == MBB.begin())
+ MachineBasicBlock::instr_iterator I = MBB.instr_end();
+ if (I == MBB.instr_begin())
return false; // Empty blocks are easy.
--I;
@@ -330,9 +364,12 @@ bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
// out.
bool CantAnalyze = false;
- // Skip over DEBUG values and predicated nonterminators.
- while (I->isDebugInstr() || !I->isTerminator()) {
- if (I == MBB.begin())
+ // Skip over DEBUG values, predicated nonterminators and speculation
+ // barrier terminators.
+ while (I->isDebugInstr() || !I->isTerminator() ||
+ isSpeculationBarrierEndBBOpcode(I->getOpcode()) ||
+ I->getOpcode() == ARM::t2DoLoopStartTP) {
+ if (I == MBB.instr_begin())
return false;
--I;
}
@@ -356,7 +393,7 @@ bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
Cond.push_back(I->getOperand(2));
} else if (I->isReturn()) {
// Returns can't be analyzed, but we should run cleanup.
- CantAnalyze = !isPredicated(*I);
+ CantAnalyze = true;
} else {
// We encountered other unrecognized terminator. Bail out immediately.
return true;
@@ -377,18 +414,30 @@ bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
// unconditional branch.
if (AllowModify) {
MachineBasicBlock::iterator DI = std::next(I);
- while (DI != MBB.end()) {
+ while (DI != MBB.instr_end()) {
MachineInstr &InstToDelete = *DI;
++DI;
+ // Speculation barriers must not be deleted.
+ if (isSpeculationBarrierEndBBOpcode(InstToDelete.getOpcode()))
+ continue;
InstToDelete.eraseFromParent();
}
}
}
- if (CantAnalyze)
+ if (CantAnalyze) {
+ // We may not be able to analyze the block, but we could still have
+ // an unconditional branch as the last instruction in the block, which
+ // just branches to the layout successor. If this is the case, then just
+ // remove it if we're allowed to make modifications.
+ if (AllowModify && !isPredicated(MBB.back()) &&
+ isUncondBranchOpcode(MBB.back().getOpcode()) &&
+ TBB && MBB.isLayoutSuccessor(TBB))
+ removeBranch(MBB);
return true;
+ }
- if (I == MBB.begin())
+ if (I == MBB.instr_begin())
return false;
--I;
@@ -537,6 +586,18 @@ bool ARMBaseInstrInfo::PredicateInstruction(
MachineOperand &PMO = MI.getOperand(PIdx);
PMO.setImm(Pred[0].getImm());
MI.getOperand(PIdx+1).setReg(Pred[1].getReg());
+
+ // Thumb 1 arithmetic instructions do not set CPSR when executed inside an
+ // IT block. This affects how they are printed.
+ const MCInstrDesc &MCID = MI.getDesc();
+ if (MCID.TSFlags & ARMII::ThumbArithFlagSetting) {
+ assert(MCID.OpInfo[1].isOptionalDef() && "CPSR def isn't expected operand");
+ assert((MI.getOperand(1).isDead() ||
+ MI.getOperand(1).getReg() != ARM::CPSR) &&
+ "if conversion tried to stop defining used CPSR");
+ MI.getOperand(1).setReg(ARM::NoRegister);
+ }
+
return true;
}
return false;
@@ -568,13 +629,23 @@ bool ARMBaseInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
}
}
-bool ARMBaseInstrInfo::DefinesPredicate(
- MachineInstr &MI, std::vector<MachineOperand> &Pred) const {
+bool ARMBaseInstrInfo::ClobbersPredicate(MachineInstr &MI,
+ std::vector<MachineOperand> &Pred,
+ bool SkipDead) const {
bool Found = false;
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI.getOperand(i);
- if ((MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR)) ||
- (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR)) {
+ bool ClobbersCPSR = MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR);
+ bool IsCPSR = MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR;
+ if (ClobbersCPSR || IsCPSR) {
+
+ // Filter out T1 instructions that have a dead CPSR,
+ // allowing IT blocks to be generated containing T1 instructions
+ const MCInstrDesc &MCID = MI.getDesc();
+ if (MCID.TSFlags & ARMII::ThumbArithFlagSetting && MO.isDead() &&
+ SkipDead)
+ continue;
+
Pred.push_back(MO);
Found = true;
}
@@ -590,61 +661,6 @@ bool ARMBaseInstrInfo::isCPSRDefined(const MachineInstr &MI) {
return false;
}
-bool ARMBaseInstrInfo::isAddrMode3OpImm(const MachineInstr &MI,
- unsigned Op) const {
- const MachineOperand &Offset = MI.getOperand(Op + 1);
- return Offset.getReg() != 0;
-}
-
-// Load with negative register offset requires additional 1cyc and +I unit
-// for Cortex A57
-bool ARMBaseInstrInfo::isAddrMode3OpMinusReg(const MachineInstr &MI,
- unsigned Op) const {
- const MachineOperand &Offset = MI.getOperand(Op + 1);
- const MachineOperand &Opc = MI.getOperand(Op + 2);
- assert(Opc.isImm());
- assert(Offset.isReg());
- int64_t OpcImm = Opc.getImm();
-
- bool isSub = ARM_AM::getAM3Op(OpcImm) == ARM_AM::sub;
- return (isSub && Offset.getReg() != 0);
-}
-
-bool ARMBaseInstrInfo::isLdstScaledReg(const MachineInstr &MI,
- unsigned Op) const {
- const MachineOperand &Opc = MI.getOperand(Op + 2);
- unsigned OffImm = Opc.getImm();
- return ARM_AM::getAM2ShiftOpc(OffImm) != ARM_AM::no_shift;
-}
-
-// Load, scaled register offset, not plus LSL2
-bool ARMBaseInstrInfo::isLdstScaledRegNotPlusLsl2(const MachineInstr &MI,
- unsigned Op) const {
- const MachineOperand &Opc = MI.getOperand(Op + 2);
- unsigned OffImm = Opc.getImm();
-
- bool isAdd = ARM_AM::getAM2Op(OffImm) == ARM_AM::add;
- unsigned Amt = ARM_AM::getAM2Offset(OffImm);
- ARM_AM::ShiftOpc ShiftOpc = ARM_AM::getAM2ShiftOpc(OffImm);
- if (ShiftOpc == ARM_AM::no_shift) return false; // not scaled
- bool SimpleScaled = (isAdd && ShiftOpc == ARM_AM::lsl && Amt == 2);
- return !SimpleScaled;
-}
-
-// Minus reg for ldstso addr mode
-bool ARMBaseInstrInfo::isLdstSoMinusReg(const MachineInstr &MI,
- unsigned Op) const {
- unsigned OffImm = MI.getOperand(Op + 2).getImm();
- return ARM_AM::getAM2Op(OffImm) == ARM_AM::sub;
-}
-
-// Load, scaled register offset
-bool ARMBaseInstrInfo::isAm2ScaledReg(const MachineInstr &MI,
- unsigned Op) const {
- unsigned OffImm = MI.getOperand(Op + 2).getImm();
- return ARM_AM::getAM2ShiftOpc(OffImm) != ARM_AM::no_shift;
-}
-
static bool isEligibleForITBlock(const MachineInstr *MI) {
switch (MI->getOpcode()) {
default: return true;
@@ -687,14 +703,23 @@ bool ARMBaseInstrInfo::isPredicable(const MachineInstr &MI) const {
if (!isEligibleForITBlock(&MI))
return false;
+ const MachineFunction *MF = MI.getParent()->getParent();
const ARMFunctionInfo *AFI =
- MI.getParent()->getParent()->getInfo<ARMFunctionInfo>();
+ MF->getInfo<ARMFunctionInfo>();
// Neon instructions in Thumb2 IT blocks are deprecated, see ARMARM.
// In their ARM encoding, they can't be encoded in a conditional form.
if ((MI.getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON)
return false;
+ // Make indirect control flow changes unpredicable when SLS mitigation is
+ // enabled.
+ const ARMSubtarget &ST = MF->getSubtarget<ARMSubtarget>();
+ if (ST.hardenSlsRetBr() && isIndirectControlFlowNotComingBack(MI))
+ return false;
+ if (ST.hardenSlsBlr() && isIndirectCall(MI))
+ return false;
+
if (AFI->isThumb2Function()) {
if (getSubtarget().restrictIT())
return isV8EligibleForIT(&MI);
@@ -777,6 +802,14 @@ unsigned ARMBaseInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
Size = alignTo(Size, 4);
return Size;
}
+ case ARM::SpeculationBarrierISBDSBEndBB:
+ case ARM::t2SpeculationBarrierISBDSBEndBB:
+ // This gets lowered to 2 4-byte instructions.
+ return 8;
+ case ARM::SpeculationBarrierSBEndBB:
+ case ARM::t2SpeculationBarrierSBEndBB:
+ // This gets lowered to one 4-byte instruction.
+ return 4;
}
}
@@ -2142,7 +2175,12 @@ ARMBaseInstrInfo::extraSizeToPredicateInstructions(const MachineFunction &MF,
// Thumb2 needs a 2-byte IT instruction to predicate up to 4 instructions.
// ARM has a condition code field in every predicable instruction, using it
// doesn't change code size.
- return Subtarget.isThumb2() ? divideCeil(NumInsts, 4) * 2 : 0;
+ if (!Subtarget.isThumb2())
+ return 0;
+
+ // With restrictIT, each IT block may only predicate a single instruction.
+ unsigned MaxInsts = Subtarget.restrictIT() ? 1 : 4;
+ return divideCeil(NumInsts, MaxInsts) * 2;
}
unsigned
@@ -3379,7 +3417,7 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
case ARM::t2SUBspImm:
case ARM::t2ADDri:
case ARM::t2SUBri:
- MRI->setRegClass(UseMI.getOperand(0).getReg(), TRC);
+ MRI->constrainRegClass(UseMI.getOperand(0).getReg(), TRC);
}
return true;
}
@@ -3840,22 +3878,6 @@ ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData,
return DefCycle;
}
-bool ARMBaseInstrInfo::isLDMBaseRegInList(const MachineInstr &MI) const {
- Register BaseReg = MI.getOperand(0).getReg();
- for (unsigned i = 1, sz = MI.getNumOperands(); i < sz; ++i) {
- const auto &Op = MI.getOperand(i);
- if (Op.isReg() && Op.getReg() == BaseReg)
- return true;
- }
- return false;
-}
-unsigned
-ARMBaseInstrInfo::getLDMVariableDefsSize(const MachineInstr &MI) const {
- // ins GPR:$Rn, $p (2xOp), reglist:$regs, variable_ops
- // (outs GPR:$wb), (ins GPR:$Rn, $p (2xOp), reglist:$regs, variable_ops)
- return MI.getNumOperands() + 1 - MI.getDesc().getNumOperands();
-}
-
int
ARMBaseInstrInfo::getLDMDefCycle(const InstrItineraryData *ItinData,
const MCInstrDesc &DefMCID,
@@ -4816,6 +4838,14 @@ bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
}
+ if (MI.getOpcode() == ARM::MVE_VMOV_q_rr) {
+ assert(MI.getOperand(4).isImm() && MI.getOperand(5).isImm());
+ if ((MI.getOperand(4).getImm() != 2 && MI.getOperand(4).getImm() != 3) ||
+ MI.getOperand(4).getImm() != MI.getOperand(5).getImm() + 2) {
+ ErrInfo = "Incorrect array index for MVE_VMOV_q_rr";
+ return false;
+ }
+ }
return true;
}
@@ -5501,6 +5531,8 @@ unsigned llvm::ConstantMaterializationCost(unsigned Val,
return ForCodesize ? 4 : 1;
if (ARM_AM::isSOImmTwoPartVal(Val)) // two instrs
return ForCodesize ? 8 : 2;
+ if (ARM_AM::isSOImmTwoPartValNeg(Val)) // two instrs
+ return ForCodesize ? 8 : 2;
}
if (Subtarget->useMovt()) // MOVW + MOVT
return ForCodesize ? 8 : 2;
@@ -5605,12 +5637,32 @@ bool llvm::HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2,
/// | Frame overhead in Bytes | 2 | 4 |
/// | Stack fixup required | No | No |
/// +-------------------------+--------+-----+
+///
+/// \p MachineOutlinerDefault implies that the function should be called with
+/// a save and restore of LR to the stack.
+///
+/// That is,
+///
+/// I1 Save LR OUTLINED_FUNCTION:
+/// I2 --> BL OUTLINED_FUNCTION I1
+/// I3 Restore LR I2
+/// I3
+/// BX LR
+///
+/// +-------------------------+--------+-----+
+/// | | Thumb2 | ARM |
+/// +-------------------------+--------+-----+
+/// | Call overhead in Bytes | 8 | 12 |
+/// | Frame overhead in Bytes | 2 | 4 |
+/// | Stack fixup required | Yes | Yes |
+/// +-------------------------+--------+-----+
enum MachineOutlinerClass {
MachineOutlinerTailCall,
MachineOutlinerThunk,
MachineOutlinerNoLRSave,
- MachineOutlinerRegSave
+ MachineOutlinerRegSave,
+ MachineOutlinerDefault
};
enum MachineOutlinerMBBFlags {
@@ -5628,6 +5680,9 @@ struct OutlinerCosts {
const int FrameNoLRSave;
const int CallRegSave;
const int FrameRegSave;
+ const int CallDefault;
+ const int FrameDefault;
+ const int SaveRestoreLROnStack;
OutlinerCosts(const ARMSubtarget &target)
: CallTailCall(target.isThumb() ? 4 : 4),
@@ -5637,7 +5692,10 @@ struct OutlinerCosts {
CallNoLRSave(target.isThumb() ? 4 : 4),
FrameNoLRSave(target.isThumb() ? 4 : 4),
CallRegSave(target.isThumb() ? 8 : 12),
- FrameRegSave(target.isThumb() ? 2 : 4) {}
+ FrameRegSave(target.isThumb() ? 2 : 4),
+ CallDefault(target.isThumb() ? 8 : 12),
+ FrameDefault(target.isThumb() ? 2 : 4),
+ SaveRestoreLROnStack(target.isThumb() ? 8 : 8) {}
};
unsigned
@@ -5662,6 +5720,37 @@ ARMBaseInstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
return 0u;
}
+// Compute liveness of LR at the point after the interval [I, E), which
+// denotes a *backward* iteration through instructions. Used only for return
+// basic blocks, which do not end with a tail call.
+static bool isLRAvailable(const TargetRegisterInfo &TRI,
+ MachineBasicBlock::reverse_iterator I,
+ MachineBasicBlock::reverse_iterator E) {
+ // At the end of the function, LR is dead.
+ bool Live = false;
+ for (; I != E; ++I) {
+ const MachineInstr &MI = *I;
+
+ // Check defs of LR.
+ if (MI.modifiesRegister(ARM::LR, &TRI))
+ Live = false;
+
+ // Check uses of LR.
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode == ARM::BX_RET || Opcode == ARM::MOVPCLR ||
+ Opcode == ARM::SUBS_PC_LR || Opcode == ARM::tBX_RET ||
+ Opcode == ARM::tBXNS_RET) {
+ // These instructions use LR, but it's not an (explicit or implicit)
+ // operand.
+ Live = true;
+ continue;
+ }
+ if (MI.readsRegister(ARM::LR, &TRI))
+ Live = true;
+ }
+ return !Live;
+}
+
outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo(
std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
@@ -5707,10 +5796,7 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo(
// Erase every candidate that violates the restrictions above. (It could be
// true that we have viable candidates, so it's not worth bailing out in
// the case that, say, 1 out of 20 candidates violate the restructions.)
- RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
- RepeatedSequenceLocs.end(),
- CantGuaranteeValueAcrossCall),
- RepeatedSequenceLocs.end());
+ llvm::erase_if(RepeatedSequenceLocs, CantGuaranteeValueAcrossCall);
// If the sequence doesn't have enough candidates left, then we're done.
if (RepeatedSequenceLocs.size() < 2)
@@ -5730,8 +5816,8 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo(
};
OutlinerCosts Costs(Subtarget);
- unsigned FrameID = 0;
- unsigned NumBytesToCreateFrame = 0;
+ unsigned FrameID = MachineOutlinerDefault;
+ unsigned NumBytesToCreateFrame = Costs.FrameDefault;
// If the last instruction in any candidate is a terminator, then we should
// tail call all of the candidates.
@@ -5740,22 +5826,31 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo(
NumBytesToCreateFrame = Costs.FrameTailCall;
SetCandidateCallInfo(MachineOutlinerTailCall, Costs.CallTailCall);
} else if (LastInstrOpcode == ARM::BL || LastInstrOpcode == ARM::BLX ||
- LastInstrOpcode == ARM::tBL || LastInstrOpcode == ARM::tBLXr ||
+ LastInstrOpcode == ARM::BLX_noip || LastInstrOpcode == ARM::tBL ||
+ LastInstrOpcode == ARM::tBLXr ||
+ LastInstrOpcode == ARM::tBLXr_noip ||
LastInstrOpcode == ARM::tBLXi) {
FrameID = MachineOutlinerThunk;
NumBytesToCreateFrame = Costs.FrameThunk;
SetCandidateCallInfo(MachineOutlinerThunk, Costs.CallThunk);
} else {
// We need to decide how to emit calls + frames. We can always emit the same
- // frame if we don't need to save to the stack.
+ // frame if we don't need to save to the stack. If we have to save to the
+ // stack, then we need a different frame.
unsigned NumBytesNoStackCalls = 0;
std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
for (outliner::Candidate &C : RepeatedSequenceLocs) {
C.initLRU(TRI);
-
- // Is LR available? If so, we don't need a save.
- if (C.LRU.available(ARM::LR)) {
+ // LR liveness is overestimated in return blocks, unless they end with a
+ // tail call.
+ const auto Last = C.getMBB()->rbegin();
+ const bool LRIsAvailable =
+ C.getMBB()->isReturnBlock() && !Last->isCall()
+ ? isLRAvailable(TRI, Last,
+ (MachineBasicBlock::reverse_iterator)C.front())
+ : C.LRU.available(ARM::LR);
+ if (LRIsAvailable) {
FrameID = MachineOutlinerNoLRSave;
NumBytesNoStackCalls += Costs.CallNoLRSave;
C.setCallInfo(MachineOutlinerNoLRSave, Costs.CallNoLRSave);
@@ -5770,18 +5865,161 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo(
C.setCallInfo(MachineOutlinerRegSave, Costs.CallRegSave);
CandidatesWithoutStackFixups.push_back(C);
}
+
+ // Is SP used in the sequence at all? If not, we don't have to modify
+ // the stack, so we are guaranteed to get the same frame.
+ else if (C.UsedInSequence.available(ARM::SP)) {
+ NumBytesNoStackCalls += Costs.CallDefault;
+ C.setCallInfo(MachineOutlinerDefault, Costs.CallDefault);
+ CandidatesWithoutStackFixups.push_back(C);
+ }
+
+ // If we outline this, we need to modify the stack. Pretend we don't
+ // outline this by saving all of its bytes.
+ else
+ NumBytesNoStackCalls += SequenceSize;
}
- if (!CandidatesWithoutStackFixups.empty()) {
+ // If there are no places where we have to save LR, then note that we don't
+ // have to update the stack. Otherwise, give every candidate the default
+ // call type
+ if (NumBytesNoStackCalls <=
+ RepeatedSequenceLocs.size() * Costs.CallDefault) {
RepeatedSequenceLocs = CandidatesWithoutStackFixups;
+ FrameID = MachineOutlinerNoLRSave;
} else
- return outliner::OutlinedFunction();
+ SetCandidateCallInfo(MachineOutlinerDefault, Costs.CallDefault);
+ }
+
+ // Does every candidate's MBB contain a call? If so, then we might have a
+ // call in the range.
+ if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
+ // Check if the range contains a call. These require a save + restore of
+ // the link register.
+ if (std::any_of(FirstCand.front(), FirstCand.back(),
+ [](const MachineInstr &MI) { return MI.isCall(); }))
+ NumBytesToCreateFrame += Costs.SaveRestoreLROnStack;
+
+ // Handle the last instruction separately. If it is a tail call, then the
+ // last instruction is a call and we don't want to save + restore in this
+ // case. However, it could be possible that the last instruction is a
+ // call without it being valid to tail call this sequence. We should
+ // consider this as well.
+ else if (FrameID != MachineOutlinerThunk &&
+ FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall())
+ NumBytesToCreateFrame += Costs.SaveRestoreLROnStack;
}
return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
NumBytesToCreateFrame, FrameID);
}
+bool ARMBaseInstrInfo::checkAndUpdateStackOffset(MachineInstr *MI,
+ int64_t Fixup,
+ bool Updt) const {
+ int SPIdx = MI->findRegisterUseOperandIdx(ARM::SP);
+ unsigned AddrMode = (MI->getDesc().TSFlags & ARMII::AddrModeMask);
+ if (SPIdx < 0)
+ // No SP operand
+ return true;
+ else if (SPIdx != 1 && (AddrMode != ARMII::AddrModeT2_i8s4 || SPIdx != 2))
+ // If SP is not the base register we can't do much
+ return false;
+
+ // Stack might be involved but addressing mode doesn't handle any offset.
+ // Note: AddrModeT1_[1|2|4] don't operate on SP
+ if (AddrMode == ARMII::AddrMode1 // Arithmetic instructions
+ || AddrMode == ARMII::AddrMode4 // Load/Store Multiple
+ || AddrMode == ARMII::AddrMode6 // Neon Load/Store Multiple
+ || AddrMode == ARMII::AddrModeT2_so // SP can't be used as base register
+ || AddrMode == ARMII::AddrModeT2_pc // PCrel access
+ || AddrMode == ARMII::AddrMode2 // Used by PRE and POST indexed LD/ST
+ || AddrMode == ARMII::AddrModeNone)
+ return false;
+
+ unsigned NumOps = MI->getDesc().getNumOperands();
+ unsigned ImmIdx = NumOps - 3;
+
+ const MachineOperand &Offset = MI->getOperand(ImmIdx);
+ assert(Offset.isImm() && "Is not an immediate");
+ int64_t OffVal = Offset.getImm();
+
+ if (OffVal < 0)
+ // Don't override data if they are below SP.
+ return false;
+
+ unsigned NumBits = 0;
+ unsigned Scale = 1;
+
+ switch (AddrMode) {
+ case ARMII::AddrMode3:
+ if (ARM_AM::getAM3Op(OffVal) == ARM_AM::sub)
+ return false;
+ OffVal = ARM_AM::getAM3Offset(OffVal);
+ NumBits = 8;
+ break;
+ case ARMII::AddrMode5:
+ if (ARM_AM::getAM5Op(OffVal) == ARM_AM::sub)
+ return false;
+ OffVal = ARM_AM::getAM5Offset(OffVal);
+ NumBits = 8;
+ Scale = 4;
+ break;
+ case ARMII::AddrMode5FP16:
+ if (ARM_AM::getAM5FP16Op(OffVal) == ARM_AM::sub)
+ return false;
+ OffVal = ARM_AM::getAM5FP16Offset(OffVal);
+ NumBits = 8;
+ Scale = 2;
+ break;
+ case ARMII::AddrModeT2_i8:
+ NumBits = 8;
+ break;
+ case ARMII::AddrModeT2_i8s4:
+ case ARMII::AddrModeT2_ldrex:
+ NumBits = 8;
+ Scale = 4;
+ break;
+ case ARMII::AddrModeT2_i12:
+ case ARMII::AddrMode_i12:
+ NumBits = 12;
+ break;
+ case ARMII::AddrModeT2_i7:
+ NumBits = 7;
+ break;
+ case ARMII::AddrModeT2_i7s2:
+ NumBits = 7;
+ Scale = 2;
+ break;
+ case ARMII::AddrModeT2_i7s4:
+ NumBits = 7;
+ Scale = 4;
+ break;
+ case ARMII::AddrModeT1_s: // SP-relative LD/ST
+ NumBits = 8;
+ Scale = 4;
+ break;
+ default:
+ llvm_unreachable("Unsupported addressing mode!");
+ }
+ // Make sure the offset is encodable for instructions that scale the
+ // immediate.
+ if (((OffVal * Scale + Fixup) & (Scale - 1)) != 0)
+ return false;
+ OffVal += Fixup / Scale;
+
+ unsigned Mask = (1 << NumBits) - 1;
+
+ if (OffVal <= Mask) {
+ if (Updt)
+ MI->getOperand(ImmIdx).setImm(OffVal);
+ return true;
+ }
+
+ return false;
+
+}
+
bool ARMBaseInstrInfo::isFunctionSafeToOutlineFrom(
MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
const Function &F = MF.getFunction();
@@ -5841,7 +6079,13 @@ bool ARMBaseInstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
Flags |= MachineOutlinerMBBFlags::HasCalls;
- if (!LRU.available(ARM::LR))
+ // LR liveness is overestimated in return blocks.
+
+ bool LRIsAvailable =
+ MBB.isReturnBlock() && !MBB.back().isCall()
+ ? isLRAvailable(getRegisterInfo(), MBB.rbegin(), MBB.rend())
+ : LRU.available(ARM::LR);
+ if (!LRIsAvailable)
Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
return true;
@@ -5879,8 +6123,9 @@ ARMBaseInstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
// Be conservative with ARMv8.1 MVE instructions.
if (Opc == ARM::t2BF_LabelPseudo || Opc == ARM::t2DoLoopStart ||
- Opc == ARM::t2WhileLoopStart || Opc == ARM::t2LoopDec ||
- Opc == ARM::t2LoopEnd)
+ Opc == ARM::t2DoLoopStartTP || Opc == ARM::t2WhileLoopStart ||
+ Opc == ARM::t2LoopDec || Opc == ARM::t2LoopEnd ||
+ Opc == ARM::t2LoopEndDec)
return outliner::InstrType::Illegal;
const MCInstrDesc &MCID = MI.getDesc();
@@ -5914,16 +6159,56 @@ ARMBaseInstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
return outliner::InstrType::Illegal;
if (MI.isCall()) {
+ // Get the function associated with the call. Look at each operand and find
+ // the one that represents the callee and get its name.
+ const Function *Callee = nullptr;
+ for (const MachineOperand &MOP : MI.operands()) {
+ if (MOP.isGlobal()) {
+ Callee = dyn_cast<Function>(MOP.getGlobal());
+ break;
+ }
+ }
+
+ // Don't outline calls to "mcount"-like functions; in particular, Linux
+ // kernel function tracing relies on them.
+ if (Callee &&
+ (Callee->getName() == "\01__gnu_mcount_nc" ||
+ Callee->getName() == "\01mcount" || Callee->getName() == "__mcount"))
+ return outliner::InstrType::Illegal;
+
// If we don't know anything about the callee, assume it depends on the
// stack layout of the caller. In that case, it's only legal to outline
// as a tail-call. Explicitly list the call instructions we know about so
// we don't get unexpected results with call pseudo-instructions.
auto UnknownCallOutlineType = outliner::InstrType::Illegal;
if (Opc == ARM::BL || Opc == ARM::tBL || Opc == ARM::BLX ||
- Opc == ARM::tBLXr || Opc == ARM::tBLXi)
+ Opc == ARM::BLX_noip || Opc == ARM::tBLXr || Opc == ARM::tBLXr_noip ||
+ Opc == ARM::tBLXi)
UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
- return UnknownCallOutlineType;
+ if (!Callee)
+ return UnknownCallOutlineType;
+
+ // We have a function we have information about. Check if it's something we
+ // can safely outline.
+ MachineFunction *MF = MI.getParent()->getParent();
+ MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
+
+ // We don't know what's going on with the callee at all. Don't touch it.
+ if (!CalleeMF)
+ return UnknownCallOutlineType;
+
+ // Check if we know anything about the callee saves on the function. If we
+ // don't, then don't touch it, since that implies that we haven't computed
+ // anything about its stack frame yet.
+ MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
+ if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
+ MFI.getNumObjects() > 0)
+ return UnknownCallOutlineType;
+
+ // At this point, we can say that CalleeMF ought to not pass anything on the
+ // stack. Therefore, we can outline it.
+ return outliner::InstrType::Legal;
}
// Since calls are handled, don't touch LR or PC
@@ -5946,6 +6231,19 @@ ARMBaseInstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
if (!MightNeedStackFixUp)
return outliner::InstrType::Legal;
+ // Any modification of SP will break our code to save/restore LR.
+ // FIXME: We could handle some instructions which add a constant offset to
+ // SP, with a bit more work.
+ if (MI.modifiesRegister(ARM::SP, TRI))
+ return outliner::InstrType::Illegal;
+
+ // At this point, we have a stack instruction that we might need to fix
+ // up. We'll handle it if it's a load or store.
+ if (checkAndUpdateStackOffset(&MI, Subtarget.getStackAlignment().value(),
+ false))
+ return outliner::InstrType::Legal;
+
+ // We can't fix it up, so don't outline it.
return outliner::InstrType::Illegal;
}
@@ -5961,13 +6259,107 @@ ARMBaseInstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
return outliner::InstrType::Legal;
}
+void ARMBaseInstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
+ for (MachineInstr &MI : MBB) {
+ checkAndUpdateStackOffset(&MI, Subtarget.getStackAlignment().value(), true);
+ }
+}
+
+void ARMBaseInstrInfo::saveLROnStack(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator It) const {
+ unsigned Opc = Subtarget.isThumb() ? ARM::t2STR_PRE : ARM::STR_PRE_IMM;
+ int Align = -Subtarget.getStackAlignment().value();
+ BuildMI(MBB, It, DebugLoc(), get(Opc), ARM::SP)
+ .addReg(ARM::LR, RegState::Kill)
+ .addReg(ARM::SP)
+ .addImm(Align)
+ .add(predOps(ARMCC::AL));
+}
+
+void ARMBaseInstrInfo::emitCFIForLRSaveOnStack(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const {
+ MachineFunction &MF = *MBB.getParent();
+ const MCRegisterInfo *MRI = Subtarget.getRegisterInfo();
+ unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true);
+ int Align = Subtarget.getStackAlignment().value();
+ // Add a CFI saying the stack was moved down.
+ int64_t StackPosEntry =
+ MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, Align));
+ BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION))
+ .addCFIIndex(StackPosEntry)
+ .setMIFlags(MachineInstr::FrameSetup);
+
+ // Add a CFI saying that the LR that we want to find is now higher than
+ // before.
+ int64_t LRPosEntry =
+ MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfLR, -Align));
+ BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION))
+ .addCFIIndex(LRPosEntry)
+ .setMIFlags(MachineInstr::FrameSetup);
+}
+
+void ARMBaseInstrInfo::emitCFIForLRSaveToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator It,
+ Register Reg) const {
+ MachineFunction &MF = *MBB.getParent();
+ const MCRegisterInfo *MRI = Subtarget.getRegisterInfo();
+ unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true);
+ unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+
+ int64_t LRPosEntry = MF.addFrameInst(
+ MCCFIInstruction::createRegister(nullptr, DwarfLR, DwarfReg));
+ BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION))
+ .addCFIIndex(LRPosEntry)
+ .setMIFlags(MachineInstr::FrameSetup);
+}
+
+void ARMBaseInstrInfo::restoreLRFromStack(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const {
+ unsigned Opc = Subtarget.isThumb() ? ARM::t2LDR_POST : ARM::LDR_POST_IMM;
+ MachineInstrBuilder MIB = BuildMI(MBB, It, DebugLoc(), get(Opc), ARM::LR)
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP);
+ if (!Subtarget.isThumb())
+ MIB.addReg(0);
+ MIB.addImm(Subtarget.getStackAlignment().value()).add(predOps(ARMCC::AL));
+}
+
+void ARMBaseInstrInfo::emitCFIForLRRestoreFromStack(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const {
+ // Now stack has moved back up...
+ MachineFunction &MF = *MBB.getParent();
+ const MCRegisterInfo *MRI = Subtarget.getRegisterInfo();
+ unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true);
+ int64_t StackPosEntry =
+ MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 0));
+ BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION))
+ .addCFIIndex(StackPosEntry)
+ .setMIFlags(MachineInstr::FrameDestroy);
+
+ // ... and we have restored LR.
+ int64_t LRPosEntry =
+ MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, DwarfLR));
+ BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION))
+ .addCFIIndex(LRPosEntry)
+ .setMIFlags(MachineInstr::FrameDestroy);
+}
+
+void ARMBaseInstrInfo::emitCFIForLRRestoreFromReg(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const {
+ MachineFunction &MF = *MBB.getParent();
+ const MCRegisterInfo *MRI = Subtarget.getRegisterInfo();
+ unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true);
+
+ int64_t LRPosEntry =
+ MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, DwarfLR));
+ BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION))
+ .addCFIIndex(LRPosEntry)
+ .setMIFlags(MachineInstr::FrameDestroy);
+}
+
void ARMBaseInstrInfo::buildOutlinedFrame(
MachineBasicBlock &MBB, MachineFunction &MF,
const outliner::OutlinedFunction &OF) const {
- // Nothing is needed for tail-calls.
- if (OF.FrameConstructionID == MachineOutlinerTailCall)
- return;
-
// For thunk outlining, rewrite the last instruction from a call to a
// tail-call.
if (OF.FrameConstructionID == MachineOutlinerThunk) {
@@ -5984,13 +6376,59 @@ void ARMBaseInstrInfo::buildOutlinedFrame(
if (isThumb && !Call->getOperand(FuncOp).isReg())
MIB.add(predOps(ARMCC::AL));
Call->eraseFromParent();
- return;
}
+ // Is there a call in the outlined range?
+ auto IsNonTailCall = [](MachineInstr &MI) {
+ return MI.isCall() && !MI.isReturn();
+ };
+ if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
+ MachineBasicBlock::iterator It = MBB.begin();
+ MachineBasicBlock::iterator Et = MBB.end();
+
+ if (OF.FrameConstructionID == MachineOutlinerTailCall ||
+ OF.FrameConstructionID == MachineOutlinerThunk)
+ Et = std::prev(MBB.end());
+
+ // We have to save and restore LR, we need to add it to the liveins if it
+ // is not already part of the set. This is sufficient since outlined
+ // functions only have one block.
+ if (!MBB.isLiveIn(ARM::LR))
+ MBB.addLiveIn(ARM::LR);
+
+ // Insert a save before the outlined region
+ saveLROnStack(MBB, It);
+ emitCFIForLRSaveOnStack(MBB, It);
+
+ // Fix up the instructions in the range, since we're going to modify the
+ // stack.
+ assert(OF.FrameConstructionID != MachineOutlinerDefault &&
+ "Can only fix up stack references once");
+ fixupPostOutline(MBB);
+
+ // Insert a restore before the terminator for the function. Restore LR.
+ restoreLRFromStack(MBB, Et);
+ emitCFIForLRRestoreFromStack(MBB, Et);
+ }
+
+ // If this is a tail call outlined function, then there's already a return.
+ if (OF.FrameConstructionID == MachineOutlinerTailCall ||
+ OF.FrameConstructionID == MachineOutlinerThunk)
+ return;
+
// Here we have to insert the return ourselves. Get the correct opcode from
// current feature set.
BuildMI(MBB, MBB.end(), DebugLoc(), get(Subtarget.getReturnOpcode()))
.add(predOps(ARMCC::AL));
+
+ // Did we have to modify the stack by saving the link register?
+ if (OF.FrameConstructionID != MachineOutlinerDefault &&
+ OF.Candidates[0].CallConstructionID != MachineOutlinerDefault)
+ return;
+
+ // We modified the stack.
+ // Walk over the basic block and fix up all the stack accesses.
+ fixupPostOutline(MBB);
}
MachineBasicBlock::iterator ARMBaseInstrInfo::insertOutlinedCall(
@@ -6022,21 +6460,70 @@ MachineBasicBlock::iterator ARMBaseInstrInfo::insertOutlinedCall(
CallMIB.add(predOps(ARMCC::AL));
CallMIB.addGlobalAddress(M.getNamedValue(MF.getName()));
+ if (C.CallConstructionID == MachineOutlinerNoLRSave ||
+ C.CallConstructionID == MachineOutlinerThunk) {
+ // LR does not need to be saved here, so just insert the call.
+ It = MBB.insert(It, CallMIB);
+ return It;
+ }
+
+ const ARMFunctionInfo &AFI = *C.getMF()->getInfo<ARMFunctionInfo>();
// Can we save to a register?
if (C.CallConstructionID == MachineOutlinerRegSave) {
unsigned Reg = findRegisterToSaveLRTo(C);
assert(Reg != 0 && "No callee-saved register available?");
// Save and restore LR from that register.
- if (!MBB.isLiveIn(ARM::LR))
- MBB.addLiveIn(ARM::LR);
copyPhysReg(MBB, It, DebugLoc(), Reg, ARM::LR, true);
+ if (!AFI.isLRSpilled())
+ emitCFIForLRSaveToReg(MBB, It, Reg);
CallPt = MBB.insert(It, CallMIB);
copyPhysReg(MBB, It, DebugLoc(), ARM::LR, Reg, true);
+ if (!AFI.isLRSpilled())
+ emitCFIForLRRestoreFromReg(MBB, It);
It--;
return CallPt;
}
- // Insert the call.
- It = MBB.insert(It, CallMIB);
- return It;
+ // We have the default case. Save and restore from SP.
+ if (!MBB.isLiveIn(ARM::LR))
+ MBB.addLiveIn(ARM::LR);
+ saveLROnStack(MBB, It);
+ if (!AFI.isLRSpilled())
+ emitCFIForLRSaveOnStack(MBB, It);
+ CallPt = MBB.insert(It, CallMIB);
+ restoreLRFromStack(MBB, It);
+ if (!AFI.isLRSpilled())
+ emitCFIForLRRestoreFromStack(MBB, It);
+ It--;
+ return CallPt;
}
+
+bool ARMBaseInstrInfo::shouldOutlineFromFunctionByDefault(
+ MachineFunction &MF) const {
+ return Subtarget.isMClass() && MF.getFunction().hasMinSize();
+}
+
+bool ARMBaseInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
+ AAResults *AA) const {
+ // Try hard to rematerialize any VCTPs because if we spill P0, it will block
+ // the tail predication conversion. This means that the element count
+ // register has to be live for longer, but that has to be better than
+ // spill/restore and VPT predication.
+ return isVCTP(&MI) && !isPredicated(MI);
+}
+
+unsigned llvm::getBLXOpcode(const MachineFunction &MF) {
+ return (MF.getSubtarget<ARMSubtarget>().hardenSlsBlr()) ? ARM::BLX_noip
+ : ARM::BLX;
+}
+
+unsigned llvm::gettBLXrOpcode(const MachineFunction &MF) {
+ return (MF.getSubtarget<ARMSubtarget>().hardenSlsBlr()) ? ARM::tBLXr_noip
+ : ARM::tBLXr;
+}
+
+unsigned llvm::getBLXpredOpcode(const MachineFunction &MF) {
+ return (MF.getSubtarget<ARMSubtarget>().hardenSlsBlr()) ? ARM::BLX_pred_noip
+ : ARM::BLX_pred;
+}
+
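checkAndUpdateStackOffset above decides whether an SP-relative immediate can absorb the extra stack adjustment the outliner introduces when it spills LR. The core arithmetic restated as a standalone sketch, under the same assumption that OffVal has already been divided by the addressing-mode scale (illustrative, not a drop-in helper):

    #include <cstdint>

    // Returns true if an SP-relative immediate (OffVal, in units of Scale
    // bytes) can absorb a Fixup-byte stack adjustment while staying aligned
    // and still fitting in the NumBits-wide immediate field.
    static bool canAbsorbStackFixup(int64_t OffVal, int64_t Fixup,
                                    unsigned NumBits, unsigned Scale) {
      if (((OffVal * Scale + Fixup) & (Scale - 1)) != 0)
        return false;                        // fixup breaks the alignment
      int64_t NewVal = OffVal + Fixup / Scale;
      int64_t Mask = (int64_t(1) << NumBits) - 1;
      return NewVal <= Mask;                 // still encodable
    }

    // Example: a VFP load at [sp, #16] uses AddrMode5 (NumBits = 8,
    // Scale = 4, so OffVal = 4); an 8-byte LR spill turns it into
    // [sp, #24], and canAbsorbStackFixup(4, 8, 8, 4) returns true.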
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index 1a75b011ca59..1b843c428130 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -132,6 +132,10 @@ public:
const ScheduleDAG *DAG) const override;
ScheduleHazardRecognizer *
+ CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
+ const ScheduleDAGMI *DAG) const override;
+
+ ScheduleHazardRecognizer *
CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
const ScheduleDAG *DAG) const override;
@@ -171,28 +175,13 @@ public:
bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
ArrayRef<MachineOperand> Pred2) const override;
- bool DefinesPredicate(MachineInstr &MI,
- std::vector<MachineOperand> &Pred) const override;
+ bool ClobbersPredicate(MachineInstr &MI, std::vector<MachineOperand> &Pred,
+ bool SkipDead) const override;
bool isPredicable(const MachineInstr &MI) const override;
// CPSR defined in instruction
static bool isCPSRDefined(const MachineInstr &MI);
- bool isAddrMode3OpImm(const MachineInstr &MI, unsigned Op) const;
- bool isAddrMode3OpMinusReg(const MachineInstr &MI, unsigned Op) const;
-
- // Load, scaled register offset
- bool isLdstScaledReg(const MachineInstr &MI, unsigned Op) const;
- // Load, scaled register offset, not plus LSL2
- bool isLdstScaledRegNotPlusLsl2(const MachineInstr &MI, unsigned Op) const;
- // Minus reg for ldstso addr mode
- bool isLdstSoMinusReg(const MachineInstr &MI, unsigned Op) const;
- // Scaled register offset in address mode 2
- bool isAm2ScaledReg(const MachineInstr &MI, unsigned Op) const;
- // Load multiple, base reg in list
- bool isLDMBaseRegInList(const MachineInstr &MI) const;
- // get LDM variable defs size
- unsigned getLDMVariableDefsSize(const MachineInstr &MI) const;
/// GetInstSize - Returns the size of the specified MachineInstr.
///
@@ -372,11 +361,60 @@ public:
MachineBasicBlock::iterator &It, MachineFunction &MF,
const outliner::Candidate &C) const override;
+ /// Enable outlining by default at -Oz.
+ bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override;
+
+ bool isUnspillableTerminatorImpl(const MachineInstr *MI) const override {
+ return MI->getOpcode() == ARM::t2LoopEndDec ||
+ MI->getOpcode() == ARM::t2DoLoopStartTP;
+ }
+
private:
/// Returns an unused general-purpose register which can be used for
/// constructing an outlined call if one exists. Returns 0 otherwise.
unsigned findRegisterToSaveLRTo(const outliner::Candidate &C) const;
+ /// Adds an instruction which saves the link register on top of the stack into
+ /// the MachineBasicBlock \p MBB at position \p It.
+ void saveLROnStack(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator It) const;
+
+ /// Adds an instruction which restores the link register from the top of the
+ /// stack into the MachineBasicBlock \p MBB at position \p It.
+ void restoreLRFromStack(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator It) const;
+
+ /// Emit CFI instructions into the MachineBasicBlock \p MBB at position \p It,
+ /// for the case when the LR is saved on the stack.
+ void emitCFIForLRSaveOnStack(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator It) const;
+
+ /// Emit CFI instructions into the MachineBasicBlock \p MBB at position \p It,
+ /// for the case when the LR is saved in the register \p Reg.
+ void emitCFIForLRSaveToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator It,
+ Register Reg) const;
+
+ /// Emit CFI instructions into the MachineBasicBlock \p MBB at position \p It,
+ /// after the LR was restored from the stack.
+ void emitCFIForLRRestoreFromStack(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator It) const;
+
+ /// Emit CFI instructions into the MachineBasicBlock \p MBB at position \p It,
+ /// after the LR was restored from a register.
+ void emitCFIForLRRestoreFromReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator It) const;
+ /// \brief Sets the offsets on outlined instructions in \p MBB which use SP
+ /// so that they will be valid post-outlining.
+ ///
+ /// \param MBB A \p MachineBasicBlock in an outlined function.
+ void fixupPostOutline(MachineBasicBlock &MBB) const;
+
+ /// Returns true if the machine instruction offset can handle the stack fixup
+ /// and updates it if requested.
+ bool checkAndUpdateStackOffset(MachineInstr *MI, int64_t Fixup,
+ bool Updt) const;
+
unsigned getInstBundleLength(const MachineInstr &MI) const;
int getVLDMDefCycle(const InstrItineraryData *ItinData,
@@ -439,6 +477,9 @@ private:
MachineInstr *canFoldIntoMOVCC(Register Reg, const MachineRegisterInfo &MRI,
const TargetInstrInfo *TII) const;
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
+ AAResults *AA) const override;
+
private:
/// Modeling special VFP / NEON fp MLA / MLS hazards.
@@ -593,56 +634,6 @@ unsigned VCMPOpcodeToVPT(unsigned Opcode) {
}
static inline
-unsigned VCTPOpcodeToLSTP(unsigned Opcode, bool IsDoLoop) {
- switch (Opcode) {
- default:
- llvm_unreachable("unhandled vctp opcode");
- break;
- case ARM::MVE_VCTP8:
- return IsDoLoop ? ARM::MVE_DLSTP_8 : ARM::MVE_WLSTP_8;
- case ARM::MVE_VCTP16:
- return IsDoLoop ? ARM::MVE_DLSTP_16 : ARM::MVE_WLSTP_16;
- case ARM::MVE_VCTP32:
- return IsDoLoop ? ARM::MVE_DLSTP_32 : ARM::MVE_WLSTP_32;
- case ARM::MVE_VCTP64:
- return IsDoLoop ? ARM::MVE_DLSTP_64 : ARM::MVE_WLSTP_64;
- }
- return 0;
-}
-
-static inline unsigned getTailPredVectorWidth(unsigned Opcode) {
- switch (Opcode) {
- default:
- llvm_unreachable("unhandled vctp opcode");
- case ARM::MVE_VCTP8: return 16;
- case ARM::MVE_VCTP16: return 8;
- case ARM::MVE_VCTP32: return 4;
- case ARM::MVE_VCTP64: return 2;
- }
- return 0;
-}
-
-static inline
-bool isVCTP(MachineInstr *MI) {
- switch (MI->getOpcode()) {
- default:
- break;
- case ARM::MVE_VCTP8:
- case ARM::MVE_VCTP16:
- case ARM::MVE_VCTP32:
- case ARM::MVE_VCTP64:
- return true;
- }
- return false;
-}
-
-static inline
-bool isLoopStart(MachineInstr &MI) {
- return MI.getOpcode() == ARM::t2DoLoopStart ||
- MI.getOpcode() == ARM::t2WhileLoopStart;
-}
-
-static inline
bool isCondBranchOpcode(int Opc) {
return Opc == ARM::Bcc || Opc == ARM::tBcc || Opc == ARM::t2Bcc;
}
@@ -653,11 +644,77 @@ static inline bool isJumpTableBranchOpcode(int Opc) {
Opc == ARM::t2BR_JT;
}
+static inline bool isLowOverheadTerminatorOpcode(int Opc) {
+ return Opc == ARM::t2DoLoopStartTP || Opc == ARM::t2WhileLoopStart ||
+ Opc == ARM::t2LoopEnd || Opc == ARM::t2LoopEndDec;
+}
+
static inline
bool isIndirectBranchOpcode(int Opc) {
return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND;
}
+static inline bool isIndirectCall(const MachineInstr &MI) {
+ int Opc = MI.getOpcode();
+ switch (Opc) {
+ // indirect calls:
+ case ARM::BLX:
+ case ARM::BLX_noip:
+ case ARM::BLX_pred:
+ case ARM::BLX_pred_noip:
+ case ARM::BX_CALL:
+ case ARM::BMOVPCRX_CALL:
+ case ARM::TCRETURNri:
+ case ARM::TAILJMPr:
+ case ARM::TAILJMPr4:
+ case ARM::tBLXr:
+ case ARM::tBLXr_noip:
+ case ARM::tBLXNSr:
+ case ARM::tBLXNS_CALL:
+ case ARM::tBX_CALL:
+ case ARM::tTAILJMPr:
+ assert(MI.isCall(MachineInstr::IgnoreBundle));
+ return true;
+ // direct calls:
+ case ARM::BL:
+ case ARM::BL_pred:
+ case ARM::BMOVPCB_CALL:
+ case ARM::BL_PUSHLR:
+ case ARM::BLXi:
+ case ARM::TCRETURNdi:
+ case ARM::TAILJMPd:
+ case ARM::SVC:
+ case ARM::HVC:
+ case ARM::TPsoft:
+ case ARM::tTAILJMPd:
+ case ARM::t2SMC:
+ case ARM::t2HVC:
+ case ARM::tBL:
+ case ARM::tBLXi:
+ case ARM::tBL_PUSHLR:
+ case ARM::tTAILJMPdND:
+ case ARM::tSVC:
+ case ARM::tTPsoft:
+ assert(MI.isCall(MachineInstr::IgnoreBundle));
+ return false;
+ }
+ assert(!MI.isCall(MachineInstr::IgnoreBundle));
+ return false;
+}
+
+static inline bool isIndirectControlFlowNotComingBack(const MachineInstr &MI) {
+ int opc = MI.getOpcode();
+ return MI.isReturn() || isIndirectBranchOpcode(MI.getOpcode()) ||
+ isJumpTableBranchOpcode(opc);
+}
+
+static inline bool isSpeculationBarrierEndBBOpcode(int Opc) {
+ return Opc == ARM::SpeculationBarrierISBDSBEndBB ||
+ Opc == ARM::SpeculationBarrierSBEndBB ||
+ Opc == ARM::t2SpeculationBarrierISBDSBEndBB ||
+ Opc == ARM::t2SpeculationBarrierSBEndBB;
+}
+
static inline bool isPopOpcode(int Opc) {
return Opc == ARM::tPOP_RET || Opc == ARM::LDMIA_RET ||
Opc == ARM::t2LDMIA_RET || Opc == ARM::tPOP || Opc == ARM::LDMIA_UPD ||
@@ -829,13 +886,17 @@ inline bool isLegalAddressImm(unsigned Opcode, int Imm,
return std::abs(Imm) < (((1 << 7) * 2) - 1) && Imm % 2 == 0;
case ARMII::AddrModeT2_i7s4:
return std::abs(Imm) < (((1 << 7) * 4) - 1) && Imm % 4 == 0;
+ case ARMII::AddrModeT2_i8:
+ return std::abs(Imm) < (((1 << 8) * 1) - 1);
+ case ARMII::AddrModeT2_i12:
+ return Imm >= 0 && Imm < (((1 << 12) * 1) - 1);
default:
llvm_unreachable("Unhandled Addressing mode");
}
}
-// Return true if the given intrinsic is a gather or scatter
-inline bool isGatherScatter(IntrinsicInst *IntInst) {
+// Return true if the given intrinsic is a gather
+inline bool isGather(IntrinsicInst *IntInst) {
if (IntInst == nullptr)
return false;
unsigned IntrinsicID = IntInst->getIntrinsicID();
@@ -845,8 +906,15 @@ inline bool isGatherScatter(IntrinsicInst *IntInst) {
IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb ||
IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb_predicated ||
IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset ||
- IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset_predicated ||
- IntrinsicID == Intrinsic::masked_scatter ||
+ IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset_predicated);
+}
+
+// Return true if the given intrinsic is a scatter
+inline bool isScatter(IntrinsicInst *IntInst) {
+ if (IntInst == nullptr)
+ return false;
+ unsigned IntrinsicID = IntInst->getIntrinsicID();
+ return (IntrinsicID == Intrinsic::masked_scatter ||
IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base ||
IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_predicated ||
IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_wb ||
@@ -855,6 +923,17 @@ inline bool isGatherScatter(IntrinsicInst *IntInst) {
IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset_predicated);
}
+// Return true if the given intrinsic is a gather or scatter
+inline bool isGatherScatter(IntrinsicInst *IntInst) {
+ if (IntInst == nullptr)
+ return false;
+ return isGather(IntInst) || isScatter(IntInst);
+}
+
+unsigned getBLXOpcode(const MachineFunction &MF);
+unsigned gettBLXrOpcode(const MachineFunction &MF);
+unsigned getBLXpredOpcode(const MachineFunction &MF);
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
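
Only the declarations of these three opcode selectors appear in this header; their definitions are outside this hunk. Judging from the *_noip call opcodes introduced elsewhere in the patch, a plausible shape, offered as an assumption for illustration rather than the verbatim implementation, is:

  // Assumed sketch: select the *_noip variant of the register call when the
  // function is built with SLS hardening of indirect calls (the accessor name
  // hardenSlsBlr() is an assumption), otherwise the ordinary opcode.
  unsigned llvm::gettBLXrOpcode(const MachineFunction &MF) {
    return MF.getSubtarget<ARMSubtarget>().hardenSlsBlr() ? ARM::tBLXr_noip
                                                          : ARM::tBLXr;
  }
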
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 3579635f83b5..1a264dabeeb5 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -55,7 +55,9 @@
using namespace llvm;
ARMBaseRegisterInfo::ARMBaseRegisterInfo()
- : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC) {}
+ : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC) {
+ ARM_MC::initLLVMToCVRegMapping(this);
+}
static unsigned getFramePointerReg(const ARMSubtarget &STI) {
return STI.useR7AsFramePointer() ? ARM::R7 : ARM::R11;
@@ -328,9 +330,13 @@ bool ARMBaseRegisterInfo::getRegAllocationHints(
case ARMRI::RegPairOdd:
Odd = 1;
break;
- default:
+ case ARMRI::RegLR:
TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, VRM);
+ if (MRI.getRegClass(VirtReg)->contains(ARM::LR))
+ Hints.push_back(ARM::LR);
return false;
+ default:
+ return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, VRM);
}
// This register should preferably be even (Odd == 0) or odd (Odd == 1).
@@ -634,10 +640,10 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
/// materializeFrameBaseRegister - Insert defining instruction(s) for BaseReg to
/// be a pointer to FrameIdx at the beginning of the basic block.
-void ARMBaseRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
- Register BaseReg,
- int FrameIdx,
- int64_t Offset) const {
+Register
+ARMBaseRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
+ int FrameIdx,
+ int64_t Offset) const {
ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>();
unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri :
(AFI->isThumb1OnlyFunction() ? ARM::tADDframe : ARM::t2ADDri);
@@ -651,6 +657,7 @@ void ARMBaseRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
const MCInstrDesc &MCID = TII.get(ADDriOpc);
+ Register BaseReg = MRI.createVirtualRegister(&ARM::GPRRegClass);
MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF));
MachineInstrBuilder MIB = BuildMI(*MBB, Ins, DL, MCID, BaseReg)
@@ -658,6 +665,8 @@ void ARMBaseRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
if (!AFI->isThumb1OnlyFunction())
MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
+
+ return BaseReg;
}
void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
index 0a0907af2141..5afb6c6aa015 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -32,8 +32,11 @@ class LiveIntervals;
namespace ARMRI {
enum {
+ // Used for LDRD register pairs
RegPairOdd = 1,
- RegPairEven = 2
+ RegPairEven = 2,
+ // Used to hint for lr in t2DoLoopStart
+ RegLR = 3
};
} // end namespace ARMRI
@@ -165,9 +168,8 @@ public:
int64_t getFrameIndexInstrOffset(const MachineInstr *MI,
int Idx) const override;
bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
- void materializeFrameBaseRegister(MachineBasicBlock *MBB, Register BaseReg,
- int FrameIdx,
- int64_t Offset) const override;
+ Register materializeFrameBaseRegister(MachineBasicBlock *MBB, int FrameIdx,
+ int64_t Offset) const override;
void resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const override;
bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg,
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBlockPlacement.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
new file mode 100644
index 000000000000..581b4b9857af
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
@@ -0,0 +1,231 @@
+//===-- ARMBlockPlacement.cpp - ARM block placement pass ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass re-arranges machine basic blocks to suit target requirements.
+// Currently it only moves blocks to fix backwards WLS branches.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBasicBlockInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-block-placement"
+#define DEBUG_PREFIX "ARM Block Placement: "
+
+namespace llvm {
+class ARMBlockPlacement : public MachineFunctionPass {
+private:
+ const ARMBaseInstrInfo *TII;
+ std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
+ MachineLoopInfo *MLI = nullptr;
+
+public:
+ static char ID;
+ ARMBlockPlacement() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void moveBasicBlock(MachineBasicBlock *BB, MachineBasicBlock *After);
+ bool blockIsBefore(MachineBasicBlock *BB, MachineBasicBlock *Other);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // namespace llvm
+
+FunctionPass *llvm::createARMBlockPlacementPass() {
+ return new ARMBlockPlacement();
+}
+
+char ARMBlockPlacement::ID = 0;
+
+INITIALIZE_PASS(ARMBlockPlacement, DEBUG_TYPE, "ARM block placement", false,
+ false)
+
+bool ARMBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+ const ARMSubtarget &ST = static_cast<const ARMSubtarget &>(MF.getSubtarget());
+ if (!ST.hasLOB())
+ return false;
+ LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Running on " << MF.getName() << "\n");
+ MLI = &getAnalysis<MachineLoopInfo>();
+ TII = static_cast<const ARMBaseInstrInfo *>(ST.getInstrInfo());
+ BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(MF));
+ MF.RenumberBlocks();
+ BBUtils->computeAllBlockSizes();
+ BBUtils->adjustBBOffsetsAfter(&MF.front());
+ bool Changed = false;
+
+ // Find loops with a backwards branching WLS.
+ // This requires looping over the loops in the function, checking each
+ // preheader for a WLS and if its target is before the preheader. If moving
+ // the target block wouldn't produce another backwards WLS or a new forwards
+ // LE branch then move the target block after the preheader.
+ for (auto *ML : *MLI) {
+ MachineBasicBlock *Preheader = ML->getLoopPredecessor();
+ if (!Preheader)
+ continue;
+
+ for (auto &Terminator : Preheader->terminators()) {
+ if (Terminator.getOpcode() != ARM::t2WhileLoopStart)
+ continue;
+ MachineBasicBlock *LoopExit = Terminator.getOperand(1).getMBB();
+ // We don't want to move the function's entry block.
+ if (!LoopExit->getPrevNode())
+ continue;
+ if (blockIsBefore(Preheader, LoopExit))
+ continue;
+ LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Found a backwards WLS from "
+ << Preheader->getFullName() << " to "
+ << LoopExit->getFullName() << "\n");
+
+      // Make sure that moving the target block doesn't cause any of its own
+      // WLS branches that were previously forwards to become backwards.
+ bool CanMove = true;
+ for (auto &LoopExitTerminator : LoopExit->terminators()) {
+ if (LoopExitTerminator.getOpcode() != ARM::t2WhileLoopStart)
+ continue;
+ // An example loop structure where the LoopExit can't be moved, since
+        // bb1's WLS will become backwards once it's moved after bb3:
+        // bb1:       - LoopExit
+ // WLS bb2 - LoopExit2
+ // bb2:
+ // ...
+ // bb3: - Preheader
+ // WLS bb1
+ // bb4: - Header
+ MachineBasicBlock *LoopExit2 =
+ LoopExitTerminator.getOperand(1).getMBB();
+ // If the WLS from LoopExit to LoopExit2 is already backwards then
+ // moving LoopExit won't affect it, so it can be moved. If LoopExit2 is
+ // after the Preheader then moving will keep it as a forward branch, so
+ // it can be moved. If LoopExit2 is between the Preheader and LoopExit
+ // then moving LoopExit will make it a backwards branch, so it can't be
+ // moved since we'd fix one and introduce one backwards branch.
+ // TODO: Analyse the blocks to make a decision if it would be worth
+ // moving LoopExit even if LoopExit2 is between the Preheader and
+ // LoopExit.
+ if (!blockIsBefore(LoopExit2, LoopExit) &&
+ (LoopExit2 == Preheader || blockIsBefore(LoopExit2, Preheader))) {
+ LLVM_DEBUG(dbgs() << DEBUG_PREFIX
+ << "Can't move the target block as it would "
+ "introduce a new backwards WLS branch\n");
+ CanMove = false;
+ break;
+ }
+ }
+
+ if (CanMove) {
+ // Make sure no LEs become forwards.
+ // An example loop structure where the LoopExit can't be moved, since
+ // bb2's LE will become forwards once bb1 is moved after bb3.
+ // bb1: - LoopExit
+ // bb2:
+ // LE bb1 - Terminator
+ // bb3: - Preheader
+ // WLS bb1
+ // bb4: - Header
+ for (auto It = LoopExit->getIterator(); It != Preheader->getIterator();
+ It++) {
+ MachineBasicBlock *MBB = &*It;
+ for (auto &Terminator : MBB->terminators()) {
+ if (Terminator.getOpcode() != ARM::t2LoopEnd &&
+ Terminator.getOpcode() != ARM::t2LoopEndDec)
+ continue;
+ MachineBasicBlock *LETarget = Terminator.getOperand(2).getMBB();
+ // The LE will become forwards branching if it branches to LoopExit
+ // which isn't allowed by the architecture, so we should avoid
+ // introducing these.
+ // TODO: Analyse the blocks to make a decision if it would be worth
+ // moving LoopExit even if we'd introduce a forwards LE
+ if (LETarget == LoopExit) {
+ LLVM_DEBUG(dbgs() << DEBUG_PREFIX
+ << "Can't move the target block as it would "
+ "introduce a new forwards LE branch\n");
+ CanMove = false;
+ break;
+ }
+ }
+ }
+
+ if (!CanMove)
+ break;
+ }
+
+ if (CanMove) {
+ moveBasicBlock(LoopExit, Preheader);
+ Changed = true;
+ break;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+bool ARMBlockPlacement::blockIsBefore(MachineBasicBlock *BB,
+ MachineBasicBlock *Other) {
+ return BBUtils->getOffsetOf(Other) > BBUtils->getOffsetOf(BB);
+}
+
+void ARMBlockPlacement::moveBasicBlock(MachineBasicBlock *BB,
+ MachineBasicBlock *After) {
+ LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Moving " << BB->getName() << " after "
+ << After->getName() << "\n");
+ MachineBasicBlock *BBPrevious = BB->getPrevNode();
+ assert(BBPrevious && "Cannot move the function entry basic block");
+ MachineBasicBlock *AfterNext = After->getNextNode();
+ MachineBasicBlock *BBNext = BB->getNextNode();
+
+ BB->moveAfter(After);
+
+ auto FixFallthrough = [&](MachineBasicBlock *From, MachineBasicBlock *To) {
+ LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Checking for fallthrough from "
+ << From->getName() << " to " << To->getName() << "\n");
+ assert(From->isSuccessor(To) &&
+ "'To' is expected to be a successor of 'From'");
+ MachineInstr &Terminator = *(--From->terminators().end());
+ if (!Terminator.isUnconditionalBranch()) {
+ // The BB doesn't have an unconditional branch so it relied on
+ // fall-through. Fix by adding an unconditional branch to the moved BB.
+ unsigned BrOpc =
+ BBUtils->isBBInRange(&Terminator, To, 254) ? ARM::tB : ARM::t2B;
+ MachineInstrBuilder MIB =
+ BuildMI(From, Terminator.getDebugLoc(), TII->get(BrOpc));
+ MIB.addMBB(To);
+ MIB.addImm(ARMCC::CondCodes::AL);
+ MIB.addReg(ARM::NoRegister);
+ LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Adding unconditional branch from "
+ << From->getName() << " to " << To->getName() << ": "
+ << *MIB.getInstr());
+ }
+ };
+
+ // Fix fall-through to the moved BB from the one that used to be before it.
+ if (BBPrevious->isSuccessor(BB))
+ FixFallthrough(BBPrevious, BB);
+ // Fix fall through from the destination BB to the one that used to follow.
+ if (AfterNext && After->isSuccessor(AfterNext))
+ FixFallthrough(After, AfterNext);
+ // Fix fall through from the moved BB to the one that used to follow.
+ if (BBNext && BB->isSuccessor(BBNext))
+ FixFallthrough(BB, BBNext);
+
+ BBUtils->adjustBBOffsetsAfter(After);
+}
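
To make the rewrite concrete, a minimal before/after layout (block names are illustrative):

  Before (the WLS in bb.3 branches backwards):
    bb.0  (entry)
    bb.1  (LoopExit)
    bb.2
    bb.3  (Preheader)   WLS -> bb.1
    bb.4  (Header)

  After moveBasicBlock(bb.1, bb.3):
    bb.0  (entry)
    bb.2
    bb.3  (Preheader)   WLS -> bb.1   ; now a forward branch
    bb.1  (LoopExit)
    bb.4  (Header)

FixFallthrough then materializes explicit branches for any fall-through edges the move broke (bb.0 into bb.1 and bb.1 into bb.2 in this sketch), using tB when the target is within the 254-byte range and t2B otherwise.
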
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp
index d860473011e7..6feed82596cc 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp
@@ -85,12 +85,11 @@ namespace {
/// Helper class for values going out through an ABI boundary (used for handling
/// function return values and call parameters).
-struct OutgoingValueHandler : public CallLowering::ValueHandler {
- OutgoingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- MachineInstrBuilder &MIB, CCAssignFn *AssignFn)
- : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
-
- bool isIncomingArgumentHandler() const override { return false; }
+struct ARMOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
+ ARMOutgoingValueHandler(MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI, MachineInstrBuilder &MIB,
+ CCAssignFn *AssignFn)
+ : OutgoingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
@@ -258,13 +257,14 @@ bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder,
CCAssignFn *AssignFn =
TLI.CCAssignFnForReturn(F.getCallingConv(), F.isVarArg());
- OutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret, AssignFn);
+ ARMOutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret,
+ AssignFn);
return handleAssignments(MIRBuilder, SplitRetInfos, RetHandler);
}
bool ARMCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
- const Value *Val,
- ArrayRef<Register> VRegs) const {
+ const Value *Val, ArrayRef<Register> VRegs,
+ FunctionLoweringInfo &FLI) const {
assert(!Val == VRegs.empty() && "Return value without a vreg");
auto const &ST = MIRBuilder.getMF().getSubtarget<ARMSubtarget>();
@@ -282,12 +282,10 @@ namespace {
/// Helper class for values coming in through an ABI boundary (used for handling
/// formal arguments and call return values).
-struct IncomingValueHandler : public CallLowering::ValueHandler {
- IncomingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- CCAssignFn AssignFn)
- : ValueHandler(MIRBuilder, MRI, AssignFn) {}
-
- bool isIncomingArgumentHandler() const override { return true; }
+struct ARMIncomingValueHandler : public CallLowering::IncomingValueHandler {
+ ARMIncomingValueHandler(MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI, CCAssignFn AssignFn)
+ : IncomingValueHandler(MIRBuilder, MRI, AssignFn) {}
Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
@@ -337,8 +335,8 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
assert(VA.isRegLoc() && "Value shouldn't be assigned to reg");
assert(VA.getLocReg() == PhysReg && "Assigning to the wrong reg?");
- auto ValSize = VA.getValVT().getSizeInBits();
- auto LocSize = VA.getLocVT().getSizeInBits();
+ uint64_t ValSize = VA.getValVT().getFixedSizeInBits();
+ uint64_t LocSize = VA.getLocVT().getFixedSizeInBits();
assert(ValSize <= 64 && "Unsupported value size");
assert(LocSize <= 64 && "Unsupported location size");
@@ -399,10 +397,10 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
virtual void markPhysRegUsed(unsigned PhysReg) = 0;
};
-struct FormalArgHandler : public IncomingValueHandler {
+struct FormalArgHandler : public ARMIncomingValueHandler {
FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
CCAssignFn AssignFn)
- : IncomingValueHandler(MIRBuilder, MRI, AssignFn) {}
+ : ARMIncomingValueHandler(MIRBuilder, MRI, AssignFn) {}
void markPhysRegUsed(unsigned PhysReg) override {
MIRBuilder.getMRI()->addLiveIn(PhysReg);
@@ -412,9 +410,10 @@ struct FormalArgHandler : public IncomingValueHandler {
} // end anonymous namespace
-bool ARMCallLowering::lowerFormalArguments(
- MachineIRBuilder &MIRBuilder, const Function &F,
- ArrayRef<ArrayRef<Register>> VRegs) const {
+bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+ const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs,
+ FunctionLoweringInfo &FLI) const {
auto &TLI = *getTLI<ARMTargetLowering>();
auto Subtarget = TLI.getSubtarget();
@@ -435,7 +434,7 @@ bool ARMCallLowering::lowerFormalArguments(
for (auto &Arg : F.args()) {
if (!isSupportedType(DL, TLI, Arg.getType()))
return false;
- if (Arg.hasPassPointeeByValueAttr())
+ if (Arg.hasPassPointeeByValueCopyAttr())
return false;
}
@@ -469,10 +468,10 @@ bool ARMCallLowering::lowerFormalArguments(
namespace {
-struct CallReturnHandler : public IncomingValueHandler {
+struct CallReturnHandler : public ARMIncomingValueHandler {
CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
MachineInstrBuilder MIB, CCAssignFn *AssignFn)
- : IncomingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+ : ARMIncomingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
void markPhysRegUsed(unsigned PhysReg) override {
MIB.addDef(PhysReg, RegState::Implicit);
@@ -482,15 +481,16 @@ struct CallReturnHandler : public IncomingValueHandler {
};
// FIXME: This should move to the ARMSubtarget when it supports all the opcodes.
-unsigned getCallOpcode(const ARMSubtarget &STI, bool isDirect) {
+unsigned getCallOpcode(const MachineFunction &MF, const ARMSubtarget &STI,
+ bool isDirect) {
if (isDirect)
return STI.isThumb() ? ARM::tBL : ARM::BL;
if (STI.isThumb())
- return ARM::tBLXr;
+ return gettBLXrOpcode(MF);
if (STI.hasV5TOps())
- return ARM::BLX;
+ return getBLXOpcode(MF);
if (STI.hasV4TOps())
return ARM::BX_CALL;
@@ -518,7 +518,7 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &
// Create the call instruction so we can add the implicit uses of arg
// registers, but don't insert it yet.
bool IsDirect = !Info.Callee.isReg();
- auto CallOpcode = getCallOpcode(STI, IsDirect);
+ auto CallOpcode = getCallOpcode(MF, STI, IsDirect);
auto MIB = MIRBuilder.buildInstrNoInsert(CallOpcode);
bool IsThumb = STI.isThumb();
@@ -538,23 +538,19 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &
MIB.addRegMask(TRI->getCallPreservedMask(MF, Info.CallConv));
- bool IsVarArg = false;
SmallVector<ArgInfo, 8> ArgInfos;
for (auto Arg : Info.OrigArgs) {
if (!isSupportedType(DL, TLI, Arg.Ty))
return false;
- if (!Arg.IsFixed)
- IsVarArg = true;
-
if (Arg.Flags[0].isByVal())
return false;
splitToValueTypes(Arg, ArgInfos, MF);
}
- auto ArgAssignFn = TLI.CCAssignFnForCall(Info.CallConv, IsVarArg);
- OutgoingValueHandler ArgHandler(MIRBuilder, MRI, MIB, ArgAssignFn);
+ auto ArgAssignFn = TLI.CCAssignFnForCall(Info.CallConv, Info.IsVarArg);
+ ARMOutgoingValueHandler ArgHandler(MIRBuilder, MRI, MIB, ArgAssignFn);
if (!handleAssignments(MIRBuilder, ArgInfos, ArgHandler))
return false;
@@ -567,7 +563,7 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &
ArgInfos.clear();
splitToValueTypes(Info.OrigRet, ArgInfos, MF);
- auto RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv, IsVarArg);
+ auto RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv, Info.IsVarArg);
CallReturnHandler RetHandler(MIRBuilder, MRI, MIB, RetAssignFn);
if (!handleAssignments(MIRBuilder, ArgInfos, RetHandler))
return false;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.h
index ddbc9feb90e2..3be73d497d0b 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.h
@@ -33,10 +33,12 @@ public:
ARMCallLowering(const ARMTargetLowering &TLI);
bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
- ArrayRef<Register> VRegs) const override;
+ ArrayRef<Register> VRegs,
+ FunctionLoweringInfo &FLI) const override;
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
- ArrayRef<ArrayRef<Register>> VRegs) const override;
+ ArrayRef<ArrayRef<Register>> VRegs,
+ FunctionLoweringInfo &FLI) const override;
bool lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const override;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
index 195d0a89291b..630490f6f914 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -338,6 +338,32 @@ LLVM_DUMP_METHOD void ARMConstantIslands::dumpBBs() {
}
#endif
+// Align blocks where the previous block does not fall through. This may add
+// extra NOPs, but they will not be executed. It uses the PrefLoopAlignment as a
+// measure of how much to align, and only runs at CodeGenOpt::Aggressive.
+static bool AlignBlocks(MachineFunction *MF) {
+ if (MF->getTarget().getOptLevel() != CodeGenOpt::Aggressive ||
+ MF->getFunction().hasOptSize())
+ return false;
+
+ auto *TLI = MF->getSubtarget().getTargetLowering();
+ const Align Alignment = TLI->getPrefLoopAlignment();
+ if (Alignment < 4)
+ return false;
+
+ bool Changed = false;
+ bool PrevCanFallthough = true;
+ for (auto &MBB : *MF) {
+ if (!PrevCanFallthough) {
+ Changed = true;
+ MBB.setAlignment(Alignment);
+ }
+ PrevCanFallthough = MBB.canFallThrough();
+ }
+
+ return Changed;
+}
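
Because a block is only aligned when its predecessor cannot fall through, the inserted padding is never reached at run time. For example, with a preferred loop alignment of 16:

  bb.0:                  ; ends in an unconditional branch or return, so it cannot fall through
  bb.1:  (aligned to 16) ; any padding NOPs sit after bb.0's branch and are never executed
  bb.2:                  ; bb.1 falls through into bb.2, so bb.2 keeps its default alignment
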
+
bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
MF = &mf;
MCP = mf.getConstantPool();
@@ -359,6 +385,10 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
isThumb2 = AFI->isThumb2Function();
bool GenerateTBB = isThumb2 || (isThumb1 && SynthesizeThumb1TBB);
+ // TBB generation code in this constant island pass has not been adapted to
+ // deal with speculation barriers.
+ if (STI->hardenSlsRetBr())
+ GenerateTBB = false;
// Renumber all of the machine basic blocks in the function, guaranteeing that
// the numbers agree with the position of the block in the function.
@@ -376,6 +406,9 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
MF->RenumberBlocks();
}
+ // Align any non-fallthrough blocks
+ MadeChange |= AlignBlocks(MF);
+
// Perform the initial placement of the constant pool entries. To start with,
// we put them all at the end of the function.
std::vector<MachineInstr*> CPEMIs;
@@ -491,7 +524,11 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs)
// The function needs to be as aligned as the basic blocks. The linker may
// move functions around based on their alignment.
- MF->ensureAlignment(BB->getAlignment());
+ // Special case: halfword literals still need word alignment on the function.
+ Align FuncAlign = MaxAlign;
+ if (MaxAlign == 2)
+ FuncAlign = Align(4);
+ MF->ensureAlignment(FuncAlign);
// Order the entries in BB by descending alignment. That ensures correct
// alignment of all entries as long as BB is sufficiently aligned. Keep
@@ -506,7 +543,7 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs)
const DataLayout &TD = MF->getDataLayout();
for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
- unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
+ unsigned Size = CPs[i].getSizeInBytes(TD);
Align Alignment = CPs[i].getAlign();
// Verify that all constant pool entries are a multiple of their alignment.
// If not, we would have to pad them out so that instructions stay aligned.
@@ -549,6 +586,12 @@ void ARMConstantIslands::doInitialJumpTablePlacement(
MachineBasicBlock *LastCorrectlyNumberedBB = nullptr;
for (MachineBasicBlock &MBB : *MF) {
auto MI = MBB.getLastNonDebugInstr();
+ // Look past potential SpeculationBarriers at end of BB.
+ while (MI != MBB.end() &&
+ (isSpeculationBarrierEndBBOpcode(MI->getOpcode()) ||
+ MI->isDebugInstr()))
+ --MI;
+
if (MI == MBB.end())
continue;
@@ -771,15 +814,26 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
// Taking the address of a CP entry.
case ARM::LEApcrel:
- case ARM::LEApcrelJT:
- // This takes a SoImm, which is 8 bit immediate rotated. We'll
- // pretend the maximum offset is 255 * 4. Since each instruction
- // 4 byte wide, this is always correct. We'll check for other
- // displacements that fits in a SoImm as well.
- Bits = 8;
- Scale = 4;
- NegOk = true;
- IsSoImm = true;
+ case ARM::LEApcrelJT: {
+    // This takes a SoImm, which is an 8-bit immediate rotated. We'll
+    // pretend the maximum offset is 255 * 4. Since each instruction
+    // is 4 bytes wide, this is always correct. We'll check for other
+    // displacements that fit in a SoImm as well.
+ Bits = 8;
+ NegOk = true;
+ IsSoImm = true;
+ unsigned CPI = I.getOperand(op).getIndex();
+ assert(CPI < CPEMIs.size());
+ MachineInstr *CPEMI = CPEMIs[CPI];
+ const Align CPEAlign = getCPEAlign(CPEMI);
+ const unsigned LogCPEAlign = Log2(CPEAlign);
+ if (LogCPEAlign >= 2)
+ Scale = 4;
+ else
+ // For constants with less than 4-byte alignment,
+ // we'll pretend the maximum offset is 255 * 1.
+ Scale = 1;
+ }
break;
case ARM::t2LEApcrel:
case ARM::t2LEApcrelJT:
@@ -2070,8 +2124,7 @@ static bool jumpTableFollowsTB(MachineInstr *JTMI, MachineInstr *CPEMI) {
MachineFunction *MF = MBB->getParent();
++MBB;
- return MBB != MF->end() && MBB->begin() != MBB->end() &&
- &*MBB->begin() == CPEMI;
+ return MBB != MF->end() && !MBB->empty() && &*MBB->begin() == CPEMI;
}
static void RemoveDeadAddBetweenLEAAndJT(MachineInstr *LEAMI,
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 48622aae3cb4..a7f1765a9311 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -873,16 +873,27 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
// FIXME Windows CE supports older ARM CPUs
assert(!STI->isTargetWindows() && "Windows on ARM requires ARMv7+");
- // Expand into a movi + orr.
- LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi), DstReg);
- HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::ORRri))
- .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstReg);
-
assert (MO.isImm() && "MOVi32imm w/ non-immediate source operand!");
unsigned ImmVal = (unsigned)MO.getImm();
- unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal);
- unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal);
+ unsigned SOImmValV1 = 0, SOImmValV2 = 0;
+
+ if (ARM_AM::isSOImmTwoPartVal(ImmVal)) { // Expand into a movi + orr.
+ LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi), DstReg);
+ HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::ORRri))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg);
+ SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal);
+ SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal);
+ } else { // Expand into a mvn + sub.
+ LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MVNi), DstReg);
+ HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::SUBri))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg);
+ SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(-ImmVal);
+ SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(-ImmVal);
+ SOImmValV1 = ~(-SOImmValV1);
+ }
+
unsigned MIFlags = MI.getFlags();
LO16 = LO16.addImm(SOImmValV1);
HI16 = HI16.addImm(SOImmValV2);
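
The arithmetic behind the new mvn + sub path rests on the two's-complement identity ~(-x) = x - 1. With -ImmVal split into two SO-immediate parts V1 + V2, the emitted sequence is MVN DstReg, #(V1 - 1) followed by SUB DstReg, DstReg, #V2, and ~(V1 - 1) - V2 = -V1 - V2 = -(-ImmVal) = ImmVal. Worked example (split chosen for illustration): ImmVal = -10, so -ImmVal = 10 = 8 + 2; MVN #7 writes 0xFFFFFFF8 (that is, -8), and SUB #2 then yields -10.
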
@@ -1860,6 +1871,66 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
default:
return false;
+ case ARM::VBSPd:
+ case ARM::VBSPq: {
+ Register DstReg = MI.getOperand(0).getReg();
+ if (DstReg == MI.getOperand(3).getReg()) {
+ // Expand to VBIT
+ unsigned NewOpc = Opcode == ARM::VBSPd ? ARM::VBITd : ARM::VBITq;
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(1))
+ .addImm(MI.getOperand(4).getImm())
+ .add(MI.getOperand(5));
+ } else if (DstReg == MI.getOperand(2).getReg()) {
+ // Expand to VBIF
+ unsigned NewOpc = Opcode == ARM::VBSPd ? ARM::VBIFd : ARM::VBIFq;
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(1))
+ .addImm(MI.getOperand(4).getImm())
+ .add(MI.getOperand(5));
+ } else {
+ // Expand to VBSL
+ unsigned NewOpc = Opcode == ARM::VBSPd ? ARM::VBSLd : ARM::VBSLq;
+ if (DstReg == MI.getOperand(1).getReg()) {
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .addImm(MI.getOperand(4).getImm())
+ .add(MI.getOperand(5));
+ } else {
+ // Use move to satisfy constraints
+ unsigned MoveOpc = Opcode == ARM::VBSPd ? ARM::VORRd : ARM::VORRq;
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MoveOpc))
+ .addReg(DstReg,
+ RegState::Define |
+ getRenamableRegState(MI.getOperand(0).isRenamable()))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(1))
+ .addImm(MI.getOperand(4).getImm())
+ .add(MI.getOperand(5));
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc))
+ .add(MI.getOperand(0))
+ .addReg(DstReg,
+ RegState::Kill |
+ getRenamableRegState(MI.getOperand(0).isRenamable()))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .addImm(MI.getOperand(4).getImm())
+ .add(MI.getOperand(5));
+ }
+ }
+ MI.eraseFromParent();
+ return true;
+ }
+
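
To summarize the VBSP ("bitwise select, pseudo") expansion above, the variant is chosen by which source operand already occupies the destination register, so no extra copy is needed in the common cases:

  DstReg == operand 3  ->  expand to VBIT
  DstReg == operand 2  ->  expand to VBIF
  DstReg == operand 1  ->  expand to VBSL
  otherwise            ->  VORR-copy operand 1 into DstReg, then VBSL

The fallback copies operand 1 because VBSL consumes the select mask in its destination register, which implies that operand 1 of VBSP is the mask.
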
case ARM::TCRETURNdi:
case ARM::TCRETURNri: {
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
@@ -2233,8 +2304,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MIB.addImm(0);
MIB.add(predOps(ARMCC::AL));
- MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(Thumb ? ARM::tBLXr : ARM::BLX));
+ MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Thumb ? gettBLXrOpcode(*MF) : getBLXOpcode(*MF)));
if (Thumb)
MIB.add(predOps(ARMCC::AL));
MIB.addReg(Reg, RegState::Kill);
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp
index 4bfca8a803ca..da1d9af8d5b5 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp
@@ -606,7 +606,9 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
}
}
- if (IsIndirect) {
+ if ((Subtarget->isTargetELF() && Subtarget->isGVInGOT(GV)) ||
+ (Subtarget->isTargetMachO() && IsIndirect) ||
+ Subtarget->genLongCalls()) {
MachineInstrBuilder MIB;
unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
if (isThumb2)
@@ -2173,7 +2175,7 @@ bool ARMFastISel::SelectRet(const Instruction *I) {
unsigned ARMFastISel::ARMSelectCallOp(bool UseReg) {
if (UseReg)
- return isThumb2 ? ARM::tBLXr : ARM::BLX;
+ return isThumb2 ? gettBLXrOpcode(*MF) : getBLXOpcode(*MF);
else
return isThumb2 ? ARM::tBL : ARM::BL;
}
@@ -2264,9 +2266,11 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
// BL / BLX don't take a predicate, but tBL / tBLX do.
if (isThumb2)
MIB.add(predOps(ARMCC::AL));
- if (Subtarget->genLongCalls())
+ if (Subtarget->genLongCalls()) {
+ CalleeReg =
+ constrainOperandRegClass(TII.get(CallOpc), CalleeReg, isThumb2 ? 2 : 0);
MIB.addReg(CalleeReg);
- else
+ } else
MIB.addExternalSymbol(TLI.getLibcallName(Call));
// Add implicit physical register uses to the call.
@@ -2404,9 +2408,11 @@ bool ARMFastISel::SelectCall(const Instruction *I,
// ARM calls don't take a predicate, but tBL / tBLX do.
if(isThumb2)
MIB.add(predOps(ARMCC::AL));
- if (UseReg)
+ if (UseReg) {
+ CalleeReg =
+ constrainOperandRegClass(TII.get(CallOpc), CalleeReg, isThumb2 ? 2 : 0);
MIB.addReg(CalleeReg);
- else if (!IntrMemName)
+ } else if (!IntrMemName)
MIB.addGlobalAddress(GV, 0, 0);
else
MIB.addExternalSymbol(IntrMemName, 0);
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFeatures.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFeatures.h
index 5cd7006c22fc..99e0ef05b5e2 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFeatures.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFeatures.h
@@ -75,6 +75,7 @@ inline bool isV8EligibleForIT(const InstrType *Instr) {
// there are some "conditionally deprecated" opcodes
case ARM::tADDspr:
case ARM::tBLXr:
+ case ARM::tBLXr_noip:
return Instr->getOperand(2).getReg() != ARM::PC;
// ADD PC, SP and BLX PC were always unpredictable,
// now on top of it they're deprecated
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index 8a8f3237bb6f..9eeb7f20dc8d 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -883,9 +883,10 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
/// debug info. It's the same as what we use for resolving the code-gen
/// references for now. FIXME: This can go wrong when references are
/// SP-relative and simple call frames aren't used.
-int ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
- Register &FrameReg) const {
- return ResolveFrameIndexReference(MF, FI, FrameReg, 0);
+StackOffset ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+ int FI,
+ Register &FrameReg) const {
+ return StackOffset::getFixed(ResolveFrameIndexReference(MF, FI, FrameReg, 0));
}
int ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
@@ -2113,8 +2114,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
unsigned NumExtras = TargetAlign.value() / 4;
SmallVector<unsigned, 2> Extras;
while (NumExtras && !UnspilledCS1GPRs.empty()) {
- unsigned Reg = UnspilledCS1GPRs.back();
- UnspilledCS1GPRs.pop_back();
+ unsigned Reg = UnspilledCS1GPRs.pop_back_val();
if (!MRI.isReserved(Reg) &&
(!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg))) {
Extras.push_back(Reg);
@@ -2124,8 +2124,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
// For non-Thumb1 functions, also check for hi-reg CS registers
if (!AFI->isThumb1OnlyFunction()) {
while (NumExtras && !UnspilledCS2GPRs.empty()) {
- unsigned Reg = UnspilledCS2GPRs.back();
- UnspilledCS2GPRs.pop_back();
+ unsigned Reg = UnspilledCS2GPRs.pop_back_val();
if (!MRI.isReserved(Reg)) {
Extras.push_back(Reg);
NumExtras--;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h
index 4c2c07d64f57..9822e2321bb4 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h
@@ -10,6 +10,7 @@
#define LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H
#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/Support/TypeSize.h"
namespace llvm {
@@ -47,8 +48,8 @@ public:
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
- int getFrameIndexReference(const MachineFunction &MF, int FI,
- Register &FrameReg) const override;
+ StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const override;
int ResolveFrameIndexReference(const MachineFunction &MF, int FI,
Register &FrameReg, int SPAdj) const;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
index 0fa32a0abeff..f083fa6662e9 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -10,11 +10,19 @@
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMSubtarget.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+
using namespace llvm;
+static cl::opt<int> DataBankMask("arm-data-bank-mask", cl::init(-1),
+ cl::Hidden);
+static cl::opt<bool> AssumeITCMConflict("arm-assume-itcm-bankconflict",
+ cl::init(false), cl::Hidden);
+
static bool hasRAWHazard(MachineInstr *DefMI, MachineInstr *MI,
const TargetRegisterInfo &TRI) {
// FIXME: Detect integer instructions properly.
@@ -31,7 +39,7 @@ static bool hasRAWHazard(MachineInstr *DefMI, MachineInstr *MI,
}
ScheduleHazardRecognizer::HazardType
-ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+ARMHazardRecognizerFPMLx::getHazardType(SUnit *SU, int Stalls) {
assert(Stalls == 0 && "ARM hazards don't support scoreboard lookahead");
MachineInstr *MI = SU->getInstr();
@@ -68,33 +76,193 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
}
}
}
-
- return ScoreboardHazardRecognizer::getHazardType(SU, Stalls);
+ return NoHazard;
}
-void ARMHazardRecognizer::Reset() {
+void ARMHazardRecognizerFPMLx::Reset() {
LastMI = nullptr;
FpMLxStalls = 0;
- ScoreboardHazardRecognizer::Reset();
}
-void ARMHazardRecognizer::EmitInstruction(SUnit *SU) {
+void ARMHazardRecognizerFPMLx::EmitInstruction(SUnit *SU) {
MachineInstr *MI = SU->getInstr();
if (!MI->isDebugInstr()) {
LastMI = MI;
FpMLxStalls = 0;
}
-
- ScoreboardHazardRecognizer::EmitInstruction(SU);
}
-void ARMHazardRecognizer::AdvanceCycle() {
+void ARMHazardRecognizerFPMLx::AdvanceCycle() {
if (FpMLxStalls && --FpMLxStalls == 0)
// Stalled for 4 cycles but still can't schedule any other instructions.
LastMI = nullptr;
- ScoreboardHazardRecognizer::AdvanceCycle();
}
-void ARMHazardRecognizer::RecedeCycle() {
+void ARMHazardRecognizerFPMLx::RecedeCycle() {
llvm_unreachable("reverse ARM hazard checking unsupported");
}
+
+///////// Bank conflicts handled as hazards //////////////
+
+static bool getBaseOffset(const MachineInstr &MI, const MachineOperand *&BaseOp,
+ int64_t &Offset) {
+
+ uint64_t TSFlags = MI.getDesc().TSFlags;
+ unsigned AddrMode = (TSFlags & ARMII::AddrModeMask);
+ unsigned IndexMode =
+ (TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift;
+
+ // Address mode tells us what we want to know about operands for T2
+ // instructions (but not size). It tells us size (but not about operands)
+ // for T1 instructions.
+ switch (AddrMode) {
+ default:
+ return false;
+ case ARMII::AddrModeT2_i8:
+ // t2LDRBT, t2LDRB_POST, t2LDRB_PRE, t2LDRBi8,
+ // t2LDRHT, t2LDRH_POST, t2LDRH_PRE, t2LDRHi8,
+ // t2LDRSBT, t2LDRSB_POST, t2LDRSB_PRE, t2LDRSBi8,
+ // t2LDRSHT, t2LDRSH_POST, t2LDRSH_PRE, t2LDRSHi8,
+ // t2LDRT, t2LDR_POST, t2LDR_PRE, t2LDRi8
+ BaseOp = &MI.getOperand(1);
+ Offset = (IndexMode == ARMII::IndexModePost)
+ ? 0
+ : (IndexMode == ARMII::IndexModePre ||
+ IndexMode == ARMII::IndexModeUpd)
+ ? MI.getOperand(3).getImm()
+ : MI.getOperand(2).getImm();
+ return true;
+ case ARMII::AddrModeT2_i12:
+ // t2LDRBi12, t2LDRHi12
+ // t2LDRSBi12, t2LDRSHi12
+ // t2LDRi12
+ BaseOp = &MI.getOperand(1);
+ Offset = MI.getOperand(2).getImm();
+ return true;
+ case ARMII::AddrModeT2_i8s4:
+ // t2LDRD_POST, t2LDRD_PRE, t2LDRDi8
+ BaseOp = &MI.getOperand(2);
+ Offset = (IndexMode == ARMII::IndexModePost)
+ ? 0
+ : (IndexMode == ARMII::IndexModePre ||
+ IndexMode == ARMII::IndexModeUpd)
+ ? MI.getOperand(4).getImm()
+ : MI.getOperand(3).getImm();
+ return true;
+ case ARMII::AddrModeT1_1:
+    // tLDRBi, tLDRBr (watch out!), tLDRSB
+ case ARMII::AddrModeT1_2:
+    // tLDRHi, tLDRHr (watch out!), tLDRSH
+ case ARMII::AddrModeT1_4:
+ // tLDRi, tLDRr (watch out!)
+ BaseOp = &MI.getOperand(1);
+ Offset = MI.getOperand(2).isImm() ? MI.getOperand(2).getImm() : 0;
+ return MI.getOperand(2).isImm();
+ }
+ return false;
+}
+
+ARMBankConflictHazardRecognizer::ARMBankConflictHazardRecognizer(
+ const ScheduleDAG *DAG, int64_t CPUBankMask, bool CPUAssumeITCMConflict)
+ : ScheduleHazardRecognizer(), MF(DAG->MF), DL(DAG->MF.getDataLayout()),
+ DataMask(DataBankMask.getNumOccurrences() ? int64_t(DataBankMask)
+ : CPUBankMask),
+ AssumeITCMBankConflict(AssumeITCMConflict.getNumOccurrences()
+ ? AssumeITCMConflict
+ : CPUAssumeITCMConflict) {
+ MaxLookAhead = 1;
+}
+
+ScheduleHazardRecognizer::HazardType
+ARMBankConflictHazardRecognizer::CheckOffsets(unsigned O0, unsigned O1) {
+ return (((O0 ^ O1) & DataMask) != 0) ? NoHazard : Hazard;
+}
+
+ScheduleHazardRecognizer::HazardType
+ARMBankConflictHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+ MachineInstr &L0 = *SU->getInstr();
+ if (!L0.mayLoad() || L0.mayStore() || L0.getNumMemOperands() != 1)
+ return NoHazard;
+
+ auto MO0 = *L0.memoperands().begin();
+ auto BaseVal0 = MO0->getValue();
+ auto BasePseudoVal0 = MO0->getPseudoValue();
+ int64_t Offset0 = 0;
+
+ if (MO0->getSize() > 4)
+ return NoHazard;
+
+ bool SPvalid = false;
+ const MachineOperand *SP = nullptr;
+ int64_t SPOffset0 = 0;
+
+ for (auto L1 : Accesses) {
+ auto MO1 = *L1->memoperands().begin();
+ auto BaseVal1 = MO1->getValue();
+ auto BasePseudoVal1 = MO1->getPseudoValue();
+ int64_t Offset1 = 0;
+
+ // Pointers to the same object
+ if (BaseVal0 && BaseVal1) {
+ const Value *Ptr0, *Ptr1;
+ Ptr0 = GetPointerBaseWithConstantOffset(BaseVal0, Offset0, DL, true);
+ Ptr1 = GetPointerBaseWithConstantOffset(BaseVal1, Offset1, DL, true);
+ if (Ptr0 == Ptr1 && Ptr0)
+ return CheckOffsets(Offset0, Offset1);
+ }
+
+ if (BasePseudoVal0 && BasePseudoVal1 &&
+ BasePseudoVal0->kind() == BasePseudoVal1->kind() &&
+ BasePseudoVal0->kind() == PseudoSourceValue::FixedStack) {
+ // Spills/fills
+ auto FS0 = cast<FixedStackPseudoSourceValue>(BasePseudoVal0);
+ auto FS1 = cast<FixedStackPseudoSourceValue>(BasePseudoVal1);
+ Offset0 = MF.getFrameInfo().getObjectOffset(FS0->getFrameIndex());
+ Offset1 = MF.getFrameInfo().getObjectOffset(FS1->getFrameIndex());
+ return CheckOffsets(Offset0, Offset1);
+ }
+
+ // Constant pools (likely in ITCM)
+ if (BasePseudoVal0 && BasePseudoVal1 &&
+ BasePseudoVal0->kind() == BasePseudoVal1->kind() &&
+ BasePseudoVal0->isConstantPool() && AssumeITCMBankConflict)
+ return Hazard;
+
+ // Is this a stack pointer-relative access? We could in general try to
+ // use "is this the same register and is it unchanged?", but the
+ // memory operand tracking is highly likely to have already found that.
+ // What we're after here is bank conflicts between different objects in
+ // the stack frame.
+ if (!SPvalid) { // set up SP
+ if (!getBaseOffset(L0, SP, SPOffset0) || SP->getReg().id() != ARM::SP)
+ SP = nullptr;
+ SPvalid = true;
+ }
+ if (SP) {
+ int64_t SPOffset1;
+ const MachineOperand *SP1;
+ if (getBaseOffset(*L1, SP1, SPOffset1) && SP1->getReg().id() == ARM::SP)
+ return CheckOffsets(SPOffset0, SPOffset1);
+ }
+ }
+
+ return NoHazard;
+}
+
+void ARMBankConflictHazardRecognizer::Reset() { Accesses.clear(); }
+
+void ARMBankConflictHazardRecognizer::EmitInstruction(SUnit *SU) {
+ MachineInstr &MI = *SU->getInstr();
+ if (!MI.mayLoad() || MI.mayStore() || MI.getNumMemOperands() != 1)
+ return;
+
+ auto MO = *MI.memoperands().begin();
+ uint64_t Size1 = MO->getSize();
+ if (Size1 > 4)
+ return;
+ Accesses.push_back(&MI);
+}
+
+void ARMBankConflictHazardRecognizer::AdvanceCycle() { Accesses.clear(); }
+
+void ARMBankConflictHazardRecognizer::RecedeCycle() { Accesses.clear(); }
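
The conflict test itself is CheckOffsets above: two accesses are flagged only when their offsets agree in every bit selected by DataMask, i.e. they map to the same bank. For example, with a bank mask of 0x4 (bank chosen by bit 2), offsets 0x10 and 0x14 differ in bit 2, so (O0 ^ O1) & 0x4 != 0 and the result is NoHazard; offsets 0x10 and 0x18 agree in bit 2, and the recognizer reports a Hazard.
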
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMHazardRecognizer.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMHazardRecognizer.h
index ca02cc739e11..c1f1bcd0a629 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMHazardRecognizer.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMHazardRecognizer.h
@@ -13,27 +13,28 @@
#ifndef LLVM_LIB_TARGET_ARM_ARMHAZARDRECOGNIZER_H
#define LLVM_LIB_TARGET_ARM_ARMHAZARDRECOGNIZER_H
-#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+#include "ARMBaseInstrInfo.h"
+#include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/Support/DataTypes.h"
+#include <array>
+#include <initializer_list>
namespace llvm {
-class ARMBaseInstrInfo;
-class ARMBaseRegisterInfo;
-class ARMSubtarget;
+class DataLayout;
+class MachineFunction;
class MachineInstr;
+class ScheduleDAG;
-/// ARMHazardRecognizer handles special constraints that are not expressed in
-/// the scheduling itinerary. This is only used during postRA scheduling. The
-/// ARM preRA scheduler uses an unspecialized instance of the
-/// ScoreboardHazardRecognizer.
-class ARMHazardRecognizer : public ScoreboardHazardRecognizer {
+// Hazards related to FP MLx instructions
+class ARMHazardRecognizerFPMLx : public ScheduleHazardRecognizer {
MachineInstr *LastMI = nullptr;
unsigned FpMLxStalls = 0;
public:
- ARMHazardRecognizer(const InstrItineraryData *ItinData,
- const ScheduleDAG *DAG)
- : ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched") {}
+ ARMHazardRecognizerFPMLx() : ScheduleHazardRecognizer() { MaxLookAhead = 1; }
HazardType getHazardType(SUnit *SU, int Stalls) override;
void Reset() override;
@@ -42,6 +43,27 @@ public:
void RecedeCycle() override;
};
+// Hazards related to bank conflicts
+class ARMBankConflictHazardRecognizer : public ScheduleHazardRecognizer {
+ SmallVector<MachineInstr *, 8> Accesses;
+ const MachineFunction &MF;
+ const DataLayout &DL;
+ int64_t DataMask;
+ bool AssumeITCMBankConflict;
+
+public:
+ ARMBankConflictHazardRecognizer(const ScheduleDAG *DAG, int64_t DDM,
+ bool ABC);
+ HazardType getHazardType(SUnit *SU, int Stalls) override;
+ void Reset() override;
+ void EmitInstruction(SUnit *SU) override;
+ void AdvanceCycle() override;
+ void RecedeCycle() override;
+
+private:
+ inline HazardType CheckOffsets(unsigned O0, unsigned O1);
+};
+
} // end namespace llvm
#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 287e2e60e572..397979b4ab1e 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -143,7 +143,7 @@ static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
cl::desc("Maximum size of ALL constants to promote into a constant pool"),
cl::init(128));
-static cl::opt<unsigned>
+cl::opt<unsigned>
MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
cl::desc("Maximum interleave factor for MVE VLDn to generate."),
cl::init(2));
@@ -289,6 +289,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::UDIVREM, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::CTPOP, VT, Expand);
+ setOperationAction(ISD::SELECT, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
// Vector reductions
setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
@@ -335,6 +337,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MSTORE, VT, Legal);
+ setOperationAction(ISD::SELECT, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
// Pre and Post inc are supported on loads and stores
for (unsigned im = (unsigned)ISD::PRE_INC;
@@ -439,6 +443,9 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
setOperationAction(ISD::LOAD, VT, Custom);
setOperationAction(ISD::STORE, VT, Custom);
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+ setOperationAction(ISD::SELECT, VT, Expand);
}
}
@@ -987,6 +994,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SMAX);
setTargetDAGCombine(ISD::UMAX);
setTargetDAGCombine(ISD::FP_EXTEND);
+ setTargetDAGCombine(ISD::SELECT);
+ setTargetDAGCombine(ISD::SELECT_CC);
}
if (!Subtarget->hasFP64()) {
@@ -1716,8 +1725,11 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VCVTL: return "ARMISD::VCVTL";
case ARMISD::VMULLs: return "ARMISD::VMULLs";
case ARMISD::VMULLu: return "ARMISD::VMULLu";
+ case ARMISD::VQDMULH: return "ARMISD::VQDMULH";
case ARMISD::VADDVs: return "ARMISD::VADDVs";
case ARMISD::VADDVu: return "ARMISD::VADDVu";
+ case ARMISD::VADDVps: return "ARMISD::VADDVps";
+ case ARMISD::VADDVpu: return "ARMISD::VADDVpu";
case ARMISD::VADDLVs: return "ARMISD::VADDLVs";
case ARMISD::VADDLVu: return "ARMISD::VADDLVu";
case ARMISD::VADDLVAs: return "ARMISD::VADDLVAs";
@@ -1728,10 +1740,20 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VADDLVApu: return "ARMISD::VADDLVApu";
case ARMISD::VMLAVs: return "ARMISD::VMLAVs";
case ARMISD::VMLAVu: return "ARMISD::VMLAVu";
+ case ARMISD::VMLAVps: return "ARMISD::VMLAVps";
+ case ARMISD::VMLAVpu: return "ARMISD::VMLAVpu";
case ARMISD::VMLALVs: return "ARMISD::VMLALVs";
case ARMISD::VMLALVu: return "ARMISD::VMLALVu";
+ case ARMISD::VMLALVps: return "ARMISD::VMLALVps";
+ case ARMISD::VMLALVpu: return "ARMISD::VMLALVpu";
case ARMISD::VMLALVAs: return "ARMISD::VMLALVAs";
case ARMISD::VMLALVAu: return "ARMISD::VMLALVAu";
+ case ARMISD::VMLALVAps: return "ARMISD::VMLALVAps";
+ case ARMISD::VMLALVApu: return "ARMISD::VMLALVApu";
+ case ARMISD::VMINVu: return "ARMISD::VMINVu";
+ case ARMISD::VMINVs: return "ARMISD::VMINVs";
+ case ARMISD::VMAXVu: return "ARMISD::VMAXVu";
+ case ARMISD::VMAXVs: return "ARMISD::VMAXVs";
case ARMISD::UMAAL: return "ARMISD::UMAAL";
case ARMISD::UMLAL: return "ARMISD::UMLAL";
case ARMISD::SMLAL: return "ARMISD::SMLAL";
@@ -1755,7 +1777,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::BFI: return "ARMISD::BFI";
case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
case ARMISD::VBICIMM: return "ARMISD::VBICIMM";
- case ARMISD::VBSL: return "ARMISD::VBSL";
+ case ARMISD::VBSP: return "ARMISD::VBSP";
case ARMISD::MEMCPY: return "ARMISD::MEMCPY";
case ARMISD::VLD1DUP: return "ARMISD::VLD1DUP";
case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP";
@@ -2509,9 +2531,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
Callee = DAG.getLoad(
PtrVt, dl, DAG.getEntryNode(), Callee,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- /* Alignment = */ 0, MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(),
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
} else if (Subtarget->isTargetCOFF()) {
assert(Subtarget->isTargetWindows() &&
"Windows is the only supported COFF target");
@@ -3320,8 +3342,7 @@ ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
SDValue Chain = DAG.getEntryNode();
SDValue FuncTLVGet = DAG.getLoad(
MVT::i32, DL, Chain, DescAddr,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- /* Alignment = */ 4,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()), Align(4),
MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
Chain = FuncTLVGet.getValue(1);
@@ -3535,8 +3556,7 @@ static bool allUsersAreInFunction(const Value *V, const Function *F) {
while (!Worklist.empty()) {
auto *U = Worklist.pop_back_val();
if (isa<ConstantExpr>(U)) {
- for (auto *UU : U->users())
- Worklist.push_back(UU);
+ append_range(Worklist, U->users());
continue;
}
@@ -4423,13 +4443,26 @@ SDValue ARMTargetLowering::LowerFormalArguments(
}
// varargs
- if (isVarArg && MFI.hasVAStart())
- VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
- CCInfo.getNextStackOffset(),
+ if (isVarArg && MFI.hasVAStart()) {
+ VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset(),
TotalArgRegsSaveSize);
+ if (AFI->isCmseNSEntryFunction()) {
+ DiagnosticInfoUnsupported Diag(
+ DAG.getMachineFunction().getFunction(),
+ "secure entry function must not be variadic", dl.getDebugLoc());
+ DAG.getContext()->diagnose(Diag);
+ }
+ }
AFI->setArgumentStackSize(CCInfo.getNextStackOffset());
+ if (CCInfo.getNextStackOffset() > 0 && AFI->isCmseNSEntryFunction()) {
+ DiagnosticInfoUnsupported Diag(
+ DAG.getMachineFunction().getFunction(),
+ "secure entry function requires arguments on stack", dl.getDebugLoc());
+ DAG.getContext()->diagnose(Diag);
+ }
+
return Chain;
}
@@ -4990,16 +5023,6 @@ static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
}
-// Similar to isLowerSaturate(), but checks for upper-saturating conditions.
-static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
- const SDValue TrueVal, const SDValue FalseVal,
- const ISD::CondCode CC, const SDValue K) {
- return (isGTorGE(CC) &&
- ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) ||
- (isLTorLE(CC) &&
- ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal)));
-}
-
// Check if two chained conditionals could be converted into SSAT or USAT.
//
// SSAT can replace a set of two conditional selectors that bound a number to an
@@ -5011,101 +5034,68 @@ static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
// x < k ? (x < -k ? -k : x) : k
// etc.
//
-// USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1 is
-// a power of 2.
+// LLVM canonicalizes these to either a min(max()) or a max(min())
+// pattern. This function tries to match one of these and will return a SSAT
+// node if successful.
//
-// It returns true if the conversion can be done, false otherwise.
-// Additionally, the variable is returned in parameter V, the constant in K and
-// usat is set to true if the conditional represents an unsigned saturation
-static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
- uint64_t &K, bool &usat) {
- SDValue LHS1 = Op.getOperand(0);
- SDValue RHS1 = Op.getOperand(1);
+// USAT works similarly to SSAT, but bounds the value to the interval [0, k] where k + 1
+// is a power of 2.
+static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+ SDValue V1 = Op.getOperand(0);
+ SDValue K1 = Op.getOperand(1);
SDValue TrueVal1 = Op.getOperand(2);
SDValue FalseVal1 = Op.getOperand(3);
ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
if (Op2.getOpcode() != ISD::SELECT_CC)
- return false;
+ return SDValue();
- SDValue LHS2 = Op2.getOperand(0);
- SDValue RHS2 = Op2.getOperand(1);
+ SDValue V2 = Op2.getOperand(0);
+ SDValue K2 = Op2.getOperand(1);
SDValue TrueVal2 = Op2.getOperand(2);
SDValue FalseVal2 = Op2.getOperand(3);
ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
- // Find out which are the constants and which are the variables
- // in each conditional
- SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1)
- ? &RHS1
- : nullptr;
- SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2)
- ? &RHS2
- : nullptr;
- SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2;
- SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1;
- SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2;
- SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2;
-
- // We must detect cases where the original operations worked with 16- or
- // 8-bit values. In such case, V2Tmp != V2 because the comparison operations
- // must work with sign-extended values but the select operations return
- // the original non-extended value.
- SDValue V2TmpReg = V2Tmp;
- if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG)
- V2TmpReg = V2Tmp->getOperand(0);
-
- // Check that the registers and the constants have the correct values
- // in both conditionals
- if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp ||
- V2TmpReg != V2)
- return false;
+ SDValue V1Tmp = V1;
+ SDValue V2Tmp = V2;
- // Figure out which conditional is saturating the lower/upper bound.
- const SDValue *LowerCheckOp =
- isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
- ? &Op
- : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
- ? &Op2
- : nullptr;
- const SDValue *UpperCheckOp =
- isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
- ? &Op
- : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
- ? &Op2
- : nullptr;
-
- if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp)
- return false;
+ // Check that the registers and the constants match a max(min()) or min(max())
+ // pattern
+ if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
+ K2 != FalseVal2 ||
+ !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
+ return SDValue();
// Check that the constant in the lower-bound check is
// the opposite of the constant in the upper-bound check
// in 1's complement.
- int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue();
- int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue();
+ if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2))
+ return SDValue();
+
+ int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
+ int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
int64_t PosVal = std::max(Val1, Val2);
int64_t NegVal = std::min(Val1, Val2);
- if (((Val1 > Val2 && UpperCheckOp == &Op) ||
- (Val1 < Val2 && UpperCheckOp == &Op2)) &&
- isPowerOf2_64(PosVal + 1)) {
-
- // Handle the difference between USAT (unsigned) and SSAT (signed) saturation
- if (Val1 == ~Val2)
- usat = false;
- else if (NegVal == 0)
- usat = true;
- else
- return false;
-
- V = V2;
- K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive
+ if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
+ !isPowerOf2_64(PosVal + 1))
+ return SDValue();
- return true;
- }
+ // Handle the difference between USAT (unsigned) and SSAT (signed)
+ // saturation
+ // At this point, PosVal is guaranteed to be positive
+ uint64_t K = PosVal;
+ SDLoc dl(Op);
+ if (Val1 == ~Val2)
+ return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
+ DAG.getConstant(countTrailingOnes(K), dl, VT));
+ if (NegVal == 0)
+ return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
+ DAG.getConstant(countTrailingOnes(K), dl, VT));
- return false;
+ return SDValue();
}
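For reference, a minimal scalar sketch of the two clamping forms this lowering recognises, phrased in terms of the matched constant K (where K + 1 is a power of two) and the NegVal value used above; ssatRef/usatRef are hypothetical helpers, not part of the backend:

#include <algorithm>
#include <cstdint>

// Signed saturation (SSAT-style): the lower bound is the one's complement of
// the upper bound, so the value is clamped to [-(K + 1), K].
static int32_t ssatRef(int32_t x, int32_t K) {
  return std::min(std::max(x, -(K + 1)), K);
}

// Unsigned saturation (USAT-style): NegVal is 0, so the value is clamped to
// [0, K].
static int32_t usatRef(int32_t x, int32_t K) {
  return std::min(std::max(x, 0), K);
}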
// Check if a condition of the type x < k ? k : x can be converted into a
@@ -5165,18 +5155,9 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
// Try to convert two saturating conditional selects into a single SSAT
- SDValue SatValue;
- uint64_t SatConstant;
- bool SatUSat;
- if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) &&
- isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) {
- if (SatUSat)
- return DAG.getNode(ARMISD::USAT, dl, VT, SatValue,
- DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
- else
- return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
- DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
- }
+ if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
+ if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
+ return SatValue;
// Try to convert expressions of the form x < k ? k : x (and similar forms)
// into more efficient bit operations, which is possible when k is 0 or -1
@@ -5185,6 +5166,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
// instructions.
// Only allow this transformation on full-width (32-bit) operations
SDValue LowerSatConstant;
+ SDValue SatValue;
if (VT == MVT::i32 &&
isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
@@ -7768,17 +7750,19 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
for (auto &Src : Sources) {
EVT SrcVT = Src.ShuffleVec.getValueType();
- if (SrcVT.getSizeInBits() == VT.getSizeInBits())
+ uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
+ uint64_t VTSize = VT.getFixedSizeInBits();
+ if (SrcVTSize == VTSize)
continue;
// This stage of the search produces a source with the same element type as
// the original, but with a total width matching the BUILD_VECTOR output.
EVT EltVT = SrcVT.getVectorElementType();
- unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
+ unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
- if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
- if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits())
+ if (SrcVTSize < VTSize) {
+ if (2 * SrcVTSize != VTSize)
return SDValue();
// We can pad out the smaller vector for free, so if it's part of a
// shuffle...
@@ -7788,7 +7772,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
continue;
}
- if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits())
+ if (SrcVTSize != 2 * VTSize)
return SDValue();
if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
@@ -7856,7 +7840,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
// trunc. So only std::min(SrcBits, DestBits) actually get defined in this
// segment.
EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
- int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
+ int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
VT.getScalarSizeInBits());
int LanesDefined = BitsDefined / BitsPerShuffleLane;
@@ -8658,6 +8642,23 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
DAG.getConstant(ARMCC::NE, dl, MVT::i32));
}
+// Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0).
+static SDValue LowerTruncatei1(SDValue N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ assert(ST->hasMVEIntegerOps() && "Expected MVE!");
+ EVT VT = N.getValueType();
+ assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
+ "Expected a vector i1 type!");
+ SDValue Op = N.getOperand(0);
+ EVT FromVT = Op.getValueType();
+ SDLoc DL(N);
+
+ SDValue And =
+ DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
+ return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
+ DAG.getCondCode(ISD::SETNE));
+}
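A hedged scalar illustration of the equivalence used above (hypothetical helper, not MVE code): truncating to a single bit keeps only the LSB, which is the same as and-ing with 1 and comparing against zero.

#include <cstdint>

// trunc-to-i1 keeps only bit 0; the vector lowering above does the same
// lane-wise with ISD::AND followed by ISD::SETCC(SETNE).
static bool truncToI1(uint32_t x) { return (x & 1u) != 0u; }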
+
/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
/// element has been zero/sign-extended, depending on the isSigned parameter,
/// from an integer type half its size.
@@ -8722,10 +8723,11 @@ static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
return false;
}
-/// isZeroExtended - Check if a node is a vector value that is zero-extended
-/// or a constant BUILD_VECTOR with zero-extended elements.
+/// isZeroExtended - Check if a node is a vector value that is zero-extended (or
+/// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
- if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N))
+ if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
+ ISD::isZEXTLoad(N))
return true;
if (isExtendedBUILD_VECTOR(N, DAG, false))
return true;
@@ -8793,13 +8795,14 @@ static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
}
/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
-/// extending load, or BUILD_VECTOR with extended elements, return the
-/// unextended value. The unextended vector should be 64 bits so that it can
+/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
+/// the unextended value. The unextended vector should be 64 bits so that it can
/// be used as an operand to a VMULL instruction. If the original vector size
 /// before extension is less than 64 bits we add an extension to resize
/// the vector to 64 bits.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
- if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
+ if (N->getOpcode() == ISD::SIGN_EXTEND ||
+ N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
N->getOperand(0)->getValueType(0),
N->getValueType(0),
@@ -9767,6 +9770,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
+ case ISD::TRUNCATE: return LowerTruncatei1(Op, DAG, Subtarget);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
case ISD::MUL: return LowerMUL(Op, DAG);
case ISD::SDIV:
@@ -10399,8 +10403,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
// Remove the landing pad successor from the invoke block and replace it
// with the new dispatch block.
- SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(),
- BB->succ_end());
+ SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
while (!Successors.empty()) {
MachineBasicBlock *SMBB = Successors.pop_back_val();
if (SMBB->isEHPad()) {
@@ -10884,7 +10887,7 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
.addExternalSymbol("__chkstk");
- BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr))
+ BuildMI(*MBB, MI, DL, TII.get(gettBLXrOpcode(*MBB->getParent())))
.add(predOps(ARMCC::AL))
.addReg(Reg, RegState::Kill)
.addReg(ARM::R4, RegState::Implicit | RegState::Kill)
@@ -11263,6 +11266,14 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return EmitLowered__chkstk(MI, BB);
case ARM::WIN__DBZCHK:
return EmitLowered__dbzchk(MI, BB);
+ case ARM::t2DoLoopStart:
+ // We are just here to set a register allocation hint, preferring lr for the
+ // input register to make it more likely to be movable and removable, later
+ // in the pipeline.
+ Register R = MI.getOperand(1).getReg();
+ MachineFunction *MF = MI.getParent()->getParent();
+ MF->getRegInfo().setRegAllocationHint(R, ARMRI::RegLR, 0);
+ return BB;
}
}
@@ -12104,9 +12115,198 @@ static SDValue PerformAddeSubeCombine(SDNode *N,
return SDValue();
}
+static SDValue PerformSELECTCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ if (!Subtarget->hasMVEIntegerOps())
+ return SDValue();
+
+ SDLoc dl(N);
+ SDValue SetCC;
+ SDValue LHS;
+ SDValue RHS;
+ ISD::CondCode CC;
+ SDValue TrueVal;
+ SDValue FalseVal;
+
+ if (N->getOpcode() == ISD::SELECT &&
+ N->getOperand(0)->getOpcode() == ISD::SETCC) {
+ SetCC = N->getOperand(0);
+ LHS = SetCC->getOperand(0);
+ RHS = SetCC->getOperand(1);
+ CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
+ TrueVal = N->getOperand(1);
+ FalseVal = N->getOperand(2);
+ } else if (N->getOpcode() == ISD::SELECT_CC) {
+ LHS = N->getOperand(0);
+ RHS = N->getOperand(1);
+ CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
+ TrueVal = N->getOperand(2);
+ FalseVal = N->getOperand(3);
+ } else {
+ return SDValue();
+ }
+
+ unsigned int Opcode = 0;
+ if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
+ FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
+ (CC == ISD::SETULT || CC == ISD::SETUGT)) {
+ Opcode = ARMISD::VMINVu;
+ if (CC == ISD::SETUGT)
+ std::swap(TrueVal, FalseVal);
+ } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
+ FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
+ (CC == ISD::SETLT || CC == ISD::SETGT)) {
+ Opcode = ARMISD::VMINVs;
+ if (CC == ISD::SETGT)
+ std::swap(TrueVal, FalseVal);
+ } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
+ FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
+ (CC == ISD::SETUGT || CC == ISD::SETULT)) {
+ Opcode = ARMISD::VMAXVu;
+ if (CC == ISD::SETULT)
+ std::swap(TrueVal, FalseVal);
+ } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
+ FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
+ (CC == ISD::SETGT || CC == ISD::SETLT)) {
+ Opcode = ARMISD::VMAXVs;
+ if (CC == ISD::SETLT)
+ std::swap(TrueVal, FalseVal);
+ } else
+ return SDValue();
+
+ // Normalise to the right hand side being the vector reduction
+ switch (TrueVal->getOpcode()) {
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_SMAX:
+ std::swap(LHS, RHS);
+ std::swap(TrueVal, FalseVal);
+ break;
+ }
+
+ EVT VectorType = FalseVal->getOperand(0).getValueType();
+
+ if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
+ VectorType != MVT::v4i32)
+ return SDValue();
+
+ EVT VectorScalarType = VectorType.getVectorElementType();
+
+ // The values being selected must also be the ones being compared
+ if (TrueVal != LHS || FalseVal != RHS)
+ return SDValue();
+
+ EVT LeftType = LHS->getValueType(0);
+ EVT RightType = RHS->getValueType(0);
+
+ // The types must match the reduced type too
+ if (LeftType != VectorScalarType || RightType != VectorScalarType)
+ return SDValue();
+
+ // Legalise the scalar to an i32
+ if (VectorScalarType != MVT::i32)
+ LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
+
+ // Generate the reduction as an i32 for legalisation purposes
+ auto Reduction =
+ DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
+
+ // The result isn't actually an i32 so truncate it back to its original type
+ if (VectorScalarType != MVT::i32)
+ Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
+
+ return Reduction;
+}
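A rough scalar model of what the combine above produces, assuming unsigned 32-bit lanes and a non-empty vector (vminvRef is a hypothetical helper): selecting between a scalar and a vector min reduction based on their comparison is itself a min reduction that includes the scalar, which is what VMINV computes (and likewise for VMAXV and the signed forms).

#include <algorithm>
#include <cstdint>
#include <vector>

// x < umin(v) ? x : umin(v) is the minimum of x and every element of v,
// i.e. what ARMISD::VMINVu(x, v) computes.
static uint32_t vminvRef(uint32_t x, const std::vector<uint32_t> &v) {
  uint32_t m = x;
  for (uint32_t e : v)
    m = std::min(m, e);
  return m;
}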
+
+// A special combine for the vqdmulh family of instructions. This is one of the
+// potential set of patterns that could match this instruction. The base pattern
+// you would expect is min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
+// This matches the slightly different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
+// which LLVM will have optimized to min(ashr(mul(sext(x), sext(y)), 15)) as
+// the max is unnecessary.
+static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ SDValue Shft;
+ ConstantSDNode *Clamp;
+
+ if (N->getOpcode() == ISD::SMIN) {
+ Shft = N->getOperand(0);
+ Clamp = isConstOrConstSplat(N->getOperand(1));
+ } else if (N->getOpcode() == ISD::VSELECT) {
+ // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
+ SDValue Cmp = N->getOperand(0);
+ if (Cmp.getOpcode() != ISD::SETCC ||
+ cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
+ Cmp.getOperand(0) != N->getOperand(1) ||
+ Cmp.getOperand(1) != N->getOperand(2))
+ return SDValue();
+ Shft = N->getOperand(1);
+ Clamp = isConstOrConstSplat(N->getOperand(2));
+ } else
+ return SDValue();
+
+ if (!Clamp)
+ return SDValue();
+
+ MVT ScalarType;
+ int ShftAmt = 0;
+ switch (Clamp->getSExtValue()) {
+ case (1 << 7) - 1:
+ ScalarType = MVT::i8;
+ ShftAmt = 7;
+ break;
+ case (1 << 15) - 1:
+ ScalarType = MVT::i16;
+ ShftAmt = 15;
+ break;
+ case (1ULL << 31) - 1:
+ ScalarType = MVT::i32;
+ ShftAmt = 31;
+ break;
+ default:
+ return SDValue();
+ }
+
+ if (Shft.getOpcode() != ISD::SRA)
+ return SDValue();
+ ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1));
+ if (!N1 || N1->getSExtValue() != ShftAmt)
+ return SDValue();
+
+ SDValue Mul = Shft.getOperand(0);
+ if (Mul.getOpcode() != ISD::MUL)
+ return SDValue();
+
+ SDValue Ext0 = Mul.getOperand(0);
+ SDValue Ext1 = Mul.getOperand(1);
+ if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
+ Ext1.getOpcode() != ISD::SIGN_EXTEND)
+ return SDValue();
+ EVT VecVT = Ext0.getOperand(0).getValueType();
+ if (VecVT != MVT::v4i32 && VecVT != MVT::v8i16 && VecVT != MVT::v16i8)
+ return SDValue();
+ if (Ext1.getOperand(0).getValueType() != VecVT ||
+ VecVT.getScalarType() != ScalarType ||
+ VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
+ return SDValue();
+
+ SDLoc DL(Mul);
+ SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, VecVT, Ext0.getOperand(0),
+ Ext1.getOperand(0));
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, VQDMULH);
+}
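A scalar reference for the i16 case of the pattern matched above (a sketch, not the instruction's formal definition): the clamped arithmetic shift of the widened product is the "doubling multiply, return the high half, saturate" operation that VQDMULH performs.

#include <algorithm>
#include <cstdint>

static int16_t vqdmulhRef16(int16_t a, int16_t b) {
  int32_t prod = int32_t(a) * int32_t(b);            // sext + mul
  int32_t shifted = prod >> 15;                      // ashr by ShftAmt (15 for i16)
  return int16_t(std::min<int32_t>(shifted, 32767)); // clamp to the i16 maximum
}

The smax half of the clamp is not needed because the product of two sign-extended i16 values shifted right by 15 can never drop below -32768, which is why only the smin survives in the DAG being matched.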
+
static SDValue PerformVSELECTCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
+ if (!Subtarget->hasMVEIntegerOps())
+ return SDValue();
+
+ if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
+ return V;
+
// Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
//
// We need to re-implement this optimization here as the implementation in the
@@ -12116,9 +12316,6 @@ static SDValue PerformVSELECTCombine(SDNode *N,
//
// Currently, this is only done for MVE, as it's the only target that benefits
// from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
- if (!Subtarget->hasMVEIntegerOps())
- return SDValue();
-
if (N->getOperand(0).getOpcode() != ISD::XOR)
return SDValue();
SDValue XOR = N->getOperand(0);
@@ -12259,6 +12456,14 @@ static SDValue PerformADDVecReduce(SDNode *N,
return M;
if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
+ return M;
return SDValue();
}
@@ -13153,7 +13358,7 @@ static SDValue PerformORCombine(SDNode *N,
// Canonicalize the vector type to make instruction selection
// simpler.
EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
- SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
+ SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
N0->getOperand(1),
N0->getOperand(0),
N1->getOperand(0));
@@ -13464,6 +13669,12 @@ static SDValue PerformVMOVrhCombine(SDNode *N,
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
+ // fold (VMOVrh (fpconst x)) -> const x
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) {
+ APFloat V = C->getValueAPF();
+ return DCI.DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
+ }
+
// fold (VMOVrh (load x)) -> (zextload (i16*)x)
if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
@@ -13638,6 +13849,23 @@ PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
}
+ // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
+ // more VPNOT which might get folded as else predicates.
+ if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
+ SDValue X =
+ DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
+ SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
+ DCI.DAG.getConstant(65535, dl, MVT::i32));
+ return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);
+ }
+
+ // Only the bottom 16 bits of the source register are used.
+ if (Op.getValueType() == MVT::i32) {
+ APInt DemandedMask = APInt::getLowBitsSet(32, 16);
+ const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
+ return SDValue(N, 0);
+ }
return SDValue();
}
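A small sketch of the demanded-bits reasoning behind the two folds added above, assuming (as the constant 65535 suggests) that an MVE predicate holds 16 bits, one per byte lane: only the low half of the i32 feeding a PREDICATE_CAST is observable, so ~x and x ^ 0xffff cast to the same predicate.

#include <cstdint>

// Only bits [15:0] survive the cast to a predicate.
static uint16_t predicateBits(uint32_t x) { return uint16_t(x & 0xffffu); }

// Holds for every x: the NOT and the XOR with 0xffff agree on those bits.
static bool notEqualsXorOnPredicate(uint32_t x) {
  return predicateBits(~x) == predicateBits(x ^ 0xffffu);
}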
@@ -13850,10 +14078,13 @@ static SDValue CombineBaseUpdate(SDNode *N,
NumVecs = 3; break;
case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD;
NumVecs = 4; break;
+ case Intrinsic::arm_neon_vld1x2:
+ case Intrinsic::arm_neon_vld1x3:
+ case Intrinsic::arm_neon_vld1x4:
case Intrinsic::arm_neon_vld2dup:
case Intrinsic::arm_neon_vld3dup:
case Intrinsic::arm_neon_vld4dup:
- // TODO: Support updating VLDxDUP nodes. For now, we just skip
+ // TODO: Support updating VLD1x and VLDxDUP nodes. For now, we just skip
// combining base updates for such intrinsics.
continue;
case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
@@ -14445,27 +14676,38 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
// use the VMOVN over splitting the store. We are looking for patterns of:
// !rev: 0 N 1 N+1 2 N+2 ...
// rev: N 0 N+1 1 N+2 2 ...
- auto isVMOVNOriginalMask = [&](ArrayRef<int> M, bool rev) {
+ // The shuffle may either be a single source (in which case N = NumElts/2) or
+ // two inputs extended with concat to the same size (in which case N =
+ // NumElts).
+ auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
+ ArrayRef<int> M = SVN->getMask();
unsigned NumElts = ToVT.getVectorNumElements();
- if (NumElts != M.size())
- return false;
+ if (SVN->getOperand(1).isUndef())
+ NumElts /= 2;
- unsigned Off0 = rev ? NumElts : 0;
- unsigned Off1 = rev ? 0 : NumElts;
+ unsigned Off0 = Rev ? NumElts : 0;
+ unsigned Off1 = Rev ? 0 : NumElts;
- for (unsigned i = 0; i < NumElts; i += 2) {
- if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
+ for (unsigned I = 0; I < NumElts; I += 2) {
+ if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
return false;
- if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
+ if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
return false;
}
return true;
};
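A hypothetical helper showing the mask shape the lambda above accepts (assuming NumElts is even); for NumElts = 8 the !Rev mask is {0,8,1,9,2,10,3,11} and the Rev mask is {8,0,9,1,10,2,11,3}:

#include <vector>

static std::vector<int> vmovnMask(unsigned NumElts, bool Rev) {
  std::vector<int> M(NumElts);
  unsigned Off0 = Rev ? NumElts : 0, Off1 = Rev ? 0 : NumElts;
  for (unsigned I = 0; I < NumElts; I += 2) {
    M[I] = int(Off0 + I / 2);     // one source interleaved into even lanes
    M[I + 1] = int(Off1 + I / 2); // the other into odd lanes
  }
  return M;
}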
- if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc->getOperand(0)))
- if (isVMOVNOriginalMask(Shuffle->getMask(), false) ||
- isVMOVNOriginalMask(Shuffle->getMask(), true))
+ // It may be preferable to keep the store unsplit as the trunc may end up
+ // being removed. Check that here.
+ if (Trunc.getOperand(0).getOpcode() == ISD::SMIN) {
+ if (SDValue U = PerformVQDMULHCombine(Trunc.getOperand(0).getNode(), DAG)) {
+ DAG.ReplaceAllUsesWith(Trunc.getOperand(0), U);
+ return SDValue();
+ }
+ }
+ if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
+ if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
return SDValue();
LLVMContext &C = *DAG.getContext();
@@ -14486,7 +14728,8 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
SmallVector<SDValue, 4> Stores;
for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
- SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
+ SDValue NewPtr =
+ DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));
SDValue Extract =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
@@ -14539,15 +14782,15 @@ static SDValue PerformSTORECombine(SDNode *N,
SDValue BasePtr = St->getBasePtr();
SDValue NewST1 = DAG.getStore(
St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
- BasePtr, St->getPointerInfo(), St->getAlignment(),
+ BasePtr, St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
DAG.getConstant(4, DL, MVT::i32));
return DAG.getStore(NewST1.getValue(0), DL,
StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
- OffsetPtr, St->getPointerInfo(),
- std::min(4U, St->getAlignment() / 2),
+ OffsetPtr, St->getPointerInfo().getWithOffset(4),
+ St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
@@ -14721,27 +14964,105 @@ static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
// VADDLV u/s 32
// VMLALV u/s 16/32
+ // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
+ // extend it and use v4i32 instead.
+ auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
+ EVT AVT = A.getValueType();
+ if (!AVT.is128BitVector())
+ A = DAG.getNode(ExtendCode, dl,
+ AVT.changeVectorElementType(MVT::getIntegerVT(
+ 128 / AVT.getVectorMinNumElements())),
+ A);
+ return A;
+ };
auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
return SDValue();
SDValue A = N0->getOperand(0);
if (llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; }))
- return A;
+ return ExtendIfNeeded(A, ExtendCode);
+ return SDValue();
+ };
+ auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
+ ArrayRef<MVT> ExtTypes, SDValue &Mask) {
+ if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
+ !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
+ return SDValue();
+ Mask = N0->getOperand(0);
+ SDValue Ext = N0->getOperand(1);
+ if (Ext->getOpcode() != ExtendCode)
+ return SDValue();
+ SDValue A = Ext->getOperand(0);
+ if (llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; }))
+ return ExtendIfNeeded(A, ExtendCode);
return SDValue();
};
auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
SDValue &A, SDValue &B) {
- if (ResVT != RetTy || N0->getOpcode() != ISD::MUL)
+ // For a vmla we are trying to match a larger pattern:
+ // ExtA = sext/zext A
+ // ExtB = sext/zext B
+ // Mul = mul ExtA, ExtB
+ // vecreduce.add Mul
+ // There might also be an extra extend between the mul and the addreduce, so
+ // long as the bitwidth is high enough to make them equivalent (for example
+ // the original v8i16 might be multiplied at v8i32 and reduced at v8i64).
+ if (ResVT != RetTy)
return false;
- SDValue ExtA = N0->getOperand(0);
- SDValue ExtB = N0->getOperand(1);
+ SDValue Mul = N0;
+ if (Mul->getOpcode() == ExtendCode &&
+ Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
+ ResVT.getScalarSizeInBits())
+ Mul = Mul->getOperand(0);
+ if (Mul->getOpcode() != ISD::MUL)
+ return false;
+ SDValue ExtA = Mul->getOperand(0);
+ SDValue ExtB = Mul->getOperand(1);
if (ExtA->getOpcode() != ExtendCode && ExtB->getOpcode() != ExtendCode)
return false;
A = ExtA->getOperand(0);
B = ExtB->getOperand(0);
if (A.getValueType() == B.getValueType() &&
- llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; }))
+ llvm::any_of(ExtTypes,
+ [&A](MVT Ty) { return A.getValueType() == Ty; })) {
+ A = ExtendIfNeeded(A, ExtendCode);
+ B = ExtendIfNeeded(B, ExtendCode);
return true;
+ }
+ return false;
+ };
+ auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
+ SDValue &A, SDValue &B, SDValue &Mask) {
+ // Same as the pattern above with a select for the zero predicated lanes
+ // ExtA = sext/zext A
+ // ExtB = sext/zext B
+ // Mul = mul ExtA, ExtB
+ // N0 = select Mask, Mul, 0
+ // vecreduce.add N0
+ if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
+ !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
+ return false;
+ Mask = N0->getOperand(0);
+ SDValue Mul = N0->getOperand(1);
+ if (Mul->getOpcode() == ExtendCode &&
+ Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
+ ResVT.getScalarSizeInBits())
+ Mul = Mul->getOperand(0);
+ if (Mul->getOpcode() != ISD::MUL)
+ return false;
+ SDValue ExtA = Mul->getOperand(0);
+ SDValue ExtB = Mul->getOperand(1);
+ if (ExtA->getOpcode() != ExtendCode && ExtB->getOpcode() != ExtendCode)
+ return false;
+ A = ExtA->getOperand(0);
+ B = ExtB->getOperand(0);
+ if (A.getValueType() == B.getValueType() &&
+ llvm::any_of(ExtTypes,
+ [&A](MVT Ty) { return A.getValueType() == Ty; })) {
+ A = ExtendIfNeeded(A, ExtendCode);
+ B = ExtendIfNeeded(B, ExtendCode);
+ return true;
+ }
return false;
};
auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
@@ -14754,20 +15075,93 @@ static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
- if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
+ if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND,
+ {MVT::v4i8, MVT::v4i16, MVT::v4i32}))
return Create64bitNode(ARMISD::VADDLVs, {A});
- if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
+ if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND,
+ {MVT::v4i8, MVT::v4i16, MVT::v4i32}))
return Create64bitNode(ARMISD::VADDLVu, {A});
+ if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
+ return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
+ DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
+ if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
+ return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
+ DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
+
+ SDValue Mask;
+ if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
+ return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
+ if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
+ return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
+ if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND,
+ {MVT::v4i8, MVT::v4i16, MVT::v4i32}, Mask))
+ return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
+ if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND,
+ {MVT::v4i8, MVT::v4i16, MVT::v4i32}, Mask))
+ return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
+ if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
+ return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
+ DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
+ if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
+ return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
+ DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
SDValue A, B;
if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
- if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B))
+ if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND,
+ {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A, B))
return Create64bitNode(ARMISD::VMLALVs, {A, B});
- if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B))
+ if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND,
+ {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A, B))
return Create64bitNode(ARMISD::VMLALVu, {A, B});
+ if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
+ return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
+ DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
+ if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
+ return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
+ DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
+
+ if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B, Mask))
+ return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
+ if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B, Mask))
+ return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
+ if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND,
+ {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A,
+ B, Mask))
+ return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
+ if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND,
+ {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A,
+ B, Mask))
+ return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
+ if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
+ return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
+ DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
+ if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
+ return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
+ DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
+
+ // One complication: when the two inputs of the mul are the same, the output
+ // sext will have been helpfully converted to a zext (a square is never
+ // negative). Turn it back.
+ SDValue Op = N0;
+ if (Op->getOpcode() == ISD::VSELECT)
+ Op = Op->getOperand(1);
+ if (Op->getOpcode() == ISD::ZERO_EXTEND &&
+ Op->getOperand(0)->getOpcode() == ISD::MUL) {
+ SDValue Mul = Op->getOperand(0);
+ if (Mul->getOperand(0) == Mul->getOperand(1) &&
+ Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
+ SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
+ if (Op != N0)
+ Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
+ N0->getOperand(0), Ext, N0->getOperand(2));
+ return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
+ }
+ }
+
return SDValue();
}
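A scalar model of the core pattern matched above (a sketch assuming i16 inputs and an i64 accumulator, not the backend code): vecreduce.add(mul(sext(a), sext(b))) is a widening dot product, which VMLAV/VMLALV compute in a single instruction; the predicated variants additionally zero out masked lanes (the select against the all-zeros vector) before the reduction.

#include <cstddef>
#include <cstdint>
#include <vector>

static int64_t vmlalvRef(const std::vector<int16_t> &a,
                         const std::vector<int16_t> &b) {
  int64_t acc = 0;
  for (size_t i = 0; i < a.size() && i < b.size(); ++i)
    acc += int64_t(a[i]) * int64_t(b[i]); // sext, mul, reduce-add
  return acc;
}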
@@ -15219,12 +15613,13 @@ static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
SmallVector<SDValue, 4> Chains;
for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
- SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
+ SDValue NewPtr =
+ DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));
SDValue NewLoad =
DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
- Alignment.value(), MMOFlags, AAInfo);
+ Alignment, MMOFlags, AAInfo);
Loads.push_back(NewLoad);
Chains.push_back(SDValue(NewLoad.getNode(), 1));
}
@@ -15312,6 +15707,9 @@ static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,
if (!ST->hasMVEIntegerOps())
return SDValue();
+ if (SDValue V = PerformVQDMULHCombine(N, DAG))
+ return V;
+
if (VT != MVT::v4i32 && VT != MVT::v8i16)
return SDValue();
@@ -15919,6 +16317,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
default: break;
+ case ISD::SELECT_CC:
+ case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);
case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget);
case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
@@ -16335,6 +16735,19 @@ bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
switch (II->getIntrinsicID()) {
case Intrinsic::fma:
return !IsFMS(I);
+ case Intrinsic::arm_mve_add_predicated:
+ case Intrinsic::arm_mve_mul_predicated:
+ case Intrinsic::arm_mve_qadd_predicated:
+ case Intrinsic::arm_mve_hadd_predicated:
+ case Intrinsic::arm_mve_vqdmull_predicated:
+ case Intrinsic::arm_mve_qdmulh_predicated:
+ case Intrinsic::arm_mve_qrdmulh_predicated:
+ case Intrinsic::arm_mve_fma_predicated:
+ return true;
+ case Intrinsic::arm_mve_sub_predicated:
+ case Intrinsic::arm_mve_qsub_predicated:
+ case Intrinsic::arm_mve_hsub_predicated:
+ return Operand == 1;
default:
return false;
}
@@ -17063,8 +17476,7 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
return;
KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
- Known.Zero &= KnownRHS.Zero;
- Known.One &= KnownRHS.One;
+ Known = KnownBits::commonBits(Known, KnownRHS);
return;
}
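// (Assumed semantics of KnownBits::commonBits: a bit is known in the result
// only if it is known, with the same value, in both inputs, i.e.
// Zero = LHS.Zero & RHS.Zero and One = LHS.One & RHS.One, which is what the
// two &= lines it replaces computed.)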
case ISD::INTRINSIC_W_CHAIN: {
@@ -17937,6 +18349,9 @@ bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
return false;
if (VT == MVT::f16 && Subtarget->hasFullFP16())
return ARM_AM::getFP16Imm(Imm) != -1;
+ if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
+ ARM_AM::getFP32FP16Imm(Imm) != -1)
+ return true;
if (VT == MVT::f32)
return ARM_AM::getFP32Imm(Imm) != -1;
if (VT == MVT::f64 && Subtarget->hasFP64())
@@ -18710,8 +19125,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
SmallVector<Value *, 6> Ops;
Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
- for (auto S : Shuffles)
- Ops.push_back(S);
+ append_range(Ops, Shuffles);
Ops.push_back(Builder.getInt32(SI->getAlignment()));
Builder.CreateCall(VstNFunc, Ops);
} else {
@@ -18727,8 +19141,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
SmallVector<Value *, 6> Ops;
Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy));
- for (auto S : Shuffles)
- Ops.push_back(S);
+ append_range(Ops, Shuffles);
for (unsigned F = 0; F < Factor; F++) {
Ops.push_back(Builder.getInt32(F));
Builder.CreateCall(VstNFunc, Ops);
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h
index 8b1f4183032e..61a127af07de 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -216,23 +216,37 @@ class VectorType;
VMULLs, // ...signed
VMULLu, // ...unsigned
+ VQDMULH, // MVE vqdmulh instruction
+
// MVE reductions
VADDVs, // sign- or zero-extend the elements of a vector to i32,
VADDVu, // add them all together, and return an i32 of their sum
+ VADDVps, // Same as VADDV[su] but with a v4i1 predicate mask
+ VADDVpu,
VADDLVs, // sign- or zero-extend elements to i64 and sum, returning
VADDLVu, // the low and high 32-bit halves of the sum
- VADDLVAs, // same as VADDLV[su] but also add an input accumulator
+ VADDLVAs, // Same as VADDLV[su] but also add an input accumulator
VADDLVAu, // provided as low and high halves
- VADDLVps, // same as VADDLVs but with a v4i1 predicate mask
- VADDLVpu, // same as VADDLVu but with a v4i1 predicate mask
- VADDLVAps, // same as VADDLVps but with a v4i1 predicate mask
- VADDLVApu, // same as VADDLVpu but with a v4i1 predicate mask
- VMLAVs,
- VMLAVu,
- VMLALVs,
- VMLALVu,
- VMLALVAs,
- VMLALVAu,
+ VADDLVps, // Same as VADDLV[su] but with a v4i1 predicate mask
+ VADDLVpu,
+ VADDLVAps, // Same as VADDLVA[su] but with a v4i1 predicate mask
+ VADDLVApu,
+ VMLAVs, // sign- or zero-extend the elements of two vectors to i32, multiply them
+ VMLAVu, // and add the results together, returning an i32 of their sum
+ VMLAVps, // Same as VMLAV[su] with a v4i1 predicate mask
+ VMLAVpu,
+ VMLALVs, // Same as VMLAV but with i64, returning the low and
+ VMLALVu, // high 32-bit halves of the sum
+ VMLALVps, // Same as VMLALV[su] with a v4i1 predicate mask
+ VMLALVpu,
+ VMLALVAs, // Same as VMLALV but also add an input accumulator
+ VMLALVAu, // provided as low and high halves
+ VMLALVAps, // Same as VMLALVA[su] with a v4i1 predicate mask
+ VMLALVApu,
+ VMINVu, // Find minimum unsigned value of a vector and register
+ VMINVs, // Find minimum signed value of a vector and register
+ VMAXVu, // Find maximum unsigned value of a vector and register
+ VMAXVs, // Find maximum signed value of a vector and register
SMULWB, // Signed multiply word by half word, bottom
SMULWT, // Signed multiply word by half word, top
@@ -271,8 +285,8 @@ class VectorType;
// Vector AND with NOT of immediate
VBICIMM,
- // Vector bitwise select
- VBSL,
+ // Pseudo vector bitwise select
+ VBSP,
// Pseudo-instruction representing a memory copy using ldm/stm
// instructions.
@@ -520,12 +534,6 @@ class VectorType;
const TargetRegisterClass *
getRegClassFor(MVT VT, bool isDivergent = false) const override;
- /// Returns true if a cast between SrcAS and DestAS is a noop.
- bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
- // Addrspacecasts are always noops.
- return true;
- }
-
bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
unsigned &PrefAlign) const override;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrFormats.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrFormats.td
index e13f3437cc7b..85da7c5a535e 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrFormats.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -403,8 +403,9 @@ class InstTemplate<AddrMode am, int sz, IndexMode im,
bit isUnaryDataProc = 0;
bit canXformTo16Bit = 0;
// The instruction is a 16-bit flag setting Thumb instruction. Used
- // by the parser to determine whether to require the 'S' suffix on the
- // mnemonic (when not in an IT block) or preclude it (when in an IT block).
+ // by the parser and if-converter to determine whether to require the 'S'
+ // suffix on the mnemonic (when not in an IT block) or preclude it (when
+ // in an IT block).
bit thumbArithFlagSetting = 0;
bit validForTailPredication = 0;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td
index da0a836c8f95..8dcb319923ae 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -104,11 +104,6 @@ def SDT_ARMIntShiftParts : SDTypeProfile<2, 3, [SDTCisSameAs<0, 1>,
SDTCisInt<0>,
SDTCisInt<4>]>;
-// TODO Add another operand for 'Size' so that we can re-use this node when we
-// start supporting *TP versions.
-def SDT_ARMLoLoop : SDTypeProfile<0, 2, [SDTCisVT<0, i32>,
- SDTCisVT<1, OtherVT>]>;
-
def ARMSmlald : SDNode<"ARMISD::SMLALD", SDT_LongMac>;
def ARMSmlaldx : SDNode<"ARMISD::SMLALDX", SDT_LongMac>;
def ARMSmlsld : SDNode<"ARMISD::SMLSLD", SDT_LongMac>;
@@ -167,9 +162,9 @@ def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
[SDNPInGlue]>;
def ARMsubs : SDNode<"ARMISD::SUBS", SDTIntBinOp, [SDNPOutGlue]>;
-def ARMssatnoshift : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>;
+def ARMssat : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>;
-def ARMusatnoshift : SDNode<"ARMISD::USAT", SDTIntSatNoShOp, []>;
+def ARMusat : SDNode<"ARMISD::USAT", SDTIntSatNoShOp, []>;
def ARMbrcond : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
@@ -303,10 +298,6 @@ def SDTARMVCMPZ : SDTypeProfile<1, 2, [SDTCisInt<2>]>;
def ARMvcmp : SDNode<"ARMISD::VCMP", SDTARMVCMP>;
def ARMvcmpz : SDNode<"ARMISD::VCMPZ", SDTARMVCMPZ>;
-def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMLoLoop, [SDNPHasChain]>;
-def ARMLE : SDNode<"ARMISD::LE", SDT_ARMLoLoop, [SDNPHasChain]>;
-def ARMLoopDec : SDNode<"ARMISD::LOOP_DEC", SDTIntBinOp, [SDNPHasChain]>;
-
// 'VECTOR_REG_CAST' is an operation that reinterprets the contents of a
// vector register as a different vector type, without changing the contents of
// the register. It differs from 'bitconvert' in that bitconvert reinterprets
@@ -380,6 +371,11 @@ def imm_not_XFORM : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(~(int)N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;
+// asr_imm_XFORM - Returns a shift immediate with bit {5} set to 1
+def asr_imm_XFORM : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(0x20 | N->getZExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
/// imm16_31 predicate - True if the 32-bit immediate is in the range [16,31].
def imm16_31 : ImmLeaf<i32, [{
return (int32_t)Imm >= 16 && (int32_t)Imm < 32;
@@ -446,6 +442,8 @@ def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
def imm_even : ImmLeaf<i32, [{ return (Imm & 1) == 0; }]>;
def imm_odd : ImmLeaf<i32, [{ return (Imm & 1) == 1; }]>;
+def asr_imm : ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }], asr_imm_XFORM>;
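// For illustration (derived from the XFORM above, amounts 1..31): an asr #7
// shift becomes 0x20 | 7 = 0x27 in the SSAT/USAT shift-immediate operand,
// whereas the lsl patterns further below pass the amount through unchanged (0x07).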
+
//===----------------------------------------------------------------------===//
// NEON/MVE pattern fragments
//
@@ -498,6 +496,18 @@ def SubReg_i32_lane : SDNodeXForm<imm, [{
}]>;
+def ARMimmAllZerosV: PatLeaf<(bitconvert (v4i32 (ARMvmovImm (i32 0))))>;
+def ARMimmAllZerosD: PatLeaf<(bitconvert (v2i32 (ARMvmovImm (i32 0))))>;
+def ARMimmAllOnesV: PatLeaf<(bitconvert (v16i8 (ARMvmovImm (i32 0xEFF))))>;
+def ARMimmAllOnesD: PatLeaf<(bitconvert (v8i8 (ARMvmovImm (i32 0xEFF))))>;
+
+def ARMimmOneV: PatLeaf<(ARMvmovImm (i32 timm)), [{
+ ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
+ unsigned EltBits = 0;
+ uint64_t EltVal = ARM_AM::decodeVMOVModImm(ConstVal->getZExtValue(), EltBits);
+ return (EltBits == N->getValueType(0).getScalarSizeInBits() && EltVal == 0x01);
+}]>;
+
//===----------------------------------------------------------------------===//
// Operand Definitions.
@@ -812,7 +822,9 @@ def mod_imm_neg : Operand<i32>, PatLeaf<(imm), [{
def arm_i32imm : IntImmLeaf<i32, [{
if (Subtarget->useMovt())
return true;
- return ARM_AM::isSOImmTwoPartVal(Imm.getZExtValue());
+ if (ARM_AM::isSOImmTwoPartVal(Imm.getZExtValue()))
+ return true;
+ return ARM_AM::isSOImmTwoPartValNeg(Imm.getZExtValue());
}]>;
/// imm0_1 predicate - Immediate in the range [0,1].
@@ -2480,23 +2492,29 @@ let isCall = 1,
}
// ARMv5T and above
- def BLX : AXI<(outs), (ins GPR:$func), BrMiscFrm,
- IIC_Br, "blx\t$func",
- [(ARMcall GPR:$func)]>,
+ def BLX : AXI<(outs), (ins GPR:$func), BrMiscFrm, IIC_Br, "blx\t$func", []>,
Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> {
bits<4> func;
let Inst{31-4} = 0b1110000100101111111111110011;
let Inst{3-0} = func;
}
+ def BLX_noip : ARMPseudoExpand<(outs), (ins GPRnoip:$func),
+ 4, IIC_Br, [], (BLX GPR:$func)>,
+ Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]>;
+
def BLX_pred : AI<(outs), (ins GPR:$func), BrMiscFrm,
- IIC_Br, "blx", "\t$func",
- [(ARMcall_pred GPR:$func)]>,
+ IIC_Br, "blx", "\t$func", []>,
Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> {
bits<4> func;
let Inst{27-4} = 0b000100101111111111110011;
let Inst{3-0} = func;
}
+ def BLX_pred_noip : ARMPseudoExpand<(outs), (ins GPRnoip:$func),
+ 4, IIC_Br, [],
+ (BLX_pred GPR:$func, (ops 14, zero_reg))>,
+ Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]>;
+
// ARMv4T
// Note: Restrict $func to the tGPR regclass to prevent it being in LR.
@@ -2522,6 +2540,16 @@ let isCall = 1,
Requires<[IsARM]>, Sched<[WriteBr]>;
}
+def : ARMPat<(ARMcall GPR:$func), (BLX $func)>,
+ Requires<[IsARM, HasV5T, NoSLSBLRMitigation]>;
+def : ARMPat<(ARMcall GPRnoip:$func), (BLX_noip $func)>,
+ Requires<[IsARM, HasV5T, SLSBLRMitigation]>;
+def : ARMPat<(ARMcall_pred GPR:$func), (BLX_pred $func)>,
+ Requires<[IsARM, HasV5T, NoSLSBLRMitigation]>;
+def : ARMPat<(ARMcall_pred GPRnoip:$func), (BLX_pred_noip $func)>,
+ Requires<[IsARM, HasV5T, SLSBLRMitigation]>;
+
+
let isBranch = 1, isTerminator = 1 in {
// FIXME: should be able to write a pattern for ARMBrcond, but can't use
// a two-value operand where a dag node expects two operands. :(
@@ -4061,14 +4089,31 @@ def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm1_32:$pos),
(SSAT imm1_32:$pos, GPRnopc:$a, 0)>;
def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm0_31:$pos),
(USAT imm0_31:$pos, GPRnopc:$a, 0)>;
-def : ARMPat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm),
+def : ARMPat<(ARMssat GPRnopc:$Rn, imm0_31:$imm),
(SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
-def : ARMPat<(ARMusatnoshift GPRnopc:$Rn, imm0_31:$imm),
+def : ARMPat<(ARMusat GPRnopc:$Rn, imm0_31:$imm),
(USAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
def : ARMV6Pat<(int_arm_ssat16 GPRnopc:$a, imm1_16:$pos),
(SSAT16 imm1_16:$pos, GPRnopc:$a)>;
def : ARMV6Pat<(int_arm_usat16 GPRnopc:$a, imm0_15:$pos),
(USAT16 imm0_15:$pos, GPRnopc:$a)>;
+def : ARMV6Pat<(int_arm_ssat (shl GPRnopc:$a, imm0_31:$shft), imm1_32:$pos),
+ (SSAT imm1_32:$pos, GPRnopc:$a, imm0_31:$shft)>;
+def : ARMV6Pat<(int_arm_ssat (sra GPRnopc:$a, asr_imm:$shft), imm1_32:$pos),
+ (SSAT imm1_32:$pos, GPRnopc:$a, asr_imm:$shft)>;
+def : ARMV6Pat<(int_arm_usat (shl GPRnopc:$a, imm0_31:$shft), imm0_31:$pos),
+ (USAT imm0_31:$pos, GPRnopc:$a, imm0_31:$shft)>;
+def : ARMV6Pat<(int_arm_usat (sra GPRnopc:$a, asr_imm:$shft), imm0_31:$pos),
+ (USAT imm0_31:$pos, GPRnopc:$a, asr_imm:$shft)>;
+def : ARMPat<(ARMssat (shl GPRnopc:$Rn, imm0_31:$shft), imm0_31:$pos),
+ (SSAT imm0_31:$pos, GPRnopc:$Rn, imm0_31:$shft)>;
+def : ARMPat<(ARMssat (sra GPRnopc:$Rn, asr_imm:$shft), imm0_31:$pos),
+ (SSAT imm0_31:$pos, GPRnopc:$Rn, asr_imm:$shft)>;
+def : ARMPat<(ARMusat (shl GPRnopc:$Rn, imm0_31:$shft), imm0_31:$pos),
+ (USAT imm0_31:$pos, GPRnopc:$Rn, imm0_31:$shft)>;
+def : ARMPat<(ARMusat (sra GPRnopc:$Rn, asr_imm:$shft), imm0_31:$pos),
+ (USAT imm0_31:$pos, GPRnopc:$Rn, asr_imm:$shft)>;
+
//===----------------------------------------------------------------------===//
// Bitwise Instructions.
@@ -6336,6 +6381,15 @@ def SPACE : PseudoInst<(outs GPR:$Rd), (ins i32imm:$size, GPR:$Rn),
NoItinerary,
[(set GPR:$Rd, (int_arm_space timm:$size, GPR:$Rn))]>;
+// SpeculationBarrierEndBB must only be used after an unconditional control
+// flow, i.e. after a terminator for which isBarrier is True.
+let hasSideEffects = 1, isCodeGenOnly = 1, isTerminator = 1, isBarrier = 1 in {
+ def SpeculationBarrierISBDSBEndBB
+ : PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>;
+ def SpeculationBarrierSBEndBB
+ : PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>;
+}
+
//===----------------------------------
// Atomic cmpxchg for -O0
//===----------------------------------
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td
index 2a1f50d97e3b..0dfea68887e5 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -318,6 +318,78 @@ def MVE_v2f64 : MVEVectorVTInfo<v2f64, ?, v4i1, ?, 0b11, "f", ?>;
def MVE_v16p8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b11, "p", 0b0>;
def MVE_v8p16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b11, "p", 0b1>;
+multiclass MVE_TwoOpPattern<MVEVectorVTInfo VTI, PatFrag Op, Intrinsic PredInt,
+ dag PredOperands, Instruction Inst,
+ SDPatternOperator IdentityVec = null_frag> {
+ // Unpredicated
+ def : Pat<(VTI.Vec (Op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+
+ // Predicated with select
+ if !ne(VTI.Size, 0b11) then {
+ def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$mask),
+ (VTI.Vec (Op (VTI.Vec MQPR:$Qm),
+ (VTI.Vec MQPR:$Qn))),
+ (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+
+ // Optionally with the select folded through the op
+ def : Pat<(VTI.Vec (Op (VTI.Vec MQPR:$Qm),
+ (VTI.Vec (vselect (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$Qn),
+ (VTI.Vec IdentityVec))))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$Qm)))>;
+ }
+
+ // Predicated with intrinsic
+ def : Pat<(VTI.Vec !con((PredInt (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)),
+ PredOperands,
+ (? (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive)))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+}
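// Illustration of the "select folded through the op" case above (using add as
// an assumed example): because add's IdentityVec is the all-zeros vector,
//   add(Qm, vselect(mask, Qn, 0))  ==  vselect(mask, add(Qm, Qn), Qm),
// so it can be selected as the predicated instruction with Qm supplied as the
// inactive value, exactly as that pattern does.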
+
+multiclass MVE_TwoOpPatternDup<MVEVectorVTInfo VTI, PatFrag Op, Intrinsic PredInt,
+ dag PredOperands, Instruction Inst,
+ SDPatternOperator IdentityVec = null_frag> {
+ // Unpredicated
+ def : Pat<(VTI.Vec (Op (VTI.Vec MQPR:$Qm), (VTI.Vec (ARMvdup rGPR:$Rn)))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), rGPR:$Rn))>;
+
+ // Predicated with select
+ if !ne(VTI.Size, 0b11) then {
+ def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$mask),
+ (VTI.Vec (Op (VTI.Vec MQPR:$Qm),
+ (VTI.Vec (ARMvdup rGPR:$Rn)))),
+ (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), rGPR:$Rn,
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+
+ // Optionally with the select folded through the op
+ def : Pat<(VTI.Vec (Op (VTI.Vec MQPR:$Qm),
+ (VTI.Vec (vselect (VTI.Pred VCCR:$mask),
+ (ARMvdup rGPR:$Rn),
+ (VTI.Vec IdentityVec))))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), rGPR:$Rn,
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$Qm)))>;
+ }
+
+ // Predicated with intrinsic
+ def : Pat<(VTI.Vec !con((PredInt (VTI.Vec MQPR:$Qm), (VTI.Vec (ARMvdup rGPR:$Rn))),
+ PredOperands,
+ (? (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive)))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), rGPR:$Rn,
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+}
+
// --------- Start of base classes for the instructions themselves
class MVE_MI<dag oops, dag iops, InstrItinClass itin, string asm,
@@ -378,7 +450,7 @@ class MVE_ScalarShift<string iname, dag oops, dag iops, string asm, string cstr,
: MVE_MI_with_pred<oops, iops, NoItinerary, iname, asm, cstr, pattern> {
let Inst{31-20} = 0b111010100101;
let Inst{8} = 0b1;
-
+ let validForTailPredication=1;
}
class MVE_ScalarShiftSingleReg<string iname, dag iops, string asm, string cstr,
@@ -612,8 +684,13 @@ class MVE_VADDV<string iname, string suffix, dag iops, string cstr,
let validForTailPredication = 1;
}
+def SDTVecReduceP : SDTypeProfile<1, 2, [ // VADDVp
+ SDTCisInt<0>, SDTCisVec<1>, SDTCisVec<2>
+]>;
def ARMVADDVs : SDNode<"ARMISD::VADDVs", SDTVecReduce>;
def ARMVADDVu : SDNode<"ARMISD::VADDVu", SDTVecReduce>;
+def ARMVADDVps : SDNode<"ARMISD::VADDVps", SDTVecReduceP>;
+def ARMVADDVpu : SDNode<"ARMISD::VADDVpu", SDTVecReduceP>;
multiclass MVE_VADDV_A<MVEVectorVTInfo VTI> {
def acc : MVE_VADDV<"vaddva", VTI.Suffix,
@@ -630,20 +707,39 @@ multiclass MVE_VADDV_A<MVEVectorVTInfo VTI> {
if VTI.Unsigned then {
def : Pat<(i32 (vecreduce_add (VTI.Vec MQPR:$vec))),
(i32 (InstN $vec))>;
+ def : Pat<(i32 (vecreduce_add (VTI.Vec (vselect (VTI.Pred VCCR:$pred),
+ (VTI.Vec MQPR:$vec),
+ (VTI.Vec ARMimmAllZerosV))))),
+ (i32 (InstN $vec, ARMVCCThen, $pred))>;
def : Pat<(i32 (ARMVADDVu (VTI.Vec MQPR:$vec))),
(i32 (InstN $vec))>;
+ def : Pat<(i32 (ARMVADDVpu (VTI.Vec MQPR:$vec), (VTI.Pred VCCR:$pred))),
+ (i32 (InstN $vec, ARMVCCThen, $pred))>;
def : Pat<(i32 (add (i32 (vecreduce_add (VTI.Vec MQPR:$vec))),
(i32 tGPREven:$acc))),
(i32 (InstA $acc, $vec))>;
+ def : Pat<(i32 (add (i32 (vecreduce_add (VTI.Vec (vselect (VTI.Pred VCCR:$pred),
+ (VTI.Vec MQPR:$vec),
+ (VTI.Vec ARMimmAllZerosV))))),
+ (i32 tGPREven:$acc))),
+ (i32 (InstA $acc, $vec, ARMVCCThen, $pred))>;
def : Pat<(i32 (add (i32 (ARMVADDVu (VTI.Vec MQPR:$vec))),
(i32 tGPREven:$acc))),
(i32 (InstA $acc, $vec))>;
+ def : Pat<(i32 (add (i32 (ARMVADDVpu (VTI.Vec MQPR:$vec), (VTI.Pred VCCR:$pred))),
+ (i32 tGPREven:$acc))),
+ (i32 (InstA $acc, $vec, ARMVCCThen, $pred))>;
} else {
def : Pat<(i32 (ARMVADDVs (VTI.Vec MQPR:$vec))),
(i32 (InstN $vec))>;
def : Pat<(i32 (add (i32 (ARMVADDVs (VTI.Vec MQPR:$vec))),
(i32 tGPREven:$acc))),
(i32 (InstA $acc, $vec))>;
+ def : Pat<(i32 (ARMVADDVps (VTI.Vec MQPR:$vec), (VTI.Pred VCCR:$pred))),
+ (i32 (InstN $vec, ARMVCCThen, $pred))>;
+ def : Pat<(i32 (add (i32 (ARMVADDVps (VTI.Vec MQPR:$vec), (VTI.Pred VCCR:$pred))),
+ (i32 tGPREven:$acc))),
+ (i32 (InstA $acc, $vec, ARMVCCThen, $pred))>;
}
def : Pat<(i32 (int_arm_mve_addv_predicated (VTI.Vec MQPR:$vec),
@@ -848,6 +944,14 @@ multiclass MVE_VMINMAXV_ty<string iname, bit isMin, string intrBaseName> {
defm u32: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v4u32, intrBaseName>;
}
+def SDTVecReduceR : SDTypeProfile<1, 2, [ // Reduction of an integer and vector into an integer
+ SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2>
+]>;
+def ARMVMINVu : SDNode<"ARMISD::VMINVu", SDTVecReduceR>;
+def ARMVMINVs : SDNode<"ARMISD::VMINVs", SDTVecReduceR>;
+def ARMVMAXVu : SDNode<"ARMISD::VMAXVu", SDTVecReduceR>;
+def ARMVMAXVs : SDNode<"ARMISD::VMAXVs", SDTVecReduceR>;
+
defm MVE_VMINV : MVE_VMINMAXV_ty<"vminv", 1, "int_arm_mve_minv">;
defm MVE_VMAXV : MVE_VMINMAXV_ty<"vmaxv", 0, "int_arm_mve_maxv">;
@@ -878,6 +982,32 @@ let Predicates = [HasMVEInt] in {
def : Pat<(i32 (vecreduce_umin (v4i32 MQPR:$src))),
(i32 (MVE_VMINVu32 (t2MOVi (i32 4294967295)), $src))>;
+ def : Pat<(i32 (ARMVMINVu (i32 rGPR:$x), (v16i8 MQPR:$src))),
+ (i32 (MVE_VMINVu8 $x, $src))>;
+ def : Pat<(i32 (ARMVMINVu (i32 rGPR:$x), (v8i16 MQPR:$src))),
+ (i32 (MVE_VMINVu16 $x, $src))>;
+ def : Pat<(i32 (ARMVMINVu (i32 rGPR:$x), (v4i32 MQPR:$src))),
+ (i32 (MVE_VMINVu32 $x, $src))>;
+ def : Pat<(i32 (ARMVMINVs (i32 rGPR:$x), (v16i8 MQPR:$src))),
+ (i32 (MVE_VMINVs8 $x, $src))>;
+ def : Pat<(i32 (ARMVMINVs (i32 rGPR:$x), (v8i16 MQPR:$src))),
+ (i32 (MVE_VMINVs16 $x, $src))>;
+ def : Pat<(i32 (ARMVMINVs (i32 rGPR:$x), (v4i32 MQPR:$src))),
+ (i32 (MVE_VMINVs32 $x, $src))>;
+
+ def : Pat<(i32 (ARMVMAXVu (i32 rGPR:$x), (v16i8 MQPR:$src))),
+ (i32 (MVE_VMAXVu8 $x, $src))>;
+ def : Pat<(i32 (ARMVMAXVu (i32 rGPR:$x), (v8i16 MQPR:$src))),
+ (i32 (MVE_VMAXVu16 $x, $src))>;
+ def : Pat<(i32 (ARMVMAXVu (i32 rGPR:$x), (v4i32 MQPR:$src))),
+ (i32 (MVE_VMAXVu32 $x, $src))>;
+ def : Pat<(i32 (ARMVMAXVs (i32 rGPR:$x), (v16i8 MQPR:$src))),
+ (i32 (MVE_VMAXVs8 $x, $src))>;
+ def : Pat<(i32 (ARMVMAXVs (i32 rGPR:$x), (v8i16 MQPR:$src))),
+ (i32 (MVE_VMAXVs16 $x, $src))>;
+ def : Pat<(i32 (ARMVMAXVs (i32 rGPR:$x), (v4i32 MQPR:$src))),
+ (i32 (MVE_VMAXVs32 $x, $src))>;
+
}
multiclass MVE_VMINMAXAV_ty<string iname, bit isMin, string intrBaseName> {
@@ -1009,12 +1139,28 @@ def SDTVecReduce2LA : SDTypeProfile<2, 4, [ // VMLALVA
SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>,
SDTCisVec<4>, SDTCisVec<5>
]>;
+def SDTVecReduce2P : SDTypeProfile<1, 3, [ // VMLAV
+ SDTCisInt<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>
+]>;
+def SDTVecReduce2LP : SDTypeProfile<2, 3, [ // VMLALV
+ SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCisVec<4>
+]>;
+def SDTVecReduce2LAP : SDTypeProfile<2, 5, [ // VMLALVA
+ SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>,
+ SDTCisVec<4>, SDTCisVec<5>, SDTCisVec<6>
+]>;
def ARMVMLAVs : SDNode<"ARMISD::VMLAVs", SDTVecReduce2>;
def ARMVMLAVu : SDNode<"ARMISD::VMLAVu", SDTVecReduce2>;
def ARMVMLALVs : SDNode<"ARMISD::VMLALVs", SDTVecReduce2L>;
def ARMVMLALVu : SDNode<"ARMISD::VMLALVu", SDTVecReduce2L>;
-def ARMVMLALVAs : SDNode<"ARMISD::VMLALVAs", SDTVecReduce2LA>;
-def ARMVMLALVAu : SDNode<"ARMISD::VMLALVAu", SDTVecReduce2LA>;
+def ARMVMLALVAs : SDNode<"ARMISD::VMLALVAs", SDTVecReduce2LA>;
+def ARMVMLALVAu : SDNode<"ARMISD::VMLALVAu", SDTVecReduce2LA>;
+def ARMVMLAVps : SDNode<"ARMISD::VMLAVps", SDTVecReduce2P>;
+def ARMVMLAVpu : SDNode<"ARMISD::VMLAVpu", SDTVecReduce2P>;
+def ARMVMLALVps : SDNode<"ARMISD::VMLALVps", SDTVecReduce2LP>;
+def ARMVMLALVpu : SDNode<"ARMISD::VMLALVpu", SDTVecReduce2LP>;
+def ARMVMLALVAps : SDNode<"ARMISD::VMLALVAps", SDTVecReduce2LAP>;
+def ARMVMLALVApu : SDNode<"ARMISD::VMLALVApu", SDTVecReduce2LAP>;
let Predicates = [HasMVEInt] in {
def : Pat<(i32 (vecreduce_add (mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)))),
@@ -1033,22 +1179,68 @@ let Predicates = [HasMVEInt] in {
(i32 (MVE_VMLADAVu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
def : Pat<(i32 (add (i32 (vecreduce_add (mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)))),
- (i32 tGPREven:$src3))),
+ (i32 tGPREven:$src3))),
(i32 (MVE_VMLADAVau32 $src3, $src1, $src2))>;
def : Pat<(i32 (add (i32 (vecreduce_add (mul (v8i16 MQPR:$src1), (v8i16 MQPR:$src2)))),
- (i32 tGPREven:$src3))),
+ (i32 tGPREven:$src3))),
(i32 (MVE_VMLADAVau16 $src3, $src1, $src2))>;
def : Pat<(i32 (add (ARMVMLAVs (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), tGPREven:$Rd)),
(i32 (MVE_VMLADAVas16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
def : Pat<(i32 (add (ARMVMLAVu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), tGPREven:$Rd)),
(i32 (MVE_VMLADAVau16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
def : Pat<(i32 (add (i32 (vecreduce_add (mul (v16i8 MQPR:$src1), (v16i8 MQPR:$src2)))),
- (i32 tGPREven:$src3))),
+ (i32 tGPREven:$src3))),
(i32 (MVE_VMLADAVau8 $src3, $src1, $src2))>;
def : Pat<(i32 (add (ARMVMLAVs (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)), tGPREven:$Rd)),
(i32 (MVE_VMLADAVas8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
def : Pat<(i32 (add (ARMVMLAVu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)), tGPREven:$Rd)),
(i32 (MVE_VMLADAVau8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+
+ // Predicated
+ def : Pat<(i32 (vecreduce_add (vselect (v4i1 VCCR:$pred),
+ (mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)),
+ (v4i32 ARMimmAllZerosV)))),
+ (i32 (MVE_VMLADAVu32 $src1, $src2, ARMVCCThen, $pred))>;
+ def : Pat<(i32 (vecreduce_add (vselect (v8i1 VCCR:$pred),
+ (mul (v8i16 MQPR:$src1), (v8i16 MQPR:$src2)),
+ (v8i16 ARMimmAllZerosV)))),
+ (i32 (MVE_VMLADAVu16 $src1, $src2, ARMVCCThen, $pred))>;
+ def : Pat<(i32 (ARMVMLAVps (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred))),
+ (i32 (MVE_VMLADAVs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred))>;
+ def : Pat<(i32 (ARMVMLAVpu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred))),
+ (i32 (MVE_VMLADAVu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred))>;
+ def : Pat<(i32 (vecreduce_add (vselect (v16i1 VCCR:$pred),
+ (mul (v16i8 MQPR:$src1), (v16i8 MQPR:$src2)),
+ (v16i8 ARMimmAllZerosV)))),
+ (i32 (MVE_VMLADAVu8 $src1, $src2, ARMVCCThen, $pred))>;
+ def : Pat<(i32 (ARMVMLAVps (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), (v16i1 VCCR:$pred))),
+ (i32 (MVE_VMLADAVs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), ARMVCCThen, $pred))>;
+ def : Pat<(i32 (ARMVMLAVpu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), (v16i1 VCCR:$pred))),
+ (i32 (MVE_VMLADAVu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), ARMVCCThen, $pred))>;
+
+ def : Pat<(i32 (add (i32 (vecreduce_add (vselect (v4i1 VCCR:$pred),
+ (mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)),
+ (v4i32 ARMimmAllZerosV)))),
+ (i32 tGPREven:$src3))),
+ (i32 (MVE_VMLADAVau32 $src3, $src1, $src2, ARMVCCThen, $pred))>;
+ def : Pat<(i32 (add (i32 (vecreduce_add (vselect (v8i1 VCCR:$pred),
+ (mul (v8i16 MQPR:$src1), (v8i16 MQPR:$src2)),
+ (v8i16 ARMimmAllZerosV)))),
+ (i32 tGPREven:$src3))),
+ (i32 (MVE_VMLADAVau16 $src3, $src1, $src2, ARMVCCThen, $pred))>;
+ def : Pat<(i32 (add (ARMVMLAVps (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)), tGPREven:$Rd)),
+ (i32 (MVE_VMLADAVas16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred))>;
+ def : Pat<(i32 (add (ARMVMLAVpu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)), tGPREven:$Rd)),
+ (i32 (MVE_VMLADAVau16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred))>;
+ def : Pat<(i32 (add (i32 (vecreduce_add (vselect (v16i1 VCCR:$pred),
+ (mul (v16i8 MQPR:$src1), (v16i8 MQPR:$src2)),
+ (v16i8 ARMimmAllZerosV)))),
+ (i32 tGPREven:$src3))),
+ (i32 (MVE_VMLADAVau8 $src3, $src1, $src2, ARMVCCThen, $pred))>;
+ def : Pat<(i32 (add (ARMVMLAVps (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), (v16i1 VCCR:$pred)), tGPREven:$Rd)),
+ (i32 (MVE_VMLADAVas8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), ARMVCCThen, $pred))>;
+ def : Pat<(i32 (add (ARMVMLAVpu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), (v16i1 VCCR:$pred)), tGPREven:$Rd)),
+ (i32 (MVE_VMLADAVau8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), ARMVCCThen, $pred))>;
}
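
The predicated VMLADAV patterns above fold a `vecreduce_add` of a `vselect`-masked multiply, plus an optional scalar accumulator, into a single predicated instruction. As a rough scalar model of that semantics (the helper name `vmladavPredicated` below is made up for illustration, not an LLVM API), masked-off lanes simply contribute nothing to the sum:

```cpp
// Illustrative scalar model of the predicated VMLADAV reduction; not LLVM code.
#include <array>
#include <cstddef>
#include <cstdint>

// Accumulates a[i]*b[i] only for lanes whose predicate bit is set, starting
// from 'acc' (the tGPREven accumulator in the MVE_VMLADAVa* forms).
int32_t vmladavPredicated(const std::array<int16_t, 8> &a,
                          const std::array<int16_t, 8> &b,
                          const std::array<bool, 8> &pred, int32_t acc = 0) {
  for (std::size_t i = 0; i < a.size(); ++i)
    if (pred[i])
      acc += int32_t(a[i]) * int32_t(b[i]);
  return acc;
}
```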
// vmlav aliases vmladav
@@ -1168,6 +1360,25 @@ let Predicates = [HasMVEInt] in {
(MVE_VMLALDAVas16 tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))>;
def : Pat<(ARMVMLALVAu tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)),
(MVE_VMLALDAVau16 tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))>;
+
+ // Predicated
+ def : Pat<(ARMVMLALVps (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), (v4i1 VCCR:$pred)),
+ (MVE_VMLALDAVs32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), ARMVCCThen, $pred)>;
+ def : Pat<(ARMVMLALVpu (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), (v4i1 VCCR:$pred)),
+ (MVE_VMLALDAVu32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), ARMVCCThen, $pred)>;
+ def : Pat<(ARMVMLALVps (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)),
+ (MVE_VMLALDAVs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred)>;
+ def : Pat<(ARMVMLALVpu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)),
+ (MVE_VMLALDAVu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred)>;
+
+ def : Pat<(ARMVMLALVAps tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), (v4i1 VCCR:$pred)),
+ (MVE_VMLALDAVas32 tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), ARMVCCThen, $pred)>;
+ def : Pat<(ARMVMLALVApu tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), (v4i1 VCCR:$pred)),
+ (MVE_VMLALDAVau32 tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), ARMVCCThen, $pred)>;
+ def : Pat<(ARMVMLALVAps tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)),
+ (MVE_VMLALDAVas16 tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred)>;
+ def : Pat<(ARMVMLALVApu tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)),
+ (MVE_VMLALDAVau16 tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred)>;
}
// vmlalv aliases vmlaldav
@@ -1215,7 +1426,7 @@ class MVE_comp<InstrItinClass itin, string iname, string suffix,
}
class MVE_VMINMAXNM<string iname, string suffix, bit sz, bit bit_21,
- list<dag> pattern=[]>
+ list<dag> pattern=[]>
: MVE_comp<NoItinerary, iname, suffix, "", pattern> {
let Inst{28} = 0b1;
@@ -1231,46 +1442,19 @@ class MVE_VMINMAXNM<string iname, string suffix, bit sz, bit bit_21,
let Predicates = [HasMVEFloat];
}
-def MVE_VMAXNMf32 : MVE_VMINMAXNM<"vmaxnm", "f32", 0b0, 0b0>;
-def MVE_VMAXNMf16 : MVE_VMINMAXNM<"vmaxnm", "f16", 0b1, 0b0>;
-
-let Predicates = [HasMVEFloat] in {
- def : Pat<(v4f32 (fmaxnum (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
- (v4f32 (MVE_VMAXNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
- def : Pat<(v8f16 (fmaxnum (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
- (v8f16 (MVE_VMAXNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
- def : Pat<(v4f32 (int_arm_mve_max_predicated (v4f32 MQPR:$val1), (v4f32 MQPR:$val2), (i32 0),
- (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))),
- (v4f32 (MVE_VMAXNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2),
- ARMVCCThen, (v4i1 VCCR:$mask),
- (v4f32 MQPR:$inactive)))>;
- def : Pat<(v8f16 (int_arm_mve_max_predicated (v8f16 MQPR:$val1), (v8f16 MQPR:$val2), (i32 0),
- (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))),
- (v8f16 (MVE_VMAXNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2),
- ARMVCCThen, (v8i1 VCCR:$mask),
- (v8f16 MQPR:$inactive)))>;
-}
-
-def MVE_VMINNMf32 : MVE_VMINMAXNM<"vminnm", "f32", 0b0, 0b1>;
-def MVE_VMINNMf16 : MVE_VMINMAXNM<"vminnm", "f16", 0b1, 0b1>;
+multiclass MVE_VMINMAXNM_m<string iname, bit bit_4, MVEVectorVTInfo VTI, SDNode Op, Intrinsic PredInt> {
+ def "" : MVE_VMINMAXNM<iname, VTI.Suffix, VTI.Size{0}, bit_4>;
-let Predicates = [HasMVEFloat] in {
- def : Pat<(v4f32 (fminnum (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
- (v4f32 (MVE_VMINNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
- def : Pat<(v8f16 (fminnum (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
- (v8f16 (MVE_VMINNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
- def : Pat<(v4f32 (int_arm_mve_min_predicated (v4f32 MQPR:$val1), (v4f32 MQPR:$val2),
- (i32 0), (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))),
- (v4f32 (MVE_VMINNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2),
- ARMVCCThen, (v4i1 VCCR:$mask),
- (v4f32 MQPR:$inactive)))>;
- def : Pat<(v8f16 (int_arm_mve_min_predicated (v8f16 MQPR:$val1), (v8f16 MQPR:$val2),
- (i32 0), (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))),
- (v8f16 (MVE_VMINNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2),
- ARMVCCThen, (v8i1 VCCR:$mask),
- (v8f16 MQPR:$inactive)))>;
+ let Predicates = [HasMVEFloat] in {
+ defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? (i32 0)), !cast<Instruction>(NAME)>;
+ }
}
+defm MVE_VMAXNMf32 : MVE_VMINMAXNM_m<"vmaxnm", 0b0, MVE_v4f32, fmaxnum, int_arm_mve_max_predicated>;
+defm MVE_VMAXNMf16 : MVE_VMINMAXNM_m<"vmaxnm", 0b0, MVE_v8f16, fmaxnum, int_arm_mve_max_predicated>;
+defm MVE_VMINNMf32 : MVE_VMINMAXNM_m<"vminnm", 0b1, MVE_v4f32, fminnum, int_arm_mve_min_predicated>;
+defm MVE_VMINNMf16 : MVE_VMINMAXNM_m<"vminnm", 0b1, MVE_v8f16, fminnum, int_arm_mve_min_predicated>;
+
class MVE_VMINMAX<string iname, string suffix, bit U, bits<2> size,
bit bit_4, list<dag> pattern=[]>
@@ -1288,22 +1472,11 @@ class MVE_VMINMAX<string iname, string suffix, bit U, bits<2> size,
}
multiclass MVE_VMINMAX_m<string iname, bit bit_4, MVEVectorVTInfo VTI,
- SDNode unpred_op, Intrinsic pred_int> {
+ SDNode Op, Intrinsic PredInt> {
def "" : MVE_VMINMAX<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, bit_4>;
- defvar Inst = !cast<Instruction>(NAME);
let Predicates = [HasMVEInt] in {
- // Unpredicated min/max
- def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
- (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
-
- // Predicated min/max
- def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
- (VTI.Vec MQPR:$inactive))),
- (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
- (VTI.Vec MQPR:$inactive)))>;
+ defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? (i32 VTI.Unsigned)), !cast<Instruction>(NAME)>;
}
}
@@ -1476,61 +1649,41 @@ foreach s=["s8", "s16", "s32", "u8", "u16", "u32", "i8", "i16", "i32", "f16", "f
(MVE_VAND MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>;
}
-multiclass MVE_bit_op<MVEVectorVTInfo VTI, SDNode unpred_op, Intrinsic pred_int, MVE_bit_ops instruction> {
- let Predicates = [HasMVEInt] in {
- // Unpredicated operation
- def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
- (VTI.Vec (instruction (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
- // Predicated operation
- def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
- (VTI.Vec (instruction
- (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
- (VTI.Vec MQPR:$inactive)))>;
- }
-}
-
-defm : MVE_bit_op<MVE_v16i8, and, int_arm_mve_and_predicated, MVE_VAND>;
-defm : MVE_bit_op<MVE_v8i16, and, int_arm_mve_and_predicated, MVE_VAND>;
-defm : MVE_bit_op<MVE_v4i32, and, int_arm_mve_and_predicated, MVE_VAND>;
-defm : MVE_bit_op<MVE_v2i64, and, int_arm_mve_and_predicated, MVE_VAND>;
-
-defm : MVE_bit_op<MVE_v16i8, or, int_arm_mve_orr_predicated, MVE_VORR>;
-defm : MVE_bit_op<MVE_v8i16, or, int_arm_mve_orr_predicated, MVE_VORR>;
-defm : MVE_bit_op<MVE_v4i32, or, int_arm_mve_orr_predicated, MVE_VORR>;
-defm : MVE_bit_op<MVE_v2i64, or, int_arm_mve_orr_predicated, MVE_VORR>;
-
-defm : MVE_bit_op<MVE_v16i8, xor, int_arm_mve_eor_predicated, MVE_VEOR>;
-defm : MVE_bit_op<MVE_v8i16, xor, int_arm_mve_eor_predicated, MVE_VEOR>;
-defm : MVE_bit_op<MVE_v4i32, xor, int_arm_mve_eor_predicated, MVE_VEOR>;
-defm : MVE_bit_op<MVE_v2i64, xor, int_arm_mve_eor_predicated, MVE_VEOR>;
-
-multiclass MVE_bit_op_with_inv<MVEVectorVTInfo VTI, SDNode unpred_op, Intrinsic pred_int, MVE_bit_ops instruction> {
- let Predicates = [HasMVEInt] in {
- // Unpredicated operation
- def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (vnotq (VTI.Vec MQPR:$Qn)))),
- (VTI.Vec (instruction (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
- // Predicated operation
- def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
- (VTI.Vec (instruction
- (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
- (VTI.Vec MQPR:$inactive)))>;
- }
+let Predicates = [HasMVEInt] in {
+ defm : MVE_TwoOpPattern<MVE_v16i8, and, int_arm_mve_and_predicated, (? ), MVE_VAND, ARMimmAllOnesV>;
+ defm : MVE_TwoOpPattern<MVE_v8i16, and, int_arm_mve_and_predicated, (? ), MVE_VAND, ARMimmAllOnesV>;
+ defm : MVE_TwoOpPattern<MVE_v4i32, and, int_arm_mve_and_predicated, (? ), MVE_VAND, ARMimmAllOnesV>;
+ defm : MVE_TwoOpPattern<MVE_v2i64, and, int_arm_mve_and_predicated, (? ), MVE_VAND, ARMimmAllOnesV>;
+
+ defm : MVE_TwoOpPattern<MVE_v16i8, or, int_arm_mve_orr_predicated, (? ), MVE_VORR, ARMimmAllZerosV>;
+ defm : MVE_TwoOpPattern<MVE_v8i16, or, int_arm_mve_orr_predicated, (? ), MVE_VORR, ARMimmAllZerosV>;
+ defm : MVE_TwoOpPattern<MVE_v4i32, or, int_arm_mve_orr_predicated, (? ), MVE_VORR, ARMimmAllZerosV>;
+ defm : MVE_TwoOpPattern<MVE_v2i64, or, int_arm_mve_orr_predicated, (? ), MVE_VORR, ARMimmAllZerosV>;
+
+ defm : MVE_TwoOpPattern<MVE_v16i8, xor, int_arm_mve_eor_predicated, (? ), MVE_VEOR, ARMimmAllZerosV>;
+ defm : MVE_TwoOpPattern<MVE_v8i16, xor, int_arm_mve_eor_predicated, (? ), MVE_VEOR, ARMimmAllZerosV>;
+ defm : MVE_TwoOpPattern<MVE_v4i32, xor, int_arm_mve_eor_predicated, (? ), MVE_VEOR, ARMimmAllZerosV>;
+ defm : MVE_TwoOpPattern<MVE_v2i64, xor, int_arm_mve_eor_predicated, (? ), MVE_VEOR, ARMimmAllZerosV>;
+
+ defm : MVE_TwoOpPattern<MVE_v16i8, BinOpFrag<(and node:$LHS, (vnotq node:$RHS))>,
+ int_arm_mve_bic_predicated, (? ), MVE_VBIC>;
+ defm : MVE_TwoOpPattern<MVE_v8i16, BinOpFrag<(and node:$LHS, (vnotq node:$RHS))>,
+ int_arm_mve_bic_predicated, (? ), MVE_VBIC>;
+ defm : MVE_TwoOpPattern<MVE_v4i32, BinOpFrag<(and node:$LHS, (vnotq node:$RHS))>,
+ int_arm_mve_bic_predicated, (? ), MVE_VBIC>;
+ defm : MVE_TwoOpPattern<MVE_v2i64, BinOpFrag<(and node:$LHS, (vnotq node:$RHS))>,
+ int_arm_mve_bic_predicated, (? ), MVE_VBIC>;
+
+ defm : MVE_TwoOpPattern<MVE_v16i8, BinOpFrag<(or node:$LHS, (vnotq node:$RHS))>,
+ int_arm_mve_orn_predicated, (? ), MVE_VORN>;
+ defm : MVE_TwoOpPattern<MVE_v8i16, BinOpFrag<(or node:$LHS, (vnotq node:$RHS))>,
+ int_arm_mve_orn_predicated, (? ), MVE_VORN>;
+ defm : MVE_TwoOpPattern<MVE_v4i32, BinOpFrag<(or node:$LHS, (vnotq node:$RHS))>,
+ int_arm_mve_orn_predicated, (? ), MVE_VORN>;
+ defm : MVE_TwoOpPattern<MVE_v2i64, BinOpFrag<(or node:$LHS, (vnotq node:$RHS))>,
+ int_arm_mve_orn_predicated, (? ), MVE_VORN>;
}
-defm : MVE_bit_op_with_inv<MVE_v16i8, and, int_arm_mve_bic_predicated, MVE_VBIC>;
-defm : MVE_bit_op_with_inv<MVE_v8i16, and, int_arm_mve_bic_predicated, MVE_VBIC>;
-defm : MVE_bit_op_with_inv<MVE_v4i32, and, int_arm_mve_bic_predicated, MVE_VBIC>;
-defm : MVE_bit_op_with_inv<MVE_v2i64, and, int_arm_mve_bic_predicated, MVE_VBIC>;
-
-defm : MVE_bit_op_with_inv<MVE_v16i8, or, int_arm_mve_orn_predicated, MVE_VORN>;
-defm : MVE_bit_op_with_inv<MVE_v8i16, or, int_arm_mve_orn_predicated, MVE_VORN>;
-defm : MVE_bit_op_with_inv<MVE_v4i32, or, int_arm_mve_orn_predicated, MVE_VORN>;
-defm : MVE_bit_op_with_inv<MVE_v2i64, or, int_arm_mve_orn_predicated, MVE_VORN>;
-
class MVE_bit_cmode<string iname, string suffix, bit halfword, dag inOps>
: MVE_p<(outs MQPR:$Qd), inOps, NoItinerary,
iname, suffix, "$Qd, $imm", vpred_n, "$Qd = $Qd_src"> {
@@ -1565,7 +1718,8 @@ multiclass MVE_bit_cmode_p<string iname, bit opcode,
defvar UnpredPat = (VTI.Vec (op (VTI.Vec MQPR:$src), timm:$simm));
let Predicates = [HasMVEInt] in {
- def : Pat<UnpredPat, (VTI.Vec (Inst (VTI.Vec MQPR:$src), imm_type:$simm))>;
+ def : Pat<UnpredPat,
+ (VTI.Vec (Inst (VTI.Vec MQPR:$src), imm_type:$simm))>;
def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred),
UnpredPat, (VTI.Vec MQPR:$src))),
(VTI.Vec (Inst (VTI.Vec MQPR:$src), imm_type:$simm,
@@ -1775,31 +1929,18 @@ class MVE_VMULt1<string iname, string suffix, bits<2> size,
let validForTailPredication = 1;
}
-multiclass MVE_VMUL_m<string iname, MVEVectorVTInfo VTI,
- SDNode unpred_op, Intrinsic pred_int> {
- def "" : MVE_VMULt1<iname, VTI.Suffix, VTI.Size>;
- defvar Inst = !cast<Instruction>(NAME);
+multiclass MVE_VMUL_m<MVEVectorVTInfo VTI> {
+ def "" : MVE_VMULt1<"vmul", VTI.Suffix, VTI.Size>;
let Predicates = [HasMVEInt] in {
- // Unpredicated multiply
- def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
- (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
-
- // Predicated multiply
- def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
- (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
- (VTI.Vec MQPR:$inactive)))>;
+ defm : MVE_TwoOpPattern<VTI, mul, int_arm_mve_mul_predicated, (? ),
+ !cast<Instruction>(NAME), ARMimmOneV>;
}
}
-multiclass MVE_VMUL<MVEVectorVTInfo VTI>
- : MVE_VMUL_m<"vmul", VTI, mul, int_arm_mve_mul_predicated>;
-
-defm MVE_VMULi8 : MVE_VMUL<MVE_v16i8>;
-defm MVE_VMULi16 : MVE_VMUL<MVE_v8i16>;
-defm MVE_VMULi32 : MVE_VMUL<MVE_v4i32>;
+defm MVE_VMULi8 : MVE_VMUL_m<MVE_v16i8>;
+defm MVE_VMULi16 : MVE_VMUL_m<MVE_v8i16>;
+defm MVE_VMULi32 : MVE_VMUL_m<MVE_v4i32>;
class MVE_VQxDMULH_Base<string iname, string suffix, bits<2> size, bit rounding,
list<dag> pattern=[]>
@@ -1811,30 +1952,30 @@ class MVE_VQxDMULH_Base<string iname, string suffix, bits<2> size, bit rounding,
let Inst{12-8} = 0b01011;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
+def MVEvqdmulh : SDNode<"ARMISD::VQDMULH", SDTIntBinOp>;
+
multiclass MVE_VQxDMULH_m<string iname, MVEVectorVTInfo VTI,
- SDNode unpred_op, Intrinsic pred_int,
+ SDNode Op, Intrinsic unpred_int, Intrinsic pred_int,
bit rounding> {
def "" : MVE_VQxDMULH_Base<iname, VTI.Suffix, VTI.Size, rounding>;
defvar Inst = !cast<Instruction>(NAME);
let Predicates = [HasMVEInt] in {
- // Unpredicated multiply
- def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
- (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+ defm : MVE_TwoOpPattern<VTI, Op, pred_int, (? ), Inst>;
- // Predicated multiply
- def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
- (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
- (VTI.Vec MQPR:$inactive)))>;
+ // Extra unpredicated multiply intrinsic patterns
+ def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
}
}
multiclass MVE_VQxDMULH<string iname, MVEVectorVTInfo VTI, bit rounding>
- : MVE_VQxDMULH_m<iname, VTI, !if(rounding, int_arm_mve_vqrdmulh,
+ : MVE_VQxDMULH_m<iname, VTI, !if(rounding, null_frag,
+ MVEvqdmulh),
+ !if(rounding, int_arm_mve_vqrdmulh,
int_arm_mve_vqdmulh),
!if(rounding, int_arm_mve_qrdmulh_predicated,
int_arm_mve_qdmulh_predicated),
@@ -1862,21 +2003,12 @@ class MVE_VADDSUB<string iname, string suffix, bits<2> size, bit subtract,
}
multiclass MVE_VADDSUB_m<string iname, MVEVectorVTInfo VTI, bit subtract,
- SDNode unpred_op, Intrinsic pred_int> {
+ SDNode Op, Intrinsic PredInt> {
def "" : MVE_VADDSUB<iname, VTI.Suffix, VTI.Size, subtract>;
defvar Inst = !cast<Instruction>(NAME);
let Predicates = [HasMVEInt] in {
- // Unpredicated add/subtract
- def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
- (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
-
- // Predicated add/subtract
- def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
- (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
- (VTI.Vec MQPR:$inactive)))>;
+ defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? ), !cast<Instruction>(NAME), ARMimmAllZerosV>;
}
}
@@ -1914,22 +2046,13 @@ class MVE_VQSUB_<string suffix, bit U, bits<2> size>
: MVE_VQADDSUB<"vqsub", suffix, U, 0b1, size>;
multiclass MVE_VQADD_m<MVEVectorVTInfo VTI,
- SDNode unpred_op, Intrinsic pred_int> {
+ SDNode Op, Intrinsic PredInt> {
def "" : MVE_VQADD_<VTI.Suffix, VTI.Unsigned, VTI.Size>;
defvar Inst = !cast<Instruction>(NAME);
let Predicates = [HasMVEInt] in {
- // Unpredicated saturating add
- def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
- (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
-
- // Predicated saturating add
- def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
- (VTI.Vec MQPR:$inactive))),
- (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
- (VTI.Vec MQPR:$inactive)))>;
+ defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? (i32 VTI.Unsigned)),
+ !cast<Instruction>(NAME)>;
}
}
@@ -1944,22 +2067,13 @@ defm MVE_VQADDu16 : MVE_VQADD<MVE_v8u16, uaddsat>;
defm MVE_VQADDu32 : MVE_VQADD<MVE_v4u32, uaddsat>;
multiclass MVE_VQSUB_m<MVEVectorVTInfo VTI,
- SDNode unpred_op, Intrinsic pred_int> {
+ SDNode Op, Intrinsic PredInt> {
def "" : MVE_VQSUB_<VTI.Suffix, VTI.Unsigned, VTI.Size>;
defvar Inst = !cast<Instruction>(NAME);
let Predicates = [HasMVEInt] in {
- // Unpredicated saturating subtract
- def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
- (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
-
- // Predicated saturating subtract
- def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
- (VTI.Vec MQPR:$inactive))),
- (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
- (VTI.Vec MQPR:$inactive)))>;
+ defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? (i32 VTI.Unsigned)),
+ !cast<Instruction>(NAME)>;
}
}
@@ -2085,30 +2199,32 @@ defm MVE_VRHADDu32 : MVE_VRHADD<MVE_v4u32>;
// modelling that here with these patterns, but we're using no wrap forms of
// add to ensure that the extra bit of information is not needed for the
// arithmetic or the rounding.
-def : Pat<(v16i8 (ARMvshrsImm (addnsw (addnsw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)),
- (v16i8 (ARMvmovImm (i32 3585)))),
- (i32 1))),
- (MVE_VRHADDs8 MQPR:$Qm, MQPR:$Qn)>;
-def : Pat<(v8i16 (ARMvshrsImm (addnsw (addnsw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)),
- (v8i16 (ARMvmovImm (i32 2049)))),
- (i32 1))),
- (MVE_VRHADDs16 MQPR:$Qm, MQPR:$Qn)>;
-def : Pat<(v4i32 (ARMvshrsImm (addnsw (addnsw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)),
- (v4i32 (ARMvmovImm (i32 1)))),
- (i32 1))),
- (MVE_VRHADDs32 MQPR:$Qm, MQPR:$Qn)>;
-def : Pat<(v16i8 (ARMvshruImm (addnuw (addnuw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)),
- (v16i8 (ARMvmovImm (i32 3585)))),
- (i32 1))),
- (MVE_VRHADDu8 MQPR:$Qm, MQPR:$Qn)>;
-def : Pat<(v8i16 (ARMvshruImm (addnuw (addnuw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)),
- (v8i16 (ARMvmovImm (i32 2049)))),
- (i32 1))),
- (MVE_VRHADDu16 MQPR:$Qm, MQPR:$Qn)>;
-def : Pat<(v4i32 (ARMvshruImm (addnuw (addnuw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)),
- (v4i32 (ARMvmovImm (i32 1)))),
- (i32 1))),
- (MVE_VRHADDu32 MQPR:$Qm, MQPR:$Qn)>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 (ARMvshrsImm (addnsw (addnsw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)),
+ (v16i8 (ARMvmovImm (i32 3585)))),
+ (i32 1))),
+ (MVE_VRHADDs8 MQPR:$Qm, MQPR:$Qn)>;
+ def : Pat<(v8i16 (ARMvshrsImm (addnsw (addnsw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)),
+ (v8i16 (ARMvmovImm (i32 2049)))),
+ (i32 1))),
+ (MVE_VRHADDs16 MQPR:$Qm, MQPR:$Qn)>;
+ def : Pat<(v4i32 (ARMvshrsImm (addnsw (addnsw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)),
+ (v4i32 (ARMvmovImm (i32 1)))),
+ (i32 1))),
+ (MVE_VRHADDs32 MQPR:$Qm, MQPR:$Qn)>;
+ def : Pat<(v16i8 (ARMvshruImm (addnuw (addnuw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)),
+ (v16i8 (ARMvmovImm (i32 3585)))),
+ (i32 1))),
+ (MVE_VRHADDu8 MQPR:$Qm, MQPR:$Qn)>;
+ def : Pat<(v8i16 (ARMvshruImm (addnuw (addnuw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)),
+ (v8i16 (ARMvmovImm (i32 2049)))),
+ (i32 1))),
+ (MVE_VRHADDu16 MQPR:$Qm, MQPR:$Qn)>;
+ def : Pat<(v4i32 (ARMvshruImm (addnuw (addnuw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)),
+ (v4i32 (ARMvmovImm (i32 1)))),
+ (i32 1))),
+ (MVE_VRHADDu32 MQPR:$Qm, MQPR:$Qn)>;
+}
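+
+For reference, the patterns above recognise an explicit rounding halving add, (a + b + 1) >> 1, written with no-wrap adds and a splat-of-one `ARMvmovImm` constant (3585, 2049 and 1 appear to encode a per-lane 1 at the 8-, 16- and 32-bit element sizes). A minimal scalar sketch of one lane, purely illustrative and with made-up helper names:

```cpp
// Illustrative scalar model of one VRHADD lane; not LLVM code.
#include <cstdint>

// Unsigned rounding halving add, (a + b + 1) >> 1, computed in a wider type
// so the carry bit is never lost (mirrors the no-wrap adds in the patterns).
uint8_t rhaddU8(uint8_t a, uint8_t b) {
  return uint8_t((uint32_t(a) + uint32_t(b) + 1u) >> 1);
}

// Signed variant, matching the ARMvshrsImm/addnsw form of the patterns.
int8_t rhaddS8(int8_t a, int8_t b) {
  return int8_t((int32_t(a) + int32_t(b) + 1) >> 1);
}
```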
class MVE_VHADDSUB<string iname, string suffix, bit U, bit subtract,
@@ -2357,8 +2473,9 @@ multiclass MVE_VABSNEG_int_m<string iname, bit negate, bit saturate,
let Predicates = [HasMVEInt] in {
// VQABS and VQNEG have more difficult isel patterns defined elsewhere
- if !eq(saturate, 0) then {
- def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$v))), (VTI.Vec (Inst $v))>;
+ if !not(saturate) then {
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$v))),
+ (VTI.Vec (Inst $v))>;
}
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$v), (VTI.Pred VCCR:$mask),
@@ -2915,7 +3032,7 @@ multiclass MVE_VSHRN_patterns<MVE_shift_imm_partial inst,
defvar outparams = (inst (OutVTI.Vec MQPR:$QdSrc), (InVTI.Vec MQPR:$Qm),
(imm:$imm));
- def : Pat<(OutVTI.Vec !setop(inparams, int_arm_mve_vshrn)),
+ def : Pat<(OutVTI.Vec !setdagop(inparams, int_arm_mve_vshrn)),
(OutVTI.Vec outparams)>;
def : Pat<(OutVTI.Vec !con(inparams, (int_arm_mve_vshrn_predicated
(InVTI.Pred VCCR:$pred)))),
@@ -3117,7 +3234,7 @@ multiclass MVE_VSxI_patterns<MVE_VSxI_imm inst, string name,
defvar unpred_int = !cast<Intrinsic>("int_arm_mve_" # name);
defvar pred_int = !cast<Intrinsic>("int_arm_mve_" # name # "_predicated");
- def : Pat<(VTI.Vec !setop(inparams, unpred_int)),
+ def : Pat<(VTI.Vec !setdagop(inparams, unpred_int)),
(VTI.Vec outparams)>;
def : Pat<(VTI.Vec !con(inparams, (pred_int (VTI.Pred VCCR:$pred)))),
(VTI.Vec !con(outparams, (? ARMVCCThen, VCCR:$pred)))>;
@@ -3469,18 +3586,12 @@ class MVE_VMUL_fp<string iname, string suffix, bit size, list<dag> pattern=[]>
}
multiclass MVE_VMULT_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI,
- SDNode unpred_op, Intrinsic pred_int> {
+ SDNode Op, Intrinsic PredInt> {
def "" : MVE_VMUL_fp<iname, VTI.Suffix, VTI.Size{0}>;
defvar Inst = !cast<Instruction>(NAME);
let Predicates = [HasMVEFloat] in {
- def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
- (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
- def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
- (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
- (VTI.Vec MQPR:$inactive)))>;
+ defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? ), !cast<Instruction>(NAME)>;
}
}
@@ -3571,14 +3682,23 @@ multiclass MVE_VFMA_fp_multi<string iname, bit fms, MVEVectorVTInfo VTI> {
let Predicates = [HasMVEFloat] in {
if fms then {
- def : Pat<(VTI.Vec (fma (fneg m1), m2, add)), (Inst $add, $m1, $m2)>;
- def : Pat<(VTI.Vec (fma m1, (fneg m2), add)), (Inst $add, $m1, $m2)>;
+ def : Pat<(VTI.Vec (fma (fneg m1), m2, add)),
+ (Inst $add, $m1, $m2)>;
+ def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred),
+ (VTI.Vec (fma (fneg m1), m2, add)),
+ add)),
+ (Inst $add, $m1, $m2, ARMVCCThen, $pred)>;
def : Pat<(VTI.Vec (pred_int (fneg m1), m2, add, pred)),
(Inst $add, $m1, $m2, ARMVCCThen, $pred)>;
def : Pat<(VTI.Vec (pred_int m1, (fneg m2), add, pred)),
(Inst $add, $m1, $m2, ARMVCCThen, $pred)>;
} else {
- def : Pat<(VTI.Vec (fma m1, m2, add)), (Inst $add, $m1, $m2)>;
+ def : Pat<(VTI.Vec (fma m1, m2, add)),
+ (Inst $add, $m1, $m2)>;
+ def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred),
+ (VTI.Vec (fma m1, m2, add)),
+ add)),
+ (Inst $add, $m1, $m2, ARMVCCThen, $pred)>;
def : Pat<(VTI.Vec (pred_int m1, m2, add, pred)),
(Inst $add, $m1, $m2, ARMVCCThen, $pred)>;
}
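
The new `vselect` patterns above let a merge-predicated VFMA/VFMS be selected whenever the inactive lanes take the addend value, which appears to match the instruction's tied addend operand. A one-lane scalar sketch of that behaviour (the function names are illustrative only, not LLVM code):

```cpp
// Illustrative one-lane model of the vselect folding; not LLVM code.
#include <cmath>

// When the predicate bit is clear the lane keeps the addend, which is the
// merging behaviour of a predicated VFMA whose addend is tied to the output.
float fmaLane(bool pred, float m1, float m2, float add) {
  return pred ? std::fma(m1, m2, add) : add;
}

// The VFMS form negates one multiplicand before the same fold.
float fmsLane(bool pred, float m1, float m2, float add) {
  return pred ? std::fma(-m1, m2, add) : add;
}
```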
@@ -3591,20 +3711,14 @@ defm MVE_VFMSf32 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v4f32>;
defm MVE_VFMSf16 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v8f16>;
multiclass MVE_VADDSUB_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI,
- SDNode unpred_op, Intrinsic pred_int> {
+ SDNode Op, Intrinsic PredInt> {
def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size{0}, 0, 1, bit_21> {
let validForTailPredication = 1;
}
defvar Inst = !cast<Instruction>(NAME);
let Predicates = [HasMVEFloat] in {
- def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
- (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
- def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
- (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
- (VTI.Vec MQPR:$inactive)))>;
+ defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? ), !cast<Instruction>(NAME)>;
}
}
@@ -3706,7 +3820,14 @@ multiclass MVE_VABD_fp_m<MVEVectorVTInfo VTI>
: MVE_VABDT_fp_m<VTI, int_arm_mve_vabd, int_arm_mve_abd_predicated>;
defm MVE_VABDf32 : MVE_VABD_fp_m<MVE_v4f32>;
-defm MVE_VABDf16 : MVE_VABD_fp_m<MVE_v8f16>;
+defm MVE_VABDf16 : MVE_VABD_fp_m<MVE_v8f16>;
+
+let Predicates = [HasMVEFloat] in {
+ def : Pat<(v8f16 (fabs (fsub (v8f16 MQPR:$Qm), (v8f16 MQPR:$Qn)))),
+ (MVE_VABDf16 MQPR:$Qm, MQPR:$Qn)>;
+ def : Pat<(v4f32 (fabs (fsub (v4f32 MQPR:$Qm), (v4f32 MQPR:$Qn)))),
+ (MVE_VABDf32 MQPR:$Qm, MQPR:$Qn)>;
+}
class MVE_VCVT_fix<string suffix, bit fsi, bit U, bit op,
Operand imm_operand_type>
@@ -3926,8 +4047,8 @@ multiclass MVE_VABSNEG_fp_m<string iname, SDNode unpred_op, Intrinsic pred_int,
defvar Inst = !cast<Instruction>(NAME);
let Predicates = [HasMVEInt] in {
- def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$v))), (VTI.Vec (Inst $v))>;
-
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$v))),
+ (VTI.Vec (Inst $v))>;
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$v), (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive))),
(VTI.Vec (Inst $v, ARMVCCThen, $mask, $inactive))>;
@@ -3962,6 +4083,8 @@ class MVE_VMAXMINNMA<string iname, string suffix, bit size, bit bit_12,
let Inst{4} = 0b0;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b1;
+
+ let isCommutable = 1;
}
multiclass MVE_VMAXMINNMA_m<string iname, MVEVectorVTInfo VTI,
@@ -4287,6 +4410,10 @@ let Predicates = [HasMVEInt] in {
// vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles.
def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>;
+def load_align4 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 4;
+}]>;
+
let Predicates = [HasMVEInt] in {
foreach VT = [ v4i1, v8i1, v16i1 ] in {
def : Pat<(i32 (predicate_cast (VT VCCR:$src))),
@@ -4299,6 +4426,13 @@ let Predicates = [HasMVEInt] in {
(VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>;
}
+ // If we happen to be casting from a load we can convert that straight
+ // into a predicate load, so long as the load is of the correct type.
+ foreach VT = [ v4i1, v8i1, v16i1 ] in {
+ def : Pat<(VT (predicate_cast (i32 (load_align4 taddrmode_imm7<2>:$addr)))),
+ (VT (VLDR_P0_off taddrmode_imm7<2>:$addr))>;
+ }
+
// Here we match the specific SDNode type 'ARMVectorRegCastImpl'
// rather than the more general 'ARMVectorRegCast' which would also
// match some bitconverts. If we use the latter in cases where the
@@ -4307,7 +4441,8 @@ let Predicates = [HasMVEInt] in {
foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
- def : Pat<(VT (ARMVectorRegCastImpl (VT2 MQPR:$src))), (VT MQPR:$src)>;
+ def : Pat<(VT (ARMVectorRegCastImpl (VT2 MQPR:$src))),
+ (VT MQPR:$src)>;
}
// end of MVE compares
@@ -4635,7 +4770,7 @@ class MVE_VxMOVxN<string iname, string suffix, bit bit_28, bit bit_17,
let Inst{16} = 0b1;
let Inst{12} = T;
let Inst{8} = 0b0;
- let Inst{7} = !if(!eq(bit_17, 0), 1, 0);
+ let Inst{7} = !not(bit_17);
let Inst{0} = 0b1;
let validForTailPredication = 1;
let retainsPreviousHalfElement = 1;
@@ -4666,7 +4801,7 @@ multiclass MVE_VMOVN_p<Instruction Inst, bit top,
(VTI.Vec MQPR:$Qm), (i32 top))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src), (VTI.Vec MQPR:$Qm)))>;
- if !eq(top, 0) then {
+ if !not(top) then {
// If we see MVEvmovn(a,ARMvrev(b),1), that wants to overwrite the odd
// lanes of a with the odd lanes of b. In other words, the lanes we're
// _keeping_ from a are the even ones. So we can flip it round and say that
@@ -5023,32 +5158,6 @@ multiclass MVE_vec_scalar_int_pat_m<Instruction inst, MVEVectorVTInfo VTI,
}
}
-// Patterns for vector-scalar instructions with FP operands
-multiclass MVE_vec_scalar_fp_pat_m<SDNode unpred_op, Intrinsic pred_int,
- Instruction instr_f16,
- Instruction instr_f32> {
- let Predicates = [HasMVEFloat] in {
- // Unpredicated F16
- def : Pat<(v8f16 (unpred_op (v8f16 MQPR:$Qm), (v8f16 (ARMvdup rGPR:$val)))),
- (v8f16 (instr_f16 (v8f16 MQPR:$Qm), (i32 rGPR:$val)))>;
- // Unpredicated F32
- def : Pat<(v4f32 (unpred_op (v4f32 MQPR:$Qm), (v4f32 (ARMvdup rGPR:$val)))),
- (v4f32 (instr_f32 (v4f32 MQPR:$Qm), (i32 rGPR:$val)))>;
- // Predicated F16
- def : Pat<(v8f16 (pred_int (v8f16 MQPR:$Qm), (v8f16 (ARMvdup rGPR:$val)),
- (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))),
- (v8f16 (instr_f16 (v8f16 MQPR:$Qm), (i32 rGPR:$val),
- ARMVCCThen, (v8i1 VCCR:$mask),
- (v8f16 MQPR:$inactive)))>;
- // Predicated F32
- def : Pat<(v4f32 (pred_int (v4f32 MQPR:$Qm), (v4f32 (ARMvdup rGPR:$val)),
- (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))),
- (v4f32 (instr_f32 (v4f32 MQPR:$Qm), (i32 rGPR:$val),
- ARMVCCThen, (v4i1 VCCR:$mask),
- (v4f32 MQPR:$inactive)))>;
- }
-}
-
class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size,
bit bit_5, bit bit_12, bit bit_16, bit bit_28>
: MVE_qDest_rSrc<iname, suffix, ""> {
@@ -5064,10 +5173,11 @@ class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size,
// Vector-scalar add/sub
multiclass MVE_VADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract,
- SDNode unpred_op, Intrinsic pred_int> {
+ SDNode Op, Intrinsic PredInt> {
def "" : MVE_VADDSUB_qr<iname, VTI.Suffix, VTI.Size, 0b0, subtract, 0b1, 0b0>;
- defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI,
- unpred_op, pred_int>;
+ let Predicates = [HasMVEInt] in {
+ defm : MVE_TwoOpPatternDup<VTI, Op, PredInt, (? ), !cast<Instruction>(NAME), ARMimmAllZerosV>;
+ }
}
multiclass MVE_VADD_qr_m<MVEVectorVTInfo VTI>
@@ -5086,36 +5196,35 @@ defm MVE_VSUB_qr_i32 : MVE_VSUB_qr_m<MVE_v4i32>;
// Vector-scalar saturating add/sub
multiclass MVE_VQADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract,
- SDNode unpred_op_s, SDNode unpred_op_u,
- Intrinsic pred_int> {
+ SDNode Op, Intrinsic PredInt> {
def "" : MVE_VADDSUB_qr<iname, VTI.Suffix, VTI.Size, 0b1, subtract,
0b0, VTI.Unsigned>;
- defvar unpred_op = !if(VTI.Unsigned, unpred_op_u, unpred_op_s);
- defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI,
- unpred_op, pred_int, 0, 1>;
+
+ let Predicates = [HasMVEInt] in {
+ defm : MVE_TwoOpPatternDup<VTI, Op, PredInt, (? (i32 VTI.Unsigned)),
+ !cast<Instruction>(NAME)>;
+ }
}
-multiclass MVE_VQADD_qr_m<MVEVectorVTInfo VTI>
- : MVE_VQADDSUB_qr_m<"vqadd", VTI, 0b0, saddsat, uaddsat,
- int_arm_mve_qadd_predicated>;
+multiclass MVE_VQADD_qr_m<MVEVectorVTInfo VTI, SDNode Op>
+ : MVE_VQADDSUB_qr_m<"vqadd", VTI, 0b0, Op, int_arm_mve_qadd_predicated>;
-multiclass MVE_VQSUB_qr_m<MVEVectorVTInfo VTI>
- : MVE_VQADDSUB_qr_m<"vqsub", VTI, 0b1, ssubsat, usubsat,
- int_arm_mve_qsub_predicated>;
+multiclass MVE_VQSUB_qr_m<MVEVectorVTInfo VTI, SDNode Op>
+ : MVE_VQADDSUB_qr_m<"vqsub", VTI, 0b1, Op, int_arm_mve_qsub_predicated>;
-defm MVE_VQADD_qr_s8 : MVE_VQADD_qr_m<MVE_v16s8>;
-defm MVE_VQADD_qr_s16 : MVE_VQADD_qr_m<MVE_v8s16>;
-defm MVE_VQADD_qr_s32 : MVE_VQADD_qr_m<MVE_v4s32>;
-defm MVE_VQADD_qr_u8 : MVE_VQADD_qr_m<MVE_v16u8>;
-defm MVE_VQADD_qr_u16 : MVE_VQADD_qr_m<MVE_v8u16>;
-defm MVE_VQADD_qr_u32 : MVE_VQADD_qr_m<MVE_v4u32>;
+defm MVE_VQADD_qr_s8 : MVE_VQADD_qr_m<MVE_v16s8, saddsat>;
+defm MVE_VQADD_qr_s16 : MVE_VQADD_qr_m<MVE_v8s16, saddsat>;
+defm MVE_VQADD_qr_s32 : MVE_VQADD_qr_m<MVE_v4s32, saddsat>;
+defm MVE_VQADD_qr_u8 : MVE_VQADD_qr_m<MVE_v16u8, uaddsat>;
+defm MVE_VQADD_qr_u16 : MVE_VQADD_qr_m<MVE_v8u16, uaddsat>;
+defm MVE_VQADD_qr_u32 : MVE_VQADD_qr_m<MVE_v4u32, uaddsat>;
-defm MVE_VQSUB_qr_s8 : MVE_VQSUB_qr_m<MVE_v16s8>;
-defm MVE_VQSUB_qr_s16 : MVE_VQSUB_qr_m<MVE_v8s16>;
-defm MVE_VQSUB_qr_s32 : MVE_VQSUB_qr_m<MVE_v4s32>;
-defm MVE_VQSUB_qr_u8 : MVE_VQSUB_qr_m<MVE_v16u8>;
-defm MVE_VQSUB_qr_u16 : MVE_VQSUB_qr_m<MVE_v8u16>;
-defm MVE_VQSUB_qr_u32 : MVE_VQSUB_qr_m<MVE_v4u32>;
+defm MVE_VQSUB_qr_s8 : MVE_VQSUB_qr_m<MVE_v16s8, ssubsat>;
+defm MVE_VQSUB_qr_s16 : MVE_VQSUB_qr_m<MVE_v8s16, ssubsat>;
+defm MVE_VQSUB_qr_s32 : MVE_VQSUB_qr_m<MVE_v4s32, ssubsat>;
+defm MVE_VQSUB_qr_u8 : MVE_VQSUB_qr_m<MVE_v16u8, usubsat>;
+defm MVE_VQSUB_qr_u16 : MVE_VQSUB_qr_m<MVE_v8u16, usubsat>;
+defm MVE_VQSUB_qr_u32 : MVE_VQSUB_qr_m<MVE_v4u32, usubsat>;
class MVE_VQDMULL_qr<string iname, string suffix, bit size,
bit T, string cstr="", list<dag> pattern=[]>
@@ -5206,19 +5315,25 @@ defm MVE_VHSUB_qr_u8 : MVE_VHSUB_qr_m<MVE_v16u8>;
defm MVE_VHSUB_qr_u16 : MVE_VHSUB_qr_m<MVE_v8u16>;
defm MVE_VHSUB_qr_u32 : MVE_VHSUB_qr_m<MVE_v4u32>;
+multiclass MVE_VADDSUB_qr_f<string iname, MVEVectorVTInfo VTI, bit subtract,
+ SDNode Op, Intrinsic PredInt> {
+ def "" : MVE_VxADDSUB_qr<iname, VTI.Suffix, VTI.Size{0}, 0b11, subtract>;
+ defm : MVE_TwoOpPatternDup<VTI, Op, PredInt, (? ),
+ !cast<Instruction>(NAME)>;
+}
+
let Predicates = [HasMVEFloat] in {
- def MVE_VADD_qr_f32 : MVE_VxADDSUB_qr<"vadd", "f32", 0b0, 0b11, 0b0>;
- def MVE_VADD_qr_f16 : MVE_VxADDSUB_qr<"vadd", "f16", 0b1, 0b11, 0b0>;
+ defm MVE_VADD_qr_f32 : MVE_VADDSUB_qr_f<"vadd", MVE_v4f32, 0b0, fadd,
+ int_arm_mve_add_predicated>;
+ defm MVE_VADD_qr_f16 : MVE_VADDSUB_qr_f<"vadd", MVE_v8f16, 0b0, fadd,
+ int_arm_mve_add_predicated>;
- def MVE_VSUB_qr_f32 : MVE_VxADDSUB_qr<"vsub", "f32", 0b0, 0b11, 0b1>;
- def MVE_VSUB_qr_f16 : MVE_VxADDSUB_qr<"vsub", "f16", 0b1, 0b11, 0b1>;
+ defm MVE_VSUB_qr_f32 : MVE_VADDSUB_qr_f<"vsub", MVE_v4f32, 0b1, fsub,
+ int_arm_mve_sub_predicated>;
+ defm MVE_VSUB_qr_f16 : MVE_VADDSUB_qr_f<"vsub", MVE_v8f16, 0b1, fsub,
+ int_arm_mve_sub_predicated>;
}
-defm : MVE_vec_scalar_fp_pat_m<fadd, int_arm_mve_add_predicated,
- MVE_VADD_qr_f16, MVE_VADD_qr_f32>;
-defm : MVE_vec_scalar_fp_pat_m<fsub, int_arm_mve_sub_predicated,
- MVE_VSUB_qr_f16, MVE_VSUB_qr_f32>;
-
class MVE_VxSHL_qr<string iname, string suffix, bit U, bits<2> size,
bit bit_7, bit bit_17, list<dag> pattern=[]>
: MVE_qDest_single_rSrc<iname, suffix, pattern> {
@@ -5346,8 +5461,10 @@ class MVE_VMUL_qr_int<string iname, string suffix, bits<2> size>
multiclass MVE_VMUL_qr_int_m<MVEVectorVTInfo VTI> {
def "" : MVE_VMUL_qr_int<"vmul", VTI.Suffix, VTI.Size>;
- defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI,
- mul, int_arm_mve_mul_predicated>;
+ let Predicates = [HasMVEInt] in {
+ defm : MVE_TwoOpPatternDup<VTI, mul, int_arm_mve_mul_predicated, (? ),
+ !cast<Instruction>(NAME), ARMimmOneV>;
+ }
}
defm MVE_VMUL_qr_i8 : MVE_VMUL_qr_int_m<MVE_v16i8>;
@@ -5364,21 +5481,25 @@ class MVE_VxxMUL_qr<string iname, string suffix,
let Inst{12} = 0b0;
let Inst{8} = 0b0;
let Inst{5} = 0b1;
+ let validForTailPredication = 1;
}
multiclass MVE_VxxMUL_qr_m<string iname, MVEVectorVTInfo VTI, bit bit_28,
- Intrinsic int_unpred, Intrinsic int_pred> {
+ PatFrag Op, Intrinsic int_unpred, Intrinsic int_pred> {
def "" : MVE_VxxMUL_qr<iname, VTI.Suffix, bit_28, VTI.Size>;
- defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI,
- int_unpred, int_pred>;
+
+ let Predicates = [HasMVEInt] in {
+ defm : MVE_TwoOpPatternDup<VTI, Op, int_pred, (? ), !cast<Instruction>(NAME)>;
+ }
+ defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI, int_unpred, int_pred>;
}
multiclass MVE_VQDMULH_qr_m<MVEVectorVTInfo VTI> :
- MVE_VxxMUL_qr_m<"vqdmulh", VTI, 0b0,
+ MVE_VxxMUL_qr_m<"vqdmulh", VTI, 0b0, MVEvqdmulh,
int_arm_mve_vqdmulh, int_arm_mve_qdmulh_predicated>;
multiclass MVE_VQRDMULH_qr_m<MVEVectorVTInfo VTI> :
- MVE_VxxMUL_qr_m<"vqrdmulh", VTI, 0b1,
+ MVE_VxxMUL_qr_m<"vqrdmulh", VTI, 0b1, null_frag,
int_arm_mve_vqrdmulh, int_arm_mve_qrdmulh_predicated>;
defm MVE_VQDMULH_qr_s8 : MVE_VQDMULH_qr_m<MVE_v16s8>;
@@ -5389,13 +5510,17 @@ defm MVE_VQRDMULH_qr_s8 : MVE_VQRDMULH_qr_m<MVE_v16s8>;
defm MVE_VQRDMULH_qr_s16 : MVE_VQRDMULH_qr_m<MVE_v8s16>;
defm MVE_VQRDMULH_qr_s32 : MVE_VQRDMULH_qr_m<MVE_v4s32>;
-let Predicates = [HasMVEFloat], validForTailPredication = 1 in {
- def MVE_VMUL_qr_f16 : MVE_VxxMUL_qr<"vmul", "f16", 0b1, 0b11>;
- def MVE_VMUL_qr_f32 : MVE_VxxMUL_qr<"vmul", "f32", 0b0, 0b11>;
+multiclass MVE_VxxMUL_qr_f_m<MVEVectorVTInfo VTI> {
+ let validForTailPredication = 1 in
+ def "" : MVE_VxxMUL_qr<"vmul", VTI.Suffix, VTI.Size{0}, 0b11>;
+ defm : MVE_TwoOpPatternDup<VTI, fmul, int_arm_mve_mul_predicated, (? ),
+ !cast<Instruction>(NAME)>;
}
-defm : MVE_vec_scalar_fp_pat_m<fmul, int_arm_mve_mul_predicated,
- MVE_VMUL_qr_f16, MVE_VMUL_qr_f32>;
+let Predicates = [HasMVEFloat] in {
+ defm MVE_VMUL_qr_f16 : MVE_VxxMUL_qr_f_m<MVE_v8f16>;
+ defm MVE_VMUL_qr_f32 : MVE_VxxMUL_qr_f_m<MVE_v4f32>;
+}
class MVE_VFMAMLA_qr<string iname, string suffix,
bit bit_28, bits<2> bits_21_20, bit S,
@@ -5470,6 +5595,10 @@ multiclass MVE_VFMA_qr_multi<string iname, MVEVectorVTInfo VTI,
if scalar_addend then {
def : Pat<(VTI.Vec (fma v1, v2, vs)),
(VTI.Vec (Inst v1, v2, is))>;
+ def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred),
+ (VTI.Vec (fma v1, v2, vs)),
+ v1)),
+ (VTI.Vec (Inst v1, v2, is, ARMVCCThen, $pred))>;
def : Pat<(VTI.Vec (pred_int v1, v2, vs, pred)),
(VTI.Vec (Inst v1, v2, is, ARMVCCThen, pred))>;
} else {
@@ -5477,6 +5606,14 @@ multiclass MVE_VFMA_qr_multi<string iname, MVEVectorVTInfo VTI,
(VTI.Vec (Inst v2, v1, is))>;
def : Pat<(VTI.Vec (fma vs, v1, v2)),
(VTI.Vec (Inst v2, v1, is))>;
+ def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred),
+ (VTI.Vec (fma vs, v2, v1)),
+ v1)),
+ (VTI.Vec (Inst v1, v2, is, ARMVCCThen, $pred))>;
+ def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred),
+ (VTI.Vec (fma v2, vs, v1)),
+ v1)),
+ (VTI.Vec (Inst v1, v2, is, ARMVCCThen, $pred))>;
def : Pat<(VTI.Vec (pred_int v1, vs, v2, pred)),
(VTI.Vec (Inst v2, v1, is, ARMVCCThen, pred))>;
def : Pat<(VTI.Vec (pred_int vs, v1, v2, pred)),
@@ -5605,7 +5742,7 @@ def MVE_VDWDUPu8 : MVE_VxWDUP<"vdwdup", "u8", 0b00, 0b1>;
def MVE_VDWDUPu16 : MVE_VxWDUP<"vdwdup", "u16", 0b01, 0b1>;
def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>;
-let hasSideEffects = 1 in
+let isReMaterializable = 1 in
class MVE_VCTPInst<string suffix, bits<2> size, list<dag> pattern=[]>
: MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix,
"$Rn", vpred_n, "", pattern> {
@@ -5629,7 +5766,8 @@ multiclass MVE_VCTP<MVEVectorVTInfo VTI, Intrinsic intr> {
defvar Inst = !cast<Instruction>(NAME);
let Predicates = [HasMVEInt] in {
- def : Pat<(intr rGPR:$Rn), (VTI.Pred (Inst rGPR:$Rn))>;
+ def : Pat<(intr rGPR:$Rn),
+ (VTI.Pred (Inst rGPR:$Rn))>;
def : Pat<(and (intr rGPR:$Rn), (VTI.Pred VCCR:$mask)),
(VTI.Pred (Inst rGPR:$Rn, ARMVCCThen, VCCR:$mask))>;
}
@@ -5707,6 +5845,41 @@ def MVE_VMOV_rr_q : MVE_VMOV_64bit<(outs rGPR:$Rt, rGPR:$Rt2), (ins MQPR:$Qd),
let AsmMatchConverter = "cvtMVEVMOVQtoDReg";
}
+let Predicates = [HasMVEInt] in {
+ // Double lane moves. There are a number of patterns here. We know that the
+ // insertelt's will be in descending order by index, and need to match the 5
+ // patterns that might contain 2-0 or 3-1 pairs. These are:
+ // 3 2 1 0 -> vmovqrr 31; vmovqrr 20
+ // 3 2 1 -> vmovqrr 31; vmov 2
+ // 3 1 -> vmovqrr 31
+ // 2 1 0 -> vmovqrr 20; vmov 1
+ // 2 0 -> vmovqrr 20
+ // The other potential patterns will be handled by single lane inserts.
+ def : Pat<(insertelt (insertelt (insertelt (insertelt (v4i32 MQPR:$src1),
+ rGPR:$srcA, (i32 0)),
+ rGPR:$srcB, (i32 1)),
+ rGPR:$srcC, (i32 2)),
+ rGPR:$srcD, (i32 3)),
+ (MVE_VMOV_q_rr (MVE_VMOV_q_rr MQPR:$src1, rGPR:$srcA, rGPR:$srcC, (i32 2), (i32 0)),
+ rGPR:$srcB, rGPR:$srcD, (i32 3), (i32 1))>;
+ def : Pat<(insertelt (insertelt (insertelt (v4i32 MQPR:$src1),
+ rGPR:$srcB, (i32 1)),
+ rGPR:$srcC, (i32 2)),
+ rGPR:$srcD, (i32 3)),
+ (MVE_VMOV_q_rr (MVE_VMOV_to_lane_32 MQPR:$src1, rGPR:$srcC, (i32 2)),
+ rGPR:$srcB, rGPR:$srcD, (i32 3), (i32 1))>;
+ def : Pat<(insertelt (insertelt (v4i32 MQPR:$src1), rGPR:$srcA, (i32 1)), rGPR:$srcB, (i32 3)),
+ (MVE_VMOV_q_rr MQPR:$src1, rGPR:$srcA, rGPR:$srcB, (i32 3), (i32 1))>;
+ def : Pat<(insertelt (insertelt (insertelt (v4i32 MQPR:$src1),
+ rGPR:$srcB, (i32 0)),
+ rGPR:$srcC, (i32 1)),
+ rGPR:$srcD, (i32 2)),
+ (MVE_VMOV_q_rr (MVE_VMOV_to_lane_32 MQPR:$src1, rGPR:$srcC, (i32 1)),
+ rGPR:$srcB, rGPR:$srcD, (i32 2), (i32 0))>;
+ def : Pat<(insertelt (insertelt (v4i32 MQPR:$src1), rGPR:$srcA, (i32 0)), rGPR:$srcB, (i32 2)),
+ (MVE_VMOV_q_rr MQPR:$src1, rGPR:$srcA, rGPR:$srcB, (i32 2), (i32 0))>;
+}
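+
+The comment above enumerates the qualifying insertelt chains by hand; the same pairing rule can be stated compactly. The sketch below is purely illustrative (the function name and the returned strings are made up, and it only mirrors the lane-grouping logic, not the actual pattern selection):

```cpp
// Illustrative sketch of the {3,1}/{2,0} lane pairing described above; not LLVM code.
#include <set>
#include <string>
#include <vector>

std::vector<std::string> planLaneInserts(const std::set<int> &lanes) {
  std::vector<std::string> ops;
  const bool pair31 = lanes.count(3) && lanes.count(1);
  const bool pair20 = lanes.count(2) && lanes.count(0);
  if (pair31) ops.push_back("vmov q, rr (lanes 3,1)");
  if (pair20) ops.push_back("vmov q, rr (lanes 2,0)");
  for (int l : {3, 2, 1, 0}) {
    const bool covered = (pair31 && (l == 3 || l == 1)) ||
                         (pair20 && (l == 2 || l == 0));
    if (lanes.count(l) && !covered)
      ops.push_back("vmov.32 single lane " + std::to_string(l));
  }
  return ops;
}
```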
+
// end of coproc mov
// start of MVE interleaving load/store
@@ -5735,6 +5908,7 @@ class MVE_vldst24_base<bit writeback, bit fourregs, bits<2> stage, bits<2> size,
let mayLoad = load;
let mayStore = !eq(load,0);
let hasSideEffects = 0;
+ let validForTailPredication = load;
}
// A parameter class used to encapsulate all the ways the writeback
@@ -6344,6 +6518,7 @@ class MVE_VPT<string suffix, bits<2> size, dag iops, string asm, list<dag> patte
let Inst{4} = 0b0;
let Defs = [VPR];
+ let validForTailPredication=1;
}
class MVE_VPTt1<string suffix, bits<2> size, dag iops>
@@ -6456,6 +6631,7 @@ class MVE_VPTf<string suffix, bit size, dag iops, string asm, list<dag> pattern=
let Defs = [VPR];
let Predicates = [HasMVEFloat];
+ let validForTailPredication=1;
}
class MVE_VPTft1<string suffix, bit size>
@@ -6583,13 +6759,6 @@ let Predicates = [HasMVEInt] in {
(v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred))>;
def : Pat<(v4i32 (anyext (v4i1 VCCR:$pred))),
(v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred))>;
-
- def : Pat<(v16i1 (trunc (v16i8 MQPR:$v1))),
- (v16i1 (MVE_VCMPi32r (v16i8 MQPR:$v1), ZR, ARMCCne))>;
- def : Pat<(v8i1 (trunc (v8i16 MQPR:$v1))),
- (v8i1 (MVE_VCMPi32r (v8i16 MQPR:$v1), ZR, ARMCCne))>;
- def : Pat<(v4i1 (trunc (v4i32 MQPR:$v1))),
- (v4i1 (MVE_VCMPi32r (v4i32 MQPR:$v1), ZR, ARMCCne))>;
}
let Predicates = [HasMVEFloat] in {
@@ -6938,7 +7107,7 @@ class MVE_vector_load_typed<ValueType Ty, Instruction RegImmInst,
class MVE_vector_maskedload_typed<ValueType Ty, Instruction RegImmInst,
PatFrag LoadKind, int shift>
- : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr, VCCR:$pred, (Ty NEONimmAllZerosV))),
+ : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr, VCCR:$pred, (Ty (ARMvmovImm (i32 0))))),
(Ty (RegImmInst t2addrmode_imm7<shift>:$addr, ARMVCCThen, VCCR:$pred))>;
multiclass MVE_vector_load<Instruction RegImmInst, PatFrag LoadKind,
@@ -7105,11 +7274,11 @@ multiclass MVEExtLoadStore<Instruction LoadSInst, Instruction LoadUInst, string
(VT (LoadUInst taddrmode_imm7<Shift>:$addr))>;
// Masked ext loads
- def : Pat<(VT (!cast<PatFrag>("aligned_extmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT NEONimmAllZerosV))),
+ def : Pat<(VT (!cast<PatFrag>("aligned_extmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT (ARMvmovImm (i32 0))))),
(VT (LoadUInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred))>;
- def : Pat<(VT (!cast<PatFrag>("aligned_sextmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT NEONimmAllZerosV))),
+ def : Pat<(VT (!cast<PatFrag>("aligned_sextmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT (ARMvmovImm (i32 0))))),
(VT (LoadSInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred))>;
- def : Pat<(VT (!cast<PatFrag>("aligned_zextmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT NEONimmAllZerosV))),
+ def : Pat<(VT (!cast<PatFrag>("aligned_zextmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT (ARMvmovImm (i32 0))))),
(VT (LoadUInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred))>;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td
index 1b3f6075c0e9..a8c0d05d91c4 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -509,7 +509,7 @@ def NEONvqrshrnsuImm : SDNode<"ARMISD::VQRSHRNsuIMM", SDTARMVSHXIMM>;
def NEONvsliImm : SDNode<"ARMISD::VSLIIMM", SDTARMVSHINSIMM>;
def NEONvsriImm : SDNode<"ARMISD::VSRIIMM", SDTARMVSHINSIMM>;
-def NEONvbsl : SDNode<"ARMISD::VBSL",
+def NEONvbsp : SDNode<"ARMISD::VBSP",
SDTypeProfile<1, 3, [SDTCisVec<0>,
SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
@@ -534,20 +534,6 @@ def NEONvtbl1 : SDNode<"ARMISD::VTBL1", SDTARMVTBL1>;
def NEONvtbl2 : SDNode<"ARMISD::VTBL2", SDTARMVTBL2>;
-def NEONimmAllZerosV: PatLeaf<(ARMvmovImm (i32 timm)), [{
- ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
- unsigned EltBits = 0;
- uint64_t EltVal = ARM_AM::decodeVMOVModImm(ConstVal->getZExtValue(), EltBits);
- return (EltBits == 32 && EltVal == 0);
-}]>;
-
-def NEONimmAllOnesV: PatLeaf<(ARMvmovImm (i32 timm)), [{
- ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
- unsigned EltBits = 0;
- uint64_t EltVal = ARM_AM::decodeVMOVModImm(ConstVal->getZExtValue(), EltBits);
- return (EltBits == 8 && EltVal == 0xff);
-}]>;
-
//===----------------------------------------------------------------------===//
// NEON load / store instructions
//===----------------------------------------------------------------------===//
@@ -4211,10 +4197,10 @@ def VADDhq : N3VQ<0, 0, 0b01, 0b1101, 0, IIC_VBINQ, "vadd", "f16",
defm VADDLs : N3VLExt_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
"vaddl", "s", add, sext, 1>;
defm VADDLu : N3VLExt_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
- "vaddl", "u", add, zext, 1>;
+ "vaddl", "u", add, zanyext, 1>;
// VADDW : Vector Add Wide (Q = Q + D)
defm VADDWs : N3VW_QHS<0,1,0b0001,0, "vaddw", "s", add, sext, 0>;
-defm VADDWu : N3VW_QHS<1,1,0b0001,0, "vaddw", "u", add, zext, 0>;
+defm VADDWu : N3VW_QHS<1,1,0b0001,0, "vaddw", "u", add, zanyext, 0>;
// VHADD : Vector Halving Add
defm VHADDs : N3VInt_QHS<0, 0, 0b0000, 0, N3RegFrm,
IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
@@ -4526,9 +4512,9 @@ let Predicates = [HasNEON, HasV8_1a] in {
(SubReg_i16_lane imm:$lane)))>;
def : Pat<(v4i32 (saddsat
(v4i32 QPR:$src1),
- (v4i32 (int_arm_neon_vqrdmulh
+ (v4i32 (int_arm_neon_vqrdmulh
(v4i32 QPR:$src2),
- (v4i32 (ARMvduplane (v4i32 QPR:$src3),
+ (v4i32 (ARMvduplane (v4i32 QPR:$src3),
imm:$lane)))))),
(v4i32 (VQRDMLAHslv4i32 (v4i32 QPR:$src1),
(v4i32 QPR:$src2),
@@ -4579,17 +4565,17 @@ let Predicates = [HasNEON, HasV8_1a] in {
(v2i32 DPR:$Vn),
(v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm),
imm:$lane)))))),
- (v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm,
+ (v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm,
imm:$lane))>;
def : Pat<(v8i16 (ssubsat
(v8i16 QPR:$src1),
(v8i16 (int_arm_neon_vqrdmulh
(v8i16 QPR:$src2),
- (v8i16 (ARMvduplane (v8i16 QPR:$src3),
+ (v8i16 (ARMvduplane (v8i16 QPR:$src3),
imm:$lane)))))),
(v8i16 (VQRDMLSHslv8i16 (v8i16 QPR:$src1),
(v8i16 QPR:$src2),
- (v4i16 (EXTRACT_SUBREG
+ (v4i16 (EXTRACT_SUBREG
QPR:$src3,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
@@ -4601,7 +4587,7 @@ let Predicates = [HasNEON, HasV8_1a] in {
imm:$lane)))))),
(v4i32 (VQRDMLSHslv4i32 (v4i32 QPR:$src1),
(v4i32 QPR:$src2),
- (v2i32 (EXTRACT_SUBREG
+ (v2i32 (EXTRACT_SUBREG
QPR:$src3,
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
@@ -5059,10 +5045,10 @@ def VSUBhq : N3VQ<0, 0, 0b11, 0b1101, 0, IIC_VBINQ, "vsub", "f16",
defm VSUBLs : N3VLExt_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
"vsubl", "s", sub, sext, 0>;
defm VSUBLu : N3VLExt_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
- "vsubl", "u", sub, zext, 0>;
+ "vsubl", "u", sub, zanyext, 0>;
// VSUBW : Vector Subtract Wide (Q = Q - D)
defm VSUBWs : N3VW_QHS<0,1,0b0011,0, "vsubw", "s", sub, sext, 0>;
-defm VSUBWu : N3VW_QHS<1,1,0b0011,0, "vsubw", "u", sub, zext, 0>;
+defm VSUBWu : N3VW_QHS<1,1,0b0011,0, "vsubw", "u", sub, zanyext, 0>;
// VHSUB : Vector Halving Subtract
defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, N3RegFrm,
IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
@@ -5273,9 +5259,9 @@ def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vm",
// Vector Bitwise Operations.
def vnotd : PatFrag<(ops node:$in),
- (xor node:$in, (bitconvert (v8i8 NEONimmAllOnesV)))>;
+ (xor node:$in, ARMimmAllOnesD)>;
def vnotq : PatFrag<(ops node:$in),
- (xor node:$in, (bitconvert (v16i8 NEONimmAllOnesV)))>;
+ (xor node:$in, ARMimmAllOnesV)>;
// VAND : Vector Bitwise AND
@@ -5442,74 +5428,86 @@ def : Pat<(v2i32 (vnotd DPR:$src)), (VMVNd DPR:$src)>;
def : Pat<(v4i32 (vnotq QPR:$src)), (VMVNq QPR:$src)>;
}
-// VBSL : Vector Bitwise Select
-def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
- (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
- N3RegFrm, IIC_VCNTiD,
- "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
- [(set DPR:$Vd,
- (v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>;
+// The TwoAddress pass will not go looking for equivalent operations
+// with different register constraints; it just inserts copies.
+// That is why the VBSP pseudo is implemented. It is expanded later into
+// VBIT/VBIF/VBSL taking into account register constraints to avoid copies.
+def VBSPd
+ : PseudoNeonI<(outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
+ IIC_VBINiD, "",
+ [(set DPR:$Vd,
+ (v2i32 (NEONvbsp DPR:$src1, DPR:$Vn, DPR:$Vm)))]>;
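+
+All three NEON encodings (VBSL, VBIT, VBIF) compute essentially the same bitwise select, differing in which operand role is tied to the destination; that is what lets the VBSP pseudo be expanded to whichever form avoids a copy. A minimal scalar sketch of the shared operation (illustrative only, made-up names):

```cpp
// Illustrative scalar model of the bitwise select; not LLVM code.
#include <cstdint>

// Result takes bits from 'ifSet' where the mask bit is 1 and from 'ifClear'
// where it is 0; VBSL/VBIT/VBIF assign these roles to different tied operands.
uint32_t bitwiseSelect(uint32_t mask, uint32_t ifSet, uint32_t ifClear) {
  return (ifSet & mask) | (ifClear & ~mask);
}
```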
let Predicates = [HasNEON] in {
def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 DPR:$src1),
(v8i8 DPR:$Vn), (v8i8 DPR:$Vm))),
- (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>;
+ (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>;
def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 DPR:$src1),
(v4i16 DPR:$Vn), (v4i16 DPR:$Vm))),
- (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>;
+ (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>;
def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 DPR:$src1),
(v2i32 DPR:$Vn), (v2i32 DPR:$Vm))),
- (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>;
+ (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>;
def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 DPR:$src1),
(v2f32 DPR:$Vn), (v2f32 DPR:$Vm))),
- (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>;
+ (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>;
def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 DPR:$src1),
(v1i64 DPR:$Vn), (v1i64 DPR:$Vm))),
- (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>;
+ (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>;
def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd),
(and DPR:$Vm, (vnotd DPR:$Vd)))),
- (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>;
+ (VBSPd DPR:$Vd, DPR:$Vn, DPR:$Vm)>;
def : Pat<(v1i64 (or (and DPR:$Vn, DPR:$Vd),
(and DPR:$Vm, (vnotd DPR:$Vd)))),
- (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>;
+ (VBSPd DPR:$Vd, DPR:$Vn, DPR:$Vm)>;
}
-def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
- (ins QPR:$src1, QPR:$Vn, QPR:$Vm),
- N3RegFrm, IIC_VCNTiQ,
- "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
- [(set QPR:$Vd,
- (v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>;
-
+def VBSPq
+ : PseudoNeonI<(outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm),
+ IIC_VBINiQ, "",
+ [(set QPR:$Vd,
+ (v4i32 (NEONvbsp QPR:$src1, QPR:$Vn, QPR:$Vm)))]>;
let Predicates = [HasNEON] in {
def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 QPR:$src1),
(v16i8 QPR:$Vn), (v16i8 QPR:$Vm))),
- (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>;
+ (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>;
def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 QPR:$src1),
(v8i16 QPR:$Vn), (v8i16 QPR:$Vm))),
- (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>;
+ (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>;
def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 QPR:$src1),
(v4i32 QPR:$Vn), (v4i32 QPR:$Vm))),
- (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>;
+ (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>;
def : Pat<(v4f32 (int_arm_neon_vbsl (v4f32 QPR:$src1),
(v4f32 QPR:$Vn), (v4f32 QPR:$Vm))),
- (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>;
+ (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>;
def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 QPR:$src1),
(v2i64 QPR:$Vn), (v2i64 QPR:$Vm))),
- (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>;
+ (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>;
def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd),
(and QPR:$Vm, (vnotq QPR:$Vd)))),
- (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>;
+ (VBSPq QPR:$Vd, QPR:$Vn, QPR:$Vm)>;
def : Pat<(v2i64 (or (and QPR:$Vn, QPR:$Vd),
(and QPR:$Vm, (vnotq QPR:$Vd)))),
- (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>;
+ (VBSPq QPR:$Vd, QPR:$Vn, QPR:$Vm)>;
}
+// VBSL : Vector Bitwise Select
+def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
+ (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
+ N3RegFrm, IIC_VBINiD,
+ "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+ []>;
+
+def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
+ (ins QPR:$src1, QPR:$Vn, QPR:$Vm),
+ N3RegFrm, IIC_VBINiQ,
+ "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+ []>;
+
// VBIF : Vector Bitwise Insert if False
// like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst",
-// FIXME: This instruction's encoding MAY NOT BE correct.
def VBIFd : N3VX<1, 0, 0b11, 0b0001, 0, 1,
(outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
N3RegFrm, IIC_VBINiD,
@@ -5523,7 +5521,6 @@ def VBIFq : N3VX<1, 0, 0b11, 0b0001, 1, 1,
// VBIT : Vector Bitwise Insert if True
// like VBSL but with: "vbit $dst, $src2, $src1", "$src3 = $dst",
-// FIXME: This instruction's encoding MAY NOT BE correct.
def VBITd : N3VX<1, 0, 0b10, 0b0001, 0, 1,
(outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
N3RegFrm, IIC_VBINiD,
@@ -5535,10 +5532,6 @@ def VBITq : N3VX<1, 0, 0b10, 0b0001, 1, 1,
"vbit", "$Vd, $Vn, $Vm", "$src1 = $Vd",
[]>;
-// VBIT/VBIF are not yet implemented. The TwoAddress pass will not go looking
-// for equivalent operations with different register constraints; it just
-// inserts copies.
-
// Vector Absolute Differences.
// VABD : Vector Absolute Difference
@@ -6047,9 +6040,9 @@ defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0,
// Vector Negate.
def vnegd : PatFrag<(ops node:$in),
- (sub (bitconvert (v2i32 NEONimmAllZerosV)), node:$in)>;
+ (sub ARMimmAllZerosD, node:$in)>;
def vnegq : PatFrag<(ops node:$in),
- (sub (bitconvert (v4i32 NEONimmAllZerosV)), node:$in)>;
+ (sub ARMimmAllZerosV, node:$in)>;
class VNEGD<bits<2> size, string OpcodeStr, string Dt, ValueType Ty>
: N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$Vd), (ins DPR:$Vm),
@@ -6263,11 +6256,11 @@ defm : NEONImmReplicateInstAlias<i32, VMOVv2i32, VMOVv4i32,
let AddedComplexity = 50, isAsCheapAsAMove = 1, isReMaterializable = 1 in {
def VMOVD0 : ARMPseudoExpand<(outs DPR:$Vd), (ins), 4, IIC_VMOVImm,
- [(set DPR:$Vd, (v2i32 NEONimmAllZerosV))],
+ [(set DPR:$Vd, (v2i32 ARMimmAllZerosD))],
(VMOVv2i32 DPR:$Vd, 0, (ops 14, zero_reg))>,
Requires<[HasZCZ]>;
def VMOVQ0 : ARMPseudoExpand<(outs QPR:$Vd), (ins), 4, IIC_VMOVImm,
- [(set QPR:$Vd, (v4i32 NEONimmAllZerosV))],
+ [(set QPR:$Vd, (v4i32 ARMimmAllZerosV))],
(VMOVv4i32 QPR:$Vd, 0, (ops 14, zero_reg))>,
Requires<[HasZCZ]>;
}
@@ -7953,7 +7946,7 @@ let Predicates = [HasNEON,IsLE] in {
(VLD1LNd16 addrmode6:$addr,
(f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
}
-// The following patterns are basically a copy of the patterns above,
+// The following patterns are basically a copy of the patterns above,
// however with an additional VREV16d instruction to convert data
// loaded by VLD1LN into proper vector format in big endian mode.
let Predicates = [HasNEON,IsBE] in {
@@ -9086,11 +9079,11 @@ multiclass BF16VDOTI<bit Q, RegisterClass RegTy, string opc, ValueType AccumTy,
(!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>;
}
-def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v8i8>;
-def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v16i8>;
+def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v4bf16>;
+def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v8bf16>;
-defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v8i8, (v2f32 DPR_VFP2:$Vm)>;
-defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v16i8, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
+defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v4bf16, (v2f32 DPR_VFP2:$Vm)>;
+defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v8bf16, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
class BF16MM<bit Q, RegisterClass RegTy,
string opc>
@@ -9098,8 +9091,8 @@ class BF16MM<bit Q, RegisterClass RegTy,
(outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm),
N3RegFrm, IIC_VDOTPROD, "", "",
[(set (v4f32 QPR:$dst), (int_arm_neon_bfmmla (v4f32 QPR:$Vd),
- (v16i8 QPR:$Vn),
- (v16i8 QPR:$Vm)))]> {
+ (v8bf16 QPR:$Vn),
+ (v8bf16 QPR:$Vm)))]> {
let Constraints = "$dst = $Vd";
let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm");
let DecoderNamespace = "VFPV8";
@@ -9113,8 +9106,8 @@ class VBF16MALQ<bit T, string suffix, SDPatternOperator OpNode>
NoItinerary, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm", "",
[(set (v4f32 QPR:$dst),
(OpNode (v4f32 QPR:$Vd),
- (v16i8 QPR:$Vn),
- (v16i8 QPR:$Vm)))]> {
+ (v8bf16 QPR:$Vn),
+ (v8bf16 QPR:$Vm)))]> {
let Constraints = "$dst = $Vd";
let DecoderNamespace = "VFPV8";
}
@@ -9135,9 +9128,9 @@ multiclass VBF16MALQI<bit T, string suffix, SDPatternOperator OpNode> {
def : Pat<
(v4f32 (OpNode (v4f32 QPR:$Vd),
- (v16i8 QPR:$Vn),
- (v16i8 (bitconvert (v8bf16 (ARMvduplane (v8bf16 QPR:$Vm),
- VectorIndex16:$lane)))))),
+ (v8bf16 QPR:$Vn),
+ (v8bf16 (ARMvduplane (v8bf16 QPR:$Vm),
+ VectorIndex16:$lane)))),
(!cast<Instruction>(NAME) QPR:$Vd,
QPR:$Vn,
(EXTRACT_SUBREG QPR:$Vm,
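A quick conceptual sketch, not part of the patch: the VBSP pseudo and the VBSL/VBIF/VBIT instructions above all realize the same bitwise select and differ only in which operand is tied to the destination. The function name below is illustrative; it just restates the (or (and Vn, Vd), (and Vm, (vnot Vd))) patterns in plain C++.

#include <cstdint>

// Per-bit select: take bits of n where the mask bit is set, bits of m where it
// is clear. VBSL ties the mask to $Vd; VBIT/VBIF tie a data operand instead,
// but the value computed is the same.
uint32_t bitwise_select(uint32_t mask, uint32_t n, uint32_t m) {
  return (n & mask) | (m & ~mask);
}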
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td
index 7fae32117243..3a33dfeecdc9 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -548,14 +548,18 @@ let isCall = 1,
// Also used for Thumb2
def tBLXr : TI<(outs), (ins pred:$p, GPR:$func), IIC_Br,
- "blx${p}\t$func",
- [(ARMcall GPR:$func)]>,
+ "blx${p}\t$func", []>,
Requires<[IsThumb, HasV5T]>,
T1Special<{1,1,1,?}>, Sched<[WriteBrL]> { // A6.2.3 & A8.6.24;
bits<4> func;
let Inst{6-3} = func;
let Inst{2-0} = 0b000;
}
+ def tBLXr_noip : ARMPseudoExpand<(outs), (ins pred:$p, GPRnoip:$func),
+ 2, IIC_Br, [], (tBLXr pred:$p, GPR:$func)>,
+ Requires<[IsThumb, HasV5T]>,
+ Sched<[WriteBrL]>;
+
// ARMv8-M Security Extensions
def tBLXNSr : TI<(outs), (ins pred:$p, GPRnopc:$func), IIC_Br,
@@ -586,6 +590,11 @@ let isCall = 1,
Requires<[IsThumb]>, Sched<[WriteBr]>;
}
+def : ARMPat<(ARMcall GPR:$func), (tBLXr $func)>,
+ Requires<[IsThumb, HasV5T, NoSLSBLRMitigation]>;
+def : ARMPat<(ARMcall GPRnoip:$func), (tBLXr_noip $func)>,
+ Requires<[IsThumb, HasV5T, SLSBLRMitigation]>;
+
let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
let isPredicable = 1 in
def tB : T1pI<(outs), (ins t_brtarget:$target), IIC_Br,
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td
index 7137e8ee66b8..5642cab32e7c 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -1724,7 +1724,7 @@ def t2STRH_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
// only.
// Ref: A8.6.193 STR (immediate, Thumb) Encoding T4
class T2IstT<bits<2> type, string opc, InstrItinClass ii>
- : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc,
+ : T2Ii8<(outs), (ins rGPR:$Rt, t2addrmode_imm8:$addr), ii, opc,
"\t$Rt, $addr", []>, Sched<[WriteST]> {
let Inst{31-27} = 0b11111;
let Inst{26-25} = 0b00;
@@ -2575,7 +2575,6 @@ def t2USADA8 : T2FourReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd),
Requires<[IsThumb2, HasDSP]>;
// Signed/Unsigned saturate.
-let hasSideEffects = 1 in
class T2SatI<dag iops, string opc, string asm>
: T2I<(outs rGPR:$Rd), iops, NoItinerary, opc, asm, []> {
bits<4> Rd;
@@ -2624,9 +2623,9 @@ def t2USAT16: T2SatI<(ins imm0_15:$sat_imm, rGPR:$Rn),
let Inst{4} = 0;
}
-def : T2Pat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm),
+def : T2Pat<(ARMssat GPRnopc:$Rn, imm0_31:$imm),
(t2SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
-def : T2Pat<(ARMusatnoshift GPRnopc:$Rn, imm0_31:$imm),
+def : T2Pat<(ARMusat GPRnopc:$Rn, imm0_31:$imm),
(t2USAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
def : T2Pat<(int_arm_ssat GPR:$a, imm1_32:$pos),
(t2SSAT imm1_32:$pos, GPR:$a, 0)>;
@@ -2636,6 +2635,23 @@ def : T2Pat<(int_arm_ssat16 GPR:$a, imm1_16:$pos),
(t2SSAT16 imm1_16:$pos, GPR:$a)>;
def : T2Pat<(int_arm_usat16 GPR:$a, imm0_15:$pos),
(t2USAT16 imm0_15:$pos, GPR:$a)>;
+def : T2Pat<(int_arm_ssat (shl GPRnopc:$a, imm0_31:$shft), imm1_32:$pos),
+ (t2SSAT imm1_32:$pos, GPRnopc:$a, imm0_31:$shft)>;
+def : T2Pat<(int_arm_ssat (sra GPRnopc:$a, asr_imm:$shft), imm1_32:$pos),
+ (t2SSAT imm1_32:$pos, GPRnopc:$a, asr_imm:$shft)>;
+def : T2Pat<(int_arm_usat (shl GPRnopc:$a, imm0_31:$shft), imm0_31:$pos),
+ (t2USAT imm0_31:$pos, GPRnopc:$a, imm0_31:$shft)>;
+def : T2Pat<(int_arm_usat (sra GPRnopc:$a, asr_imm:$shft), imm0_31:$pos),
+ (t2USAT imm0_31:$pos, GPRnopc:$a, asr_imm:$shft)>;
+def : T2Pat<(ARMssat (shl GPRnopc:$a, imm0_31:$shft), imm0_31:$pos),
+ (t2SSAT imm0_31:$pos, GPRnopc:$a, imm0_31:$shft)>;
+def : T2Pat<(ARMssat (sra GPRnopc:$Rn, asr_imm:$shft), imm0_31:$pos),
+ (t2SSAT imm0_31:$pos, GPRnopc:$Rn, asr_imm:$shft)>;
+def : T2Pat<(ARMusat (shl GPRnopc:$a, imm0_31:$shft), imm0_31:$pos),
+ (t2USAT imm0_31:$pos, GPRnopc:$a, imm0_31:$shft)>;
+def : T2Pat<(ARMusat (sra GPRnopc:$Rn, asr_imm:$shft), imm0_31:$pos),
+ (t2USAT imm0_31:$pos, GPRnopc:$Rn, asr_imm:$shft)>;
+
//===----------------------------------------------------------------------===//
// Shift and rotate Instructions.
@@ -4919,6 +4935,15 @@ def : InstAlias<"pssbb", (t2DSB 0x4, 14, 0), 1>, Requires<[HasDB, IsThumb2]>;
// Armv8-R 'Data Full Barrier'
def : InstAlias<"dfb${p}", (t2DSB 0xc, pred:$p), 1>, Requires<[HasDFB]>;
+// SpeculationBarrierEndBB must only be used after an unconditional control
+// flow, i.e. after a terminator for which isBarrier is True.
+let hasSideEffects = 1, isCodeGenOnly = 1, isTerminator = 1, isBarrier = 1 in {
+ def t2SpeculationBarrierISBDSBEndBB
+ : PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>;
+ def t2SpeculationBarrierSBEndBB
+ : PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>;
+}
+
// Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional
// width specifier.
def : t2InstAlias<"ldr${p} $Rt, $addr",
@@ -5404,9 +5429,16 @@ def t2LE : t2LOL<(outs ), (ins lelabel_u11:$label), "le", "$label"> {
let isTerminator = 1;
}
+let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB] in {
+
+let usesCustomInserter = 1 in
def t2DoLoopStart :
- t2PseudoInst<(outs), (ins rGPR:$elts), 4, IIC_Br,
- [(int_set_loop_iterations rGPR:$elts)]>, Sched<[WriteBr]>;
+ t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts), 4, IIC_Br,
+ [(set GPRlr:$X, (int_start_loop_iterations rGPR:$elts))]>;
+
+let isTerminator = 1, hasSideEffects = 1 in
+def t2DoLoopStartTP :
+ t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts, rGPR:$count), 4, IIC_Br, []>;
let hasSideEffects = 0 in
def t2LoopDec :
@@ -5426,8 +5458,14 @@ def t2LoopEnd :
t2PseudoInst<(outs), (ins GPRlr:$elts, brtarget:$target),
8, IIC_Br, []>, Sched<[WriteBr]>;
+def t2LoopEndDec :
+ t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$elts, brtarget:$target),
+ 8, IIC_Br, []>, Sched<[WriteBr]>;
+
} // end isBranch, isTerminator, hasSideEffects
+}
+
} // end isNotDuplicable
class CS<string iname, bits<4> opcode, list<dag> pattern=[]>
@@ -5446,6 +5484,7 @@ class CS<string iname, bits<4> opcode, list<dag> pattern=[]>
let Inst{3-0} = Rm{3-0};
let Uses = [CPSR];
+ let hasSideEffects = 0;
}
def t2CSEL : CS<"csel", 0b1000>;
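For the saturation patterns added above, which fold a preceding shl/sra into the shift operand of t2SSAT/t2USAT, a rough C++ reference model of the value being computed may help. This is a sketch, not backend code; helper names are illustrative and the range handling is simplified.

#include <algorithm>
#include <cstdint>

// SSAT saturates to a signed pos-bit range (pos in 1..32), USAT to an unsigned
// pos-bit range (pos in 0..31). The shift is applied to the input first, which
// is what the (shl ...)/(sra ...) operands in the patterns express.
int32_t ssat_ref(int64_t v, unsigned pos) {
  int64_t lo = -(int64_t(1) << (pos - 1));
  int64_t hi = (int64_t(1) << (pos - 1)) - 1;
  return int32_t(std::clamp(v, lo, hi));
}

uint32_t usat_ref(int64_t v, unsigned pos) {
  return uint32_t(std::clamp(v, int64_t(0), (int64_t(1) << pos) - 1));
}

// e.g. ssat_ref(int64_t(x) << shft, pos) models (t2SSAT pos, x, shft) with LSL.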
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td
index 8a652c1d90f6..2be58d7a0e62 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -54,6 +54,16 @@ def vfp_f16imm : Operand<f16>,
let ParserMatchClass = FPImmOperand;
}
+def vfp_f32f16imm_xform : SDNodeXForm<fpimm, [{
+ APFloat InVal = N->getValueAPF();
+ uint32_t enc = ARM_AM::getFP32FP16Imm(InVal);
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
+ }]>;
+
+def vfp_f32f16imm : PatLeaf<(f32 fpimm), [{
+ return ARM_AM::getFP32FP16Imm(N->getValueAPF()) != -1;
+ }], vfp_f32f16imm_xform>;
+
def vfp_f32imm_xform : SDNodeXForm<fpimm, [{
APFloat InVal = N->getValueAPF();
uint32_t enc = ARM_AM::getFP32Imm(InVal);
@@ -1551,6 +1561,8 @@ class AVConv1InsS_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
let Inst{5} = Sm{0};
let Inst{15-12} = Sd{4-1};
let Inst{22} = Sd{0};
+
+ let hasSideEffects = 0;
}
class AVConv1IsH_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
@@ -2252,16 +2264,6 @@ def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)),
def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin))),
(VFMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
-// (fma x, (fneg y), z) -> (vfms z, x, y)
-def : Pat<(f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin)),
- (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
- Requires<[HasVFP4,HasDPVFP]>;
-def : Pat<(f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin)),
- (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
- Requires<[HasVFP4]>;
-def : Pat<(f16 (fma (f16 HPR:$Sn), (fneg (f16 HPR:$Sm)), (f16 HPR:$Sdin))),
- (VFMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
- Requires<[HasFullFP16]>;
def VFNMAD : ADbI<0b11101, 0b01, 1, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
@@ -2379,16 +2381,6 @@ def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))),
def : Pat<(fneg (f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin)))),
(VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
-// (fneg (fma x, (fneg y), z) -> (vfnms z, x, y)
-def : Pat<(fneg (f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin))),
- (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
- Requires<[HasVFP4,HasDPVFP]>;
-def : Pat<(fneg (f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin))),
- (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
- Requires<[HasVFP4]>;
-def : Pat<(fneg (f16 (fma (f16 HPR:$Sn), (fneg (f16 HPR:$Sm)), (f16 HPR:$Sdin)))),
- (VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
- Requires<[HasFullFP16]>;
//===----------------------------------------------------------------------===//
// FP Conditional moves.
@@ -2634,6 +2626,11 @@ def FCONSTH : VFPAI<(outs HPR:$Sd), (ins vfp_f16imm:$imm),
}
}
+def : Pat<(f32 (vfp_f32f16imm:$imm)),
+ (f32 (COPY_TO_REGCLASS (f16 (FCONSTH (vfp_f32f16imm_xform (f32 $imm)))), SPR))> {
+ let Predicates = [HasFullFP16];
+}
+
//===----------------------------------------------------------------------===//
// Assembler aliases.
//
@@ -2849,6 +2846,12 @@ let Predicates = [HasV8_1MMainline, HasMVEInt] in {
}
defm VSTR_P0 : vfp_vstrldr_sysreg<0b0,0b1101, "p0",
(outs), (ins VCCR:$P0)>;
+
+ let Defs = [VPR] in {
+ defm VLDR_VPR : vfp_vstrldr_sysreg<0b1,0b1100, "vpr">;
+ }
+ defm VLDR_P0 : vfp_vstrldr_sysreg<0b1,0b1101, "p0",
+ (outs VCCR:$P0), (ins)>;
}
let Uses = [FPSCR] in {
@@ -2860,11 +2863,3 @@ let Uses = [FPSCR] in {
defm VLDR_FPCXTS : vfp_vstrldr_sysreg<0b1,0b1111, "fpcxts">;
}
}
-
-let Predicates = [HasV8_1MMainline, HasMVEInt] in {
- let Defs = [VPR] in {
- defm VLDR_VPR : vfp_vstrldr_sysreg<0b1,0b1100, "vpr">;
- }
- defm VLDR_P0 : vfp_vstrldr_sysreg<0b1,0b1101, "p0",
- (outs VCCR:$P0), (ins)>;
-}
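On the removed (fma x, (fneg y), z) patterns above: that shape and the retained (fma (fneg x), y, z) shape evaluate to the same value z - x*y, which is what VFMS/VFNMS compute. The two lines below only illustrate that algebraic equivalence; the function names are mine and the diff itself does not show where the removed shape is handled instead.

#include <cmath>

// Both fma shapes compute d - n*m; negating either multiplicand flips the
// sign of the product in the same way.
float vfms_like_a(float n, float m, float d) { return std::fma(-n, m, d); }
float vfms_like_b(float n, float m, float d) { return std::fma(n, -m, d); }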
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
index c8a894fb11a8..09a94cc3a8e8 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -164,8 +164,6 @@ createARMInstructionSelector(const ARMBaseTargetMachine &TM,
}
}
-const unsigned zero_reg = 0;
-
#define GET_GLOBALISEL_IMPL
#include "ARMGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
index f3657155f47e..d9b60f4c4eba 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -88,7 +88,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
getActionDefinitionsBuilder({G_MUL, G_AND, G_OR, G_XOR})
.legalFor({s32})
- .minScalar(0, s32);
+ .clampScalar(0, s32, s32);
if (ST.hasNEON())
getActionDefinitionsBuilder({G_ADD, G_SUB})
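The ARMLegalizerInfo change above swaps minScalar(0, s32) for clampScalar(0, s32, s32). As a rough model of the width rule this expresses (this is not the GlobalISel API): clamping enforces both a lower and an upper bound on the scalar width, whereas minScalar only widened narrow types.

// Width rule expressed by clampScalar(0, s32, s32) in the change above.
unsigned clamp_width(unsigned Width, unsigned Min, unsigned Max) {
  if (Width < Min) return Min; // widen, as minScalar also did
  if (Width > Max) return Max; // additionally narrow wider types
  return Width;
}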
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index a84d23d3bb96..aa1fe4e4ffda 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -1268,6 +1268,7 @@ findIncDecAfter(MachineBasicBlock::iterator MBBI, Register Reg,
bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
// Thumb1 is already using updating loads/stores.
if (isThumb1) return false;
+ LLVM_DEBUG(dbgs() << "Attempting to merge update of: " << *MI);
const MachineOperand &BaseOP = MI->getOperand(0);
Register Base = BaseOP.getReg();
@@ -1319,8 +1320,10 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
return false;
}
}
- if (MergeInstr != MBB.end())
+ if (MergeInstr != MBB.end()) {
+ LLVM_DEBUG(dbgs() << " Erasing old increment: " << *MergeInstr);
MBB.erase(MergeInstr);
+ }
unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
@@ -1335,6 +1338,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
// Transfer memoperands.
MIB.setMemRefs(MI->memoperands());
+ LLVM_DEBUG(dbgs() << " Added new load/store: " << *MIB);
MBB.erase(MBBI);
return true;
}
@@ -1382,9 +1386,27 @@ static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
case ARM::t2LDRi8:
case ARM::t2LDRi12:
return ARM::t2LDR_POST;
+ case ARM::t2LDRBi8:
+ case ARM::t2LDRBi12:
+ return ARM::t2LDRB_POST;
+ case ARM::t2LDRSBi8:
+ case ARM::t2LDRSBi12:
+ return ARM::t2LDRSB_POST;
+ case ARM::t2LDRHi8:
+ case ARM::t2LDRHi12:
+ return ARM::t2LDRH_POST;
+ case ARM::t2LDRSHi8:
+ case ARM::t2LDRSHi12:
+ return ARM::t2LDRSH_POST;
case ARM::t2STRi8:
case ARM::t2STRi12:
return ARM::t2STR_POST;
+ case ARM::t2STRBi8:
+ case ARM::t2STRBi12:
+ return ARM::t2STRB_POST;
+ case ARM::t2STRHi8:
+ case ARM::t2STRHi12:
+ return ARM::t2STRH_POST;
case ARM::MVE_VLDRBS16:
return ARM::MVE_VLDRBS16_post;
@@ -1427,6 +1449,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
// Thumb1 doesn't have updating LDR/STR.
// FIXME: Use LDM/STM with single register instead.
if (isThumb1) return false;
+ LLVM_DEBUG(dbgs() << "Attempting to merge update of: " << *MI);
Register Base = getLoadStoreBaseOp(*MI).getReg();
bool BaseKill = getLoadStoreBaseOp(*MI).isKill();
@@ -1468,6 +1491,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
} else
return false;
}
+ LLVM_DEBUG(dbgs() << " Erasing old increment: " << *MergeInstr);
MBB.erase(MergeInstr);
ARM_AM::AddrOpc AddSub = Offset < 0 ? ARM_AM::sub : ARM_AM::add;
@@ -1479,39 +1503,54 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
// updating load/store-multiple instructions can be used with only one
// register.)
MachineOperand &MO = MI->getOperand(0);
- BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
- .addReg(Base, getDefRegState(true)) // WB base register
- .addReg(Base, getKillRegState(isLd ? BaseKill : false))
- .addImm(Pred).addReg(PredReg)
- .addReg(MO.getReg(), (isLd ? getDefRegState(true) :
- getKillRegState(MO.isKill())))
- .cloneMemRefs(*MI);
+ auto MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
+ .addReg(Base, getDefRegState(true)) // WB base register
+ .addReg(Base, getKillRegState(isLd ? BaseKill : false))
+ .addImm(Pred)
+ .addReg(PredReg)
+ .addReg(MO.getReg(), (isLd ? getDefRegState(true)
+ : getKillRegState(MO.isKill())))
+ .cloneMemRefs(*MI);
+ (void)MIB;
+ LLVM_DEBUG(dbgs() << " Added new instruction: " << *MIB);
} else if (isLd) {
if (isAM2) {
// LDR_PRE, LDR_POST
if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
- BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
- .addReg(Base, RegState::Define)
- .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg)
- .cloneMemRefs(*MI);
+ auto MIB =
+ BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
+ .addReg(Base, RegState::Define)
+ .addReg(Base)
+ .addImm(Offset)
+ .addImm(Pred)
+ .addReg(PredReg)
+ .cloneMemRefs(*MI);
+ (void)MIB;
+ LLVM_DEBUG(dbgs() << " Added new instruction: " << *MIB);
} else {
int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
- BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
- .addReg(Base, RegState::Define)
- .addReg(Base)
- .addReg(0)
- .addImm(Imm)
- .add(predOps(Pred, PredReg))
- .cloneMemRefs(*MI);
+ auto MIB =
+ BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
+ .addReg(Base, RegState::Define)
+ .addReg(Base)
+ .addReg(0)
+ .addImm(Imm)
+ .add(predOps(Pred, PredReg))
+ .cloneMemRefs(*MI);
+ (void)MIB;
+ LLVM_DEBUG(dbgs() << " Added new instruction: " << *MIB);
}
} else {
// t2LDR_PRE, t2LDR_POST
- BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
- .addReg(Base, RegState::Define)
- .addReg(Base)
- .addImm(Offset)
- .add(predOps(Pred, PredReg))
- .cloneMemRefs(*MI);
+ auto MIB =
+ BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
+ .addReg(Base, RegState::Define)
+ .addReg(Base)
+ .addImm(Offset)
+ .add(predOps(Pred, PredReg))
+ .cloneMemRefs(*MI);
+ (void)MIB;
+ LLVM_DEBUG(dbgs() << " Added new instruction: " << *MIB);
}
} else {
MachineOperand &MO = MI->getOperand(0);
@@ -1521,21 +1560,25 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
if (isAM2 && NewOpc == ARM::STR_POST_IMM) {
int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
// STR_PRE, STR_POST
- BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
- .addReg(MO.getReg(), getKillRegState(MO.isKill()))
- .addReg(Base)
- .addReg(0)
- .addImm(Imm)
- .add(predOps(Pred, PredReg))
- .cloneMemRefs(*MI);
+ auto MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
+ .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+ .addReg(Base)
+ .addReg(0)
+ .addImm(Imm)
+ .add(predOps(Pred, PredReg))
+ .cloneMemRefs(*MI);
+ (void)MIB;
+ LLVM_DEBUG(dbgs() << " Added new instruction: " << *MIB);
} else {
// t2STR_PRE, t2STR_POST
- BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
- .addReg(MO.getReg(), getKillRegState(MO.isKill()))
- .addReg(Base)
- .addImm(Offset)
- .add(predOps(Pred, PredReg))
- .cloneMemRefs(*MI);
+ auto MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
+ .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+ .addReg(Base)
+ .addImm(Offset)
+ .add(predOps(Pred, PredReg))
+ .cloneMemRefs(*MI);
+ (void)MIB;
+ LLVM_DEBUG(dbgs() << " Added new instruction: " << *MIB);
}
}
MBB.erase(MBBI);
@@ -1549,6 +1592,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
"Must have t2STRDi8 or t2LDRDi8");
if (MI.getOperand(3).getImm() != 0)
return false;
+ LLVM_DEBUG(dbgs() << "Attempting to merge update of: " << MI);
// Behaviour for writeback is undefined if base register is the same as one
// of the others.
@@ -1576,6 +1620,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
} else
return false;
}
+ LLVM_DEBUG(dbgs() << " Erasing old increment: " << *MergeInstr);
MBB.erase(MergeInstr);
DebugLoc DL = MI.getDebugLoc();
@@ -1597,6 +1642,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
MIB.add(MO);
MIB.cloneMemRefs(MI);
+ LLVM_DEBUG(dbgs() << " Added new load/store: " << *MIB);
MBB.erase(MBBI);
return true;
}
@@ -2539,11 +2585,169 @@ static int getBaseOperandIndex(MachineInstr &MI) {
case ARM::MVE_VSTRBU8:
case ARM::MVE_VSTRHU16:
case ARM::MVE_VSTRWU32:
+ case ARM::t2LDRHi8:
+ case ARM::t2LDRHi12:
+ case ARM::t2LDRSHi8:
+ case ARM::t2LDRSHi12:
+ case ARM::t2LDRBi8:
+ case ARM::t2LDRBi12:
+ case ARM::t2LDRSBi8:
+ case ARM::t2LDRSBi12:
+ case ARM::t2STRBi8:
+ case ARM::t2STRBi12:
+ case ARM::t2STRHi8:
+ case ARM::t2STRHi12:
return 1;
+ case ARM::MVE_VLDRBS16_post:
+ case ARM::MVE_VLDRBS32_post:
+ case ARM::MVE_VLDRBU16_post:
+ case ARM::MVE_VLDRBU32_post:
+ case ARM::MVE_VLDRHS32_post:
+ case ARM::MVE_VLDRHU32_post:
+ case ARM::MVE_VLDRBU8_post:
+ case ARM::MVE_VLDRHU16_post:
+ case ARM::MVE_VLDRWU32_post:
+ case ARM::MVE_VSTRB16_post:
+ case ARM::MVE_VSTRB32_post:
+ case ARM::MVE_VSTRH32_post:
+ case ARM::MVE_VSTRBU8_post:
+ case ARM::MVE_VSTRHU16_post:
+ case ARM::MVE_VSTRWU32_post:
+ case ARM::MVE_VLDRBS16_pre:
+ case ARM::MVE_VLDRBS32_pre:
+ case ARM::MVE_VLDRBU16_pre:
+ case ARM::MVE_VLDRBU32_pre:
+ case ARM::MVE_VLDRHS32_pre:
+ case ARM::MVE_VLDRHU32_pre:
+ case ARM::MVE_VLDRBU8_pre:
+ case ARM::MVE_VLDRHU16_pre:
+ case ARM::MVE_VLDRWU32_pre:
+ case ARM::MVE_VSTRB16_pre:
+ case ARM::MVE_VSTRB32_pre:
+ case ARM::MVE_VSTRH32_pre:
+ case ARM::MVE_VSTRBU8_pre:
+ case ARM::MVE_VSTRHU16_pre:
+ case ARM::MVE_VSTRWU32_pre:
+ return 2;
}
return -1;
}
+static bool isPostIndex(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case ARM::MVE_VLDRBS16_post:
+ case ARM::MVE_VLDRBS32_post:
+ case ARM::MVE_VLDRBU16_post:
+ case ARM::MVE_VLDRBU32_post:
+ case ARM::MVE_VLDRHS32_post:
+ case ARM::MVE_VLDRHU32_post:
+ case ARM::MVE_VLDRBU8_post:
+ case ARM::MVE_VLDRHU16_post:
+ case ARM::MVE_VLDRWU32_post:
+ case ARM::MVE_VSTRB16_post:
+ case ARM::MVE_VSTRB32_post:
+ case ARM::MVE_VSTRH32_post:
+ case ARM::MVE_VSTRBU8_post:
+ case ARM::MVE_VSTRHU16_post:
+ case ARM::MVE_VSTRWU32_post:
+ return true;
+ }
+ return false;
+}
+
+static bool isPreIndex(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case ARM::MVE_VLDRBS16_pre:
+ case ARM::MVE_VLDRBS32_pre:
+ case ARM::MVE_VLDRBU16_pre:
+ case ARM::MVE_VLDRBU32_pre:
+ case ARM::MVE_VLDRHS32_pre:
+ case ARM::MVE_VLDRHU32_pre:
+ case ARM::MVE_VLDRBU8_pre:
+ case ARM::MVE_VLDRHU16_pre:
+ case ARM::MVE_VLDRWU32_pre:
+ case ARM::MVE_VSTRB16_pre:
+ case ARM::MVE_VSTRB32_pre:
+ case ARM::MVE_VSTRH32_pre:
+ case ARM::MVE_VSTRBU8_pre:
+ case ARM::MVE_VSTRHU16_pre:
+ case ARM::MVE_VSTRWU32_pre:
+ return true;
+ }
+ return false;
+}
+
+// Given a memory access Opcode, check that the given Imm would be a valid Offset
+// for this instruction (same as isLegalAddressImm), or if the instruction
+// could be easily converted to one where that was valid. For example converting
+// t2LDRi12 to t2LDRi8 for negative offsets. Works in conjunction with
+// AdjustBaseAndOffset below.
+static bool isLegalOrConvertableAddressImm(unsigned Opcode, int Imm,
+ const TargetInstrInfo *TII,
+ int &CodesizeEstimate) {
+ if (isLegalAddressImm(Opcode, Imm, TII))
+ return true;
+
+ // We can convert AddrModeT2_i12 to AddrModeT2_i8.
+ const MCInstrDesc &Desc = TII->get(Opcode);
+ unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+ switch (AddrMode) {
+ case ARMII::AddrModeT2_i12:
+ CodesizeEstimate += 1;
+ return std::abs(Imm) < (((1 << 8) * 1) - 1);
+ }
+ return false;
+}
+
+// Given an MI, adjust its address BaseReg to use NewBaseReg and its address offset
+// by -Offset. This can either happen in-place or be a replacement as MI is
+// converted to another instruction type.
+static void AdjustBaseAndOffset(MachineInstr *MI, Register NewBaseReg,
+ int Offset, const TargetInstrInfo *TII) {
+ unsigned BaseOp = getBaseOperandIndex(*MI);
+ MI->getOperand(BaseOp).setReg(NewBaseReg);
+ int OldOffset = MI->getOperand(BaseOp + 1).getImm();
+ if (isLegalAddressImm(MI->getOpcode(), OldOffset - Offset, TII))
+ MI->getOperand(BaseOp + 1).setImm(OldOffset - Offset);
+ else {
+ unsigned ConvOpcode;
+ switch (MI->getOpcode()) {
+ case ARM::t2LDRHi12:
+ ConvOpcode = ARM::t2LDRHi8;
+ break;
+ case ARM::t2LDRSHi12:
+ ConvOpcode = ARM::t2LDRSHi8;
+ break;
+ case ARM::t2LDRBi12:
+ ConvOpcode = ARM::t2LDRBi8;
+ break;
+ case ARM::t2LDRSBi12:
+ ConvOpcode = ARM::t2LDRSBi8;
+ break;
+ case ARM::t2STRHi12:
+ ConvOpcode = ARM::t2STRHi8;
+ break;
+ case ARM::t2STRBi12:
+ ConvOpcode = ARM::t2STRBi8;
+ break;
+ default:
+ llvm_unreachable("Unhandled convertable opcode");
+ }
+ assert(isLegalAddressImm(ConvOpcode, OldOffset - Offset, TII) &&
+ "Illegal Address Immediate after convert!");
+
+ const MCInstrDesc &MCID = TII->get(ConvOpcode);
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID)
+ .add(MI->getOperand(0))
+ .add(MI->getOperand(1))
+ .addImm(OldOffset - Offset)
+ .add(MI->getOperand(3))
+ .add(MI->getOperand(4))
+ .cloneMemRefs(*MI);
+ MI->eraseFromParent();
+ }
+}
+
static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset,
Register NewReg,
const TargetInstrInfo *TII,
@@ -2562,34 +2766,70 @@ static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset,
TRC = TII->getRegClass(MCID, 2, TRI, *MF);
MRI.constrainRegClass(MI->getOperand(1).getReg(), TRC);
- return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID)
- .addReg(NewReg, RegState::Define)
- .add(MI->getOperand(0))
- .add(MI->getOperand(1))
- .addImm(Offset)
- .add(MI->getOperand(3))
- .add(MI->getOperand(4))
- .cloneMemRefs(*MI);
+ unsigned AddrMode = (MCID.TSFlags & ARMII::AddrModeMask);
+ switch (AddrMode) {
+ case ARMII::AddrModeT2_i7:
+ case ARMII::AddrModeT2_i7s2:
+ case ARMII::AddrModeT2_i7s4:
+ // Any MVE load/store
+ return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID)
+ .addReg(NewReg, RegState::Define)
+ .add(MI->getOperand(0))
+ .add(MI->getOperand(1))
+ .addImm(Offset)
+ .add(MI->getOperand(3))
+ .add(MI->getOperand(4))
+ .cloneMemRefs(*MI);
+ case ARMII::AddrModeT2_i8:
+ if (MI->mayLoad()) {
+ return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID)
+ .add(MI->getOperand(0))
+ .addReg(NewReg, RegState::Define)
+ .add(MI->getOperand(1))
+ .addImm(Offset)
+ .add(MI->getOperand(3))
+ .add(MI->getOperand(4))
+ .cloneMemRefs(*MI);
+ } else {
+ return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID)
+ .addReg(NewReg, RegState::Define)
+ .add(MI->getOperand(0))
+ .add(MI->getOperand(1))
+ .addImm(Offset)
+ .add(MI->getOperand(3))
+ .add(MI->getOperand(4))
+ .cloneMemRefs(*MI);
+ }
+ default:
+ llvm_unreachable("Unhandled createPostIncLoadStore");
+ }
}
// Given a Base Register, optimise the load/store uses to attempt to create more
-// post-inc accesses. We do this by taking zero offset loads/stores with an add,
-// and convert them to a postinc load/store of the same type. Any subsequent
-// accesses will be adjusted to use and account for the post-inc value.
+// post-inc accesses and fewer register moves. We do this by taking zero-offset
+// loads/stores with an add, and converting them to a postinc load/store of the
+// same type. Any subsequent accesses will be adjusted to use and account for
+// the post-inc value.
// For example:
// LDR #0 LDR_POSTINC #16
// LDR #4 LDR #-12
// LDR #8 LDR #-8
// LDR #12 LDR #-4
// ADD #16
+//
+// At the same time if we do not find an increment but do find an existing
+// pre/post inc instruction, we can still adjust the offsets of subsequent
+// instructions to save the register move that would otherwise be needed for the
+// in-place increment.
bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) {
// We are looking for:
// One zero offset load/store that can become postinc
MachineInstr *BaseAccess = nullptr;
+ MachineInstr *PrePostInc = nullptr;
// An increment that can be folded in
MachineInstr *Increment = nullptr;
// Other accesses after BaseAccess that will need to be updated to use the
- // postinc value
+ // postinc value.
SmallPtrSet<MachineInstr *, 8> OtherAccesses;
for (auto &Use : MRI->use_nodbg_instructions(Base)) {
if (!Increment && getAddSubImmediate(Use) != 0) {
@@ -2604,53 +2844,81 @@ bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) {
if (!Use.getOperand(BaseOp).isReg() ||
Use.getOperand(BaseOp).getReg() != Base)
return false;
- if (Use.getOperand(BaseOp + 1).getImm() == 0)
+ if (isPreIndex(Use) || isPostIndex(Use))
+ PrePostInc = &Use;
+ else if (Use.getOperand(BaseOp + 1).getImm() == 0)
BaseAccess = &Use;
else
OtherAccesses.insert(&Use);
}
- if (!BaseAccess || !Increment ||
- BaseAccess->getParent() != Increment->getParent())
- return false;
- Register PredReg;
- if (Increment->definesRegister(ARM::CPSR) ||
- getInstrPredicate(*Increment, PredReg) != ARMCC::AL)
- return false;
+ int IncrementOffset;
+ Register NewBaseReg;
+ if (BaseAccess && Increment) {
+ if (PrePostInc || BaseAccess->getParent() != Increment->getParent())
+ return false;
+ Register PredReg;
+ if (Increment->definesRegister(ARM::CPSR) ||
+ getInstrPredicate(*Increment, PredReg) != ARMCC::AL)
+ return false;
- LLVM_DEBUG(dbgs() << "\nAttempting to distribute increments on VirtualReg "
- << Base.virtRegIndex() << "\n");
+ LLVM_DEBUG(dbgs() << "\nAttempting to distribute increments on VirtualReg "
+ << Base.virtRegIndex() << "\n");
- // Make sure that Increment has no uses before BaseAccess.
- for (MachineInstr &Use :
- MRI->use_nodbg_instructions(Increment->getOperand(0).getReg())) {
- if (!DT->dominates(BaseAccess, &Use) || &Use == BaseAccess) {
- LLVM_DEBUG(dbgs() << " BaseAccess doesn't dominate use of increment\n");
+ // Make sure that Increment has no uses before BaseAccess.
+ for (MachineInstr &Use :
+ MRI->use_nodbg_instructions(Increment->getOperand(0).getReg())) {
+ if (!DT->dominates(BaseAccess, &Use) || &Use == BaseAccess) {
+ LLVM_DEBUG(dbgs() << " BaseAccess doesn't dominate use of increment\n");
+ return false;
+ }
+ }
+
+ // Make sure that Increment can be folded into Base
+ IncrementOffset = getAddSubImmediate(*Increment);
+ unsigned NewPostIncOpcode = getPostIndexedLoadStoreOpcode(
+ BaseAccess->getOpcode(), IncrementOffset > 0 ? ARM_AM::add : ARM_AM::sub);
+ if (!isLegalAddressImm(NewPostIncOpcode, IncrementOffset, TII)) {
+ LLVM_DEBUG(dbgs() << " Illegal addressing mode immediate on postinc\n");
return false;
}
}
+ else if (PrePostInc) {
+ // If we already have a pre/post index load/store then set BaseAccess,
+ // IncrementOffset and NewBaseReg to the values it already produces,
+    // allowing us to update any subsequent uses of the BaseOp reg with the
+ // incremented value.
+ if (Increment)
+ return false;
- // Make sure that Increment can be folded into Base
- int IncrementOffset = getAddSubImmediate(*Increment);
- unsigned NewPostIncOpcode = getPostIndexedLoadStoreOpcode(
- BaseAccess->getOpcode(), IncrementOffset > 0 ? ARM_AM::add : ARM_AM::sub);
- if (!isLegalAddressImm(NewPostIncOpcode, IncrementOffset, TII)) {
- LLVM_DEBUG(dbgs() << " Illegal addressing mode immediate on postinc\n");
- return false;
+ LLVM_DEBUG(dbgs() << "\nAttempting to distribute increments on already "
+ << "indexed VirtualReg " << Base.virtRegIndex() << "\n");
+ int BaseOp = getBaseOperandIndex(*PrePostInc);
+ IncrementOffset = PrePostInc->getOperand(BaseOp+1).getImm();
+ BaseAccess = PrePostInc;
+ NewBaseReg = PrePostInc->getOperand(0).getReg();
}
+ else
+ return false;
// And make sure that the negative value of increment can be added to all
// other offsets after the BaseAccess. We rely on either
// dominates(BaseAccess, OtherAccess) or dominates(OtherAccess, BaseAccess)
// to keep things simple.
+  // This also adds a simple codesize metric, to detect if an instruction (like
+  // t2LDRBi12) that could often be shrunk to a thumb1 instruction (tLDRBi)
+  // cannot be, because it is converted to something else (t2LDRBi8). We start this
+ // at -1 for the gain from removing the increment.
SmallPtrSet<MachineInstr *, 4> SuccessorAccesses;
+ int CodesizeEstimate = -1;
for (auto *Use : OtherAccesses) {
if (DT->dominates(BaseAccess, Use)) {
SuccessorAccesses.insert(Use);
unsigned BaseOp = getBaseOperandIndex(*Use);
- if (!isLegalAddressImm(
- Use->getOpcode(),
- Use->getOperand(BaseOp + 1).getImm() - IncrementOffset, TII)) {
+ if (!isLegalOrConvertableAddressImm(Use->getOpcode(),
+ Use->getOperand(BaseOp + 1).getImm() -
+ IncrementOffset,
+ TII, CodesizeEstimate)) {
LLVM_DEBUG(dbgs() << " Illegal addressing mode immediate on use\n");
return false;
}
@@ -2660,24 +2928,27 @@ bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) {
return false;
}
}
+ if (STI->hasMinSize() && CodesizeEstimate > 0) {
+ LLVM_DEBUG(dbgs() << " Expected to grow instructions under minsize\n");
+ return false;
+ }
- // Replace BaseAccess with a post inc
- LLVM_DEBUG(dbgs() << "Changing: "; BaseAccess->dump());
- LLVM_DEBUG(dbgs() << " And : "; Increment->dump());
- Register NewBaseReg = Increment->getOperand(0).getReg();
- MachineInstr *BaseAccessPost =
- createPostIncLoadStore(BaseAccess, IncrementOffset, NewBaseReg, TII, TRI);
- BaseAccess->eraseFromParent();
- Increment->eraseFromParent();
- (void)BaseAccessPost;
- LLVM_DEBUG(dbgs() << " To : "; BaseAccessPost->dump());
+ if (!PrePostInc) {
+ // Replace BaseAccess with a post inc
+ LLVM_DEBUG(dbgs() << "Changing: "; BaseAccess->dump());
+ LLVM_DEBUG(dbgs() << " And : "; Increment->dump());
+ NewBaseReg = Increment->getOperand(0).getReg();
+ MachineInstr *BaseAccessPost =
+ createPostIncLoadStore(BaseAccess, IncrementOffset, NewBaseReg, TII, TRI);
+ BaseAccess->eraseFromParent();
+ Increment->eraseFromParent();
+ (void)BaseAccessPost;
+ LLVM_DEBUG(dbgs() << " To : "; BaseAccessPost->dump());
+ }
for (auto *Use : SuccessorAccesses) {
LLVM_DEBUG(dbgs() << "Changing: "; Use->dump());
- unsigned BaseOp = getBaseOperandIndex(*Use);
- Use->getOperand(BaseOp).setReg(NewBaseReg);
- int OldOffset = Use->getOperand(BaseOp + 1).getImm();
- Use->getOperand(BaseOp + 1).setImm(OldOffset - IncrementOffset);
+ AdjustBaseAndOffset(Use, NewBaseReg, IncrementOffset, TII);
LLVM_DEBUG(dbgs() << " To : "; Use->dump());
}
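A small sketch of the offset bookkeeping that the DistributeIncrements/AdjustBaseAndOffset changes above rely on, using the numbers from the LDR example in the comment. Helper names are illustrative and the legality ranges are simplified to the checks visible in this diff.

#include <cstdlib>

// When an increment of Inc is folded into a post-indexed access, every later
// access to the old base is rebased: its offset becomes Offset - Inc. An i12
// offset must stay non-negative; otherwise the access is converted to the i8
// form, which the patch costs as one extra unit of codesize.
bool fits_t2_i12(int Offset) { return Offset >= 0 && Offset < 4096; }
bool fits_t2_i8(int Offset) { return std::abs(Offset) < 255; }

int rebased(int Offset, int Inc) { return Offset - Inc; }
// With Inc = 16, the LDR #4 / #8 / #12 of the example become
// LDR #-12 / #-8 / #-4, which need the i8 encoding.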
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index be75d6bef08c..61a924078f29 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -56,6 +56,7 @@
#include "ARMBaseRegisterInfo.h"
#include "ARMBasicBlockInfo.h"
#include "ARMSubtarget.h"
+#include "MVETailPredUtils.h"
#include "Thumb2InstrInfo.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallSet.h"
@@ -73,6 +74,37 @@ using namespace llvm;
#define DEBUG_TYPE "arm-low-overhead-loops"
#define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"
+static cl::opt<bool>
+DisableTailPredication("arm-loloops-disable-tailpred", cl::Hidden,
+ cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass"),
+ cl::init(false));
+
+static bool isVectorPredicated(MachineInstr *MI) {
+ int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);
+ return PIdx != -1 && MI->getOperand(PIdx + 1).getReg() == ARM::VPR;
+}
+
+static bool isVectorPredicate(MachineInstr *MI) {
+ return MI->findRegisterDefOperandIdx(ARM::VPR) != -1;
+}
+
+static bool hasVPRUse(MachineInstr &MI) {
+ return MI.findRegisterUseOperandIdx(ARM::VPR) != -1;
+}
+
+static bool isDomainMVE(MachineInstr *MI) {
+ uint64_t Domain = MI->getDesc().TSFlags & ARMII::DomainMask;
+ return Domain == ARMII::DomainMVE;
+}
+
+static bool shouldInspect(MachineInstr &MI) {
+ return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI);
+}
+
+static bool isDo(MachineInstr *MI) {
+ return MI->getOpcode() != ARM::t2WhileLoopStart;
+}
+
namespace {
using InstSet = SmallPtrSetImpl<MachineInstr *>;
@@ -111,8 +143,7 @@ namespace {
// Insert exit blocks.
SmallVector<MachineBasicBlock*, 2> ExitBlocks;
ML.getExitBlocks(ExitBlocks);
- for (auto *MBB : ExitBlocks)
- Order.push_back(MBB);
+ append_range(Order, ExitBlocks);
// Then add the loop body.
Search(ML.getHeader());
@@ -143,73 +174,187 @@ namespace {
}
};
- // Represent a VPT block, a list of instructions that begins with a VPT/VPST
- // and has a maximum of four proceeding instructions. All instructions within
- // the block are predicated upon the vpr and we allow instructions to define
- // the vpr within in the block too.
- class VPTBlock {
- // The predicate then instruction, which is either a VPT, or a VPST
- // instruction.
- std::unique_ptr<PredicatedMI> PredicateThen;
- PredicatedMI *Divergent = nullptr;
- SmallVector<PredicatedMI, 4> Insts;
+ // Represent the current state of the VPR and hold all instances which
+ // represent a VPT block, which is a list of instructions that begins with a
+  // VPT/VPST and has a maximum of four following instructions. All
+  // instructions within the block are predicated upon the vpr and we allow
+  // instructions to define the vpr within the block too.
+ class VPTState {
+ friend struct LowOverheadLoop;
+
+ SmallVector<MachineInstr *, 4> Insts;
+
+ static SmallVector<VPTState, 4> Blocks;
+ static SetVector<MachineInstr *> CurrentPredicates;
+ static std::map<MachineInstr *,
+ std::unique_ptr<PredicatedMI>> PredicatedInsts;
+
+ static void CreateVPTBlock(MachineInstr *MI) {
+ assert((CurrentPredicates.size() || MI->getParent()->isLiveIn(ARM::VPR))
+ && "Can't begin VPT without predicate");
+ Blocks.emplace_back(MI);
+ // The execution of MI is predicated upon the current set of instructions
+ // that are AND'ed together to form the VPR predicate value. In the case
+ // that MI is a VPT, CurrentPredicates will also just be MI.
+ PredicatedInsts.emplace(
+ MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates));
+ }
- public:
- VPTBlock(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
- PredicateThen = std::make_unique<PredicatedMI>(MI, Preds);
+ static void reset() {
+ Blocks.clear();
+ PredicatedInsts.clear();
+ CurrentPredicates.clear();
}
- void addInst(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Adding predicated MI: " << *MI);
- if (!Divergent && !set_difference(Preds, PredicateThen->Predicates).empty()) {
- Divergent = &Insts.back();
- LLVM_DEBUG(dbgs() << " - has divergent predicate: " << *Divergent->MI);
- }
- Insts.emplace_back(MI, Preds);
- assert(Insts.size() <= 4 && "Too many instructions in VPT block!");
+ static void addInst(MachineInstr *MI) {
+ Blocks.back().insert(MI);
+ PredicatedInsts.emplace(
+ MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates));
}
+ static void addPredicate(MachineInstr *MI) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI);
+ CurrentPredicates.insert(MI);
+ }
+
+ static void resetPredicate(MachineInstr *MI) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI);
+ CurrentPredicates.clear();
+ CurrentPredicates.insert(MI);
+ }
+
+ public:
// Have we found an instruction within the block which defines the vpr? If
// so, not all the instructions in the block will have the same predicate.
- bool HasNonUniformPredicate() const {
- return Divergent != nullptr;
+ static bool hasUniformPredicate(VPTState &Block) {
+ return getDivergent(Block) == nullptr;
}
- // Is the given instruction part of the predicate set controlling the entry
- // to the block.
- bool IsPredicatedOn(MachineInstr *MI) const {
- return PredicateThen->Predicates.count(MI);
+ // If it exists, return the first internal instruction which modifies the
+ // VPR.
+ static MachineInstr *getDivergent(VPTState &Block) {
+ SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
+ for (unsigned i = 1; i < Insts.size(); ++i) {
+ MachineInstr *Next = Insts[i];
+ if (isVectorPredicate(Next))
+ return Next; // Found an instruction altering the vpr.
+ }
+ return nullptr;
}
- // Returns true if this is a VPT instruction.
- bool isVPT() const { return !isVPST(); }
+ // Return whether the given instruction is predicated upon a VCTP.
+ static bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) {
+ SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI]->Predicates;
+ if (Exclusive && Predicates.size() != 1)
+ return false;
+ for (auto *PredMI : Predicates)
+ if (isVCTP(PredMI))
+ return true;
+ return false;
+ }
- // Returns true if this is a VPST instruction.
- bool isVPST() const {
- return PredicateThen->MI->getOpcode() == ARM::MVE_VPST;
+ // Is the VPST, controlling the block entry, predicated upon a VCTP.
+ static bool isEntryPredicatedOnVCTP(VPTState &Block,
+ bool Exclusive = false) {
+ SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
+ return isPredicatedOnVCTP(Insts.front(), Exclusive);
}
- // Is the given instruction the only predicate which controls the entry to
- // the block.
- bool IsOnlyPredicatedOn(MachineInstr *MI) const {
- return IsPredicatedOn(MI) && PredicateThen->Predicates.size() == 1;
+ // If this block begins with a VPT, we can check whether it's using
+  // at least one predicated input, as well as possibly loop-invariant operands,
+  // which would result in it being implicitly predicated.
+ static bool hasImplicitlyValidVPT(VPTState &Block,
+ ReachingDefAnalysis &RDA) {
+ SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
+ MachineInstr *VPT = Insts.front();
+ assert(isVPTOpcode(VPT->getOpcode()) &&
+ "Expected VPT block to begin with VPT/VPST");
+
+ if (VPT->getOpcode() == ARM::MVE_VPST)
+ return false;
+
+ auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) {
+ MachineInstr *Op = RDA.getMIOperand(MI, MI->getOperand(Idx));
+ return Op && PredicatedInsts.count(Op) && isPredicatedOnVCTP(Op);
+ };
+
+ auto IsOperandInvariant = [&](MachineInstr *MI, unsigned Idx) {
+ MachineOperand &MO = MI->getOperand(Idx);
+ if (!MO.isReg() || !MO.getReg())
+ return true;
+
+ SmallPtrSet<MachineInstr *, 2> Defs;
+ RDA.getGlobalReachingDefs(MI, MO.getReg(), Defs);
+ if (Defs.empty())
+ return true;
+
+ for (auto *Def : Defs)
+ if (Def->getParent() == VPT->getParent())
+ return false;
+ return true;
+ };
+
+ // Check that at least one of the operands is directly predicated on a
+ // vctp and allow an invariant value too.
+ return (IsOperandPredicated(VPT, 1) || IsOperandPredicated(VPT, 2)) &&
+ (IsOperandPredicated(VPT, 1) || IsOperandInvariant(VPT, 1)) &&
+ (IsOperandPredicated(VPT, 2) || IsOperandInvariant(VPT, 2));
}
- unsigned size() const { return Insts.size(); }
- SmallVectorImpl<PredicatedMI> &getInsts() { return Insts; }
- MachineInstr *getPredicateThen() const { return PredicateThen->MI; }
- PredicatedMI *getDivergent() const { return Divergent; }
- };
+ static bool isValid(ReachingDefAnalysis &RDA) {
+ // All predication within the loop should be based on vctp. If the block
+ // isn't predicated on entry, check whether the vctp is within the block
+ // and that all other instructions are then predicated on it.
+ for (auto &Block : Blocks) {
+ if (isEntryPredicatedOnVCTP(Block, false) ||
+ hasImplicitlyValidVPT(Block, RDA))
+ continue;
- struct Reduction {
- MachineInstr *Init;
- MachineInstr &Copy;
- MachineInstr &Reduce;
- MachineInstr &VPSEL;
+ SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
+ // We don't know how to convert a block with just a VPT;VCTP into
+ // anything valid once we remove the VCTP. For now just bail out.
+ assert(isVPTOpcode(Insts.front()->getOpcode()) &&
+ "Expected VPT block to start with a VPST or VPT!");
+ if (Insts.size() == 2 && Insts.front()->getOpcode() != ARM::MVE_VPST &&
+ isVCTP(Insts.back()))
+ return false;
+
+ for (auto *MI : Insts) {
+ // Check that any internal VCTPs are 'Then' predicated.
+ if (isVCTP(MI) && getVPTInstrPredicate(*MI) != ARMVCC::Then)
+ return false;
+ // Skip other instructions that build up the predicate.
+ if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI))
+ continue;
+ // Check that any other instructions are predicated upon a vctp.
+ // TODO: We could infer when VPTs are implicitly predicated on the
+ // vctp (when the operands are predicated).
+ if (!isPredicatedOnVCTP(MI)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI);
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ VPTState(MachineInstr *MI) { Insts.push_back(MI); }
- Reduction(MachineInstr *Init, MachineInstr *Mov, MachineInstr *Add,
- MachineInstr *Sel)
- : Init(Init), Copy(*Mov), Reduce(*Add), VPSEL(*Sel) { }
+ void insert(MachineInstr *MI) {
+ Insts.push_back(MI);
+ // VPT/VPST + 4 predicated instructions.
+ assert(Insts.size() <= 5 && "Too many instructions in VPT block!");
+ }
+
+ bool containsVCTP() const {
+ for (auto *MI : Insts)
+ if (isVCTP(MI))
+ return true;
+ return false;
+ }
+
+ unsigned size() const { return Insts.size(); }
+ SmallVectorImpl<MachineInstr *> &getInsts() { return Insts; }
};
struct LowOverheadLoop {
@@ -221,17 +366,14 @@ namespace {
const TargetRegisterInfo &TRI;
const ARMBaseInstrInfo &TII;
MachineFunction *MF = nullptr;
- MachineInstr *InsertPt = nullptr;
+ MachineBasicBlock::iterator StartInsertPt;
+ MachineBasicBlock *StartInsertBB = nullptr;
MachineInstr *Start = nullptr;
MachineInstr *Dec = nullptr;
MachineInstr *End = nullptr;
- MachineInstr *VCTP = nullptr;
- SmallPtrSet<MachineInstr*, 4> SecondaryVCTPs;
- VPTBlock *CurrentBlock = nullptr;
- SetVector<MachineInstr*> CurrentPredicate;
- SmallVector<VPTBlock, 4> VPTBlocks;
+ MachineOperand TPNumElements;
+ SmallVector<MachineInstr*, 4> VCTPs;
SmallPtrSet<MachineInstr*, 4> ToRemove;
- SmallVector<std::unique_ptr<Reduction>, 1> Reductions;
SmallPtrSet<MachineInstr*, 4> BlockMasksToRecompute;
bool Revert = false;
bool CannotTailPredicate = false;
@@ -239,12 +381,14 @@ namespace {
LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI,
ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI,
const ARMBaseInstrInfo &TII)
- : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII) {
+ : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII),
+ TPNumElements(MachineOperand::CreateImm(0)) {
MF = ML.getHeader()->getParent();
if (auto *MBB = ML.getLoopPreheader())
Preheader = MBB;
else if (auto *MBB = MLI.findLoopPreheader(&ML, true))
Preheader = MBB;
+ VPTState::reset();
}
// If this is an MVE instruction, check that we know how to use tail
@@ -259,18 +403,18 @@ namespace {
bool IsTailPredicationLegal() const {
// For now, let's keep things really simple and only support a single
// block for tail predication.
- return !Revert && FoundAllComponents() && VCTP &&
+ return !Revert && FoundAllComponents() && !VCTPs.empty() &&
!CannotTailPredicate && ML.getNumBlocks() == 1;
}
+  // Given that MI is a VCTP, check that it is equivalent to any other VCTPs
+ // found.
+ bool AddVCTP(MachineInstr *MI);
+
// Check that the predication in the loop will be equivalent once we
// perform the conversion. Also ensure that we can provide the number
// of elements to the loop start instruction.
- bool ValidateTailPredicate(MachineInstr *StartInsertPt);
-
- // See whether the live-out instructions are a reduction that we can fixup
- // later.
- bool FindValidReduction(InstSet &LiveMIs, InstSet &LiveOutUsers);
+ bool ValidateTailPredicate();
// Check that any values available outside of the loop will be the same
// after tail predication conversion.
@@ -283,34 +427,41 @@ namespace {
// Check the branch targets are within range and we satisfy our
// restrictions.
- void CheckLegality(ARMBasicBlockUtils *BBUtils);
+ void Validate(ARMBasicBlockUtils *BBUtils);
bool FoundAllComponents() const {
return Start && Dec && End;
}
- SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTBlocks; }
+ SmallVectorImpl<VPTState> &getVPTBlocks() {
+ return VPTState::Blocks;
+ }
- // Return the loop iteration count, or the number of elements if we're tail
- // predicating.
- MachineOperand &getCount() {
- return IsTailPredicationLegal() ?
- VCTP->getOperand(1) : Start->getOperand(0);
+ // Return the operand for the loop start instruction. This will be the loop
+ // iteration count, or the number of elements if we're tail predicating.
+ MachineOperand &getLoopStartOperand() {
+ if (IsTailPredicationLegal())
+ return TPNumElements;
+ return isDo(Start) ? Start->getOperand(1) : Start->getOperand(0);
}
unsigned getStartOpcode() const {
- bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart;
+ bool IsDo = isDo(Start);
if (!IsTailPredicationLegal())
return IsDo ? ARM::t2DLS : ARM::t2WLS;
- return VCTPOpcodeToLSTP(VCTP->getOpcode(), IsDo);
+ return VCTPOpcodeToLSTP(VCTPs.back()->getOpcode(), IsDo);
}
void dump() const {
if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;
- if (VCTP) dbgs() << "ARM Loops: Found VCTP: " << *VCTP;
+ if (!VCTPs.empty()) {
+ dbgs() << "ARM Loops: Found VCTP(s):\n";
+ for (auto *MI : VCTPs)
+ dbgs() << " - " << *MI;
+ }
if (!FoundAllComponents())
dbgs() << "ARM Loops: Not a low-overhead loop.\n";
else if (!(Start && Dec && End))
@@ -357,14 +508,15 @@ namespace {
bool RevertNonLoops();
void RevertWhile(MachineInstr *MI) const;
+ void RevertDo(MachineInstr *MI) const;
bool RevertLoopDec(MachineInstr *MI) const;
void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;
- void ConvertVPTBlocks(LowOverheadLoop &LoLoop);
+ void RevertLoopEndDec(MachineInstr *MI) const;
- void FixupReductions(LowOverheadLoop &LoLoop) const;
+ void ConvertVPTBlocks(LowOverheadLoop &LoLoop);
MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
@@ -376,149 +528,228 @@ namespace {
char ARMLowOverheadLoops::ID = 0;
+SmallVector<VPTState, 4> VPTState::Blocks;
+SetVector<MachineInstr *> VPTState::CurrentPredicates;
+std::map<MachineInstr *,
+ std::unique_ptr<PredicatedMI>> VPTState::PredicatedInsts;
+
INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
false, false)
-MachineInstr *LowOverheadLoop::isSafeToDefineLR() {
- // We can define LR because LR already contains the same value.
- if (Start->getOperand(0).getReg() == ARM::LR)
- return Start;
-
- unsigned CountReg = Start->getOperand(0).getReg();
- auto IsMoveLR = [&CountReg](MachineInstr *MI) {
- return MI->getOpcode() == ARM::tMOVr &&
- MI->getOperand(0).getReg() == ARM::LR &&
- MI->getOperand(1).getReg() == CountReg &&
- MI->getOperand(2).getImm() == ARMCC::AL;
- };
-
- MachineBasicBlock *MBB = Start->getParent();
-
- // Find an insertion point:
- // - Is there a (mov lr, Count) before Start? If so, and nothing else writes
- // to Count before Start, we can insert at that mov.
- if (auto *LRDef = RDA.getUniqueReachingMIDef(Start, ARM::LR))
- if (IsMoveLR(LRDef) && RDA.hasSameReachingDef(Start, LRDef, CountReg))
- return LRDef;
-
- // - Is there a (mov lr, Count) after Start? If so, and nothing else writes
- // to Count after Start, we can insert at that mov.
- if (auto *LRDef = RDA.getLocalLiveOutMIDef(MBB, ARM::LR))
- if (IsMoveLR(LRDef) && RDA.hasSameReachingDef(Start, LRDef, CountReg))
- return LRDef;
-
- // We've found no suitable LR def and Start doesn't use LR directly. Can we
- // just define LR anyway?
- return RDA.isSafeToDefRegAt(Start, ARM::LR) ? Start : nullptr;
-}
+static bool TryRemove(MachineInstr *MI, ReachingDefAnalysis &RDA,
+ InstSet &ToRemove, InstSet &Ignore) {
+
+ // Check that we can remove all of Killed without having to modify any IT
+ // blocks.
+ auto WontCorruptITs = [](InstSet &Killed, ReachingDefAnalysis &RDA) {
+ // Collect the dead code and the MBBs in which they reside.
+ SmallPtrSet<MachineBasicBlock*, 2> BasicBlocks;
+ for (auto *Dead : Killed)
+ BasicBlocks.insert(Dead->getParent());
+
+ // Collect IT blocks in all affected basic blocks.
+ std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks;
+ for (auto *MBB : BasicBlocks) {
+ for (auto &IT : *MBB) {
+ if (IT.getOpcode() != ARM::t2IT)
+ continue;
+ RDA.getReachingLocalUses(&IT, MCRegister::from(ARM::ITSTATE),
+ ITBlocks[&IT]);
+ }
+ }
-bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
- assert(VCTP && "VCTP instruction expected but is not set");
- // All predication within the loop should be based on vctp. If the block
- // isn't predicated on entry, check whether the vctp is within the block
- // and that all other instructions are then predicated on it.
- for (auto &Block : VPTBlocks) {
- if (Block.IsPredicatedOn(VCTP))
- continue;
- if (Block.HasNonUniformPredicate() && !isVCTP(Block.getDivergent()->MI)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Found unsupported diverging predicate: "
- << *Block.getDivergent()->MI);
- return false;
+ // If we're removing all of the instructions within an IT block, then
+ // also remove the IT instruction.
+ SmallPtrSet<MachineInstr *, 2> ModifiedITs;
+ SmallPtrSet<MachineInstr *, 2> RemoveITs;
+ for (auto *Dead : Killed) {
+ if (MachineOperand *MO = Dead->findRegisterUseOperand(ARM::ITSTATE)) {
+ MachineInstr *IT = RDA.getMIOperand(Dead, *MO);
+ RemoveITs.insert(IT);
+ auto &CurrentBlock = ITBlocks[IT];
+ CurrentBlock.erase(Dead);
+ if (CurrentBlock.empty())
+ ModifiedITs.erase(IT);
+ else
+ ModifiedITs.insert(IT);
+ }
}
- SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
- for (auto &PredMI : Insts) {
- // Check the instructions in the block and only allow:
- // - VCTPs
- // - Instructions predicated on the main VCTP
- // - Any VCMP
- // - VCMPs just "and" their result with VPR.P0. Whether they are
- // located before/after the VCTP is irrelevant - the end result will
- // be the same in both cases, so there's no point in requiring them
- // to be located after the VCTP!
- if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI) ||
- VCMPOpcodeToVPT(PredMI.MI->getOpcode()) != 0)
- continue;
- LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *PredMI.MI
- << " - which is predicated on:\n";
- for (auto *MI : PredMI.Predicates)
- dbgs() << " - " << *MI);
+ if (!ModifiedITs.empty())
return false;
+ Killed.insert(RemoveITs.begin(), RemoveITs.end());
+ return true;
+ };
+
+ SmallPtrSet<MachineInstr *, 2> Uses;
+ if (!RDA.isSafeToRemove(MI, Uses, Ignore))
+ return false;
+
+ if (WontCorruptITs(Uses, RDA)) {
+ ToRemove.insert(Uses.begin(), Uses.end());
+ LLVM_DEBUG(dbgs() << "ARM Loops: Able to remove: " << *MI
+ << " - can also remove:\n";
+ for (auto *Use : Uses)
+ dbgs() << " - " << *Use);
+
+ SmallPtrSet<MachineInstr*, 4> Killed;
+ RDA.collectKilledOperands(MI, Killed);
+ if (WontCorruptITs(Killed, RDA)) {
+ ToRemove.insert(Killed.begin(), Killed.end());
+ LLVM_DEBUG(for (auto *Dead : Killed)
+ dbgs() << " - " << *Dead);
}
+ return true;
}
+ return false;
+}
- if (!ValidateLiveOuts())
+bool LowOverheadLoop::ValidateTailPredicate() {
+ if (!IsTailPredicationLegal()) {
+ LLVM_DEBUG(if (VCTPs.empty())
+ dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n";
+ dbgs() << "ARM Loops: Tail-predication is not valid.\n");
return false;
+ }
- // For tail predication, we need to provide the number of elements, instead
- // of the iteration count, to the loop start instruction. The number of
- // elements is provided to the vctp instruction, so we need to check that
- // we can use this register at InsertPt.
- Register NumElements = VCTP->getOperand(1).getReg();
+ assert(!VCTPs.empty() && "VCTP instruction expected but is not set");
+ assert(ML.getBlocks().size() == 1 &&
+ "Shouldn't be processing a loop with more than one block");
- // If the register is defined within loop, then we can't perform TP.
- // TODO: Check whether this is just a mov of a register that would be
- // available.
- if (RDA.hasLocalDefBefore(VCTP, NumElements)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n");
+ if (DisableTailPredication) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: tail-predication is disabled\n");
return false;
}
- // The element count register maybe defined after InsertPt, in which case we
- // need to try to move either InsertPt or the def so that the [w|d]lstp can
- // use the value.
- // TODO: On failing to move an instruction, check if the count is provided by
- // a mov and whether we can use the mov operand directly.
- MachineBasicBlock *InsertBB = StartInsertPt->getParent();
- if (!RDA.isReachingDefLiveOut(StartInsertPt, NumElements)) {
- if (auto *ElemDef = RDA.getLocalLiveOutMIDef(InsertBB, NumElements)) {
- if (RDA.isSafeToMoveForwards(ElemDef, StartInsertPt)) {
- ElemDef->removeFromParent();
- InsertBB->insert(MachineBasicBlock::iterator(StartInsertPt), ElemDef);
- LLVM_DEBUG(dbgs() << "ARM Loops: Moved element count def: "
- << *ElemDef);
- } else if (RDA.isSafeToMoveBackwards(StartInsertPt, ElemDef)) {
- StartInsertPt->removeFromParent();
- InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef),
- StartInsertPt);
- LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
- } else {
- LLVM_DEBUG(dbgs() << "ARM Loops: Unable to move element count to loop "
- << "start instruction.\n");
- return false;
+ if (!VPTState::isValid(RDA)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Invalid VPT state.\n");
+ return false;
+ }
+
+ if (!ValidateLiveOuts()) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n");
+ return false;
+ }
+
+ // Check that creating a [W|D]LSTP, which will define LR with an element
+ // count instead of iteration count, won't affect any other instructions
+ // than the LoopStart and LoopDec.
+ // TODO: We should try to insert the [W|D]LSTP after any of the other uses.
+ Register StartReg = isDo(Start) ? Start->getOperand(1).getReg()
+ : Start->getOperand(0).getReg();
+ if (StartInsertPt == Start && StartReg == ARM::LR) {
+ if (auto *IterCount = RDA.getMIOperand(Start, isDo(Start) ? 1 : 0)) {
+ SmallPtrSet<MachineInstr *, 2> Uses;
+ RDA.getGlobalUses(IterCount, MCRegister::from(ARM::LR), Uses);
+ for (auto *Use : Uses) {
+ if (Use != Start && Use != Dec) {
+ LLVM_DEBUG(dbgs() << " ARM Loops: Found LR use: " << *Use);
+ return false;
+ }
}
}
}
- // Especially in the case of while loops, InsertBB may not be the
- // preheader, so we need to check that the register isn't redefined
- // before entering the loop.
- auto CannotProvideElements = [this](MachineBasicBlock *MBB,
- Register NumElements) {
- // NumElements is redefined in this block.
- if (RDA.hasLocalDefBefore(&MBB->back(), NumElements))
- return true;
+ // For tail predication, we need to provide the number of elements, instead
+ // of the iteration count, to the loop start instruction. The number of
+ // elements is provided to the vctp instruction, so we need to check that
+ // we can use this register at InsertPt.
+ MachineInstr *VCTP = VCTPs.back();
+ if (Start->getOpcode() == ARM::t2DoLoopStartTP) {
+ TPNumElements = Start->getOperand(2);
+ StartInsertPt = Start;
+ StartInsertBB = Start->getParent();
+ } else {
+ TPNumElements = VCTP->getOperand(1);
+ MCRegister NumElements = TPNumElements.getReg().asMCReg();
+
+ // If the register is defined within loop, then we can't perform TP.
+ // TODO: Check whether this is just a mov of a register that would be
+ // available.
+ if (RDA.hasLocalDefBefore(VCTP, NumElements)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n");
+ return false;
+ }
- // Don't continue searching up through multiple predecessors.
- if (MBB->pred_size() > 1)
- return true;
+    // The element count register may be defined after InsertPt, in which case we
+ // need to try to move either InsertPt or the def so that the [w|d]lstp can
+ // use the value.
+
+ if (StartInsertPt != StartInsertBB->end() &&
+ !RDA.isReachingDefLiveOut(&*StartInsertPt, NumElements)) {
+ if (auto *ElemDef =
+ RDA.getLocalLiveOutMIDef(StartInsertBB, NumElements)) {
+ if (RDA.isSafeToMoveForwards(ElemDef, &*StartInsertPt)) {
+ ElemDef->removeFromParent();
+ StartInsertBB->insert(StartInsertPt, ElemDef);
+ LLVM_DEBUG(dbgs()
+ << "ARM Loops: Moved element count def: " << *ElemDef);
+ } else if (RDA.isSafeToMoveBackwards(&*StartInsertPt, ElemDef)) {
+ StartInsertPt->removeFromParent();
+ StartInsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef),
+ &*StartInsertPt);
+ LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
+ } else {
+ // If we fail to move an instruction and the element count is provided
+ // by a mov, use the mov operand if it will have the same value at the
+          // insertion point.
+ MachineOperand Operand = ElemDef->getOperand(1);
+ if (isMovRegOpcode(ElemDef->getOpcode()) &&
+ RDA.getUniqueReachingMIDef(ElemDef, Operand.getReg().asMCReg()) ==
+ RDA.getUniqueReachingMIDef(&*StartInsertPt,
+ Operand.getReg().asMCReg())) {
+ TPNumElements = Operand;
+ NumElements = TPNumElements.getReg();
+ } else {
+ LLVM_DEBUG(dbgs()
+ << "ARM Loops: Unable to move element count to loop "
+ << "start instruction.\n");
+ return false;
+ }
+ }
+ }
+ }
- return false;
- };
+ // Especially in the case of while loops, InsertBB may not be the
+ // preheader, so we need to check that the register isn't redefined
+ // before entering the loop.
+ auto CannotProvideElements = [this](MachineBasicBlock *MBB,
+ MCRegister NumElements) {
+ if (MBB->empty())
+ return false;
+ // NumElements is redefined in this block.
+ if (RDA.hasLocalDefBefore(&MBB->back(), NumElements))
+ return true;
- // First, find the block that looks like the preheader.
- MachineBasicBlock *MBB = Preheader;
- if (!MBB) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find preheader.\n");
- return false;
- }
+ // Don't continue searching up through multiple predecessors.
+ if (MBB->pred_size() > 1)
+ return true;
- // Then search backwards for a def, until we get to InsertBB.
- while (MBB != InsertBB) {
- if (CannotProvideElements(MBB, NumElements)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n");
return false;
+ };
+
+ // Search backwards for a def, until we get to InsertBB.
+ MachineBasicBlock *MBB = Preheader;
+ while (MBB && MBB != StartInsertBB) {
+ if (CannotProvideElements(MBB, NumElements)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n");
+ return false;
+ }
+ MBB = *MBB->pred_begin();
}
- MBB = *MBB->pred_begin();
+ }
+
+  // Could inserting the [W|D]LSTP cause some unintended effects? In a perfect
+  // world the [w|d]lstp instruction would be the last instruction in the
+  // preheader and so it would only affect instructions within the loop body.
+  // But due to scheduling, and/or the logic in this pass (above), the
+  // insertion point can be moved earlier. So if the Loop Start isn't the last
+  // instruction in the preheader, and if the initial element count is smaller
+  // than the vector width, the Loop Start instruction will immediately
+  // generate one or more false lane masks which can, incorrectly, affect the
+  // following MVE instructions in the preheader.
+ if (std::any_of(StartInsertPt, StartInsertBB->end(), shouldInspect)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Instruction blocks [W|D]LSTP\n");
+ return false;
}
// Check that the value change of the element count is what we expect and
@@ -529,15 +760,20 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
return -getAddSubImmediate(*MI) == ExpectedVecWidth;
};
- MBB = VCTP->getParent();
- if (auto *Def = RDA.getUniqueReachingMIDef(&MBB->back(), NumElements)) {
+ MachineBasicBlock *MBB = VCTP->getParent();
+  // Remove modifications to the element count since they have no purpose in a
+  // tail-predicated loop. Explicitly refer to the vctp operand no matter which
+  // register NumElements has been assigned to, since that is what the
+  // modifications will be using.
+ if (auto *Def = RDA.getUniqueReachingMIDef(
+ &MBB->back(), VCTP->getOperand(1).getReg().asMCReg())) {
SmallPtrSet<MachineInstr*, 2> ElementChain;
- SmallPtrSet<MachineInstr*, 2> Ignore = { VCTP };
+ SmallPtrSet<MachineInstr*, 2> Ignore;
unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode());
- Ignore.insert(SecondaryVCTPs.begin(), SecondaryVCTPs.end());
+ Ignore.insert(VCTPs.begin(), VCTPs.end());
- if (RDA.isSafeToRemove(Def, ElementChain, Ignore)) {
+ if (TryRemove(Def, RDA, ElementChain, Ignore)) {
bool FoundSub = false;
for (auto *MI : ElementChain) {
@@ -545,27 +781,24 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
continue;
if (isSubImmOpcode(MI->getOpcode())) {
- if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth))
+ if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
+ " count: " << *MI);
return false;
+ }
FoundSub = true;
- } else
+ } else {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
+ " count: " << *MI);
return false;
+ }
}
-
- LLVM_DEBUG(dbgs() << "ARM Loops: Will remove element count chain:\n";
- for (auto *MI : ElementChain)
- dbgs() << " - " << *MI);
ToRemove.insert(ElementChain.begin(), ElementChain.end());
}
}
return true;
}
-static bool isVectorPredicated(MachineInstr *MI) {
- int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);
- return PIdx != -1 && MI->getOperand(PIdx + 1).getReg() == ARM::VPR;
-}
-
static bool isRegInClass(const MachineOperand &MO,
const TargetRegisterClass *Class) {
return MO.isReg() && MO.getReg() && Class->contains(MO.getReg());
@@ -623,7 +856,6 @@ static bool canGenerateNonZeros(const MachineInstr &MI) {
return false;
}
-
// Look at its register uses to see if it only can only receive zeros
// into its false lanes which would then produce zeros. Also check that
// the output register is also defined by an FalseLanesZero instruction
@@ -636,120 +868,40 @@ static bool producesFalseLanesZero(MachineInstr &MI,
if (canGenerateNonZeros(MI))
return false;
+ bool isPredicated = isVectorPredicated(&MI);
+ // Predicated loads will write zeros to the falsely predicated bytes of the
+ // destination register.
+ if (MI.mayLoad())
+ return isPredicated;
+
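+  // An unpredicated vmov of immediate zero also guarantees zeroed false lanes,
+  // so treat it like any other FalseLanesZero producer.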
+ auto IsZeroInit = [](MachineInstr *Def) {
+ return !isVectorPredicated(Def) &&
+ Def->getOpcode() == ARM::MVE_VMOVimmi32 &&
+ Def->getOperand(1).getImm() == 0;
+ };
+
bool AllowScalars = isHorizontalReduction(MI);
for (auto &MO : MI.operands()) {
if (!MO.isReg() || !MO.getReg())
continue;
if (!isRegInClass(MO, QPRs) && AllowScalars)
continue;
- if (auto *OpDef = RDA.getMIOperand(&MI, MO))
- if (FalseLanesZero.count(OpDef))
- continue;
- return false;
- }
- LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);
- return true;
-}
-bool
-LowOverheadLoop::FindValidReduction(InstSet &LiveMIs, InstSet &LiveOutUsers) {
- // Also check for reductions where the operation needs to be merging values
- // from the last and previous loop iterations. This means an instruction
- // producing a value and a vmov storing the value calculated in the previous
- // iteration. So we can have two live-out regs, one produced by a vmov and
- // both being consumed by a vpsel.
- LLVM_DEBUG(dbgs() << "ARM Loops: Looking for reduction live-outs:\n";
- for (auto *MI : LiveMIs)
- dbgs() << " - " << *MI);
-
- if (!Preheader)
- return false;
-
- // Expect a vmov, a vadd and a single vpsel user.
- // TODO: This means we can't currently support multiple reductions in the
- // loop.
- if (LiveMIs.size() != 2 || LiveOutUsers.size() != 1)
- return false;
-
- MachineInstr *VPSEL = *LiveOutUsers.begin();
- if (VPSEL->getOpcode() != ARM::MVE_VPSEL)
- return false;
-
- unsigned VPRIdx = llvm::findFirstVPTPredOperandIdx(*VPSEL) + 1;
- MachineInstr *Pred = RDA.getMIOperand(VPSEL, VPRIdx);
- if (!Pred || Pred != VCTP) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Not using equivalent predicate.\n");
- return false;
- }
-
- MachineInstr *Reduce = RDA.getMIOperand(VPSEL, 1);
- if (!Reduce)
- return false;
-
- assert(LiveMIs.count(Reduce) && "Expected MI to be live-out");
-
- // TODO: Support more operations than VADD.
- switch (VCTP->getOpcode()) {
- default:
- return false;
- case ARM::MVE_VCTP8:
- if (Reduce->getOpcode() != ARM::MVE_VADDi8)
- return false;
- break;
- case ARM::MVE_VCTP16:
- if (Reduce->getOpcode() != ARM::MVE_VADDi16)
- return false;
- break;
- case ARM::MVE_VCTP32:
- if (Reduce->getOpcode() != ARM::MVE_VADDi32)
+ // Check that this instruction will produce zeros in its false lanes:
+    // - If it only consumes false-lane zeros or constant 0 (vmov #0).
+    // - If it's predicated, it only matters that its def register already has
+    //   false lane zeros, so we can ignore the uses.
+ SmallPtrSet<MachineInstr *, 2> Defs;
+ RDA.getGlobalReachingDefs(&MI, MO.getReg(), Defs);
+ for (auto *Def : Defs) {
+ if (Def == &MI || FalseLanesZero.count(Def) || IsZeroInit(Def))
+ continue;
+ if (MO.isUse() && isPredicated)
+ continue;
return false;
- break;
- }
-
- // Test that the reduce op is overwriting ones of its operands.
- if (Reduce->getOperand(0).getReg() != Reduce->getOperand(1).getReg() &&
- Reduce->getOperand(0).getReg() != Reduce->getOperand(2).getReg()) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Reducing op isn't overwriting itself.\n");
- return false;
- }
-
- // Check that the VORR is actually a VMOV.
- MachineInstr *Copy = RDA.getMIOperand(VPSEL, 2);
- if (!Copy || Copy->getOpcode() != ARM::MVE_VORR ||
- !Copy->getOperand(1).isReg() || !Copy->getOperand(2).isReg() ||
- Copy->getOperand(1).getReg() != Copy->getOperand(2).getReg())
- return false;
-
- assert(LiveMIs.count(Copy) && "Expected MI to be live-out");
-
- // Check that the vadd and vmov are only used by each other and the vpsel.
- SmallPtrSet<MachineInstr*, 2> CopyUsers;
- RDA.getGlobalUses(Copy, Copy->getOperand(0).getReg(), CopyUsers);
- if (CopyUsers.size() > 2 || !CopyUsers.count(Reduce)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Copy users unsupported.\n");
- return false;
- }
-
- SmallPtrSet<MachineInstr*, 2> ReduceUsers;
- RDA.getGlobalUses(Reduce, Reduce->getOperand(0).getReg(), ReduceUsers);
- if (ReduceUsers.size() > 2 || !ReduceUsers.count(Copy)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Reduce users unsupported.\n");
- return false;
+ }
}
-
- // Then find whether there's an instruction initialising the register that
- // is storing the reduction.
- SmallPtrSet<MachineInstr*, 2> Incoming;
- RDA.getLiveOuts(Preheader, Copy->getOperand(1).getReg(), Incoming);
- if (Incoming.size() > 1)
- return false;
-
- MachineInstr *Init = Incoming.empty() ? nullptr : *Incoming.begin();
- LLVM_DEBUG(dbgs() << "ARM Loops: Found a reduction:\n"
- << " - " << *Copy
- << " - " << *Reduce
- << " - " << *VPSEL);
- Reductions.push_back(std::make_unique<Reduction>(Init, Copy, Reduce, VPSEL));
+ LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);
return true;
}
@@ -769,7 +921,7 @@ bool LowOverheadLoop::ValidateLiveOuts() {
// the false lanes are zeroed and here we're trying to track that those false
// lanes remain zero, or where they change, the differences are masked away
// by their user(s).
- // All MVE loads and stores have to be predicated, so we know that any load
+  // All MVE stores have to be predicated, so we know that any predicated load
// operands, or stored results are equivalent already. Other explicitly
// predicated instructions will perform the same operation in the original
// loop and the tail-predicated form too. Because of this, we can insert
@@ -782,42 +934,32 @@ bool LowOverheadLoop::ValidateLiveOuts() {
MachineBasicBlock *Header = ML.getHeader();
for (auto &MI : *Header) {
- const MCInstrDesc &MCID = MI.getDesc();
- uint64_t Flags = MCID.TSFlags;
- if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE)
+ if (!shouldInspect(MI))
continue;
if (isVCTP(&MI) || isVPTOpcode(MI.getOpcode()))
continue;
- // Predicated loads will write zeros to the falsely predicated bytes of the
- // destination register.
- if (isVectorPredicated(&MI)) {
- if (MI.mayLoad())
- FalseLanesZero.insert(&MI);
- Predicated.insert(&MI);
- continue;
- }
+ bool isPredicated = isVectorPredicated(&MI);
+ bool retainsOrReduces =
+ retainsPreviousHalfElement(MI) || isHorizontalReduction(MI);
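+    // An unpredicated instruction that retains half-elements or reduces across
+    // lanes would observe the extra false lanes, so it blocks the conversion
+    // (checked below).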
- if (MI.getNumDefs() == 0)
+ if (isPredicated)
+ Predicated.insert(&MI);
+ if (producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero))
+ FalseLanesZero.insert(&MI);
+ else if (MI.getNumDefs() == 0)
continue;
-
- if (!producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero)) {
- // We require retaining and horizontal operations to operate upon zero'd
- // false lanes to ensure the conversion doesn't change the output.
- if (retainsPreviousHalfElement(MI) || isHorizontalReduction(MI))
- return false;
- // Otherwise we need to evaluate this instruction later to see whether
- // unknown false lanes will get masked away by their user(s).
+ else if (!isPredicated && retainsOrReduces)
+ return false;
+ else if (!isPredicated)
FalseLanesUnknown.insert(&MI);
- } else if (!isHorizontalReduction(MI))
- FalseLanesZero.insert(&MI);
}
auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO,
SmallPtrSetImpl<MachineInstr *> &Predicated) {
SmallPtrSet<MachineInstr *, 2> Uses;
- RDA.getGlobalUses(MI, MO.getReg(), Uses);
+ RDA.getGlobalUses(MI, MO.getReg().asMCReg(), Uses);
for (auto *Use : Uses) {
if (Use != MI && !Predicated.count(Use))
return false;
@@ -840,139 +982,155 @@ bool LowOverheadLoop::ValidateLiveOuts() {
LLVM_DEBUG(dbgs() << "ARM Loops: Found an unknown def of : "
<< TRI.getRegAsmName(MO.getReg()) << " at " << *MI);
NonPredicated.insert(MI);
- continue;
+ break;
}
}
// Any unknown false lanes have been masked away by the user(s).
- Predicated.insert(MI);
+ if (!NonPredicated.contains(MI))
+ Predicated.insert(MI);
}
SmallPtrSet<MachineInstr *, 2> LiveOutMIs;
- SmallPtrSet<MachineInstr*, 2> LiveOutUsers;
SmallVector<MachineBasicBlock *, 2> ExitBlocks;
ML.getExitBlocks(ExitBlocks);
assert(ML.getNumBlocks() == 1 && "Expected single block loop!");
assert(ExitBlocks.size() == 1 && "Expected a single exit block");
MachineBasicBlock *ExitBB = ExitBlocks.front();
for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) {
+ // TODO: Instead of blocking predication, we could move the vctp to the exit
+    // block and calculate its operand there or in the preheader.
+ if (RegMask.PhysReg == ARM::VPR)
+ return false;
// Check Q-regs that are live in the exit blocks. We don't collect scalars
// because they won't be affected by lane predication.
- if (QPRs->contains(RegMask.PhysReg)) {
+ if (QPRs->contains(RegMask.PhysReg))
if (auto *MI = RDA.getLocalLiveOutMIDef(Header, RegMask.PhysReg))
LiveOutMIs.insert(MI);
- RDA.getLiveInUses(ExitBB, RegMask.PhysReg, LiveOutUsers);
- }
}
- // If we have any non-predicated live-outs, they need to be part of a
- // reduction that we can fixup later. The reduction that the form of an
- // operation that uses its previous values through a vmov and then a vpsel
- // resides in the exit blocks to select the final bytes from n and n-1
- // iterations.
- if (!NonPredicated.empty() &&
- !FindValidReduction(NonPredicated, LiveOutUsers))
- return false;
-
// We've already validated that any VPT predication within the loop will be
// equivalent when we perform the predication transformation; so we know that
// any VPT predicated instruction is predicated upon VCTP. Any live-out
// instruction needs to be predicated, so check this here. The instructions
// in NonPredicated have been found to be a reduction that we can ensure its
// legality.
- for (auto *MI : LiveOutMIs)
- if (!isVectorPredicated(MI) && !NonPredicated.count(MI))
+ for (auto *MI : LiveOutMIs) {
+ if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Unable to handle live out: " << *MI);
return false;
+ }
+ }
return true;
}
-void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils) {
+void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) {
if (Revert)
return;
- if (!End->getOperand(1).isMBB())
- report_fatal_error("Expected LoopEnd to target basic block");
+ // Check branch target ranges: WLS[TP] can only branch forwards and LE[TP]
+ // can only jump back.
+ auto ValidateRanges = [](MachineInstr *Start, MachineInstr *End,
+ ARMBasicBlockUtils *BBUtils, MachineLoop &ML) {
+ MachineBasicBlock *TgtBB = End->getOpcode() == ARM::t2LoopEnd
+ ? End->getOperand(1).getMBB()
+ : End->getOperand(2).getMBB();
+    // TODO: Maybe there are cases where the target doesn't have to be the header,
+ // but for now be safe and revert.
+ if (TgtBB != ML.getHeader()) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n");
+ return false;
+ }
- // TODO Maybe there's cases where the target doesn't have to be the header,
- // but for now be safe and revert.
- if (End->getOperand(1).getMBB() != ML.getHeader()) {
- LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targetting header.\n");
- Revert = true;
- return;
- }
+ // The WLS and LE instructions have 12-bits for the label offset. WLS
+ // requires a positive offset, while LE uses negative.
+ if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML.getHeader()) ||
+ !BBUtils->isBBInRange(End, ML.getHeader(), 4094)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
+ return false;
+ }
- // The WLS and LE instructions have 12-bits for the label offset. WLS
- // requires a positive offset, while LE uses negative.
- if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML.getHeader()) ||
- !BBUtils->isBBInRange(End, ML.getHeader(), 4094)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
- Revert = true;
- return;
- }
+ if (Start->getOpcode() == ARM::t2WhileLoopStart &&
+ (BBUtils->getOffsetOf(Start) >
+ BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) ||
+ !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
+ return false;
+ }
+ return true;
+ };
- if (Start->getOpcode() == ARM::t2WhileLoopStart &&
- (BBUtils->getOffsetOf(Start) >
- BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) ||
- !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) {
- LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
- Revert = true;
- return;
- }
+ // Find a suitable position to insert the loop start instruction. It needs to
+ // be able to safely define LR.
+ auto FindStartInsertionPoint = [](MachineInstr *Start, MachineInstr *Dec,
+ MachineBasicBlock::iterator &InsertPt,
+ MachineBasicBlock *&InsertBB,
+ ReachingDefAnalysis &RDA,
+ InstSet &ToRemove) {
+ // For a t2DoLoopStart it is always valid to use the start insertion point.
+ // For WLS we can define LR if LR already contains the same value.
+ if (isDo(Start) || Start->getOperand(0).getReg() == ARM::LR) {
+ InsertPt = MachineBasicBlock::iterator(Start);
+ InsertBB = Start->getParent();
+ return true;
+ }
+
+ // We've found no suitable LR def and Start doesn't use LR directly. Can we
+ // just define LR anyway?
+ if (!RDA.isSafeToDefRegAt(Start, MCRegister::from(ARM::LR)))
+ return false;
+
+ InsertPt = MachineBasicBlock::iterator(Start);
+ InsertBB = Start->getParent();
+ return true;
+ };
- InsertPt = Revert ? nullptr : isSafeToDefineLR();
- if (!InsertPt) {
+ if (!FindStartInsertionPoint(Start, Dec, StartInsertPt, StartInsertBB, RDA,
+ ToRemove)) {
LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n");
Revert = true;
return;
- } else
- LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt);
+ }
+ LLVM_DEBUG(if (StartInsertPt == StartInsertBB->end())
+ dbgs() << "ARM Loops: Will insert LoopStart at end of block\n";
+ else
+ dbgs() << "ARM Loops: Will insert LoopStart at "
+ << *StartInsertPt
+ );
- if (!IsTailPredicationLegal()) {
- LLVM_DEBUG(if (!VCTP)
- dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n";
- dbgs() << "ARM Loops: Tail-predication is not valid.\n");
- return;
+ Revert = !ValidateRanges(Start, End, BBUtils, ML);
+ CannotTailPredicate = !ValidateTailPredicate();
+}
+
+bool LowOverheadLoop::AddVCTP(MachineInstr *MI) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI);
+ if (VCTPs.empty()) {
+ VCTPs.push_back(MI);
+ return true;
}
- assert(ML.getBlocks().size() == 1 &&
- "Shouldn't be processing a loop with more than one block");
- CannotTailPredicate = !ValidateTailPredicate(InsertPt);
- LLVM_DEBUG(if (CannotTailPredicate)
- dbgs() << "ARM Loops: Couldn't validate tail predicate.\n");
+ // If we find another VCTP, check whether it uses the same value as the main VCTP.
+ // If it does, store it in the VCTPs set, else refuse it.
+ MachineInstr *Prev = VCTPs.back();
+ if (!Prev->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
+ !RDA.hasSameReachingDef(Prev, MI, MI->getOperand(1).getReg().asMCReg())) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
+ "definition from the main VCTP");
+ return false;
+ }
+ VCTPs.push_back(MI);
+ return true;
}
bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
if (CannotTailPredicate)
return false;
- if (isVCTP(MI)) {
- // If we find another VCTP, check whether it uses the same value as the main VCTP.
- // If it does, store it in the SecondaryVCTPs set, else refuse it.
- if (VCTP) {
- if (!VCTP->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
- !RDA.hasSameReachingDef(VCTP, MI, MI->getOperand(1).getReg())) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
- "definition from the main VCTP");
- return false;
- }
- LLVM_DEBUG(dbgs() << "ARM Loops: Found secondary VCTP: " << *MI);
- SecondaryVCTPs.insert(MI);
- } else {
- LLVM_DEBUG(dbgs() << "ARM Loops: Found 'main' VCTP: " << *MI);
- VCTP = MI;
- }
- } else if (isVPTOpcode(MI->getOpcode())) {
- if (MI->getOpcode() != ARM::MVE_VPST) {
- assert(MI->findRegisterDefOperandIdx(ARM::VPR) != -1 &&
- "VPT does not implicitly define VPR?!");
- CurrentPredicate.insert(MI);
- }
-
- VPTBlocks.emplace_back(MI, CurrentPredicate);
- CurrentBlock = &VPTBlocks.back();
+ if (!shouldInspect(*MI))
return true;
- } else if (MI->getOpcode() == ARM::MVE_VPSEL ||
- MI->getOpcode() == ARM::MVE_VPNOT) {
+
+ if (MI->getOpcode() == ARM::MVE_VPSEL ||
+ MI->getOpcode() == ARM::MVE_VPNOT) {
// TODO: Allow VPSEL and VPNOT, we currently cannot because:
// 1) It will use the VPR as a predicate operand, but doesn't have to be
// instead a VPT block, which means we can assert while building up
@@ -984,49 +1142,62 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
return false;
}
- bool IsUse = false;
- bool IsDef = false;
+ // Record all VCTPs and check that they're equivalent to one another.
+ if (isVCTP(MI) && !AddVCTP(MI))
+ return false;
+
+ // Inspect uses first so that any instructions that alter the VPR don't
+ // alter the predicate upon themselves.
const MCInstrDesc &MCID = MI->getDesc();
- for (int i = MI->getNumOperands() - 1; i >= 0; --i) {
- const MachineOperand &MO = MI->getOperand(i);
- if (!MO.isReg() || MO.getReg() != ARM::VPR)
+ bool IsUse = false;
+ unsigned LastOpIdx = MI->getNumOperands() - 1;
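+  // Walk the declared operands in reverse, looking only at VPR uses: a vpred
+  // operand means this instruction is predicated and joins the current VPT
+  // block; any other VPR use is only acceptable on a VPST.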
+ for (auto &Op : enumerate(reverse(MCID.operands()))) {
+ const MachineOperand &MO = MI->getOperand(LastOpIdx - Op.index());
+ if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR)
continue;
- if (MO.isDef()) {
- CurrentPredicate.insert(MI);
- IsDef = true;
- } else if (ARM::isVpred(MCID.OpInfo[i].OperandType)) {
- CurrentBlock->addInst(MI, CurrentPredicate);
+ if (ARM::isVpred(Op.value().OperandType)) {
+ VPTState::addInst(MI);
IsUse = true;
- } else {
+ } else if (MI->getOpcode() != ARM::MVE_VPST) {
LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
return false;
}
}
- // If we find a vpr def that is not already predicated on the vctp, we've
- // got disjoint predicates that may not be equivalent when we do the
- // conversion.
- if (IsDef && !IsUse && VCTP && !isVCTP(MI)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Found disjoint vpr def: " << *MI);
- return false;
- }
-
- uint64_t Flags = MCID.TSFlags;
- if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE)
- return true;
-
// If we find an instruction that has been marked as not valid for tail
// predication, only allow the instruction if it's contained within a valid
// VPT block.
- if ((Flags & ARMII::ValidForTailPredication) == 0 && !IsUse) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Can't tail predicate: " << *MI);
- return false;
+ bool RequiresExplicitPredication =
+ (MCID.TSFlags & ARMII::ValidForTailPredication) == 0;
+ if (isDomainMVE(MI) && RequiresExplicitPredication) {
+ LLVM_DEBUG(if (!IsUse)
+ dbgs() << "ARM Loops: Can't tail predicate: " << *MI);
+ return IsUse;
}
// If the instruction is already explicitly predicated, then the conversion
- // will be fine, but ensure that all memory operations are predicated.
- return !IsUse && MI->mayLoadOrStore() ? false : true;
+ // will be fine, but ensure that all store operations are predicated.
+ if (MI->mayStore())
+ return IsUse;
+
+  // If this instruction defines the VPR, update the predicate for the
+  // following instructions.
+ if (isVectorPredicate(MI)) {
+ // Clear the existing predicate when we're not in VPT Active state,
+ // otherwise we add to it.
+ if (!isVectorPredicated(MI))
+ VPTState::resetPredicate(MI);
+ else
+ VPTState::addPredicate(MI);
+ }
+
+  // Finally, once the predicate has been modified, we can start a new VPT
+ // block if necessary.
+ if (isVPTOpcode(MI->getOpcode()))
+ VPTState::CreateVPTBlock(MI);
+
+ return true;
}
bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
@@ -1049,7 +1220,7 @@ bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
bool Changed = false;
for (auto ML : *MLI) {
- if (!ML->getParentLoop())
+ if (ML->isOutermost())
Changed |= ProcessLoop(ML);
}
Changed |= RevertNonLoops();
@@ -1108,6 +1279,8 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
LoLoop.Dec = &MI;
else if (MI.getOpcode() == ARM::t2LoopEnd)
LoLoop.End = &MI;
+ else if (MI.getOpcode() == ARM::t2LoopEndDec)
+ LoLoop.End = LoLoop.Dec = &MI;
else if (isLoopStart(MI))
LoLoop.Start = &MI;
else if (MI.getDesc().isCall()) {
@@ -1130,15 +1303,18 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
return false;
}
- // Check that the only instruction using LoopDec is LoopEnd.
+ // Check that the only instruction using LoopDec is LoopEnd. This can only
+ // happen when the Dec and End are separate, not a single t2LoopEndDec.
// TODO: Check for copy chains that really have no effect.
- SmallPtrSet<MachineInstr*, 2> Uses;
- RDA->getReachingLocalUses(LoLoop.Dec, ARM::LR, Uses);
- if (Uses.size() > 1 || !Uses.count(LoLoop.End)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n");
- LoLoop.Revert = true;
+ if (LoLoop.Dec != LoLoop.End) {
+ SmallPtrSet<MachineInstr *, 2> Uses;
+ RDA->getReachingLocalUses(LoLoop.Dec, MCRegister::from(ARM::LR), Uses);
+ if (Uses.size() > 1 || !Uses.count(LoLoop.End)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n");
+ LoLoop.Revert = true;
+ }
}
- LoLoop.CheckLegality(BBUtils.get());
+ LoLoop.Validate(BBUtils.get());
Expand(LoLoop);
return true;
}
@@ -1149,23 +1325,16 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
// another low register.
void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
- MachineBasicBlock *MBB = MI->getParent();
- MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
- TII->get(ARM::t2CMPri));
- MIB.add(MI->getOperand(0));
- MIB.addImm(0);
- MIB.addImm(ARMCC::AL);
- MIB.addReg(ARM::NoRegister);
-
MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
ARM::tBcc : ARM::t2Bcc;
- MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
- MIB.add(MI->getOperand(1)); // branch target
- MIB.addImm(ARMCC::EQ); // condition code
- MIB.addReg(ARM::CPSR);
- MI->eraseFromParent();
+ RevertWhileLoopStart(MI, TII, BrOpc);
+}
+
+void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI);
+ RevertDoLoopStart(MI, TII);
}
bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {
@@ -1180,23 +1349,10 @@ bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {
}
// If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
- bool SetFlags = RDA->isSafeToDefRegAt(MI, ARM::CPSR, Ignore);
-
- MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
- TII->get(ARM::t2SUBri));
- MIB.addDef(ARM::LR);
- MIB.add(MI->getOperand(1));
- MIB.add(MI->getOperand(2));
- MIB.addImm(ARMCC::AL);
- MIB.addReg(0);
-
- if (SetFlags) {
- MIB.addReg(ARM::CPSR);
- MIB->getOperand(5).setIsDef(true);
- } else
- MIB.addReg(0);
+ bool SetFlags =
+ RDA->isSafeToDefRegAt(MI, MCRegister::from(ARM::CPSR), Ignore);
- MI->eraseFromParent();
+ llvm::RevertLoopDec(MI, TII, SetFlags);
return SetFlags;
}
@@ -1204,27 +1360,39 @@ bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {
void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {
LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);
- MachineBasicBlock *MBB = MI->getParent();
- // Create cmp
- if (!SkipCmp) {
- MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
- TII->get(ARM::t2CMPri));
- MIB.addReg(ARM::LR);
- MIB.addImm(0);
- MIB.addImm(ARMCC::AL);
- MIB.addReg(ARM::NoRegister);
- }
-
MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
ARM::tBcc : ARM::t2Bcc;
- // Create bne
+ llvm::RevertLoopEnd(MI, TII, BrOpc, SkipCmp);
+}
+
+// Generate a subs, or sub and cmp, and a branch instead of an LE.
+void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to subs, br: " << *MI);
+ assert(MI->getOpcode() == ARM::t2LoopEndDec && "Expected a t2LoopEndDec!");
+ MachineBasicBlock *MBB = MI->getParent();
+
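+  // Build 'subs lr, <count>, #1': the trailing CPSR operand is marked as a def
+  // below so that the flags feed the conditional branch.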
MachineInstrBuilder MIB =
- BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
- MIB.add(MI->getOperand(1)); // branch target
- MIB.addImm(ARMCC::NE); // condition code
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
+ MIB.addDef(ARM::LR);
+ MIB.add(MI->getOperand(1));
+ MIB.addImm(1);
+ MIB.addImm(ARMCC::AL);
+ MIB.addReg(ARM::NoRegister);
+ MIB.addReg(ARM::CPSR);
+ MIB->getOperand(5).setIsDef(true);
+
+ MachineBasicBlock *DestBB = MI->getOperand(2).getMBB();
+ unsigned BrOpc =
+ BBUtils->isBBInRange(MI, DestBB, 254) ? ARM::tBcc : ARM::t2Bcc;
+
+ // Create bne
+ MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
+ MIB.add(MI->getOperand(2)); // branch target
+ MIB.addImm(ARMCC::NE); // condition code
MIB.addReg(ARM::CPSR);
+
MI->eraseFromParent();
}
@@ -1235,7 +1403,7 @@ void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {
//
// $lr = big-itercount-expression
// ..
-// t2DoLoopStart renamable $lr
+// $lr = t2DoLoopStart renamable $lr
// vector.body:
// ..
// $vpr = MVE_VCTP32 renamable $r3
@@ -1258,7 +1426,8 @@ void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) {
LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n");
- MachineInstr *Def = RDA->getMIOperand(LoLoop.Start, 0);
+ MachineInstr *Def =
+ RDA->getMIOperand(LoLoop.Start, isDo(LoLoop.Start) ? 1 : 0);
if (!Def) {
LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n");
return;
@@ -1266,56 +1435,9 @@ void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) {
// Collect and remove the users of iteration count.
SmallPtrSet<MachineInstr*, 4> Killed = { LoLoop.Start, LoLoop.Dec,
- LoLoop.End, LoLoop.InsertPt };
- SmallPtrSet<MachineInstr*, 2> Remove;
- if (RDA->isSafeToRemove(Def, Remove, Killed))
- LoLoop.ToRemove.insert(Remove.begin(), Remove.end());
- else {
+ LoLoop.End };
+ if (!TryRemove(Def, *RDA, LoLoop.ToRemove, Killed))
LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n");
- return;
- }
-
- // Collect the dead code and the MBBs in which they reside.
- RDA->collectKilledOperands(Def, Killed);
- SmallPtrSet<MachineBasicBlock*, 2> BasicBlocks;
- for (auto *MI : Killed)
- BasicBlocks.insert(MI->getParent());
-
- // Collect IT blocks in all affected basic blocks.
- std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks;
- for (auto *MBB : BasicBlocks) {
- for (auto &MI : *MBB) {
- if (MI.getOpcode() != ARM::t2IT)
- continue;
- RDA->getReachingLocalUses(&MI, ARM::ITSTATE, ITBlocks[&MI]);
- }
- }
-
- // If we're removing all of the instructions within an IT block, then
- // also remove the IT instruction.
- SmallPtrSet<MachineInstr*, 2> ModifiedITs;
- for (auto *MI : Killed) {
- if (MachineOperand *MO = MI->findRegisterUseOperand(ARM::ITSTATE)) {
- MachineInstr *IT = RDA->getMIOperand(MI, *MO);
- auto &CurrentBlock = ITBlocks[IT];
- CurrentBlock.erase(MI);
- if (CurrentBlock.empty())
- ModifiedITs.erase(IT);
- else
- ModifiedITs.insert(IT);
- }
- }
-
- // Delete the killed instructions only if we don't have any IT blocks that
- // need to be modified because we need to fixup the mask.
- // TODO: Handle cases where IT blocks are modified.
- if (ModifiedITs.empty()) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Will remove iteration count:\n";
- for (auto *MI : Killed)
- dbgs() << " - " << *MI);
- LoLoop.ToRemove.insert(Killed.begin(), Killed.end());
- } else
- LLVM_DEBUG(dbgs() << "ARM Loops: Would need to modify IT block(s).\n");
}
MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
@@ -1324,84 +1446,25 @@ MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
// calculate the number of loop iterations.
IterationCountDCE(LoLoop);
- MachineInstr *InsertPt = LoLoop.InsertPt;
+ MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt;
MachineInstr *Start = LoLoop.Start;
- MachineBasicBlock *MBB = InsertPt->getParent();
- bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart;
+ MachineBasicBlock *MBB = LoLoop.StartInsertBB;
unsigned Opc = LoLoop.getStartOpcode();
- MachineOperand &Count = LoLoop.getCount();
+ MachineOperand &Count = LoLoop.getLoopStartOperand();
MachineInstrBuilder MIB =
- BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc));
+ BuildMI(*MBB, InsertPt, Start->getDebugLoc(), TII->get(Opc));
MIB.addDef(ARM::LR);
MIB.add(Count);
- if (!IsDo)
+ if (!isDo(Start))
MIB.add(Start->getOperand(1));
- // If we're inserting at a mov lr, then remove it as it's redundant.
- if (InsertPt != Start)
- LoLoop.ToRemove.insert(InsertPt);
LoLoop.ToRemove.insert(Start);
LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
return &*MIB;
}
-void ARMLowOverheadLoops::FixupReductions(LowOverheadLoop &LoLoop) const {
- LLVM_DEBUG(dbgs() << "ARM Loops: Fixing up reduction(s).\n");
- auto BuildMov = [this](MachineInstr &InsertPt, Register To, Register From) {
- MachineBasicBlock *MBB = InsertPt.getParent();
- MachineInstrBuilder MIB =
- BuildMI(*MBB, &InsertPt, InsertPt.getDebugLoc(), TII->get(ARM::MVE_VORR));
- MIB.addDef(To);
- MIB.addReg(From);
- MIB.addReg(From);
- MIB.addImm(0);
- MIB.addReg(0);
- MIB.addReg(To);
- LLVM_DEBUG(dbgs() << "ARM Loops: Inserted VMOV: " << *MIB);
- };
-
- for (auto &Reduction : LoLoop.Reductions) {
- MachineInstr &Copy = Reduction->Copy;
- MachineInstr &Reduce = Reduction->Reduce;
- Register DestReg = Copy.getOperand(0).getReg();
-
- // Change the initialiser if present
- if (Reduction->Init) {
- MachineInstr *Init = Reduction->Init;
-
- for (unsigned i = 0; i < Init->getNumOperands(); ++i) {
- MachineOperand &MO = Init->getOperand(i);
- if (MO.isReg() && MO.isUse() && MO.isTied() &&
- Init->findTiedOperandIdx(i) == 0)
- Init->getOperand(i).setReg(DestReg);
- }
- Init->getOperand(0).setReg(DestReg);
- LLVM_DEBUG(dbgs() << "ARM Loops: Changed init regs: " << *Init);
- } else
- BuildMov(LoLoop.Preheader->instr_back(), DestReg, Copy.getOperand(1).getReg());
-
- // Change the reducing op to write to the register that is used to copy
- // its value on the next iteration. Also update the tied-def operand.
- Reduce.getOperand(0).setReg(DestReg);
- Reduce.getOperand(5).setReg(DestReg);
- LLVM_DEBUG(dbgs() << "ARM Loops: Changed reduction regs: " << Reduce);
-
- // Instead of a vpsel, just copy the register into the necessary one.
- MachineInstr &VPSEL = Reduction->VPSEL;
- if (VPSEL.getOperand(0).getReg() != DestReg)
- BuildMov(VPSEL, VPSEL.getOperand(0).getReg(), DestReg);
-
- // Remove the unnecessary instructions.
- LLVM_DEBUG(dbgs() << "ARM Loops: Removing:\n"
- << " - " << Copy
- << " - " << VPSEL << "\n");
- Copy.eraseFromParent();
- VPSEL.eraseFromParent();
- }
-}
-
void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
auto RemovePredicate = [](MachineInstr *MI) {
LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI);
@@ -1414,23 +1477,39 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
llvm_unreachable("trying to unpredicate a non-predicated instruction");
};
- // There are a few scenarios which we have to fix up:
- // 1. VPT Blocks with non-uniform predicates:
- // - a. When the divergent instruction is a vctp
- // - b. When the block uses a vpst, and is only predicated on the vctp
- // - c. When the block uses a vpt and (optionally) contains one or more
- // vctp.
- // 2. VPT Blocks with uniform predicates:
- // - a. The block uses a vpst, and is only predicated on the vctp
for (auto &Block : LoLoop.getVPTBlocks()) {
- SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
- if (Block.HasNonUniformPredicate()) {
- PredicatedMI *Divergent = Block.getDivergent();
- if (isVCTP(Divergent->MI)) {
- // The vctp will be removed, so the block mask of the vp(s)t will need
- // to be recomputed.
- LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen());
- } else if (Block.isVPST() && Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
+ SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
+
+ auto ReplaceVCMPWithVPT = [&](MachineInstr *&TheVCMP, MachineInstr *At) {
+ assert(TheVCMP && "Replacing a removed or non-existent VCMP");
+ // Replace the VCMP with a VPT
+ MachineInstrBuilder MIB =
+ BuildMI(*At->getParent(), At, At->getDebugLoc(),
+ TII->get(VCMPOpcodeToVPT(TheVCMP->getOpcode())));
+ MIB.addImm(ARMVCC::Then);
+ // Register one
+ MIB.add(TheVCMP->getOperand(1));
+ // Register two
+ MIB.add(TheVCMP->getOperand(2));
+ // The comparison code, e.g. ge, eq, lt
+ MIB.add(TheVCMP->getOperand(3));
+ LLVM_DEBUG(dbgs() << "ARM Loops: Combining with VCMP to VPT: " << *MIB);
+ LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
+ LoLoop.ToRemove.insert(TheVCMP);
+ TheVCMP = nullptr;
+ };
+
+ if (VPTState::isEntryPredicatedOnVCTP(Block, /*exclusive*/ true)) {
+ MachineInstr *VPST = Insts.front();
+ if (VPTState::hasUniformPredicate(Block)) {
+      // A vpt block starting with VPST is only predicated upon vctp and has no
+ // internal vpr defs:
+ // - Remove vpst.
+ // - Unpredicate the remaining instructions.
+ LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
+ for (unsigned i = 1; i < Insts.size(); ++i)
+ RemovePredicate(Insts[i]);
+ } else {
// The VPT block has a non-uniform predicate but it uses a vpst and its
// entry is guarded only by a vctp, which means we:
// - Need to remove the original vpst.
@@ -1438,73 +1517,83 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
// we come across the divergent vpr def.
       // - Insert a new vpst to predicate the instruction(s) that follow
// the divergent vpr def.
- // TODO: We could be producing more VPT blocks than necessary and could
- // fold the newly created one into a proceeding one.
- for (auto I = ++MachineBasicBlock::iterator(Block.getPredicateThen()),
- E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I)
+ MachineInstr *Divergent = VPTState::getDivergent(Block);
+ auto DivergentNext = ++MachineBasicBlock::iterator(Divergent);
+ bool DivergentNextIsPredicated =
+ getVPTInstrPredicate(*DivergentNext) != ARMVCC::None;
+
+ for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext;
+ I != E; ++I)
RemovePredicate(&*I);
- unsigned Size = 0;
- auto E = MachineBasicBlock::reverse_iterator(Divergent->MI);
- auto I = MachineBasicBlock::reverse_iterator(Insts.back().MI);
- MachineInstr *InsertAt = nullptr;
- while (I != E) {
- InsertAt = &*I;
- ++Size;
- ++I;
+      // Check if the instruction defining vpr is a vcmp so it can be combined
+      // with the VPST. This should be the divergent instruction.
+ MachineInstr *VCMP =
+ VCMPOpcodeToVPT(Divergent->getOpcode()) != 0 ? Divergent : nullptr;
+
+ if (DivergentNextIsPredicated) {
+ // Insert a VPST at the divergent only if the next instruction
+ // would actually use it. A VCMP following a VPST can be
+ // merged into a VPT so do that instead if the VCMP exists.
+ if (!VCMP) {
+ // Create a VPST (with a null mask for now, we'll recompute it
+ // later)
+ MachineInstrBuilder MIB =
+ BuildMI(*Divergent->getParent(), Divergent,
+ Divergent->getDebugLoc(), TII->get(ARM::MVE_VPST));
+ MIB.addImm(0);
+ LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
+ LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
+ } else {
+ // No RDA checks are necessary here since the VPST would have been
+ // directly after the VCMP
+ ReplaceVCMPWithVPT(VCMP, VCMP);
+ }
}
- // Create a VPST (with a null mask for now, we'll recompute it later).
- MachineInstrBuilder MIB = BuildMI(*InsertAt->getParent(), InsertAt,
- InsertAt->getDebugLoc(),
- TII->get(ARM::MVE_VPST));
- MIB.addImm(0);
- LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getPredicateThen());
- LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
- LoLoop.ToRemove.insert(Block.getPredicateThen());
- LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
}
- // Else, if the block uses a vpt, iterate over the block, removing the
- // extra VCTPs it may contain.
- else if (Block.isVPT()) {
- bool RemovedVCTP = false;
- for (PredicatedMI &Elt : Block.getInsts()) {
- MachineInstr *MI = Elt.MI;
- if (isVCTP(MI)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *MI);
- LoLoop.ToRemove.insert(MI);
- RemovedVCTP = true;
- continue;
- }
+ LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
+ LoLoop.ToRemove.insert(VPST);
+ } else if (Block.containsVCTP()) {
+ // The vctp will be removed, so either the entire block will be dead or
+ // the block mask of the vp(s)t will need to be recomputed.
+ MachineInstr *VPST = Insts.front();
+ if (Block.size() == 2) {
+ assert(VPST->getOpcode() == ARM::MVE_VPST &&
+ "Found a VPST in an otherwise empty vpt block");
+ LoLoop.ToRemove.insert(VPST);
+ } else
+ LoLoop.BlockMasksToRecompute.insert(VPST);
+ } else if (Insts.front()->getOpcode() == ARM::MVE_VPST) {
+ // If this block starts with a VPST then attempt to merge it with the
+      // preceding un-merged VCMP into a VPT. This VCMP comes from a VPT
+      // block that no longer exists.
+ MachineInstr *VPST = Insts.front();
+ auto Next = ++MachineBasicBlock::iterator(VPST);
+ assert(getVPTInstrPredicate(*Next) != ARMVCC::None &&
+ "The instruction after a VPST must be predicated");
+ (void)Next;
+ MachineInstr *VprDef = RDA->getUniqueReachingMIDef(VPST, ARM::VPR);
+ if (VprDef && VCMPOpcodeToVPT(VprDef->getOpcode()) &&
+ !LoLoop.ToRemove.contains(VprDef)) {
+ MachineInstr *VCMP = VprDef;
+ // The VCMP and VPST can only be merged if the VCMP's operands will have
+ // the same values at the VPST.
+ // If any of the instructions between the VCMP and VPST are predicated
+ // then a different code path is expected to have merged the VCMP and
+ // VPST already.
+ if (!std::any_of(++MachineBasicBlock::iterator(VCMP),
+ MachineBasicBlock::iterator(VPST), hasVPRUse) &&
+ RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(1).getReg()) &&
+ RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(2).getReg())) {
+ ReplaceVCMPWithVPT(VCMP, VPST);
+ LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
+ LoLoop.ToRemove.insert(VPST);
}
- if (RemovedVCTP)
- LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen());
}
- } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP) && Block.isVPST()) {
- // A vpt block starting with VPST, is only predicated upon vctp and has no
- // internal vpr defs:
- // - Remove vpst.
- // - Unpredicate the remaining instructions.
- LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getPredicateThen());
- LoLoop.ToRemove.insert(Block.getPredicateThen());
- for (auto &PredMI : Insts)
- RemovePredicate(PredMI.MI);
- }
- }
- LLVM_DEBUG(dbgs() << "ARM Loops: Removing remaining VCTPs...\n");
- // Remove the "main" VCTP
- LoLoop.ToRemove.insert(LoLoop.VCTP);
- LLVM_DEBUG(dbgs() << " " << *LoLoop.VCTP);
- // Remove remaining secondary VCTPs
- for (MachineInstr *VCTP : LoLoop.SecondaryVCTPs) {
- // All VCTPs that aren't marked for removal yet should be unpredicated ones.
- // The predicated ones should have already been marked for removal when
- // visiting the VPT blocks.
- if (LoLoop.ToRemove.insert(VCTP).second) {
- assert(getVPTInstrPredicate(*VCTP) == ARMVCC::None &&
- "Removing Predicated VCTP without updating the block mask!");
- LLVM_DEBUG(dbgs() << " " << *VCTP);
}
}
+
+ LoLoop.ToRemove.insert(LoLoop.VCTPs.begin(), LoLoop.VCTPs.end());
}
void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
@@ -1518,8 +1607,9 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
TII->get(Opc));
MIB.addDef(ARM::LR);
- MIB.add(End->getOperand(0));
- MIB.add(End->getOperand(1));
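+  // t2LoopEndDec carries an extra LR def as operand 0, so its counter and
+  // target operands are shifted along by one compared to t2LoopEnd.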
+ unsigned Off = LoLoop.Dec == LoLoop.End ? 1 : 0;
+ MIB.add(End->getOperand(Off + 0));
+ MIB.add(End->getOperand(Off + 1));
LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
LoLoop.ToRemove.insert(LoLoop.Dec);
LoLoop.ToRemove.insert(End);
@@ -1547,18 +1637,18 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStart)
RevertWhile(LoLoop.Start);
else
- LoLoop.Start->eraseFromParent();
- bool FlagsAlreadySet = RevertLoopDec(LoLoop.Dec);
- RevertLoopEnd(LoLoop.End, FlagsAlreadySet);
+ RevertDo(LoLoop.Start);
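+    // A fused t2LoopEndDec reverts to a single subs and branch; otherwise the
+    // dec and end are reverted separately, skipping the cmp when the dec has
+    // already set the flags.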
+ if (LoLoop.Dec == LoLoop.End)
+ RevertLoopEndDec(LoLoop.End);
+ else
+ RevertLoopEnd(LoLoop.End, RevertLoopDec(LoLoop.Dec));
} else {
LoLoop.Start = ExpandLoopStart(LoLoop);
RemoveDeadBranch(LoLoop.Start);
LoLoop.End = ExpandLoopEnd(LoLoop);
RemoveDeadBranch(LoLoop.End);
- if (LoLoop.IsTailPredicationLegal()) {
+ if (LoLoop.IsTailPredicationLegal())
ConvertVPTBlocks(LoLoop);
- FixupReductions(LoLoop);
- }
for (auto *I : LoLoop.ToRemove) {
LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I);
I->eraseFromParent();
@@ -1595,6 +1685,7 @@ bool ARMLowOverheadLoops::RevertNonLoops() {
SmallVector<MachineInstr*, 4> Starts;
SmallVector<MachineInstr*, 4> Decs;
SmallVector<MachineInstr*, 4> Ends;
+ SmallVector<MachineInstr *, 4> EndDecs;
for (auto &I : MBB) {
if (isLoopStart(I))
@@ -1603,9 +1694,11 @@ bool ARMLowOverheadLoops::RevertNonLoops() {
Decs.push_back(&I);
else if (I.getOpcode() == ARM::t2LoopEnd)
Ends.push_back(&I);
+ else if (I.getOpcode() == ARM::t2LoopEndDec)
+ EndDecs.push_back(&I);
}
- if (Starts.empty() && Decs.empty() && Ends.empty())
+ if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty())
continue;
Changed = true;
@@ -1614,13 +1707,15 @@ bool ARMLowOverheadLoops::RevertNonLoops() {
if (Start->getOpcode() == ARM::t2WhileLoopStart)
RevertWhile(Start);
else
- Start->eraseFromParent();
+ RevertDo(Start);
}
for (auto *Dec : Decs)
RevertLoopDec(Dec);
for (auto *End : Ends)
RevertLoopEnd(End);
+ for (auto *End : EndDecs)
+ RevertLoopEndDec(End);
}
return Changed;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp
index e750649ce86c..9a7c1f541aa2 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp
@@ -22,6 +22,7 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsARM.h"
@@ -201,8 +202,7 @@ namespace {
public:
WidenedLoad(SmallVectorImpl<LoadInst*> &Lds, LoadInst *Wide)
: NewLd(Wide) {
- for (auto *I : Lds)
- Loads.push_back(I);
+ append_range(Loads, Lds);
}
LoadInst *getLoad() {
return NewLd;
@@ -374,7 +374,7 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
DepMap RAWDeps;
// Record any writes that may alias a load.
- const auto Size = LocationSize::unknown();
+ const auto Size = LocationSize::beforeOrAfterPointer();
for (auto Write : Writes) {
for (auto Read : Loads) {
MemoryLocation ReadLoc =
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td
index 1ae71be9f760..2dc097566d14 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td
@@ -77,6 +77,8 @@ def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">,
AssemblerPredicate<(all_of HasV8_5aOps), "armv8.5a">;
def HasV8_6a : Predicate<"Subtarget->hasV8_6aOps()">,
AssemblerPredicate<(all_of HasV8_6aOps), "armv8.6a">;
+def HasV8_7a : Predicate<"Subtarget->hasV8_7aOps()">,
+ AssemblerPredicate<(all_of HasV8_7aOps), "armv8.7a">;
def NoVFP : Predicate<"!Subtarget->hasVFP2Base()">;
def HasVFP2 : Predicate<"Subtarget->hasVFP2Base()">,
AssemblerPredicate<(all_of FeatureVFP2_SP), "VFP2">;
@@ -187,6 +189,9 @@ let RecomputePerFunction = 1 in {
def UseFPVMLx: Predicate<"((Subtarget->useFPVMLx() &&"
" TM.Options.AllowFPOpFusion != FPOpFusion::Fast) ||"
"Subtarget->hasMinSize())">;
+ def SLSBLRMitigation : Predicate<[{ MF->getSubtarget<ARMSubtarget>().hardenSlsBlr() }]>;
+ def NoSLSBLRMitigation : Predicate<[{ !MF->getSubtarget<ARMSubtarget>().hardenSlsBlr() }]>;
+
}
def UseMulOps : Predicate<"Subtarget->useMulOps()">;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
index f9dbfef4c113..1a7f10a13ed3 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -156,10 +156,10 @@ ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI)
"Subclass not added?");
assert(RBGPR.covers(*TRI.getRegClass(ARM::tcGPRRegClassID)) &&
"Subclass not added?");
- assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPR_and_tcGPRRegClassID)) &&
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRnoip_and_tcGPRRegClassID)) &&
"Subclass not added?");
- assert(RBGPR.covers(
- *TRI.getRegClass(ARM::tGPREven_and_tGPR_and_tcGPRRegClassID)) &&
+ assert(RBGPR.covers(*TRI.getRegClass(
+ ARM::tGPREven_and_GPRnoip_and_tcGPRRegClassID)) &&
"Subclass not added?");
assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPROdd_and_tcGPRRegClassID)) &&
"Subclass not added?");
@@ -182,10 +182,12 @@ ARMRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
switch (RC.getID()) {
case GPRRegClassID:
case GPRwithAPSRRegClassID:
+ case GPRnoipRegClassID:
case GPRnopcRegClassID:
+ case GPRnoip_and_GPRnopcRegClassID:
case rGPRRegClassID:
case GPRspRegClassID:
- case tGPR_and_tcGPRRegClassID:
+ case GPRnoip_and_tcGPRRegClassID:
case tcGPRRegClassID:
case tGPRRegClassID:
case tGPREvenRegClassID:
@@ -193,7 +195,7 @@ ARMRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
case tGPR_and_tGPREvenRegClassID:
case tGPR_and_tGPROddRegClassID:
case tGPREven_and_tcGPRRegClassID:
- case tGPREven_and_tGPR_and_tcGPRRegClassID:
+ case tGPREven_and_GPRnoip_and_tcGPRRegClassID:
case tGPROdd_and_tcGPRRegClassID:
return getRegBank(ARM::GPRRegBankID);
case HPRRegClassID:
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td
index a384b0dc757c..fe3243315d68 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td
@@ -235,6 +235,23 @@ def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12),
let DiagnosticString = "operand must be a register in range [r0, r15]";
}
+// Register set that excludes registers that are reserved for procedure calls.
+// This is used for pseudo-instructions that are actually implemented using a
+// procedure call.
+def GPRnoip : RegisterClass<"ARM", [i32], 32, (sub GPR, R12, LR)> {
+ // Allocate LR as the first CSR since it is always saved anyway.
+ // For Thumb1 mode, we don't want to allocate hi regs at all, as we don't
+ // know how to spill them. If we make our prologue/epilogue code smarter at
+ // some point, we can go back to using the above allocation orders for the
+ // Thumb1 instructions that know how to use hi regs.
+ let AltOrders = [(add GPRnoip, GPRnoip), (trunc GPRnoip, 8),
+ (add (trunc GPRnoip, 8), (shl GPRnoip, 8))];
+ let AltOrderSelect = [{
+ return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
+ }];
+ let DiagnosticString = "operand must be a register in range [r0, r14]";
+}
+
// GPRs without the PC. Some ARM instructions do not allow the PC in
// certain operand slots, particularly as the destination. Primarily
// useful for disassembly.
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSLSHardening.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSLSHardening.cpp
new file mode 100644
index 000000000000..cfcc7d5a0408
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSLSHardening.cpp
@@ -0,0 +1,416 @@
+//===- ARMSLSHardening.cpp - Harden Straight Line Misspeculation ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that inserts code to mitigate side-channel
+// vulnerabilities that may arise under straight-line misspeculation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/IndirectThunks.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/DebugLoc.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-sls-hardening"
+
+#define ARM_SLS_HARDENING_NAME "ARM sls hardening pass"
+
+namespace {
+
+class ARMSLSHardening : public MachineFunctionPass {
+public:
+ const TargetInstrInfo *TII;
+ const ARMSubtarget *ST;
+
+ static char ID;
+
+ ARMSLSHardening() : MachineFunctionPass(ID) {
+ initializeARMSLSHardeningPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ StringRef getPassName() const override { return ARM_SLS_HARDENING_NAME; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ bool hardenReturnsAndBRs(MachineBasicBlock &MBB) const;
+ bool hardenIndirectCalls(MachineBasicBlock &MBB) const;
+ MachineBasicBlock &
+ ConvertIndirectCallToIndirectJump(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator) const;
+};
+
+} // end anonymous namespace
+
+char ARMSLSHardening::ID = 0;
+
+INITIALIZE_PASS(ARMSLSHardening, "arm-sls-hardening",
+ ARM_SLS_HARDENING_NAME, false, false)
+
+static void insertSpeculationBarrier(const ARMSubtarget *ST,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL,
+ bool AlwaysUseISBDSB = false) {
+ assert(MBBI != MBB.begin() &&
+ "Must not insert SpeculationBarrierEndBB as only instruction in MBB.");
+ assert(std::prev(MBBI)->isBarrier() &&
+ "SpeculationBarrierEndBB must only follow unconditional control flow "
+ "instructions.");
+ assert(std::prev(MBBI)->isTerminator() &&
+ "SpeculationBarrierEndBB must only follow terminators.");
+ const TargetInstrInfo *TII = ST->getInstrInfo();
+ assert(ST->hasDataBarrier() || ST->hasSB());
+ bool ProduceSB = ST->hasSB() && !AlwaysUseISBDSB;
+ unsigned BarrierOpc =
+ ProduceSB ? (ST->isThumb() ? ARM::t2SpeculationBarrierSBEndBB
+ : ARM::SpeculationBarrierSBEndBB)
+ : (ST->isThumb() ? ARM::t2SpeculationBarrierISBDSBEndBB
+ : ARM::SpeculationBarrierISBDSBEndBB);
+ if (MBBI == MBB.end() || !isSpeculationBarrierEndBBOpcode(MBBI->getOpcode()))
+ BuildMI(MBB, MBBI, DL, TII->get(BarrierOpc));
+}
+
+bool ARMSLSHardening::runOnMachineFunction(MachineFunction &MF) {
+ ST = &MF.getSubtarget<ARMSubtarget>();
+ TII = MF.getSubtarget().getInstrInfo();
+
+ bool Modified = false;
+ for (auto &MBB : MF) {
+ Modified |= hardenReturnsAndBRs(MBB);
+ Modified |= hardenIndirectCalls(MBB);
+ }
+
+ return Modified;
+}
+
+bool ARMSLSHardening::hardenReturnsAndBRs(MachineBasicBlock &MBB) const {
+ if (!ST->hardenSlsRetBr())
+ return false;
+ assert(!ST->isThumb1Only());
+ bool Modified = false;
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(), E = MBB.end();
+ MachineBasicBlock::iterator NextMBBI;
+ for (; MBBI != E; MBBI = NextMBBI) {
+ MachineInstr &MI = *MBBI;
+ NextMBBI = std::next(MBBI);
+ if (isIndirectControlFlowNotComingBack(MI)) {
+ assert(MI.isTerminator());
+ assert(!TII->isPredicated(MI));
+ insertSpeculationBarrier(ST, MBB, std::next(MBBI), MI.getDebugLoc());
+ Modified = true;
+ }
+ }
+ return Modified;
+}
+
+static const char SLSBLRNamePrefix[] = "__llvm_slsblr_thunk_";
+
+static const struct ThunkNameRegMode {
+ const char* Name;
+ Register Reg;
+ bool isThumb;
+} SLSBLRThunks[] = {
+ {"__llvm_slsblr_thunk_arm_r0", ARM::R0, false},
+ {"__llvm_slsblr_thunk_arm_r1", ARM::R1, false},
+ {"__llvm_slsblr_thunk_arm_r2", ARM::R2, false},
+ {"__llvm_slsblr_thunk_arm_r3", ARM::R3, false},
+ {"__llvm_slsblr_thunk_arm_r4", ARM::R4, false},
+ {"__llvm_slsblr_thunk_arm_r5", ARM::R5, false},
+ {"__llvm_slsblr_thunk_arm_r6", ARM::R6, false},
+ {"__llvm_slsblr_thunk_arm_r7", ARM::R7, false},
+ {"__llvm_slsblr_thunk_arm_r8", ARM::R8, false},
+ {"__llvm_slsblr_thunk_arm_r9", ARM::R9, false},
+ {"__llvm_slsblr_thunk_arm_r10", ARM::R10, false},
+ {"__llvm_slsblr_thunk_arm_r11", ARM::R11, false},
+ {"__llvm_slsblr_thunk_arm_sp", ARM::SP, false},
+ {"__llvm_slsblr_thunk_arm_pc", ARM::PC, false},
+ {"__llvm_slsblr_thunk_thumb_r0", ARM::R0, true},
+ {"__llvm_slsblr_thunk_thumb_r1", ARM::R1, true},
+ {"__llvm_slsblr_thunk_thumb_r2", ARM::R2, true},
+ {"__llvm_slsblr_thunk_thumb_r3", ARM::R3, true},
+ {"__llvm_slsblr_thunk_thumb_r4", ARM::R4, true},
+ {"__llvm_slsblr_thunk_thumb_r5", ARM::R5, true},
+ {"__llvm_slsblr_thunk_thumb_r6", ARM::R6, true},
+ {"__llvm_slsblr_thunk_thumb_r7", ARM::R7, true},
+ {"__llvm_slsblr_thunk_thumb_r8", ARM::R8, true},
+ {"__llvm_slsblr_thunk_thumb_r9", ARM::R9, true},
+ {"__llvm_slsblr_thunk_thumb_r10", ARM::R10, true},
+ {"__llvm_slsblr_thunk_thumb_r11", ARM::R11, true},
+ {"__llvm_slsblr_thunk_thumb_sp", ARM::SP, true},
+ {"__llvm_slsblr_thunk_thumb_pc", ARM::PC, true},
+};
+
+namespace {
+struct SLSBLRThunkInserter : ThunkInserter<SLSBLRThunkInserter> {
+ const char *getThunkPrefix() { return SLSBLRNamePrefix; }
+ bool mayUseThunk(const MachineFunction &MF) {
+ // FIXME: This could also check if there are any indirect calls in the
+ // function to more accurately reflect if a thunk will be needed.
+ return MF.getSubtarget<ARMSubtarget>().hardenSlsBlr();
+ }
+ void insertThunks(MachineModuleInfo &MMI);
+ void populateThunk(MachineFunction &MF);
+};
+} // namespace
+
+void SLSBLRThunkInserter::insertThunks(MachineModuleInfo &MMI) {
+ // FIXME: It probably would be possible to filter which thunks to produce
+ // based on which registers are actually used in indirect calls in this
+ // function. But would that be a worthwhile optimization?
+ for (auto T : SLSBLRThunks)
+ createThunkFunction(MMI, T.Name);
+}
+
+void SLSBLRThunkInserter::populateThunk(MachineFunction &MF) {
+ // FIXME: How to better communicate Register number, rather than through
+ // name and lookup table?
+ assert(MF.getName().startswith(getThunkPrefix()));
+ auto ThunkIt = llvm::find_if(
+ SLSBLRThunks, [&MF](auto T) { return T.Name == MF.getName(); });
+ assert(ThunkIt != std::end(SLSBLRThunks));
+ Register ThunkReg = ThunkIt->Reg;
+ bool isThumb = ThunkIt->isThumb;
+
+ const TargetInstrInfo *TII = MF.getSubtarget<ARMSubtarget>().getInstrInfo();
+ MachineBasicBlock *Entry = &MF.front();
+ Entry->clear();
+
+ // These thunks need to consist of the following instructions:
+ // __llvm_slsblr_thunk_(arm/thumb)_rN:
+ // bx rN
+ // barrierInsts
+ Entry->addLiveIn(ThunkReg);
+ if (isThumb)
+ BuildMI(Entry, DebugLoc(), TII->get(ARM::tBX))
+ .addReg(ThunkReg)
+ .add(predOps(ARMCC::AL));
+ else
+ BuildMI(Entry, DebugLoc(), TII->get(ARM::BX))
+ .addReg(ThunkReg);
+
+  // Make sure the thunks do not make use of the SB extension in case some
+  // caller has the SB extension disabled locally, even though it is enabled
+  // for the rest of the module. Therefore set AlwaysUseISBDSB to true.
+ insertSpeculationBarrier(&MF.getSubtarget<ARMSubtarget>(), *Entry,
+ Entry->end(), DebugLoc(), true /*AlwaysUseISBDSB*/);
+}
+
+MachineBasicBlock &ARMSLSHardening::ConvertIndirectCallToIndirectJump(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
+ // Transform an indirect call to an indirect jump as follows:
+ // Before:
+ // |-----------------------------|
+ // | ... |
+ // | instI |
+ // | BLX rN |
+ // | instJ |
+ // | ... |
+ // |-----------------------------|
+ //
+ // After:
+  //   |--------------------------------------|
+ // | ... |
+ // | instI |
+  //   | *call* __llvm_slsblr_thunk_mode_rN   |
+ // | instJ |
+ // | ... |
+ // |--------------------------------------|
+ //
+  //   __llvm_slsblr_thunk_mode_rN:
+ // |-----------------------------|
+ // | BX rN |
+ // | barrierInsts |
+ // |-----------------------------|
+ //
+  // The __llvm_slsblr_thunk_mode_rN thunks are created by the
+  // SLSBLRThunkInserter.
+  // This function merely needs to transform an indirect call into a direct
+  // call to the matching __llvm_slsblr_thunk_mode_rN.
+ MachineInstr &IndirectCall = *MBBI;
+ assert(isIndirectCall(IndirectCall) && !IndirectCall.isReturn());
+ int RegOpIdxOnIndirectCall = -1;
+ bool isThumb;
+ switch (IndirectCall.getOpcode()) {
+ case ARM::BLX: // !isThumb2
+ case ARM::BLX_noip: // !isThumb2
+ isThumb = false;
+ RegOpIdxOnIndirectCall = 0;
+ break;
+ case ARM::tBLXr: // isThumb2
+ case ARM::tBLXr_noip: // isThumb2
+ isThumb = true;
+ RegOpIdxOnIndirectCall = 2;
+ break;
+ default:
+ llvm_unreachable("unhandled Indirect Call");
+ }
+
+ Register Reg = IndirectCall.getOperand(RegOpIdxOnIndirectCall).getReg();
+ // Since linkers are allowed to clobber R12 on function calls, the above
+ // mitigation only works if the original indirect call instruction was not
+ // using R12. Code generation before must make sure that no indirect call
+ // using R12 was produced if the mitigation is enabled.
+ // Also, the transformation is incorrect if the indirect call uses LR, so
+ // also have to avoid that.
+ assert(Reg != ARM::R12 && Reg != ARM::LR);
+ bool RegIsKilled = IndirectCall.getOperand(RegOpIdxOnIndirectCall).isKill();
+
+ DebugLoc DL = IndirectCall.getDebugLoc();
+
+ MachineFunction &MF = *MBBI->getMF();
+ auto ThunkIt = llvm::find_if(SLSBLRThunks, [Reg, isThumb](auto T) {
+ return T.Reg == Reg && T.isThumb == isThumb;
+ });
+ assert(ThunkIt != std::end(SLSBLRThunks));
+ Module *M = MF.getFunction().getParent();
+ const GlobalValue *GV = cast<GlobalValue>(M->getNamedValue(ThunkIt->Name));
+
+ MachineInstr *BL =
+ isThumb ? BuildMI(MBB, MBBI, DL, TII->get(ARM::tBL))
+ .addImm(IndirectCall.getOperand(0).getImm())
+ .addReg(IndirectCall.getOperand(1).getReg())
+ .addGlobalAddress(GV)
+ : BuildMI(MBB, MBBI, DL, TII->get(ARM::BL)).addGlobalAddress(GV);
+
+ // Now copy the implicit operands from IndirectCall to BL and copy other
+ // necessary info.
+  // However, both the IndirectCall and BL instructions implicitly use SP and
+  // implicitly define LR. Blindly copying implicit operands would result in SP
+  // and LR operands being present multiple times. While this may not be too
+ // much of an issue, let's avoid that for cleanliness, by removing those
+ // implicit operands from the BL created above before we copy over all
+ // implicit operands from the IndirectCall.
+ int ImpLROpIdx = -1;
+ int ImpSPOpIdx = -1;
+ for (unsigned OpIdx = BL->getNumExplicitOperands();
+ OpIdx < BL->getNumOperands(); OpIdx++) {
+ MachineOperand Op = BL->getOperand(OpIdx);
+ if (!Op.isReg())
+ continue;
+ if (Op.getReg() == ARM::LR && Op.isDef())
+ ImpLROpIdx = OpIdx;
+ if (Op.getReg() == ARM::SP && !Op.isDef())
+ ImpSPOpIdx = OpIdx;
+ }
+ assert(ImpLROpIdx != -1);
+ assert(ImpSPOpIdx != -1);
+ int FirstOpIdxToRemove = std::max(ImpLROpIdx, ImpSPOpIdx);
+ int SecondOpIdxToRemove = std::min(ImpLROpIdx, ImpSPOpIdx);
+ BL->RemoveOperand(FirstOpIdxToRemove);
+ BL->RemoveOperand(SecondOpIdxToRemove);
+ // Now copy over the implicit operands from the original IndirectCall
+ BL->copyImplicitOps(MF, IndirectCall);
+ MF.moveCallSiteInfo(&IndirectCall, BL);
+  // Also add the register that the IndirectCall went through as being used in
+  // the called thunk.
+ BL->addOperand(MachineOperand::CreateReg(Reg, false /*isDef*/, true /*isImp*/,
+ RegIsKilled /*isKill*/));
+  // Remove the IndirectCall instruction.
+ MBB.erase(MBBI);
+ return MBB;
+}
+
+bool ARMSLSHardening::hardenIndirectCalls(MachineBasicBlock &MBB) const {
+ if (!ST->hardenSlsBlr())
+ return false;
+ bool Modified = false;
+ MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MachineBasicBlock::iterator NextMBBI;
+ for (; MBBI != E; MBBI = NextMBBI) {
+ MachineInstr &MI = *MBBI;
+ NextMBBI = std::next(MBBI);
+ // Tail calls are both indirect calls and "returns".
+ // They are also indirect jumps, so should be handled by sls-harden-retbr,
+ // rather than sls-harden-blr.
+ if (isIndirectCall(MI) && !MI.isReturn()) {
+ ConvertIndirectCallToIndirectJump(MBB, MBBI);
+ Modified = true;
+ }
+ }
+ return Modified;
+}
+
+
+
+FunctionPass *llvm::createARMSLSHardeningPass() {
+ return new ARMSLSHardening();
+}
+
+namespace {
+class ARMIndirectThunks : public MachineFunctionPass {
+public:
+ static char ID;
+
+ ARMIndirectThunks() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override { return "ARM Indirect Thunks"; }
+
+ bool doInitialization(Module &M) override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<MachineModuleInfoWrapperPass>();
+ AU.addPreserved<MachineModuleInfoWrapperPass>();
+ }
+
+private:
+ std::tuple<SLSBLRThunkInserter> TIs;
+
+ // FIXME: When LLVM moves to C++17, these can become folds
+ template <typename... ThunkInserterT>
+ static void initTIs(Module &M,
+ std::tuple<ThunkInserterT...> &ThunkInserters) {
+ (void)std::initializer_list<int>{
+ (std::get<ThunkInserterT>(ThunkInserters).init(M), 0)...};
+ }
+ template <typename... ThunkInserterT>
+ static bool runTIs(MachineModuleInfo &MMI, MachineFunction &MF,
+ std::tuple<ThunkInserterT...> &ThunkInserters) {
+ bool Modified = false;
+ (void)std::initializer_list<int>{
+ Modified |= std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF)...};
+ return Modified;
+ }
+};
+
+} // end anonymous namespace
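As a side note on the FIXME inside ARMIndirectThunks above: the initializer_list trick is only needed because pre-C++17 there is no pack expansion over statements. Below is a minimal, self-contained sketch (not LLVM code, assuming C++17) of what the fold-based version could look like; DummyInserter and the plain int parameters are placeholders standing in for the real ThunkInserter, Module, MachineModuleInfo and MachineFunction types.

// Standalone C++17 sketch of the folds the FIXME alludes to.
#include <tuple>

struct DummyInserter {
  void init(int & /*Module*/) {}
  bool run(int & /*MMI*/, int & /*MF*/) { return false; }
};

template <typename... InserterTs>
void initAll(int &M, std::tuple<InserterTs...> &TIs) {
  // Comma fold: call init() on every inserter in the tuple, in order.
  (std::get<InserterTs>(TIs).init(M), ...);
}

template <typename... InserterTs>
bool runAll(int &MMI, int &MF, std::tuple<InserterTs...> &TIs) {
  bool Modified = false;
  // Fold over |=: record whether any inserter modified the function.
  ((Modified |= std::get<InserterTs>(TIs).run(MMI, MF)), ...);
  return Modified;
}

int main() {
  std::tuple<DummyInserter> TIs;
  int M = 0, MMI = 0, MF = 0;
  initAll(M, TIs);
  return runAll(MMI, MF, TIs) ? 1 : 0;
}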
+
+char ARMIndirectThunks::ID = 0;
+
+FunctionPass *llvm::createARMIndirectThunks() {
+ return new ARMIndirectThunks();
+}
+
+bool ARMIndirectThunks::doInitialization(Module &M) {
+ initTIs(M, TIs);
+ return false;
+}
+
+bool ARMIndirectThunks::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << getPassName() << '\n');
+ auto &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+ return runTIs(MMI, MF, TIs);
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSchedule.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSchedule.td
index ce74d325c4e5..53a2a6fec51e 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSchedule.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSchedule.td
@@ -151,7 +151,60 @@ def : PredicateProlog<[{
(void)STI;
}]>;
-def IsPredicatedPred : SchedPredicate<[{TII->isPredicated(*MI)}]>;
+def IsPredicated : CheckFunctionPredicateWithTII<
+ "ARM_MC::isPredicated",
+ "isPredicated"
+>;
+def IsPredicatedPred : MCSchedPredicate<IsPredicated>;
+
+def IsCPSRDefined : CheckFunctionPredicateWithTII<
+ "ARM_MC::isCPSRDefined",
+ "ARMBaseInstrInfo::isCPSRDefined"
+>;
+
+def IsCPSRDefinedPred : MCSchedPredicate<IsCPSRDefined>;
+
+let FunctionMapper = "ARM_AM::getAM2ShiftOpc" in {
+ class CheckAM2NoShift<int n> : CheckImmOperand_s<n, "ARM_AM::no_shift">;
+ class CheckAM2ShiftLSL<int n> : CheckImmOperand_s<n, "ARM_AM::lsl">;
+}
+
+let FunctionMapper = "ARM_AM::getAM2Op" in {
+ class CheckAM2OpAdd<int n> : CheckImmOperand_s<n, "ARM_AM::add"> {}
+ class CheckAM2OpSub<int n> : CheckImmOperand_s<n, "ARM_AM::sub"> {}
+}
+
+let FunctionMapper = "ARM_AM::getAM2Offset" in {
+ class CheckAM2Offset<int n, int of> : CheckImmOperand<n, of> {}
+}
+
+def IsLDMBaseRegInList : CheckFunctionPredicate<
+ "ARM_MC::isLDMBaseRegInList", "ARM_MC::isLDMBaseRegInList"
+>;
+
+let FunctionMapper = "ARM_AM::getAM3Op" in {
+ class CheckAM3OpSub<int n> : CheckImmOperand_s<n, "ARM_AM::sub"> {}
+}
+
+// LDM, base reg in list
+def IsLDMBaseRegInListPred : MCSchedPredicate<IsLDMBaseRegInList>;
+
+class IsRegPCPred<int n> : MCSchedPredicate<CheckRegOperand<n, PC>>;
+
+class BranchWriteRes<int lat, int uops, list<ProcResourceKind> resl,
+ list<int> rcl, SchedWriteRes wr> :
+ SchedWriteRes<!listconcat(wr.ProcResources, resl)> {
+ let Latency = !add(wr.Latency, lat);
+ let ResourceCycles = !listconcat(wr.ResourceCycles, rcl);
+ let NumMicroOps = !add(wr.NumMicroOps, uops);
+ SchedWriteRes BaseWr = wr;
+}
+
+class CheckBranchForm<int n, BranchWriteRes br> :
+ SchedWriteVariant<[
+ SchedVar<IsRegPCPred<n>, [br]>,
+ SchedVar<NoSchedPred, [br.BaseWr]>
+ ]>;
//===----------------------------------------------------------------------===//
// Instruction Itinerary classes used for ARM
@@ -414,14 +467,3 @@ def IIC_VTBX2 : InstrItinClass;
def IIC_VTBX3 : InstrItinClass;
def IIC_VTBX4 : InstrItinClass;
def IIC_VDOTPROD : InstrItinClass;
-
-//===----------------------------------------------------------------------===//
-// Processor instruction itineraries.
-
-include "ARMScheduleV6.td"
-include "ARMScheduleA8.td"
-include "ARMScheduleA9.td"
-include "ARMScheduleSwift.td"
-include "ARMScheduleR52.td"
-include "ARMScheduleA57.td"
-include "ARMScheduleM4.td"
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA57.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA57.td
index d9a8d304c41f..0c610a4839f8 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA57.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA57.td
@@ -21,59 +21,47 @@
// Therefore, IssueWidth is set to the narrower of the two at three, while still
// modeling the machine as out-of-order.
-def IsCPSRDefinedPred : SchedPredicate<[{TII->isCPSRDefined(*MI)}]>;
+def IsCPSRDefinedAndPredicated : CheckAll<[IsCPSRDefined, IsPredicated]>;
def IsCPSRDefinedAndPredicatedPred :
- SchedPredicate<[{TII->isCPSRDefined(*MI) && TII->isPredicated(*MI)}]>;
+ MCSchedPredicate<IsCPSRDefinedAndPredicated>;
// Cortex A57 rev. r1p0 or later (false = r0px)
-def IsR1P0AndLaterPred : SchedPredicate<[{false}]>;
+def IsR1P0AndLaterPred : MCSchedPredicate<FalsePred>;
-// If Addrmode3 contains register offset (not immediate)
-def IsLdrAm3RegOffPred :
- SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 1)}]>;
-// The same predicate with operand offset 2 and 3:
-def IsLdrAm3RegOffPredX2 :
- SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 2)}]>;
-def IsLdrAm3RegOffPredX3 :
- SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 3)}]>;
+def IsLdrAm3RegOffPred : MCSchedPredicate<CheckInvalidRegOperand<2>>;
+def IsLdrAm3RegOffPredX2 : MCSchedPredicate<CheckInvalidRegOperand<3>>;
+def IsLdrAm3RegOffPredX3 : MCSchedPredicate<CheckInvalidRegOperand<4>>;
// If Addrmode3 contains "minus register"
-def IsLdrAm3NegRegOffPred :
- SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 1)}]>;
-// The same predicate with operand offset 2 and 3:
-def IsLdrAm3NegRegOffPredX2 :
- SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 2)}]>;
-def IsLdrAm3NegRegOffPredX3 :
- SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 3)}]>;
+class Am3NegativeRegOffset<int n> : MCSchedPredicate<CheckAll<[
+ CheckValidRegOperand<n>,
+ CheckAM3OpSub<!add(n, 1)>]>>;
+
+def IsLdrAm3NegRegOffPred : Am3NegativeRegOffset<2>;
+def IsLdrAm3NegRegOffPredX2 : Am3NegativeRegOffset<3>;
+def IsLdrAm3NegRegOffPredX3 : Am3NegativeRegOffset<4>;
// Load, scaled register offset, not plus LSL2
-def IsLdstsoScaledNotOptimalPredX0 :
- SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 0)}]>;
-def IsLdstsoScaledNotOptimalPred :
- SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 1)}]>;
-def IsLdstsoScaledNotOptimalPredX2 :
- SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 2)}]>;
-
-// Load, scaled register offset
-def IsLdstsoScaledPred :
- SchedPredicate<[{TII->isLdstScaledReg(*MI, 1)}]>;
-def IsLdstsoScaledPredX2 :
- SchedPredicate<[{TII->isLdstScaledReg(*MI, 2)}]>;
-
-def IsLdstsoMinusRegPredX0 :
- SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 0)}]>;
-def IsLdstsoMinusRegPred :
- SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 1)}]>;
-def IsLdstsoMinusRegPredX2 :
- SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 2)}]>;
-
-// Load, scaled register offset
-def IsLdrAm2ScaledPred :
- SchedPredicate<[{TII->isAm2ScaledReg(*MI, 1)}]>;
-
-// LDM, base reg in list
-def IsLdmBaseRegInList :
- SchedPredicate<[{TII->isLDMBaseRegInList(*MI)}]>;
+class ScaledRegNotPlusLsl2<int n> : CheckNot<
+ CheckAny<[
+ CheckAM2NoShift<n>,
+ CheckAll<[
+ CheckAM2OpAdd<n>,
+ CheckAM2ShiftLSL<n>,
+ CheckAM2Offset<n, 2>
+ ]>
+ ]>
+ >;
+
+def IsLdstsoScaledNotOptimalPredX0 : MCSchedPredicate<ScaledRegNotPlusLsl2<2>>;
+def IsLdstsoScaledNotOptimalPred : MCSchedPredicate<ScaledRegNotPlusLsl2<3>>;
+def IsLdstsoScaledNotOptimalPredX2 : MCSchedPredicate<ScaledRegNotPlusLsl2<4>>;
+
+def IsLdstsoScaledPredX2 : MCSchedPredicate<CheckNot<CheckAM2NoShift<4>>>;
+
+def IsLdstsoMinusRegPredX0 : MCSchedPredicate<CheckAM2OpSub<2>>;
+def IsLdstsoMinusRegPred : MCSchedPredicate<CheckAM2OpSub<3>>;
+def IsLdstsoMinusRegPredX2 : MCSchedPredicate<CheckAM2OpSub<4>>;
class A57WriteLMOpsListType<list<SchedWriteRes> writes> {
list <SchedWriteRes> Writes = writes;
@@ -185,28 +173,29 @@ def : InstRW<[A57Write_6cyc_1B_1L], (instregex "BR_JTm")>;
def : InstRW<[A57Write_1cyc_1I], (instregex "tADDframe")>;
+// Check branch forms of ALU ops:
+// check register operand 0 for PC; if so, add 2 cycles to the latency, 1 uop
+// and 1 resource cycle for A57UnitB.
+class A57BranchForm<SchedWriteRes non_br> :
+ BranchWriteRes<2, 1, [A57UnitB], [1], non_br>;
+
// shift by register, conditional or unconditional
// TODO: according to the doc, conditional uses I0/I1, unconditional uses M
// Why does the more complex instruction use the simpler pipeline?
// May be an error in doc.
-def A57WriteALUsi : SchedWriteVariant<[
- // lsl #2, lsl #1, or lsr #1.
- SchedVar<IsPredicatedPred, [A57Write_2cyc_1M]>,
- SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
-]>;
def A57WriteALUsr : SchedWriteVariant<[
- SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
- SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
+ SchedVar<IsPredicatedPred, [CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1I>>]>,
+ SchedVar<NoSchedPred, [CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1M>>]>
]>;
def A57WriteALUSsr : SchedWriteVariant<[
- SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
- SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
+ SchedVar<IsPredicatedPred, [CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1I>>]>,
+ SchedVar<NoSchedPred, [CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1M>>]>
]>;
def A57ReadALUsr : SchedReadVariant<[
SchedVar<IsPredicatedPred, [ReadDefault]>,
SchedVar<NoSchedPred, [ReadDefault]>
]>;
-def : SchedAlias<WriteALUsi, A57WriteALUsi>;
+def : SchedAlias<WriteALUsi, CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1M>>>;
def : SchedAlias<WriteALUsr, A57WriteALUsr>;
def : SchedAlias<WriteALUSsr, A57WriteALUSsr>;
def : SchedAlias<ReadALUsr, A57ReadALUsr>;
@@ -282,7 +271,11 @@ def : ReadAdvance<ReadMUL, 0>;
// from similar μops, allowing a typical sequence of multiply-accumulate μops
// to issue one every 1 cycle (sched advance = 2).
def A57WriteMLA : SchedWriteRes<[A57UnitM]> { let Latency = 3; }
-def A57WriteMLAL : SchedWriteRes<[A57UnitM]> { let Latency = 4; }
+def A57WriteMLAL : SchedWriteVariant<[
+ SchedVar<IsCPSRDefinedPred, [A57Write_5cyc_1I_1M]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1M]>
+]>;
+
def A57ReadMLA : SchedReadAdvance<2, [A57WriteMLA, A57WriteMLAL]>;
def : InstRW<[A57WriteMLA],
@@ -477,11 +470,11 @@ def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo], (instregex "LDR_POST_REG",
"LDRB_POST_REG", "LDR(B?)T_POST$")>;
def A57WriteLdrTRegPost : SchedWriteVariant<[
- SchedVar<IsLdrAm2ScaledPred, [A57Write_4cyc_1I_1L_1M]>,
+ SchedVar<IsLdstsoScaledPredX2, [A57Write_4cyc_1I_1L_1M]>,
SchedVar<NoSchedPred, [A57Write_4cyc_1L_1I]>
]>;
def A57WriteLdrTRegPostWrBack : SchedWriteVariant<[
- SchedVar<IsLdrAm2ScaledPred, [A57WrBackThree]>,
+ SchedVar<IsLdstsoScaledPredX2, [A57WrBackThree]>,
SchedVar<NoSchedPred, [A57WrBackTwo]>
]>;
// 4(3) "I0/I1,L,M" for scaled register, otherwise 4(2) "I0/I1,L"
@@ -517,8 +510,12 @@ def : InstRW<[A57WritePLD], (instregex "PLDrs", "PLDWrs")>;
// --- Load multiple instructions ---
foreach NumAddr = 1-8 in {
- def A57LMAddrPred#NumAddr :
- SchedPredicate<"(TII->getLDMVariableDefsSize(*MI)+1)/2 == "#NumAddr>;
+ def A57LMAddrPred#NumAddr : MCSchedPredicate<CheckAny<[
+ CheckNumOperands<!add(!shl(NumAddr, 1), 2)>,
+ CheckNumOperands<!add(!shl(NumAddr, 1), 3)>]>>;
+ def A57LMAddrUpdPred#NumAddr : MCSchedPredicate<CheckAny<[
+ CheckNumOperands<!add(!shl(NumAddr, 1), 3)>,
+ CheckNumOperands<!add(!shl(NumAddr, 1), 4)>]>>;
}
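A rough sanity check on the operand counts above (a sketch, assuming the usual LDM machine-operand layout of a variadic destination list plus one base register and a two-operand predicate): an LDM without writeback that loads R registers has R + 3 operands, and the old predicate (getLDMVariableDefsSize(MI) + 1) / 2 == NumAddr allows R in {2*NumAddr - 1, 2*NumAddr}, giving the 2*NumAddr + 2 and 2*NumAddr + 3 counts checked by A57LMAddrPred#NumAddr. The writeback (_UPD) forms add one more def for the updated base, shifting both counts up by one, which matches A57LMAddrUpdPred#NumAddr.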
def A57LDMOpsListNoregin : A57WriteLMOpsListType<
@@ -574,20 +571,20 @@ def A57LDMOpsList_Upd : A57WriteLMOpsListType<
A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I]>;
def A57WriteLDM_Upd : SchedWriteVariant<[
- SchedVar<A57LMAddrPred1, A57LDMOpsList_Upd.Writes[0-2]>,
- SchedVar<A57LMAddrPred2, A57LDMOpsList_Upd.Writes[0-4]>,
- SchedVar<A57LMAddrPred3, A57LDMOpsList_Upd.Writes[0-6]>,
- SchedVar<A57LMAddrPred4, A57LDMOpsList_Upd.Writes[0-8]>,
- SchedVar<A57LMAddrPred5, A57LDMOpsList_Upd.Writes[0-10]>,
- SchedVar<A57LMAddrPred6, A57LDMOpsList_Upd.Writes[0-12]>,
- SchedVar<A57LMAddrPred7, A57LDMOpsList_Upd.Writes[0-14]>,
- SchedVar<A57LMAddrPred8, A57LDMOpsList_Upd.Writes[0-16]>,
- SchedVar<NoSchedPred, A57LDMOpsList_Upd.Writes[0-16]>
+ SchedVar<A57LMAddrUpdPred1, A57LDMOpsList_Upd.Writes[0-2]>,
+ SchedVar<A57LMAddrUpdPred2, A57LDMOpsList_Upd.Writes[0-4]>,
+ SchedVar<A57LMAddrUpdPred3, A57LDMOpsList_Upd.Writes[0-6]>,
+ SchedVar<A57LMAddrUpdPred4, A57LDMOpsList_Upd.Writes[0-8]>,
+ SchedVar<A57LMAddrUpdPred5, A57LDMOpsList_Upd.Writes[0-10]>,
+ SchedVar<A57LMAddrUpdPred6, A57LDMOpsList_Upd.Writes[0-12]>,
+ SchedVar<A57LMAddrUpdPred7, A57LDMOpsList_Upd.Writes[0-14]>,
+ SchedVar<A57LMAddrUpdPred8, A57LDMOpsList_Upd.Writes[0-16]>,
+ SchedVar<NoSchedPred, A57LDMOpsList_Upd.Writes[0-16]>
]> { let Variadic=1; }
def A57WriteLDM : SchedWriteVariant<[
- SchedVar<IsLdmBaseRegInList, [A57WriteLDMreginlist]>,
- SchedVar<NoSchedPred, [A57WriteLDMnoreginlist]>
+ SchedVar<IsLDMBaseRegInListPred, [A57WriteLDMreginlist]>,
+ SchedVar<NoSchedPred, [A57WriteLDMnoreginlist]>
]> { let Variadic=1; }
def : InstRW<[A57WriteLDM], (instregex "(t|t2|sys)?LDM(IA|DA|DB|IB)$")>;
@@ -834,7 +831,6 @@ def A57WriteVLDMuncond : SchedWriteVariant<[
SchedVar<A57LMAddrPred5, A57VLDMOpsListUncond.Writes[0-9]>,
SchedVar<A57LMAddrPred6, A57VLDMOpsListUncond.Writes[0-11]>,
SchedVar<A57LMAddrPred7, A57VLDMOpsListUncond.Writes[0-13]>,
- SchedVar<A57LMAddrPred8, A57VLDMOpsListUncond.Writes[0-15]>,
SchedVar<NoSchedPred, A57VLDMOpsListUncond.Writes[0-15]>
]> { let Variadic=1; }
@@ -855,7 +851,6 @@ def A57WriteVLDMcond : SchedWriteVariant<[
SchedVar<A57LMAddrPred5, A57VLDMOpsListCond.Writes[0-9]>,
SchedVar<A57LMAddrPred6, A57VLDMOpsListCond.Writes[0-11]>,
SchedVar<A57LMAddrPred7, A57VLDMOpsListCond.Writes[0-13]>,
- SchedVar<A57LMAddrPred8, A57VLDMOpsListCond.Writes[0-15]>,
SchedVar<NoSchedPred, A57VLDMOpsListCond.Writes[0-15]>
]> { let Variadic=1; }
@@ -883,7 +878,6 @@ def A57WriteVLDMuncond_UPD : SchedWriteVariant<[
SchedVar<A57LMAddrPred5, A57VLDMOpsListUncond_Upd.Writes[0-9]>,
SchedVar<A57LMAddrPred6, A57VLDMOpsListUncond_Upd.Writes[0-11]>,
SchedVar<A57LMAddrPred7, A57VLDMOpsListUncond_Upd.Writes[0-13]>,
- SchedVar<A57LMAddrPred8, A57VLDMOpsListUncond_Upd.Writes[0-15]>,
SchedVar<NoSchedPred, A57VLDMOpsListUncond_Upd.Writes[0-15]>
]> { let Variadic=1; }
@@ -904,7 +898,6 @@ def A57WriteVLDMcond_UPD : SchedWriteVariant<[
SchedVar<A57LMAddrPred5, A57VLDMOpsListCond_Upd.Writes[0-9]>,
SchedVar<A57LMAddrPred6, A57VLDMOpsListCond_Upd.Writes[0-11]>,
SchedVar<A57LMAddrPred7, A57VLDMOpsListCond_Upd.Writes[0-13]>,
- SchedVar<A57LMAddrPred8, A57VLDMOpsListCond_Upd.Writes[0-15]>,
SchedVar<NoSchedPred, A57VLDMOpsListCond_Upd.Writes[0-15]>
]> { let Variadic=1; }
@@ -1201,7 +1194,7 @@ def : InstRW<[A57Write_5cyc_1V], (instregex
// --- 3.16 ASIMD Miscellaneous Instructions ---
// ASIMD bitwise insert
-def : InstRW<[A57Write_3cyc_1V], (instregex "VBIF", "VBIT", "VBSL")>;
+def : InstRW<[A57Write_3cyc_1V], (instregex "VBIF", "VBIT", "VBSL", "VBSP")>;
// ASIMD count
def : InstRW<[A57Write_3cyc_1V], (instregex "VCLS", "VCLZ", "VCNT")>;
@@ -1490,7 +1483,7 @@ def : InstRW<[A57Write_3cyc_1W], (instregex "^(t2)?CRC32")>;
// -----------------------------------------------------------------------------
// Common definitions
def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
-def : SchedAlias<WriteALU, A57Write_1cyc_1I>;
+def : SchedAlias<WriteALU, CheckBranchForm<0, A57BranchForm<A57Write_1cyc_1I>>>;
def : SchedAlias<WriteBr, A57Write_1cyc_1B>;
def : SchedAlias<WriteBrL, A57Write_1cyc_1B_1I>;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA57WriteRes.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA57WriteRes.td
index 5ba61503686e..531b10bc5cfd 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA57WriteRes.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA57WriteRes.td
@@ -36,13 +36,16 @@ def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 19;
def A57Write_20cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 20;
let ResourceCycles = [20]; }
def A57Write_1cyc_1B : SchedWriteRes<[A57UnitB]> { let Latency = 1; }
-def A57Write_1cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 1; }
-def A57Write_2cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 2; }
+def A57Write_1cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 1;
+ let ResourceCycles = [1]; }
+def A57Write_2cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 2;
+ let ResourceCycles = [1]; }
def A57Write_3cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 3; }
def A57Write_1cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 1; }
def A57Write_2cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 2; }
def A57Write_3cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 3; }
-def A57Write_2cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 2; }
+def A57Write_2cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 2;
+ let ResourceCycles = [1]; }
def A57Write_32cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 32;
let ResourceCycles = [32]; }
def A57Write_32cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 32;
@@ -68,7 +71,7 @@ foreach Lat = 4-16 in {
}
}
-def A57Write_4cyc_1M : SchedWriteRes<[A57UnitL]> { let Latency = 4; }
+def A57Write_4cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 4; }
def A57Write_4cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
def A57Write_4cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 4; }
def A57Write_5cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 5; }
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA9.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA9.td
index 3f0b71afd977..be7017a7b426 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA9.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA9.td
@@ -2525,8 +2525,8 @@ def : ReadAdvance<ReadFPMAC, 0>;
def : InstRW< [WriteALU],
(instregex "ANDri", "ORRri", "EORri", "BICri", "ANDrr", "ORRrr", "EORrr",
"BICrr")>;
-def : InstRW< [WriteALUsi], (instregex "ANDrsi", "ORRrsi", "EORrsi", "BICrsi")>;
-def : InstRW< [WriteALUsr], (instregex "ANDrsr", "ORRrsr", "EORrsr", "BICrsr")>;
+def : InstRW< [WriteALUsi], (instrs ANDrsi, ORRrsi, EORrsi, BICrsi)>;
+def : InstRW< [WriteALUsr], (instrs ANDrsr, ORRrsr, EORrsr, BICrsr)>;
def : SchedAlias<WriteCMP, A9WriteALU>;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleM7.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleM7.td
new file mode 100644
index 000000000000..12296ad09218
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleM7.td
@@ -0,0 +1,488 @@
+//=- ARMScheduleM7.td - ARM Cortex-M7 Scheduling Definitions -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the SchedRead/Write data for the ARM Cortex-M7 processor.
+//
+//===----------------------------------------------------------------------===//
+
+def CortexM7Model : SchedMachineModel {
+ let IssueWidth = 2; // Dual issue for most instructions.
+ let MicroOpBufferSize = 0; // The Cortex-M7 is in-order.
+ let LoadLatency = 2; // Best case for load-use case.
+ let MispredictPenalty = 4; // Mispredict cost for forward branches is 6,
+ // but 4 works better
+ let CompleteModel = 0;
+}
+
+//===--------------------------------------------------------------------===//
+// The Cortex-M7 has two ALU, two LOAD, a STORE, a MAC, a BRANCH and a VFP
+// pipe. The stages relevant to scheduling are as follows:
+//
+//   EX1: address generation, shifts
+//   EX2: fast load data, ALUs, FP operation
+//   EX3: slow load data, integer writeback, FP operation
+//   EX4: store data, FP writeback
+//
+// There are shifters in both EX1 and EX2, and some instructions can be
+// flexibly allocated between them. EX2 is used as the "zero" point
+// for scheduling, so simple ALU operations executing in EX2 will have
+// ReadAdvance<0> (the default) for their source operands and Latency = 1.
+
+def M7UnitLoad : ProcResource<2> { let BufferSize = 0; }
+def M7UnitStore : ProcResource<1> { let BufferSize = 0; }
+def M7UnitALU : ProcResource<2>;
+def M7UnitShift1 : ProcResource<1> { let BufferSize = 0; }
+def M7UnitShift2 : ProcResource<1> { let BufferSize = 0; }
+def M7UnitMAC : ProcResource<1> { let BufferSize = 0; }
+def M7UnitBranch : ProcResource<1> { let BufferSize = 0; }
+def M7UnitVFP : ProcResource<1> { let BufferSize = 0; }
+def M7UnitVPort : ProcResource<2> { let BufferSize = 0; }
+def M7UnitSIMD : ProcResource<1> { let BufferSize = 0; }
+
+//===---------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types that map ProcResources and set latency.
+
+let SchedModel = CortexM7Model in {
+
+def : WriteRes<WriteALU, [M7UnitALU]> { let Latency = 1; }
+
+// Basic ALU with shifts.
+let Latency = 1 in {
+ def : WriteRes<WriteALUsi, [M7UnitALU, M7UnitShift1]>;
+ def : WriteRes<WriteALUsr, [M7UnitALU, M7UnitShift1]>;
+ def : WriteRes<WriteALUSsr, [M7UnitALU, M7UnitShift1]>;
+}
+
+// Compares.
+def : WriteRes<WriteCMP, [M7UnitALU]> { let Latency = 1; }
+def : WriteRes<WriteCMPsi, [M7UnitALU, M7UnitShift1]> { let Latency = 2; }
+def : WriteRes<WriteCMPsr, [M7UnitALU, M7UnitShift1]> { let Latency = 2; }
+
+// Multiplies.
+let Latency = 2 in {
+ def : WriteRes<WriteMUL16, [M7UnitMAC]>;
+ def : WriteRes<WriteMUL32, [M7UnitMAC]>;
+ def : WriteRes<WriteMUL64Lo, [M7UnitMAC]>;
+ def : WriteRes<WriteMUL64Hi, []> { let NumMicroOps = 0; }
+}
+
+// Multiply-accumulates.
+let Latency = 2 in {
+ def : WriteRes<WriteMAC16, [M7UnitMAC]>;
+ def : WriteRes<WriteMAC32, [M7UnitMAC]>;
+ def : WriteRes<WriteMAC64Lo, [M7UnitMAC]> { let Latency = 2; }
+ def : WriteRes<WriteMAC64Hi, []> { let NumMicroOps = 0; }
+}
+
+// Divisions.
+// These cannot be dual-issued with any instructions.
+def : WriteRes<WriteDIV, [M7UnitALU]> {
+ let Latency = 7;
+ let SingleIssue = 1;
+}
+
+// Loads/Stores.
+def : WriteRes<WriteLd, [M7UnitLoad]> { let Latency = 1; }
+def : WriteRes<WritePreLd, [M7UnitLoad]> { let Latency = 2; }
+def : WriteRes<WriteST, [M7UnitStore]> { let Latency = 2; }
+
+// Branches.
+def : WriteRes<WriteBr, [M7UnitBranch]> { let Latency = 2; }
+def : WriteRes<WriteBrL, [M7UnitBranch]> { let Latency = 2; }
+def : WriteRes<WriteBrTbl, [M7UnitBranch]> { let Latency = 2; }
+
+// Noop.
+def : WriteRes<WriteNoop, []> { let Latency = 0; }
+
+//===---------------------------------------------------------------------===//
+// Sched definitions for floating-point instructions
+//
+// Floating point conversions.
+def : WriteRes<WriteFPCVT, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; }
+def : WriteRes<WriteFPMOV, [M7UnitVPort]> { let Latency = 3; }
+
+// The FP pipeline has a latency of 3 cycles.
+// ALU operations (32/64-bit). These go down the FP pipeline.
+def : WriteRes<WriteFPALU32, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; }
+def : WriteRes<WriteFPALU64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
+ let Latency = 4;
+ let BeginGroup = 1;
+}
+
+// Multiplication
+def : WriteRes<WriteFPMUL32, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; }
+def : WriteRes<WriteFPMUL64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
+ let Latency = 7;
+ let BeginGroup = 1;
+}
+
+// Multiply-accumulate. FPMAC goes down the FP Pipeline.
+def : WriteRes<WriteFPMAC32, [M7UnitVFP, M7UnitVPort]> { let Latency = 6; }
+def : WriteRes<WriteFPMAC64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
+ let Latency = 11;
+ let BeginGroup = 1;
+}
+
+// Division. Effective scheduling latency is 3, though real latency is larger
+def : WriteRes<WriteFPDIV32, [M7UnitVFP, M7UnitVPort]> { let Latency = 16; }
+def : WriteRes<WriteFPDIV64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
+ let Latency = 30;
+ let BeginGroup = 1;
+}
+
+// Square-root. Effective scheduling latency is 3; real latency is larger
+def : WriteRes<WriteFPSQRT32, [M7UnitVFP, M7UnitVPort]> { let Latency = 16; }
+def : WriteRes<WriteFPSQRT64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
+ let Latency = 30;
+ let BeginGroup = 1;
+}
+
+def M7WriteShift2 : SchedWriteRes<[M7UnitALU, M7UnitShift2]> {}
+
+// Not used for M7, but definitions are needed anyway.
+def : WriteRes<WriteVLD1, []>;
+def : WriteRes<WriteVLD2, []>;
+def : WriteRes<WriteVLD3, []>;
+def : WriteRes<WriteVLD4, []>;
+def : WriteRes<WriteVST1, []>;
+def : WriteRes<WriteVST2, []>;
+def : WriteRes<WriteVST3, []>;
+def : WriteRes<WriteVST4, []>;
+
+def M7SingleIssue : SchedWriteRes<[]> {
+ let SingleIssue = 1;
+ let NumMicroOps = 0;
+}
+def M7Slot0Only : SchedWriteRes<[]> {
+ let BeginGroup = 1;
+ let NumMicroOps = 0;
+}
+
+// What pipeline stage operands need to be ready for depending on
+// where they come from.
+def : ReadAdvance<ReadALUsr, 0>;
+def : ReadAdvance<ReadMUL, 0>;
+def : ReadAdvance<ReadMAC, 1>;
+def : ReadAdvance<ReadALU, 0>;
+def : ReadAdvance<ReadFPMUL, 0>;
+def : ReadAdvance<ReadFPMAC, 3>;
+def M7Read_ISS : SchedReadAdvance<-1>; // operands needed at EX1
+def M7Read_EX2 : SchedReadAdvance<1>; // operands needed at EX3
+def M7Read_EX3 : SchedReadAdvance<2>; // operands needed at EX4
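A small worked example of how these read advances interact with the EX2 "zero point" described at the top of the file (roughly, as the generic machine scheduler computes it: effective operand latency is the producer's WriteRes latency minus the consumer's ReadAdvance): a 1-cycle WriteALU result feeding a normal EX2 read (advance 0) has an effective latency of 1 - 0 = 1 cycle, while the same result feeding an address operand read at EX1 via M7Read_ISS (SchedReadAdvance<-1>) has an effective latency of 1 - (-1) = 2 cycles, i.e. the operand is needed one cycle earlier.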
+
+// Non general purpose instructions may not be dual issued. These
+// use both issue units.
+def M7NonGeneralPurpose : SchedWriteRes<[]> {
+ // Assume that these will go down the main ALU pipeline.
+ // In reality, many look likely to stall the whole pipeline.
+ let Latency = 3;
+ let SingleIssue = 1;
+}
+
+// List the non general purpose instructions.
+def : InstRW<[M7NonGeneralPurpose], (instregex "t2MRS", "tSVC", "tBKPT",
+ "t2MSR", "t2DMB", "t2DSB", "t2ISB",
+ "t2HVC", "t2SMC", "t2UDF", "ERET",
+ "tHINT", "t2HINT", "t2CLREX", "BUNDLE")>;
+
+//===---------------------------------------------------------------------===//
+// Sched definitions for load/store
+//
+// Mark whether the loads/stores must be single-issue
+// Address operands are needed earlier
+// Data operands are needed later
+
+def M7BaseUpdate : SchedWriteRes<[]> {
+ let Latency = 0; // Update is bypassable out of EX1
+ let NumMicroOps = 0;
+}
+def M7LoadLatency1 : SchedWriteRes<[]> {
+ let Latency = 1;
+ let NumMicroOps = 0;
+}
+def M7SlowLoad : SchedWriteRes<[M7UnitLoad]> { let Latency = 2; }
+
+// Byte and half-word loads should have greater latency than other loads.
+// So should load exclusive.
+
+def : InstRW<[M7SlowLoad],
+ (instregex "t2LDR(B|H|SB|SH)pc")>;
+def : InstRW<[M7SlowLoad, M7Read_ISS],
+ (instregex "t2LDR(B|H|SB|SH)T", "t2LDR(B|H|SB|SH)i",
+ "tLDR(B|H)i")>;
+def : InstRW<[M7SlowLoad, M7Read_ISS, M7Read_ISS],
+ (instregex "t2LDR(B|H|SB|SH)s", "tLDR(B|H)r", "tLDR(SB|SH)")>;
+def : InstRW<[M7SlowLoad, M7BaseUpdate, M7Read_ISS],
+ (instregex "t2LDR(B|H|SB|SH)_(POST|PRE)")>;
+
+// Exclusive loads/stores cannot be dual-issued
+def : InstRW<[WriteLd, M7Slot0Only, M7Read_ISS],
+ (instregex "t2LDREX$")>;
+def : InstRW<[M7SlowLoad, M7Slot0Only, M7Read_ISS],
+ (instregex "t2LDREX(B|H)")>;
+def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_ISS],
+ (instregex "t2STREX(B|H)?$")>;
+
+// Load/store multiples cannot be dual-issued. Note that default scheduling
+// occurs around read/write times of individual registers in the list; read
+// time for STM cannot be overridden because it is a variadic source operand.
+
+def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS],
+ (instregex "(t|t2)LDM(DB|IA)$")>;
+def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS],
+ (instregex "(t|t2)STM(DB|IA)$")>;
+def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS],
+ (instregex "(t|t2)LDM(DB|IA)_UPD$", "tPOP")>;
+def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS],
+ (instregex "(t|t2)STM(DB|IA)_UPD$", "tPUSH")>;
+
+// Load/store doubles cannot be dual-issued.
+
+def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue,
+ M7Read_EX2, M7Read_EX2, M7Read_ISS],
+ (instregex "t2STRD_(PRE|POST)")>;
+def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_EX2, M7Read_ISS],
+ (instregex "t2STRDi")>;
+def : InstRW<[WriteLd, M7LoadLatency1, M7SingleIssue, M7BaseUpdate, M7Read_ISS],
+ (instregex "t2LDRD_(PRE|POST)")>;
+def : InstRW<[WriteLd, M7LoadLatency1, M7SingleIssue, M7Read_ISS],
+ (instregex "t2LDRDi")>;
+
+// Word load / preload
+def : InstRW<[WriteLd],
+ (instregex "t2LDRpc", "t2PL[DI]pci", "tLDRpci")>;
+def : InstRW<[WriteLd, M7Read_ISS],
+ (instregex "t2LDR(i|T)", "t2PL[DI](W)?i", "tLDRi", "tLDRspi")>;
+def : InstRW<[WriteLd, M7Read_ISS, M7Read_ISS],
+ (instregex "t2LDRs", "t2PL[DI](w)?s", "tLDRr")>;
+def : InstRW<[WriteLd, M7BaseUpdate, M7Read_ISS],
+ (instregex "t2LDR_(POST|PRE)")>;
+
+// Stores
+def : InstRW<[M7BaseUpdate, WriteST, M7Read_EX2, M7Read_ISS],
+ (instregex "t2STR(B|H)?_(POST|PRE)")>;
+def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS, M7Read_ISS],
+ (instregex "t2STR(B|H)?s$", "tSTR(B|H)?r$")>;
+def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS],
+ (instregex "t2STR(B|H)?(i|T)", "tSTR(B|H)?i$", "tSTRspi")>;
+
+// TBB/TBH - single-issue only; takes two cycles to issue
+
+def M7TableLoad : SchedWriteRes<[M7UnitLoad]> {
+ let NumMicroOps = 2;
+ let SingleIssue = 1;
+}
+
+def : InstRW<[M7TableLoad, M7Read_ISS, M7Read_ISS], (instregex "t2TB")>;
+
+// VFP loads and stores
+
+def M7LoadSP : SchedWriteRes<[M7UnitLoad, M7UnitVPort]> { let Latency = 1; }
+def M7LoadDP : SchedWriteRes<[M7UnitLoad, M7UnitVPort, M7UnitVPort]> {
+ let Latency = 2;
+ let SingleIssue = 1;
+}
+def M7StoreSP : SchedWriteRes<[M7UnitStore, M7UnitVPort]>;
+def M7StoreDP : SchedWriteRes<[M7UnitStore, M7UnitVPort, M7UnitVPort]> {
+ let SingleIssue = 1;
+}
+
+def : InstRW<[M7LoadSP, M7Read_ISS], (instregex "VLDR(S|H)$")>;
+def : InstRW<[M7LoadDP, M7Read_ISS], (instregex "VLDRD$")>;
+def : InstRW<[M7StoreSP, M7Read_EX3, M7Read_ISS], (instregex "VSTR(S|H)$")>;
+def : InstRW<[M7StoreDP, M7Read_EX3, M7Read_ISS], (instregex "VSTRD$")>;
+
+// Load/store multiples cannot be dual-issued.
+
+def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS],
+ (instregex "VLDM(S|D|Q)(DB|IA)$")>;
+def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS],
+ (instregex "VSTM(S|D|Q)(DB|IA)$")>;
+def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS],
+ (instregex "VLDM(S|D|Q)(DB|IA)_UPD$")>;
+def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS],
+ (instregex "VSTM(S|D|Q)(DB|IA)_UPD$")>;
+
+//===---------------------------------------------------------------------===//
+// Sched definitions for ALU
+//
+
+// Shifted ALU operands are read a cycle early.
+def M7Ex1ReadNoFastBypass : SchedReadAdvance<-1, [WriteLd, M7LoadLatency1]>;
+
+def : InstRW<[WriteALUsi, M7Ex1ReadNoFastBypass, M7Read_ISS],
+ (instregex "t2(ADC|ADDS|ADD|BIC|EOR|ORN|ORR|RSBS|RSB|SBC|SUBS)rs$",
+ "t2(SUB|CMP|CMNz|TEQ|TST)rs$",
+ "t2MOVsr(a|l)")>;
+def : InstRW<[WriteALUsi, M7Read_ISS],
+ (instregex "t2MVNs")>;
+
+// Treat pure shift operations (except for RRX) as if they used the EX1
+// shifter, but give them timing as if they used the EX2 shifter, since they
+// can usually choose the EX2 shifter when needed. This misses a few
+// dual-issue cases, but the results prove better than trying to model them
+// exactly.
+
+def : InstRW<[M7WriteShift2, M7Read_ISS], (instregex "t2RRX$")>;
+def : InstRW<[WriteALUsi], (instregex "(t|t2)(LSL|LSR|ASR|ROR)")>;
+
+// Instructions that use the shifter, but have normal timing.
+
+def : InstRW<[WriteALUsi,M7Slot0Only], (instregex "t2(BFC|BFI)$")>;
+
+// Instructions which are slot zero only but otherwise normal.
+
+def : InstRW<[WriteALU, M7Slot0Only], (instregex "t2CLZ")>;
+
+// MAC operations that don't have SchedRW set.
+
+def : InstRW<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC], (instregex "t2SML[AS]D")>;
+
+// Divides are special because they stall for their latency, and so look like
+// a single-cycle operation as far as scheduling opportunities go. By putting
+// WriteALU first, we make the operand latency 1, but keep the instruction
+// latency 7.
+
+def : InstRW<[WriteALU, WriteDIV], (instregex "t2(S|U)DIV")>;
+
+// DSP extension operations
+
+def M7WriteSIMD1 : SchedWriteRes<[M7UnitSIMD, M7UnitALU]> {
+ let Latency = 1;
+ let BeginGroup = 1;
+}
+def M7WriteSIMD2 : SchedWriteRes<[M7UnitSIMD, M7UnitALU]> {
+ let Latency = 2;
+ let BeginGroup = 1;
+}
+def M7WriteShSIMD1 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> {
+ let Latency = 1;
+ let BeginGroup = 1;
+}
+def M7WriteShSIMD0 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> {
+ let Latency = 0; // Bypassable out of EX1
+ let BeginGroup = 1;
+}
+def M7WriteShSIMD2 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> {
+ let Latency = 2;
+ let BeginGroup = 1;
+}
+
+def : InstRW<[M7WriteShSIMD2, M7Read_ISS],
+ (instregex "t2(S|U)SAT")>;
+def : InstRW<[M7WriteSIMD1, ReadALU],
+ (instregex "(t|t2)(S|U)XT(B|H)")>;
+def : InstRW<[M7WriteSIMD1, ReadALU, ReadALU],
+ (instregex "t2(S|SH|U|UH)(ADD16|ADD8|ASX|SAX|SUB16|SUB8)",
+ "t2SEL")>;
+def : InstRW<[M7WriteSIMD2, ReadALU, ReadALU],
+ (instregex "t2(Q|UQ)(ADD|ASX|SAX|SUB)", "t2USAD8")>;
+def : InstRW<[M7WriteShSIMD2, M7Read_ISS, M7Read_ISS],
+ (instregex "t2QD(ADD|SUB)")>;
+def : InstRW<[M7WriteShSIMD0, M7Read_ISS],
+ (instregex "t2(RBIT|REV)", "tREV")>;
+def : InstRW<[M7WriteShSIMD1, M7Read_ISS],
+ (instregex "t2(SBFX|UBFX)")>;
+def : InstRW<[M7WriteShSIMD1, ReadALU, M7Read_ISS],
+ (instregex "t2PKH(BT|TB)", "t2(S|U)XTA")>;
+def : InstRW<[M7WriteSIMD2, ReadALU, ReadALU, M7Read_EX2],
+ (instregex "t2USADA8")>;
+
+// MSR/MRS
+def : InstRW<[M7NonGeneralPurpose], (instregex "MSR", "MRS")>;
+
+//===---------------------------------------------------------------------===//
+// Sched definitions for FP operations
+//
+
+// Effective scheduling latency is really 3 for nearly all FP operations,
+// even if their true latency is higher.
+def M7WriteVFPLatOverride : SchedWriteRes<[]> {
+ let Latency = 3;
+ let NumMicroOps = 0;
+}
+def M7WriteVFPExtraVPort : SchedWriteRes<[M7UnitVPort]> {
+ let Latency = 3;
+ let NumMicroOps = 0;
+}
+
+// Instructions which are missing default schedules.
+def : InstRW<[WriteFPALU32],
+ (instregex "V(ABS|CVT.*|NEG|FP_VMAX.*|FP_VMIN.*|RINT.*)S$")>;
+def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64],
+ (instregex "V(ABS|CVT.*|NEG|FP_VMAX.*|FP_VMIN.*|RINT.*)D$")>;
+
+// VCMP
+def M7WriteVCMPS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let Latency = 0; }
+def M7WriteVCMPD : SchedWriteRes<[M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
+ let Latency = 0;
+ let BeginGroup = 1;
+}
+def : InstRW<[M7WriteVCMPS], (instregex "VCMPS$")>;
+def : InstRW<[M7WriteVCMPD], (instregex "VCMPD$")>;
+
+// VMRS/VMSR
+def M7VMRS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let SingleIssue = 1; }
+def M7VMSR : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let SingleIssue = 1; }
+def : InstRW<[M7VMRS], (instregex "FMSTAT")>;
+def : InstRW<[M7VMSR], (instregex "VMSR")>;
+
+// VSEL cannot bypass in its implied $cpsr operand; model as earlier read
+def : InstRW<[WriteFPALU32, M7Slot0Only, ReadALU, ReadALU, M7Read_ISS],
+ (instregex "VSEL.*S$")>;
+def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64, M7Slot0Only,
+ ReadALU, ReadALU, M7Read_ISS],
+ (instregex "VSEL.*D$")>;
+
+// VMOV
+def : InstRW<[WriteFPMOV],
+ (instregex "VMOV(H|S)$", "FCONST(H|S)")>;
+def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only],
+ (instregex "VMOVD$")>;
+def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only],
+ (instregex "FCONSTD")>;
+def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7SingleIssue],
+ (instregex "VMOV(DRR|RRD|RRS|SRR)")>;
+
+// Larger-latency overrides.
+
+def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV32], (instregex "VDIVS")>;
+def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV64], (instregex "VDIVD")>;
+def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT32], (instregex "VSQRTS")>;
+def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT64], (instregex "VSQRTD")>;
+def : InstRW<[M7WriteVFPLatOverride, WriteFPMUL64],
+ (instregex "V(MUL|NMUL)D")>;
+def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64],
+ (instregex "V(ADD|SUB)D")>;
+
+// Multiply-accumulate. Chained SP timing is correct; the rest need overrides.
+// A double-precision chained MAC stalls the pipeline behind it for 3 cycles,
+// making it appear to have a 3-cycle latency for scheduling.
+
+def : InstRW<[M7WriteVFPLatOverride, WriteFPMAC64,
+ ReadFPMAC, ReadFPMUL, ReadFPMUL],
+ (instregex "V(N)?ML(A|S)D$")>;
+
+// Single-precision fused MACs look like latency 5 with advance of 2.
+
+def M7WriteVFPLatOverride5 : SchedWriteRes<[]> {
+ let Latency = 5;
+ let NumMicroOps = 0;
+}
+def M7ReadFPMAC2 : SchedReadAdvance<2>;
+
+def : InstRW<[M7WriteVFPLatOverride5, WriteFPMAC32,
+ M7ReadFPMAC2, ReadFPMUL, ReadFPMUL],
+ (instregex "VF(N)?M(A|S)S$")>;
+
+// A double-precision fused MAC stalls the pipeline behind it for 2 cycles,
+// making it appear to have a 3-cycle latency for scheduling.
+
+def : InstRW<[M7WriteVFPLatOverride, WriteFPMAC64,
+ ReadFPMAC, ReadFPMUL, ReadFPMUL],
+ (instregex "VF(N)?M(A|S)D$")>;
+
+} // SchedModel = CortexM7Model
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleR52.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleR52.td
index d1cbf754b5a1..466acec6f76a 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleR52.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleR52.td
@@ -787,8 +787,8 @@ def : InstRW<[R52Write2FPALU_F3, R52Read_F2, R52Read_F2], (instregex "(VAND|VBIC
def : InstRW<[R52WriteFPALU_F3, R52Read_F2], (instregex "VBICi(v4i16|v2i32)")>;
def : InstRW<[R52Write2FPALU_F3, R52Read_F2], (instregex "VBICi(v8i16|v4i32)")>;
-def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F2, R52Read_F2], (instregex "(VBIF|VBIT|VBSL)d")>;
-def : InstRW<[R52Write2FPALU_F3, R52Read_F1, R52Read_F2, R52Read_F2], (instregex "(VBIF|VBIT|VBSL)q")>;
+def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F2, R52Read_F2], (instregex "(VBIF|VBIT|VBSL|VBSP)d")>;
+def : InstRW<[R52Write2FPALU_F3, R52Read_F1, R52Read_F2, R52Read_F2], (instregex "(VBIF|VBIT|VBSL|VBSP)q")>;
def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1],
(instregex "(VCEQ|VCGE|VCGT|VCLE|VCLT|VCLZ|VCMP|VCMPE|VCNT)")>;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleSwift.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleSwift.td
index e0e98bfa0e9b..d66b3065c7b7 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleSwift.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleSwift.td
@@ -558,8 +558,8 @@ let SchedModel = SwiftModel in {
(instregex "VADDv", "VSUBv", "VNEG(s|f|v)", "VADDL", "VSUBL",
"VADDW", "VSUBW", "VHADD", "VHSUB", "VRHADD", "VPADDi",
"VPADDL", "VAND", "VBIC", "VEOR", "VORN", "VORR", "VTST",
- "VSHL", "VSHR(s|u)", "VSHLL", "VQSHL(s|u)", "VBIF",
- "VBIT", "VBSL", "VSLI", "VSRI", "VCLS", "VCLZ", "VCNT")>;
+ "VSHL", "VSHR(s|u)", "VSHLL", "VQSHL(s|u)", "VBIF", "VBIT",
+ "VBSL", "VBSP", "VSLI", "VSRI", "VCLS", "VCLZ", "VCNT")>;
def : InstRW<[SwiftWriteP1TwoCycle],
(instregex "VEXT", "VREV16", "VREV32", "VREV64")>;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp
index 46802037c2aa..5cb608b74ace 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -97,9 +97,9 @@ ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const ARMBaseTargetMachine &TM, bool IsLittle,
bool MinSize)
- : ARMGenSubtargetInfo(TT, CPU, FS), UseMulOps(UseFusedMulOps),
- CPUString(CPU), OptMinSize(MinSize), IsLittle(IsLittle),
- TargetTriple(TT), Options(TM.Options), TM(TM),
+ : ARMGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
+ UseMulOps(UseFusedMulOps), CPUString(CPU), OptMinSize(MinSize),
+ IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options), TM(TM),
FrameLowering(initializeFrameLowering(CPU, FS)),
// At this point initializeSubtargetDependencies has been called so
// we can query directly.
@@ -185,7 +185,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
else
ArchFS = std::string(FS);
}
- ParseSubtargetFeatures(CPUString, ArchFS);
+ ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, ArchFS);
// FIXME: This used enable V6T2 support implicitly for Thumb2 mode.
// Assert this for now to make the change obvious.
@@ -237,7 +237,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
switch (IT) {
case DefaultIT:
- RestrictIT = hasV8Ops();
+ RestrictIT = hasV8Ops() && !hasMinSize();
break;
case RestrictedIT:
RestrictIT = true;
@@ -294,11 +294,13 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
case CortexA76:
case CortexA77:
case CortexA78:
+ case CortexA78C:
case CortexR4:
case CortexR4F:
case CortexR5:
case CortexR7:
case CortexM3:
+ case CortexM7:
case CortexR52:
case CortexX1:
break;
@@ -314,6 +316,8 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
PreISelOperandLatencyAdjustment = 1;
break;
case NeoverseN1:
+ case NeoverseN2:
+ case NeoverseV1:
break;
case Swift:
MaxInterleaveFactor = 2;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h
index 2703e385dd81..fd9b94fdaa23 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -63,9 +63,11 @@ protected:
CortexA76,
CortexA77,
CortexA78,
+ CortexA78C,
CortexA8,
CortexA9,
CortexM3,
+ CortexM7,
CortexR4,
CortexR4F,
CortexR5,
@@ -76,6 +78,8 @@ protected:
Krait,
Kryo,
NeoverseN1,
+ NeoverseN2,
+ NeoverseV1,
Swift
};
enum ARMProcClassEnum {
@@ -163,6 +167,7 @@ protected:
bool HasV8_4aOps = false;
bool HasV8_5aOps = false;
bool HasV8_6aOps = false;
+ bool HasV8_7aOps = false;
bool HasV8MBaselineOps = false;
bool HasV8MMainlineOps = false;
bool HasV8_1MMainlineOps = false;
@@ -461,6 +466,13 @@ protected:
/// cannot be encoded. For example, ADD r0, r1, #FFFFFFFF -> SUB r0, r1, #1.
bool NegativeImmediates = true;
+ /// Harden against Straight Line Speculation for Returns and Indirect
+ /// Branches.
+ bool HardenSlsRetBr = false;
+
+ /// Harden against Straight Line Speculation for indirect calls.
+ bool HardenSlsBlr = false;
+
/// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
Align stackAlignment = Align(4);
@@ -526,7 +538,7 @@ public:
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
/// initializeSubtargetDependencies - Initializes using a CPU and feature string
/// so that we can use initializer lists for subtarget initialization.
@@ -594,6 +606,7 @@ public:
bool hasV8_4aOps() const { return HasV8_4aOps; }
bool hasV8_5aOps() const { return HasV8_5aOps; }
bool hasV8_6aOps() const { return HasV8_6aOps; }
+ bool hasV8_7aOps() const { return HasV8_7aOps; }
bool hasV8MBaselineOps() const { return HasV8MBaselineOps; }
bool hasV8MMainlineOps() const { return HasV8MMainlineOps; }
bool hasV8_1MMainlineOps() const { return HasV8_1MMainlineOps; }
@@ -614,6 +627,7 @@ public:
bool isCortexA15() const { return ARMProcFamily == CortexA15; }
bool isSwift() const { return ARMProcFamily == Swift; }
bool isCortexM3() const { return ARMProcFamily == CortexM3; }
+ bool isCortexM7() const { return ARMProcFamily == CortexM7; }
bool isLikeA9() const { return isCortexA9() || isCortexA15() || isKrait(); }
bool isCortexR5() const { return ARMProcFamily == CortexR5; }
bool isKrait() const { return ARMProcFamily == Krait; }
@@ -901,6 +915,9 @@ public:
bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
unsigned PhysReg) const override;
unsigned getGPRAllocationOrder(const MachineFunction &MF) const;
+
+ bool hardenSlsRetBr() const { return HardenSlsRetBr; }
+ bool hardenSlsBlr() const { return HardenSlsBlr; }
};
} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 9ead5fa4308c..237ef54c8339 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -99,7 +99,9 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTarget() {
initializeMVEVPTOptimisationsPass(Registry);
initializeMVETailPredicationPass(Registry);
initializeARMLowOverheadLoopsPass(Registry);
+ initializeARMBlockPlacementPass(Registry);
initializeMVEGatherScatterLoweringPass(Registry);
+ initializeARMSLSHardeningPass(Registry);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -251,7 +253,7 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
// ARM supports the MachineOutliner.
setMachineOutliner(true);
- setSupportsDefaultOutlining(false);
+ setSupportsDefaultOutlining(true);
}
ARMBaseTargetMachine::~ARMBaseTargetMachine() = default;
@@ -261,12 +263,10 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
Attribute FSAttr = F.getFnAttribute("target-features");
- std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
- ? CPUAttr.getValueAsString().str()
- : TargetCPU;
- std::string FS = !FSAttr.hasAttribute(Attribute::None)
- ? FSAttr.getValueAsString().str()
- : TargetFS;
+ std::string CPU =
+ CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU;
+ std::string FS =
+ FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS;
// FIXME: This is related to the code below to reset the target options,
// we need to know whether or not the soft float flag is set on the
@@ -409,7 +409,8 @@ void ARMPassConfig::addIRPasses() {
// ldrex/strex loops to simplify this, but it needs tidying up.
if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
addPass(createCFGSimplificationPass(
- 1, false, false, true, true, [this](const Function &F) {
+ SimplifyCFGOptions().hoistCommonInsts(true).sinkCommonInsts(true),
+ [this](const Function &F) {
const auto &ST = this->TM->getSubtarget<ARMSubtarget>(F);
return ST.hasAnyDataBarrier() && !ST.isThumb1Only();
}));
@@ -471,7 +472,7 @@ bool ARMPassConfig::addInstSelector() {
}
bool ARMPassConfig::addIRTranslator() {
- addPass(new IRTranslator());
+ addPass(new IRTranslator(getOptLevel()));
return false;
}
@@ -539,6 +540,9 @@ void ARMPassConfig::addPreSched2() {
addPass(&PostMachineSchedulerID);
addPass(&PostRASchedulerID);
}
+
+ addPass(createARMIndirectThunks());
+ addPass(createARMSLSHardeningPass());
}
void ARMPassConfig::addPreEmitPass() {
@@ -549,9 +553,11 @@ void ARMPassConfig::addPreEmitPass() {
return MF.getSubtarget<ARMSubtarget>().isThumb2();
}));
- // Don't optimize barriers at -O0.
- if (getOptLevel() != CodeGenOpt::None)
+ // Don't optimize barriers or block placement at -O0.
+ if (getOptLevel() != CodeGenOpt::None) {
+ addPass(createARMBlockPlacementPass());
addPass(createARMOptimizeBarriersPass());
+ }
}
void ARMPassConfig::addPreEmitPass2() {
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.h
index ac55d2bdcc2b..8428092bf179 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.h
@@ -72,6 +72,12 @@ public:
}
bool targetSchedulesPostRAScheduling() const override { return true; };
+
+ /// Returns true if a cast between SrcAS and DestAS is a noop.
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
+ // Addrspacecasts are always noops.
+ return true;
+ }
};
/// ARM/Thumb little endian target machine.
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index bea4e157a131..890193401373 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -20,14 +20,18 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <cassert>
@@ -46,10 +50,38 @@ static cl::opt<bool> DisableLowOverheadLoops(
"disable-arm-loloops", cl::Hidden, cl::init(false),
cl::desc("Disable the generation of low-overhead loops"));
+static cl::opt<bool>
+ AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
+ cl::desc("Enable the generation of WLS loops"));
+
extern cl::opt<TailPredication::Mode> EnableTailPredication;
extern cl::opt<bool> EnableMaskedGatherScatters;
+extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
+
+/// Convert a vector load intrinsic into a simple llvm load instruction.
+/// This is beneficial when the underlying object being addressed comes
+/// from a constant, since we get constant-folding for free.
+static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
+ InstCombiner::BuilderTy &Builder) {
+ auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
+
+ if (!IntrAlign)
+ return nullptr;
+
+ unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
+ ? MemAlign
+ : IntrAlign->getLimitedValue();
+
+ if (!isPowerOf2_32(Alignment))
+ return nullptr;
+
+ auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
+ PointerType::get(II.getType(), 0));
+ return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
+}
+
bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -82,6 +114,138 @@ bool ARMTTIImpl::shouldFavorPostInc() const {
return false;
}
+Optional<Instruction *>
+ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
+ using namespace PatternMatch;
+ Intrinsic::ID IID = II.getIntrinsicID();
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::arm_neon_vld1: {
+ Align MemAlign =
+ getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
+ &IC.getAssumptionCache(), &IC.getDominatorTree());
+ if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+ }
+
+ case Intrinsic::arm_neon_vld2:
+ case Intrinsic::arm_neon_vld3:
+ case Intrinsic::arm_neon_vld4:
+ case Intrinsic::arm_neon_vld2lane:
+ case Intrinsic::arm_neon_vld3lane:
+ case Intrinsic::arm_neon_vld4lane:
+ case Intrinsic::arm_neon_vst1:
+ case Intrinsic::arm_neon_vst2:
+ case Intrinsic::arm_neon_vst3:
+ case Intrinsic::arm_neon_vst4:
+ case Intrinsic::arm_neon_vst2lane:
+ case Intrinsic::arm_neon_vst3lane:
+ case Intrinsic::arm_neon_vst4lane: {
+ Align MemAlign =
+ getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
+ &IC.getAssumptionCache(), &IC.getDominatorTree());
+ unsigned AlignArg = II.getNumArgOperands() - 1;
+ Value *AlignArgOp = II.getArgOperand(AlignArg);
+ MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
+ if (Align && *Align < MemAlign) {
+ return IC.replaceOperand(
+ II, AlignArg,
+ ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
+ false));
+ }
+ break;
+ }
+
+ case Intrinsic::arm_mve_pred_i2v: {
+ Value *Arg = II.getArgOperand(0);
+ Value *ArgArg;
+ if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
+ PatternMatch::m_Value(ArgArg))) &&
+ II.getType() == ArgArg->getType()) {
+ return IC.replaceInstUsesWith(II, ArgArg);
+ }
+ Constant *XorMask;
+ if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
+ PatternMatch::m_Value(ArgArg)),
+ PatternMatch::m_Constant(XorMask))) &&
+ II.getType() == ArgArg->getType()) {
+ if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
+ if (CI->getValue().trunc(16).isAllOnesValue()) {
+ auto TrueVector = IC.Builder.CreateVectorSplat(
+ cast<FixedVectorType>(II.getType())->getNumElements(),
+ IC.Builder.getTrue());
+ return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
+ }
+ }
+ }
+ KnownBits ScalarKnown(32);
+ if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
+ ScalarKnown, 0)) {
+ return &II;
+ }
+ break;
+ }
+ case Intrinsic::arm_mve_pred_v2i: {
+ Value *Arg = II.getArgOperand(0);
+ Value *ArgArg;
+ if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
+ PatternMatch::m_Value(ArgArg)))) {
+ return IC.replaceInstUsesWith(II, ArgArg);
+ }
+ if (!II.getMetadata(LLVMContext::MD_range)) {
+ Type *IntTy32 = Type::getInt32Ty(II.getContext());
+ Metadata *M[] = {
+ ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
+ ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF))};
+ II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
+ return &II;
+ }
+ break;
+ }
+ case Intrinsic::arm_mve_vadc:
+ case Intrinsic::arm_mve_vadc_predicated: {
+ unsigned CarryOp =
+ (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
+ assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
+ "Bad type for intrinsic!");
+
+ KnownBits CarryKnown(32);
+ if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
+ CarryKnown)) {
+ return &II;
+ }
+ break;
+ }
+ case Intrinsic::arm_mve_vmldava: {
+ Instruction *I = cast<Instruction>(&II);
+ if (I->hasOneUse()) {
+ auto *User = cast<Instruction>(*I->user_begin());
+ Value *OpZ;
+ if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
+ match(I->getOperand(3), m_Zero())) {
+ Value *OpX = I->getOperand(4);
+ Value *OpY = I->getOperand(5);
+ Type *OpTy = OpX->getType();
+
+ IC.Builder.SetInsertPoint(User);
+ Value *V =
+ IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
+ {I->getOperand(0), I->getOperand(1),
+ I->getOperand(2), OpZ, OpX, OpY});
+
+ IC.replaceInstUsesWith(*User, V);
+ return IC.eraseInstFromFunction(*User);
+ }
+ }
+ return None;
+ }
+ }
+ return None;
+}
+
int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
@@ -125,8 +289,43 @@ int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
return 1;
}
-int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty, TTI::TargetCostKind CostKind) {
+// Checks whether Inst is part of a min(max()) or max(min()) pattern
+// that will match to an SSAT instruction
+static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
+ Value *LHS, *RHS;
+ ConstantInt *C;
+ SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
+
+ if (InstSPF == SPF_SMAX &&
+ PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
+ C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) {
+
+ auto isSSatMin = [&](Value *MinInst) {
+ if (isa<SelectInst>(MinInst)) {
+ Value *MinLHS, *MinRHS;
+ ConstantInt *MinC;
+ SelectPatternFlavor MinSPF =
+ matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
+ if (MinSPF == SPF_SMIN &&
+ PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
+ MinC->getValue() == ((-Imm) - 1))
+ return true;
+ }
+ return false;
+ };
+
+ if (isSSatMin(Inst->getOperand(1)) ||
+ (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) ||
+ isSSatMin(*(++Inst->user_begin())))))
+ return true;
+ }
+ return false;
+}
+
+int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst) {
// Division by a constant can be turned into multiplication, but only if we
// know it's constant. So it's not so much that the immediate is cheap (it's
// not), but that the alternative is worse.
@@ -165,10 +364,33 @@ int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Im
if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
return 0;
+ // Ensure that negative constants in min(max()) or max(min()) patterns that
+ // match to SSAT instructions don't get hoisted.
+ if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
+ Ty->getIntegerBitWidth() <= 32) {
+ if (isSSATMinMaxPattern(Inst, Imm) ||
+ (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
+ isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
+ return 0;
+ }
+
return getIntImmCost(Imm, Ty, CostKind);
}
+int ARMTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
+ if (CostKind == TTI::TCK_RecipThroughput &&
+ (ST->hasNEON() || ST->hasMVEIntegerOps())) {
+ // FIXME: The vectorizer is highly sensitive to the cost of these
+ // instructions, which suggests that it may be using the costs incorrectly.
+ // But, for now, just make them free to avoid performance regressions for
+ // vector targets.
+ return 0;
+ }
+ return BaseT::getCFInstrCost(Opcode, CostKind);
+}
+
int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::CastContextHint CCH,
TTI::TargetCostKind CostKind,
const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -180,15 +402,35 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
return Cost == 0 ? 0 : 1;
return Cost;
};
+ auto IsLegalFPType = [this](EVT VT) {
+ EVT EltVT = VT.getScalarType();
+ return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
+ (EltVT == MVT::f64 && ST->hasFP64()) ||
+ (EltVT == MVT::f16 && ST->hasFullFP16());
+ };
EVT SrcTy = TLI->getValueType(DL, Src);
EVT DstTy = TLI->getValueType(DL, Dst);
if (!SrcTy.isSimple() || !DstTy.isSimple())
- return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
-
- // The extend of a load is free
- if (I && isa<LoadInst>(I->getOperand(0))) {
+ return AdjustCost(
+ BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
+
+ // Extending masked loads / truncating masked stores is expensive because we
+ // currently don't split them. This means that we'll likely end up
+ // loading/storing each element individually (hence the high cost).
+ if ((ST->hasMVEIntegerOps() &&
+ (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
+ Opcode == Instruction::SExt)) ||
+ (ST->hasMVEFloatOps() &&
+ (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
+ IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
+ if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
+ return 2 * DstTy.getVectorNumElements() * ST->getMVEVectorCostFactor();
+
+ // The extend of other kinds of load is free
+ if (CCH == TTI::CastContextHint::Normal ||
+ CCH == TTI::CastContextHint::Masked) {
static const TypeConversionCostTblEntry LoadConversionTbl[] = {
{ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
{ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
@@ -242,33 +484,32 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
}
- }
- // The truncate of a store is free. This is the mirror of extends above.
- if (I && I->hasOneUse() && isa<StoreInst>(*I->user_begin())) {
- static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
+ // The truncate of a store is free. This is the mirror of extends above.
+ static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
{ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
{ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
{ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
{ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
+ {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
{ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
{ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
};
if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
if (const auto *Entry =
- ConvertCostTableLookup(MVELoadConversionTbl, ISD, SrcTy.getSimpleVT(),
- DstTy.getSimpleVT()))
+ ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
+ SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
}
- static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
+ static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
{ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
{ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
};
if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
if (const auto *Entry =
- ConvertCostTableLookup(MVEFLoadConversionTbl, ISD, SrcTy.getSimpleVT(),
- DstTy.getSimpleVT()))
+ ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
+ SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
}
}
@@ -504,19 +745,25 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
int Lanes = 1;
if (SrcTy.isFixedLengthVector())
Lanes = SrcTy.getVectorNumElements();
- auto IsLegal = [this](EVT VT) {
- EVT EltVT = VT.getScalarType();
- return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
- (EltVT == MVT::f64 && ST->hasFP64()) ||
- (EltVT == MVT::f16 && ST->hasFullFP16());
- };
- if (IsLegal(SrcTy) && IsLegal(DstTy))
+ if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
return Lanes;
else
return Lanes * CallCost;
}
+ if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
+ SrcTy.isFixedLengthVector()) {
+ // Treat a truncate with larger than legal source (128 bits for MVE) as
+ // expensive, 2 instructions per lane.
+ if ((SrcTy.getScalarType() == MVT::i8 ||
+ SrcTy.getScalarType() == MVT::i16 ||
+ SrcTy.getScalarType() == MVT::i32) &&
+ SrcTy.getSizeInBits() > 128 &&
+ SrcTy.getSizeInBits() > DstTy.getSizeInBits())
+ return SrcTy.getVectorNumElements() * 2;
+ }
+
// Scalar integer conversion costs.
static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
// i16 -> i64 requires two dependent operations.
@@ -540,7 +787,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
? ST->getMVEVectorCostFactor()
: 1;
return AdjustCost(
- BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
+ BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}
int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
@@ -580,15 +827,37 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
}
int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ CmpInst::Predicate VecPred,
TTI::TargetCostKind CostKind,
const Instruction *I) {
- // TODO: Handle other cost kinds.
- if (CostKind != TTI::TCK_RecipThroughput)
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
-
int ISD = TLI->InstructionOpcodeToISD(Opcode);
+
+ // Thumb scalar code size cost for select.
+ if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
+ ST->isThumb() && !ValTy->isVectorTy()) {
+ // Assume expensive structs.
+ if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
+ return TTI::TCC_Expensive;
+
+ // Select costs can vary because they:
+ // - may require one or more conditional mov (including an IT),
+ // - can't operate directly on immediates,
+ // - require live flags, which we can't copy around easily.
+ int Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
+
+ // Possible IT instruction for Thumb2, or more for Thumb1.
+ ++Cost;
+
+ // i1 values may need rematerialising by using mov immediates and/or
+ // flag setting instructions.
+ if (ValTy->isIntegerTy(1))
+ ++Cost;
+
+ return Cost;
+ }
+
// On NEON a vector select gets lowered to vbsl.
- if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
+ if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
// Lowering of some vector selects is currently far from perfect.
static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
{ ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
@@ -609,11 +878,15 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
return LT.first;
}
- int BaseCost = ST->hasMVEIntegerOps() && ValTy->isVectorTy()
- ? ST->getMVEVectorCostFactor()
- : 1;
- return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind,
- I);
+ // Default to cheap (throughput/size of 1 instruction) but adjust throughput
+ // for "multiple beats" potentially needed by MVE instructions.
+ int BaseCost = 1;
+ if (CostKind != TTI::TCK_CodeSize && ST->hasMVEIntegerOps() &&
+ ValTy->isVectorTy())
+ BaseCost = ST->getMVEVectorCostFactor();
+
+ return BaseCost *
+ BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}
int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
@@ -695,39 +968,83 @@ bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
(EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
}
-int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
- const MemCpyInst *MI = dyn_cast<MemCpyInst>(I);
- assert(MI && "MemcpyInst expected");
- ConstantInt *C = dyn_cast<ConstantInt>(MI->getLength());
+/// Given a memcpy/memset/memmove instruction, return the number of memory
+/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
+/// call is used.
+int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
+ MemOp MOp;
+ unsigned DstAddrSpace = ~0u;
+ unsigned SrcAddrSpace = ~0u;
+ const Function *F = I->getParent()->getParent();
- // To model the cost of a library call, we assume 1 for the call, and
- // 3 for the argument setup.
- const unsigned LibCallCost = 4;
+ if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
+ ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
+ // If 'size' is not a constant, a library call will be generated.
+ if (!C)
+ return -1;
- // If 'size' is not a constant, a library call will be generated.
- if (!C)
- return LibCallCost;
+ const unsigned Size = C->getValue().getZExtValue();
+ const Align DstAlign = *MC->getDestAlign();
+ const Align SrcAlign = *MC->getSourceAlign();
- const unsigned Size = C->getValue().getZExtValue();
- const Align DstAlign = *MI->getDestAlign();
- const Align SrcAlign = *MI->getSourceAlign();
- const Function *F = I->getParent()->getParent();
- const unsigned Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
- std::vector<EVT> MemOps;
+ MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
+ /*IsVolatile*/ false);
+ DstAddrSpace = MC->getDestAddressSpace();
+ SrcAddrSpace = MC->getSourceAddressSpace();
+ }
+ else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
+ ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
+ // If 'size' is not a constant, a library call will be generated.
+ if (!C)
+ return -1;
+
+ const unsigned Size = C->getValue().getZExtValue();
+ const Align DstAlign = *MS->getDestAlign();
+
+ MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
+ /*IsZeroMemset*/ false, /*IsVolatile*/ false);
+ DstAddrSpace = MS->getDestAddressSpace();
+ }
+ else
+ llvm_unreachable("Expected a memcpy/move or memset!");
+
+ unsigned Limit, Factor = 2;
+ switch(I->getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
+ break;
+ case Intrinsic::memmove:
+ Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
+ break;
+ case Intrinsic::memset:
+ Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
+ Factor = 1;
+ break;
+ default:
+ llvm_unreachable("Expected a memcpy/move or memset!");
+ }
// MemOps will be populated with a list of data types that need to be
// loaded and stored. That's why we multiply the number of elements by 2 to
// get the cost for this memcpy.
+ std::vector<EVT> MemOps;
if (getTLI()->findOptimalMemOpLowering(
- MemOps, Limit,
- MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
- /*IsVolatile*/ true),
- MI->getDestAddressSpace(), MI->getSourceAddressSpace(),
- F->getAttributes()))
- return MemOps.size() * 2;
+ MemOps, Limit, MOp, DstAddrSpace,
+ SrcAddrSpace, F->getAttributes()))
+ return MemOps.size() * Factor;
// If we can't find an optimal memop lowering, return the default cost
- return LibCallCost;
+ return -1;
+}
+
+int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
+ int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
+
+ // To model the cost of a library call, we assume 1 for the call, and
+ // 3 for the argument setup.
+ if (NumOps == -1)
+ return 4;
+ return NumOps;
}
int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
@@ -832,13 +1149,22 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
TTI::OperandValueProperties Opd2PropInfo,
ArrayRef<const Value *> Args,
const Instruction *CxtI) {
- // TODO: Handle more cost kinds.
- if (CostKind != TTI::TCK_RecipThroughput)
- return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
- Op2Info, Opd1PropInfo,
- Opd2PropInfo, Args, CxtI);
-
int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
+ if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
+ // Make operations on i1 relatively expensive as this often involves
+ // combining predicates. AND and XOR should be easier to handle with IT
+ // blocks.
+ switch (ISDOpcode) {
+ default:
+ break;
+ case ISD::AND:
+ case ISD::XOR:
+ return 2;
+ case ISD::OR:
+ return 3;
+ }
+ }
+
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
if (ST->hasNEON()) {
@@ -933,9 +1259,12 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
if (LooksLikeAFreeShift())
return 0;
- int BaseCost = ST->hasMVEIntegerOps() && Ty->isVectorTy()
- ? ST->getMVEVectorCostFactor()
- : 1;
+ // Default to cheap (throughput/size of 1 instruction) but adjust throughput
+ // for "multiple beats" potentially needed by MVE instructions.
+ int BaseCost = 1;
+ if (CostKind != TTI::TCK_CodeSize && ST->hasMVEIntegerOps() &&
+ Ty->isVectorTy())
+ BaseCost = ST->getMVEVectorCostFactor();
// The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
// without treating floats as more expensive that scalars or increasing the
@@ -1002,6 +1331,24 @@ int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
CostKind, I);
}
+unsigned ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
+ Align Alignment,
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind) {
+ if (ST->hasMVEIntegerOps()) {
+ if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
+ return ST->getMVEVectorCostFactor();
+ if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
+ return ST->getMVEVectorCostFactor();
+ }
+ if (!isa<FixedVectorType>(Src))
+ return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
+ // Scalar cost, which is currently very high due to the inefficiency of the
+ // generated code.
+ return cast<FixedVectorType>(Src)->getNumElements() * 8;
+}
+
int ARMTTIImpl::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
@@ -1032,7 +1379,8 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(
// promoted differently). The cost of 2 here is then a load and vrev or
// vmovn.
if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
- VecTy->isIntOrIntVectorTy() && DL.getTypeSizeInBits(SubVecTy) <= 64)
+ VecTy->isIntOrIntVectorTy() &&
+ DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
return 2 * BaseCost;
}
@@ -1065,13 +1413,13 @@ unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
// multiplied by the number of elements being loaded. This is possibly very
// conservative, but even so we still end up vectorising loops because the
// cost per iteration for many loops is lower than for scalar loops.
- unsigned VectorCost = NumElems * LT.first;
+ unsigned VectorCost = NumElems * LT.first * ST->getMVEVectorCostFactor();
// The scalarization cost should be a lot higher. We use the number of vector
// elements plus the scalarization overhead.
unsigned ScalarCost =
NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, {});
- if (Alignment < EltSize / 8)
+ if (EltSize < 8 || Alignment < EltSize / 8)
return ScalarCost;
unsigned ExtSize = EltSize;
@@ -1140,6 +1488,92 @@ unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
return ScalarCost;
}
+int ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
+ bool IsPairwiseForm,
+ TTI::TargetCostKind CostKind) {
+ EVT ValVT = TLI->getValueType(DL, ValTy);
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
+ CostKind);
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+ static const CostTblEntry CostTblAdd[]{
+ {ISD::ADD, MVT::v16i8, 1},
+ {ISD::ADD, MVT::v8i16, 1},
+ {ISD::ADD, MVT::v4i32, 1},
+ };
+ if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
+ return Entry->Cost * ST->getMVEVectorCostFactor() * LT.first;
+
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
+ CostKind);
+}
+
+InstructionCost
+ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
+ Type *ResTy, VectorType *ValTy,
+ TTI::TargetCostKind CostKind) {
+ EVT ValVT = TLI->getValueType(DL, ValTy);
+ EVT ResVT = TLI->getValueType(DL, ResTy);
+ if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+ if ((LT.second == MVT::v16i8 && ResVT.getSizeInBits() <= 32) ||
+ (LT.second == MVT::v8i16 &&
+ ResVT.getSizeInBits() <= (IsMLA ? 64 : 32)) ||
+ (LT.second == MVT::v4i32 && ResVT.getSizeInBits() <= 64))
+ return ST->getMVEVectorCostFactor() * LT.first;
+ }
+
+ return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
+ CostKind);
+}
+
+int ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
+ switch (ICA.getID()) {
+ case Intrinsic::get_active_lane_mask:
+ // Currently we make a somewhat optimistic assumption that
+ // active_lane_masks are always free. In reality it may be freely folded
+ // into a tail predicated loop, expanded into a VCTP or expanded into a lot
+ // of add/icmp code. We may need to improve this in the future, but being
+ // able to detect if it is free or not involves looking at a lot of other
+ // code. We currently assume that the vectorizer inserted these, and knew
+ // what it was doing in adding one.
+ if (ST->hasMVEIntegerOps())
+ return 0;
+ break;
+ case Intrinsic::sadd_sat:
+ case Intrinsic::ssub_sat:
+ case Intrinsic::uadd_sat:
+ case Intrinsic::usub_sat: {
+ if (!ST->hasMVEIntegerOps())
+ break;
+ // Get the Return type, either directly or from ICA.ReturnType and ICA.VF.
+ Type *VT = ICA.getReturnType();
+ if (!VT->isVectorTy() && !ICA.getVectorFactor().isScalar())
+ VT = VectorType::get(VT, ICA.getVectorFactor());
+
+ std::pair<int, MVT> LT =
+ TLI->getTypeLegalizationCost(DL, VT);
+ if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
+ LT.second == MVT::v16i8) {
+ // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
+ // need to extend the type, as it uses shr(qadd(shl, shl)).
+ unsigned Instrs = LT.second.getScalarSizeInBits() ==
+ ICA.getReturnType()->getScalarSizeInBits()
+ ? 1
+ : 4;
+ return LT.first * ST->getMVEVectorCostFactor() * Instrs;
+ }
+ break;
+ }
+ }
+
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+}
+
bool ARMTTIImpl::isLoweredToCall(const Function *F) {
if (!F->isIntrinsic())
BaseT::isLoweredToCall(F);
@@ -1201,6 +1635,93 @@ bool ARMTTIImpl::isLoweredToCall(const Function *F) {
return BaseT::isLoweredToCall(F);
}
+bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
+ unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
+ EVT VT = TLI->getValueType(DL, I.getType(), true);
+ if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
+ return true;
+
+ // Check if an intrinsic will be lowered to a call and assume that any
+ // other CallInst will generate a bl.
+ if (auto *Call = dyn_cast<CallInst>(&I)) {
+ if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
+ switch(II->getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ case Intrinsic::memset:
+ case Intrinsic::memmove:
+ return getNumMemOps(II) == -1;
+ default:
+ if (const Function *F = Call->getCalledFunction())
+ return isLoweredToCall(F);
+ }
+ }
+ return true;
+ }
+
+ // FPv5 provides conversions between integer, double-precision,
+ // single-precision, and half-precision formats.
+ switch (I.getOpcode()) {
+ default:
+ break;
+ case Instruction::FPToSI:
+ case Instruction::FPToUI:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ return !ST->hasFPARMv8Base();
+ }
+
+ // FIXME: Unfortunately the approach of checking the Operation Action does
+ // not catch all cases of Legalization that use library calls. Our
+ // Legalization step categorizes some transformations into library calls as
+ // Custom, Expand or even Legal when doing type legalization. So for now
+ // we have to special case for instance the SDIV of 64bit integers and the
+ // use of floating point emulation.
+ if (VT.isInteger() && VT.getSizeInBits() >= 64) {
+ switch (ISD) {
+ default:
+ break;
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SREM:
+ case ISD::UREM:
+ case ISD::SDIVREM:
+ case ISD::UDIVREM:
+ return true;
+ }
+ }
+
+ // Assume all other non-float operations are supported.
+ if (!VT.isFloatingPoint())
+ return false;
+
+ // We'll need a library call to handle most floats when using soft float.
+ if (TLI->useSoftFloat()) {
+ switch (I.getOpcode()) {
+ default:
+ return true;
+ case Instruction::Alloca:
+ case Instruction::Load:
+ case Instruction::Store:
+ case Instruction::Select:
+ case Instruction::PHI:
+ return false;
+ }
+ }
+
+ // We'll need a libcall to perform double precision operations on a single
+ // precision only FPU.
+ if (I.getType()->isDoubleTy() && !ST->hasFP64())
+ return true;
+
+ // Likewise for half precision arithmetic.
+ if (I.getType()->isHalfTy() && !ST->hasFullFP16())
+ return true;
+
+ return false;
+}
+
bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
AssumptionCache &AC,
TargetLibraryInfo *LibInfo,
@@ -1235,93 +1756,13 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
// Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
// point in generating a hardware loop if that's going to happen.
- auto MaybeCall = [this](Instruction &I) {
- const ARMTargetLowering *TLI = getTLI();
- unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
- EVT VT = TLI->getValueType(DL, I.getType(), true);
- if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
- return true;
-
- // Check if an intrinsic will be lowered to a call and assume that any
- // other CallInst will generate a bl.
- if (auto *Call = dyn_cast<CallInst>(&I)) {
- if (isa<IntrinsicInst>(Call)) {
- if (const Function *F = Call->getCalledFunction())
- return isLoweredToCall(F);
- }
- return true;
- }
-
- // FPv5 provides conversions between integer, double-precision,
- // single-precision, and half-precision formats.
- switch (I.getOpcode()) {
- default:
- break;
- case Instruction::FPToSI:
- case Instruction::FPToUI:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::FPTrunc:
- case Instruction::FPExt:
- return !ST->hasFPARMv8Base();
- }
-
- // FIXME: Unfortunately the approach of checking the Operation Action does
- // not catch all cases of Legalization that use library calls. Our
- // Legalization step categorizes some transformations into library calls as
- // Custom, Expand or even Legal when doing type legalization. So for now
- // we have to special case for instance the SDIV of 64bit integers and the
- // use of floating point emulation.
- if (VT.isInteger() && VT.getSizeInBits() >= 64) {
- switch (ISD) {
- default:
- break;
- case ISD::SDIV:
- case ISD::UDIV:
- case ISD::SREM:
- case ISD::UREM:
- case ISD::SDIVREM:
- case ISD::UDIVREM:
- return true;
- }
- }
-
- // Assume all other non-float operations are supported.
- if (!VT.isFloatingPoint())
- return false;
-
- // We'll need a library call to handle most floats when using soft.
- if (TLI->useSoftFloat()) {
- switch (I.getOpcode()) {
- default:
- return true;
- case Instruction::Alloca:
- case Instruction::Load:
- case Instruction::Store:
- case Instruction::Select:
- case Instruction::PHI:
- return false;
- }
- }
-
- // We'll need a libcall to perform double precision operations on a single
- // precision only FPU.
- if (I.getType()->isDoubleTy() && !ST->hasFP64())
- return true;
-
- // Likewise for half precision arithmetic.
- if (I.getType()->isHalfTy() && !ST->hasFullFP16())
- return true;
-
- return false;
- };
auto IsHardwareLoopIntrinsic = [](Instruction &I) {
if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
switch (Call->getIntrinsicID()) {
default:
break;
- case Intrinsic::set_loop_iterations:
+ case Intrinsic::start_loop_iterations:
case Intrinsic::test_set_loop_iterations:
case Intrinsic::loop_decrement:
case Intrinsic::loop_decrement_reg:
@@ -1332,14 +1773,24 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
};
// Scan the instructions to see if there's any that we know will turn into a
- // call or if this loop is already a low-overhead loop.
+ // call or if this loop is already a low-overhead loop or will become a tail
+ // predicated loop.
+ bool IsTailPredLoop = false;
auto ScanLoop = [&](Loop *L) {
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
- if (MaybeCall(I) || IsHardwareLoopIntrinsic(I)) {
+ if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
+ isa<InlineAsm>(I)) {
LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
return false;
}
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ IsTailPredLoop |=
+ II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
+ II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
+ II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
+ II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
+ II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
}
}
return true;
@@ -1360,7 +1811,7 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
LLVMContext &C = L->getHeader()->getContext();
HWLoopInfo.CounterInReg = true;
HWLoopInfo.IsNestingLegal = false;
- HWLoopInfo.PerformEntryTest = true;
+ HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
HWLoopInfo.CountType = Type::getInt32Ty(C);
HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
return true;
@@ -1408,35 +1859,28 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
const LoopAccessInfo *LAI) {
LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
- // If there are live-out values, it is probably a reduction, which needs a
- // final reduction step after the loop. MVE has a VADDV instruction to reduce
- // integer vectors, but doesn't have an equivalent one for float vectors. A
- // live-out value that is not recognised as a reduction will result in the
- // tail-predicated loop to be reverted to a non-predicated loop and this is
- // very expensive, i.e. it has a significant performance impact. So, in this
- // case it's better not to tail-predicate the loop, which is what we check
- // here. Thus, we allow only 1 live-out value, which has to be an integer
- // reduction, which matches the loops supported by ARMLowOverheadLoops.
- // It is important to keep ARMLowOverheadLoops and canTailPredicateLoop in
- // sync with each other.
+ // If there are live-out values, it is probably a reduction. We can predicate
+ // most reduction operations freely under MVE using a combination of
+ // prefer-predicated-reduction-select and inloop reductions. We limit this to
+ // floating point and integer reductions, but don't check for operators
+ // specifically here. If the value ends up not being a reduction (and so the
+ // vectorizer cannot tailfold the loop), we should fall back to standard
+ // vectorization automatically.
SmallVector< Instruction *, 8 > LiveOuts;
LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
- bool IntReductionsDisabled =
+ bool ReductionsDisabled =
EnableTailPredication == TailPredication::EnabledNoReductions ||
EnableTailPredication == TailPredication::ForceEnabledNoReductions;
for (auto *I : LiveOuts) {
- if (!I->getType()->isIntegerTy()) {
- LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer "
+ if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
+ !I->getType()->isHalfTy()) {
+ LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
"live-out value\n");
return false;
}
- if (I->getOpcode() != Instruction::Add) {
- LLVM_DEBUG(dbgs() << "Only add reductions supported\n");
- return false;
- }
- if (IntReductionsDisabled) {
- LLVM_DEBUG(dbgs() << "Integer add reductions not enabled\n");
+ if (ReductionsDisabled) {
+ LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
return false;
}
}
@@ -1445,7 +1889,6 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
PredicatedScalarEvolution PSE = LAI->getPSE();
SmallVector<Instruction *, 16> LoadStores;
int ICmpCount = 0;
- int Stride = 0;
for (BasicBlock *BB : L->blocks()) {
for (Instruction &I : BB->instructionsWithoutDebug()) {
@@ -1464,22 +1907,38 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
return false;
}
-
if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
int64_t NextStride = getPtrStride(PSE, Ptr, L);
- // TODO: for now only allow consecutive strides of 1. We could support
- // other strides as long as it is uniform, but let's keep it simple for
- // now.
- if (Stride == 0 && NextStride == 1) {
- Stride = NextStride;
+ if (NextStride == 1) {
+ // TODO: for now only allow consecutive strides of 1. We could support
+ // other strides as long as it is uniform, but let's keep it simple
+ // for now.
continue;
- }
- if (Stride != NextStride) {
- LLVM_DEBUG(dbgs() << "Different strides found, can't "
- "tail-predicate\n.");
+ } else if (NextStride == -1 ||
+ (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
+ (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
+ LLVM_DEBUG(dbgs()
+ << "Consecutive strides of 2 found, vld2/vstr2 can't "
+ "be tail-predicated\n.");
return false;
+ // TODO: don't tail predicate if there is a reversed load?
+ } else if (EnableMaskedGatherScatters) {
+ // Gather/scatters do allow loading from arbitrary strides, at
+ // least if they are loop invariant.
+ // TODO: Loop variant strides should in theory work, too, but
+ // this requires further testing.
+ const SCEV *PtrScev =
+ replaceSymbolicStrideSCEV(PSE, llvm::ValueToValueMap(), Ptr);
+ if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
+ const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
+ if (PSE.getSE()->isLoopInvariant(Step, L))
+ continue;
+ }
}
+ LLVM_DEBUG(dbgs() << "Bad stride found, can't "
+ "tail-predicate\n.");
+ return false;
}
}
}
@@ -1512,7 +1971,7 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
return false;
}
- assert(L->empty() && "preferPredicateOverEpilogue: inner-loop expected");
+ assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
HardwareLoopInfo HWLoopInfo(L);
if (!HWLoopInfo.canAnalyze(*LI)) {
@@ -1580,6 +2039,10 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
return;
+ // Don't unroll vectorized loops, including the remainder loop
+ if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
+ return;
+
// Scan the loop: don't unroll loops with calls as this could prevent
// inlining.
unsigned Cost = 0;
@@ -1598,9 +2061,9 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
return;
}
- SmallVector<const Value*, 4> Operands(I.value_op_begin(),
- I.value_op_end());
- Cost += getUserCost(&I, Operands, TargetTransformInfo::TCK_CodeSize);
+ SmallVector<const Value*, 4> Operands(I.operand_values());
+ Cost +=
+ getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
}
}
@@ -1629,3 +2092,24 @@ bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
return ST->hasMVEIntegerOps();
}
+
+bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const {
+ if (!ST->hasMVEIntegerOps())
+ return false;
+
+ unsigned ScalarBits = Ty->getScalarSizeInBits();
+ switch (Opcode) {
+ case Instruction::Add:
+ return ScalarBits <= 64;
+ default:
+ return false;
+ }
+}
+
+bool ARMTTIImpl::preferPredicatedReductionSelect(
+ unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
+ if (!ST->hasMVEIntegerOps())
+ return false;
+ return true;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 7bf6de4bffe0..7f045080e320 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -113,6 +113,9 @@ public:
return !ST->isTargetDarwin() && !ST->hasMVEFloatOps();
}
+ Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
+ IntrinsicInst &II) const;
+
/// \name Scalar TTI Implementations
/// @{
@@ -123,7 +126,8 @@ public:
int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind);
int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty, TTI::TargetCostKind CostKind);
+ Type *Ty, TTI::TargetCostKind CostKind,
+ Instruction *Inst = nullptr);
/// @}
@@ -177,40 +181,31 @@ public:
int getMemcpyCost(const Instruction *I);
+ int getNumMemOps(const IntrinsicInst *I) const;
+
int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
VectorType *SubTp);
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;
- bool shouldExpandReduction(const IntrinsicInst *II) const {
- switch (II->getIntrinsicID()) {
- case Intrinsic::experimental_vector_reduce_v2_fadd:
- case Intrinsic::experimental_vector_reduce_v2_fmul:
- // We don't have legalization support for ordered FP reductions.
- if (!II->getFastMathFlags().allowReassoc())
- return true;
- // Can't legalize reductions with soft floats.
- return TLI->useSoftFloat() || !TLI->getSubtarget()->hasFPRegs();
-
- case Intrinsic::experimental_vector_reduce_fmin:
- case Intrinsic::experimental_vector_reduce_fmax:
- // Can't legalize reductions with soft floats, and NoNan will create
- // fminimum which we do not know how to lower.
- return TLI->useSoftFloat() || !TLI->getSubtarget()->hasFPRegs() ||
- !II->getFastMathFlags().noNaNs();
-
- default:
- // Don't expand anything else, let legalization deal with it.
- return false;
- }
- }
+ bool preferInLoopReduction(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const;
+
+ bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const;
+
+ bool shouldExpandReduction(const IntrinsicInst *II) const { return false; }
+
+ int getCFInstrCost(unsigned Opcode,
+ TTI::TargetCostKind CostKind);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
- TTI::TargetCostKind CostKind,
+ TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ CmpInst::Predicate VecPred,
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
@@ -234,6 +229,10 @@ public:
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
+ unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind);
+
int getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace,
@@ -245,6 +244,17 @@ public:
Align Alignment, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
+ int getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
+ bool IsPairwiseForm,
+ TTI::TargetCostKind CostKind);
+ InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
+ Type *ResTy, VectorType *ValTy,
+ TTI::TargetCostKind CostKind);
+
+ int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind);
+
+ bool maybeLoweredToCall(Instruction &I);
bool isLoweredToCall(const Function *F);
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
AssumptionCache &AC,
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 05f870b90ecd..52577d75ddf5 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -3087,7 +3087,6 @@ public:
// This is the container for the immediate that we will create the constant
// pool from
addExpr(Inst, getConstantPoolImm());
- return;
}
void addMemTBBOperands(MCInst &Inst, unsigned N) const {
@@ -6240,10 +6239,9 @@ bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
StringRef IDVal = Parser.getTok().getIdentifier();
const auto &Prefix =
- std::find_if(std::begin(PrefixEntries), std::end(PrefixEntries),
- [&IDVal](const PrefixEntry &PE) {
- return PE.Spelling == IDVal;
- });
+ llvm::find_if(PrefixEntries, [&IDVal](const PrefixEntry &PE) {
+ return PE.Spelling == IDVal;
+ });
if (Prefix == std::end(PrefixEntries)) {
Error(Parser.getTok().getLoc(), "unexpected prefix in operand");
return true;
@@ -10309,11 +10307,14 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
!HasWideQualifier) {
// The operands aren't the same for tMOV[S]r... (no cc_out)
MCInst TmpInst;
- TmpInst.setOpcode(Inst.getOperand(4).getReg() ? ARM::tMOVSr : ARM::tMOVr);
+ unsigned Op = Inst.getOperand(4).getReg() ? ARM::tMOVSr : ARM::tMOVr;
+ TmpInst.setOpcode(Op);
TmpInst.addOperand(Inst.getOperand(0));
TmpInst.addOperand(Inst.getOperand(1));
- TmpInst.addOperand(Inst.getOperand(2));
- TmpInst.addOperand(Inst.getOperand(3));
+ if (Op == ARM::tMOVr) {
+ TmpInst.addOperand(Inst.getOperand(2));
+ TmpInst.addOperand(Inst.getOperand(3));
+ }
Inst = TmpInst;
return true;
}
@@ -10598,6 +10599,12 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
(isThumb() && !hasV8Ops()))
return Match_InvalidOperand;
break;
+ case ARM::t2TBB:
+ case ARM::t2TBH:
+ // Rn = sp is only allowed with ARMv8-A
+ if (!hasV8Ops() && (Inst.getOperand(0).getReg() == ARM::SP))
+ return Match_RequiresV8;
+ break;
default:
break;
}
@@ -11128,7 +11135,8 @@ bool ARMAsmParser::parseDirectiveArch(SMLoc L) {
bool WasThumb = isThumb();
Triple T;
MCSubtargetInfo &STI = copySTI();
- STI.setDefaultFeatures("", ("+" + ARM::getArchName(ID)).str());
+ STI.setDefaultFeatures("", /*TuneCPU*/ "",
+ ("+" + ARM::getArchName(ID)).str());
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
FixModeAfterArchChange(WasThumb, L);
@@ -11241,7 +11249,7 @@ bool ARMAsmParser::parseDirectiveCPU(SMLoc L) {
bool WasThumb = isThumb();
MCSubtargetInfo &STI = copySTI();
- STI.setDefaultFeatures(CPU, "");
+ STI.setDefaultFeatures(CPU, /*TuneCPU*/ CPU, "");
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
FixModeAfterArchChange(WasThumb, L);
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 54ff0d9966cb..8ea323a9ced5 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -860,7 +860,8 @@ ARMDisassembler::AddThumbPredicate(MCInst &MI) const {
VCCPos + 2, MCOI::TIED_TO);
assert(TiedOp >= 0 &&
"Inactive register in vpred_r is not tied to an output!");
- MI.insert(VCCI, MI.getOperand(TiedOp));
+ // Copy the operand to ensure it's not invalidated when MI grows.
+ MI.insert(VCCI, MCOperand(MI.getOperand(TiedOp)));
}
} else if (VCC != ARMVCC::None) {
Check(S, SoftFail);
@@ -4529,12 +4530,14 @@ static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Val,
static DecodeStatus
DecodeThumbTableBranch(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
+ const FeatureBitset &FeatureBits =
+ ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
DecodeStatus S = MCDisassembler::Success;
unsigned Rn = fieldFromInstruction(Insn, 16, 4);
unsigned Rm = fieldFromInstruction(Insn, 0, 4);
- if (Rn == ARM::SP) S = MCDisassembler::SoftFail;
+ if (Rn == 13 && !FeatureBits[ARM::HasV8Ops]) S = MCDisassembler::SoftFail;
if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
return MCDisassembler::Fail;
if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
index 24a9fabf0979..8459b4ff2a14 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
@@ -205,6 +205,20 @@ namespace ARM_AM {
return V;
}
+ /// isSOImmTwoPartValNeg - Return true if the specified value can be obtained
+ /// by two SOImmVals, such that -V = First + Second.
+ /// "R+V" can be optimized to (sub (sub R, First), Second).
+ /// "R=V" can be optimized to (sub (mvn R, ~(-First)), Second).
+ inline bool isSOImmTwoPartValNeg(unsigned V) {
+ unsigned First;
+ if (!isSOImmTwoPartVal(-V))
+ return false;
+ // Return false if ~(-First) is not a SOImmVal.
+ First = getSOImmTwoPartFirst(-V);
+ First = ~(-First);
+ return !(rotr32(~255U, getSOImmValRotate(First)) & First);
+ }
+
/// getThumbImmValShift - Try to handle Imm with a 8-bit immediate followed
/// by a left shift. Returns the shift amount to use.
inline unsigned getThumbImmValShift(unsigned Imm) {
@@ -673,6 +687,18 @@ namespace ARM_AM {
return getFP16Imm(FPImm.bitcastToAPInt());
}
+ /// If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding
+ /// for it. Otherwise return -1 like getFP16Imm.
+ inline int getFP32FP16Imm(const APInt &Imm) {
+ if (Imm.getActiveBits() > 16)
+ return -1;
+ return ARM_AM::getFP16Imm(Imm.trunc(16));
+ }
+
+ inline int getFP32FP16Imm(const APFloat &FPImm) {
+ return getFP32FP16Imm(FPImm.bitcastToAPInt());
+ }
+
/// getFP32Imm - Return an 8-bit floating-point version of the 32-bit
/// floating-point value. If the value cannot be represented as an 8-bit
/// floating-point value, then return -1.
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index 9ad595c016c4..b02aef3c338b 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -1010,6 +1010,7 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) {
case ARM::fixup_t2_condbranch:
case ARM::fixup_t2_uncondbranch:
case ARM::fixup_t2_pcrel_10:
+ case ARM::fixup_t2_pcrel_9:
case ARM::fixup_t2_adr_pcrel_12:
case ARM::fixup_arm_thumb_bl:
case ARM::fixup_arm_thumb_blx:
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index 74cd2e681ded..ecd96114e8a4 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -254,7 +254,7 @@ namespace ARMII {
MO_OPTION_MASK = 0x3,
/// MO_COFFSTUB - On a symbol operand "FOO", this indicates that the
- /// reference is actually to the ".refptrp.FOO" symbol. This is used for
+ /// reference is actually to the ".refptr.FOO" symbol. This is used for
/// stub symbols on windows.
MO_COFFSTUB = 0x4,
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 876741d6c343..07ca5c29f0ec 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -644,7 +644,6 @@ private:
Symbol->setType(ELF::STT_NOTYPE);
Symbol->setBinding(ELF::STB_LOCAL);
- Symbol->setExternal(false);
}
void EmitMappingSymbol(StringRef Name, SMLoc Loc, MCFragment *F,
@@ -654,7 +653,6 @@ private:
emitLabelAtPos(Symbol, Loc, F, Offset);
Symbol->setType(ELF::STT_NOTYPE);
Symbol->setBinding(ELF::STB_LOCAL);
- Symbol->setExternal(false);
}
void emitThumbFunc(MCSymbol *Func) override {
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
index 37cb731ff001..d975d799e079 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
@@ -30,6 +30,7 @@ public:
void printRegName(raw_ostream &OS, unsigned RegNo) const override;
// Autogenerated by tblgen.
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
void printInstruction(const MCInst *MI, uint64_t Address,
const MCSubtargetInfo &STI, raw_ostream &O);
virtual bool printAliasInstr(const MCInst *MI, uint64_t Address,
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index 765613cf347d..40e8e244e312 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -87,6 +87,7 @@ void ARMCOFFMCAsmInfoMicrosoft::anchor() { }
ARMCOFFMCAsmInfoMicrosoft::ARMCOFFMCAsmInfoMicrosoft() {
AlignmentIsInBytes = false;
+ SupportsDebugInformation = true;
ExceptionsType = ExceptionHandling::WinEH;
PrivateGlobalPrefix = "$M";
PrivateLabelPrefix = "$M";
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 05d73ccf6ff2..774f2507b8d2 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -11,11 +11,13 @@
//===----------------------------------------------------------------------===//
#include "ARMMCTargetDesc.h"
+#include "ARMAddressingModes.h"
#include "ARMBaseInfo.h"
#include "ARMInstPrinter.h"
#include "ARMMCAsmInfo.h"
#include "TargetInfo/ARMTargetInfo.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCELFStreamer.h"
@@ -180,6 +182,23 @@ std::string ARM_MC::ParseARMTriple(const Triple &TT, StringRef CPU) {
return ARMArchFeature;
}
+bool ARM_MC::isPredicated(const MCInst &MI, const MCInstrInfo *MCII) {
+ const MCInstrDesc &Desc = MCII->get(MI.getOpcode());
+ int PredOpIdx = Desc.findFirstPredOperandIdx();
+ return PredOpIdx != -1 && MI.getOperand(PredOpIdx).getImm() != ARMCC::AL;
+}
+
+bool ARM_MC::isCPSRDefined(const MCInst &MI, const MCInstrInfo *MCII) {
+ const MCInstrDesc &Desc = MCII->get(MI.getOpcode());
+ for (unsigned I = 0; I < MI.getNumOperands(); ++I) {
+ const MCOperand &MO = MI.getOperand(I);
+ if (MO.isReg() && MO.getReg() == ARM::CPSR &&
+ Desc.OpInfo[I].isOptionalDef())
+ return true;
+ }
+ return false;
+}
+
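A short usage sketch for the two new MC-layer helpers; the wrapper and the names MI/MCII are placeholders, not part of the patch. The point is that a caller holding only an MCInst plus the target's MCInstrInfo can query predication and flag-setting without any CodeGen-level classes.

    // Fragment assuming an MC-layer caller that already has the instruction and
    // the target's MCInstrInfo.
    bool isPlainConditionalInst(const llvm::MCInst &MI,
                                const llvm::MCInstrInfo *MCII) {
      bool Conditional = llvm::ARM_MC::isPredicated(MI, MCII); // predicate != AL
      bool SetsFlags = llvm::ARM_MC::isCPSRDefined(MI, MCII);  // optional-def CPSR
      return Conditional && !SetsFlags;
    }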
MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(const Triple &TT,
StringRef CPU, StringRef FS) {
std::string ArchFS = ARM_MC::ParseARMTriple(TT, CPU);
@@ -190,7 +209,7 @@ MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(const Triple &TT,
ArchFS = std::string(FS);
}
- return createARMMCSubtargetInfoImpl(TT, CPU, ArchFS);
+ return createARMMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, ArchFS);
}
static MCInstrInfo *createARMMCInstrInfo() {
@@ -199,9 +218,120 @@ static MCInstrInfo *createARMMCInstrInfo() {
return X;
}
+void ARM_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) {
+ // Mapping from CodeView to MC register id.
+ static const struct {
+ codeview::RegisterId CVReg;
+ MCPhysReg Reg;
+ } RegMap[] = {
+ {codeview::RegisterId::ARM_R0, ARM::R0},
+ {codeview::RegisterId::ARM_R1, ARM::R1},
+ {codeview::RegisterId::ARM_R2, ARM::R2},
+ {codeview::RegisterId::ARM_R3, ARM::R3},
+ {codeview::RegisterId::ARM_R4, ARM::R4},
+ {codeview::RegisterId::ARM_R5, ARM::R5},
+ {codeview::RegisterId::ARM_R6, ARM::R6},
+ {codeview::RegisterId::ARM_R7, ARM::R7},
+ {codeview::RegisterId::ARM_R8, ARM::R8},
+ {codeview::RegisterId::ARM_R9, ARM::R9},
+ {codeview::RegisterId::ARM_R10, ARM::R10},
+ {codeview::RegisterId::ARM_R11, ARM::R11},
+ {codeview::RegisterId::ARM_R12, ARM::R12},
+ {codeview::RegisterId::ARM_SP, ARM::SP},
+ {codeview::RegisterId::ARM_LR, ARM::LR},
+ {codeview::RegisterId::ARM_PC, ARM::PC},
+ {codeview::RegisterId::ARM_CPSR, ARM::CPSR},
+ {codeview::RegisterId::ARM_FPSCR, ARM::FPSCR},
+ {codeview::RegisterId::ARM_FPEXC, ARM::FPEXC},
+ {codeview::RegisterId::ARM_FS0, ARM::S0},
+ {codeview::RegisterId::ARM_FS1, ARM::S1},
+ {codeview::RegisterId::ARM_FS2, ARM::S2},
+ {codeview::RegisterId::ARM_FS3, ARM::S3},
+ {codeview::RegisterId::ARM_FS4, ARM::S4},
+ {codeview::RegisterId::ARM_FS5, ARM::S5},
+ {codeview::RegisterId::ARM_FS6, ARM::S6},
+ {codeview::RegisterId::ARM_FS7, ARM::S7},
+ {codeview::RegisterId::ARM_FS8, ARM::S8},
+ {codeview::RegisterId::ARM_FS9, ARM::S9},
+ {codeview::RegisterId::ARM_FS10, ARM::S10},
+ {codeview::RegisterId::ARM_FS11, ARM::S11},
+ {codeview::RegisterId::ARM_FS12, ARM::S12},
+ {codeview::RegisterId::ARM_FS13, ARM::S13},
+ {codeview::RegisterId::ARM_FS14, ARM::S14},
+ {codeview::RegisterId::ARM_FS15, ARM::S15},
+ {codeview::RegisterId::ARM_FS16, ARM::S16},
+ {codeview::RegisterId::ARM_FS17, ARM::S17},
+ {codeview::RegisterId::ARM_FS18, ARM::S18},
+ {codeview::RegisterId::ARM_FS19, ARM::S19},
+ {codeview::RegisterId::ARM_FS20, ARM::S20},
+ {codeview::RegisterId::ARM_FS21, ARM::S21},
+ {codeview::RegisterId::ARM_FS22, ARM::S22},
+ {codeview::RegisterId::ARM_FS23, ARM::S23},
+ {codeview::RegisterId::ARM_FS24, ARM::S24},
+ {codeview::RegisterId::ARM_FS25, ARM::S25},
+ {codeview::RegisterId::ARM_FS26, ARM::S26},
+ {codeview::RegisterId::ARM_FS27, ARM::S27},
+ {codeview::RegisterId::ARM_FS28, ARM::S28},
+ {codeview::RegisterId::ARM_FS29, ARM::S29},
+ {codeview::RegisterId::ARM_FS30, ARM::S30},
+ {codeview::RegisterId::ARM_FS31, ARM::S31},
+ {codeview::RegisterId::ARM_ND0, ARM::D0},
+ {codeview::RegisterId::ARM_ND1, ARM::D1},
+ {codeview::RegisterId::ARM_ND2, ARM::D2},
+ {codeview::RegisterId::ARM_ND3, ARM::D3},
+ {codeview::RegisterId::ARM_ND4, ARM::D4},
+ {codeview::RegisterId::ARM_ND5, ARM::D5},
+ {codeview::RegisterId::ARM_ND6, ARM::D6},
+ {codeview::RegisterId::ARM_ND7, ARM::D7},
+ {codeview::RegisterId::ARM_ND8, ARM::D8},
+ {codeview::RegisterId::ARM_ND9, ARM::D9},
+ {codeview::RegisterId::ARM_ND10, ARM::D10},
+ {codeview::RegisterId::ARM_ND11, ARM::D11},
+ {codeview::RegisterId::ARM_ND12, ARM::D12},
+ {codeview::RegisterId::ARM_ND13, ARM::D13},
+ {codeview::RegisterId::ARM_ND14, ARM::D14},
+ {codeview::RegisterId::ARM_ND15, ARM::D15},
+ {codeview::RegisterId::ARM_ND16, ARM::D16},
+ {codeview::RegisterId::ARM_ND17, ARM::D17},
+ {codeview::RegisterId::ARM_ND18, ARM::D18},
+ {codeview::RegisterId::ARM_ND19, ARM::D19},
+ {codeview::RegisterId::ARM_ND20, ARM::D20},
+ {codeview::RegisterId::ARM_ND21, ARM::D21},
+ {codeview::RegisterId::ARM_ND22, ARM::D22},
+ {codeview::RegisterId::ARM_ND23, ARM::D23},
+ {codeview::RegisterId::ARM_ND24, ARM::D24},
+ {codeview::RegisterId::ARM_ND25, ARM::D25},
+ {codeview::RegisterId::ARM_ND26, ARM::D26},
+ {codeview::RegisterId::ARM_ND27, ARM::D27},
+ {codeview::RegisterId::ARM_ND28, ARM::D28},
+ {codeview::RegisterId::ARM_ND29, ARM::D29},
+ {codeview::RegisterId::ARM_ND30, ARM::D30},
+ {codeview::RegisterId::ARM_ND31, ARM::D31},
+ {codeview::RegisterId::ARM_NQ0, ARM::Q0},
+ {codeview::RegisterId::ARM_NQ1, ARM::Q1},
+ {codeview::RegisterId::ARM_NQ2, ARM::Q2},
+ {codeview::RegisterId::ARM_NQ3, ARM::Q3},
+ {codeview::RegisterId::ARM_NQ4, ARM::Q4},
+ {codeview::RegisterId::ARM_NQ5, ARM::Q5},
+ {codeview::RegisterId::ARM_NQ6, ARM::Q6},
+ {codeview::RegisterId::ARM_NQ7, ARM::Q7},
+ {codeview::RegisterId::ARM_NQ8, ARM::Q8},
+ {codeview::RegisterId::ARM_NQ9, ARM::Q9},
+ {codeview::RegisterId::ARM_NQ10, ARM::Q10},
+ {codeview::RegisterId::ARM_NQ11, ARM::Q11},
+ {codeview::RegisterId::ARM_NQ12, ARM::Q12},
+ {codeview::RegisterId::ARM_NQ13, ARM::Q13},
+ {codeview::RegisterId::ARM_NQ14, ARM::Q14},
+ {codeview::RegisterId::ARM_NQ15, ARM::Q15},
+ };
+ for (unsigned I = 0; I < array_lengthof(RegMap); ++I)
+ MRI->mapLLVMRegToCVReg(RegMap[I].Reg, static_cast<int>(RegMap[I].CVReg));
+}
+
static MCRegisterInfo *createARMMCRegisterInfo(const Triple &Triple) {
MCRegisterInfo *X = new MCRegisterInfo();
InitARMMCRegisterInfo(X, ARM::LR, 0, 0, ARM::PC);
+ ARM_MC::initLLVMToCVRegMapping(X);
return X;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index 7cfe6881b456..5a0874f0ef1f 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -41,6 +41,21 @@ class raw_pwrite_stream;
namespace ARM_MC {
std::string ParseARMTriple(const Triple &TT, StringRef CPU);
+void initLLVMToCVRegMapping(MCRegisterInfo *MRI);
+
+bool isPredicated(const MCInst &MI, const MCInstrInfo *MCII);
+bool isCPSRDefined(const MCInst &MI, const MCInstrInfo *MCII);
+
+template<class Inst>
+bool isLDMBaseRegInList(const Inst &MI) {
+ auto BaseReg = MI.getOperand(0).getReg();
+ for (unsigned I = 1, E = MI.getNumOperands(); I < E; ++I) {
+ const auto &Op = MI.getOperand(I);
+ if (Op.isReg() && Op.getReg() == BaseReg)
+ return true;
+ }
+ return false;
+}
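A standalone illustration (mock types, not LLVM's) of why the helper above is a template: it only needs getNumOperands()/getOperand() and register accessors, so one definition serves both MachineInstr and MCInst.

    #include <vector>

    struct MockOp {
      unsigned Reg = 0;
      bool isReg() const { return true; }
      unsigned getReg() const { return Reg; }
    };
    struct MockInst {
      std::vector<MockOp> Ops;
      unsigned getNumOperands() const { return static_cast<unsigned>(Ops.size()); }
      const MockOp &getOperand(unsigned I) const { return Ops[I]; }
    };

    // With operand 0 as the base register, isLDMBaseRegInList would return true
    // for {r1, r2, r1} (the base re-appears in the register list) and false for
    // {r1, r2, r3}.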
/// Create a ARM MCSubtargetInfo instance. This is exposed so Asm parser, etc.
/// do not need to go through TargetRegistry.
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
index 4d7ad6cd60cb..81f113b8302f 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -44,10 +44,10 @@
using namespace llvm;
-#define DEBUG_TYPE "mve-gather-scatter-lowering"
+#define DEBUG_TYPE "arm-mve-gather-scatter-lowering"
cl::opt<bool> EnableMaskedGatherScatters(
- "enable-arm-maskedgatscat", cl::Hidden, cl::init(false),
+ "enable-arm-maskedgatscat", cl::Hidden, cl::init(true),
cl::desc("Enable the generation of masked gathers and scatters"));
namespace {
@@ -84,7 +84,7 @@ private:
// Check for a getelementptr and deduce base and offsets from it, on success
// returning the base directly and the offsets indirectly using the Offsets
// argument
- Value *checkGEP(Value *&Offsets, Type *Ty, GetElementPtrInst *GEP,
+ Value *checkGEP(Value *&Offsets, FixedVectorType *Ty, GetElementPtrInst *GEP,
IRBuilder<> &Builder);
// Compute the scale of this gather/scatter instruction
int computeScale(unsigned GEPElemSize, unsigned MemoryElemSize);
@@ -132,6 +132,11 @@ private:
Value *tryCreateIncrementingWBGatScat(IntrinsicInst *I, Value *BasePtr,
Value *Ptr, unsigned TypeScale,
IRBuilder<> &Builder);
+
+ // Optimise the base and offsets of the given address
+ bool optimiseAddress(Value *Address, BasicBlock *BB, LoopInfo *LI);
+ // Try to fold consecutive geps together into one
+ Value *foldGEP(GetElementPtrInst *GEP, Value *&Offsets, IRBuilder<> &Builder);
// Check whether these offsets could be moved out of the loop they're in
bool optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI);
// Pushes the given add out of the loop
@@ -167,7 +172,49 @@ bool MVEGatherScatterLowering::isLegalTypeAndAlignment(unsigned NumElements,
return false;
}
-Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty,
+static bool checkOffsetSize(Value *Offsets, unsigned TargetElemCount) {
+ // Offsets that are not of type <N x i32> are sign extended by the
+ // getelementptr instruction, and MVE gathers/scatters treat the offset as
+ // unsigned. Thus, if the element size is smaller than 32, we can only allow
+ // non-negative offsets - i.e., the offsets are not allowed to be variables we
+ // can't look into.
+ // Additionally, <N x i32> offsets have to either originate from a zext of a
+ // vector with element types smaller or equal the type of the gather we're
+ // looking at, or consist of constants that we can check are small enough
+ // to fit into the gather type.
+ // Thus we check that 0 <= value < 2^TargetElemSize.
+ unsigned TargetElemSize = 128 / TargetElemCount;
+ unsigned OffsetElemSize = cast<FixedVectorType>(Offsets->getType())
+ ->getElementType()
+ ->getScalarSizeInBits();
+ if (OffsetElemSize != TargetElemSize || OffsetElemSize != 32) {
+ Constant *ConstOff = dyn_cast<Constant>(Offsets);
+ if (!ConstOff)
+ return false;
+ int64_t TargetElemMaxSize = (1ULL << TargetElemSize);
+ auto CheckValueSize = [TargetElemMaxSize](Value *OffsetElem) {
+ ConstantInt *OConst = dyn_cast<ConstantInt>(OffsetElem);
+ if (!OConst)
+ return false;
+ int SExtValue = OConst->getSExtValue();
+ if (SExtValue >= TargetElemMaxSize || SExtValue < 0)
+ return false;
+ return true;
+ };
+ if (isa<FixedVectorType>(ConstOff->getType())) {
+ for (unsigned i = 0; i < TargetElemCount; i++) {
+ if (!CheckValueSize(ConstOff->getAggregateElement(i)))
+ return false;
+ }
+ } else {
+ if (!CheckValueSize(ConstOff))
+ return false;
+ }
+ }
+ return true;
+}
+
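A standalone sketch of the bound checkOffsetSize enforces, simplified to one constant lane at a time (the helper name and the example values are illustrative only): a 128-bit MVE gather with N lanes uses 128/N-bit offsets, and a constant offset is acceptable only if it is non-negative and fits in that width.

    #include <cstdint>

    static bool offsetFitsLane(int64_t Offset, unsigned NumLanes) {
      unsigned ElemBits = 128 / NumLanes;      // 32, 16 or 8 for MVE vectors
      int64_t Limit = int64_t(1) << ElemBits;  // 2^ElemBits
      return Offset >= 0 && Offset < Limit;
    }

    // offsetFitsLane(70000, 8) == false: 70000 does not fit in 16 bits.
    // offsetFitsLane(70000, 4) == true:  a 32-bit lane can hold it.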
+Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, FixedVectorType *Ty,
GetElementPtrInst *GEP,
IRBuilder<> &Builder) {
if (!GEP) {
@@ -178,40 +225,43 @@ Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty,
LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementpointer found."
<< " Looking at intrinsic for base + vector of offsets\n");
Value *GEPPtr = GEP->getPointerOperand();
- if (GEPPtr->getType()->isVectorTy()) {
+ Offsets = GEP->getOperand(1);
+ if (GEPPtr->getType()->isVectorTy() ||
+ !isa<FixedVectorType>(Offsets->getType()))
return nullptr;
- }
+
if (GEP->getNumOperands() != 2) {
LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementptr with too many"
<< " operands. Expanding.\n");
return nullptr;
}
Offsets = GEP->getOperand(1);
+ unsigned OffsetsElemCount =
+ cast<FixedVectorType>(Offsets->getType())->getNumElements();
// Paranoid check whether the number of parallel lanes is the same
- assert(cast<FixedVectorType>(Ty)->getNumElements() ==
- cast<FixedVectorType>(Offsets->getType())->getNumElements());
- // Only <N x i32> offsets can be integrated into an arm gather, any smaller
- // type would have to be sign extended by the gep - and arm gathers can only
- // zero extend. Additionally, the offsets do have to originate from a zext of
- // a vector with element types smaller or equal the type of the gather we're
- // looking at
- if (Offsets->getType()->getScalarSizeInBits() != 32)
- return nullptr;
- if (ZExtInst *ZextOffs = dyn_cast<ZExtInst>(Offsets))
+ assert(Ty->getNumElements() == OffsetsElemCount);
+
+ ZExtInst *ZextOffs = dyn_cast<ZExtInst>(Offsets);
+ if (ZextOffs)
Offsets = ZextOffs->getOperand(0);
- else if (!(cast<FixedVectorType>(Offsets->getType())->getNumElements() == 4 &&
- Offsets->getType()->getScalarSizeInBits() == 32))
- return nullptr;
+ FixedVectorType *OffsetType = cast<FixedVectorType>(Offsets->getType());
+
+ // If the offsets are already being zext-ed to <N x i32>, that relieves us of
+ // having to make sure that they won't overflow.
+ if (!ZextOffs || cast<FixedVectorType>(ZextOffs->getDestTy())
+ ->getElementType()
+ ->getScalarSizeInBits() != 32)
+ if (!checkOffsetSize(Offsets, OffsetsElemCount))
+ return nullptr;
+ // The offset sizes have been checked; if any truncating or zext-ing is
+ // required to fix them, do that now
if (Ty != Offsets->getType()) {
- if ((Ty->getScalarSizeInBits() <
- Offsets->getType()->getScalarSizeInBits())) {
- LLVM_DEBUG(dbgs() << "masked gathers/scatters: no correct offset type."
- << " Can't create intrinsic.\n");
- return nullptr;
+ if ((Ty->getElementType()->getScalarSizeInBits() <
+ OffsetType->getElementType()->getScalarSizeInBits())) {
+ Offsets = Builder.CreateTrunc(Offsets, Ty);
} else {
- Offsets = Builder.CreateZExt(
- Offsets, VectorType::getInteger(cast<VectorType>(Ty)));
+ Offsets = Builder.CreateZExt(Offsets, VectorType::getInteger(Ty));
}
}
// If none of the checks failed, return the gep's base pointer
@@ -426,7 +476,8 @@ Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
Value *Offsets;
- Value *BasePtr = checkGEP(Offsets, ResultTy, GEP, Builder);
+ Value *BasePtr =
+ checkGEP(Offsets, cast<FixedVectorType>(ResultTy), GEP, Builder);
if (!BasePtr)
return nullptr;
// Check whether the offset is a constant increment that could be merged into
@@ -566,7 +617,8 @@ Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
Value *Offsets;
- Value *BasePtr = checkGEP(Offsets, InputTy, GEP, Builder);
+ Value *BasePtr =
+ checkGEP(Offsets, cast<FixedVectorType>(InputTy), GEP, Builder);
if (!BasePtr)
return nullptr;
// Check whether the offset is a constant increment that could be merged into
@@ -801,7 +853,6 @@ void MVEGatherScatterLowering::pushOutMul(PHINode *&Phi,
Phi->addIncoming(NewIncrement, Phi->getIncomingBlock(LoopIncrement));
Phi->removeIncomingValue((unsigned)0);
Phi->removeIncomingValue((unsigned)0);
- return;
}
// Check whether all usages of this instruction are as offsets of
@@ -887,11 +938,10 @@ bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB,
return false;
// The phi must be an induction variable
- Instruction *Op;
int IncrementingBlock = -1;
for (int i = 0; i < 2; i++)
- if ((Op = dyn_cast<Instruction>(Phi->getIncomingValue(i))) != nullptr)
+ if (auto *Op = dyn_cast<Instruction>(Phi->getIncomingValue(i)))
if (Op->getOpcode() == Instruction::Add &&
(Op->getOperand(0) == Phi || Op->getOperand(1) == Phi))
IncrementingBlock = i;
@@ -978,6 +1028,128 @@ bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB,
return true;
}
+static Value *CheckAndCreateOffsetAdd(Value *X, Value *Y, Value *GEP,
+ IRBuilder<> &Builder) {
+ // Splat the non-vector value to a vector of the given type - if the value is
+ // a constant (and its value isn't too big), we can even use this opportunity
+ // to scale it to the size of the vector elements
+ auto FixSummands = [&Builder](FixedVectorType *&VT, Value *&NonVectorVal) {
+ ConstantInt *Const;
+ if ((Const = dyn_cast<ConstantInt>(NonVectorVal)) &&
+ VT->getElementType() != NonVectorVal->getType()) {
+ unsigned TargetElemSize = VT->getElementType()->getPrimitiveSizeInBits();
+ uint64_t N = Const->getZExtValue();
+ if (N < (unsigned)(1 << (TargetElemSize - 1))) {
+ NonVectorVal = Builder.CreateVectorSplat(
+ VT->getNumElements(), Builder.getIntN(TargetElemSize, N));
+ return;
+ }
+ }
+ NonVectorVal =
+ Builder.CreateVectorSplat(VT->getNumElements(), NonVectorVal);
+ };
+
+ FixedVectorType *XElType = dyn_cast<FixedVectorType>(X->getType());
+ FixedVectorType *YElType = dyn_cast<FixedVectorType>(Y->getType());
+ // If one of X, Y is not a vector, we have to splat it in order
+ // to add the two of them.
+ if (XElType && !YElType) {
+ FixSummands(XElType, Y);
+ YElType = cast<FixedVectorType>(Y->getType());
+ } else if (YElType && !XElType) {
+ FixSummands(YElType, X);
+ XElType = cast<FixedVectorType>(X->getType());
+ }
+ assert(XElType && YElType && "Unknown vector types");
+ // Check that the summands are of compatible types
+ if (XElType != YElType) {
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: incompatible gep offsets\n");
+ return nullptr;
+ }
+
+ if (XElType->getElementType()->getScalarSizeInBits() != 32) {
+ // Check that by adding the vectors we do not accidentally
+ // create an overflow
+ Constant *ConstX = dyn_cast<Constant>(X);
+ Constant *ConstY = dyn_cast<Constant>(Y);
+ if (!ConstX || !ConstY)
+ return nullptr;
+ unsigned TargetElemSize = 128 / XElType->getNumElements();
+ for (unsigned i = 0; i < XElType->getNumElements(); i++) {
+ ConstantInt *ConstXEl =
+ dyn_cast<ConstantInt>(ConstX->getAggregateElement(i));
+ ConstantInt *ConstYEl =
+ dyn_cast<ConstantInt>(ConstY->getAggregateElement(i));
+ if (!ConstXEl || !ConstYEl ||
+ ConstXEl->getZExtValue() + ConstYEl->getZExtValue() >=
+ (unsigned)(1 << (TargetElemSize - 1)))
+ return nullptr;
+ }
+ }
+
+ Value *Add = Builder.CreateAdd(X, Y);
+
+ FixedVectorType *GEPType = cast<FixedVectorType>(GEP->getType());
+ if (checkOffsetSize(Add, GEPType->getNumElements()))
+ return Add;
+ else
+ return nullptr;
+}
+
+Value *MVEGatherScatterLowering::foldGEP(GetElementPtrInst *GEP,
+ Value *&Offsets,
+ IRBuilder<> &Builder) {
+ Value *GEPPtr = GEP->getPointerOperand();
+ Offsets = GEP->getOperand(1);
+ // We only merge geps with constant offsets, because only for those can we
+ // make sure that we do not cause an overflow.
+ if (!isa<Constant>(Offsets))
+ return nullptr;
+ GetElementPtrInst *BaseGEP;
+ if ((BaseGEP = dyn_cast<GetElementPtrInst>(GEPPtr))) {
+ // Merge the two geps into one
+ Value *BaseBasePtr = foldGEP(BaseGEP, Offsets, Builder);
+ if (!BaseBasePtr)
+ return nullptr;
+ Offsets =
+ CheckAndCreateOffsetAdd(Offsets, GEP->getOperand(1), GEP, Builder);
+ if (Offsets == nullptr)
+ return nullptr;
+ return BaseBasePtr;
+ }
+ return GEPPtr;
+}
+
+bool MVEGatherScatterLowering::optimiseAddress(Value *Address, BasicBlock *BB,
+ LoopInfo *LI) {
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Address);
+ if (!GEP)
+ return false;
+ bool Changed = false;
+ if (GEP->hasOneUse() &&
+ dyn_cast<GetElementPtrInst>(GEP->getPointerOperand())) {
+ IRBuilder<> Builder(GEP->getContext());
+ Builder.SetInsertPoint(GEP);
+ Builder.SetCurrentDebugLocation(GEP->getDebugLoc());
+ Value *Offsets;
+ Value *Base = foldGEP(GEP, Offsets, Builder);
+ // We only want to merge the geps if there is a real chance that they can be
+ // used by an MVE gather; thus the offset has to have the correct size
+ // (always i32 if it is not of vector type) and the base has to be a
+ // pointer.
+ if (Offsets && Base && Base != GEP) {
+ PointerType *BaseType = cast<PointerType>(Base->getType());
+ GetElementPtrInst *NewAddress = GetElementPtrInst::Create(
+ BaseType->getPointerElementType(), Base, Offsets, "gep.merged", GEP);
+ GEP->replaceAllUsesWith(NewAddress);
+ GEP = NewAddress;
+ Changed = true;
+ }
+ }
+ Changed |= optimiseOffsets(GEP->getOperand(1), GEP->getParent(), LI);
+ return Changed;
+}
+
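A standalone model (simplified: lane values are passed as plain integers rather than Constant vectors) of the per-lane guard in CheckAndCreateOffsetAdd above: when the folded offsets are narrower than i32, both summands must be constants and every lane-wise sum has to stay below 2^(ElemBits-1), otherwise the fold is rejected.

    #include <cstdint>
    #include <vector>

    static bool lanewiseSumsFit(const std::vector<uint64_t> &X,
                                const std::vector<uint64_t> &Y,
                                unsigned NumLanes) {
      unsigned ElemBits = 128 / NumLanes;              // element width in bits
      uint64_t Limit = uint64_t(1) << (ElemBits - 1);  // 2^(ElemBits-1)
      for (unsigned I = 0; I < NumLanes; ++I)
        if (X[I] + Y[I] >= Limit)
          return false;                                // would overflow the lane
      return true;
    }

    // Example: for eight 16-bit lanes the per-lane limit is 2^15 = 32768, so
    // lanewiseSumsFit(X, Y, 8) rejects any pair of lanes whose sum reaches it.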
bool MVEGatherScatterLowering::runOnFunction(Function &F) {
if (!EnableMaskedGatherScatters)
return false;
@@ -995,22 +1167,17 @@ bool MVEGatherScatterLowering::runOnFunction(Function &F) {
for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
- if (II && II->getIntrinsicID() == Intrinsic::masked_gather) {
+ if (II && II->getIntrinsicID() == Intrinsic::masked_gather &&
+ isa<FixedVectorType>(II->getType())) {
Gathers.push_back(II);
- if (isa<GetElementPtrInst>(II->getArgOperand(0)))
- Changed |= optimiseOffsets(
- cast<Instruction>(II->getArgOperand(0))->getOperand(1),
- II->getParent(), LI);
- } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter) {
+ Changed |= optimiseAddress(II->getArgOperand(0), II->getParent(), LI);
+ } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter &&
+ isa<FixedVectorType>(II->getArgOperand(0)->getType())) {
Scatters.push_back(II);
- if (isa<GetElementPtrInst>(II->getArgOperand(1)))
- Changed |= optimiseOffsets(
- cast<Instruction>(II->getArgOperand(1))->getOperand(1),
- II->getParent(), LI);
+ Changed |= optimiseAddress(II->getArgOperand(1), II->getParent(), LI);
}
}
}
-
for (unsigned i = 0; i < Gathers.size(); i++) {
IntrinsicInst *I = Gathers[i];
Value *L = lowerGather(I);
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredUtils.h b/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredUtils.h
new file mode 100644
index 000000000000..9ab5d92729fe
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredUtils.h
@@ -0,0 +1,157 @@
+//===-- MVETailPredUtils.h - Tail predication utility functions -*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains utility functions for low overhead and tail predicated
+// loops, shared between the ARMLowOverheadLoops pass and anywhere else that
+// needs them.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_MVETAILPREDUTILS_H
+#define LLVM_LIB_TARGET_ARM_MVETAILPREDUTILS_H
+
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+
+namespace llvm {
+
+static inline unsigned VCTPOpcodeToLSTP(unsigned Opcode, bool IsDoLoop) {
+ switch (Opcode) {
+ default:
+ llvm_unreachable("unhandled vctp opcode");
+ break;
+ case ARM::MVE_VCTP8:
+ return IsDoLoop ? ARM::MVE_DLSTP_8 : ARM::MVE_WLSTP_8;
+ case ARM::MVE_VCTP16:
+ return IsDoLoop ? ARM::MVE_DLSTP_16 : ARM::MVE_WLSTP_16;
+ case ARM::MVE_VCTP32:
+ return IsDoLoop ? ARM::MVE_DLSTP_32 : ARM::MVE_WLSTP_32;
+ case ARM::MVE_VCTP64:
+ return IsDoLoop ? ARM::MVE_DLSTP_64 : ARM::MVE_WLSTP_64;
+ }
+ return 0;
+}
+
+static inline unsigned getTailPredVectorWidth(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ llvm_unreachable("unhandled vctp opcode");
+ case ARM::MVE_VCTP8:
+ return 16;
+ case ARM::MVE_VCTP16:
+ return 8;
+ case ARM::MVE_VCTP32:
+ return 4;
+ case ARM::MVE_VCTP64:
+ return 2;
+ }
+ return 0;
+}
+
+static inline bool isVCTP(const MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case ARM::MVE_VCTP8:
+ case ARM::MVE_VCTP16:
+ case ARM::MVE_VCTP32:
+ case ARM::MVE_VCTP64:
+ return true;
+ }
+ return false;
+}
+
+static inline bool isLoopStart(MachineInstr &MI) {
+ return MI.getOpcode() == ARM::t2DoLoopStart ||
+ MI.getOpcode() == ARM::t2DoLoopStartTP ||
+ MI.getOpcode() == ARM::t2WhileLoopStart;
+}
+
+// WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
+// beq that branches to the exit block.
+inline void RevertWhileLoopStart(MachineInstr *MI, const TargetInstrInfo *TII,
+ unsigned BrOpc = ARM::t2Bcc) {
+ MachineBasicBlock *MBB = MI->getParent();
+
+ // Cmp
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2CMPri));
+ MIB.add(MI->getOperand(0));
+ MIB.addImm(0);
+ MIB.addImm(ARMCC::AL);
+ MIB.addReg(ARM::NoRegister);
+
+ // Branch
+ MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
+ MIB.add(MI->getOperand(1)); // branch target
+ MIB.addImm(ARMCC::EQ); // condition code
+ MIB.addReg(ARM::CPSR);
+
+ MI->eraseFromParent();
+}
+
+inline void RevertDoLoopStart(MachineInstr *MI, const TargetInstrInfo *TII) {
+ MachineBasicBlock *MBB = MI->getParent();
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::tMOVr))
+ .add(MI->getOperand(0))
+ .add(MI->getOperand(1))
+ .add(predOps(ARMCC::AL));
+
+ MI->eraseFromParent();
+}
+
+inline void RevertLoopDec(MachineInstr *MI, const TargetInstrInfo *TII,
+ bool SetFlags = false) {
+ MachineBasicBlock *MBB = MI->getParent();
+
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
+ MIB.add(MI->getOperand(0));
+ MIB.add(MI->getOperand(1));
+ MIB.add(MI->getOperand(2));
+ MIB.addImm(ARMCC::AL);
+ MIB.addReg(0);
+
+ if (SetFlags) {
+ MIB.addReg(ARM::CPSR);
+ MIB->getOperand(5).setIsDef(true);
+ } else
+ MIB.addReg(0);
+
+ MI->eraseFromParent();
+}
+
+// Generate a subs, or sub and cmp, and a branch instead of an LE.
+inline void RevertLoopEnd(MachineInstr *MI, const TargetInstrInfo *TII,
+ unsigned BrOpc = ARM::t2Bcc, bool SkipCmp = false) {
+ MachineBasicBlock *MBB = MI->getParent();
+
+ // Create cmp
+ if (!SkipCmp) {
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2CMPri));
+ MIB.add(MI->getOperand(0));
+ MIB.addImm(0);
+ MIB.addImm(ARMCC::AL);
+ MIB.addReg(ARM::NoRegister);
+ }
+
+ // Create bne
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
+ MIB.add(MI->getOperand(1)); // branch target
+ MIB.addImm(ARMCC::NE); // condition code
+ MIB.addReg(ARM::CPSR);
+ MI->eraseFromParent();
+}
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_ARM_MVETAILPREDUTILS_H
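A hedged sketch of how these revert helpers are expected to be used; the caller (e.g. the ARMLowOverheadLoops pass) and the variables Start, Dec, End and TII are assumptions for illustration, not taken from this header.

    // Fragment: Start, Dec and End are assumed to be the t2WhileLoopStart,
    // t2LoopDec and t2LoopEnd of a loop that could not be converted, and TII is
    // the function's TargetInstrInfo.
    RevertWhileLoopStart(Start, TII);                      // cmp lr, #0 ; beq <exit>
    RevertLoopDec(Dec, TII, /*SetFlags=*/true);            // subs lr, lr, #1
    RevertLoopEnd(End, TII, ARM::t2Bcc, /*SkipCmp=*/true); // bne <loop header>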
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp
index 5bf3522ab2e6..b705208660df 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -22,23 +22,13 @@
/// The HardwareLoops pass inserts intrinsics identifying loops that the
/// backend will attempt to convert into a low-overhead loop. The vectorizer is
/// responsible for generating a vectorized loop in which the lanes are
-/// predicated upon the iteration counter. This pass looks at these predicated
-/// vector loops, that are targets for low-overhead loops, and prepares it for
-/// code generation. Once the vectorizer has produced a masked loop, there's a
-/// couple of final forms:
-/// - A tail-predicated loop, with implicit predication.
-/// - A loop containing multiple VCPT instructions, predicating multiple VPT
-/// blocks of instructions operating on different vector types.
-///
-/// This pass:
-/// 1) Checks if the predicates of the masked load/store instructions are
-/// generated by intrinsic @llvm.get.active.lanes(). This intrinsic consumes
-/// the Backedge Taken Count (BTC) of the scalar loop as its second argument,
-/// which we extract to set up the number of elements processed by the loop.
-/// 2) Intrinsic @llvm.get.active.lanes() is then replaced by the MVE target
-/// specific VCTP intrinsic to represent the effect of tail predication.
-/// This will be picked up by the ARM Low-overhead loop pass, which performs
-/// the final transformation to a DLSTP or WLSTP tail-predicated loop.
+/// predicated upon a get.active.lane.mask intrinsic. This pass looks at these
+/// get.active.lane.mask intrinsics and attempts to convert them to VCTP
+/// instructions. This will be picked up by the ARM Low-overhead loop pass later
+/// in the backend, which performs the final transformation to a DLSTP or WLSTP
+/// tail-predicated loop.
+//
+//===----------------------------------------------------------------------===//
#include "ARM.h"
#include "ARMSubtarget.h"
@@ -57,6 +47,7 @@
#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
@@ -66,8 +57,8 @@ using namespace llvm;
#define DESC "Transform predicated vector loops to use MVE tail predication"
cl::opt<TailPredication::Mode> EnableTailPredication(
- "tail-predication", cl::desc("MVE tail-predication options"),
- cl::init(TailPredication::Disabled),
+ "tail-predication", cl::desc("MVE tail-predication pass options"),
+ cl::init(TailPredication::Enabled),
cl::values(clEnumValN(TailPredication::Disabled, "disabled",
"Don't tail-predicate loops"),
clEnumValN(TailPredication::EnabledNoReductions,
@@ -112,23 +103,18 @@ public:
bool runOnLoop(Loop *L, LPPassManager&) override;
private:
- /// Perform the relevant checks on the loop and convert if possible.
- bool TryConvert(Value *TripCount);
+ /// Perform the relevant checks on the loop and convert active lane masks if
+ /// possible.
+ bool TryConvertActiveLaneMask(Value *TripCount);
- /// Return whether this is a vectorized loop, that contains masked
- /// load/stores.
- bool IsPredicatedVectorLoop();
-
- /// Perform checks on the arguments of @llvm.get.active.lane.mask
- /// intrinsic: check if the first is a loop induction variable, and for the
- /// the second check that no overflow can occur in the expression that use
- /// this backedge-taken count.
- bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount,
- FixedVectorType *VecTy);
+ /// Perform several checks on the arguments of @llvm.get.active.lane.mask
+ /// intrinsic. E.g., check that the loop induction variable and the element
+ /// count are of the form we expect, and also perform overflow checks for
+ /// the new expressions that are created.
+ bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount);
/// Insert the intrinsic to represent the effect of tail predication.
- void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount,
- FixedVectorType *VecTy);
+ void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount);
/// Rematerialize the iteration count in exit blocks, which enables
/// ARMLowOverheadLoops to better optimise away loop update statements inside
@@ -138,25 +124,6 @@ private:
} // end namespace
-static bool IsDecrement(Instruction &I) {
- auto *Call = dyn_cast<IntrinsicInst>(&I);
- if (!Call)
- return false;
-
- Intrinsic::ID ID = Call->getIntrinsicID();
- return ID == Intrinsic::loop_decrement_reg;
-}
-
-static bool IsMasked(Instruction *I) {
- auto *Call = dyn_cast<IntrinsicInst>(I);
- if (!Call)
- return false;
-
- Intrinsic::ID ID = Call->getIntrinsicID();
- // TODO: Support gather/scatter expand/compress operations.
- return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load;
-}
-
bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
if (skipLoop(L) || !EnableTailPredication)
return false;
@@ -188,7 +155,7 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
continue;
Intrinsic::ID ID = Call->getIntrinsicID();
- if (ID == Intrinsic::set_loop_iterations ||
+ if (ID == Intrinsic::start_loop_iterations ||
ID == Intrinsic::test_set_loop_iterations)
return cast<IntrinsicInst>(&I);
}
@@ -207,148 +174,23 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
return false;
}
- // Search for the hardware loop intrinic that decrements the loop counter.
- IntrinsicInst *Decrement = nullptr;
- for (auto *BB : L->getBlocks()) {
- for (auto &I : *BB) {
- if (IsDecrement(I)) {
- Decrement = cast<IntrinsicInst>(&I);
- break;
- }
- }
- }
-
- if (!Decrement)
- return false;
-
- LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"
- << *Decrement << "\n");
-
- if (!TryConvert(Setup->getArgOperand(0))) {
- LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n");
- return false;
- }
-
- return true;
-}
-
-static FixedVectorType *getVectorType(IntrinsicInst *I) {
- unsigned TypeOp = I->getIntrinsicID() == Intrinsic::masked_load ? 0 : 1;
- auto *PtrTy = cast<PointerType>(I->getOperand(TypeOp)->getType());
- auto *VecTy = cast<FixedVectorType>(PtrTy->getElementType());
- assert(VecTy && "No scalable vectors expected here");
- return VecTy;
-}
+ LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n");
-bool MVETailPredication::IsPredicatedVectorLoop() {
- // Check that the loop contains at least one masked load/store intrinsic.
- // We only support 'normal' vector instructions - other than masked
- // load/stores.
- bool ActiveLaneMask = false;
- for (auto *BB : L->getBlocks()) {
- for (auto &I : *BB) {
- auto *Int = dyn_cast<IntrinsicInst>(&I);
- if (!Int)
- continue;
+ bool Changed = TryConvertActiveLaneMask(Setup->getArgOperand(0));
- switch (Int->getIntrinsicID()) {
- case Intrinsic::get_active_lane_mask:
- ActiveLaneMask = true;
- LLVM_FALLTHROUGH;
- case Intrinsic::sadd_sat:
- case Intrinsic::uadd_sat:
- case Intrinsic::ssub_sat:
- case Intrinsic::usub_sat:
- continue;
- case Intrinsic::fma:
- case Intrinsic::trunc:
- case Intrinsic::rint:
- case Intrinsic::round:
- case Intrinsic::floor:
- case Intrinsic::ceil:
- case Intrinsic::fabs:
- if (ST->hasMVEFloatOps())
- continue;
- LLVM_FALLTHROUGH;
- default:
- break;
- }
-
- if (IsMasked(&I)) {
- auto *VecTy = getVectorType(Int);
- unsigned Lanes = VecTy->getNumElements();
- unsigned ElementWidth = VecTy->getScalarSizeInBits();
- // MVE vectors are 128-bit, but don't support 128 x i1.
- // TODO: Can we support vectors larger than 128-bits?
- unsigned MaxWidth = TTI->getRegisterBitWidth(true);
- if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth)
- return false;
- MaskedInsts.push_back(cast<IntrinsicInst>(&I));
- continue;
- }
-
- for (const Use &U : Int->args()) {
- if (isa<VectorType>(U->getType()))
- return false;
- }
- }
- }
-
- if (!ActiveLaneMask) {
- LLVM_DEBUG(dbgs() << "ARM TP: No get.active.lane.mask intrinsic found.\n");
- return false;
- }
- return !MaskedInsts.empty();
-}
-
-// Look through the exit block to see whether there's a duplicate predicate
-// instruction. This can happen when we need to perform a select on values
-// from the last and previous iteration. Instead of doing a straight
-// replacement of that predicate with the vctp, clone the vctp and place it
-// in the block. This means that the VPR doesn't have to be live into the
-// exit block which should make it easier to convert this loop into a proper
-// tail predicated loop.
-static void Cleanup(SetVector<Instruction*> &MaybeDead, Loop *L) {
- BasicBlock *Exit = L->getUniqueExitBlock();
- if (!Exit) {
- LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n");
- return;
- }
-
- // Drop references and add operands to check for dead.
- SmallPtrSet<Instruction*, 4> Dead;
- while (!MaybeDead.empty()) {
- auto *I = MaybeDead.front();
- MaybeDead.remove(I);
- if (I->hasNUsesOrMore(1))
- continue;
-
- for (auto &U : I->operands())
- if (auto *OpI = dyn_cast<Instruction>(U))
- MaybeDead.insert(OpI);
-
- Dead.insert(I);
- }
-
- for (auto *I : Dead) {
- LLVM_DEBUG(dbgs() << "ARM TP: removing dead insn: "; I->dump());
- I->eraseFromParent();
- }
-
- for (auto I : L->blocks())
- DeleteDeadPHIs(I);
+ return Changed;
}
// The active lane intrinsic has this form:
//
-// @llvm.get.active.lane.mask(IV, BTC)
+// @llvm.get.active.lane.mask(IV, TC)
//
// Here we perform checks that this intrinsic behaves as expected,
// which means:
//
-// 1) The element count, which is calculated with BTC + 1, cannot overflow.
-// 2) The element count needs to be sufficiently large that the decrement of
-// element counter doesn't overflow, which means that we need to prove:
+// 1) Check that the TripCount (TC) belongs to this loop (originally).
+// 2) The element count (EC) needs to be sufficiently large that the decrement
+// of the element counter doesn't overflow, which means that we need to prove:
// ceil(ElementCount / VectorWidth) >= TripCount
// by rounding up ElementCount up:
// ((ElementCount + (VectorWidth - 1)) / VectorWidth
@@ -357,109 +199,118 @@ static void Cleanup(SetVector<Instruction*> &MaybeDead, Loop *L) {
// 3) The IV must be an induction phi with an increment equal to the
// vector width.
bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
- Value *TripCount, FixedVectorType *VecTy) {
+ Value *TripCount) {
bool ForceTailPredication =
EnableTailPredication == TailPredication::ForceEnabledNoReductions ||
EnableTailPredication == TailPredication::ForceEnabled;
- // 1) Test whether entry to the loop is protected by a conditional
- // BTC + 1 < 0. In other words, if the scalar trip count overflows,
- // becomes negative, we shouldn't enter the loop and creating
- // tripcount expression BTC + 1 is not safe. So, check that BTC
- // isn't max. This is evaluated in unsigned, because the semantics
- // of @get.active.lane.mask is a ULE comparison.
-
- int VectorWidth = VecTy->getNumElements();
- auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1);
- auto *BTC = SE->getSCEV(BackedgeTakenCount);
-
- if (!llvm::cannotBeMaxInLoop(BTC, L, *SE, false /*Signed*/) &&
- !ForceTailPredication) {
- LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible, BTC can be max: ";
- BTC->dump());
- return false;
- }
- // 2) Prove that the sub expression is non-negative, i.e. it doesn't overflow:
- //
- // (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount
- //
- // 2.1) First prove overflow can't happen in:
- //
- // ElementCount + (VectorWidth - 1)
- //
- // Because of a lack of context, it is difficult to get a useful bounds on
- // this expression. But since ElementCount uses the same variables as the
- // TripCount (TC), for which we can find meaningful value ranges, we use that
- // instead and assert that:
- //
- // upperbound(TC) <= UINT_MAX - VectorWidth
- //
+ Value *ElemCount = ActiveLaneMask->getOperand(1);
+ auto *EC = SE->getSCEV(ElemCount);
auto *TC = SE->getSCEV(TripCount);
- unsigned SizeInBits = TripCount->getType()->getScalarSizeInBits();
- auto Diff = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth);
- uint64_t MaxMinusVW = Diff.getZExtValue();
- uint64_t UpperboundTC = SE->getSignedRange(TC).getUpper().getZExtValue();
-
- if (UpperboundTC > MaxMinusVW && !ForceTailPredication) {
- LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in tripcount rounding:\n";
- dbgs() << "upperbound(TC) <= UINT_MAX - VectorWidth\n";
- dbgs() << UpperboundTC << " <= " << MaxMinusVW << "== false\n";);
+ int VectorWidth =
+ cast<FixedVectorType>(ActiveLaneMask->getType())->getNumElements();
+ if (VectorWidth != 4 && VectorWidth != 8 && VectorWidth != 16)
return false;
- }
+ ConstantInt *ConstElemCount = nullptr;
- // 2.2) Make sure overflow doesn't happen in final expression:
- // (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount,
- // To do this, compare the full ranges of these subexpressions:
- //
- // Range(Ceil) <= Range(TC)
- //
- // where Ceil = ElementCount + (VW-1) / VW. If Ceil and TC are runtime
- // values (and not constants), we have to compensate for the lowerbound value
- // range to be off by 1. The reason is that BTC lives in the preheader in
- // this form:
- //
- // %trip.count.minus = add nsw nuw i32 %N, -1
- //
- // For the loop to be executed, %N has to be >= 1 and as a result the value
- // range of %trip.count.minus has a lower bound of 0. Value %TC has this form:
- //
- // %5 = add nuw nsw i32 %4, 1
- // call void @llvm.set.loop.iterations.i32(i32 %5)
- //
- // where %5 is some expression using %N, which needs to have a lower bound of
- // 1. Thus, if the ranges of Ceil and TC are not a single constant but a set,
- // we first add 0 to TC such that we can do the <= comparison on both sets.
- //
- auto *One = SE->getOne(TripCount->getType());
- // ElementCount = BTC + 1
- auto *ElementCount = SE->getAddExpr(BTC, One);
- // Tmp = ElementCount + (VW-1)
- auto *ECPlusVWMinus1 = SE->getAddExpr(ElementCount,
- SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1)));
- // Ceil = ElementCount + (VW-1) / VW
- auto *Ceil = SE->getUDivExpr(ECPlusVWMinus1,
- SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth)));
-
- ConstantRange RangeCeil = SE->getSignedRange(Ceil) ;
- ConstantRange RangeTC = SE->getSignedRange(TC) ;
- if (!RangeTC.isSingleElement()) {
- auto ZeroRange =
- ConstantRange(APInt(TripCount->getType()->getScalarSizeInBits(), 0));
- RangeTC = RangeTC.unionWith(ZeroRange);
- }
- if (!RangeTC.contains(RangeCeil) && !ForceTailPredication) {
- LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in sub\n");
+ // 1) Smoke-test that the original scalar loop TripCount (TC) belongs to
+ // this loop. The scalar tripcount corresponds to the number of elements
+ // processed by the loop, so we will refer to that from this point on.
+ if (!SE->isLoopInvariant(EC, L)) {
+ LLVM_DEBUG(dbgs() << "ARM TP: element count must be loop invariant.\n");
return false;
}
- // 3) Find out if IV is an induction phi. Note that We can't use Loop
+ if ((ConstElemCount = dyn_cast<ConstantInt>(ElemCount))) {
+ ConstantInt *TC = dyn_cast<ConstantInt>(TripCount);
+ if (!TC) {
+ LLVM_DEBUG(dbgs() << "ARM TP: Constant tripcount expected in "
+ "set.loop.iterations\n");
+ return false;
+ }
+
+ // Calculate 2 tripcount values and check that they are consistent with
+ // each other. The TripCount for a predicated vector loop body is
+ // ceil(ElementCount/Width), or floor((ElementCount+Width-1)/Width) as we
+ // work it out here.
+ uint64_t TC1 = TC->getZExtValue();
+ uint64_t TC2 =
+ (ConstElemCount->getZExtValue() + VectorWidth - 1) / VectorWidth;
+
+ // If the tripcount values are inconsistent, we can't insert the VCTP and
+ // trigger tail-predication; keep the intrinsic as a get.active.lane.mask
+ // and legalize this.
+ if (TC1 != TC2) {
+ LLVM_DEBUG(dbgs() << "ARM TP: inconsistent constant tripcount values: "
+ << TC1 << " from set.loop.iterations, and "
+ << TC2 << " from get.active.lane.mask\n");
+ return false;
+ }
+ } else if (!ForceTailPredication) {
+ // 2) We need to prove that the sub expression that we create in the
+ // tail-predicated loop body, which calculates the remaining elements to be
+ // processed, is non-negative, i.e. it doesn't overflow:
+ //
+ // ((ElementCount + VectorWidth - 1) / VectorWidth) - TripCount >= 0
+ //
+ // This is true if:
+ //
+ // TripCount == (ElementCount + VectorWidth - 1) / VectorWidth
+ //
+ // which is what we will be using here.
+ //
+ auto *VW = SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth));
+ // ElementCount + (VW-1):
+ auto *ECPlusVWMinus1 = SE->getAddExpr(EC,
+ SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1)));
+
+ // Ceil = (ElementCount + (VW-1)) / VW
+ auto *Ceil = SE->getUDivExpr(ECPlusVWMinus1, VW);
+
+ // Prevent unused variable warnings with TC
+ (void)TC;
+ LLVM_DEBUG(
+ dbgs() << "ARM TP: Analysing overflow behaviour for:\n";
+ dbgs() << "ARM TP: - TripCount = "; TC->dump();
+ dbgs() << "ARM TP: - ElemCount = "; EC->dump();
+ dbgs() << "ARM TP: - VecWidth = " << VectorWidth << "\n";
+ dbgs() << "ARM TP: - (ElemCount+VW-1) / VW = "; Ceil->dump();
+ );
+
+ // As an example, almost all the tripcount expressions (produced by the
+ // vectoriser) look like this:
+ //
+ // TC = ((-4 + (4 * ((3 + %N) /u 4))<nuw>) /u 4)
+ //
+ // and "(ElementCount + (VW-1)) / VW":
+ //
+ // Ceil = ((3 + %N) /u 4)
+ //
+ // Check for equality of TC and Ceil by calculating SCEV expression
+ // TC - Ceil and test it for zero.
+ //
+ bool Zero = SE->getMinusSCEV(
+ SE->getBackedgeTakenCount(L),
+ SE->getUDivExpr(SE->getAddExpr(SE->getMulExpr(Ceil, VW),
+ SE->getNegativeSCEV(VW)),
+ VW))
+ ->isZero();
+
+ if (!Zero) {
+ LLVM_DEBUG(dbgs() << "ARM TP: possible overflow in sub expression.\n");
+ return false;
+ }
+ }
+
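A standalone arithmetic sketch of the relation established above (values chosen only for illustration): the hardware-loop trip count must equal ceil(ElementCount / VectorWidth); when that holds, the final iteration processes exactly the leftover tail elements and the VCTP predicate is safe to insert.

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t ElementCount = 1002, VectorWidth = 4;
      uint64_t TripCount = (ElementCount + VectorWidth - 1) / VectorWidth; // ceil
      uint64_t FullIterations = ElementCount / VectorWidth;
      uint64_t TailElements = ElementCount - FullIterations * VectorWidth;
      printf("trip count = %llu, tail elements = %llu\n",
             (unsigned long long)TripCount, (unsigned long long)TailElements);
      // Prints: trip count = 251, tail elements = 2
      return 0;
    }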
+ // 3) Find out if IV is an induction phi. Note that we can't use Loop
// helpers here to get the induction variable, because the hardware loop is
- // no longer in loopsimplify form, and also the hwloop intrinsic use a
- // different counter. Using SCEV, we check that the induction is of the
+ // no longer in loopsimplify form, and also the hwloop intrinsic uses a
+ // different counter. Using SCEV, we check that the induction is of the
// form i = i + 4, where the increment must be equal to the VectorWidth.
auto *IV = ActiveLaneMask->getOperand(0);
auto *IVExpr = SE->getSCEV(IV);
auto *AddExpr = dyn_cast<SCEVAddRecExpr>(IVExpr);
+
if (!AddExpr) {
LLVM_DEBUG(dbgs() << "ARM TP: induction not an add expr: "; IVExpr->dump());
return false;
@@ -469,6 +320,11 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
LLVM_DEBUG(dbgs() << "ARM TP: phi not part of this loop\n");
return false;
}
+ auto *Base = dyn_cast<SCEVConstant>(AddExpr->getOperand(0));
+ if (!Base || !Base->isZero()) {
+ LLVM_DEBUG(dbgs() << "ARM TP: induction base is not 0\n");
+ return false;
+ }
auto *Step = dyn_cast<SCEVConstant>(AddExpr->getOperand(1));
if (!Step) {
LLVM_DEBUG(dbgs() << "ARM TP: induction step is not a constant: ";
@@ -479,68 +335,29 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
if (VectorWidth == StepValue)
return true;
- LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue << " doesn't match "
- "vector width " << VectorWidth << "\n");
+ LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue
+ << " doesn't match vector width " << VectorWidth << "\n");
return false;
}
-// Materialize NumElements in the preheader block.
-static Value *getNumElements(BasicBlock *Preheader, Value *BTC) {
- // First, check the preheader if it not already exist:
- //
- // preheader:
- // %BTC = add i32 %N, -1
- // ..
- // vector.body:
- //
- // if %BTC already exists. We don't need to emit %NumElems = %BTC + 1,
- // but instead can just return %N.
- for (auto &I : *Preheader) {
- if (I.getOpcode() != Instruction::Add || &I != BTC)
- continue;
- ConstantInt *MinusOne = nullptr;
- if (!(MinusOne = dyn_cast<ConstantInt>(I.getOperand(1))))
- continue;
- if (MinusOne->getSExtValue() == -1) {
- LLVM_DEBUG(dbgs() << "ARM TP: Found num elems: " << I << "\n");
- return I.getOperand(0);
- }
- }
-
- // But we do need to materialise BTC if it is not already there,
- // e.g. if it is a constant.
- IRBuilder<> Builder(Preheader->getTerminator());
- Value *NumElements = Builder.CreateAdd(BTC,
- ConstantInt::get(BTC->getType(), 1), "num.elements");
- LLVM_DEBUG(dbgs() << "ARM TP: Created num elems: " << *NumElements << "\n");
- return NumElements;
-}
-
void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
- Value *TripCount, FixedVectorType *VecTy) {
+ Value *TripCount) {
IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
Module *M = L->getHeader()->getModule();
Type *Ty = IntegerType::get(M->getContext(), 32);
- unsigned VectorWidth = VecTy->getNumElements();
-
- // The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand,
- // is one less than the trip count. So we need to find or create
- // %num.elements = %BTC + 1 in the preheader.
- Value *BTC = ActiveLaneMask->getOperand(1);
- Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator());
- Value *NumElements = getNumElements(L->getLoopPreheader(), BTC);
+ unsigned VectorWidth =
+ cast<FixedVectorType>(ActiveLaneMask->getType())->getNumElements();
// Insert a phi to count the number of elements processed by the loop.
- Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI() );
+ Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI());
PHINode *Processed = Builder.CreatePHI(Ty, 2);
- Processed->addIncoming(NumElements, L->getLoopPreheader());
+ Processed->addIncoming(ActiveLaneMask->getOperand(1), L->getLoopPreheader());
- // Replace @llvm.get.active.mask() with the ARM specific VCTP intrinic, and thus
- // represent the effect of tail predication.
+ // Replace @llvm.get.active.lane.mask() with the ARM-specific VCTP intrinsic,
+ // and thus represent the effect of tail predication.
Builder.SetInsertPoint(ActiveLaneMask);
- ConstantInt *Factor =
- ConstantInt::get(cast<IntegerType>(Ty), VectorWidth);
+ ConstantInt *Factor = ConstantInt::get(cast<IntegerType>(Ty), VectorWidth);
Intrinsic::ID VCTPID;
switch (VectorWidth) {
@@ -569,42 +386,36 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
<< "ARM TP: Inserted VCTP: " << *VCTPCall << "\n");
}
-bool MVETailPredication::TryConvert(Value *TripCount) {
- if (!IsPredicatedVectorLoop()) {
- LLVM_DEBUG(dbgs() << "ARM TP: no masked instructions in loop.\n");
+bool MVETailPredication::TryConvertActiveLaneMask(Value *TripCount) {
+ SmallVector<IntrinsicInst *, 4> ActiveLaneMasks;
+ for (auto *BB : L->getBlocks())
+ for (auto &I : *BB)
+ if (auto *Int = dyn_cast<IntrinsicInst>(&I))
+ if (Int->getIntrinsicID() == Intrinsic::get_active_lane_mask)
+ ActiveLaneMasks.push_back(Int);
+
+ if (ActiveLaneMasks.empty())
return false;
- }
LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n");
- SetVector<Instruction*> Predicates;
-
- // Walk through the masked intrinsics and try to find whether the predicate
- // operand is generated by intrinsic @llvm.get.active.lane.mask().
- for (auto *I : MaskedInsts) {
- unsigned PredOp = I->getIntrinsicID() == Intrinsic::masked_load ? 2 : 3;
- auto *Predicate = dyn_cast<Instruction>(I->getArgOperand(PredOp));
- if (!Predicate || Predicates.count(Predicate))
- continue;
-
- auto *ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate);
- if (!ActiveLaneMask ||
- ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask)
- continue;
-
- Predicates.insert(Predicate);
+
+ for (auto *ActiveLaneMask : ActiveLaneMasks) {
LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: "
<< *ActiveLaneMask << "\n");
- auto *VecTy = getVectorType(I);
- if (!IsSafeActiveMask(ActiveLaneMask, TripCount, VecTy)) {
+ if (!IsSafeActiveMask(ActiveLaneMask, TripCount)) {
LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n");
return false;
}
LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP.\n");
- InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy);
+ InsertVCTPIntrinsic(ActiveLaneMask, TripCount);
}
- Cleanup(Predicates, L);
+ // Remove dead instructions and now dead phis.
+ for (auto *II : ActiveLaneMasks)
+ RecursivelyDeleteTriviallyDeadInstructions(II);
+ for (auto I : L->blocks())
+ DeleteDeadPHIs(I);
return true;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp
index dc769ae526bc..9a710b784fd1 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp
@@ -270,26 +270,33 @@ bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
MIBuilder.add(VCMP->getOperand(1));
MIBuilder.add(VCMP->getOperand(2));
MIBuilder.add(VCMP->getOperand(3));
+
+ // We need to remove any kill flags between the original VCMP and the new
+ // insertion point.
+ for (MachineInstr &MII :
+ make_range(VCMP->getIterator(), MI->getIterator())) {
+ MII.clearRegisterKills(VCMP->getOperand(1).getReg(), TRI);
+ MII.clearRegisterKills(VCMP->getOperand(2).getReg(), TRI);
+ }
+
VCMP->eraseFromParent();
} else {
MIBuilder = BuildMI(Block, MI, DL, TII->get(ARM::MVE_VPST));
MIBuilder.addImm((uint64_t)BlockMask);
}
+ // Erase all dead instructions (VPNOTs). Do that now so that they do not
+ // mess with the bundle creation.
+ for (MachineInstr *DeadMI : DeadInstructions)
+ DeadMI->eraseFromParent();
+ DeadInstructions.clear();
+
finalizeBundle(
Block, MachineBasicBlock::instr_iterator(MIBuilder.getInstr()), MBIter);
Modified = true;
}
- // Erase all dead instructions
- for (MachineInstr *DeadMI : DeadInstructions) {
- if (DeadMI->isInsideBundle())
- DeadMI->eraseFromBundle();
- else
- DeadMI->eraseFromParent();
- }
-
return Modified;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
index 382ddd4572c7..00e4449769f4 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
@@ -6,28 +6,28 @@
//
//===----------------------------------------------------------------------===//
//
-/// \file This pass does a few optimisations related to MVE VPT blocks before
-/// register allocation is performed. The goal is to maximize the sizes of the
-/// blocks that will be created by the MVE VPT Block Insertion pass (which runs
-/// after register allocation). The first optimisation done by this pass is the
-/// replacement of "opposite" VCMPs with VPNOTs, so the Block Insertion pass
-/// can delete them later to create larger VPT blocks.
-/// The second optimisation replaces re-uses of old VCCR values with VPNOTs when
-/// inside a block of predicated instructions. This is done to avoid
-/// spill/reloads of VPR in the middle of a block, which prevents the Block
-/// Insertion pass from creating large blocks.
-//
+/// \file This pass does a few optimisations related to tail predicated loops
+/// and MVE VPT blocks before register allocation is performed. For VPT blocks
+/// the goal is to maximize the sizes of the blocks that will be created by the
+/// MVE VPT Block Insertion pass (which runs after register allocation). For
+/// tail predicated loops we transform the loop into something that will
+/// hopefully make the backend ARMLowOverheadLoops pass's job easier.
+///
//===----------------------------------------------------------------------===//
#include "ARM.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MVETailPredUtils.h"
#include "Thumb2InstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
#include <cassert>
@@ -35,6 +35,11 @@ using namespace llvm;
#define DEBUG_TYPE "arm-mve-vpt-opts"
+static cl::opt<bool>
+MergeEndDec("arm-enable-merge-loopenddec", cl::Hidden,
+ cl::desc("Enable merging Loop End and Dec instructions."),
+ cl::init(true));
+
namespace {
class MVEVPTOptimisations : public MachineFunctionPass {
public:
@@ -46,25 +51,314 @@ public:
bool runOnMachineFunction(MachineFunction &Fn) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
StringRef getPassName() const override {
- return "ARM MVE VPT Optimisation Pass";
+ return "ARM MVE TailPred and VPT Optimisation Pass";
}
private:
+ bool MergeLoopEnd(MachineLoop *ML);
+ bool ConvertTailPredLoop(MachineLoop *ML, MachineDominatorTree *DT);
MachineInstr &ReplaceRegisterUseWithVPNOT(MachineBasicBlock &MBB,
MachineInstr &Instr,
MachineOperand &User,
Register Target);
bool ReduceOldVCCRValueUses(MachineBasicBlock &MBB);
bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
+ bool ReplaceConstByVPNOTs(MachineBasicBlock &MBB, MachineDominatorTree *DT);
+ bool ConvertVPSEL(MachineBasicBlock &MBB);
};
char MVEVPTOptimisations::ID = 0;
} // end anonymous namespace
-INITIALIZE_PASS(MVEVPTOptimisations, DEBUG_TYPE,
- "ARM MVE VPT Optimisations pass", false, false)
+INITIALIZE_PASS_BEGIN(MVEVPTOptimisations, DEBUG_TYPE,
+ "ARM MVE TailPred and VPT Optimisations pass", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(MVEVPTOptimisations, DEBUG_TYPE,
+ "ARM MVE TailPred and VPT Optimisations pass", false, false)
+
+static MachineInstr *LookThroughCOPY(MachineInstr *MI,
+ MachineRegisterInfo *MRI) {
+ while (MI && MI->getOpcode() == TargetOpcode::COPY &&
+ MI->getOperand(1).getReg().isVirtual())
+ MI = MRI->getVRegDef(MI->getOperand(1).getReg());
+ return MI;
+}
+
+// Given a loop ML, this attempts to find the t2LoopEnd, t2LoopDec and
+// corresponding PHI that make up a low overhead loop. Only handles 'do' loops
+// at the moment, returning a t2DoLoopStart in LoopStart.
+static bool findLoopComponents(MachineLoop *ML, MachineRegisterInfo *MRI,
+ MachineInstr *&LoopStart, MachineInstr *&LoopPhi,
+ MachineInstr *&LoopDec, MachineInstr *&LoopEnd) {
+ MachineBasicBlock *Header = ML->getHeader();
+ MachineBasicBlock *Latch = ML->getLoopLatch();
+ if (!Header || !Latch) {
+ LLVM_DEBUG(dbgs() << " no Loop Latch or Header\n");
+ return false;
+ }
+
+ // Find the loop end from the terminators.
+ LoopEnd = nullptr;
+ for (auto &T : Latch->terminators()) {
+ if (T.getOpcode() == ARM::t2LoopEnd && T.getOperand(1).getMBB() == Header) {
+ LoopEnd = &T;
+ break;
+ }
+ if (T.getOpcode() == ARM::t2LoopEndDec &&
+ T.getOperand(2).getMBB() == Header) {
+ LoopEnd = &T;
+ break;
+ }
+ }
+ if (!LoopEnd) {
+ LLVM_DEBUG(dbgs() << " no LoopEnd\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << " found loop end: " << *LoopEnd);
+
+ // Find the dec from the use of the end. There may be copies between
+ // instructions. We expect the loop to look like:
+ // $vs = t2DoLoopStart ...
+ // loop:
+ // $vp = phi [ $vs ], [ $vd ]
+ // ...
+ // $vd = t2LoopDec $vp
+ // ...
+ // t2LoopEnd $vd, loop
+ if (LoopEnd->getOpcode() == ARM::t2LoopEndDec)
+ LoopDec = LoopEnd;
+ else {
+ LoopDec =
+ LookThroughCOPY(MRI->getVRegDef(LoopEnd->getOperand(0).getReg()), MRI);
+ if (!LoopDec || LoopDec->getOpcode() != ARM::t2LoopDec) {
+ LLVM_DEBUG(dbgs() << " didn't find LoopDec where we expected!\n");
+ return false;
+ }
+ }
+ LLVM_DEBUG(dbgs() << " found loop dec: " << *LoopDec);
+
+ LoopPhi =
+ LookThroughCOPY(MRI->getVRegDef(LoopDec->getOperand(1).getReg()), MRI);
+ if (!LoopPhi || LoopPhi->getOpcode() != TargetOpcode::PHI ||
+ LoopPhi->getNumOperands() != 5 ||
+ (LoopPhi->getOperand(2).getMBB() != Latch &&
+ LoopPhi->getOperand(4).getMBB() != Latch)) {
+ LLVM_DEBUG(dbgs() << " didn't find PHI where we expected!\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << " found loop phi: " << *LoopPhi);
+
+ Register StartReg = LoopPhi->getOperand(2).getMBB() == Latch
+ ? LoopPhi->getOperand(3).getReg()
+ : LoopPhi->getOperand(1).getReg();
+ LoopStart = LookThroughCOPY(MRI->getVRegDef(StartReg), MRI);
+ if (!LoopStart || LoopStart->getOpcode() != ARM::t2DoLoopStart) {
+ LLVM_DEBUG(dbgs() << " didn't find Start where we expected!\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << " found loop start: " << *LoopStart);
+
+ return true;
+}
+
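A hypothetical in-file wrapper showing how the two transforms below consume this helper; on success LoopStart is the t2DoLoopStart, LoopPhi the counter PHI, LoopDec the t2LoopDec (or the t2LoopEndDec itself) and LoopEnd the latch terminator:

// Sketch (illustrative name): bail out unless the loop has the canonical
// start / phi / dec / end shape described above.
static bool hasLowOverheadLoopShape(MachineLoop *ML, MachineRegisterInfo *MRI) {
  MachineInstr *LoopStart, *LoopPhi, *LoopDec, *LoopEnd;
  return findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd);
}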
+// This function converts loops with t2LoopEnd and t2LoopDec instructions into
+// a single t2LoopEndDec instruction. To do that it needs to make sure that LR
+// will be valid to be used for the low overhead loop, which means nothing else
+// is using LR (especially calls) and there are no superfluous copies in the
+// loop. The t2LoopEndDec is a branching terminator that produces a value (the
+// decrement) around the loop edge, which means we need to be careful that
+// the values involved can be allocated without any spilling.
+bool MVEVPTOptimisations::MergeLoopEnd(MachineLoop *ML) {
+ if (!MergeEndDec)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "MergeLoopEnd on loop " << ML->getHeader()->getName()
+ << "\n");
+
+ MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
+ if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
+ return false;
+
+ // Check if there is an illegal instruction (a call) in the low overhead loop
+ // and if so revert it now before we go any further.
+ for (MachineBasicBlock *MBB : ML->blocks()) {
+ for (MachineInstr &MI : *MBB) {
+ if (MI.isCall()) {
+ LLVM_DEBUG(dbgs() << "Found call in loop, reverting: " << MI);
+ RevertDoLoopStart(LoopStart, TII);
+ RevertLoopDec(LoopDec, TII);
+ RevertLoopEnd(LoopEnd, TII);
+ return true;
+ }
+ }
+ }
+
+ // Remove any copies from the loop, to ensure that the phi that remains is
+ // simpler and contains no extra uses. Because t2LoopEndDec is a terminator
+ // that cannot spill, we need to be careful what remains in the loop.
+ Register PhiReg = LoopPhi->getOperand(0).getReg();
+ Register DecReg = LoopDec->getOperand(0).getReg();
+ Register StartReg = LoopStart->getOperand(0).getReg();
+ // Ensure the uses are expected, and collect any copies we want to remove.
+ SmallVector<MachineInstr *, 4> Copies;
+ auto CheckUsers = [&Copies](Register BaseReg,
+ ArrayRef<MachineInstr *> ExpectedUsers,
+ MachineRegisterInfo *MRI) {
+ SmallVector<Register, 4> Worklist;
+ Worklist.push_back(BaseReg);
+ while (!Worklist.empty()) {
+ Register Reg = Worklist.pop_back_val();
+ for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) {
+ if (count(ExpectedUsers, &MI))
+ continue;
+ if (MI.getOpcode() != TargetOpcode::COPY ||
+ !MI.getOperand(0).getReg().isVirtual()) {
+ LLVM_DEBUG(dbgs() << "Extra users of register found: " << MI);
+ return false;
+ }
+ Worklist.push_back(MI.getOperand(0).getReg());
+ Copies.push_back(&MI);
+ }
+ }
+ return true;
+ };
+ if (!CheckUsers(PhiReg, {LoopDec}, MRI) ||
+ !CheckUsers(DecReg, {LoopPhi, LoopEnd}, MRI) ||
+ !CheckUsers(StartReg, {LoopPhi}, MRI))
+ return false;
+
+ MRI->constrainRegClass(StartReg, &ARM::GPRlrRegClass);
+ MRI->constrainRegClass(PhiReg, &ARM::GPRlrRegClass);
+ MRI->constrainRegClass(DecReg, &ARM::GPRlrRegClass);
+
+ if (LoopPhi->getOperand(2).getMBB() == ML->getLoopLatch()) {
+ LoopPhi->getOperand(3).setReg(StartReg);
+ LoopPhi->getOperand(1).setReg(DecReg);
+ } else {
+ LoopPhi->getOperand(1).setReg(StartReg);
+ LoopPhi->getOperand(3).setReg(DecReg);
+ }
+
+ // Replace the loop dec and loop end as a single instruction.
+ MachineInstrBuilder MI =
+ BuildMI(*LoopEnd->getParent(), *LoopEnd, LoopEnd->getDebugLoc(),
+ TII->get(ARM::t2LoopEndDec), DecReg)
+ .addReg(PhiReg)
+ .add(LoopEnd->getOperand(1));
+ (void)MI;
+ LLVM_DEBUG(dbgs() << "Merged LoopDec and End into: " << *MI.getInstr());
+
+ LoopDec->eraseFromParent();
+ LoopEnd->eraseFromParent();
+ for (auto *MI : Copies)
+ MI->eraseFromParent();
+ return true;
+}
+
+// Convert t2DoLoopStart to t2DoLoopStartTP if the loop contains VCTP
+// instructions. This keeps the VCTP count reg operand on the t2DoLoopStartTP
+// instruction, making the backend ARMLowOverheadLoops pass's job of finding the
+// VCTP operand much simpler.
+bool MVEVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
+ MachineDominatorTree *DT) {
+ LLVM_DEBUG(dbgs() << "ConvertTailPredLoop on loop "
+ << ML->getHeader()->getName() << "\n");
+
+ // Find some loop components including the LoopEnd/Dec/Start, and any VCTP's
+ // in the loop.
+ MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
+ if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
+ return false;
+ if (LoopDec != LoopEnd)
+ return false;
+
+ SmallVector<MachineInstr *, 4> VCTPs;
+ for (MachineBasicBlock *BB : ML->blocks())
+ for (MachineInstr &MI : *BB)
+ if (isVCTP(&MI))
+ VCTPs.push_back(&MI);
+
+ if (VCTPs.empty()) {
+ LLVM_DEBUG(dbgs() << " no VCTPs\n");
+ return false;
+ }
+
+ // Check all VCTPs are the same.
+ MachineInstr *FirstVCTP = *VCTPs.begin();
+ for (MachineInstr *VCTP : VCTPs) {
+ LLVM_DEBUG(dbgs() << " with VCTP " << *VCTP);
+ if (VCTP->getOpcode() != FirstVCTP->getOpcode() ||
+ VCTP->getOperand(0).getReg() != FirstVCTP->getOperand(0).getReg()) {
+ LLVM_DEBUG(dbgs() << " VCTP's are not identical\n");
+ return false;
+ }
+ }
+
+ // Check that the register being used can be set up before the loop. We expect
+ // this to be:
+ // $vx = ...
+ // loop:
+ // $vp = PHI [ $vx ], [ $vd ]
+ // ..
+ // $vpr = VCTP $vp
+ // ..
+ // $vd = t2SUBri $vp, #n
+ // ..
+ Register CountReg = FirstVCTP->getOperand(1).getReg();
+ if (!CountReg.isVirtual()) {
+ LLVM_DEBUG(dbgs() << " cannot determine VCTP PHI\n");
+ return false;
+ }
+ MachineInstr *Phi = LookThroughCOPY(MRI->getVRegDef(CountReg), MRI);
+ if (!Phi || Phi->getOpcode() != TargetOpcode::PHI ||
+ Phi->getNumOperands() != 5 ||
+ (Phi->getOperand(2).getMBB() != ML->getLoopLatch() &&
+ Phi->getOperand(4).getMBB() != ML->getLoopLatch())) {
+ LLVM_DEBUG(dbgs() << " cannot determine VCTP Count\n");
+ return false;
+ }
+ CountReg = Phi->getOperand(2).getMBB() == ML->getLoopLatch()
+ ? Phi->getOperand(3).getReg()
+ : Phi->getOperand(1).getReg();
+
+ // Replace the t2DoLoopStart with the t2DoLoopStartTP, move it to the end of
+ // the preheader and add the new CountReg to it. We attempt to place it late
+ // in the preheader, but may need to move that earlier based on uses.
+ MachineBasicBlock *MBB = LoopStart->getParent();
+ MachineBasicBlock::iterator InsertPt = MBB->getFirstTerminator();
+ for (MachineInstr &Use :
+ MRI->use_instructions(LoopStart->getOperand(0).getReg()))
+ if ((InsertPt != MBB->end() && !DT->dominates(&*InsertPt, &Use)) ||
+ !DT->dominates(ML->getHeader(), Use.getParent())) {
+ LLVM_DEBUG(dbgs() << " InsertPt could not be a terminator!\n");
+ return false;
+ }
+
+ MachineInstrBuilder MI = BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(),
+ TII->get(ARM::t2DoLoopStartTP))
+ .add(LoopStart->getOperand(0))
+ .add(LoopStart->getOperand(1))
+ .addReg(CountReg);
+ (void)MI;
+ LLVM_DEBUG(dbgs() << "Replacing " << *LoopStart << " with "
+ << *MI.getInstr());
+ MRI->constrainRegClass(CountReg, &ARM::rGPRRegClass);
+ LoopStart->eraseFromParent();
+
+ return true;
+}
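Both this function and findLoopComponents pick the incoming value on the non-latch edge of a two-entry machine PHI whose operands are laid out as (def, val0, bb0, val1, bb1). A self-contained sketch of that selection (the helper name is illustrative):

#include <cassert>
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetOpcodes.h"

// Sketch: return the value flowing into the PHI from the edge that is not the
// loop latch, i.e. the value set up before the loop is entered.
static llvm::Register incomingFromPreheader(const llvm::MachineInstr &Phi,
                                            const llvm::MachineBasicBlock *Latch) {
  assert(Phi.getOpcode() == llvm::TargetOpcode::PHI &&
         Phi.getNumOperands() == 5 && "expected a two-entry PHI");
  return Phi.getOperand(2).getMBB() == Latch ? Phi.getOperand(3).getReg()
                                             : Phi.getOperand(1).getReg();
}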
// Returns true if Opcode is any VCMP Opcode.
static bool IsVCMP(unsigned Opcode) { return VCMPOpcodeToVPT(Opcode) != 0; }
@@ -356,7 +650,7 @@ bool MVEVPTOptimisations::ReduceOldVCCRValueUses(MachineBasicBlock &MBB) {
}
for (MachineInstr *DeadInstruction : DeadInstructions)
- DeadInstruction->removeFromParent();
+ DeadInstruction->eraseFromParent();
return Modified;
}
@@ -430,7 +724,130 @@ bool MVEVPTOptimisations::ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB) {
}
for (MachineInstr *DeadInstruction : DeadInstructions)
- DeadInstruction->removeFromParent();
+ DeadInstruction->eraseFromParent();
+
+ return !DeadInstructions.empty();
+}
+
+bool MVEVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB,
+ MachineDominatorTree *DT) {
+ // Scan through the block, looking for instructions that use constant moves
+ // into VPR that are the bitwise negation of one another. These are expected
+ // to be COPYs to VCCRRegClass, from a t2MOVi or t2MOVi16. The last seen
+ // constant mask is kept; it, or VPNOTs of it, are added or reused as we scan
+ // through the function.
+ unsigned LastVPTImm = 0;
+ Register LastVPTReg = 0;
+ SmallSet<MachineInstr *, 4> DeadInstructions;
+
+ for (MachineInstr &Instr : MBB.instrs()) {
+ // Look for predicated MVE instructions.
+ int PIdx = llvm::findFirstVPTPredOperandIdx(Instr);
+ if (PIdx == -1)
+ continue;
+ Register VPR = Instr.getOperand(PIdx + 1).getReg();
+ if (!VPR.isVirtual())
+ continue;
+
+ // From that we are looking for an instruction like %11:vccr = COPY %9:rgpr.
+ MachineInstr *Copy = MRI->getVRegDef(VPR);
+ if (!Copy || Copy->getOpcode() != TargetOpcode::COPY ||
+ !Copy->getOperand(1).getReg().isVirtual() ||
+ MRI->getRegClass(Copy->getOperand(1).getReg()) == &ARM::VCCRRegClass) {
+ LastVPTReg = 0;
+ continue;
+ }
+ Register GPR = Copy->getOperand(1).getReg();
+
+ // Find the Immediate used by the copy.
+ auto getImm = [&](Register GPR) -> unsigned {
+ MachineInstr *Def = MRI->getVRegDef(GPR);
+ if (Def && (Def->getOpcode() == ARM::t2MOVi ||
+ Def->getOpcode() == ARM::t2MOVi16))
+ return Def->getOperand(1).getImm();
+ return -1U;
+ };
+ unsigned Imm = getImm(GPR);
+ if (Imm == -1U) {
+ LastVPTReg = 0;
+ continue;
+ }
+
+ unsigned NotImm = ~Imm & 0xffff;
+ if (LastVPTReg != 0 && LastVPTReg != VPR && LastVPTImm == Imm) {
+ Instr.getOperand(PIdx + 1).setReg(LastVPTReg);
+ if (MRI->use_empty(VPR)) {
+ DeadInstructions.insert(Copy);
+ if (MRI->hasOneUse(GPR))
+ DeadInstructions.insert(MRI->getVRegDef(GPR));
+ }
+ LLVM_DEBUG(dbgs() << "Reusing predicate: in " << Instr);
+ } else if (LastVPTReg != 0 && LastVPTImm == NotImm) {
+ // We have found the not of a previous constant. Create a VPNot of the
+ // earlier predicate reg and use it instead of the copy.
+ Register NewVPR = MRI->createVirtualRegister(&ARM::VCCRRegClass);
+ auto VPNot = BuildMI(MBB, &Instr, Instr.getDebugLoc(),
+ TII->get(ARM::MVE_VPNOT), NewVPR)
+ .addReg(LastVPTReg);
+ addUnpredicatedMveVpredNOp(VPNot);
+
+ // Use the new register and check if the def is now dead.
+ Instr.getOperand(PIdx + 1).setReg(NewVPR);
+ if (MRI->use_empty(VPR)) {
+ DeadInstructions.insert(Copy);
+ if (MRI->hasOneUse(GPR))
+ DeadInstructions.insert(MRI->getVRegDef(GPR));
+ }
+ LLVM_DEBUG(dbgs() << "Adding VPNot: " << *VPNot << " to replace use at "
+ << Instr);
+ VPR = NewVPR;
+ }
+
+ LastVPTImm = Imm;
+ LastVPTReg = VPR;
+ }
+
+ for (MachineInstr *DI : DeadInstructions)
+ DI->eraseFromParent();
+
+ return !DeadInstructions.empty();
+}
+
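The reuse test above reduces to comparing 16-bit immediates. An illustrative check, not part of the pass itself:

// Illustrative only: a predicate built from immediate B can be recreated with
// a single MVE_VPNOT of the predicate built from immediate A exactly when the
// two immediates are bitwise complements within the 16 predicate bits, e.g.
// 0x5555 and 0xaaaa.
static bool isComplementaryMask(unsigned A, unsigned B) {
  return ((~A) & 0xffff) == (B & 0xffff);
}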
+// Replace VPSEL with a predicated VMOV in blocks with a VCTP. This is a
+// somewhat blunt approximation to allow tail predication of loops with vpsel
+// instructions. We turn a vselect into a VPSEL in ISEL, but they have slightly
+// different semantics under tail predication. Until that is modelled we just
+// convert to a VMOVT (via a predicated VORR) instead.
+bool MVEVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) {
+ bool HasVCTP = false;
+ SmallVector<MachineInstr *, 4> DeadInstructions;
+
+ for (MachineInstr &MI : MBB.instrs()) {
+ if (isVCTP(&MI)) {
+ HasVCTP = true;
+ continue;
+ }
+
+ if (!HasVCTP || MI.getOpcode() != ARM::MVE_VPSEL)
+ continue;
+
+ MachineInstrBuilder MIBuilder =
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(ARM::MVE_VORR))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(1))
+ .addImm(ARMVCC::Then)
+ .add(MI.getOperand(4))
+ .add(MI.getOperand(2));
+ // Silence unused variable warning in release builds.
+ (void)MIBuilder;
+ LLVM_DEBUG(dbgs() << "Replacing VPSEL: "; MI.dump();
+ dbgs() << " with VMOVT: "; MIBuilder.getInstr()->dump());
+ DeadInstructions.push_back(&MI);
+ }
+
+ for (MachineInstr *DeadInstruction : DeadInstructions)
+ DeadInstruction->eraseFromParent();
return !DeadInstructions.empty();
}
@@ -439,19 +856,28 @@ bool MVEVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {
const ARMSubtarget &STI =
static_cast<const ARMSubtarget &>(Fn.getSubtarget());
- if (!STI.isThumb2() || !STI.hasMVEIntegerOps())
+ if (!STI.isThumb2() || !STI.hasLOB())
return false;
TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
MRI = &Fn.getRegInfo();
+ MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfo>();
+ MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
LLVM_DEBUG(dbgs() << "********** ARM MVE VPT Optimisations **********\n"
<< "********** Function: " << Fn.getName() << '\n');
bool Modified = false;
+ for (MachineLoop *ML : MLI->getBase().getLoopsInPreorder()) {
+ Modified |= MergeLoopEnd(ML);
+ Modified |= ConvertTailPredLoop(ML, DT);
+ }
+
for (MachineBasicBlock &MBB : Fn) {
+ Modified |= ReplaceConstByVPNOTs(MBB, DT);
Modified |= ReplaceVCMPsByVPNOTs(MBB);
Modified |= ReduceOldVCCRValueUses(MBB);
+ Modified |= ConvertVPSEL(MBB);
}
LLVM_DEBUG(dbgs() << "**************************************\n");
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index 48c6b47f2154..d728572e2858 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -12,6 +12,7 @@
#include "Thumb2InstrInfo.h"
#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -38,6 +39,11 @@ OldT2IfCvt("old-thumb2-ifcvt", cl::Hidden,
cl::desc("Use old-style Thumb2 if-conversion heuristics"),
cl::init(false));
+static cl::opt<bool>
+PreferNoCSEL("prefer-no-csel", cl::Hidden,
+ cl::desc("Prefer predicated Move to CSEL"),
+ cl::init(false));
+
Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI)
: ARMBaseInstrInfo(STI) {}
@@ -118,6 +124,31 @@ Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
return getITInstrPredicate(*MBBI, PredReg) == ARMCC::AL;
}
+MachineInstr *
+Thumb2InstrInfo::optimizeSelect(MachineInstr &MI,
+ SmallPtrSetImpl<MachineInstr *> &SeenMIs,
+ bool PreferFalse) const {
+ // Try to use the base optimizeSelect, which uses canFoldIntoMOVCC to fold the
+ // MOVCC into another instruction. If that fails, on 8.1-M fall back to using a
+ // CSEL.
+ MachineInstr *RV = ARMBaseInstrInfo::optimizeSelect(MI, SeenMIs, PreferFalse);
+ if (!RV && getSubtarget().hasV8_1MMainlineOps() && !PreferNoCSEL) {
+ Register DestReg = MI.getOperand(0).getReg();
+
+ if (!DestReg.isVirtual())
+ return nullptr;
+
+ MachineInstrBuilder NewMI = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ get(ARM::t2CSEL), DestReg)
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(3));
+ SeenMIs.insert(NewMI);
+ return NewMI;
+ }
+ return RV;
+}
+
void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, MCRegister DestReg,
@@ -227,6 +258,22 @@ void Thumb2InstrInfo::expandLoadStackGuard(
expandLoadStackGuardBase(MI, ARM::t2MOVi32imm, ARM::t2LDRi12);
}
+MachineInstr *Thumb2InstrInfo::commuteInstructionImpl(MachineInstr &MI,
+ bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const {
+ switch (MI.getOpcode()) {
+ case ARM::MVE_VMAXNMAf16:
+ case ARM::MVE_VMAXNMAf32:
+ case ARM::MVE_VMINNMAf16:
+ case ARM::MVE_VMINNMAf32:
+ // Don't allow predicated instructions to be commuted.
+ if (getVPTInstrPredicate(MI) != ARMVCC::None)
+ return nullptr;
+ }
+ return ARMBaseInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+}
+
void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
const DebugLoc &dl, Register DestReg,
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h
index ec3763632239..808167bfdcbc 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h
@@ -60,6 +60,14 @@ public:
///
const ThumbRegisterInfo &getRegisterInfo() const override { return RI; }
+ MachineInstr *optimizeSelect(MachineInstr &MI,
+ SmallPtrSetImpl<MachineInstr *> &SeenMIs,
+ bool) const override;
+
+ MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const override;
+
private:
void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override;
};
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
index ae661594bdc9..0f7e19038673 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -43,7 +43,7 @@
using namespace llvm;
-#define DEBUG_TYPE "t2-reduce-size"
+#define DEBUG_TYPE "thumb2-reduce-size"
#define THUMB2_SIZE_REDUCE_NAME "Thumb2 instruction size reduce pass"
STATISTIC(NumNarrows, "Number of 32-bit instrs reduced to 16-bit ones");
diff --git a/contrib/llvm-project/llvm/lib/Target/AVR/AVRDevices.td b/contrib/llvm-project/llvm/lib/Target/AVR/AVRDevices.td
index 6730f2e1673e..9507aa40c3d8 100644
--- a/contrib/llvm-project/llvm/lib/Target/AVR/AVRDevices.td
+++ b/contrib/llvm-project/llvm/lib/Target/AVR/AVRDevices.td
@@ -195,7 +195,7 @@ def FamilyAVR6 : Family<"avr6",
def FamilyTiny : Family<"avrtiny",
[FamilyAVR0, FeatureBREAK, FeatureSRAM,
- FeatureTinyEncoding, FeatureMMR]>;
+ FeatureTinyEncoding]>;
def FamilyXMEGA : Family<"xmega",
[FamilyAVR0, FeatureLPM, FeatureIJMPCALL, FeatureADDSUBIW,
@@ -286,8 +286,10 @@ def : Device<"attiny45", FamilyAVR25, ELFArchAVR25>;
def : Device<"attiny85", FamilyAVR25, ELFArchAVR25>;
def : Device<"attiny261", FamilyAVR25, ELFArchAVR25>;
def : Device<"attiny261a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny441", FamilyAVR25, ELFArchAVR25>;
def : Device<"attiny461", FamilyAVR25, ELFArchAVR25>;
def : Device<"attiny461a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny841", FamilyAVR25, ELFArchAVR25>;
def : Device<"attiny861", FamilyAVR25, ELFArchAVR25>;
def : Device<"attiny861a", FamilyAVR25, ELFArchAVR25>;
def : Device<"attiny87", FamilyAVR25, ELFArchAVR25>;
@@ -307,19 +309,23 @@ def : Device<"atmega8u2", FamilyAVR35, ELFArchAVR35>;
def : Device<"atmega16u2", FamilyAVR35, ELFArchAVR35>;
def : Device<"atmega32u2", FamilyAVR35, ELFArchAVR35>;
def : Device<"attiny1634", FamilyAVR35, ELFArchAVR35>;
-def : Device<"atmega8", FamilyAVR4, ELFArchAVR4>; // FIXME: family may be wrong
+def : Device<"atmega8", FamilyAVR2, ELFArchAVR4,
+ [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
def : Device<"ata6289", FamilyAVR4, ELFArchAVR4>;
-def : Device<"atmega8a", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega8a", FamilyAVR2, ELFArchAVR4,
+ [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
def : Device<"ata6285", FamilyAVR4, ELFArchAVR4>;
def : Device<"ata6286", FamilyAVR4, ELFArchAVR4>;
def : Device<"atmega48", FamilyAVR4, ELFArchAVR4>;
def : Device<"atmega48a", FamilyAVR4, ELFArchAVR4>;
def : Device<"atmega48pa", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega48pb", FamilyAVR4, ELFArchAVR4>;
def : Device<"atmega48p", FamilyAVR4, ELFArchAVR4>;
def : Device<"atmega88", FamilyAVR4, ELFArchAVR4>;
def : Device<"atmega88a", FamilyAVR4, ELFArchAVR4>;
def : Device<"atmega88p", FamilyAVR4, ELFArchAVR4>;
def : Device<"atmega88pa", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega88pb", FamilyAVR4, ELFArchAVR4>;
def : Device<"atmega8515", FamilyAVR2, ELFArchAVR4,
[FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
def : Device<"atmega8535", FamilyAVR2, ELFArchAVR4,
@@ -351,6 +357,7 @@ def : Device<"atmega168", FamilyAVR5, ELFArchAVR5>;
def : Device<"atmega168a", FamilyAVR5, ELFArchAVR5>;
def : Device<"atmega168p", FamilyAVR5, ELFArchAVR5>;
def : Device<"atmega168pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega168pb", FamilyAVR5, ELFArchAVR5>;
def : Device<"atmega169", FamilyAVR5, ELFArchAVR5>;
def : Device<"atmega169a", FamilyAVR5, ELFArchAVR5>;
def : Device<"atmega169p", FamilyAVR5, ELFArchAVR5>;
@@ -361,6 +368,7 @@ def : Device<"atmega323", FamilyAVR5, ELFArchAVR5>;
def : Device<"atmega324a", FamilyAVR5, ELFArchAVR5>;
def : Device<"atmega324p", FamilyAVR5, ELFArchAVR5>;
def : Device<"atmega324pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega324pb", FamilyAVR5, ELFArchAVR5>;
def : Device<"atmega325", FamilyAVR5, ELFArchAVR5>;
def : Device<"atmega325a", FamilyAVR5, ELFArchAVR5>;
def : Device<"atmega325p", FamilyAVR5, ELFArchAVR5>;
@@ -371,6 +379,7 @@ def : Device<"atmega3250p", FamilyAVR5, ELFArchAVR5>;
def : Device<"atmega3250pa", FamilyAVR5, ELFArchAVR5>;
def : Device<"atmega328", FamilyAVR5, ELFArchAVR5>;
def : Device<"atmega328p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega328pb", FamilyAVR5, ELFArchAVR5>;
def : Device<"atmega329", FamilyAVR5, ELFArchAVR5>;
def : Device<"atmega329a", FamilyAVR5, ELFArchAVR5>;
def : Device<"atmega329p", FamilyAVR5, ELFArchAVR5>;
@@ -451,9 +460,9 @@ def : Device<"atxmega32a4", FamilyXMEGA, ELFArchXMEGA2>;
def : Device<"atxmega32a4u", FamilyXMEGAU, ELFArchXMEGA2>;
def : Device<"atxmega32c4", FamilyXMEGAU, ELFArchXMEGA2>;
def : Device<"atxmega32d4", FamilyXMEGA, ELFArchXMEGA2>;
-def : Device<"atxmega32e5", FamilyXMEGA, ELFArchXMEGA2>;
-def : Device<"atxmega16e5", FamilyXMEGA, ELFArchXMEGA2>;
-def : Device<"atxmega8e5", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega32e5", FamilyXMEGAU, ELFArchXMEGA2>;
+def : Device<"atxmega16e5", FamilyXMEGAU, ELFArchXMEGA2>;
+def : Device<"atxmega8e5", FamilyXMEGAU, ELFArchXMEGA2>;
def : Device<"atxmega32x1", FamilyXMEGA, ELFArchXMEGA2>;
def : Device<"atxmega64a3", FamilyXMEGA, ELFArchXMEGA4>;
def : Device<"atxmega64a3u", FamilyXMEGAU, ELFArchXMEGA4>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/contrib/llvm-project/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
index 8ee69201e932..a48d3d134bb5 100644
--- a/contrib/llvm-project/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
@@ -416,6 +416,44 @@ bool AVRExpandPseudo::expand<AVR::COMWRd>(Block &MBB, BlockIt MBBI) {
}
template <>
+bool AVRExpandPseudo::expand<AVR::NEGWRd>(Block &MBB, BlockIt MBBI) {
+ MachineInstr &MI = *MBBI;
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ bool DstIsKill = MI.getOperand(1).isKill();
+ bool ImpIsDead = MI.getOperand(2).isDead();
+ TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+ // Do NEG on the upper byte.
+ auto MIBHI =
+ buildMI(MBB, MBBI, AVR::NEGRd)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(DstIsKill));
+ // SREG is always implicitly dead
+ MIBHI->getOperand(2).setIsDead();
+
+ // Do NEG on the lower byte.
+ buildMI(MBB, MBBI, AVR::NEGRd)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg, getKillRegState(DstIsKill));
+
+ // Do an extra SBCI.
+ auto MISBCI =
+ buildMI(MBB, MBBI, AVR::SBCIRdK)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(DstIsKill))
+ .addImm(0);
+ if (ImpIsDead)
+ MISBCI->getOperand(3).setIsDead();
+ // SREG is always implicitly killed
+ MISBCI->getOperand(4).setIsKill();
+
+ MI.eraseFromParent();
+ return true;
+}
+
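An arithmetic model of the expansion just added, useful for checking the borrow handling; the function is illustrative, not emitted code:

#include <cstdint>

// Model of NEGWRd: negate the high byte, negate the low byte, then
// "sbci Rd+1, 0" folds in the borrow that the low-byte NEG produces whenever
// the low byte is non-zero. Example: 0x1234 -> 0xedcc.
static uint16_t modelNegw(uint16_t X) {
  uint8_t Lo = X & 0xff;
  uint8_t Hi = X >> 8;
  uint8_t NegHi = (uint8_t)(0 - Hi);            // neg Rd+1
  uint8_t NegLo = (uint8_t)(0 - Lo);            // neg Rd, carry set if Lo != 0
  NegHi = (uint8_t)(NegHi - (Lo != 0 ? 1 : 0)); // sbci Rd+1, 0
  return (uint16_t)((NegHi << 8) | NegLo);      // equals (uint16_t)(0 - X)
}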
+template <>
bool AVRExpandPseudo::expand<AVR::CPWRdRr>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
Register SrcLoReg, SrcHiReg, DstLoReg, DstHiReg;
@@ -1438,6 +1476,111 @@ bool AVRExpandPseudo::expand<AVR::ASRWRd>(Block &MBB, BlockIt MBBI) {
return true;
}
+template <>
+bool AVRExpandPseudo::expand<AVR::LSLB7Rd>(Block &MBB, BlockIt MBBI) {
+ MachineInstr &MI = *MBBI;
+ Register DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ bool DstIsKill = MI.getOperand(1).isKill();
+ bool ImpIsDead = MI.getOperand(2).isDead();
+
+ // ror r24
+ // clr r24
+ // ror r24
+
+ buildMI(MBB, MBBI, AVR::RORRd)
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg, getKillRegState(DstIsKill));
+
+ buildMI(MBB, MBBI, AVR::EORRdRr)
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg, getKillRegState(DstIsKill))
+ .addReg(DstReg, getKillRegState(DstIsKill));
+
+ auto MIRRC =
+ buildMI(MBB, MBBI, AVR::RORRd)
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg, getKillRegState(DstIsKill));
+
+ if (ImpIsDead)
+ MIRRC->getOperand(2).setIsDead();
+
+ // SREG is always implicitly killed
+ MIRRC->getOperand(3).setIsKill();
+
+ MI.eraseFromParent();
+ return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::LSRB7Rd>(Block &MBB, BlockIt MBBI) {
+ MachineInstr &MI = *MBBI;
+ Register DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ bool DstIsKill = MI.getOperand(1).isKill();
+ bool ImpIsDead = MI.getOperand(2).isDead();
+
+ // rol r24
+ // clr r24
+ // rol r24
+
+ buildMI(MBB, MBBI, AVR::ADCRdRr)
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg, getKillRegState(DstIsKill))
+ .addReg(DstReg, getKillRegState(DstIsKill));
+
+ buildMI(MBB, MBBI, AVR::EORRdRr)
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg, getKillRegState(DstIsKill))
+ .addReg(DstReg, getKillRegState(DstIsKill));
+
+ auto MIRRC =
+ buildMI(MBB, MBBI, AVR::ADCRdRr)
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg, getKillRegState(DstIsKill))
+ .addReg(DstReg, getKillRegState(DstIsKill));
+
+ if (ImpIsDead)
+ MIRRC->getOperand(3).setIsDead();
+
+ // SREG is always implicitly killed
+ MIRRC->getOperand(4).setIsKill();
+
+ MI.eraseFromParent();
+ return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::ASRB7Rd>(Block &MBB, BlockIt MBBI) {
+ MachineInstr &MI = *MBBI;
+ Register DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ bool DstIsKill = MI.getOperand(1).isKill();
+ bool ImpIsDead = MI.getOperand(2).isDead();
+
+ // lsl r24
+ // sbc r24, r24
+
+ buildMI(MBB, MBBI, AVR::ADDRdRr)
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg, getKillRegState(DstIsKill))
+ .addReg(DstReg, getKillRegState(DstIsKill));
+
+ auto MIRRC = buildMI(MBB, MBBI, AVR::SBCRdRr)
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg, getKillRegState(DstIsKill))
+ .addReg(DstReg, getKillRegState(DstIsKill));
+
+ if (ImpIsDead)
+ MIRRC->getOperand(3).setIsDead();
+
+ // SREG is always implicitly killed
+ MIRRC->getOperand(4).setIsKill();
+
+ MI.eraseFromParent();
+ return true;
+}
+
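A behavioural model of the three shift-by-7 expansions above; each result depends on only one bit of the input, which is why two or three instructions suffice (illustrative code, not part of the backend):

#include <cstdint>

// LSLB7Rd moves bit 0 into bit 7, LSRB7Rd moves bit 7 into bit 0, and ASRB7Rd
// broadcasts the sign bit across the whole byte.
static uint8_t modelShiftBy7(uint8_t X, char Kind) {
  switch (Kind) {
  case 'l': return (uint8_t)(X << 7);        // LSLB7Rd
  case 'r': return (uint8_t)(X >> 7);        // LSRB7Rd
  default:  return (X & 0x80) ? 0xff : 0x00; // ASRB7Rd
  }
}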
template <> bool AVRExpandPseudo::expand<AVR::SEXT>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
Register DstLoReg, DstHiReg;
@@ -1616,6 +1759,7 @@ bool AVRExpandPseudo::expandMI(Block &MBB, BlockIt MBBI) {
EXPAND(AVR::ORIWRdK);
EXPAND(AVR::EORWRdRr);
EXPAND(AVR::COMWRd);
+ EXPAND(AVR::NEGWRd);
EXPAND(AVR::CPWRdRr);
EXPAND(AVR::CPCWRdRr);
EXPAND(AVR::LDIWRdK);
@@ -1658,6 +1802,9 @@ bool AVRExpandPseudo::expandMI(Block &MBB, BlockIt MBBI) {
EXPAND(AVR::RORWRd);
EXPAND(AVR::ROLWRd);
EXPAND(AVR::ASRWRd);
+ EXPAND(AVR::LSLB7Rd);
+ EXPAND(AVR::LSRB7Rd);
+ EXPAND(AVR::ASRB7Rd);
EXPAND(AVR::SEXT);
EXPAND(AVR::ZEXT);
EXPAND(AVR::SPREAD);
diff --git a/contrib/llvm-project/llvm/lib/Target/AVR/AVRFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AVR/AVRFrameLowering.cpp
index c95a553b86ac..757b41466c3f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AVR/AVRFrameLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AVR/AVRFrameLowering.cpp
@@ -131,6 +131,26 @@ void AVRFrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
+static void restoreStatusRegister(MachineFunction &MF, MachineBasicBlock &MBB) {
+ const AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
+
+ MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+
+ DebugLoc DL = MBBI->getDebugLoc();
+ const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
+ const AVRInstrInfo &TII = *STI.getInstrInfo();
+
+ // Emit special epilogue code to restore R1, R0 and SREG in interrupt/signal
+ // handlers at the very end of the function, just before reti.
+ if (AFI->isInterruptOrSignalHandler()) {
+ BuildMI(MBB, MBBI, DL, TII.get(AVR::POPRd), AVR::R0);
+ BuildMI(MBB, MBBI, DL, TII.get(AVR::OUTARr))
+ .addImm(0x3f)
+ .addReg(AVR::R0, RegState::Kill);
+ BuildMI(MBB, MBBI, DL, TII.get(AVR::POPWRd), AVR::R1R0);
+ }
+}
+
void AVRFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
const AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
@@ -151,18 +171,9 @@ void AVRFrameLowering::emitEpilogue(MachineFunction &MF,
const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
const AVRInstrInfo &TII = *STI.getInstrInfo();
- // Emit special epilogue code to restore R1, R0 and SREG in interrupt/signal
- // handlers at the very end of the function, just before reti.
- if (AFI->isInterruptOrSignalHandler()) {
- BuildMI(MBB, MBBI, DL, TII.get(AVR::POPRd), AVR::R0);
- BuildMI(MBB, MBBI, DL, TII.get(AVR::OUTARr))
- .addImm(0x3f)
- .addReg(AVR::R0, RegState::Kill);
- BuildMI(MBB, MBBI, DL, TII.get(AVR::POPWRd), AVR::R1R0);
- }
-
// Early exit if there is no need to restore the frame pointer.
if (!FrameSize) {
+ restoreStatusRegister(MF, MBB);
return;
}
@@ -198,6 +209,8 @@ void AVRFrameLowering::emitEpilogue(MachineFunction &MF,
// Write back R29R28 to SP and temporarily disable interrupts.
BuildMI(MBB, MBBI, DL, TII.get(AVR::SPWRITE), AVR::SP)
.addReg(AVR::R29R28, RegState::Kill);
+
+ restoreStatusRegister(MF, MBB);
}
// Return true if the specified function should have a dedicated frame
diff --git a/contrib/llvm-project/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp
index fe31fa42c403..df382d553753 100644
--- a/contrib/llvm-project/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp
@@ -242,10 +242,7 @@ bool AVRDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
ConstantSDNode *ImmNode = dyn_cast<ConstantSDNode>(ImmOp);
unsigned Reg;
- bool CanHandleRegImmOpt = true;
-
- CanHandleRegImmOpt &= ImmNode != 0;
- CanHandleRegImmOpt &= ImmNode->getAPIntValue().getZExtValue() < 64;
+ bool CanHandleRegImmOpt = ImmNode && ImmNode->getAPIntValue().ult(64);
if (CopyFromRegOp->getOpcode() == ISD::CopyFromReg) {
RegisterSDNode *RegNode =
diff --git a/contrib/llvm-project/llvm/lib/Target/AVR/AVRISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AVR/AVRISelLowering.cpp
index bf9b32e1278e..3e7c2984655a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -334,6 +334,36 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("Invalid shift opcode");
}
+ // Optimize int8 shifts.
+ if (VT.getSizeInBits() == 8) {
+ if (Op.getOpcode() == ISD::SHL && 4 <= ShiftAmount && ShiftAmount < 7) {
+ // Optimize LSL when 4 <= ShiftAmount <= 6.
+ Victim = DAG.getNode(AVRISD::SWAP, dl, VT, Victim);
+ Victim =
+ DAG.getNode(ISD::AND, dl, VT, Victim, DAG.getConstant(0xf0, dl, VT));
+ ShiftAmount -= 4;
+ } else if (Op.getOpcode() == ISD::SRL && 4 <= ShiftAmount &&
+ ShiftAmount < 7) {
+ // Optimize LSR when 4 <= ShiftAmount <= 6.
+ Victim = DAG.getNode(AVRISD::SWAP, dl, VT, Victim);
+ Victim =
+ DAG.getNode(ISD::AND, dl, VT, Victim, DAG.getConstant(0x0f, dl, VT));
+ ShiftAmount -= 4;
+ } else if (Op.getOpcode() == ISD::SHL && ShiftAmount == 7) {
+ // Optimize LSL when ShiftAmount == 7.
+ Victim = DAG.getNode(AVRISD::LSL7, dl, VT, Victim);
+ ShiftAmount = 0;
+ } else if (Op.getOpcode() == ISD::SRL && ShiftAmount == 7) {
+ // Optimize LSR when ShiftAmount == 7.
+ Victim = DAG.getNode(AVRISD::LSR7, dl, VT, Victim);
+ ShiftAmount = 0;
+ } else if (Op.getOpcode() == ISD::SRA && ShiftAmount == 7) {
+ // Optimize ASR when ShiftAmount == 7.
+ Victim = DAG.getNode(AVRISD::ASR7, dl, VT, Victim);
+ ShiftAmount = 0;
+ }
+ }
+
while (ShiftAmount--) {
Victim = DAG.getNode(Opc8, dl, VT, Victim);
}
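A worked instance of the 4-to-6 case above for a logical left shift by 5, written as plain C++ rather than DAG nodes (illustrative only):

#include <cstdint>

// SHL by 5: SWAP the nibbles, AND with 0xf0 (what a shift by 4 would keep),
// then the remaining ShiftAmount of 1 is handled by the single-bit shift loop
// that follows.
static uint8_t modelShl5(uint8_t X) {
  uint8_t Swapped = (uint8_t)((X << 4) | (X >> 4)); // AVRISD::SWAP
  uint8_t Masked = Swapped & 0xf0;                  // ISD::AND with 0xf0
  return (uint8_t)(Masked << 1);                    // remaining single-bit LSL
}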
@@ -437,6 +467,36 @@ static AVRCC::CondCodes intCCToAVRCC(ISD::CondCode CC) {
}
}
+/// Returns the appropriate CP/CPI/CPC node sequence for the given 8/16-bit operands.
+SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS,
+ SelectionDAG &DAG, SDLoc DL) const {
+ assert((LHS.getSimpleValueType() == RHS.getSimpleValueType()) &&
+ "LHS and RHS have different types");
+ assert(((LHS.getSimpleValueType() == MVT::i16) ||
+ (LHS.getSimpleValueType() == MVT::i8)) && "invalid comparison type");
+
+ SDValue Cmp;
+
+ if (LHS.getSimpleValueType() == MVT::i16 && dyn_cast<ConstantSDNode>(RHS)) {
+ // Generate a CPI/CPC pair if RHS is a 16-bit constant.
+ SDValue LHSlo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i8, LHS,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue LHShi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i8, LHS,
+ DAG.getIntPtrConstant(1, DL));
+ SDValue RHSlo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i8, RHS,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue RHShi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i8, RHS,
+ DAG.getIntPtrConstant(1, DL));
+ Cmp = DAG.getNode(AVRISD::CMP, DL, MVT::Glue, LHSlo, RHSlo);
+ Cmp = DAG.getNode(AVRISD::CMPC, DL, MVT::Glue, LHShi, RHShi, Cmp);
+ } else {
+ // Generate ordinary 16-bit comparison.
+ Cmp = DAG.getNode(AVRISD::CMP, DL, MVT::Glue, LHS, RHS);
+ }
+
+ return Cmp;
+}
+
/// Returns appropriate AVR CMP/CMPC nodes and corresponding condition code for
/// the given operands.
SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
@@ -549,7 +609,7 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
DAG.getIntPtrConstant(1, DL));
Cmp = DAG.getNode(AVRISD::TST, DL, MVT::Glue, Top);
} else {
- Cmp = DAG.getNode(AVRISD::CMP, DL, MVT::Glue, LHSlo, RHSlo);
+ Cmp = getAVRCmp(LHSlo, RHSlo, DAG, DL);
Cmp = DAG.getNode(AVRISD::CMPC, DL, MVT::Glue, LHShi, RHShi, Cmp);
}
} else if (VT == MVT::i64) {
@@ -587,7 +647,7 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
DAG.getIntPtrConstant(1, DL));
Cmp = DAG.getNode(AVRISD::TST, DL, MVT::Glue, Top);
} else {
- Cmp = DAG.getNode(AVRISD::CMP, DL, MVT::Glue, LHS0, RHS0);
+ Cmp = getAVRCmp(LHS0, RHS0, DAG, DL);
Cmp = DAG.getNode(AVRISD::CMPC, DL, MVT::Glue, LHS1, RHS1, Cmp);
Cmp = DAG.getNode(AVRISD::CMPC, DL, MVT::Glue, LHS2, RHS2, Cmp);
Cmp = DAG.getNode(AVRISD::CMPC, DL, MVT::Glue, LHS3, RHS3, Cmp);
@@ -601,7 +661,7 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
: DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i8,
LHS, DAG.getIntPtrConstant(1, DL)));
} else {
- Cmp = DAG.getNode(AVRISD::CMP, DL, MVT::Glue, LHS, RHS);
+ Cmp = getAVRCmp(LHS, RHS, DAG, DL);
}
} else {
llvm_unreachable("Invalid comparison size");
@@ -676,7 +736,7 @@ SDValue AVRTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
SDValue FI = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), getPointerTy(DL));
return DAG.getStore(Op.getOperand(0), dl, FI, Op.getOperand(1),
- MachinePointerInfo(SV), 0);
+ MachinePointerInfo(SV));
}
SDValue AVRTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
@@ -1096,8 +1156,7 @@ SDValue AVRTargetLowering::LowerFormalArguments(
// from this parameter.
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DL));
InVals.push_back(DAG.getLoad(LocVT, dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(MF, FI),
- 0));
+ MachinePointerInfo::getFixedStack(MF, FI)));
}
}
@@ -1230,8 +1289,7 @@ SDValue AVRTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain =
DAG.getStore(Chain, DL, Arg, PtrOff,
- MachinePointerInfo::getStack(MF, VA.getLocMemOffset()),
- 0);
+ MachinePointerInfo::getStack(MF, VA.getLocMemOffset()));
}
}
@@ -1460,9 +1518,11 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
// Create loop block.
MachineBasicBlock *LoopBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *CheckBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *RemBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(I, LoopBB);
+ F->insert(I, CheckBB);
F->insert(I, RemBB);
// Update machine-CFG edges by transferring all successors of the current
@@ -1471,14 +1531,14 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
BB->end());
RemBB->transferSuccessorsAndUpdatePHIs(BB);
- // Add adges BB => LoopBB => RemBB, BB => RemBB, LoopBB => LoopBB.
- BB->addSuccessor(LoopBB);
- BB->addSuccessor(RemBB);
- LoopBB->addSuccessor(RemBB);
- LoopBB->addSuccessor(LoopBB);
+ // Add edges BB => LoopBB => CheckBB => RemBB, CheckBB => LoopBB.
+ BB->addSuccessor(CheckBB);
+ LoopBB->addSuccessor(CheckBB);
+ CheckBB->addSuccessor(LoopBB);
+ CheckBB->addSuccessor(RemBB);
- Register ShiftAmtReg = RI.createVirtualRegister(&AVR::LD8RegClass);
- Register ShiftAmtReg2 = RI.createVirtualRegister(&AVR::LD8RegClass);
+ Register ShiftAmtReg = RI.createVirtualRegister(&AVR::GPR8RegClass);
+ Register ShiftAmtReg2 = RI.createVirtualRegister(&AVR::GPR8RegClass);
Register ShiftReg = RI.createVirtualRegister(RC);
Register ShiftReg2 = RI.createVirtualRegister(RC);
Register ShiftAmtSrcReg = MI.getOperand(2).getReg();
@@ -1486,44 +1546,41 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
Register DstReg = MI.getOperand(0).getReg();
// BB:
- // cpi N, 0
- // breq RemBB
- BuildMI(BB, dl, TII.get(AVR::CPIRdK)).addReg(ShiftAmtSrcReg).addImm(0);
- BuildMI(BB, dl, TII.get(AVR::BREQk)).addMBB(RemBB);
+ // rjmp CheckBB
+ BuildMI(BB, dl, TII.get(AVR::RJMPk)).addMBB(CheckBB);
// LoopBB:
- // ShiftReg = phi [%SrcReg, BB], [%ShiftReg2, LoopBB]
- // ShiftAmt = phi [%N, BB], [%ShiftAmt2, LoopBB]
// ShiftReg2 = shift ShiftReg
+ auto ShiftMI = BuildMI(LoopBB, dl, TII.get(Opc), ShiftReg2).addReg(ShiftReg);
+ if (HasRepeatedOperand)
+ ShiftMI.addReg(ShiftReg);
+
+ // CheckBB:
+ // ShiftReg = phi [%SrcReg, BB], [%ShiftReg2, LoopBB]
+ // ShiftAmt = phi [%N, BB], [%ShiftAmt2, LoopBB]
+ // DestReg = phi [%SrcReg, BB], [%ShiftReg, LoopBB]
// ShiftAmt2 = ShiftAmt - 1;
- BuildMI(LoopBB, dl, TII.get(AVR::PHI), ShiftReg)
+ // if (ShiftAmt2 >= 0) goto LoopBB;
+ BuildMI(CheckBB, dl, TII.get(AVR::PHI), ShiftReg)
.addReg(SrcReg)
.addMBB(BB)
.addReg(ShiftReg2)
.addMBB(LoopBB);
- BuildMI(LoopBB, dl, TII.get(AVR::PHI), ShiftAmtReg)
+ BuildMI(CheckBB, dl, TII.get(AVR::PHI), ShiftAmtReg)
.addReg(ShiftAmtSrcReg)
.addMBB(BB)
.addReg(ShiftAmtReg2)
.addMBB(LoopBB);
-
- auto ShiftMI = BuildMI(LoopBB, dl, TII.get(Opc), ShiftReg2).addReg(ShiftReg);
- if (HasRepeatedOperand)
- ShiftMI.addReg(ShiftReg);
-
- BuildMI(LoopBB, dl, TII.get(AVR::SUBIRdK), ShiftAmtReg2)
- .addReg(ShiftAmtReg)
- .addImm(1);
- BuildMI(LoopBB, dl, TII.get(AVR::BRNEk)).addMBB(LoopBB);
-
- // RemBB:
- // DestReg = phi [%SrcReg, BB], [%ShiftReg, LoopBB]
- BuildMI(*RemBB, RemBB->begin(), dl, TII.get(AVR::PHI), DstReg)
+ BuildMI(CheckBB, dl, TII.get(AVR::PHI), DstReg)
.addReg(SrcReg)
.addMBB(BB)
.addReg(ShiftReg2)
.addMBB(LoopBB);
+ BuildMI(CheckBB, dl, TII.get(AVR::DECRd), ShiftAmtReg2)
+ .addReg(ShiftAmtReg);
+ BuildMI(CheckBB, dl, TII.get(AVR::BRPLk)).addMBB(LoopBB);
+
MI.eraseFromParent(); // The pseudo instruction is gone now.
return RemBB;
}
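A behavioural model of the restructured loop (illustrative only): BB now jumps straight to CheckBB, which decrements first and only branches back into LoopBB while the decremented amount is still non-negative, so a shift amount of zero performs no iterations without the old CPI/BREQ pre-test:

#include <cstdint>

// DEC then BRPL re-enters the body while the N flag (bit 7 of the decremented
// amount) is clear, giving exactly Amount iterations for Amount in [0, 128].
static uint8_t modelShiftLoop(uint8_t Value, uint8_t Amount) {
  for (uint8_t N = (uint8_t)(Amount - 1); !(N & 0x80); N = (uint8_t)(N - 1))
    Value = (uint8_t)(Value << 1); // stands in for the selected shift opcode
  return Value;
}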
diff --git a/contrib/llvm-project/llvm/lib/Target/AVR/AVRISelLowering.h b/contrib/llvm-project/llvm/lib/Target/AVR/AVRISelLowering.h
index d1eaf53b15e9..7aff4159211b 100644
--- a/contrib/llvm-project/llvm/lib/Target/AVR/AVRISelLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/AVR/AVRISelLowering.h
@@ -38,6 +38,9 @@ enum NodeType {
LSL, ///< Logical shift left.
LSR, ///< Logical shift right.
ASR, ///< Arithmetic shift right.
+ LSL7, ///< Logical shift left 7 bits.
+ LSR7, ///< Logical shift right 7 bits.
+ ASR7, ///< Arithmetic shift right 7 bits.
ROR, ///< Bit rotate right.
ROL, ///< Bit rotate left.
LSLLOOP, ///< A loop of single logical shift left instructions.
@@ -56,6 +59,8 @@ enum NodeType {
CMPC,
/// Test for zero or minus instruction.
TST,
+ /// Swap Rd[7:4] <-> Rd[3:0].
+ SWAP,
/// Operand 0 and operand 1 are selection variable, operand 2
/// is condition code and operand 3 is flag operand.
SELECT_CC
@@ -136,6 +141,8 @@ public:
private:
SDValue getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AVRcc,
SelectionDAG &DAG, SDLoc dl) const;
+ SDValue getAVRCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
+ SDLoc dl) const;
SDValue LowerShifts(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
diff --git a/contrib/llvm-project/llvm/lib/Target/AVR/AVRInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AVR/AVRInstrInfo.td
index f03c254382b4..9f7c16fc96d2 100644
--- a/contrib/llvm-project/llvm/lib/Target/AVR/AVRInstrInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/AVR/AVRInstrInfo.td
@@ -59,6 +59,9 @@ def AVRlsr : SDNode<"AVRISD::LSR", SDTIntUnaryOp>;
def AVRrol : SDNode<"AVRISD::ROL", SDTIntUnaryOp>;
def AVRror : SDNode<"AVRISD::ROR", SDTIntUnaryOp>;
def AVRasr : SDNode<"AVRISD::ASR", SDTIntUnaryOp>;
+def AVRlsl7 : SDNode<"AVRISD::LSL7", SDTIntUnaryOp>;
+def AVRlsr7 : SDNode<"AVRISD::LSR7", SDTIntUnaryOp>;
+def AVRasr7 : SDNode<"AVRISD::ASR7", SDTIntUnaryOp>;
// Pseudo shift nodes for non-constant shift amounts.
def AVRlslLoop : SDNode<"AVRISD::LSLLOOP", SDTIntShiftOp>;
@@ -67,6 +70,9 @@ def AVRrolLoop : SDNode<"AVRISD::ROLLOOP", SDTIntShiftOp>;
def AVRrorLoop : SDNode<"AVRISD::RORLOOP", SDTIntShiftOp>;
def AVRasrLoop : SDNode<"AVRISD::ASRLOOP", SDTIntShiftOp>;
+// SWAP node.
+def AVRSwap : SDNode<"AVRISD::SWAP", SDTIntUnaryOp>;
+
//===----------------------------------------------------------------------===//
// AVR Operands, Complex Patterns and Transformations Definitions.
//===----------------------------------------------------------------------===//
@@ -732,13 +738,23 @@ Defs = [SREG] in
"comw\t$rd",
[(set i16:$rd, (not i16:$src)), (implicit SREG)]>;
- //:TODO: optimize NEG for wider types
def NEGRd : FRd<0b1001,
0b0100001,
(outs GPR8:$rd),
(ins GPR8:$src),
"neg\t$rd",
[(set i8:$rd, (ineg i8:$src)), (implicit SREG)]>;
+
+ // NEGW Rd+1:Rd
+ //
+ // Expands to:
+ // neg Rd+1
+ // neg Rd
+ // sbci Rd+1, 0
+ def NEGWRd : Pseudo<(outs DREGS:$rd),
+ (ins DREGS:$src),
+ "negw\t$rd",
+ [(set i16:$rd, (ineg i16:$src)), (implicit SREG)]>;
}
// TST Rd
@@ -1653,6 +1669,11 @@ Defs = [SREG] in
"lslw\t$rd",
[(set i16:$rd, (AVRlsl i16:$src)), (implicit SREG)]>;
+ def LSLB7Rd : Pseudo<(outs GPR8:$rd),
+ (ins GPR8:$src),
+ "lslb7\t$rd",
+ [(set i8:$rd, (AVRlsl7 i8:$src)), (implicit SREG)]>;
+
def LSRRd : FRd<0b1001,
0b0100110,
(outs GPR8:$rd),
@@ -1660,6 +1681,11 @@ Defs = [SREG] in
"lsr\t$rd",
[(set i8:$rd, (AVRlsr i8:$src)), (implicit SREG)]>;
+ def LSRB7Rd : Pseudo<(outs GPR8:$rd),
+ (ins GPR8:$src),
+ "lsrb7\t$rd",
+ [(set i8:$rd, (AVRlsr7 i8:$src)), (implicit SREG)]>;
+
def LSRWRd : Pseudo<(outs DREGS:$rd),
(ins DREGS:$src),
"lsrw\t$rd",
@@ -1672,6 +1698,11 @@ Defs = [SREG] in
"asr\t$rd",
[(set i8:$rd, (AVRasr i8:$src)), (implicit SREG)]>;
+ def ASRB7Rd : Pseudo<(outs GPR8:$rd),
+ (ins GPR8:$src),
+ "asrb7\t$rd",
+ [(set i8:$rd, (AVRasr7 i8:$src)), (implicit SREG)]>;
+
def ASRWRd : Pseudo<(outs DREGS:$rd),
(ins DREGS:$src),
"asrw\t$rd",
@@ -1719,7 +1750,7 @@ def SWAPRd : FRd<0b1001,
(outs GPR8:$rd),
(ins GPR8:$src),
"swap\t$rd",
- [(set i8:$rd, (bswap i8:$src))]>;
+ [(set i8:$rd, (AVRSwap i8:$src))]>;
// IO register bit set/clear operations.
//:TODO: add patterns when popcount(imm)==2 to be expanded with 2 sbi/cbi
diff --git a/contrib/llvm-project/llvm/lib/Target/AVR/AVRSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/AVR/AVRSubtarget.cpp
index 195ca95bc3bd..601865120491 100644
--- a/contrib/llvm-project/llvm/lib/Target/AVR/AVRSubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AVR/AVRSubtarget.cpp
@@ -29,7 +29,7 @@ namespace llvm {
AVRSubtarget::AVRSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const AVRTargetMachine &TM)
- : AVRGenSubtargetInfo(TT, CPU, FS), ELFArch(0),
+ : AVRGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), ELFArch(0),
// Subtarget features
m_hasSRAM(false), m_hasJMPCALL(false), m_hasIJMPCALL(false),
@@ -43,14 +43,14 @@ AVRSubtarget::AVRSubtarget(const Triple &TT, const std::string &CPU,
InstrInfo(), FrameLowering(),
TLInfo(TM, initializeSubtargetDependencies(CPU, FS, TM)), TSInfo() {
// Parse features string.
- ParseSubtargetFeatures(CPU, FS);
+ ParseSubtargetFeatures(CPU, /*TuneCPU*/ CPU, FS);
}
AVRSubtarget &
AVRSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS,
const TargetMachine &TM) {
// Parse features string.
- ParseSubtargetFeatures(CPU, FS);
+ ParseSubtargetFeatures(CPU, /*TuneCPU*/ CPU, FS);
return *this;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AVR/AVRSubtarget.h b/contrib/llvm-project/llvm/lib/Target/AVR/AVRSubtarget.h
index 81d883eb30d9..7d49e43a83f5 100644
--- a/contrib/llvm-project/llvm/lib/Target/AVR/AVRSubtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/AVR/AVRSubtarget.h
@@ -46,7 +46,7 @@ public:
/// Parses a subtarget feature string, setting appropriate options.
/// \note Definition of function is auto generated by `tblgen`.
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
AVRSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS,
const TargetMachine &TM);
diff --git a/contrib/llvm-project/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/AVR/AVRTargetMachine.cpp
index 0c7136e6f77e..0fa8623e2fb7 100644
--- a/contrib/llvm-project/llvm/lib/Target/AVR/AVRTargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AVR/AVRTargetMachine.cpp
@@ -37,7 +37,7 @@ static StringRef getCPU(StringRef CPU) {
}
static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
- return RM.hasValue() ? *RM : Reloc::Static;
+ return RM.getValueOr(Reloc::Static);
}
AVRTargetMachine::AVRTargetMachine(const Target &T, const Triple &TT,
diff --git a/contrib/llvm-project/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
index 230bc7adc07a..19f769270569 100644
--- a/contrib/llvm-project/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
@@ -14,7 +14,6 @@
#include "TargetInfo/AVRTargetInfo.h"
#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/StringSwitch.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
@@ -166,13 +165,13 @@ public:
assert(N == 1 && "Invalid number of operands!");
// The operand is actually a imm8, but we have its bitwise
// negation in the assembly source, so twiddle it here.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const auto *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(~(uint8_t)CE->getValue()));
}
bool isImmCom8() const {
if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const auto *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
int64_t Value = CE->getValue();
return isUInt<8>(Value);
diff --git a/contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
index ac72abe0d9f6..49840672bf9a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
@@ -13,12 +13,12 @@
#include "MCTargetDesc/AVRAsmBackend.h"
#include "MCTargetDesc/AVRFixupKinds.h"
#include "MCTargetDesc/AVRMCTargetDesc.h"
-
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSubtargetInfo.h"
diff --git a/contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h b/contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
index 9e150f120dd4..46dc914adf78 100644
--- a/contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
+++ b/contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
@@ -22,6 +22,7 @@
namespace llvm {
class MCAssembler;
+class MCContext;
struct MCFixupKindInfo;
/// Utilities for manipulating generated AVR machine code.
@@ -47,11 +48,6 @@ public:
return AVR::NumTargetFixupKinds;
}
- bool mayNeedRelaxation(const MCInst &Inst,
- const MCSubtargetInfo &STI) const override {
- return false;
- }
-
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override {
diff --git a/contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h
index 910fd3455dee..8976ef28f3dc 100644
--- a/contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h
@@ -45,6 +45,7 @@ private:
void printMemri(const MCInst *MI, unsigned OpNo, raw_ostream &O);
// Autogenerated by TableGen.
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
bool printAliasInstr(const MCInst *MI, uint64_t Address, raw_ostream &O);
void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
diff --git a/contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
index 0a53e5346779..9eff554a082b 100644
--- a/contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
@@ -189,9 +189,10 @@ void AVRMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
}
const char *AVRMCExpr::getName() const {
- const auto &Modifier = std::find_if(
- std::begin(ModifierNames), std::end(ModifierNames),
- [this](ModifierEntry const &Mod) { return Mod.VariantKind == Kind; });
+ const auto &Modifier =
+ llvm::find_if(ModifierNames, [this](ModifierEntry const &Mod) {
+ return Mod.VariantKind == Kind;
+ });
if (Modifier != std::end(ModifierNames)) {
return Modifier->Spelling;
@@ -200,9 +201,10 @@ const char *AVRMCExpr::getName() const {
}
AVRMCExpr::VariantKind AVRMCExpr::getKindByName(StringRef Name) {
- const auto &Modifier = std::find_if(
- std::begin(ModifierNames), std::end(ModifierNames),
- [&Name](ModifierEntry const &Mod) { return Mod.Spelling == Name; });
+ const auto &Modifier =
+ llvm::find_if(ModifierNames, [&Name](ModifierEntry const &Mod) {
+ return Mod.Spelling == Name;
+ });
if (Modifier != std::end(ModifierNames)) {
return Modifier->VariantKind;
diff --git a/contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
index bfc274d9cdcc..95f4465924cc 100644
--- a/contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
@@ -53,7 +53,7 @@ static MCRegisterInfo *createAVRMCRegisterInfo(const Triple &TT) {
static MCSubtargetInfo *createAVRMCSubtargetInfo(const Triple &TT,
StringRef CPU, StringRef FS) {
- return createAVRMCSubtargetInfoImpl(TT, CPU, FS);
+ return createAVRMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS);
}
static MCInstPrinter *createAVRMCInstPrinter(const Triple &T,
diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/BPF.h b/contrib/llvm-project/llvm/lib/Target/BPF/BPF.h
index 4a46b11e5e08..a98a3e08d5de 100644
--- a/contrib/llvm-project/llvm/lib/Target/BPF/BPF.h
+++ b/contrib/llvm-project/llvm/lib/Target/BPF/BPF.h
@@ -10,14 +10,17 @@
#define LLVM_LIB_TARGET_BPF_BPF_H
#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
class BPFTargetMachine;
-ModulePass *createBPFAbstractMemberAccess(BPFTargetMachine *TM);
-ModulePass *createBPFPreserveDIType();
+ModulePass *createBPFAdjustOpt();
+ModulePass *createBPFCheckAndAdjustIR();
+FunctionPass *createBPFAbstractMemberAccess(BPFTargetMachine *TM);
+FunctionPass *createBPFPreserveDIType();
FunctionPass *createBPFISelDag(BPFTargetMachine &TM);
FunctionPass *createBPFMISimplifyPatchablePass();
FunctionPass *createBPFMIPeepholePass();
@@ -25,13 +28,39 @@ FunctionPass *createBPFMIPeepholeTruncElimPass();
FunctionPass *createBPFMIPreEmitPeepholePass();
FunctionPass *createBPFMIPreEmitCheckingPass();
-void initializeBPFAbstractMemberAccessPass(PassRegistry&);
+void initializeBPFAdjustOptPass(PassRegistry&);
+void initializeBPFCheckAndAdjustIRPass(PassRegistry&);
+
+void initializeBPFAbstractMemberAccessLegacyPassPass(PassRegistry &);
void initializeBPFPreserveDITypePass(PassRegistry&);
void initializeBPFMISimplifyPatchablePass(PassRegistry&);
void initializeBPFMIPeepholePass(PassRegistry&);
void initializeBPFMIPeepholeTruncElimPass(PassRegistry&);
void initializeBPFMIPreEmitPeepholePass(PassRegistry&);
void initializeBPFMIPreEmitCheckingPass(PassRegistry&);
-}
+
+class BPFAbstractMemberAccessPass
+ : public PassInfoMixin<BPFAbstractMemberAccessPass> {
+ BPFTargetMachine *TM;
+
+public:
+ BPFAbstractMemberAccessPass(BPFTargetMachine *TM) : TM(TM) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+ static bool isRequired() { return true; }
+};
+
+class BPFPreserveDITypePass : public PassInfoMixin<BPFPreserveDITypePass> {
+public:
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+ static bool isRequired() { return true; }
+};
+
+class BPFAdjustOptPass : public PassInfoMixin<BPFAdjustOptPass> {
+public:
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+} // namespace llvm
#endif
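
The new classes above follow the usual new-pass-manager pattern: a lightweight class deriving from PassInfoMixin whose run() reports which analyses survive. A minimal sketch of that pattern, using a hypothetical pass name that is not part of this change:

    #include "llvm/IR/PassManager.h"

    namespace llvm {
    // Hypothetical example pass, only to illustrate the PassInfoMixin pattern.
    class ExampleSketchPass : public PassInfoMixin<ExampleSketchPass> {
    public:
      PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
        bool Changed = false;
        // ... inspect or transform F here ...
        return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
      }
      // Returning true keeps the pass from being skipped for optnone functions,
      // which the BPF passes above rely on as well.
      static bool isRequired() { return true; }
    };
    } // namespace llvm
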
diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/contrib/llvm-project/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
index 16708c4d1ce6..cd994a9c8365 100644
--- a/contrib/llvm-project/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
@@ -81,7 +81,9 @@
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsBPF.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
@@ -93,26 +95,30 @@
namespace llvm {
constexpr StringRef BPFCoreSharedInfo::AmaAttr;
+uint32_t BPFCoreSharedInfo::SeqNum;
+
+Instruction *BPFCoreSharedInfo::insertPassThrough(Module *M, BasicBlock *BB,
+ Instruction *Input,
+ Instruction *Before) {
+ Function *Fn = Intrinsic::getDeclaration(
+ M, Intrinsic::bpf_passthrough, {Input->getType(), Input->getType()});
+ Constant *SeqNumVal = ConstantInt::get(Type::getInt32Ty(BB->getContext()),
+ BPFCoreSharedInfo::SeqNum++);
+
+ auto *NewInst = CallInst::Create(Fn, {SeqNumVal, Input});
+ BB->getInstList().insert(Before->getIterator(), NewInst);
+ return NewInst;
+}
} // namespace llvm
using namespace llvm;
namespace {
-
-class BPFAbstractMemberAccess final : public ModulePass {
- StringRef getPassName() const override {
- return "BPF Abstract Member Access";
- }
-
- bool runOnModule(Module &M) override;
-
+class BPFAbstractMemberAccess final {
public:
- static char ID;
- TargetMachine *TM;
- // Add optional BPFTargetMachine parameter so that BPF backend can add the phase
- // with target machine to find out the endianness. The default constructor (without
- // parameters) is used by the pass manager for managing purposes.
- BPFAbstractMemberAccess(BPFTargetMachine *TM = nullptr) : ModulePass(ID), TM(TM) {}
+ BPFAbstractMemberAccess(BPFTargetMachine *TM) : TM(TM) {}
+
+ bool run(Function &F);
struct CallInfo {
uint32_t Kind;
@@ -131,9 +137,11 @@ private:
BPFPreserveFieldInfoAI = 4,
};
+ TargetMachine *TM;
const DataLayout *DL = nullptr;
+ Module *M = nullptr;
- std::map<std::string, GlobalVariable *> GEPGlobals;
+ static std::map<std::string, GlobalVariable *> GEPGlobals;
// A map to link preserve_*_access_index intrinsic calls.
std::map<CallInst *, std::pair<CallInst *, CallInfo>> AIChain;
// A map to hold all the base preserve_*_access_index intrinsic calls.
@@ -141,19 +149,19 @@ private:
// intrinsics.
std::map<CallInst *, CallInfo> BaseAICalls;
- bool doTransformation(Module &M);
+ bool doTransformation(Function &F);
void traceAICall(CallInst *Call, CallInfo &ParentInfo);
void traceBitCast(BitCastInst *BitCast, CallInst *Parent,
CallInfo &ParentInfo);
void traceGEP(GetElementPtrInst *GEP, CallInst *Parent,
CallInfo &ParentInfo);
- void collectAICallChains(Module &M, Function &F);
+ void collectAICallChains(Function &F);
bool IsPreserveDIAccessIndexCall(const CallInst *Call, CallInfo &Cinfo);
bool IsValidAIChain(const MDNode *ParentMeta, uint32_t ParentAI,
const MDNode *ChildMeta);
- bool removePreserveAccessIndexIntrinsic(Module &M);
+ bool removePreserveAccessIndexIntrinsic(Function &F);
void replaceWithGEP(std::vector<CallInst *> &CallList,
uint32_t NumOfZerosIndex, uint32_t DIIndex);
bool HasPreserveFieldInfoCall(CallInfoStack &CallStack);
@@ -165,28 +173,55 @@ private:
Value *computeBaseAndAccessKey(CallInst *Call, CallInfo &CInfo,
std::string &AccessKey, MDNode *&BaseMeta);
+ MDNode *computeAccessKey(CallInst *Call, CallInfo &CInfo,
+ std::string &AccessKey, bool &IsInt32Ret);
uint64_t getConstant(const Value *IndexValue);
- bool transformGEPChain(Module &M, CallInst *Call, CallInfo &CInfo);
+ bool transformGEPChain(CallInst *Call, CallInfo &CInfo);
};
+
+std::map<std::string, GlobalVariable *> BPFAbstractMemberAccess::GEPGlobals;
+
+class BPFAbstractMemberAccessLegacyPass final : public FunctionPass {
+ BPFTargetMachine *TM;
+
+ bool runOnFunction(Function &F) override {
+ return BPFAbstractMemberAccess(TM).run(F);
+ }
+
+public:
+ static char ID;
+
+ // Add an optional BPFTargetMachine parameter so that the BPF backend can
+ // add the phase with the target machine to find out the endianness. The
+ // default constructor (without parameters) is used by the pass manager
+ // for its own management purposes.
+ BPFAbstractMemberAccessLegacyPass(BPFTargetMachine *TM = nullptr)
+ : FunctionPass(ID), TM(TM) {}
+};
+
} // End anonymous namespace
-char BPFAbstractMemberAccess::ID = 0;
-INITIALIZE_PASS(BPFAbstractMemberAccess, DEBUG_TYPE,
- "abstracting struct/union member accessees", false, false)
+char BPFAbstractMemberAccessLegacyPass::ID = 0;
+INITIALIZE_PASS(BPFAbstractMemberAccessLegacyPass, DEBUG_TYPE,
+ "BPF Abstract Member Access", false, false)
-ModulePass *llvm::createBPFAbstractMemberAccess(BPFTargetMachine *TM) {
- return new BPFAbstractMemberAccess(TM);
+FunctionPass *llvm::createBPFAbstractMemberAccess(BPFTargetMachine *TM) {
+ return new BPFAbstractMemberAccessLegacyPass(TM);
}
-bool BPFAbstractMemberAccess::runOnModule(Module &M) {
+bool BPFAbstractMemberAccess::run(Function &F) {
LLVM_DEBUG(dbgs() << "********** Abstract Member Accesses **********\n");
+ M = F.getParent();
+ if (!M)
+ return false;
+
// Bail out if no debug info.
- if (M.debug_compile_units().empty())
+ if (M->debug_compile_units().empty())
return false;
- DL = &M.getDataLayout();
- return doTransformation(M);
+ DL = &M->getDataLayout();
+ return doTransformation(F);
}
static bool SkipDIDerivedTag(unsigned Tag, bool skipTypedef) {
@@ -285,6 +320,34 @@ bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call,
CInfo.AccessIndex = InfoKind;
return true;
}
+ if (GV->getName().startswith("llvm.bpf.preserve.type.info")) {
+ CInfo.Kind = BPFPreserveFieldInfoAI;
+ CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index);
+ if (!CInfo.Metadata)
+ report_fatal_error("Missing metadata for llvm.preserve.type.info intrinsic");
+ uint64_t Flag = getConstant(Call->getArgOperand(1));
+ if (Flag >= BPFCoreSharedInfo::MAX_PRESERVE_TYPE_INFO_FLAG)
+ report_fatal_error("Incorrect flag for llvm.bpf.preserve.type.info intrinsic");
+ if (Flag == BPFCoreSharedInfo::PRESERVE_TYPE_INFO_EXISTENCE)
+ CInfo.AccessIndex = BPFCoreSharedInfo::TYPE_EXISTENCE;
+ else
+ CInfo.AccessIndex = BPFCoreSharedInfo::TYPE_SIZE;
+ return true;
+ }
+ if (GV->getName().startswith("llvm.bpf.preserve.enum.value")) {
+ CInfo.Kind = BPFPreserveFieldInfoAI;
+ CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index);
+ if (!CInfo.Metadata)
+ report_fatal_error("Missing metadata for llvm.preserve.enum.value intrinsic");
+ uint64_t Flag = getConstant(Call->getArgOperand(2));
+ if (Flag >= BPFCoreSharedInfo::MAX_PRESERVE_ENUM_VALUE_FLAG)
+ report_fatal_error("Incorrect flag for llvm.bpf.preserve.enum.value intrinsic");
+ if (Flag == BPFCoreSharedInfo::PRESERVE_ENUM_VALUE_EXISTENCE)
+ CInfo.AccessIndex = BPFCoreSharedInfo::ENUM_VALUE_EXISTENCE;
+ else
+ CInfo.AccessIndex = BPFCoreSharedInfo::ENUM_VALUE;
+ return true;
+ }
return false;
}
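
For context, the two recognizers added above correspond to clang's BPF CO-RE builtins for type and enum-value relocations. A rough source-level sketch of code that ends up in these intrinsics; the builtin spellings and the preserve_access_index attribute are assumptions taken from CO-RE usage conventions rather than from this patch, while the flag values 0/1 follow the enums added to BPFCORE.h:

    struct sk_buff { int len; } __attribute__((preserve_access_index));
    enum pid_type { PIDTYPE_PID, PIDTYPE_TGID };

    int probe(void) {
      /* flag 0 = existence of the type, flag 1 = size of the type */
      int ty_exists = __builtin_preserve_type_info(*(struct sk_buff *)0, 0);
      /* flag 0 = existence of the enumerator, flag 1 = its value */
      long tgid_val = __builtin_preserve_enum_value(
          *(enum pid_type *)(unsigned long)PIDTYPE_TGID, 1);
      return ty_exists + (int)tgid_val;
    }
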
@@ -311,28 +374,27 @@ void BPFAbstractMemberAccess::replaceWithGEP(std::vector<CallInst *> &CallList,
}
}
-bool BPFAbstractMemberAccess::removePreserveAccessIndexIntrinsic(Module &M) {
+bool BPFAbstractMemberAccess::removePreserveAccessIndexIntrinsic(Function &F) {
std::vector<CallInst *> PreserveArrayIndexCalls;
std::vector<CallInst *> PreserveUnionIndexCalls;
std::vector<CallInst *> PreserveStructIndexCalls;
bool Found = false;
- for (Function &F : M)
- for (auto &BB : F)
- for (auto &I : BB) {
- auto *Call = dyn_cast<CallInst>(&I);
- CallInfo CInfo;
- if (!IsPreserveDIAccessIndexCall(Call, CInfo))
- continue;
-
- Found = true;
- if (CInfo.Kind == BPFPreserveArrayAI)
- PreserveArrayIndexCalls.push_back(Call);
- else if (CInfo.Kind == BPFPreserveUnionAI)
- PreserveUnionIndexCalls.push_back(Call);
- else
- PreserveStructIndexCalls.push_back(Call);
- }
+ for (auto &BB : F)
+ for (auto &I : BB) {
+ auto *Call = dyn_cast<CallInst>(&I);
+ CallInfo CInfo;
+ if (!IsPreserveDIAccessIndexCall(Call, CInfo))
+ continue;
+
+ Found = true;
+ if (CInfo.Kind == BPFPreserveArrayAI)
+ PreserveArrayIndexCalls.push_back(Call);
+ else if (CInfo.Kind == BPFPreserveUnionAI)
+ PreserveUnionIndexCalls.push_back(Call);
+ else
+ PreserveStructIndexCalls.push_back(Call);
+ }
// do the following transformation:
// . addr = preserve_array_access_index(base, dimension, index)
@@ -498,7 +560,7 @@ void BPFAbstractMemberAccess::traceGEP(GetElementPtrInst *GEP, CallInst *Parent,
}
}
-void BPFAbstractMemberAccess::collectAICallChains(Module &M, Function &F) {
+void BPFAbstractMemberAccess::collectAICallChains(Function &F) {
AIChain.clear();
BaseAICalls.clear();
@@ -847,28 +909,94 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call,
return Base;
}
+MDNode *BPFAbstractMemberAccess::computeAccessKey(CallInst *Call,
+ CallInfo &CInfo,
+ std::string &AccessKey,
+ bool &IsInt32Ret) {
+ DIType *Ty = stripQualifiers(cast<DIType>(CInfo.Metadata), false);
+ assert(!Ty->getName().empty());
+
+ int64_t PatchImm;
+ std::string AccessStr("0");
+ if (CInfo.AccessIndex == BPFCoreSharedInfo::TYPE_EXISTENCE) {
+ PatchImm = 1;
+ } else if (CInfo.AccessIndex == BPFCoreSharedInfo::TYPE_SIZE) {
+ // A typedef debuginfo type has size 0; get the eventual base type.
+ DIType *BaseTy = stripQualifiers(Ty, true);
+ PatchImm = BaseTy->getSizeInBits() / 8;
+ } else {
+ // ENUM_VALUE_EXISTENCE and ENUM_VALUE
+ IsInt32Ret = false;
+
+ const auto *CE = cast<ConstantExpr>(Call->getArgOperand(1));
+ const GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
+ assert(GV->hasInitializer());
+ const ConstantDataArray *DA = cast<ConstantDataArray>(GV->getInitializer());
+ assert(DA->isString());
+ StringRef ValueStr = DA->getAsString();
+
+ // ValueStr format: <EnumeratorStr>:<Value>
+ size_t Separator = ValueStr.find_first_of(':');
+ StringRef EnumeratorStr = ValueStr.substr(0, Separator);
+
+ // Find enumerator index in the debuginfo
+ DIType *BaseTy = stripQualifiers(Ty, true);
+ const auto *CTy = cast<DICompositeType>(BaseTy);
+ assert(CTy->getTag() == dwarf::DW_TAG_enumeration_type);
+ int EnumIndex = 0;
+ for (const auto Element : CTy->getElements()) {
+ const auto *Enum = cast<DIEnumerator>(Element);
+ if (Enum->getName() == EnumeratorStr) {
+ AccessStr = std::to_string(EnumIndex);
+ break;
+ }
+ EnumIndex++;
+ }
+
+ if (CInfo.AccessIndex == BPFCoreSharedInfo::ENUM_VALUE) {
+ StringRef EValueStr = ValueStr.substr(Separator + 1);
+ PatchImm = std::stoll(std::string(EValueStr));
+ } else {
+ PatchImm = 1;
+ }
+ }
+
+ AccessKey = "llvm." + Ty->getName().str() + ":" +
+ std::to_string(CInfo.AccessIndex) + std::string(":") +
+ std::to_string(PatchImm) + std::string("$") + AccessStr;
+
+ return Ty;
+}
+
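
The string returned through AccessKey above later becomes the name of the relocation global (and is what BTFDebug parses back apart). A worked example of the format, with made-up numbers:

    // AccessKey layout: "llvm." <type name> ":" <reloc kind> ":" <PatchImm> "$" <AccessStr>
    //
    //   "llvm.sk_buff:<TYPE_SIZE>:184$0"
    //     sk_buff      - name of the debuginfo type being queried
    //     <TYPE_SIZE>  - CInfo.AccessIndex, one of TYPE_EXISTENCE, TYPE_SIZE,
    //                    ENUM_VALUE_EXISTENCE or ENUM_VALUE (see BPFCORE.h)
    //     184          - PatchImm, here the (made-up) type size in bytes
    //     0            - AccessStr, the enumerator index for the ENUM_VALUE
    //                    kinds and "0" otherwise
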
/// Call/Kind is the base preserve_*_access_index() call. Attempt to do the
/// transformation to a chain of relocatable GEPs.
-bool BPFAbstractMemberAccess::transformGEPChain(Module &M, CallInst *Call,
+bool BPFAbstractMemberAccess::transformGEPChain(CallInst *Call,
CallInfo &CInfo) {
std::string AccessKey;
MDNode *TypeMeta;
- Value *Base =
- computeBaseAndAccessKey(Call, CInfo, AccessKey, TypeMeta);
- if (!Base)
- return false;
+ Value *Base = nullptr;
+ bool IsInt32Ret;
+
+ IsInt32Ret = CInfo.Kind == BPFPreserveFieldInfoAI;
+ if (CInfo.Kind == BPFPreserveFieldInfoAI && CInfo.Metadata) {
+ TypeMeta = computeAccessKey(Call, CInfo, AccessKey, IsInt32Ret);
+ } else {
+ Base = computeBaseAndAccessKey(Call, CInfo, AccessKey, TypeMeta);
+ if (!Base)
+ return false;
+ }
BasicBlock *BB = Call->getParent();
GlobalVariable *GV;
if (GEPGlobals.find(AccessKey) == GEPGlobals.end()) {
IntegerType *VarType;
- if (CInfo.Kind == BPFPreserveFieldInfoAI)
+ if (IsInt32Ret)
VarType = Type::getInt32Ty(BB->getContext()); // 32bit return value
else
- VarType = Type::getInt64Ty(BB->getContext()); // 64bit ptr arith
+ VarType = Type::getInt64Ty(BB->getContext()); // 64bit ptr or enum value
- GV = new GlobalVariable(M, VarType, false, GlobalVariable::ExternalLinkage,
+ GV = new GlobalVariable(*M, VarType, false, GlobalVariable::ExternalLinkage,
NULL, AccessKey);
GV->addAttribute(BPFCoreSharedInfo::AmaAttr);
GV->setMetadata(LLVMContext::MD_preserve_access_index, TypeMeta);
@@ -879,9 +1007,15 @@ bool BPFAbstractMemberAccess::transformGEPChain(Module &M, CallInst *Call,
if (CInfo.Kind == BPFPreserveFieldInfoAI) {
// Load the global variable which represents the returned field info.
- auto *LDInst = new LoadInst(Type::getInt32Ty(BB->getContext()), GV, "",
- Call);
- Call->replaceAllUsesWith(LDInst);
+ LoadInst *LDInst;
+ if (IsInt32Ret)
+ LDInst = new LoadInst(Type::getInt32Ty(BB->getContext()), GV, "", Call);
+ else
+ LDInst = new LoadInst(Type::getInt64Ty(BB->getContext()), GV, "", Call);
+
+ Instruction *PassThroughInst =
+ BPFCoreSharedInfo::insertPassThrough(M, BB, LDInst, Call);
+ Call->replaceAllUsesWith(PassThroughInst);
Call->eraseFromParent();
return true;
}
@@ -889,7 +1023,7 @@ bool BPFAbstractMemberAccess::transformGEPChain(Module &M, CallInst *Call,
// For any original GEP Call and Base %2 like
// %4 = bitcast %struct.net_device** %dev1 to i64*
// it is transformed to:
- // %6 = load sk_buff:50:$0:0:0:2:0
+ // %6 = load llvm.sk_buff:0:50$0:0:0:2:0
// %7 = bitcast %struct.sk_buff* %2 to i8*
// %8 = getelementptr i8, i8* %7, %6
// %9 = bitcast i8* %8 to i64*
@@ -912,24 +1046,75 @@ bool BPFAbstractMemberAccess::transformGEPChain(Module &M, CallInst *Call,
auto *BCInst2 = new BitCastInst(GEP, Call->getType());
BB->getInstList().insert(Call->getIterator(), BCInst2);
- Call->replaceAllUsesWith(BCInst2);
+ // For the following code,
+ // Block0:
+ // ...
+ // if (...) goto Block1 else ...
+ // Block1:
+ // %6 = load llvm.sk_buff:0:50$0:0:0:2:0
+ // %7 = bitcast %struct.sk_buff* %2 to i8*
+ // %8 = getelementptr i8, i8* %7, %6
+ // ...
+ // goto CommonExit
+ // Block2:
+ // ...
+ // if (...) goto Block3 else ...
+ // Block3:
+ // %6 = load llvm.bpf_map:0:40$0:0:0:2:0
+ // %7 = bitcast %struct.sk_buff* %2 to i8*
+ // %8 = getelementptr i8, i8* %7, %6
+ // ...
+ // goto CommonExit
+ // CommonExit
+ // SimplifyCFG may generate:
+ // Block0:
+ // ...
+ // if (...) goto Block_Common else ...
+ // Block2:
+ // ...
+ // if (...) goto Block_Common else ...
+ // Block_Common:
+ // PHI = [llvm.sk_buff:0:50$0:0:0:2:0, llvm.bpf_map:0:40$0:0:0:2:0]
+ // %6 = load PHI
+ // %7 = bitcast %struct.sk_buff* %2 to i8*
+ // %8 = getelementptr i8, i8* %7, %6
+ // ...
+ // goto CommonExit
+ // For the above code, we cannot perform a proper relocation since
+ // "load PHI" has two possible relocations.
+ //
+ // To prevent such tail merging, we use __builtin_bpf_passthrough(),
+ // which takes a seq_num as one of its parameters. Since two
+ // __builtin_bpf_passthrough() calls always have different seq_nums,
+ // tail merging cannot happen. The __builtin_bpf_passthrough() calls
+ // are removed at the beginning of the target IR passes.
+ //
+ // This approach is also used in other places where a global variable
+ // representing a relocation is used.
+ Instruction *PassThroughInst =
+ BPFCoreSharedInfo::insertPassThrough(M, BB, BCInst2, Call);
+ Call->replaceAllUsesWith(PassThroughInst);
Call->eraseFromParent();
return true;
}
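
The situation the pass-through call guards against is easiest to see at the source level. A rough illustration (example code, not from the patch; it assumes clang's preserve_access_index attribute for CO-RE relocatable accesses):

    struct sk_buff { int len; } __attribute__((preserve_access_index));
    struct bpf_map { int id;  } __attribute__((preserve_access_index));

    int pick_field(int cond, struct sk_buff *skb, struct bpf_map *map) {
      if (cond)
        return skb->len;  // load of the @"llvm.sk_buff:..." relocation global
      return map->id;     // load of the @"llvm.bpf_map:..." relocation global
      // Without the passthrough barrier, tail merging could turn the two loads
      // into one load fed by a PHI of the relocation globals, which
      // BPFCheckAndAdjustIR::checkIR() rejects with a fatal error.
    }
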
-bool BPFAbstractMemberAccess::doTransformation(Module &M) {
+bool BPFAbstractMemberAccess::doTransformation(Function &F) {
bool Transformed = false;
- for (Function &F : M) {
- // Collect PreserveDIAccessIndex Intrinsic call chains.
- // The call chains will be used to generate the access
- // patterns similar to GEP.
- collectAICallChains(M, F);
+ // Collect PreserveDIAccessIndex Intrinsic call chains.
+ // The call chains will be used to generate the access
+ // patterns similar to GEP.
+ collectAICallChains(F);
- for (auto &C : BaseAICalls)
- Transformed = transformGEPChain(M, C.first, C.second) || Transformed;
- }
+ for (auto &C : BaseAICalls)
+ Transformed = transformGEPChain(C.first, C.second) || Transformed;
+
+ return removePreserveAccessIndexIntrinsic(F) || Transformed;
+}
- return removePreserveAccessIndexIntrinsic(M) || Transformed;
+PreservedAnalyses
+BPFAbstractMemberAccessPass::run(Function &F, FunctionAnalysisManager &AM) {
+ return BPFAbstractMemberAccess(TM).run(F) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
}
diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/BPFAdjustOpt.cpp b/contrib/llvm-project/llvm/lib/Target/BPF/BPFAdjustOpt.cpp
new file mode 100644
index 000000000000..da543e7eba53
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/BPF/BPFAdjustOpt.cpp
@@ -0,0 +1,323 @@
+//===---------------- BPFAdjustOpt.cpp - Adjust Optimization --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Adjust optimizations to make the generated code more kernel-verifier friendly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFCORE.h"
+#include "BPFTargetMachine.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "bpf-adjust-opt"
+
+using namespace llvm;
+
+static cl::opt<bool>
+ DisableBPFserializeICMP("bpf-disable-serialize-icmp", cl::Hidden,
+ cl::desc("BPF: Disable Serializing ICMP insns."),
+ cl::init(false));
+
+static cl::opt<bool> DisableBPFavoidSpeculation(
+ "bpf-disable-avoid-speculation", cl::Hidden,
+ cl::desc("BPF: Disable Avoiding Speculative Code Motion."),
+ cl::init(false));
+
+namespace {
+
+class BPFAdjustOpt final : public ModulePass {
+public:
+ static char ID;
+
+ BPFAdjustOpt() : ModulePass(ID) {}
+ bool runOnModule(Module &M) override;
+};
+
+class BPFAdjustOptImpl {
+ struct PassThroughInfo {
+ Instruction *Input;
+ Instruction *UsedInst;
+ uint32_t OpIdx;
+ PassThroughInfo(Instruction *I, Instruction *U, uint32_t Idx)
+ : Input(I), UsedInst(U), OpIdx(Idx) {}
+ };
+
+public:
+ BPFAdjustOptImpl(Module *M) : M(M) {}
+
+ bool run();
+
+private:
+ Module *M;
+ SmallVector<PassThroughInfo, 16> PassThroughs;
+
+ void adjustBasicBlock(BasicBlock &BB);
+ bool serializeICMPCrossBB(BasicBlock &BB);
+ void adjustInst(Instruction &I);
+ bool serializeICMPInBB(Instruction &I);
+ bool avoidSpeculation(Instruction &I);
+ bool insertPassThrough();
+};
+
+} // End anonymous namespace
+
+char BPFAdjustOpt::ID = 0;
+INITIALIZE_PASS(BPFAdjustOpt, "bpf-adjust-opt", "BPF Adjust Optimization",
+ false, false)
+
+ModulePass *llvm::createBPFAdjustOpt() { return new BPFAdjustOpt(); }
+
+bool BPFAdjustOpt::runOnModule(Module &M) { return BPFAdjustOptImpl(&M).run(); }
+
+bool BPFAdjustOptImpl::run() {
+ for (Function &F : *M)
+ for (auto &BB : F) {
+ adjustBasicBlock(BB);
+ for (auto &I : BB)
+ adjustInst(I);
+ }
+
+ return insertPassThrough();
+}
+
+bool BPFAdjustOptImpl::insertPassThrough() {
+ for (auto &Info : PassThroughs) {
+ auto *CI = BPFCoreSharedInfo::insertPassThrough(
+ M, Info.UsedInst->getParent(), Info.Input, Info.UsedInst);
+ Info.UsedInst->setOperand(Info.OpIdx, CI);
+ }
+
+ return !PassThroughs.empty();
+}
+
+// To prevent the instcombine optimization from combining conditionals
+// in the same basic block.
+bool BPFAdjustOptImpl::serializeICMPInBB(Instruction &I) {
+ // For:
+ // comp1 = icmp <opcode> ...;
+ // comp2 = icmp <opcode> ...;
+ // ... or comp1 comp2 ...
+ // changed to:
+ // comp1 = icmp <opcode> ...;
+ // comp2 = icmp <opcode> ...;
+ // new_comp1 = __builtin_bpf_passthrough(seq_num, comp1)
+ // ... or new_comp1 comp2 ...
+ if (I.getOpcode() != Instruction::Or)
+ return false;
+ auto *Icmp1 = dyn_cast<ICmpInst>(I.getOperand(0));
+ if (!Icmp1)
+ return false;
+ auto *Icmp2 = dyn_cast<ICmpInst>(I.getOperand(1));
+ if (!Icmp2)
+ return false;
+
+ Value *Icmp1Op0 = Icmp1->getOperand(0);
+ Value *Icmp2Op0 = Icmp2->getOperand(0);
+ if (Icmp1Op0 != Icmp2Op0)
+ return false;
+
+ // Now we have two icmp instructions that feed into
+ // an "or" instruction.
+ PassThroughInfo Info(Icmp1, &I, 0);
+ PassThroughs.push_back(Info);
+ return true;
+}
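
A minimal source-level sketch of the pattern serializeICMPInBB() looks for (example code, not from the patch):

    int validate(long len) {
      // Two icmps on the same value feeding an 'or'. Without the passthrough
      // barrier on the first compare, instcombine may fold the pair into a
      // single range check whose bounds the kernel verifier tracks less
      // precisely than the two explicit comparisons.
      if (len < 0 || len > 4096)
        return -1;
      return 0;
    }
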
+
+// To prevent conditionals in adjacent basic blocks from being combined
+// by the instcombine optimization after the blocks are merged.
+bool BPFAdjustOptImpl::serializeICMPCrossBB(BasicBlock &BB) {
+ // For:
+ // B1:
+ // comp1 = icmp <opcode> ...;
+ // if (comp1) goto B2 else B3;
+ // B2:
+ // comp2 = icmp <opcode> ...;
+ // if (comp2) goto B4 else B5;
+ // B4:
+ // ...
+ // changed to:
+ // B1:
+ // comp1 = icmp <opcode> ...;
+ // comp1 = __builtin_bpf_passthrough(seq_num, comp1);
+ // if (comp1) goto B2 else B3;
+ // B2:
+ // comp2 = icmp <opcode> ...;
+ // if (comp2) goto B4 else B5;
+ // B4:
+ // ...
+
+ // Check the basic block predecessors: if two of them (say B1 and B2) use
+ // icmp instructions to generate conditions and one is the predecessor of
+ // the other (e.g., B1 is the predecessor of B2), add a passthrough
+ // barrier after the icmp instruction of block B1.
+ BasicBlock *B2 = BB.getSinglePredecessor();
+ if (!B2)
+ return false;
+
+ BasicBlock *B1 = B2->getSinglePredecessor();
+ if (!B1)
+ return false;
+
+ Instruction *TI = B2->getTerminator();
+ auto *BI = dyn_cast<BranchInst>(TI);
+ if (!BI || !BI->isConditional())
+ return false;
+ auto *Cond = dyn_cast<ICmpInst>(BI->getCondition());
+ if (!Cond || B2->getFirstNonPHI() != Cond)
+ return false;
+ Value *B2Op0 = Cond->getOperand(0);
+ auto Cond2Op = Cond->getPredicate();
+
+ TI = B1->getTerminator();
+ BI = dyn_cast<BranchInst>(TI);
+ if (!BI || !BI->isConditional())
+ return false;
+ Cond = dyn_cast<ICmpInst>(BI->getCondition());
+ if (!Cond)
+ return false;
+ Value *B1Op0 = Cond->getOperand(0);
+ auto Cond1Op = Cond->getPredicate();
+
+ if (B1Op0 != B2Op0)
+ return false;
+
+ if (Cond1Op == ICmpInst::ICMP_SGT || Cond1Op == ICmpInst::ICMP_SGE) {
+ if (Cond2Op != ICmpInst::ICMP_SLT && Cond2Op != ICmpInst::ICMP_SLE)
+ return false;
+ } else if (Cond1Op == ICmpInst::ICMP_SLT || Cond1Op == ICmpInst::ICMP_SLE) {
+ if (Cond2Op != ICmpInst::ICMP_SGT && Cond2Op != ICmpInst::ICMP_SGE)
+ return false;
+ } else {
+ return false;
+ }
+
+ PassThroughInfo Info(Cond, BI, 0);
+ PassThroughs.push_back(Info);
+
+ return true;
+}
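
And a sketch of the nested, cross-block variant handled by serializeICMPCrossBB() (again example code, not from the patch):

    int in_range(long v) {
      if (v > 100) {      // B1 ends with a conditional branch on 'icmp sgt v'
        if (v < 1000)     // B2 starts with 'icmp slt v' on the same value
          return 1;       // B4: its single predecessor chain is B2 <- B1
      }
      return 0;
      // The passthrough barrier inserted after B1's icmp keeps the two
      // opposite-direction compares on 'v' from being folded together.
    }
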
+
+// To avoid speculatively hoisting certain computations out of
+// a basic block.
+bool BPFAdjustOptImpl::avoidSpeculation(Instruction &I) {
+ if (auto *LdInst = dyn_cast<LoadInst>(&I)) {
+ if (auto *GV = dyn_cast<GlobalVariable>(LdInst->getOperand(0))) {
+ if (GV->hasAttribute(BPFCoreSharedInfo::AmaAttr) ||
+ GV->hasAttribute(BPFCoreSharedInfo::TypeIdAttr))
+ return false;
+ }
+ }
+
+ if (!isa<LoadInst>(&I) && !isa<CallInst>(&I))
+ return false;
+
+ // For:
+ // B1:
+ // var = ...
+ // ...
+ // /* icmp may not be in the same block as var = ... */
+ // comp1 = icmp <opcode> var, <const>;
+ // if (comp1) goto B2 else B3;
+ // B2:
+ // ... var ...
+ // change to:
+ // B1:
+ // var = ...
+ // ...
+ // /* icmp may not be in the same block as var = ... */
+ // comp1 = icmp <opcode> var, <const>;
+ // if (comp1) goto B2 else B3;
+ // B2:
+ // var = __builtin_bpf_passthrough(seq_num, var);
+ // ... var ...
+ bool isCandidate = false;
+ SmallVector<PassThroughInfo, 4> Candidates;
+ for (User *U : I.users()) {
+ Instruction *Inst = dyn_cast<Instruction>(U);
+ if (!Inst)
+ continue;
+
+ // May cover a little bit more than the
+ // above pattern.
+ if (auto *Icmp1 = dyn_cast<ICmpInst>(Inst)) {
+ Value *Icmp1Op1 = Icmp1->getOperand(1);
+ if (!isa<Constant>(Icmp1Op1))
+ return false;
+ isCandidate = true;
+ continue;
+ }
+
+ // Ignore the use in the same basic block as the definition.
+ if (Inst->getParent() == I.getParent())
+ continue;
+
+ // The use is in a different basic block. If there is a call or a
+ // load/store instruction before this instruction in that basic
+ // block, it most likely cannot be hoisted out; skip it.
+ for (auto &I2 : *Inst->getParent()) {
+ if (dyn_cast<CallInst>(&I2))
+ return false;
+ if (dyn_cast<LoadInst>(&I2) || dyn_cast<StoreInst>(&I2))
+ return false;
+ if (&I2 == Inst)
+ break;
+ }
+
+ // It should be used in a GEP, or in a simple conversion like
+ // ZEXT/SEXT that then feeds a GEP.
+ if (Inst->getOpcode() == Instruction::ZExt ||
+ Inst->getOpcode() == Instruction::SExt) {
+ PassThroughInfo Info(&I, Inst, 0);
+ Candidates.push_back(Info);
+ } else if (auto *GI = dyn_cast<GetElementPtrInst>(Inst)) {
+ // traverse GEP inst to find Use operand index
+ unsigned i, e;
+ for (i = 1, e = GI->getNumOperands(); i != e; ++i) {
+ Value *V = GI->getOperand(i);
+ if (V == &I)
+ break;
+ }
+ if (i == e)
+ continue;
+
+ PassThroughInfo Info(&I, GI, i);
+ Candidates.push_back(Info);
+ }
+ }
+
+ if (!isCandidate || Candidates.empty())
+ return false;
+
+ llvm::append_range(PassThroughs, Candidates);
+ return true;
+}
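
A source-level sketch of what avoidSpeculation() protects (example code, not from the patch):

    long lookup(long *table, volatile long *src) {
      long idx = *src;        // 'idx' is produced by a load
      if (idx < 16)           // icmp of 'idx' against a constant
        return table[idx];    // 'idx' feeds a GEP in a different basic block
      return 0;
      // The passthrough inserted on 'idx' in the guarded block keeps the
      // address computation (and its load) from being speculatively hoisted
      // above the bounds check, which the kernel verifier would reject.
    }
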
+
+void BPFAdjustOptImpl::adjustBasicBlock(BasicBlock &BB) {
+ if (!DisableBPFserializeICMP && serializeICMPCrossBB(BB))
+ return;
+}
+
+void BPFAdjustOptImpl::adjustInst(Instruction &I) {
+ if (!DisableBPFserializeICMP && serializeICMPInBB(I))
+ return;
+ if (!DisableBPFavoidSpeculation && avoidSpeculation(I))
+ return;
+}
+
+PreservedAnalyses BPFAdjustOptPass::run(Module &M, ModuleAnalysisManager &AM) {
+ return BPFAdjustOptImpl(&M).run() ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/BPFCORE.h b/contrib/llvm-project/llvm/lib/Target/BPF/BPFCORE.h
index af6425b16fa0..0c504412480d 100644
--- a/contrib/llvm-project/llvm/lib/Target/BPF/BPFCORE.h
+++ b/contrib/llvm-project/llvm/lib/Target/BPF/BPFCORE.h
@@ -13,6 +13,10 @@
namespace llvm {
+class BasicBlock;
+class Instruction;
+class Module;
+
class BPFCoreSharedInfo {
public:
enum PatchableRelocKind : uint32_t {
@@ -24,6 +28,10 @@ public:
FIELD_RSHIFT_U64,
BTF_TYPE_ID_LOCAL,
BTF_TYPE_ID_REMOTE,
+ TYPE_EXISTENCE,
+ TYPE_SIZE,
+ ENUM_VALUE_EXISTENCE,
+ ENUM_VALUE,
MAX_FIELD_RELOC_KIND,
};
@@ -35,10 +43,32 @@ public:
MAX_BTF_TYPE_ID_FLAG,
};
+ enum PreserveTypeInfo : uint32_t {
+ PRESERVE_TYPE_INFO_EXISTENCE = 0,
+ PRESERVE_TYPE_INFO_SIZE,
+
+ MAX_PRESERVE_TYPE_INFO_FLAG,
+ };
+
+ enum PreserveEnumValue : uint32_t {
+ PRESERVE_ENUM_VALUE_EXISTENCE = 0,
+ PRESERVE_ENUM_VALUE,
+
+ MAX_PRESERVE_ENUM_VALUE_FLAG,
+ };
+
/// The attribute attached to globals representing a field access
static constexpr StringRef AmaAttr = "btf_ama";
/// The attribute attached to globals representing a type id
static constexpr StringRef TypeIdAttr = "btf_type_id";
+
+ /// llvm.bpf.passthrough builtin seq number
+ static uint32_t SeqNum;
+
+ /// Insert a bpf passthrough builtin function.
+ static Instruction *insertPassThrough(Module *M, BasicBlock *BB,
+ Instruction *Input,
+ Instruction *Before);
};
} // namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp b/contrib/llvm-project/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp
new file mode 100644
index 000000000000..5239218ad003
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp
@@ -0,0 +1,130 @@
+//===------------ BPFCheckAndAdjustIR.cpp - Check and Adjust IR -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Check the IR and adjust it so the generated code is verifier friendly.
+// The following is done for IR checking:
+// - ensure there are no relocation globals in PHI nodes.
+// The following is done for IR adjustment:
+// - remove __builtin_bpf_passthrough builtins; the target-independent IR
+//   optimizations have run, so those builtins can be removed.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFCORE.h"
+#include "BPFTargetMachine.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "bpf-check-and-opt-ir"
+
+using namespace llvm;
+
+namespace {
+
+class BPFCheckAndAdjustIR final : public ModulePass {
+ bool runOnModule(Module &F) override;
+
+public:
+ static char ID;
+ BPFCheckAndAdjustIR() : ModulePass(ID) {}
+
+private:
+ void checkIR(Module &M);
+ bool adjustIR(Module &M);
+ bool removePassThroughBuiltin(Module &M);
+};
+} // End anonymous namespace
+
+char BPFCheckAndAdjustIR::ID = 0;
+INITIALIZE_PASS(BPFCheckAndAdjustIR, DEBUG_TYPE, "BPF Check And Adjust IR",
+ false, false)
+
+ModulePass *llvm::createBPFCheckAndAdjustIR() {
+ return new BPFCheckAndAdjustIR();
+}
+
+void BPFCheckAndAdjustIR::checkIR(Module &M) {
+ // Ensure relocation global won't appear in PHI node
+ // This may happen if the compiler generated the following code:
+ // B1:
+ // g1 = @llvm.skb_buff:0:1...
+ // ...
+ // goto B_COMMON
+ // B2:
+ // g2 = @llvm.skb_buff:0:2...
+ // ...
+ // goto B_COMMON
+ // B_COMMON:
+ // g = PHI(g1, g2)
+ // x = load g
+ // ...
+ // If anything like the above "g = PHI(g1, g2)" appears, issue a fatal error.
+ for (Function &F : M)
+ for (auto &BB : F)
+ for (auto &I : BB) {
+ PHINode *PN = dyn_cast<PHINode>(&I);
+ if (!PN || PN->use_empty())
+ continue;
+ for (int i = 0, e = PN->getNumIncomingValues(); i < e; ++i) {
+ auto *GV = dyn_cast<GlobalVariable>(PN->getIncomingValue(i));
+ if (!GV)
+ continue;
+ if (GV->hasAttribute(BPFCoreSharedInfo::AmaAttr) ||
+ GV->hasAttribute(BPFCoreSharedInfo::TypeIdAttr))
+ report_fatal_error("relocation global in PHI node");
+ }
+ }
+}
+
+bool BPFCheckAndAdjustIR::removePassThroughBuiltin(Module &M) {
+ // Remove the __builtin_bpf_passthrough() calls, which are used to prevent
+ // certain IR optimizations. Now that the major IR optimizations are done,
+ // they can be removed.
+ bool Changed = false;
+ CallInst *ToBeDeleted = nullptr;
+ for (Function &F : M)
+ for (auto &BB : F)
+ for (auto &I : BB) {
+ if (ToBeDeleted) {
+ ToBeDeleted->eraseFromParent();
+ ToBeDeleted = nullptr;
+ }
+
+ auto *Call = dyn_cast<CallInst>(&I);
+ if (!Call)
+ continue;
+ auto *GV = dyn_cast<GlobalValue>(Call->getCalledOperand());
+ if (!GV)
+ continue;
+ if (!GV->getName().startswith("llvm.bpf.passthrough"))
+ continue;
+ Changed = true;
+ Value *Arg = Call->getArgOperand(1);
+ Call->replaceAllUsesWith(Arg);
+ ToBeDeleted = Call;
+ }
+ return Changed;
+}
+
+bool BPFCheckAndAdjustIR::adjustIR(Module &M) {
+ return removePassThroughBuiltin(M);
+}
+
+bool BPFCheckAndAdjustIR::runOnModule(Module &M) {
+ checkIR(M);
+ return adjustIR(M);
+}
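
The removal loop above uses a small idiom worth noting: the call being dropped is remembered and erased only on the next iteration, so the instruction iterator is never invalidated mid-walk. A stand-alone sketch of the same idiom, under the assumption that the marked calls are known to be removable:

    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instructions.h"

    // Erase dead calls while walking a function without invalidating the
    // instruction iterator: defer each erase until we have moved past the call.
    static bool eraseDeadCalls(llvm::Function &F) {
      bool Changed = false;
      llvm::CallInst *ToBeDeleted = nullptr;
      for (auto &BB : F)
        for (auto &I : BB) {
          if (ToBeDeleted) {
            ToBeDeleted->eraseFromParent();
            ToBeDeleted = nullptr;
          }
          auto *Call = llvm::dyn_cast<llvm::CallInst>(&I);
          if (Call && Call->use_empty()) {
            ToBeDeleted = Call;
            Changed = true;
          }
        }
      if (ToBeDeleted)
        ToBeDeleted->eraseFromParent();
      return Changed;
    }
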
diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
index 77f565fb5957..f10a0d4c0077 100644
--- a/contrib/llvm-project/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -494,8 +494,6 @@ void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV);
I++;
CurDAG->DeleteNode(Node);
-
- return;
}
FunctionPass *llvm::createBPFISelDag(BPFTargetMachine &TM) {
diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/BPFISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/BPF/BPFISelLowering.cpp
index a02556a39909..3322b8d93b3a 100644
--- a/contrib/llvm-project/llvm/lib/Target/BPF/BPFISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -20,7 +20,6 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/BPFInstrFormats.td b/contrib/llvm-project/llvm/lib/Target/BPF/BPFInstrFormats.td
index 9f00dc85d789..a809065014e5 100644
--- a/contrib/llvm-project/llvm/lib/Target/BPF/BPFInstrFormats.td
+++ b/contrib/llvm-project/llvm/lib/Target/BPF/BPFInstrFormats.td
@@ -44,6 +44,9 @@ def BPF_MOV : BPFArithOp<0xb>;
def BPF_ARSH : BPFArithOp<0xc>;
def BPF_END : BPFArithOp<0xd>;
+def BPF_XCHG : BPFArithOp<0xe>;
+def BPF_CMPXCHG : BPFArithOp<0xf>;
+
class BPFEndDir<bits<1> val> {
bits<1> Value = val;
}
@@ -86,7 +89,13 @@ def BPF_IMM : BPFModeModifer<0x0>;
def BPF_ABS : BPFModeModifer<0x1>;
def BPF_IND : BPFModeModifer<0x2>;
def BPF_MEM : BPFModeModifer<0x3>;
-def BPF_XADD : BPFModeModifer<0x6>;
+def BPF_ATOMIC : BPFModeModifer<0x6>;
+
+class BPFAtomicFlag<bits<4> val> {
+ bits<4> Value = val;
+}
+
+def BPF_FETCH : BPFAtomicFlag<0x1>;
class InstBPF<dag outs, dag ins, string asmstr, list<dag> pattern>
: Instruction {
diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/BPFInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/BPF/BPFInstrInfo.td
index 4298e2eaec04..082e1f4a92c2 100644
--- a/contrib/llvm-project/llvm/lib/Target/BPF/BPFInstrInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -617,9 +617,9 @@ let Predicates = [BPFNoALU32] in {
def : Pat<(i64 (extloadi32 ADDRri:$src)), (i64 (LDW ADDRri:$src))>;
}
-// Atomics
+// Atomic XADD for BPFNoALU32
class XADD<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
- : TYPE_LD_ST<BPF_XADD.Value, SizeOp.Value,
+ : TYPE_LD_ST<BPF_ATOMIC.Value, SizeOp.Value,
(outs GPR:$dst),
(ins MEMri:$addr, GPR:$val),
"lock *("#OpcodeStr#" *)($addr) += $val",
@@ -630,14 +630,88 @@ class XADD<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
let Inst{51-48} = addr{19-16}; // base reg
let Inst{55-52} = dst;
let Inst{47-32} = addr{15-0}; // offset
+ let Inst{7-4} = BPF_ADD.Value;
let BPFClass = BPF_STX;
}
-class XADD32<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
- : TYPE_LD_ST<BPF_XADD.Value, SizeOp.Value,
+let Constraints = "$dst = $val" in {
+ let Predicates = [BPFNoALU32] in {
+ def XADDW : XADD<BPF_W, "u32", atomic_load_add_32>;
+ }
+}
+
+// Atomic add, and, or, xor
+class ATOMIC_NOFETCH<BPFArithOp Opc, string Opstr>
+ : TYPE_LD_ST<BPF_ATOMIC.Value, BPF_DW.Value,
+ (outs GPR:$dst),
+ (ins MEMri:$addr, GPR:$val),
+ "lock *(u64 *)($addr) " #Opstr# "= $val",
+ []> {
+ bits<4> dst;
+ bits<20> addr;
+
+ let Inst{51-48} = addr{19-16}; // base reg
+ let Inst{55-52} = dst;
+ let Inst{47-32} = addr{15-0}; // offset
+ let Inst{7-4} = Opc.Value;
+ let BPFClass = BPF_STX;
+}
+
+class ATOMIC32_NOFETCH<BPFArithOp Opc, string Opstr>
+ : TYPE_LD_ST<BPF_ATOMIC.Value, BPF_W.Value,
(outs GPR32:$dst),
(ins MEMri:$addr, GPR32:$val),
- "lock *("#OpcodeStr#" *)($addr) += $val",
+ "lock *(u32 *)($addr) " #Opstr# "= $val",
+ []> {
+ bits<4> dst;
+ bits<20> addr;
+
+ let Inst{51-48} = addr{19-16}; // base reg
+ let Inst{55-52} = dst;
+ let Inst{47-32} = addr{15-0}; // offset
+ let Inst{7-4} = Opc.Value;
+ let BPFClass = BPF_STX;
+}
+
+let Constraints = "$dst = $val" in {
+ let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in {
+ def XADDW32 : ATOMIC32_NOFETCH<BPF_ADD, "+">;
+ def XANDW32 : ATOMIC32_NOFETCH<BPF_AND, "&">;
+ def XORW32 : ATOMIC32_NOFETCH<BPF_OR, "|">;
+ def XXORW32 : ATOMIC32_NOFETCH<BPF_XOR, "^">;
+ }
+
+ def XADDD : ATOMIC_NOFETCH<BPF_ADD, "+">;
+ def XANDD : ATOMIC_NOFETCH<BPF_AND, "&">;
+ def XORD : ATOMIC_NOFETCH<BPF_OR, "|">;
+ def XXORD : ATOMIC_NOFETCH<BPF_XOR, "^">;
+}
+
+// Atomic Fetch-and-<add, and, or, xor> operations
+class XFALU64<BPFWidthModifer SizeOp, BPFArithOp Opc, string OpcodeStr,
+ string OpcStr, PatFrag OpNode>
+ : TYPE_LD_ST<BPF_ATOMIC.Value, SizeOp.Value,
+ (outs GPR:$dst),
+ (ins MEMri:$addr, GPR:$val),
+ "$dst = atomic_fetch_"#OpcStr#"(("#OpcodeStr#" *)($addr), $val)",
+ [(set GPR:$dst, (OpNode ADDRri:$addr, GPR:$val))]> {
+ bits<4> dst;
+ bits<20> addr;
+
+ let Inst{51-48} = addr{19-16}; // base reg
+ let Inst{55-52} = dst;
+ let Inst{47-32} = addr{15-0}; // offset
+ let Inst{7-4} = Opc.Value;
+ let Inst{3-0} = BPF_FETCH.Value;
+ let BPFClass = BPF_STX;
+}
+
+class XFALU32<BPFWidthModifer SizeOp, BPFArithOp Opc, string OpcodeStr,
+ string OpcStr, PatFrag OpNode>
+ : TYPE_LD_ST<BPF_ATOMIC.Value, SizeOp.Value,
+ (outs GPR32:$dst),
+ (ins MEMri:$addr, GPR32:$val),
+ "$dst = atomic_fetch_"#OpcStr#"(("#OpcodeStr#" *)($addr), $val)",
[(set GPR32:$dst, (OpNode ADDRri:$addr, GPR32:$val))]> {
bits<4> dst;
bits<20> addr;
@@ -645,19 +719,117 @@ class XADD32<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
let Inst{51-48} = addr{19-16}; // base reg
let Inst{55-52} = dst;
let Inst{47-32} = addr{15-0}; // offset
+ let Inst{7-4} = Opc.Value;
+ let Inst{3-0} = BPF_FETCH.Value;
let BPFClass = BPF_STX;
}
let Constraints = "$dst = $val" in {
- let Predicates = [BPFNoALU32] in {
- def XADDW : XADD<BPF_W, "u32", atomic_load_add_32>;
+ let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in {
+ def XFADDW32 : XFALU32<BPF_W, BPF_ADD, "u32", "add", atomic_load_add_32>;
+ def XFANDW32 : XFALU32<BPF_W, BPF_AND, "u32", "and", atomic_load_and_32>;
+ def XFORW32 : XFALU32<BPF_W, BPF_OR, "u32", "or", atomic_load_or_32>;
+ def XFXORW32 : XFALU32<BPF_W, BPF_XOR, "u32", "xor", atomic_load_xor_32>;
}
+ def XFADDD : XFALU64<BPF_DW, BPF_ADD, "u64", "add", atomic_load_add_64>;
+ def XFANDD : XFALU64<BPF_DW, BPF_AND, "u64", "and", atomic_load_and_64>;
+ def XFORD : XFALU64<BPF_DW, BPF_OR, "u64", "or", atomic_load_or_64>;
+ def XFXORD : XFALU64<BPF_DW, BPF_XOR, "u64", "xor", atomic_load_xor_64>;
+}
+
+// atomic_load_sub can be represented as a neg followed
+// by an atomic_load_add.
+def : Pat<(atomic_load_sub_32 ADDRri:$addr, GPR32:$val),
+ (XFADDW32 ADDRri:$addr, (NEG_32 GPR32:$val))>;
+def : Pat<(atomic_load_sub_64 ADDRri:$addr, GPR:$val),
+ (XFADDD ADDRri:$addr, (NEG_64 GPR:$val))>;
+
+// Atomic Exchange
+class XCHG<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
+ : TYPE_LD_ST<BPF_ATOMIC.Value, SizeOp.Value,
+ (outs GPR:$dst),
+ (ins MEMri:$addr, GPR:$val),
+ "$dst = xchg_"#OpcodeStr#"($addr, $val)",
+ [(set GPR:$dst, (OpNode ADDRri:$addr,GPR:$val))]> {
+ bits<4> dst;
+ bits<20> addr;
+
+ let Inst{51-48} = addr{19-16}; // base reg
+ let Inst{55-52} = dst;
+ let Inst{47-32} = addr{15-0}; // offset
+ let Inst{7-4} = BPF_XCHG.Value;
+ let Inst{3-0} = BPF_FETCH.Value;
+ let BPFClass = BPF_STX;
+}
+
+class XCHG32<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
+ : TYPE_LD_ST<BPF_ATOMIC.Value, SizeOp.Value,
+ (outs GPR32:$dst),
+ (ins MEMri:$addr, GPR32:$val),
+ "$dst = xchg32_"#OpcodeStr#"($addr, $val)",
+ [(set GPR32:$dst, (OpNode ADDRri:$addr,GPR32:$val))]> {
+ bits<4> dst;
+ bits<20> addr;
+
+ let Inst{51-48} = addr{19-16}; // base reg
+ let Inst{55-52} = dst;
+ let Inst{47-32} = addr{15-0}; // offset
+ let Inst{7-4} = BPF_XCHG.Value;
+ let Inst{3-0} = BPF_FETCH.Value;
+ let BPFClass = BPF_STX;
+}
+
+let Constraints = "$dst = $val" in {
let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in {
- def XADDW32 : XADD32<BPF_W, "u32", atomic_load_add_32>;
+ def XCHGW32 : XCHG32<BPF_W, "32", atomic_swap_32>;
}
- def XADDD : XADD<BPF_DW, "u64", atomic_load_add_64>;
+ def XCHGD : XCHG<BPF_DW, "64", atomic_swap_64>;
+}
+
+// Compare-And-Exchange
+class CMPXCHG<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
+ : TYPE_LD_ST<BPF_ATOMIC.Value, SizeOp.Value,
+ (outs),
+ (ins MEMri:$addr, GPR:$new),
+ "r0 = cmpxchg_"#OpcodeStr#"($addr, r0, $new)",
+ [(set R0, (OpNode ADDRri:$addr, R0, GPR:$new))]> {
+ bits<4> new;
+ bits<20> addr;
+
+ let Inst{51-48} = addr{19-16}; // base reg
+ let Inst{55-52} = new;
+ let Inst{47-32} = addr{15-0}; // offset
+ let Inst{7-4} = BPF_CMPXCHG.Value;
+ let Inst{3-0} = BPF_FETCH.Value;
+ let BPFClass = BPF_STX;
+}
+
+class CMPXCHG32<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
+ : TYPE_LD_ST<BPF_ATOMIC.Value, SizeOp.Value,
+ (outs),
+ (ins MEMri:$addr, GPR32:$new),
+ "w0 = cmpxchg32_"#OpcodeStr#"($addr, w0, $new)",
+ [(set W0, (OpNode ADDRri:$addr, W0, GPR32:$new))]> {
+ bits<4> new;
+ bits<20> addr;
+
+ let Inst{51-48} = addr{19-16}; // base reg
+ let Inst{55-52} = new;
+ let Inst{47-32} = addr{15-0}; // offset
+ let Inst{7-4} = BPF_CMPXCHG.Value;
+ let Inst{3-0} = BPF_FETCH.Value;
+ let BPFClass = BPF_STX;
+}
+
+let Predicates = [BPFHasALU32], Defs = [W0], Uses = [W0],
+ DecoderNamespace = "BPFALU32" in {
+ def CMPXCHGW32 : CMPXCHG32<BPF_W, "32", atomic_cmp_swap_32>;
+}
+
+let Defs = [R0], Uses = [R0] in {
+ def CMPXCHGD : CMPXCHG<BPF_DW, "64", atomic_cmp_swap_64>;
}
// bswap16, bswap32, bswap64
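
The new encodings map onto the usual GCC-style atomic builtins when compiling C or C++ for BPF. A rough correspondence (example code, not from the patch; the exact -mcpu requirements are left out here):

    void atomics(long *p64, int *p32, long v) {
      // Result unused: BPFMIChecking later rewrites the fetching form into the
      // plain atomic instruction (e.g. XFADDD -> XADDD).
      __sync_fetch_and_add(p64, v);
      // Result used: stays in the BPF_FETCH form (XFORD here).
      long old64 = __sync_fetch_and_or(p64, v);
      // Atomic exchange / compare-and-exchange map to XCHG* / CMPXCHG*.
      int old32 = __sync_lock_test_and_set(p32, (int)v);
      long prev = __sync_val_compare_and_swap(p64, 0L, v);
      (void)old64; (void)old32; (void)prev;
    }
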
diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/BPFMIChecking.cpp b/contrib/llvm-project/llvm/lib/Target/BPF/BPFMIChecking.cpp
index f82f166eda4d..4e24e3d911b8 100644
--- a/contrib/llvm-project/llvm/lib/Target/BPF/BPFMIChecking.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/BPF/BPFMIChecking.cpp
@@ -41,7 +41,7 @@ private:
// Initialize class variables.
void initialize(MachineFunction &MFParm);
- void checkingIllegalXADD(void);
+ bool processAtomicInsts(void);
public:
@@ -49,7 +49,7 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override {
if (!skipFunction(MF.getFunction())) {
initialize(MF);
- checkingIllegalXADD();
+ return processAtomicInsts();
}
return false;
}
@@ -143,17 +143,15 @@ static bool hasLiveDefs(const MachineInstr &MI, const TargetRegisterInfo *TRI) {
return true;
// Otherwise, return true if any aliased SuperReg of GPR32 is not dead.
- std::vector<unsigned>::iterator search_begin = GPR64DeadDefs.begin();
- std::vector<unsigned>::iterator search_end = GPR64DeadDefs.end();
for (auto I : GPR32LiveDefs)
for (MCSuperRegIterator SR(I, TRI); SR.isValid(); ++SR)
- if (std::find(search_begin, search_end, *SR) == search_end)
- return true;
+ if (!llvm::is_contained(GPR64DeadDefs, *SR))
+ return true;
return false;
}
-void BPFMIPreEmitChecking::checkingIllegalXADD(void) {
+bool BPFMIPreEmitChecking::processAtomicInsts(void) {
for (MachineBasicBlock &MBB : *MF) {
for (MachineInstr &MI : MBB) {
if (MI.getOpcode() != BPF::XADDW &&
@@ -174,7 +172,71 @@ void BPFMIPreEmitChecking::checkingIllegalXADD(void) {
}
}
- return;
+ // Check the return values of atomic_fetch_and_{add,and,or,xor}.
+ // If the return value is not used, the atomic_fetch_and_<op> instruction
+ // is replaced with an atomic_<op> instruction.
+ MachineInstr *ToErase = nullptr;
+ bool Changed = false;
+ const BPFInstrInfo *TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo();
+ for (MachineBasicBlock &MBB : *MF) {
+ for (MachineInstr &MI : MBB) {
+ if (ToErase) {
+ ToErase->eraseFromParent();
+ ToErase = nullptr;
+ }
+
+ if (MI.getOpcode() != BPF::XFADDW32 && MI.getOpcode() != BPF::XFADDD &&
+ MI.getOpcode() != BPF::XFANDW32 && MI.getOpcode() != BPF::XFANDD &&
+ MI.getOpcode() != BPF::XFXORW32 && MI.getOpcode() != BPF::XFXORD &&
+ MI.getOpcode() != BPF::XFORW32 && MI.getOpcode() != BPF::XFORD)
+ continue;
+
+ if (hasLiveDefs(MI, TRI))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Transforming "; MI.dump());
+ unsigned newOpcode;
+ switch (MI.getOpcode()) {
+ case BPF::XFADDW32:
+ newOpcode = BPF::XADDW32;
+ break;
+ case BPF::XFADDD:
+ newOpcode = BPF::XADDD;
+ break;
+ case BPF::XFANDW32:
+ newOpcode = BPF::XANDW32;
+ break;
+ case BPF::XFANDD:
+ newOpcode = BPF::XANDD;
+ break;
+ case BPF::XFXORW32:
+ newOpcode = BPF::XXORW32;
+ break;
+ case BPF::XFXORD:
+ newOpcode = BPF::XXORD;
+ break;
+ case BPF::XFORW32:
+ newOpcode = BPF::XORW32;
+ break;
+ case BPF::XFORD:
+ newOpcode = BPF::XORD;
+ break;
+ default:
+ llvm_unreachable("Incorrect Atomic Instruction Opcode");
+ }
+
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(newOpcode))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3));
+
+ ToErase = &MI;
+ Changed = true;
+ }
+ }
+
+ return Changed;
}
} // end anonymous namespace
diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/BPFPreserveDIType.cpp b/contrib/llvm-project/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
index c3cb7647aa79..18a4f60c171a 100644
--- a/contrib/llvm-project/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
@@ -17,6 +17,7 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
@@ -33,58 +34,32 @@ using namespace llvm;
namespace {
-class BPFPreserveDIType final : public ModulePass {
- StringRef getPassName() const override {
- return "BPF Preserve DebugInfo Type";
- }
-
- bool runOnModule(Module &M) override;
-
-public:
- static char ID;
- BPFPreserveDIType() : ModulePass(ID) {}
-
-private:
- bool doTransformation(Module &M);
-};
-} // End anonymous namespace
-
-char BPFPreserveDIType::ID = 0;
-INITIALIZE_PASS(BPFPreserveDIType, DEBUG_TYPE, "preserve debuginfo type", false,
- false)
-
-ModulePass *llvm::createBPFPreserveDIType() { return new BPFPreserveDIType(); }
-
-bool BPFPreserveDIType::runOnModule(Module &M) {
+static bool BPFPreserveDITypeImpl(Function &F) {
LLVM_DEBUG(dbgs() << "********** preserve debuginfo type **********\n");
+ Module *M = F.getParent();
+
// Bail out if no debug info.
- if (M.debug_compile_units().empty())
+ if (M->debug_compile_units().empty())
return false;
- return doTransformation(M);
-}
-
-bool BPFPreserveDIType::doTransformation(Module &M) {
std::vector<CallInst *> PreserveDITypeCalls;
- for (auto &F : M) {
- for (auto &BB : F) {
- for (auto &I : BB) {
- auto *Call = dyn_cast<CallInst>(&I);
- if (!Call)
- continue;
-
- const auto *GV = dyn_cast<GlobalValue>(Call->getCalledOperand());
- if (!GV)
- continue;
-
- if (GV->getName().startswith("llvm.bpf.btf.type.id")) {
- if (!Call->getMetadata(LLVMContext::MD_preserve_access_index))
- report_fatal_error(
- "Missing metadata for llvm.bpf.btf.type.id intrinsic");
- PreserveDITypeCalls.push_back(Call);
- }
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ auto *Call = dyn_cast<CallInst>(&I);
+ if (!Call)
+ continue;
+
+ const auto *GV = dyn_cast<GlobalValue>(Call->getCalledOperand());
+ if (!GV)
+ continue;
+
+ if (GV->getName().startswith("llvm.bpf.btf.type.id")) {
+ if (!Call->getMetadata(LLVMContext::MD_preserve_access_index))
+ report_fatal_error(
+ "Missing metadata for llvm.bpf.btf.type.id intrinsic");
+ PreserveDITypeCalls.push_back(Call);
}
}
}
@@ -93,39 +68,72 @@ bool BPFPreserveDIType::doTransformation(Module &M) {
return false;
std::string BaseName = "llvm.btf_type_id.";
- int Count = 0;
+ static int Count = 0;
for (auto Call : PreserveDITypeCalls) {
- const ConstantInt *Flag = dyn_cast<ConstantInt>(Call->getArgOperand(2));
+ const ConstantInt *Flag = dyn_cast<ConstantInt>(Call->getArgOperand(1));
assert(Flag);
uint64_t FlagValue = Flag->getValue().getZExtValue();
if (FlagValue >= BPFCoreSharedInfo::MAX_BTF_TYPE_ID_FLAG)
report_fatal_error("Incorrect flag for llvm.bpf.btf.type.id intrinsic");
+ MDNode *MD = Call->getMetadata(LLVMContext::MD_preserve_access_index);
+
uint32_t Reloc;
- if (FlagValue == BPFCoreSharedInfo::BTF_TYPE_ID_LOCAL_RELOC)
+ if (FlagValue == BPFCoreSharedInfo::BTF_TYPE_ID_LOCAL_RELOC) {
Reloc = BPFCoreSharedInfo::BTF_TYPE_ID_LOCAL;
- else
+ } else {
Reloc = BPFCoreSharedInfo::BTF_TYPE_ID_REMOTE;
+ DIType *Ty = cast<DIType>(MD);
+ if (Ty->getName().empty())
+ report_fatal_error("Empty type name for BTF_TYPE_ID_REMOTE reloc");
+ }
BasicBlock *BB = Call->getParent();
- IntegerType *VarType = Type::getInt32Ty(BB->getContext());
+ IntegerType *VarType = Type::getInt64Ty(BB->getContext());
std::string GVName = BaseName + std::to_string(Count) + "$" +
std::to_string(Reloc);
- GlobalVariable *GV =
- new GlobalVariable(M, VarType, false, GlobalVariable::ExternalLinkage,
- NULL, GVName);
+ GlobalVariable *GV = new GlobalVariable(
+ *M, VarType, false, GlobalVariable::ExternalLinkage, NULL, GVName);
GV->addAttribute(BPFCoreSharedInfo::TypeIdAttr);
- MDNode *MD = Call->getMetadata(LLVMContext::MD_preserve_access_index);
GV->setMetadata(LLVMContext::MD_preserve_access_index, MD);
// Load the global variable which represents the type info.
- auto *LDInst = new LoadInst(Type::getInt32Ty(BB->getContext()), GV, "",
- Call);
- Call->replaceAllUsesWith(LDInst);
+ auto *LDInst =
+ new LoadInst(Type::getInt64Ty(BB->getContext()), GV, "", Call);
+ Instruction *PassThroughInst =
+ BPFCoreSharedInfo::insertPassThrough(M, BB, LDInst, Call);
+ Call->replaceAllUsesWith(PassThroughInst);
Call->eraseFromParent();
Count++;
}
return true;
}
+
+class BPFPreserveDIType final : public FunctionPass {
+ bool runOnFunction(Function &F) override;
+
+public:
+ static char ID;
+ BPFPreserveDIType() : FunctionPass(ID) {}
+};
+} // End anonymous namespace
+
+char BPFPreserveDIType::ID = 0;
+INITIALIZE_PASS(BPFPreserveDIType, DEBUG_TYPE, "BPF Preserve Debuginfo Type",
+ false, false)
+
+FunctionPass *llvm::createBPFPreserveDIType() {
+ return new BPFPreserveDIType();
+}
+
+bool BPFPreserveDIType::runOnFunction(Function &F) {
+ return BPFPreserveDITypeImpl(F);
+}
+
+PreservedAnalyses BPFPreserveDITypePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ return BPFPreserveDITypeImpl(F) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
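
For reference, the intrinsic handled here is what clang emits for its BTF type-id builtin. A rough source-level sketch (the builtin spelling and flag meanings follow CO-RE usage conventions and are assumptions here, not part of this patch; note the result is now materialized as a 64-bit load):

    struct task_struct { int pid; };  // illustrative declaration only

    unsigned long long local_and_remote_ids(void) {
      // Flag 0 requests the id in the local (program) BTF,
      // flag 1 the id in the remote/target (kernel) BTF.
      unsigned long long local_id =
          __builtin_btf_type_id(*(struct task_struct *)0, 0);
      unsigned long long remote_id =
          __builtin_btf_type_id(*(struct task_struct *)0, 1);
      return local_id ^ remote_id;
    }
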
diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/BPFSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/BPF/BPFSubtarget.cpp
index f3cb03b1f1f5..fac02e6476b7 100644
--- a/contrib/llvm-project/llvm/lib/Target/BPF/BPFSubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/BPF/BPFSubtarget.cpp
@@ -29,7 +29,7 @@ BPFSubtarget &BPFSubtarget::initializeSubtargetDependencies(StringRef CPU,
StringRef FS) {
initializeEnvironment();
initSubtargetFeatures(CPU, FS);
- ParseSubtargetFeatures(CPU, FS);
+ ParseSubtargetFeatures(CPU, /*TuneCPU*/ CPU, FS);
return *this;
}
@@ -59,6 +59,6 @@ void BPFSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
BPFSubtarget::BPFSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const TargetMachine &TM)
- : BPFGenSubtargetInfo(TT, CPU, FS), InstrInfo(),
+ : BPFGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), InstrInfo(),
FrameLowering(initializeSubtargetDependencies(CPU, FS)),
TLInfo(TM, *this) {}
diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/BPFSubtarget.h b/contrib/llvm-project/llvm/lib/Target/BPF/BPFSubtarget.h
index 3da6a026ab7e..7649e0e92222 100644
--- a/contrib/llvm-project/llvm/lib/Target/BPF/BPFSubtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/BPF/BPFSubtarget.h
@@ -67,7 +67,7 @@ public:
// ParseSubtargetFeatures - Parses features string setting specified
// subtarget options. Definition of function is auto generated by tblgen.
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
bool getHasJmpExt() const { return HasJmpExt; }
bool getHasJmp32() const { return HasJmp32; }
bool getHasAlu32() const { return HasAlu32; }
diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.cpp
index 54204ee197ec..c0244b9f2c74 100644
--- a/contrib/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.cpp
@@ -18,9 +18,15 @@
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/SimplifyCFG.h"
+#include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
using namespace llvm;
static cl::
@@ -34,8 +40,10 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTarget() {
RegisterTargetMachine<BPFTargetMachine> Z(getTheBPFTarget());
PassRegistry &PR = *PassRegistry::getPassRegistry();
- initializeBPFAbstractMemberAccessPass(PR);
+ initializeBPFAbstractMemberAccessLegacyPassPass(PR);
initializeBPFPreserveDITypePass(PR);
+ initializeBPFAdjustOptPass(PR);
+ initializeBPFCheckAndAdjustIRPass(PR);
initializeBPFMIPeepholePass(PR);
initializeBPFMIPeepholeTruncElimPass(PR);
}
@@ -49,9 +57,7 @@ static std::string computeDataLayout(const Triple &TT) {
}
static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
- if (!RM.hasValue())
- return Reloc::PIC_;
- return *RM;
+ return RM.getValueOr(Reloc::PIC_);
}
BPFTargetMachine::BPFTargetMachine(const Target &T, const Triple &TT,
@@ -94,11 +100,48 @@ TargetPassConfig *BPFTargetMachine::createPassConfig(PassManagerBase &PM) {
return new BPFPassConfig(*this, PM);
}
-void BPFPassConfig::addIRPasses() {
+void BPFTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
+ Builder.addExtension(
+ PassManagerBuilder::EP_EarlyAsPossible,
+ [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ PM.add(createBPFAbstractMemberAccess(this));
+ PM.add(createBPFPreserveDIType());
+ });
+
+ Builder.addExtension(
+ PassManagerBuilder::EP_Peephole,
+ [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ PM.add(createCFGSimplificationPass(
+ SimplifyCFGOptions().hoistCommonInsts(true)));
+ });
+ Builder.addExtension(
+ PassManagerBuilder::EP_ModuleOptimizerEarly,
+ [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ PM.add(createBPFAdjustOpt());
+ });
+}
- addPass(createBPFAbstractMemberAccess(&getBPFTargetMachine()));
- addPass(createBPFPreserveDIType());
+void BPFTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
+ bool DebugPassManager) {
+ PB.registerPipelineStartEPCallback(
+ [=](ModulePassManager &MPM, PassBuilder::OptimizationLevel) {
+ FunctionPassManager FPM(DebugPassManager);
+ FPM.addPass(BPFAbstractMemberAccessPass(this));
+ FPM.addPass(BPFPreserveDITypePass());
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+ });
+ PB.registerPeepholeEPCallback([=](FunctionPassManager &FPM,
+ PassBuilder::OptimizationLevel Level) {
+ FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().hoistCommonInsts(true)));
+ });
+ PB.registerPipelineEarlySimplificationEPCallback(
+ [=](ModulePassManager &MPM, PassBuilder::OptimizationLevel) {
+ MPM.addPass(BPFAdjustOptPass());
+ });
+}
+void BPFPassConfig::addIRPasses() {
+ addPass(createBPFCheckAndAdjustIR());
TargetPassConfig::addIRPasses();
}
diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.h
index beac7bd862da..5243a15eb7b0 100644
--- a/contrib/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.h
+++ b/contrib/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.h
@@ -37,6 +37,10 @@ public:
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
+
+ void adjustPassManager(PassManagerBuilder &) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB,
+ bool DebugPassManager) override;
};
}
diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/BTFDebug.cpp b/contrib/llvm-project/llvm/lib/Target/BPF/BTFDebug.cpp
index 4510e9357489..f9bdffe7cbae 100644
--- a/contrib/llvm-project/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -22,6 +22,7 @@
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/LineIterator.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
@@ -993,12 +994,13 @@ void BTFDebug::generatePatchImmReloc(const MCSymbol *ORSym, uint32_t RootId,
FieldReloc.OffsetNameOff = addString(IndexPattern);
FieldReloc.RelocKind = std::stoull(std::string(RelocKindStr));
- PatchImms[GVar] = std::stoul(std::string(PatchImmStr));
+ PatchImms[GVar] = std::make_pair(std::stoll(std::string(PatchImmStr)),
+ FieldReloc.RelocKind);
} else {
StringRef RelocStr = AccessPattern.substr(FirstDollar + 1);
FieldReloc.OffsetNameOff = addString("0");
FieldReloc.RelocKind = std::stoull(std::string(RelocStr));
- PatchImms[GVar] = RootId;
+ PatchImms[GVar] = std::make_pair(RootId, FieldReloc.RelocKind);
}
FieldRelocTable[SecNameOff].push_back(FieldReloc);
}
@@ -1074,6 +1076,9 @@ void BTFDebug::beginInstruction(const MachineInstr *MI) {
}
}
+ if (!CurMI) // no debug info
+ return;
+
// Skip this instruction if no DebugLoc or the DebugLoc
// is the same as the previous instruction.
const DebugLoc &DL = MI->getDebugLoc();
@@ -1125,6 +1130,20 @@ void BTFDebug::processGlobals(bool ProcessingMapDef) {
if (ProcessingMapDef != SecName.startswith(".maps"))
continue;
+ // Create a .rodata datasec if the global variable is an initialized
+ // constant with private linkage and if it won't be in .rodata.str<#>
+ // and .rodata.cst<#> sections.
+ if (SecName == ".rodata" && Global.hasPrivateLinkage() &&
+ DataSecEntries.find(std::string(SecName)) == DataSecEntries.end()) {
+ SectionKind GVKind =
+ TargetLoweringObjectFile::getKindForGlobal(&Global, Asm->TM);
+ // skip .rodata.str<#> and .rodata.cst<#> sections
+ if (!GVKind.isMergeableCString() && !GVKind.isMergeableConst()) {
+ DataSecEntries[std::string(SecName)] =
+ std::make_unique<BTFKindDataSec>(Asm, std::string(SecName));
+ }
+ }
+
SmallVector<DIGlobalVariableExpression *, 1> GVs;
Global.getDebugInfo(GVs);
@@ -1194,14 +1213,23 @@ bool BTFDebug::InstLower(const MachineInstr *MI, MCInst &OutMI) {
auto *GVar = dyn_cast<GlobalVariable>(GVal);
if (GVar) {
// Emit "mov ri, <imm>"
- uint32_t Imm;
+ int64_t Imm;
+ uint32_t Reloc;
if (GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr) ||
- GVar->hasAttribute(BPFCoreSharedInfo::TypeIdAttr))
- Imm = PatchImms[GVar];
- else
+ GVar->hasAttribute(BPFCoreSharedInfo::TypeIdAttr)) {
+ Imm = PatchImms[GVar].first;
+ Reloc = PatchImms[GVar].second;
+ } else {
return false;
+ }
- OutMI.setOpcode(BPF::MOV_ri);
+ if (Reloc == BPFCoreSharedInfo::ENUM_VALUE_EXISTENCE ||
+ Reloc == BPFCoreSharedInfo::ENUM_VALUE ||
+ Reloc == BPFCoreSharedInfo::BTF_TYPE_ID_LOCAL ||
+ Reloc == BPFCoreSharedInfo::BTF_TYPE_ID_REMOTE)
+ OutMI.setOpcode(BPF::LD_imm64);
+ else
+ OutMI.setOpcode(BPF::MOV_ri);
OutMI.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
OutMI.addOperand(MCOperand::createImm(Imm));
return true;
@@ -1215,7 +1243,7 @@ bool BTFDebug::InstLower(const MachineInstr *MI, MCInst &OutMI) {
const GlobalValue *GVal = MO.getGlobal();
auto *GVar = dyn_cast<GlobalVariable>(GVal);
if (GVar && GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) {
- uint32_t Imm = PatchImms[GVar];
+ uint32_t Imm = PatchImms[GVar].first;
OutMI.setOpcode(MI->getOperand(1).getImm());
if (MI->getOperand(0).isImm())
OutMI.addOperand(MCOperand::createImm(MI->getOperand(0).getImm()));
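The hunks above widen PatchImms from a bare 32-bit immediate to an (immediate, relocation kind) pair so that InstLower() can pick LD_imm64 for the 64-bit CO-RE relocation kinds and keep MOV_ri for the rest. A minimal standalone sketch of that selection, using stand-in enumerator values rather than the real BPFCoreSharedInfo constants:

    #include <cstdint>

    // Illustrative stand-ins only; the real kinds live in BPFCoreSharedInfo.
    enum RelocKindSketch : uint32_t {
      BTF_TYPE_ID_LOCAL = 6,
      BTF_TYPE_ID_REMOTE = 7,
      ENUM_VALUE_EXISTENCE = 10,
      ENUM_VALUE = 11,
    };

    // Mirrors the opcode choice in InstLower(): relocations whose patched
    // value may need all 64 bits get LD_imm64, everything else keeps MOV_ri.
    bool needsLdImm64(uint32_t Kind) {
      return Kind == ENUM_VALUE_EXISTENCE || Kind == ENUM_VALUE ||
             Kind == BTF_TYPE_ID_LOCAL || Kind == BTF_TYPE_ID_REMOTE;
    }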
diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/BTFDebug.h b/contrib/llvm-project/llvm/lib/Target/BPF/BTFDebug.h
index 2f39f665299a..1bad0d11fee4 100644
--- a/contrib/llvm-project/llvm/lib/Target/BPF/BTFDebug.h
+++ b/contrib/llvm-project/llvm/lib/Target/BPF/BTFDebug.h
@@ -16,7 +16,8 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/CodeGen/DebugHandlerBase.h"
-#include "llvm/CodeGen/MachineInstr.h"
+#include <cstdint>
+#include <map>
#include <set>
#include <unordered_map>
#include "BTF.h"
@@ -27,9 +28,12 @@ class AsmPrinter;
class BTFDebug;
class DIType;
class GlobalVariable;
+class MachineFunction;
+class MachineInstr;
+class MachineOperand;
+class MCInst;
class MCStreamer;
class MCSymbol;
-class MachineFunction;
/// The base class for BTF type generation.
class BTFTypeBase {
@@ -251,7 +255,7 @@ class BTFDebug : public DebugHandlerBase {
StringMap<std::vector<std::string>> FileContent;
std::map<std::string, std::unique_ptr<BTFKindDataSec>> DataSecEntries;
std::vector<BTFTypeStruct *> StructTypes;
- std::map<const GlobalVariable *, uint32_t> PatchImms;
+ std::map<const GlobalVariable *, std::pair<int64_t, uint32_t>> PatchImms;
std::map<StringRef, std::pair<bool, std::vector<BTFTypeDerived *>>>
FixupDerivedTypes;
std::set<const Function *>ProtoFunctions;
diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/contrib/llvm-project/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
index 4d98dc7341d0..3a1492743bf4 100644
--- a/contrib/llvm-project/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
@@ -58,7 +58,7 @@ public:
BPF_MEM = 0x3,
BPF_LEN = 0x4,
BPF_MSH = 0x5,
- BPF_XADD = 0x6
+ BPF_ATOMIC = 0x6
};
BPFDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
@@ -176,7 +176,7 @@ DecodeStatus BPFDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
uint8_t InstMode = getInstMode(Insn);
if ((InstClass == BPF_LDX || InstClass == BPF_STX) &&
getInstSize(Insn) != BPF_DW &&
- (InstMode == BPF_MEM || InstMode == BPF_XADD) &&
+ (InstMode == BPF_MEM || InstMode == BPF_ATOMIC) &&
STI.getFeatureBits()[BPF::ALU32])
Result = decodeInstruction(DecoderTableBPFALU3264, Instr, Insn, Address,
this, STI);
diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
index 9d829ac45a10..29e9d5da0836 100644
--- a/contrib/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -43,11 +43,6 @@ public:
unsigned getNumFixupKinds() const override { return 1; }
- bool mayNeedRelaxation(const MCInst &Inst,
- const MCSubtargetInfo &STI) const override {
- return false;
- }
-
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
};
diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.h
index 2181bb575cdd..e76067ea41ae 100644
--- a/contrib/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.h
@@ -32,6 +32,7 @@ public:
void printBrTargetOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
// Autogenerated by tblgen.
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
};
diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
index f9abe76c976b..12af92e0d198 100644
--- a/contrib/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
@@ -13,6 +13,7 @@
#include "MCTargetDesc/BPFMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -158,12 +159,18 @@ void BPFMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
uint64_t BPFMCCodeEmitter::getMemoryOpValue(const MCInst &MI, unsigned Op,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
+ // For CMPXCHG instructions, the output is implicitly in R0/W0,
+ // so the memory operand starts at operand 0.
+ int MemOpStartIndex = 1, Opcode = MI.getOpcode();
+ if (Opcode == BPF::CMPXCHGW32 || Opcode == BPF::CMPXCHGD)
+ MemOpStartIndex = 0;
+
uint64_t Encoding;
- const MCOperand Op1 = MI.getOperand(1);
+ const MCOperand Op1 = MI.getOperand(MemOpStartIndex);
assert(Op1.isReg() && "First operand is not register.");
Encoding = MRI.getEncodingValue(Op1.getReg());
Encoding <<= 16;
- MCOperand Op2 = MI.getOperand(2);
+ MCOperand Op2 = MI.getOperand(MemOpStartIndex + 1);
assert(Op2.isImm() && "Second operand is not immediate.");
Encoding |= Op2.getImm() & 0xffff;
return Encoding;
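The comment in the hunk above explains why CMPXCHG shifts the memory-operand index; the packing itself is simple. A standalone sketch (not part of the patch) of the value getMemoryOpValue() returns, using plain integers in place of MCOperand:

    #include <cstdint>

    // The base-register encoding goes in bits 16 and up, the signed 16-bit
    // offset in the low 16 bits, matching getMemoryOpValue() above.
    uint64_t packBPFMemOperand(uint8_t RegEncoding, int16_t Offset) {
      uint64_t Encoding = static_cast<uint64_t>(RegEncoding) << 16;
      Encoding |= static_cast<uint16_t>(Offset); // keep only the low 16 bits
      return Encoding;
    }
    // packBPFMemOperand(1, -8) == 0x1fff8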
diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
index 58da0830d002..8fb7d7e89f09 100644
--- a/contrib/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
@@ -46,7 +46,7 @@ static MCRegisterInfo *createBPFMCRegisterInfo(const Triple &TT) {
static MCSubtargetInfo *createBPFMCSubtargetInfo(const Triple &TT,
StringRef CPU, StringRef FS) {
- return createBPFMCSubtargetInfoImpl(TT, CPU, FS);
+ return createBPFMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS);
}
static MCStreamer *createBPFMCStreamer(const Triple &T, MCContext &Ctx,
diff --git a/contrib/llvm-project/llvm/lib/Target/CSKY/CSKY.td b/contrib/llvm-project/llvm/lib/Target/CSKY/CSKY.td
new file mode 100644
index 000000000000..da6151befa1b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/CSKY/CSKY.td
@@ -0,0 +1,32 @@
+//===-- CSKY.td - Describe the CSKY Target Machine ---------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Registers, calling conventions, instruction descriptions.
+//===----------------------------------------------------------------------===//
+
+include "CSKYRegisterInfo.td"
+include "CSKYInstrInfo.td"
+
+//===----------------------------------------------------------------------===//
+// CSKY processors supported.
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"generic-csky", NoSchedModel, []>;
+
+//===----------------------------------------------------------------------===//
+// Define the CSKY target.
+//===----------------------------------------------------------------------===//
+
+def CSKYInstrInfo : InstrInfo;
+
+def CSKY : Target {
+ let InstructionSet = CSKYInstrInfo;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/CSKY/CSKYInstrFormats.td b/contrib/llvm-project/llvm/lib/Target/CSKY/CSKYInstrFormats.td
new file mode 100644
index 000000000000..86f9dd0b7da3
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/CSKY/CSKYInstrFormats.td
@@ -0,0 +1,528 @@
+//===-- CSKYInstrFormats.td - CSKY Instruction Formats -----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+class AddrMode<bits<5> val> {
+ bits<5> Value = val;
+}
+
+def AddrModeNone : AddrMode<0>;
+def AddrMode32B : AddrMode<1>; // ld32.b, ld32.bs, st32.b, st32.bs, +4kb
+def AddrMode32H : AddrMode<2>; // ld32.h, ld32.hs, st32.h, st32.hs, +8kb
+def AddrMode32WD : AddrMode<3>; // ld32.w, st32.w, ld32.d, st32.d, +16kb
+def AddrMode16B : AddrMode<4>; // ld16.b, +32b
+def AddrMode16H : AddrMode<5>; // ld16.h, +64b
+def AddrMode16W : AddrMode<6>; // ld16.w, +128b or +1kb
+def AddrMode32SDF : AddrMode<7>; // flds, fldd, +1kb
+
+class CSKYInst<AddrMode am, int sz, dag outs, dag ins, string asmstr,
+ list<dag> pattern> : Instruction {
+ let Namespace = "CSKY";
+ int Size = sz;
+ AddrMode AM = am;
+
+ let OutOperandList = outs;
+ let InOperandList = ins;
+ let AsmString = asmstr;
+ let Pattern = pattern;
+ let Itinerary = NoItinerary;
+ let TSFlags{4 - 0} = AM.Value;
+}
+
+class CSKYPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : CSKYInst<AddrModeNone, 0, outs, ins, asmstr, pattern> {
+ let isCodeGenOnly = 1;
+ let isPseudo = 1;
+}
+
+class CSKY32Inst<AddrMode am, bits<6> opcode, dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : CSKYInst<am, 4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ let Inst{31 - 26} = opcode;
+}
+
+// CSKY 32-bit instruction
+// Format< OP[6] | Offset[26] >
+// Instruction(1): bsr32
+class J<bits<6> opcode, dag outs, dag ins, string op, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, opcode, outs, ins, !strconcat(op, "\t$offset"),
+ pattern> {
+ bits<26> offset;
+ let Inst{25 - 0} = offset;
+}
+
+// Format< OP[6] | RZ[5] | SOP[3] | OFFSET[18] >
+// Instructions(7): grs, lrs32.b, lrs32.h, lrs32.w, srs32.b, srs32.h, srs32.w
+class I_18_Z_L<bits<3> sop, string op, Operand operand, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x33, (outs GPR:$rz), (ins operand:$offset),
+ !strconcat(op, "\t$rz, $offset"), pattern> {
+ bits<5> rz;
+ bits<18> offset;
+ let Inst{25 - 21} = rz;
+ let Inst{20 - 18} = sop;
+ let Inst{17 - 0} = offset;
+}
+
+// Format< OP[6] | RZ[5] | RX[5] | IMM[16] >
+// Instructions(1): ori32
+class I_16_ZX<string op, ImmLeaf ImmType, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x3b,
+ (outs GPR:$rz), (ins GPR:$rx,ImmType:$imm16),
+ !strconcat(op, "\t$rz, $rx, $imm16"), pattern> {
+ bits<5> rz;
+ bits<5> rx;
+ bits<16> imm16;
+ let Inst{25 - 21} = rz;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 0} = imm16;
+}
+
+// Format< OP[6] | SOP[5] | RZ[5] | IMM[16] >
+// Instructions(3): movi32, movih32, (bgeni32)
+class I_16_MOV<bits<5> sop, string op, ImmLeaf ImmType>
+ : CSKY32Inst<AddrModeNone, 0x3a, (outs GPR:$rz), (ins ImmType:$imm16),
+ !strconcat(op, "\t$rz, $imm16"),
+ [(set GPR:$rz, ImmType:$imm16)]> {
+ bits<5> rz;
+ bits<16> imm16;
+ let Inst{25 - 21} = sop;
+ let Inst{20 - 16} = rz;
+ let Inst{15 - 0} = imm16;
+ let isReMaterializable = 1;
+ let isAsCheapAsAMove = 1;
+ let isMoveImm = 1;
+}
+
+// Format< OP[6] | SOP[5] | RZ[5] | OFFSET[16] >
+// Instructions(1): lrw32
+class I_16_Z_L<bits<5> sop, string op, Operand operand, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x3a,
+ (outs GPR:$rz), (ins operand:$imm16),
+ !strconcat(op, "\t$rz, [$imm16]"), pattern> {
+ bits<5> rz;
+ bits<16> imm16;
+ let Inst{25 - 21} = sop;
+ let Inst{20 - 16} = rz;
+ let Inst{15 - 0} = imm16;
+}
+
+// Format< OP[6] | SOP[5] | 00000[5] | OFFSET[16] >
+// Instructions(5): bt32, bf32, br32, jmpi32, jsri32
+class I_16_L<bits<5> sop, dag outs, dag ins, string op, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x3a, outs, ins, !strconcat(op, "\t$imm16"),
+ pattern> {
+ bits<16> imm16;
+ let Inst{25 - 21} = sop;
+ let Inst{20 - 16} = 0;
+ let Inst{15 - 0} = imm16;
+}
+
+// bt32, bf32, br32, jmpi32
+class I_16_L_B<bits<5> sop, string op, Operand operand, list<dag> pattern>
+ : I_16_L<sop, (outs), (ins operand:$imm16, CARRY:$ca), op, pattern> {
+ let isBranch = 1;
+ let isTerminator = 1;
+}
+
+// Format< OP[6] | SOP[5] | RX[5] | 0000000000000000[16] >
+// Instructions(2): jmp32, jsr32
+class I_16_JX<bits<5> sop, string op, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x3a, (outs), (ins GPR:$rx),
+ !strconcat(op, "\t$rx"), pattern> {
+ bits<5> rx;
+ bits<16> imm16;
+ let Inst{25 - 21} = sop;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 0} = 0;
+}
+
+// Format< OP[6] | SOP[5] | RX[5] | 00000000000000[14] | IMM[2] >
+// Instructions(1): jmpix32
+class I_16_J_XI<bits<5> sop, string op, Operand operand, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x3a, (outs),
+ (ins GPR:$rx, operand:$imm2),
+ !strconcat(op, "\t$rx, $imm2"), pattern> {
+ bits<5> rx;
+ bits<2> imm2;
+ let Inst{25 - 21} = sop;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 2} = 0;
+ let Inst{1 - 0} = imm2;
+}
+
+// Format< OP[6] | SOP[5] | PCODE[5] | 0000000000000000[16] >
+// Instructions(1): rts32
+class I_16_RET<bits<5> sop, bits<5> pcode, string op, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x3a, (outs), (ins), op, pattern> {
+ let Inst{25 - 21} = sop;
+ let Inst{20 - 16} = pcode;
+ let Inst{15 - 0} = 0;
+ let isTerminator = 1;
+ let isReturn = 1;
+ let isBarrier = 1;
+}
+
+// Format< OP[6] | SOP[5] | RX[5] | IMM16[16] >
+// Instructions(3): cmpnei32, cmphsi32, cmplti32
+class I_16_X<bits<5> sop, string op>
+ : CSKY32Inst<AddrModeNone, 0x3a, (outs CARRY:$ca),
+ (ins GPR:$rx, i32imm:$imm16), !strconcat(op, "\t$rx, $imm16"), []> {
+ bits<16> imm16;
+ bits<5> rx;
+ let Inst{25 - 21} = sop;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 0} = imm16;
+ let isCompare = 1;
+}
+
+// Format< OP[6] | SOP[5] | RX[5] | OFFSET[16] >
+// Instructions(7): bez32, bnez32, bnezad32, bhz32, blsz32, blz32, bhsz32
+class I_16_X_L<bits<5> sop, string op, Operand operand>
+ : CSKY32Inst<AddrModeNone, 0x3a, (outs), (ins GPR:$rx, operand:$imm16),
+ !strconcat(op, "\t$rx, $imm16"), []> {
+ bits<5> rx;
+ bits<16> imm16;
+ let Inst{25 - 21} = sop;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 0} = imm16;
+ let isBranch = 1;
+ let isTerminator = 1;
+}
+
+// Format< OP[6] | RZ[5] | RX[5] | SOP[4] | IMM[12] >
+// Instructions(5): addi32, subi32, andi32, andni32, xori32
+class I_12<bits<4> sop, string op, SDNode node, ImmLeaf ImmType>
+ : CSKY32Inst<AddrModeNone, 0x39, (outs GPR:$rz),
+ (ins GPR:$rx, ImmType:$imm12), !strconcat(op, "\t$rz, $rx, $imm12"),
+ [(set GPR:$rz, (node GPR:$rx, ImmType:$imm12))]> {
+ bits<5> rz;
+ bits<5> rx;
+ bits<12> imm12;
+ let Inst{25 - 21} = rz;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 12} = sop;
+ let Inst{11 - 0} = imm12;
+}
+
+class I_LDST<AddrMode am, bits<6> opcode, bits<4> sop, dag outs, dag ins,
+ string op, list<dag> pattern>
+ : CSKY32Inst<am, opcode, outs, ins, !strconcat(op, "\t$rz, ($rx, $imm12)"),
+ pattern> {
+ bits<5> rx;
+ bits<5> rz;
+ bits<12> imm12;
+ let Inst{25 - 21} = rz;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 12} = sop;
+ let Inst{11 - 0} = imm12;
+}
+
+// Format< OP[6] | RZ[5] | RX[5] | SOP[4] | OFFSET[12] >
+// Instructions(6): ld32.b, ld32.bs, ld32.h, ld32.hs, ld32.w, ld32.d
+class I_LD<AddrMode am, bits<4> sop, string op, Operand operand>
+ : I_LDST<am, 0x36, sop,
+ (outs GPR:$rz), (ins GPR:$rx, operand:$imm12), op, []>;
+
+// Format< OP[6] | RZ[5] | RX[5] | SOP[4] | OFFSET[12] >
+// Instructions(4): st32.b, st32.h, st32.w, st32.d
+class I_ST<AddrMode am, bits<4> sop, string op, Operand operand>
+ : I_LDST<am, 0x37, sop, (outs),
+ (ins GPR:$rz, GPR:$rx, operand:$imm12), op, []>;
+
+// Format< OP[6] | SOP[5] | PCODE[5] | 0000[4] | 000 | R28 | LIST2[3] | R15 |
+// LIST1[4] >
+// Instructions(2): push32, pop32
+class I_12_PP<bits<5> sop, bits<5> pcode, dag outs, dag ins, string op>
+ : CSKY32Inst<AddrModeNone, 0x3a, outs, ins, !strconcat(op, "\t$regs"), []> {
+ bits<12> regs;
+ let Inst{25 - 21} = sop;
+ let Inst{20 - 16} = pcode;
+ let Inst{15 - 12} = 0;
+ let Inst{11 - 0} = regs;
+}
+
+// Format< OP[6] | RZ[5] | RX[5] | SOP[6] | PCODE[5] | IMM[5]>
+// Instructions(4): incf32, inct32, decf32, dect32
+class I_5_ZX<bits<6> sop, bits<5> pcode, string op, ImmLeaf ImmType,
+ list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x31, (outs GPR:$rz),
+ (ins GPR:$false, GPR:$rx, ImmType:$imm5),
+ !strconcat(op, "\t$rz, $rx, $imm5"), pattern> {
+ bits<5> rz;
+ bits<5> rx;
+ bits<5> imm5;
+ let Inst{25 - 21} = rz;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = pcode;
+ let Inst{4 - 0} = imm5;
+ let Constraints = "$rz = $false";
+}
+
+// Format< OP[6] | IMM[5] | RX[5] | SOP[6] | PCODE[5] | RZ[5]>
+// Instructions(13): decgt32, declt32, decne32, lsli32, lslc32, lsri32
+// lsrc32, asri32, asrc32, rotli32, xsr32, bclri32, bseti32
+class I_5_XZ<bits<6> sop, bits<5> pcode, string op, dag ins, dag outs,
+ list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x31, ins, outs,
+ !strconcat(op, "\t$rz, $rx, $imm5"), pattern> {
+ bits<5> imm5;
+ bits<5> rx;
+ bits<5> rz;
+ let Inst{25 - 21} = imm5;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = pcode;
+ let Inst{4 - 0} = rz;
+}
+
+// Format< OP[6] | RY[5] | RX[5] | SOP[6] | PCODE[5] | IMM[5]>
+// Instructions(2): ldm32, (ldq32), stm32, (stq32)
+class I_5_YX<bits<6> opcode, dag outs, dag ins, string op, list<dag> pattern,
+ bits<5> imm5>
+ : CSKY32Inst<AddrModeNone, opcode, outs, ins,
+ op #"\t${ry}, (${rx}), " #!cast<int>(imm5), pattern> {
+ bits<5> rx;
+ bits<5> ry;
+ let Inst{25 - 21} = ry; // ry
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 10} = 0b000111;
+ let Inst{9 - 5} = 0b00001;
+ let Inst{4 - 0} = imm5{4 - 0}; // imm5
+}
+
+// Format< OP[6] | LSB[5] | RX[5] | SOP[6] | MSB[5] | RZ[5]>
+// Instructions(6): zext32, zextb32, zexth32, sext32, sextb32, sexth32
+class I_5_XZ_U<bits<6> sop, bits<5> lsb, bits<5> msb, dag outs, dag ins,
+ string op, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x31, outs, ins,
+ op #"\t$rz, $rx, " #!cast<int>(msb) #", " #!cast<int>(lsb),
+ pattern> {
+ bits<5> rx;
+ bits<5> rz;
+ let Inst{25 - 21} = lsb; // lsb
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = msb; // msb
+ let Inst{4 - 0} = rz;
+}
+
+// sextb, sexth
+class I_5_XZ_US<bits<6> sop, bits<5> lsb, bits<5> msb, string op, SDNode opnode,
+ ValueType type> : I_5_XZ_U<sop, lsb, msb,
+ (outs GPR:$rz), (ins GPR:$rx),op, [(set GPR:$rz, (opnode GPR:$rx, type))]>;
+
+class I_5_XZ_UZ<bits<6> sop, bits<5> lsb, bits<5> msb, string op, int v>
+ : I_5_XZ_U<sop, lsb, msb, (outs GPR:$rz), (ins GPR:$rx), op,
+ [(set GPR:$rz, (and GPR:$rx, (i32 v)))]>;
+
+// Format< OP[6] | RZ[5] | RX[5] | SOP[6] | SIZE[5] | LSB[5]>
+// Instructions(1): ins32
+class I_5_ZX_U<bits<6> sop, string op, Operand operand, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x31, (outs GPR:$rz), (ins operand:$size_lsb),
+ !strconcat(op, "\t$rz, operand:$size_lsb"), pattern> {
+ bits<10> size_lsb;
+ bits<5> rz;
+ bits<5> rx;
+ let Inst{25 - 21} = rz;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = size_lsb{9 - 5}; // size
+ let Inst{4 - 0} = size_lsb{4 - 0}; // lsb
+}
+
+// Format< OP[6] | IMM[5] | RX[5] | SOP[6] | PCODE[5] | 00000 >
+// Instructions(1): btsti32
+class I_5_X<bits<6> sop, bits<5> pcode, string op, ImmLeaf ImmType,
+ list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x31,
+ (outs CARRY:$ca), (ins GPR:$rx, ImmType:$imm5),
+ !strconcat(op, "\t$rx, $imm5"), pattern> {
+ bits<5> imm5;
+ bits<5> rx;
+ let Inst{25 - 21} = imm5;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = pcode;
+ let Inst{4 - 0} = 0;
+ let isCompare = 1;
+}
+
+// Format< OP[6] | IMM[5] | 00000[5] | SOP[6] | PCODE[5] | RZ[5]>
+// Instructions(1): bmaski32
+class I_5_Z<bits<6> sop, bits<5> pcode, string op, ImmLeaf ImmType,
+ list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x31, (outs GPR:$rz), (ins ImmType:$imm5),
+ !strconcat(op, "\t$rz, $imm5"), pattern> {
+ bits<5> imm5;
+ bits<5> rz;
+ let Inst{25 - 21} = imm5;
+ let Inst{20 - 16} = 0;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = pcode;
+ let Inst{4 - 0} = rz;
+}
+
+// Format< OP[6] | RY[5] | RX[5] | SOP[6] | PCODE[5] | RZ[5] >
+// Instructions(24): addu32, addc32, subu32, subc32, (rsub32), ixh32, ixw32,
+// ixd32, and32, andn32, or32, xor32, nor32, lsl32, lsr32, asr32, rotl32
+// mult32, divu32, divs32, mul.(u/s)32, mula.32.l, mula.u32, mulall.s16.s
+class R_YXZ<bits<6> opcode, bits<6> sop, bits<5> pcode, dag outs, dag ins,
+ string op, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, opcode, outs, ins,
+ !strconcat(op, "\t$rz, $rx, $ry"), pattern> {
+ bits<5> ry;
+ bits<5> rx;
+ bits<5> rz;
+ let Inst{25 - 21} = ry;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = pcode;
+ let Inst{4 - 0} = rz;
+}
+
+// R_YXZ instructions with simple pattern
+// Output: GPR:rz
+// Input: GPR:rx, GPR:ry
+// Asm string: op rz, rx, ry
+// Instructions: addu32, subu32, ixh32, ixw32, ixd32, and32, andn32, or32,
+// xor32, nor32, lsl32, lsr32, asr32, mult32, divu32, divs32
+class R_YXZ_SP_F1<bits<6> sop, bits<5> pcode, PatFrag opnode, string op,
+ bit Commutable = 0> : R_YXZ<0x31, sop, pcode, (outs GPR:$rz),
+ (ins GPR:$rx, GPR:$ry), op, [(set GPR:$rz, (opnode GPR:$rx, GPR:$ry))]> {
+ let isCommutable = Commutable;
+}
+
+// Format< OP[6] | RY[5] | RX[5] | SOP[6] | PCODE[5] | RZ[5] >
+// Instructions:(8) ldr32.b, ldr32.h, ldr32.bs, ldr32.hs, ldr32.w,
+// str32.b, str32.h, str32.w
+class R_YXZ_LDST<bits<6> opcode, bits<6> sop, bits<5> pcode, int no, dag outs,
+ dag ins, string op, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, opcode, outs, ins,
+ op #"\t$rz, ($rx, $ry << " #no #")", pattern> {
+ bits<5> rx;
+ bits<5> ry;
+ bits<5> rz;
+ let Inst{25 - 21} = ry; // ry;
+ let Inst{20 - 16} = rx; // rx;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = pcode; // pcode;
+ let Inst{4 - 0} = rz;
+}
+
+class I_LDR<bits<6> sop, bits<5> pcode, string op, int no>
+ : R_YXZ_LDST<0x34, sop, pcode, no,
+ (outs GPR:$rz), (ins GPR:$rx, GPR:$ry), op, []>;
+
+class I_STR<bits<6> sop, bits<5> pcode, string op, int no>
+ : R_YXZ_LDST<0x35, sop, pcode, no, (outs),
+ (ins GPR:$rz, GPR:$rx, GPR:$ry), op, []>;
+
+// Format< OP[6] | RX[5] | RX[5] | SOP[6] | PCODE[5] | RZ[5] >
+// Instructions:(1) not32
+class R_XXZ<bits<6> sop, bits<5> pcode, dag outs, dag ins, string op,
+ list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x31, outs, ins, !strconcat(op, "\t$rz, $rx"),
+ pattern> {
+ bits<5> rx;
+ bits<5> rz;
+ let Inst{25 - 21} = rx;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = pcode;
+ let Inst{4 - 0} = rz;
+}
+
+// Format< OP[6] | RY[5] | RX[5] | SOP[6] | PCODE[5] | 00000[5] >
+// Instructions:(4) cmpne32, cmphs32, cmplt32, tst32
+class R_YX<bits<6> sop, bits<5> pcode, string op>
+ : CSKY32Inst<AddrModeNone, 0x31, (outs CARRY:$ca),
+ (ins GPR:$rx, GPR:$ry),
+ !strconcat(op, "\t$rx, $ry"), []> {
+ bits<5> ry;
+ bits<5> rx;
+ let Inst{25 - 21} = ry;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = pcode;
+ let Inst{4 - 0} = 0;
+ let isCompare = 1;
+}
+
+// Format< OP[6] | 00000[5] | RX[5] | SOP[6] | PCODE[5] | RZ[5] >
+// Instructions:(12)
+// mov32, xtrb0.32, xtrb1.32, xtrb2.32, xtrb3.32, brev32, revb32
+// revh32, abs32, ff0.32, ff1.32, bgenr32
+class R_XZ<bits<6> sop, bits<5> pcode, string op>
+ : CSKY32Inst<AddrModeNone, 0x31, (outs GPR:$rz), (ins GPR:$rx),
+ !strconcat(op, "\t$rz, $rx"), []> {
+ bits<5> rx;
+ bits<5> rz;
+ let Inst{25 - 21} = 0;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = pcode;
+ let Inst{4 - 0} = rz;
+}
+
+// Format< OP[6] | RZ[5] | RX[5] | SOP[6] | PCODE[5] | 00000[5] >
+// Instructions:(2) movf32, movt32
+class R_ZX<bits<6> sop, bits<5> pcode, string op, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x31, (outs GPR:$rz),
+ (ins CARRY:$ca, GPR:$rx, GPR:$false),
+ !strconcat(op, "\t$rz, $rx"), pattern> {
+ bits<5> rz;
+ bits<5> rx;
+ let Inst{25 - 21} = rz;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = pcode;
+ let Inst{4 - 0} = 0;
+ let Constraints = "$rz = $false";
+ let isSelect = 1;
+}
+
+// Format< OP[6] | 00000[5] | RX[5] | SOP[6] | PCODE[5] | 00000[5] >
+// Instructions:(1) tstnbz32
+class R_X<bits<6> sop, bits<5> pcode, string op, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x31, (outs CARRY:$ca),(ins GPR:$rx),
+ !strconcat(op, "\t$rx"), pattern> {
+ bits<5> rx;
+ let Inst{25 - 21} = 0;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = pcode;
+ let Inst{4 - 0} = 0;
+}
+
+// Format< OP[6] | 00000[5] | 00000[5] | SOP[6] | PCODE[5] | RZ[5] >
+// Instructions:(2) mvc32, mvcv32
+class R_Z_1<bits<6> sop, bits<5> pcode, string op>
+ : CSKY32Inst<AddrModeNone, 0x31, (outs GPR:$rz),
+ (ins CARRY:$ca), !strconcat(op, "\t$rz"), []> {
+ bits<5> rz;
+ let Inst{25 - 21} = 0;
+ let Inst{20 - 16} = 0;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = pcode;
+ let Inst{4 - 0} = rz;
+}
+
+// Format< OP[6] | RZ[5] | 00000[5] | SOP[6] | PCODE[5] | 00000[5] >
+// Instructions:(2) clrf32, clrt32
+class R_Z_2<bits<6> sop, bits<5> pcode, string op, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x31, (outs GPR:$rz),
+ (ins CARRY:$ca, GPR:$false), !strconcat(op, "\t$rz"), []> {
+ bits<5> rz;
+ let Inst{25 - 21} = rz;
+ let Inst{20 - 16} = 0;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = pcode;
+ let Inst{4 - 0} = 0;
+ let Constraints = "$rz = $false";
+}
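The format comments above spell out each field layout. As a concrete illustration, here is a standalone sketch (not from the patch) of how the I_12 class packs its fields, OP[6] | RZ[5] | RX[5] | SOP[4] | IMM[12], into one 32-bit word; the inputs are raw field encodings for illustration only:

    #include <cstdint>

    // 0x39 is the fixed opcode the I_12 class passes to CSKY32Inst; the other
    // fields are masked to their declared widths before being shifted in.
    uint32_t encodeI12(uint8_t rz, uint8_t rx, uint8_t sop, uint16_t imm12) {
      return (0x39u << 26) |
             (uint32_t(rz & 0x1f) << 21) |
             (uint32_t(rx & 0x1f) << 16) |
             (uint32_t(sop & 0xf) << 12) |
             (imm12 & 0xfffu);
    }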
diff --git a/contrib/llvm-project/llvm/lib/Target/CSKY/CSKYInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/CSKY/CSKYInstrInfo.td
new file mode 100644
index 000000000000..7add217530e1
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/CSKY/CSKYInstrInfo.td
@@ -0,0 +1,108 @@
+//===-- CSKYInstrInfo.td - Target Description for CSKY -----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the CSKY instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "CSKYInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// CSKY specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+// TODO: Add CSKY specific DAG Nodes.
+
+//===----------------------------------------------------------------------===//
+// Operand and SDNode transformation definitions.
+//===----------------------------------------------------------------------===//
+
+class oimm<int num> : Operand<i32>,
+ ImmLeaf<i32, "return isUInt<"#num#">(Imm - 1);"> {
+ let EncoderMethod = "getOImmOpValue";
+}
+
+class uimm<int num, int shift = 0> : Operand<i32>,
+ ImmLeaf<i32, "return isShiftedUInt<"#num#", "#shift#">(Imm);"> {
+ let EncoderMethod = "getImmOpValue<"#shift#">";
+}
+
+class simm<int num, int shift = 0> : Operand<i32>,
+ ImmLeaf<i32, "return isShiftedInt<"#num#", "#shift#">(Imm);"> {
+ let EncoderMethod = "getImmOpValue<"#shift#">";
+}
+
+def nimm_XFORM : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(~N->getSExtValue(), SDLoc(N), MVT::i32);
+}]>;
+class nimm<int num> : Operand<i32>,
+ ImmLeaf<i32, "return isUInt<"#num#">(~Imm);", nimm_XFORM> {
+}
+
+
+def oimm12 : oimm<12>;
+
+def nimm12 : nimm<12>;
+
+def uimm5 : uimm<5>;
+def uimm12 : uimm<12>;
+
+//===----------------------------------------------------------------------===//
+// Instruction definitions.
+//===----------------------------------------------------------------------===//
+
+class TriOpFrag<dag res> : PatFrag<(ops node: $LHS, node:$MHS, node:$RHS), res>;
+class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
+class UnOpFrag<dag res> : PatFrag<(ops node:$Src), res>;
+
+def ADDI32 : I_12<0x0, "addi32", add, oimm12>;
+def SUBI32 : I_12<0x1, "subi32", sub, oimm12>;
+def ANDI32 : I_12<0x2, "andi32", and, uimm12>;
+def ANDNI32 : I_12<0x3, "andni32", and, nimm12>;
+def XORI32 : I_12<0x4, "xori32", xor, uimm12>;
+def LSLI32 : I_5_XZ<0x12, 0x1, "lsli32",
+ (outs GPR:$rz), (ins GPR:$rx, uimm5:$imm5),
+ [(set GPR:$rz, (shl GPR:$rx, uimm5:$imm5))]>;
+def LSRI32 : I_5_XZ<0x12, 0x2, "lsri32",
+ (outs GPR:$rz), (ins GPR:$rx, uimm5:$imm5),
+ [(set GPR:$rz, (srl GPR:$rx, uimm5:$imm5))]>;
+def ASRI32 : I_5_XZ<0x12, 0x4, "asri32",
+ (outs GPR:$rz), (ins GPR:$rx, uimm5:$imm5),
+ [(set GPR:$rz, (sra GPR:$rx, uimm5:$imm5))]>;
+
+
+
+def ADDU32 : R_YXZ_SP_F1<0x0, 0x1,
+ BinOpFrag<(add node:$LHS, node:$RHS)>, "addu32", 1>;
+def SUBU32 : R_YXZ_SP_F1<0x0, 0x4,
+ BinOpFrag<(sub node:$LHS, node:$RHS)>, "subu32">;
+def AND32 : R_YXZ_SP_F1<0x8, 0x1,
+ BinOpFrag<(and node:$LHS, node:$RHS)>, "and32", 1>;
+def ANDN32 : R_YXZ_SP_F1<0x8, 0x2,
+ BinOpFrag<(and node:$LHS, (not node:$RHS))>, "andn32">;
+def OR32: R_YXZ_SP_F1<0x9, 0x1,
+ BinOpFrag<(or node:$LHS, node:$RHS)>, "or32", 1>;
+def XOR32 : R_YXZ_SP_F1<0x9, 0x2,
+ BinOpFrag<(xor node:$LHS, node:$RHS)>, "xor32", 1>;
+def NOR32 : R_YXZ_SP_F1<0x9, 0x4,
+ BinOpFrag<(not (or node:$LHS, node:$RHS))>, "nor32", 1>;
+def LSL32 : R_YXZ_SP_F1<0x10, 0x1,
+ BinOpFrag<(shl node:$LHS, node:$RHS)>, "lsl32">;
+def LSR32 : R_YXZ_SP_F1<0x10, 0x2,
+ BinOpFrag<(srl node:$LHS, node:$RHS)>, "lsr32">;
+def ASR32 : R_YXZ_SP_F1<0x10, 0x4,
+ BinOpFrag<(sra node:$LHS, node:$RHS)>, "asr32">;
+def MULT32 : R_YXZ_SP_F1<0x21, 0x1,
+ BinOpFrag<(mul node:$LHS, node:$RHS)>, "mult32", 1>;
+def DIVS32 : R_YXZ_SP_F1<0x20, 0x2,
+ BinOpFrag<(sdiv node:$LHS, node:$RHS)>, "divs32">;
+def DIVU32 : R_YXZ_SP_F1<0x20, 0x1,
+ BinOpFrag<(udiv node:$LHS, node:$RHS)>, "divu32">;
+
+def NOT32 : R_XXZ<0b001001, 0b00100, (outs GPR:$rz), (ins GPR:$rx),
+ "not", [(set GPR:$rz, (not GPR:$rx))]>;
diff --git a/contrib/llvm-project/llvm/lib/Target/CSKY/CSKYRegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/CSKY/CSKYRegisterInfo.td
new file mode 100644
index 000000000000..aef4589a67f2
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/CSKY/CSKYRegisterInfo.td
@@ -0,0 +1,182 @@
+//===-- CSKYRegisterInfo.td - CSKY Register defs -----------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the CSKY registers.
+//===----------------------------------------------------------------------===//
+
+let Namespace = "CSKY" in {
+ class CSKYReg<bits<6> Enc, string n, list<string> alt = []> : Register<n> {
+ let HWEncoding{5 - 0} = Enc;
+ let AltNames = alt;
+ }
+
+ class CSKYFReg32<bits<5> Enc, string n, list<string> alt = []> : Register<n> {
+ let HWEncoding{4 - 0} = Enc;
+ let AltNames = alt;
+ }
+
+ // Because CSKYFReg64 registers have an AsmName and AltNames that alias with their
+ // 32-bit sub-register, CSKYAsmParser will need to coerce a register number
+ // from a CSKYFReg32 to the equivalent CSKYFReg64 when appropriate.
+ def sub32_0 : SubRegIndex<32, 0>;
+ def sub32_32 : SubRegIndex<32, 32>;
+ def sub64_0 : SubRegIndex<64, 0>;
+ def sub64_64 : SubRegIndex<64,64>;
+
+ class CSKYFReg64<CSKYFReg32 subreg> : Register<""> {
+ let HWEncoding{4 - 0} = subreg.HWEncoding{4 - 0};
+ let SubRegs = [subreg];
+ let SubRegIndices = [sub32_0];
+ let AsmName = subreg.AsmName;
+ let AltNames = subreg.AltNames;
+ }
+
+ class CSKYFReg128<CSKYFReg64 subreg> : Register<""> {
+ let HWEncoding{4 - 0} = subreg.HWEncoding{4 - 0};
+ let SubRegs = [subreg];
+ let SubRegIndices = [sub64_0];
+ let AsmName = subreg.AsmName;
+ let AltNames = subreg.AltNames;
+ }
+
+ def ABIRegAltName : RegAltNameIndex;
+} // Namespace = "CSKY"
+
+let RegAltNameIndices = [ABIRegAltName] in {
+ def R0 : CSKYReg<0, "r0", ["a0"]>, DwarfRegNum<[0]>;
+ def R1 : CSKYReg<1, "r1", ["a1"]>, DwarfRegNum<[1]>;
+ def R2 : CSKYReg<2, "r2", ["a2"]>, DwarfRegNum<[2]>;
+ def R3 : CSKYReg<3, "r3", ["a3"]>, DwarfRegNum<[3]>;
+ def R4 : CSKYReg<4, "r4", ["l0"]>, DwarfRegNum<[4]>;
+ def R5 : CSKYReg<5, "r5", ["l1"]>, DwarfRegNum<[5]>;
+ def R6 : CSKYReg<6, "r6", ["l2"]>, DwarfRegNum<[6]>;
+ def R7 : CSKYReg<7, "r7", ["l3"]>, DwarfRegNum<[7]>;
+ def R8 : CSKYReg<8, "r8", ["l4"]>, DwarfRegNum<[8]>;
+ def R9 : CSKYReg<9, "r9", ["l5"]>, DwarfRegNum<[9]>;
+ def R10 : CSKYReg<10, "r10", ["l6"]>, DwarfRegNum<[10]>;
+ def R11 : CSKYReg<11, "r11", ["l7"]>, DwarfRegNum<[11]>;
+ def R12 : CSKYReg<12, "r12", ["t0"]>, DwarfRegNum<[12]>;
+ def R13 : CSKYReg<13, "r13", ["t1"]>, DwarfRegNum<[13]>;
+ def R14 : CSKYReg<14, "r14", ["sp"]>, DwarfRegNum<[14]>;
+ def R15 : CSKYReg<15, "r15", ["lr"]>, DwarfRegNum<[15]>;
+ def R16 : CSKYReg<16, "r16", ["l8"]>, DwarfRegNum<[16]>;
+ def R17 : CSKYReg<17, "r17", ["l9"]>, DwarfRegNum<[17]>;
+ def R18 : CSKYReg<18, "r18", ["t2"]>, DwarfRegNum<[18]>;
+ def R19 : CSKYReg<19, "r19", ["t3"]>, DwarfRegNum<[19]>;
+ def R20 : CSKYReg<20, "r20", ["t4"]>, DwarfRegNum<[20]>;
+ def R21 : CSKYReg<21, "r21", ["t5"]>, DwarfRegNum<[21]>;
+ def R22 : CSKYReg<22, "r22", ["t6"]>, DwarfRegNum<[22]>;
+ def R23 : CSKYReg<23, "r23", ["t7"]>, DwarfRegNum<[23]>;
+ def R24 : CSKYReg<24, "r24", ["t8"]>, DwarfRegNum<[24]>;
+ def R25 : CSKYReg<25, "r25", ["t9"]>, DwarfRegNum<[25]>;
+ def R26 : CSKYReg<26, "r26", ["r26"]>, DwarfRegNum<[26]>;
+ def R27 : CSKYReg<27, "r27", ["r27"]>, DwarfRegNum<[27]>;
+ def R28 : CSKYReg<28, "r28", ["rgb"]>, DwarfRegNum<[28]>;
+ def R29 : CSKYReg<29, "r29", ["rtb"]>, DwarfRegNum<[29]>;
+ def R30 : CSKYReg<30, "r30", ["svbr"]>, DwarfRegNum<[30]>;
+ def R31 : CSKYReg<31, "r31", ["tls"]>, DwarfRegNum<[31]>;
+ def C : CSKYReg<32, "cr0", ["psr"]>;
+
+}
+
+def GPRTuple : RegisterTuples<
+ [sub32_0, sub32_32],
+ [(add (sequence "R%u", 0, 30)), (add (sequence "R%u", 1, 31))],
+ [ "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7",
+ "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+ "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
+ "r24", "r25", "r26", "r27", "r28", "r29", "r30"
+ ]>;
+
+// Floating point registers
+let RegAltNameIndices = [ABIRegAltName] in {
+ def F0_32 : CSKYFReg32<0, "fr0", ["vr0"]>, DwarfRegNum<[32]>;
+ def F1_32 : CSKYFReg32<1, "fr1", ["vr1"]>, DwarfRegNum<[33]>;
+ def F2_32 : CSKYFReg32<2, "fr2", ["vr2"]>, DwarfRegNum<[34]>;
+ def F3_32 : CSKYFReg32<3, "fr3", ["vr3"]>, DwarfRegNum<[35]>;
+ def F4_32 : CSKYFReg32<4, "fr4", ["vr4"]>, DwarfRegNum<[36]>;
+ def F5_32 : CSKYFReg32<5, "fr5", ["vr5"]>, DwarfRegNum<[37]>;
+ def F6_32 : CSKYFReg32<6, "fr6", ["vr6"]>, DwarfRegNum<[38]>;
+ def F7_32 : CSKYFReg32<7, "fr7", ["vr7"]>, DwarfRegNum<[39]>;
+ def F8_32 : CSKYFReg32<8, "fr8", ["vr8"]>, DwarfRegNum<[40]>;
+ def F9_32 : CSKYFReg32<9, "fr9", ["vr9"]>, DwarfRegNum<[41]>;
+ def F10_32 : CSKYFReg32<10, "fr10", ["vr10"]>, DwarfRegNum<[42]>;
+ def F11_32 : CSKYFReg32<11, "fr11", ["vr11"]>, DwarfRegNum<[43]>;
+ def F12_32 : CSKYFReg32<12, "fr12", ["vr12"]>, DwarfRegNum<[44]>;
+ def F13_32 : CSKYFReg32<13, "fr13", ["vr13"]>, DwarfRegNum<[45]>;
+ def F14_32 : CSKYFReg32<14, "fr14", ["vr14"]>, DwarfRegNum<[46]>;
+ def F15_32 : CSKYFReg32<15, "fr15", ["vr15"]>, DwarfRegNum<[47]>;
+ def F16_32 : CSKYFReg32<16, "fr16", ["vr16"]>, DwarfRegNum<[48]>;
+ def F17_32 : CSKYFReg32<17, "fr17", ["vr17"]>, DwarfRegNum<[49]>;
+ def F18_32 : CSKYFReg32<18, "fr18", ["vr18"]>, DwarfRegNum<[50]>;
+ def F19_32 : CSKYFReg32<19, "fr19", ["vr19"]>, DwarfRegNum<[51]>;
+ def F20_32 : CSKYFReg32<20, "fr20", ["vr20"]>, DwarfRegNum<[52]>;
+ def F21_32 : CSKYFReg32<21, "fr21", ["vr21"]>, DwarfRegNum<[53]>;
+ def F22_32 : CSKYFReg32<22, "fr22", ["vr22"]>, DwarfRegNum<[54]>;
+ def F23_32 : CSKYFReg32<23, "fr23", ["vr23"]>, DwarfRegNum<[55]>;
+ def F24_32 : CSKYFReg32<24, "fr24", ["vr24"]>, DwarfRegNum<[56]>;
+ def F25_32 : CSKYFReg32<25, "fr25", ["vr25"]>, DwarfRegNum<[57]>;
+ def F26_32 : CSKYFReg32<26, "fr26", ["vr26"]>, DwarfRegNum<[58]>;
+ def F27_32 : CSKYFReg32<27, "fr27", ["vr27"]>, DwarfRegNum<[59]>;
+ def F28_32 : CSKYFReg32<28, "fr28", ["vr28"]>, DwarfRegNum<[60]>;
+ def F29_32 : CSKYFReg32<29, "fr29", ["vr29"]>, DwarfRegNum<[61]>;
+ def F30_32 : CSKYFReg32<30, "fr30", ["vr30"]>, DwarfRegNum<[62]>;
+ def F31_32 : CSKYFReg32<31, "fr31", ["vr31"]>, DwarfRegNum<[63]>;
+
+ foreach Index = 0 - 31 in {
+ def F#Index#_64 : CSKYFReg64<!cast<CSKYFReg32>("F"#Index#"_32")>,
+ DwarfRegNum<[!add(Index, 32)]>;
+
+ def F#Index#_128 : CSKYFReg128<!cast<CSKYFReg64>("F"#Index#"_64")>,
+ DwarfRegNum<[!add(Index, 32)]>;
+ }
+}
+
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the CSKY register class.
+//===----------------------------------------------------------------------===//
+
+// The order of registers represents the preferred allocation sequence.
+// Registers are listed in the order caller-save, callee-save, specials.
+def GPR : RegisterClass<"CSKY", [i32], 32,
+ (add (sequence "R%u", 0, 3), (sequence "R%u", 12, 13),
+ (sequence "R%u", 18, 25), R15, (sequence "R%u", 4, 11),
+ (sequence "R%u", 16, 17), (sequence "R%u", 26, 27), R28,
+ (sequence "R%u", 29, 30), R14, R31)> {
+ let Size = 32;
+}
+
+def GPRPair : RegisterClass<"CSKY", [untyped], 32, (add GPRTuple)> {
+ let Size = 64;
+}
+
+def CARRY : RegisterClass<"CSKY", [i32], 32, (add C)> {
+ let Size = 32;
+ let CopyCost = -1;
+}
+
+// The order of registers represents the preferred allocation sequence.
+// Registers are listed in the order caller-save, callee-save, specials.
+def FPR32 : RegisterClass<"CSKY", [f32], 32,
+ (add (sequence "F%u_32", 0, 31))>;
+def sFPR32 : RegisterClass<"CSKY", [f32], 32,
+ (add (sequence "F%u_32", 0, 15))>;
+
+def FPR64 : RegisterClass<"CSKY", [f64], 64,
+ (add (sequence "F%u_64", 0, 31))>;
+def sFPR64 : RegisterClass<"CSKY", [f64], 64,
+ (add (sequence "F%u_64", 0, 15))>;
+
+def FPR128 : RegisterClass<"CSKY",
+ [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], 128,
+ (add (sequence "F%u_128", 0, 31))>;
+def sFPR128 : RegisterClass<"CSKY",
+ [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], 128,
+ (add (sequence "F%u_128", 0, 15))>;
diff --git a/contrib/llvm-project/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp
new file mode 100644
index 000000000000..1c13796e84b6
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp
@@ -0,0 +1,68 @@
+//===--- CSKYTargetMachine.cpp - Define TargetMachine for CSKY ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about CSKY target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CSKYTargetMachine.h"
+#include "TargetInfo/CSKYTargetInfo.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeCSKYTarget() {
+ RegisterTargetMachine<CSKYTargetMachine> X(getTheCSKYTarget());
+}
+
+static std::string computeDataLayout(const Triple &TT) {
+ std::string Ret;
+
+ // Only support little endian for now.
+ // TODO: Add support for big endian.
+ Ret += "e";
+
+ // CSKY is always a 32-bit target, and the CSKYv2 ABI is preferred for now.
+ // The stack is 4-byte aligned, with ELF mangling only.
+ Ret += "-m:e-S32-p:32:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:32"
+ "-v128:32:32-a:0:32-Fi32-n32";
+
+ return Ret;
+}
+
+CSKYTargetMachine::CSKYTargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ Optional<CodeModel::Model> CM,
+ CodeGenOpt::Level OL, bool JIT)
+ : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
+ RM.getValueOr(Reloc::Static),
+ getEffectiveCodeModel(CM, CodeModel::Small), OL),
+ TLOF(std::make_unique<TargetLoweringObjectFileELF>()) {
+ initAsmInfo();
+}
+
+namespace {
+class CSKYPassConfig : public TargetPassConfig {
+public:
+ CSKYPassConfig(CSKYTargetMachine &TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {}
+
+ CSKYTargetMachine &getCSKYTargetMachine() const {
+ return getTM<CSKYTargetMachine>();
+ }
+};
+
+} // namespace
+
+TargetPassConfig *CSKYTargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new CSKYPassConfig(*this, PM);
+}
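The data layout string built in computeDataLayout() above fixes little-endian byte order, 32-bit pointers, a 4-byte aligned stack, and 32-bit native integers. A quick standalone spot-check of the first two properties, assuming only the public llvm::DataLayout string constructor and its accessors:

    #include "llvm/IR/DataLayout.h"
    #include <cassert>

    // Constructs a DataLayout from the same string the target produces and
    // checks two of the properties it encodes.
    void checkCSKYDataLayout() {
      llvm::DataLayout DL("e-m:e-S32-p:32:32-i32:32:32-i64:32:32-f32:32:32-"
                          "f64:32:32-v64:32:32-v128:32:32-a:0:32-Fi32-n32");
      assert(DL.isLittleEndian());             // "e"
      assert(DL.getPointerSizeInBits() == 32); // "p:32:32"
    }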
diff --git a/contrib/llvm-project/llvm/lib/Target/CSKY/CSKYTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/CSKY/CSKYTargetMachine.h
new file mode 100644
index 000000000000..d50e3877b550
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/CSKY/CSKYTargetMachine.h
@@ -0,0 +1,38 @@
+//===--- CSKYTargetMachine.h - Define TargetMachine for CSKY ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the CSKY specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_CSKY_CSKYTARGETMACHINE_H
+#define LLVM_LIB_TARGET_CSKY_CSKYTARGETMACHINE_H
+
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class CSKYTargetMachine : public LLVMTargetMachine {
+ std::unique_ptr<TargetLoweringObjectFile> TLOF;
+
+public:
+ CSKYTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+ CodeGenOpt::Level OL, bool JIT);
+
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+ TargetLoweringObjectFile *getObjFileLowering() const override {
+ return TLOF.get();
+ }
+};
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
new file mode 100644
index 000000000000..e30123d64755
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
@@ -0,0 +1,69 @@
+//===-- CSKYAsmBackend.cpp - CSKY Assembler Backend -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CSKYAsmBackend.h"
+#include "MCTargetDesc/CSKYMCTargetDesc.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "csky-asmbackend"
+
+using namespace llvm;
+
+std::unique_ptr<MCObjectTargetWriter>
+CSKYAsmBackend::createObjectTargetWriter() const {
+ return createCSKYELFObjectWriter();
+}
+
+unsigned int CSKYAsmBackend::getNumFixupKinds() const { return 1; }
+
+void CSKYAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data, uint64_t Value,
+ bool IsResolved,
+ const MCSubtargetInfo *STI) const {
+ return;
+}
+
+bool CSKYAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const {
+ return false;
+}
+
+void CSKYAsmBackend::relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
+ llvm_unreachable("CSKYAsmBackend::relaxInstruction() unimplemented");
+}
+
+bool CSKYAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
+ if (Count % 2)
+ return false;
+
+ // MOV32 r0, r0
+ while (Count >= 4) {
+ OS.write("\xc4\x00\x48\x20", 4);
+ Count -= 4;
+ }
+ // MOV16 r0, r0
+ if (Count)
+ OS.write("\x6c\x03", 2);
+
+ return true;
+}
+
+MCAsmBackend *llvm::createCSKYAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
+ const MCTargetOptions &Options) {
+ return new CSKYAsmBackend(STI, Options);
+}
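writeNopData() above pads with 4-byte "MOV32 r0, r0" NOPs plus a single 2-byte "MOV16 r0, r0" for a trailing half-word, and rejects odd byte counts. A few worked paddings, for illustration only; the byte values are the little-endian encodings already hard-coded in the function:

    // Count == 2  -> 6c 03                                (one MOV16 r0, r0)
    // Count == 4  -> c4 00 48 20                          (one MOV32 r0, r0)
    // Count == 10 -> c4 00 48 20 c4 00 48 20 6c 03        (two MOV32, one MOV16)
    // Count == 5  -> writeNopData() returns false         (odd count)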
diff --git a/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h b/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
new file mode 100644
index 000000000000..b4cba4264e03
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
@@ -0,0 +1,39 @@
+//===-- CSKYAsmBackend.h - CSKY Assembler Backend -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYASMBACKEND_H
+#define LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYASMBACKEND_H
+
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCTargetOptions.h"
+
+namespace llvm {
+
+class CSKYAsmBackend : public MCAsmBackend {
+
+public:
+ CSKYAsmBackend(const MCSubtargetInfo &STI, const MCTargetOptions &OP)
+ : MCAsmBackend(support::little) {}
+
+ unsigned int getNumFixupKinds() const override;
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override;
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const override;
+ void relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override;
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYASMBACKEND_H
diff --git a/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp
new file mode 100644
index 000000000000..163632632290
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp
@@ -0,0 +1,45 @@
+//===-- CSKYELFObjectWriter.cpp - CSKY ELF Writer -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CSKYMCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCObjectWriter.h"
+
+#define DEBUG_TYPE "csky-elf-object-writer"
+
+using namespace llvm;
+
+namespace {
+
+class CSKYELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+ CSKYELFObjectWriter(uint8_t OSABI = 0)
+ : MCELFObjectTargetWriter(false, OSABI, ELF::EM_CSKY, true){};
+ ~CSKYELFObjectWriter() {}
+
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
+};
+
+} // namespace
+
+unsigned CSKYELFObjectWriter::getRelocType(MCContext &Ctx,
+ const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
+ // Determine the type of the relocation.
+ switch ((unsigned)Fixup.getKind()) {
+ default:
+ llvm_unreachable("invalid fixup kind!");
+ }
+}
+
+std::unique_ptr<MCObjectTargetWriter> llvm::createCSKYELFObjectWriter() {
+ return std::make_unique<CSKYELFObjectWriter>();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCAsmInfo.cpp b/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCAsmInfo.cpp
new file mode 100644
index 000000000000..668247bbbd87
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCAsmInfo.cpp
@@ -0,0 +1,25 @@
+//===-- CSKYMCAsmInfo.cpp - CSKY Asm properties ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the CSKYMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CSKYMCAsmInfo.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+
+void CSKYMCAsmInfo::anchor() {}
+
+CSKYMCAsmInfo::CSKYMCAsmInfo(const Triple &TargetTriple) {
+ AlignmentIsInBytes = false;
+ SupportsDebugInformation = true;
+ CommentString = "#";
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCAsmInfo.h b/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCAsmInfo.h
new file mode 100644
index 000000000000..3e0609f19531
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCAsmInfo.h
@@ -0,0 +1,29 @@
+//===-- CSKYMCAsmInfo.h - CSKY Asm Info ------------------------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the CSKYMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYMCASMINFO_H
+#define LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+class CSKYMCAsmInfo : public MCAsmInfoELF {
+ void anchor() override;
+
+public:
+ explicit CSKYMCAsmInfo(const Triple &TargetTriple);
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYMCASMINFO_H
diff --git a/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.cpp
new file mode 100644
index 000000000000..ed2b0e77b81a
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.cpp
@@ -0,0 +1,71 @@
+//===-- CSKYMCCodeEmitter.cpp - CSKY Code Emitter interface ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CSKYMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CSKYMCCodeEmitter.h"
+#include "MCTargetDesc/CSKYMCTargetDesc.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/EndianStream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "csky-mccode-emitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
+
+unsigned CSKYMCCodeEmitter::getOImmOpValue(const MCInst &MI, unsigned Idx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(Idx);
+ assert(MO.isImm() && "Unexpected MO type.");
+ return MO.getImm() - 1;
+}
+
+void CSKYMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCInstrDesc &Desc = MII.get(MI.getOpcode());
+ unsigned Size = Desc.getSize();
+ uint32_t Bin = getBinaryCodeForInstr(MI, Fixups, STI);
+
+ uint16_t LO16 = static_cast<uint16_t>(Bin);
+ uint16_t HI16 = static_cast<uint16_t>(Bin >> 16);
+
+ if (Size == 4)
+ support::endian::write<uint16_t>(OS, HI16, support::little);
+
+ support::endian::write<uint16_t>(OS, LO16, support::little);
+ ++MCNumEmitted; // Keep track of the # of mi's emitted.
+}
+
+unsigned
+CSKYMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ if (MO.isReg())
+ return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+ if (MO.isImm())
+ return static_cast<unsigned>(MO.getImm());
+
+ llvm_unreachable("Unhandled expression!");
+ return 0;
+}
+
+MCCodeEmitter *llvm::createCSKYMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new CSKYMCCodeEmitter(Ctx, MCII);
+}
+
+#include "CSKYGenMCCodeEmitter.inc"
diff --git a/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.h b/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.h
new file mode 100644
index 000000000000..c850a4bab745
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.h
@@ -0,0 +1,61 @@
+//===-- CSKYMCCodeEmitter.h - CSKY Code Emitter interface -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the CSKYMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYMCCODEEMITTER_H
+#define LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYMCCODEEMITTER_H
+
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+
+namespace llvm {
+
+class CSKYMCCodeEmitter : public MCCodeEmitter {
+ MCContext &Ctx;
+ const MCInstrInfo &MII;
+
+public:
+ CSKYMCCodeEmitter(MCContext &Ctx, const MCInstrInfo &MII)
+ : Ctx(Ctx), MII(MII) {}
+
+ ~CSKYMCCodeEmitter() {}
+
+ void encodeInstruction(const MCInst &Inst, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+
+ // Generated by tablegen.
+ uint64_t getBinaryCodeForInstr(const MCInst &MI,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // Default encoding method used by tablegen.
+ unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ template <int shift = 0>
+ unsigned getImmOpValue(const MCInst &MI, unsigned Idx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(Idx);
+ assert(MO.isImm() && "Unexpected MO type.");
+ return (MO.getImm() >> shift);
+ }
+
+ unsigned getOImmOpValue(const MCInst &MI, unsigned Idx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYMCCODEEMITTER_H
diff --git a/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp
new file mode 100644
index 000000000000..876000a37004
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp
@@ -0,0 +1,62 @@
+//===-- CSKYMCTargetDesc.cpp - CSKY Target Descriptions -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file provides CSKY specific target descriptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "CSKYMCTargetDesc.h"
+#include "CSKYAsmBackend.h"
+#include "CSKYMCAsmInfo.h"
+#include "CSKYMCCodeEmitter.h"
+#include "TargetInfo/CSKYTargetInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "CSKYGenInstrInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "CSKYGenRegisterInfo.inc"
+
+using namespace llvm;
+
+static MCAsmInfo *createCSKYMCAsmInfo(const MCRegisterInfo &MRI,
+ const Triple &TT,
+ const MCTargetOptions &Options) {
+ MCAsmInfo *MAI = new CSKYMCAsmInfo(TT);
+
+ // Initial state of the frame pointer is SP.
+ unsigned Reg = MRI.getDwarfRegNum(CSKY::R14, true);
+ MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(nullptr, Reg, 0);
+ MAI->addInitialFrameState(Inst);
+ return MAI;
+}
+
+static MCInstrInfo *createCSKYMCInstrInfo() {
+ MCInstrInfo *Info = new MCInstrInfo();
+ InitCSKYMCInstrInfo(Info);
+ return Info;
+}
+
+static MCRegisterInfo *createCSKYMCRegisterInfo(const Triple &TT) {
+ MCRegisterInfo *Info = new MCRegisterInfo();
+ InitCSKYMCRegisterInfo(Info, CSKY::R15);
+ return Info;
+}
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeCSKYTargetMC() {
+ auto &CSKYTarget = getTheCSKYTarget();
+ TargetRegistry::RegisterMCAsmBackend(CSKYTarget, createCSKYAsmBackend);
+ TargetRegistry::RegisterMCAsmInfo(CSKYTarget, createCSKYMCAsmInfo);
+ TargetRegistry::RegisterMCInstrInfo(CSKYTarget, createCSKYMCInstrInfo);
+ TargetRegistry::RegisterMCRegInfo(CSKYTarget, createCSKYMCRegisterInfo);
+ TargetRegistry::RegisterMCCodeEmitter(CSKYTarget, createCSKYMCCodeEmitter);
+}
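
With the TargetInfo and TargetMC initializers in place, a client can look the CSKY target up by triple and build the MC components through the TargetRegistry. A hedged sketch of that flow, assuming LLVM 12-era headers and factory signatures, with error handling trimmed and the triple string chosen for illustration:

#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/Support/TargetRegistry.h"
#include <memory>
#include <string>

using namespace llvm;

// Provided by the CSKY target when it is linked into the tool.
extern "C" void LLVMInitializeCSKYTargetInfo();
extern "C" void LLVMInitializeCSKYTargetMC();

int main() {
  LLVMInitializeCSKYTargetInfo();
  LLVMInitializeCSKYTargetMC();

  std::string Err;
  const std::string TripleName = "csky-unknown-linux"; // assumption
  const Target *T = TargetRegistry::lookupTarget(TripleName, Err);
  if (!T)
    return 1; // Err holds the lookup failure message.

  // These go through the factories registered in LLVMInitializeCSKYTargetMC().
  std::unique_ptr<MCRegisterInfo> MRI(T->createMCRegInfo(TripleName));
  MCTargetOptions Options;
  std::unique_ptr<MCAsmInfo> MAI(
      T->createMCAsmInfo(*MRI, TripleName, Options));
  std::unique_ptr<MCInstrInfo> MII(T->createMCInstrInfo());

  return MRI && MAI && MII ? 0 : 1;
}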
diff --git a/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h b/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h
new file mode 100644
index 000000000000..da8a3b63a2f9
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h
@@ -0,0 +1,48 @@
+//===-- CSKYMCTargetDesc.h - CSKY Target Descriptions -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides CSKY specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYMCTARGETDESC_H
+#define LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYMCTARGETDESC_H
+
+#include "llvm/MC/MCTargetOptions.h"
+#include <memory>
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCRegisterInfo;
+class MCObjectTargetWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class Target;
+class Triple;
+
+std::unique_ptr<MCObjectTargetWriter> createCSKYELFObjectWriter();
+
+MCAsmBackend *createCSKYAsmBackend(const Target &T, const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
+ const MCTargetOptions &Options);
+
+MCCodeEmitter *createCSKYMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+} // namespace llvm
+
+#define GET_REGINFO_ENUM
+#include "CSKYGenRegisterInfo.inc"
+
+#define GET_INSTRINFO_ENUM
+#include "CSKYGenInstrInfo.inc"
+
+#endif // LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYMCTARGETDESC_H
diff --git a/contrib/llvm-project/llvm/lib/Target/CSKY/TargetInfo/CSKYTargetInfo.cpp b/contrib/llvm-project/llvm/lib/Target/CSKY/TargetInfo/CSKYTargetInfo.cpp
new file mode 100644
index 000000000000..1af2e672ff42
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/CSKY/TargetInfo/CSKYTargetInfo.cpp
@@ -0,0 +1,20 @@
+//===-- CSKYTargetInfo.cpp - CSKY Target Implementation -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TargetInfo/CSKYTargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target &llvm::getTheCSKYTarget() {
+ static Target TheCSKYTarget;
+ return TheCSKYTarget;
+}
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeCSKYTargetInfo() {
+ RegisterTarget<Triple::csky> X(getTheCSKYTarget(), "csky", "C-SKY", "CSKY");
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/CSKY/TargetInfo/CSKYTargetInfo.h b/contrib/llvm-project/llvm/lib/Target/CSKY/TargetInfo/CSKYTargetInfo.h
new file mode 100644
index 000000000000..c317c5401f03
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/CSKY/TargetInfo/CSKYTargetInfo.h
@@ -0,0 +1,20 @@
+//===-- CSKYTargetInfo.h - CSKY Target Implementation ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_CSKY_TARGETINFO_CSKYTARGETINFO_H
+#define LLVM_LIB_TARGET_CSKY_TARGETINFO_CSKYTARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheCSKYTarget();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_CSKY_TARGETINFO_CSKYTARGETINFO_H
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index 1e7862c36ea0..b6763fd9aef0 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -641,7 +641,7 @@ bool HexagonAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return true;
return finishBundle(IDLoc, Out);
}
- MCInst *SubInst = new (getParser().getContext()) MCInst;
+ MCInst *SubInst = getParser().getContext().createMCInst();
if (matchOneInstruction(*SubInst, IDLoc, Operands, ErrorInfo,
MatchingInlineAsm)) {
if (InBrackets)
@@ -945,7 +945,7 @@ bool HexagonAsmParser::isLabel(AsmToken &Token) {
StringRef Raw(String.data(), Third.getString().data() - String.data() +
Third.getString().size());
std::string Collapsed = std::string(Raw);
- Collapsed.erase(llvm::remove_if(Collapsed, isSpace), Collapsed.end());
+ llvm::erase_if(Collapsed, isSpace);
StringRef Whole = Collapsed;
std::pair<StringRef, StringRef> DotSplit = Whole.split('.');
if (!matchRegister(DotSplit.first.lower()))
@@ -997,7 +997,7 @@ OperandMatchResultTy HexagonAsmParser::tryParseRegister(unsigned &RegNo,
NeededWorkaround = NeededWorkaround || (Again && !(Contigious && Type));
}
std::string Collapsed = std::string(RawString);
- Collapsed.erase(llvm::remove_if(Collapsed, isSpace), Collapsed.end());
+ llvm::erase_if(Collapsed, isSpace);
StringRef FullString = Collapsed;
std::pair<StringRef, StringRef> DotSplit = FullString.split('.');
unsigned DotReg = matchRegister(DotSplit.first.lower());
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/BitTracker.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/BitTracker.cpp
index 7ef23ef35a74..8bced3cec082 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/BitTracker.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/BitTracker.cpp
@@ -198,10 +198,10 @@ BitTracker::~BitTracker() {
// the actual bits of the "self" register.
// While this cannot happen in the current implementation, I'm not sure
// if this should be ruled out in the future.
-bool BT::RegisterCell::meet(const RegisterCell &RC, unsigned SelfR) {
+bool BT::RegisterCell::meet(const RegisterCell &RC, Register SelfR) {
// An example when "meet" can be invoked with SelfR == 0 is a phi node
// with a physical register as an operand.
- assert(SelfR == 0 || Register::isVirtualRegister(SelfR));
+ assert(SelfR == 0 || SelfR.isVirtual());
bool Changed = false;
for (uint16_t i = 0, n = Bits.size(); i < n; ++i) {
const BitValue &RCV = RC[i];
@@ -335,13 +335,13 @@ uint16_t BT::MachineEvaluator::getRegBitWidth(const RegisterRef &RR) const {
// 1. find a physical register PhysR from the same class as RR.Reg,
// 2. find a physical register PhysS that corresponds to PhysR:RR.Sub,
// 3. find a register class that contains PhysS.
- if (Register::isVirtualRegister(RR.Reg)) {
+ if (RR.Reg.isVirtual()) {
const auto &VC = composeWithSubRegIndex(*MRI.getRegClass(RR.Reg), RR.Sub);
return TRI.getRegSizeInBits(VC);
}
- assert(Register::isPhysicalRegister(RR.Reg));
- Register PhysR =
- (RR.Sub == 0) ? Register(RR.Reg) : TRI.getSubReg(RR.Reg, RR.Sub);
+ assert(RR.Reg.isPhysical());
+ MCRegister PhysR =
+ (RR.Sub == 0) ? RR.Reg.asMCReg() : TRI.getSubReg(RR.Reg, RR.Sub);
return getPhysRegBitWidth(PhysR);
}
@@ -351,10 +351,10 @@ BT::RegisterCell BT::MachineEvaluator::getCell(const RegisterRef &RR,
// Physical registers are assumed to be present in the map with an unknown
// value. Don't actually insert anything in the map, just return the cell.
- if (Register::isPhysicalRegister(RR.Reg))
+ if (RR.Reg.isPhysical())
return RegisterCell::self(0, BW);
- assert(Register::isVirtualRegister(RR.Reg));
+ assert(RR.Reg.isVirtual());
// For virtual registers that belong to a class that is not tracked,
// generate an "unknown" value as well.
const TargetRegisterClass *C = MRI.getRegClass(RR.Reg);
@@ -377,7 +377,7 @@ void BT::MachineEvaluator::putCell(const RegisterRef &RR, RegisterCell RC,
// While updating the cell map can be done in a meaningful way for
// a part of a register, it makes little sense to implement it as the
// SSA representation would never contain such "partial definitions".
- if (!Register::isVirtualRegister(RR.Reg))
+ if (!RR.Reg.isVirtual())
return;
assert(RR.Sub == 0 && "Unexpected sub-register in definition");
// Eliminate all ref-to-reg-0 bit values: replace them with "self".
@@ -704,15 +704,14 @@ BT::RegisterCell BT::MachineEvaluator::eINS(const RegisterCell &A1,
return Res;
}
-BT::BitMask BT::MachineEvaluator::mask(unsigned Reg, unsigned Sub) const {
+BT::BitMask BT::MachineEvaluator::mask(Register Reg, unsigned Sub) const {
assert(Sub == 0 && "Generic BitTracker::mask called for Sub != 0");
uint16_t W = getRegBitWidth(Reg);
assert(W > 0 && "Cannot generate mask for empty register");
return BitMask(0, W-1);
}
-uint16_t BT::MachineEvaluator::getPhysRegBitWidth(unsigned Reg) const {
- assert(Register::isPhysicalRegister(Reg));
+uint16_t BT::MachineEvaluator::getPhysRegBitWidth(MCRegister Reg) const {
const TargetRegisterClass &PC = *TRI.getMinimalPhysRegClass(Reg);
return TRI.getRegSizeInBits(PC);
}
@@ -875,7 +874,7 @@ void BT::visitNonBranch(const MachineInstr &MI) {
continue;
RegisterRef RD(MO);
assert(RD.Sub == 0 && "Unexpected sub-register in definition");
- if (!Register::isVirtualRegister(RD.Reg))
+ if (!RD.Reg.isVirtual())
continue;
bool Changed = false;
@@ -980,7 +979,7 @@ void BT::visitBranchesFrom(const MachineInstr &BI) {
FlowQ.push(CFGEdge(ThisN, TB->getNumber()));
}
-void BT::visitUsesOf(unsigned Reg) {
+void BT::visitUsesOf(Register Reg) {
if (Trace)
dbgs() << "queuing uses of modified reg " << printReg(Reg, &ME.TRI)
<< " cell: " << ME.getCell(Reg, Map) << '\n';
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/BitTracker.h b/contrib/llvm-project/llvm/lib/Target/Hexagon/BitTracker.h
index efb21805b801..08c0359a4b7f 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/BitTracker.h
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/BitTracker.h
@@ -62,7 +62,7 @@ private:
void visitPHI(const MachineInstr &PI);
void visitNonBranch(const MachineInstr &MI);
void visitBranchesFrom(const MachineInstr &BI);
- void visitUsesOf(unsigned Reg);
+ void visitUsesOf(Register Reg);
using CFGEdge = std::pair<int, int>;
using EdgeSetType = std::set<CFGEdge>;
@@ -131,19 +131,20 @@ struct BitTracker::BitRef {
return Reg == BR.Reg && (Reg == 0 || Pos == BR.Pos);
}
- unsigned Reg;
+ Register Reg;
uint16_t Pos;
};
// Abstraction of a register reference in MachineOperand. It contains the
// register number and the subregister index.
+// FIXME: Consolidate duplicate definitions of RegisterRef
struct BitTracker::RegisterRef {
- RegisterRef(unsigned R = 0, unsigned S = 0)
- : Reg(R), Sub(S) {}
+ RegisterRef(Register R = 0, unsigned S = 0) : Reg(R), Sub(S) {}
RegisterRef(const MachineOperand &MO)
: Reg(MO.getReg()), Sub(MO.getSubReg()) {}
- unsigned Reg, Sub;
+ Register Reg;
+ unsigned Sub;
};
// Value that a single bit can take. This is outside of the context of
@@ -312,7 +313,7 @@ struct BitTracker::RegisterCell {
return Bits[BitN];
}
- bool meet(const RegisterCell &RC, unsigned SelfR);
+ bool meet(const RegisterCell &RC, Register SelfR);
RegisterCell &insert(const RegisterCell &RC, const BitMask &M);
RegisterCell extract(const BitMask &M) const; // Returns a new cell.
RegisterCell &rol(uint16_t Sh); // Rotate left.
@@ -461,7 +462,7 @@ struct BitTracker::MachineEvaluator {
// Sub == 0, in this case, the function should return a mask that spans
// the entire register Reg (which is what the default implementation
// does).
- virtual BitMask mask(unsigned Reg, unsigned Sub) const;
+ virtual BitMask mask(Register Reg, unsigned Sub) const;
// Indicate whether a given register class should be tracked.
virtual bool track(const TargetRegisterClass *RC) const { return true; }
// Evaluate a non-branching machine instruction, given the cell map with
@@ -484,7 +485,7 @@ struct BitTracker::MachineEvaluator {
llvm_unreachable("Unimplemented composeWithSubRegIndex");
}
// Return the size in bits of the physical register Reg.
- virtual uint16_t getPhysRegBitWidth(unsigned Reg) const;
+ virtual uint16_t getPhysRegBitWidth(MCRegister Reg) const;
const TargetRegisterInfo &TRI;
MachineRegisterInfo &MRI;
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index f3a87ef20a60..aeaeac65de96 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -175,7 +175,7 @@ DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
while (Result == Success && !Complete) {
if (Bytes.size() < HEXAGON_INSTR_SIZE)
return MCDisassembler::Fail;
- MCInst *Inst = new (getContext()) MCInst;
+ MCInst *Inst = getContext().createMCInst();
Result = getSingleInstruction(*Inst, MI, Bytes, Address, cs, Complete);
MI.addOperand(MCOperand::createInst(Inst));
Size += HEXAGON_INSTR_SIZE;
@@ -384,8 +384,8 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB,
break;
}
MI.setOpcode(Hexagon::DuplexIClass0 + duplexIClass);
- MCInst *MILow = new (getContext()) MCInst;
- MCInst *MIHigh = new (getContext()) MCInst;
+ MCInst *MILow = getContext().createMCInst();
+ MCInst *MIHigh = getContext().createMCInst();
auto TmpExtender = CurrentExtender;
CurrentExtender =
nullptr; // constant extenders in duplex must always be in slot 1
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/Hexagon.h b/contrib/llvm-project/llvm/lib/Target/Hexagon/Hexagon.h
index 58dadf012da5..98e5710d4fc1 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/Hexagon.h
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/Hexagon.h
@@ -14,12 +14,9 @@
#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGON_H
#define LLVM_LIB_TARGET_HEXAGON_HEXAGON_H
-#include "MCTargetDesc/HexagonMCTargetDesc.h"
-#include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/Target/TargetMachine.h"
-
namespace llvm {
class HexagonTargetMachine;
+ class ImmutablePass;
/// Creates a Hexagon-specific Target Transformation Info pass.
ImmutablePass *createHexagonTargetTransformInfoPass(const HexagonTargetMachine *TM);
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 49edb0d99492..54aa14849dd9 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -206,14 +206,14 @@ namespace {
uint16_t W);
static bool getConst(const BitTracker::RegisterCell &RC, uint16_t B,
uint16_t W, uint64_t &U);
- static bool replaceReg(unsigned OldR, unsigned NewR,
- MachineRegisterInfo &MRI);
+ static bool replaceReg(Register OldR, Register NewR,
+ MachineRegisterInfo &MRI);
static bool getSubregMask(const BitTracker::RegisterRef &RR,
unsigned &Begin, unsigned &Width, MachineRegisterInfo &MRI);
- static bool replaceRegWithSub(unsigned OldR, unsigned NewR,
- unsigned NewSR, MachineRegisterInfo &MRI);
- static bool replaceSubWithSub(unsigned OldR, unsigned OldSR,
- unsigned NewR, unsigned NewSR, MachineRegisterInfo &MRI);
+ static bool replaceRegWithSub(Register OldR, Register NewR, unsigned NewSR,
+ MachineRegisterInfo &MRI);
+ static bool replaceSubWithSub(Register OldR, unsigned OldSR, Register NewR,
+ unsigned NewSR, MachineRegisterInfo &MRI);
static bool parseRegSequence(const MachineInstr &I,
BitTracker::RegisterRef &SL, BitTracker::RegisterRef &SH,
const MachineRegisterInfo &MRI);
@@ -292,7 +292,7 @@ void HexagonBitSimplify::getInstrDefs(const MachineInstr &MI,
if (!Op.isReg() || !Op.isDef())
continue;
Register R = Op.getReg();
- if (!Register::isVirtualRegister(R))
+ if (!R.isVirtual())
continue;
Defs.insert(R);
}
@@ -304,7 +304,7 @@ void HexagonBitSimplify::getInstrUses(const MachineInstr &MI,
if (!Op.isReg() || !Op.isUse())
continue;
Register R = Op.getReg();
- if (!Register::isVirtualRegister(R))
+ if (!R.isVirtual())
continue;
Uses.insert(R);
}
@@ -352,9 +352,9 @@ bool HexagonBitSimplify::getConst(const BitTracker::RegisterCell &RC,
return true;
}
-bool HexagonBitSimplify::replaceReg(unsigned OldR, unsigned NewR,
- MachineRegisterInfo &MRI) {
- if (!Register::isVirtualRegister(OldR) || !Register::isVirtualRegister(NewR))
+bool HexagonBitSimplify::replaceReg(Register OldR, Register NewR,
+ MachineRegisterInfo &MRI) {
+ if (!OldR.isVirtual() || !NewR.isVirtual())
return false;
auto Begin = MRI.use_begin(OldR), End = MRI.use_end();
decltype(End) NextI;
@@ -365,9 +365,10 @@ bool HexagonBitSimplify::replaceReg(unsigned OldR, unsigned NewR,
return Begin != End;
}
-bool HexagonBitSimplify::replaceRegWithSub(unsigned OldR, unsigned NewR,
- unsigned NewSR, MachineRegisterInfo &MRI) {
- if (!Register::isVirtualRegister(OldR) || !Register::isVirtualRegister(NewR))
+bool HexagonBitSimplify::replaceRegWithSub(Register OldR, Register NewR,
+ unsigned NewSR,
+ MachineRegisterInfo &MRI) {
+ if (!OldR.isVirtual() || !NewR.isVirtual())
return false;
if (hasTiedUse(OldR, MRI, NewSR))
return false;
@@ -381,9 +382,10 @@ bool HexagonBitSimplify::replaceRegWithSub(unsigned OldR, unsigned NewR,
return Begin != End;
}
-bool HexagonBitSimplify::replaceSubWithSub(unsigned OldR, unsigned OldSR,
- unsigned NewR, unsigned NewSR, MachineRegisterInfo &MRI) {
- if (!Register::isVirtualRegister(OldR) || !Register::isVirtualRegister(NewR))
+bool HexagonBitSimplify::replaceSubWithSub(Register OldR, unsigned OldSR,
+ Register NewR, unsigned NewSR,
+ MachineRegisterInfo &MRI) {
+ if (!OldR.isVirtual() || !NewR.isVirtual())
return false;
if (OldSR != NewSR && hasTiedUse(OldR, MRI, NewSR))
return false;
@@ -894,7 +896,7 @@ bool HexagonBitSimplify::getUsedBits(unsigned Opc, unsigned OpN,
// register class.
const TargetRegisterClass *HexagonBitSimplify::getFinalVRegClass(
const BitTracker::RegisterRef &RR, MachineRegisterInfo &MRI) {
- if (!Register::isVirtualRegister(RR.Reg))
+ if (!RR.Reg.isVirtual())
return nullptr;
auto *RC = MRI.getRegClass(RR.Reg);
if (RR.Sub == 0)
@@ -925,8 +927,7 @@ const TargetRegisterClass *HexagonBitSimplify::getFinalVRegClass(
// with a 32-bit register.
bool HexagonBitSimplify::isTransparentCopy(const BitTracker::RegisterRef &RD,
const BitTracker::RegisterRef &RS, MachineRegisterInfo &MRI) {
- if (!Register::isVirtualRegister(RD.Reg) ||
- !Register::isVirtualRegister(RS.Reg))
+ if (!RD.Reg.isVirtual() || !RS.Reg.isVirtual())
return false;
// Return false if one (or both) classes are nullptr.
auto *DRC = getFinalVRegClass(RD, MRI);
@@ -1017,7 +1018,7 @@ bool DeadCodeElimination::runOnNode(MachineDomTreeNode *N) {
if (!Op.isReg() || !Op.isDef())
continue;
Register R = Op.getReg();
- if (!Register::isVirtualRegister(R) || !isDead(R)) {
+ if (!R.isVirtual() || !isDead(R)) {
AllDead = false;
break;
}
@@ -1219,7 +1220,7 @@ bool RedundantInstrElimination::computeUsedBits(unsigned Reg, BitVector &Bits) {
MachineInstr &UseI = *I->getParent();
if (UseI.isPHI() || UseI.isCopy()) {
Register DefR = UseI.getOperand(0).getReg();
- if (!Register::isVirtualRegister(DefR))
+ if (!DefR.isVirtual())
return false;
Pending.push_back(DefR);
} else {
@@ -1380,8 +1381,9 @@ namespace {
static bool isTfrConst(const MachineInstr &MI);
private:
- unsigned genTfrConst(const TargetRegisterClass *RC, int64_t C,
- MachineBasicBlock &B, MachineBasicBlock::iterator At, DebugLoc &DL);
+ Register genTfrConst(const TargetRegisterClass *RC, int64_t C,
+ MachineBasicBlock &B, MachineBasicBlock::iterator At,
+ DebugLoc &DL);
const HexagonInstrInfo &HII;
MachineRegisterInfo &MRI;
@@ -1408,8 +1410,10 @@ bool ConstGeneration::isTfrConst(const MachineInstr &MI) {
// Generate a transfer-immediate instruction that is appropriate for the
// register class and the actual value being transferred.
-unsigned ConstGeneration::genTfrConst(const TargetRegisterClass *RC, int64_t C,
- MachineBasicBlock &B, MachineBasicBlock::iterator At, DebugLoc &DL) {
+Register ConstGeneration::genTfrConst(const TargetRegisterClass *RC, int64_t C,
+ MachineBasicBlock &B,
+ MachineBasicBlock::iterator At,
+ DebugLoc &DL) {
Register Reg = MRI.createVirtualRegister(RC);
if (RC == &Hexagon::IntRegsRegClass) {
BuildMI(B, At, DL, HII.get(Hexagon::A2_tfrsi), Reg)
@@ -1473,8 +1477,8 @@ bool ConstGeneration::processBlock(MachineBasicBlock &B, const RegisterSet&) {
HBS::getInstrDefs(*I, Defs);
if (Defs.count() != 1)
continue;
- unsigned DR = Defs.find_first();
- if (!Register::isVirtualRegister(DR))
+ Register DR = Defs.find_first();
+ if (!DR.isVirtual())
continue;
uint64_t U;
const BitTracker::RegisterCell &DRC = BT.lookup(DR);
@@ -1482,7 +1486,7 @@ bool ConstGeneration::processBlock(MachineBasicBlock &B, const RegisterSet&) {
int64_t C = U;
DebugLoc DL = I->getDebugLoc();
auto At = I->isPHI() ? B.getFirstNonPHI() : I;
- unsigned ImmReg = genTfrConst(MRI.getRegClass(DR), C, B, At, DL);
+ Register ImmReg = genTfrConst(MRI.getRegClass(DR), C, B, At, DL);
if (ImmReg) {
HBS::replaceReg(DR, ImmReg, MRI);
BT.put(ImmReg, DRC);
@@ -1549,7 +1553,7 @@ bool CopyGeneration::findMatch(const BitTracker::RegisterRef &Inp,
if (!HBS::getSubregMask(Inp, B, W, MRI))
return false;
- for (unsigned R = AVs.find_first(); R; R = AVs.find_next(R)) {
+ for (Register R = AVs.find_first(); R; R = AVs.find_next(R)) {
if (!BT.has(R) || Forbidden[R])
continue;
const BitTracker::RegisterCell &RC = BT.lookup(R);
@@ -1608,7 +1612,7 @@ bool CopyGeneration::processBlock(MachineBasicBlock &B,
DebugLoc DL = I->getDebugLoc();
auto At = I->isPHI() ? B.getFirstNonPHI() : I;
- for (unsigned R = Defs.find_first(); R; R = Defs.find_next(R)) {
+ for (Register R = Defs.find_first(); R; R = Defs.find_next(R)) {
BitTracker::RegisterRef MR;
auto *FRC = HBS::getFinalVRegClass(R, MRI);
@@ -1815,7 +1819,7 @@ bool BitSimplification::matchHalf(unsigned SelfR,
if (I == B+16)
return false;
- unsigned Reg = RC[I].RefI.Reg;
+ Register Reg = RC[I].RefI.Reg;
unsigned P = RC[I].RefI.Pos; // The RefI.Pos will be advanced by I-B.
if (P < I-B)
return false;
@@ -1823,7 +1827,7 @@ bool BitSimplification::matchHalf(unsigned SelfR,
if (Reg == 0 || Reg == SelfR) // Don't match "self".
return false;
- if (!Register::isVirtualRegister(Reg))
+ if (!Reg.isVirtual())
return false;
if (!BT.has(Reg))
return false;
@@ -2363,7 +2367,7 @@ bool BitSimplification::simplifyTstbit(MachineInstr *MI,
P = V.RefI.Pos;
}
if (P != std::numeric_limits<unsigned>::max()) {
- unsigned NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass);
+ Register NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass);
BuildMI(B, At, DL, HII.get(Hexagon::S2_tstbit_i), NewR)
.addReg(RR.Reg, 0, RR.Sub)
.addImm(P);
@@ -3165,8 +3169,8 @@ bool HexagonLoopRescheduling::processLoop(LoopCand &C) {
HBS::getInstrDefs(*I, Defs);
if (Defs.count() != 1)
continue;
- unsigned DefR = Defs.find_first();
- if (!Register::isVirtualRegister(DefR))
+ Register DefR = Defs.find_first();
+ if (!DefR.isVirtual())
continue;
if (!isBitShuffle(&*I, DefR))
continue;
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
index 1e4030b84bc1..0f6dedeb28c3 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
@@ -86,7 +86,7 @@ HexagonEvaluator::HexagonEvaluator(const HexagonRegisterInfo &tri,
}
}
-BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const {
+BT::BitMask HexagonEvaluator::mask(Register Reg, unsigned Sub) const {
if (Sub == 0)
return MachineEvaluator::mask(Reg, 0);
const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
@@ -110,9 +110,7 @@ BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const {
llvm_unreachable("Unexpected register/subregister");
}
-uint16_t HexagonEvaluator::getPhysRegBitWidth(unsigned Reg) const {
- assert(Register::isPhysicalRegister(Reg));
-
+uint16_t HexagonEvaluator::getPhysRegBitWidth(MCRegister Reg) const {
using namespace Hexagon;
const auto &HST = MF.getSubtarget<HexagonSubtarget>();
if (HST.useHVXOps()) {
@@ -1043,7 +1041,7 @@ unsigned HexagonEvaluator::getUniqueDefVReg(const MachineInstr &MI) const {
if (!Op.isReg() || !Op.isDef())
continue;
Register R = Op.getReg();
- if (!Register::isVirtualRegister(R))
+ if (!R.isVirtual())
continue;
if (DefReg != 0)
return 0;
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitTracker.h b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitTracker.h
index 02607d50f686..2d24e859e761 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitTracker.h
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitTracker.h
@@ -36,9 +36,9 @@ struct HexagonEvaluator : public BitTracker::MachineEvaluator {
bool evaluate(const MachineInstr &BI, const CellMapType &Inputs,
BranchTargetList &Targets, bool &FallsThru) const override;
- BitTracker::BitMask mask(unsigned Reg, unsigned Sub) const override;
+ BitTracker::BitMask mask(Register Reg, unsigned Sub) const override;
- uint16_t getPhysRegBitWidth(unsigned Reg) const override;
+ uint16_t getPhysRegBitWidth(MCRegister Reg) const override;
const TargetRegisterClass &composeWithSubRegIndex(
const TargetRegisterClass &RC, unsigned Idx) const override;
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp
index d1d1b8ee7d41..56ee3cd60c17 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp
@@ -84,7 +84,7 @@ void HexagonBlockRanges::RangeList::unionize(bool MergeAdjacent) {
if (empty())
return;
- llvm::sort(begin(), end());
+ llvm::sort(*this);
iterator Iter = begin();
while (Iter != end()-1) {
@@ -275,7 +275,7 @@ HexagonBlockRanges::RegisterSet HexagonBlockRanges::expandToSubRegs(
for (; I.isValid(); ++I)
SRs.insert({*I, 0});
} else {
- assert(Register::isVirtualRegister(R.Reg));
+ assert(R.Reg.isVirtual());
auto &RC = *MRI.getRegClass(R.Reg);
unsigned PReg = *RC.begin();
MCSubRegIndexIterator I(PReg, &TRI);
@@ -482,7 +482,7 @@ HexagonBlockRanges::RegToRangeMap HexagonBlockRanges::computeDeadMap(
}
}
for (auto &P : LiveMap)
- if (Register::isVirtualRegister(P.first.Reg))
+ if (P.first.Reg.isVirtual())
addDeadRanges(P.first);
LLVM_DEBUG(dbgs() << __func__ << ": dead map\n"
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBlockRanges.h b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBlockRanges.h
index 61115e29a708..5a3b6433fba7 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBlockRanges.h
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBlockRanges.h
@@ -10,6 +10,7 @@
#define LLVM_LIB_TARGET_HEXAGON_HEXAGONBLOCKRANGES_H
#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/Register.h"
#include <cassert>
#include <map>
#include <set>
@@ -30,8 +31,10 @@ class TargetRegisterInfo;
struct HexagonBlockRanges {
HexagonBlockRanges(MachineFunction &MF);
+ // FIXME: Consolidate duplicate definitions of RegisterRef
struct RegisterRef {
- unsigned Reg, Sub;
+ llvm::Register Reg;
+ unsigned Sub;
bool operator<(RegisterRef R) const {
return Reg < R.Reg || (Reg == R.Reg && Sub < R.Sub);
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
index 11a455ce4347..b456cf139c55 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "Hexagon.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
index 6a5192c866cc..11e7d5a17fa9 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
@@ -447,7 +447,7 @@ static void nodes_for_root(GepNode *Root, NodeChildrenMap &NCM,
Work.erase(First);
NodeChildrenMap::iterator CF = NCM.find(N);
if (CF != NCM.end()) {
- Work.insert(Work.end(), CF->second.begin(), CF->second.end());
+ llvm::append_range(Work, CF->second);
Nodes.insert(CF->second.begin(), CF->second.end());
}
}
@@ -472,10 +472,11 @@ static const NodeSet *node_class(GepNode *N, NodeSymRel &Rel) {
// determining equality. The only purpose of the ordering is to eliminate
// duplication due to the commutativity of equality/non-equality.
static NodePair node_pair(GepNode *N1, GepNode *N2) {
- uintptr_t P1 = uintptr_t(N1), P2 = uintptr_t(N2);
- if (P1 <= P2)
- return std::make_pair(N1, N2);
- return std::make_pair(N2, N1);
+ uintptr_t P1 = reinterpret_cast<uintptr_t>(N1);
+ uintptr_t P2 = reinterpret_cast<uintptr_t>(N2);
+ if (P1 <= P2)
+ return std::make_pair(N1, N2);
+ return std::make_pair(N2, N1);
}
static unsigned node_hash(GepNode *N) {
@@ -650,8 +651,7 @@ void HexagonCommonGEP::common() {
// Node for removal.
Erase.insert(*I);
}
- NodeVect::iterator NewE = remove_if(Nodes, in_set(Erase));
- Nodes.resize(std::distance(Nodes.begin(), NewE));
+ erase_if(Nodes, in_set(Erase));
LLVM_DEBUG(dbgs() << "Gep nodes after post-commoning cleanup:\n" << Nodes);
}
@@ -1145,7 +1145,7 @@ void HexagonCommonGEP::getAllUsersForNode(GepNode *Node, ValueVect &Values,
NodeChildrenMap::iterator CF = NCM.find(N);
if (CF != NCM.end()) {
NodeVect &Cs = CF->second;
- Work.insert(Work.end(), Cs.begin(), Cs.end());
+ llvm::append_range(Work, Cs);
}
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
index 05b95d8b7314..a774baaa48e6 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
@@ -242,18 +242,15 @@ namespace {
return *this;
}
bool isVReg() const {
- return Reg != 0 && !llvm::Register::isStackSlot(Reg) &&
- llvm::Register::isVirtualRegister(Reg);
- }
- bool isSlot() const {
- return Reg != 0 && llvm::Register::isStackSlot(Reg);
+ return Reg != 0 && !Reg.isStack() && Reg.isVirtual();
}
+ bool isSlot() const { return Reg != 0 && Reg.isStack(); }
operator MachineOperand() const {
if (isVReg())
return MachineOperand::CreateReg(Reg, /*Def*/false, /*Imp*/false,
/*Kill*/false, /*Dead*/false, /*Undef*/false,
/*EarlyClobber*/false, Sub);
- if (llvm::Register::isStackSlot(Reg)) {
+ if (Reg.isStack()) {
int FI = llvm::Register::stackSlot2Index(Reg);
return MachineOperand::CreateFI(FI);
}
@@ -265,7 +262,8 @@ namespace {
// For std::map.
return Reg < R.Reg || (Reg == R.Reg && Sub < R.Sub);
}
- unsigned Reg = 0, Sub = 0;
+ llvm::Register Reg;
+ unsigned Sub = 0;
};
struct ExtExpr {
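
In the RegisterRef above, stack slots share the Register encoding space with virtual and physical registers; Reg.isStack() together with Register::stackSlot2Index() recovers the frame index, which is what the operator MachineOperand conversion relies on. A small illustrative sketch of that check, not taken from the patch:

#include "llvm/CodeGen/Register.h"

using llvm::Register;

// Frame indices are encoded into the Register value space; isStack() detects
// them and stackSlot2Index() maps the encoding back to the frame index.
static int frameIndexOrSentinel(Register Reg) {
  if (Reg.isStack())
    return Register::stackSlot2Index(Reg);
  return -1; // not a stack slot
}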
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
index 77578378b058..4a2b0600f42b 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
@@ -83,7 +83,8 @@ namespace {
// FIXME: Use TargetInstrInfo::RegSubRegPair. Also duplicated in
// HexagonGenPredicate
struct RegisterSubReg {
- unsigned Reg, SubReg;
+ Register Reg;
+ unsigned SubReg;
explicit RegisterSubReg(unsigned R, unsigned SR = 0) : Reg(R), SubReg(SR) {}
explicit RegisterSubReg(const MachineOperand &MO)
@@ -216,16 +217,16 @@ namespace {
void clear() { Map.clear(); }
- bool has(unsigned R) const {
+ bool has(Register R) const {
// All non-virtual registers are considered "bottom".
- if (!Register::isVirtualRegister(R))
+ if (!R.isVirtual())
return true;
MapType::const_iterator F = Map.find(R);
return F != Map.end();
}
- const LatticeCell &get(unsigned R) const {
- if (!Register::isVirtualRegister(R))
+ const LatticeCell &get(Register R) const {
+ if (!R.isVirtual())
return Bottom;
MapType::const_iterator F = Map.find(R);
if (F != Map.end())
@@ -234,14 +235,12 @@ namespace {
}
// Invalidates any const references.
- void update(unsigned R, const LatticeCell &L) {
- Map[R] = L;
- }
+ void update(Register R, const LatticeCell &L) { Map[R] = L; }
void print(raw_ostream &os, const TargetRegisterInfo &TRI) const;
private:
- using MapType = std::map<unsigned, LatticeCell>;
+ using MapType = std::map<Register, LatticeCell>;
MapType Map;
// To avoid creating "top" entries, return a const reference to
@@ -633,7 +632,7 @@ void MachineConstPropagator::visitPHI(const MachineInstr &PN) {
const MachineOperand &MD = PN.getOperand(0);
RegisterSubReg DefR(MD);
- assert(Register::isVirtualRegister(DefR.Reg));
+ assert(DefR.Reg.isVirtual());
bool Changed = false;
@@ -662,7 +661,7 @@ Bottomize:
RegisterSubReg UseR(SO);
// If the input is not a virtual register, we don't really know what
// value it holds.
- if (!Register::isVirtualRegister(UseR.Reg))
+ if (!UseR.Reg.isVirtual())
goto Bottomize;
// If there is no cell for an input register, it means top.
if (!Cells.has(UseR.Reg))
@@ -704,7 +703,7 @@ void MachineConstPropagator::visitNonBranch(const MachineInstr &MI) {
continue;
RegisterSubReg DefR(MO);
// Only track virtual registers.
- if (!Register::isVirtualRegister(DefR.Reg))
+ if (!DefR.Reg.isVirtual())
continue;
bool Changed = false;
// If the evaluation failed, set cells for all output registers to bottom.
@@ -1086,7 +1085,7 @@ bool MachineConstPropagator::run(MachineFunction &MF) {
bool MachineConstEvaluator::getCell(const RegisterSubReg &R, const CellMap &Inputs,
LatticeCell &RC) {
- if (!Register::isVirtualRegister(R.Reg))
+ if (!R.Reg.isVirtual())
return false;
const LatticeCell &L = Inputs.get(R.Reg);
if (!R.SubReg) {
@@ -1884,7 +1883,7 @@ namespace {
bool evaluateHexVector2(const MachineInstr &MI, const CellMap &Inputs,
CellMap &Outputs);
- void replaceAllRegUsesWith(unsigned FromReg, unsigned ToReg);
+ void replaceAllRegUsesWith(Register FromReg, Register ToReg);
bool rewriteHexBranch(MachineInstr &BrI, const CellMap &Inputs);
bool rewriteHexConstDefs(MachineInstr &MI, const CellMap &Inputs,
bool &AllDefs);
@@ -1942,7 +1941,7 @@ bool HexagonConstEvaluator::evaluate(const MachineInstr &MI,
unsigned Opc = MI.getOpcode();
RegisterSubReg DefR(MD);
assert(!DefR.SubReg);
- if (!Register::isVirtualRegister(DefR.Reg))
+ if (!DefR.Reg.isVirtual())
return false;
if (MI.isCopy()) {
@@ -2809,7 +2808,7 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI,
if (!MO.isReg() || !MO.isUse() || MO.isImplicit())
continue;
RegisterSubReg R(MO);
- if (!Register::isVirtualRegister(R.Reg))
+ if (!R.Reg.isVirtual())
continue;
HasUse = true;
// PHIs can legitimately have "top" cells after propagation.
@@ -2851,7 +2850,7 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI,
if (!MO.isReg() || !MO.isDef())
continue;
Register R = MO.getReg();
- if (!Register::isVirtualRegister(R))
+ if (!R.isVirtual())
continue;
assert(!MO.getSubReg());
assert(Inputs.has(R));
@@ -3130,10 +3129,10 @@ bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI,
return Changed;
}
-void HexagonConstEvaluator::replaceAllRegUsesWith(unsigned FromReg,
- unsigned ToReg) {
- assert(Register::isVirtualRegister(FromReg));
- assert(Register::isVirtualRegister(ToReg));
+void HexagonConstEvaluator::replaceAllRegUsesWith(Register FromReg,
+ Register ToReg) {
+ assert(FromReg.isVirtual());
+ assert(ToReg.isVirtual());
for (auto I = MRI->use_begin(FromReg), E = MRI->use_end(); I != E;) {
MachineOperand &O = *I;
++I;
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index 587527d8c32c..23d0cc829e52 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -10,6 +10,7 @@
// to move them together. If we can move them next to each other we do so and
// replace them with a combine instruction.
//===----------------------------------------------------------------------===//
+
#include "HexagonInstrInfo.h"
#include "HexagonSubtarget.h"
#include "llvm/ADT/DenseMap.h"
@@ -26,6 +27,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
index a431af17e6d0..d36ffc3da641 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
@@ -386,7 +386,7 @@ bool HexagonEarlyIfConversion::isValidCandidate(const MachineBasicBlock *B)
if (!MO.isReg() || !MO.isDef())
continue;
Register R = MO.getReg();
- if (!Register::isVirtualRegister(R))
+ if (!R.isVirtual())
continue;
if (!isPredicate(R))
continue;
@@ -403,7 +403,7 @@ bool HexagonEarlyIfConversion::usesUndefVReg(const MachineInstr *MI) const {
if (!MO.isReg() || !MO.isUse())
continue;
Register R = MO.getReg();
- if (!Register::isVirtualRegister(R))
+ if (!R.isVirtual())
continue;
const MachineInstr *DefI = MRI->getVRegDef(R);
// "Undefined" virtual registers are actually defined via IMPLICIT_DEF.
@@ -493,7 +493,7 @@ unsigned HexagonEarlyIfConversion::countPredicateDefs(
if (!MO.isReg() || !MO.isDef())
continue;
Register R = MO.getReg();
- if (!Register::isVirtualRegister(R))
+ if (!R.isVirtual())
continue;
if (isPredicate(R))
PredDefs++;
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
index c1d0599830cc..fcc880463925 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
@@ -174,6 +174,7 @@ namespace {
unsigned CoaCounter = 0;
unsigned TfrCounter = 0;
+ // FIXME: Consolidate duplicate definitions of RegisterRef
struct RegisterRef {
RegisterRef(const MachineOperand &Op) : Reg(Op.getReg()),
Sub(Op.getSubReg()) {}
@@ -187,7 +188,8 @@ namespace {
return Reg < RR.Reg || (Reg == RR.Reg && Sub < RR.Sub);
}
- unsigned Reg, Sub;
+ Register Reg;
+ unsigned Sub;
};
using ReferenceMap = DenseMap<unsigned, unsigned>;
@@ -196,25 +198,25 @@ namespace {
unsigned getMaskForSub(unsigned Sub);
bool isCondset(const MachineInstr &MI);
- LaneBitmask getLaneMask(unsigned Reg, unsigned Sub);
+ LaneBitmask getLaneMask(Register Reg, unsigned Sub);
void addRefToMap(RegisterRef RR, ReferenceMap &Map, unsigned Exec);
bool isRefInMap(RegisterRef, ReferenceMap &Map, unsigned Exec);
- void updateDeadsInRange(unsigned Reg, LaneBitmask LM, LiveRange &Range);
- void updateKillFlags(unsigned Reg);
- void updateDeadFlags(unsigned Reg);
- void recalculateLiveInterval(unsigned Reg);
+ void updateDeadsInRange(Register Reg, LaneBitmask LM, LiveRange &Range);
+ void updateKillFlags(Register Reg);
+ void updateDeadFlags(Register Reg);
+ void recalculateLiveInterval(Register Reg);
void removeInstr(MachineInstr &MI);
- void updateLiveness(std::set<unsigned> &RegSet, bool Recalc,
- bool UpdateKills, bool UpdateDeads);
+ void updateLiveness(std::set<Register> &RegSet, bool Recalc,
+ bool UpdateKills, bool UpdateDeads);
unsigned getCondTfrOpcode(const MachineOperand &SO, bool Cond);
MachineInstr *genCondTfrFor(MachineOperand &SrcOp,
MachineBasicBlock::iterator At, unsigned DstR,
unsigned DstSR, const MachineOperand &PredOp, bool PredSense,
bool ReadUndef, bool ImpUse);
- bool split(MachineInstr &MI, std::set<unsigned> &UpdRegs);
+ bool split(MachineInstr &MI, std::set<Register> &UpdRegs);
bool isPredicable(MachineInstr *MI);
MachineInstr *getReachingDefForPred(RegisterRef RD,
@@ -224,19 +226,18 @@ namespace {
void predicateAt(const MachineOperand &DefOp, MachineInstr &MI,
MachineBasicBlock::iterator Where,
const MachineOperand &PredOp, bool Cond,
- std::set<unsigned> &UpdRegs);
+ std::set<Register> &UpdRegs);
void renameInRange(RegisterRef RO, RegisterRef RN, unsigned PredR,
bool Cond, MachineBasicBlock::iterator First,
MachineBasicBlock::iterator Last);
- bool predicate(MachineInstr &TfrI, bool Cond, std::set<unsigned> &UpdRegs);
- bool predicateInBlock(MachineBasicBlock &B,
- std::set<unsigned> &UpdRegs);
+ bool predicate(MachineInstr &TfrI, bool Cond, std::set<Register> &UpdRegs);
+ bool predicateInBlock(MachineBasicBlock &B, std::set<Register> &UpdRegs);
bool isIntReg(RegisterRef RR, unsigned &BW);
bool isIntraBlocks(LiveInterval &LI);
bool coalesceRegisters(RegisterRef R1, RegisterRef R2);
- bool coalesceSegments(const SmallVectorImpl<MachineInstr*> &Condsets,
- std::set<unsigned> &UpdRegs);
+ bool coalesceSegments(const SmallVectorImpl<MachineInstr *> &Condsets,
+ std::set<Register> &UpdRegs);
};
} // end anonymous namespace
@@ -285,8 +286,8 @@ bool HexagonExpandCondsets::isCondset(const MachineInstr &MI) {
return false;
}
-LaneBitmask HexagonExpandCondsets::getLaneMask(unsigned Reg, unsigned Sub) {
- assert(Register::isVirtualRegister(Reg));
+LaneBitmask HexagonExpandCondsets::getLaneMask(Register Reg, unsigned Sub) {
+ assert(Reg.isVirtual());
return Sub != 0 ? TRI->getSubRegIndexLaneMask(Sub)
: MRI->getMaxLaneMaskForVReg(Reg);
}
@@ -312,7 +313,7 @@ bool HexagonExpandCondsets::isRefInMap(RegisterRef RR, ReferenceMap &Map,
return false;
}
-void HexagonExpandCondsets::updateKillFlags(unsigned Reg) {
+void HexagonExpandCondsets::updateKillFlags(Register Reg) {
auto KillAt = [this,Reg] (SlotIndex K, LaneBitmask LM) -> void {
// Set the <kill> flag on a use of Reg whose lane mask is contained in LM.
MachineInstr *MI = LIS->getInstructionFromIndex(K);
@@ -363,9 +364,9 @@ void HexagonExpandCondsets::updateKillFlags(unsigned Reg) {
}
}
-void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM,
- LiveRange &Range) {
- assert(Register::isVirtualRegister(Reg));
+void HexagonExpandCondsets::updateDeadsInRange(Register Reg, LaneBitmask LM,
+ LiveRange &Range) {
+ assert(Reg.isVirtual());
if (Range.empty())
return;
@@ -374,7 +375,7 @@ void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM,
if (!Op.isReg() || !Op.isDef())
return { false, false };
Register DR = Op.getReg(), DSR = Op.getSubReg();
- if (!Register::isVirtualRegister(DR) || DR != Reg)
+ if (!DR.isVirtual() || DR != Reg)
return { false, false };
LaneBitmask SLM = getLaneMask(DR, DSR);
LaneBitmask A = SLM & LM;
@@ -524,7 +525,7 @@ void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM,
}
}
-void HexagonExpandCondsets::updateDeadFlags(unsigned Reg) {
+void HexagonExpandCondsets::updateDeadFlags(Register Reg) {
LiveInterval &LI = LIS->getInterval(Reg);
if (LI.hasSubRanges()) {
for (LiveInterval::SubRange &S : LI.subranges()) {
@@ -538,7 +539,7 @@ void HexagonExpandCondsets::updateDeadFlags(unsigned Reg) {
}
}
-void HexagonExpandCondsets::recalculateLiveInterval(unsigned Reg) {
+void HexagonExpandCondsets::recalculateLiveInterval(Register Reg) {
LIS->removeInterval(Reg);
LIS->createAndComputeVirtRegInterval(Reg);
}
@@ -548,12 +549,13 @@ void HexagonExpandCondsets::removeInstr(MachineInstr &MI) {
MI.eraseFromParent();
}
-void HexagonExpandCondsets::updateLiveness(std::set<unsigned> &RegSet,
- bool Recalc, bool UpdateKills, bool UpdateDeads) {
+void HexagonExpandCondsets::updateLiveness(std::set<Register> &RegSet,
+ bool Recalc, bool UpdateKills,
+ bool UpdateDeads) {
UpdateKills |= UpdateDeads;
- for (unsigned R : RegSet) {
- if (!Register::isVirtualRegister(R)) {
- assert(Register::isPhysicalRegister(R));
+ for (Register R : RegSet) {
+ if (!R.isVirtual()) {
+ assert(R.isPhysical());
// There shouldn't be any physical registers as operands, except
// possibly reserved registers.
assert(MRI->isReserved(R));
@@ -580,17 +582,16 @@ unsigned HexagonExpandCondsets::getCondTfrOpcode(const MachineOperand &SO,
using namespace Hexagon;
if (SO.isReg()) {
- Register PhysR;
+ MCRegister PhysR;
RegisterRef RS = SO;
- if (Register::isVirtualRegister(RS.Reg)) {
+ if (RS.Reg.isVirtual()) {
const TargetRegisterClass *VC = MRI->getRegClass(RS.Reg);
assert(VC->begin() != VC->end() && "Empty register class");
PhysR = *VC->begin();
} else {
- assert(Register::isPhysicalRegister(RS.Reg));
PhysR = RS.Reg;
}
- Register PhysS = (RS.Sub == 0) ? PhysR : TRI->getSubReg(PhysR, RS.Sub);
+ MCRegister PhysS = (RS.Sub == 0) ? PhysR : TRI->getSubReg(PhysR, RS.Sub);
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysS);
switch (TRI->getRegSizeInBits(*RC)) {
case 32:
@@ -661,7 +662,7 @@ MachineInstr *HexagonExpandCondsets::genCondTfrFor(MachineOperand &SrcOp,
/// Replace a MUX instruction MI with a pair A2_tfrt/A2_tfrf. This function
/// performs all necessary changes to complete the replacement.
bool HexagonExpandCondsets::split(MachineInstr &MI,
- std::set<unsigned> &UpdRegs) {
+ std::set<Register> &UpdRegs) {
if (TfrLimitActive) {
if (TfrCounter >= TfrLimit)
return false;
@@ -803,7 +804,7 @@ bool HexagonExpandCondsets::canMoveOver(MachineInstr &MI, ReferenceMap &Defs,
// For physical register we would need to check register aliases, etc.
// and we don't want to bother with that. It would be of little value
// before the actual register rewriting (from virtual to physical).
- if (!Register::isVirtualRegister(RR.Reg))
+ if (!RR.Reg.isVirtual())
return false;
// No redefs for any operand.
if (isRefInMap(RR, Defs, Exec_Then))
@@ -855,7 +856,7 @@ void HexagonExpandCondsets::predicateAt(const MachineOperand &DefOp,
MachineInstr &MI,
MachineBasicBlock::iterator Where,
const MachineOperand &PredOp, bool Cond,
- std::set<unsigned> &UpdRegs) {
+ std::set<Register> &UpdRegs) {
// The problem with updating live intervals is that we can move one def
// past another def. In particular, this can happen when moving an A2_tfrt
// over an A2_tfrf defining the same register. From the point of view of
@@ -933,7 +934,7 @@ void HexagonExpandCondsets::renameInRange(RegisterRef RO, RegisterRef RN,
/// the copy under the given condition (using the same predicate register as
/// the copy).
bool HexagonExpandCondsets::predicate(MachineInstr &TfrI, bool Cond,
- std::set<unsigned> &UpdRegs) {
+ std::set<Register> &UpdRegs) {
// TfrI - A2_tfr[tf] Instruction (not A2_tfrsi).
unsigned Opc = TfrI.getOpcode();
(void)Opc;
@@ -1000,7 +1001,7 @@ bool HexagonExpandCondsets::predicate(MachineInstr &TfrI, bool Cond,
// subregisters are other physical registers, and we are not checking
// that.
RegisterRef RR = Op;
- if (!Register::isVirtualRegister(RR.Reg))
+ if (!RR.Reg.isVirtual())
return false;
ReferenceMap &Map = Op.isDef() ? Defs : Uses;
@@ -1067,7 +1068,7 @@ bool HexagonExpandCondsets::predicate(MachineInstr &TfrI, bool Cond,
/// Predicate all cases of conditional copies in the specified block.
bool HexagonExpandCondsets::predicateInBlock(MachineBasicBlock &B,
- std::set<unsigned> &UpdRegs) {
+ std::set<Register> &UpdRegs) {
bool Changed = false;
MachineBasicBlock::iterator I, E, NextI;
for (I = B.begin(), E = B.end(); I != E; I = NextI) {
@@ -1092,7 +1093,7 @@ bool HexagonExpandCondsets::predicateInBlock(MachineBasicBlock &B,
}
bool HexagonExpandCondsets::isIntReg(RegisterRef RR, unsigned &BW) {
- if (!Register::isVirtualRegister(RR.Reg))
+ if (!RR.Reg.isVirtual())
return false;
const TargetRegisterClass *RC = MRI->getRegClass(RR.Reg);
if (RC == &Hexagon::IntRegsRegClass) {
@@ -1172,7 +1173,7 @@ bool HexagonExpandCondsets::coalesceRegisters(RegisterRef R1, RegisterRef R2) {
}
L1.addSegment(LiveRange::Segment(I->start, I->end, NewVN));
}
- while (L2.begin() != L2.end())
+ while (!L2.empty())
L2.removeSegment(*L2.begin());
LIS->removeInterval(R2.Reg);
@@ -1187,8 +1188,8 @@ bool HexagonExpandCondsets::coalesceRegisters(RegisterRef R1, RegisterRef R2) {
/// the destination register. This could lead to having only one predicated
/// instruction in the end instead of two.
bool HexagonExpandCondsets::coalesceSegments(
- const SmallVectorImpl<MachineInstr*> &Condsets,
- std::set<unsigned> &UpdRegs) {
+ const SmallVectorImpl<MachineInstr *> &Condsets,
+ std::set<Register> &UpdRegs) {
SmallVector<MachineInstr*,16> TwoRegs;
for (MachineInstr *MI : Condsets) {
MachineOperand &S1 = MI->getOperand(2), &S2 = MI->getOperand(3);
@@ -1262,7 +1263,7 @@ bool HexagonExpandCondsets::runOnMachineFunction(MachineFunction &MF) {
MF.getFunction().getParent()));
bool Changed = false;
- std::set<unsigned> CoalUpd, PredUpd;
+ std::set<Register> CoalUpd, PredUpd;
SmallVector<MachineInstr*,16> Condsets;
for (auto &B : MF)
@@ -1279,7 +1280,7 @@ bool HexagonExpandCondsets::runOnMachineFunction(MachineFunction &MF) {
// in the IR (they have been removed by live range analysis).
// Updating them right before we split is the easiest, because splitting
// adds definitions which would interfere with updating kills afterwards.
- std::set<unsigned> KillUpd;
+ std::set<Register> KillUpd;
for (MachineInstr *MI : Condsets)
for (MachineOperand &Op : MI->operands())
if (Op.isReg() && Op.isUse())
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 010b7171ce17..a62610ae2b7c 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -306,7 +306,7 @@ static bool needsStackFrame(const MachineBasicBlock &MBB, const BitVector &CSR,
Register R = MO.getReg();
// Virtual registers will need scavenging, which then may require
// a stack slot.
- if (Register::isVirtualRegister(R))
+ if (R.isVirtual())
return true;
for (MCSubRegIterator S(R, &HRI, true); S.isValid(); ++S)
if (CSR[*S])
@@ -1104,7 +1104,8 @@ void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB,
Offset = MFI.getObjectOffset(F->getFrameIdx());
} else {
Register FrameReg;
- Offset = getFrameIndexReference(MF, F->getFrameIdx(), FrameReg);
+ Offset =
+ getFrameIndexReference(MF, F->getFrameIdx(), FrameReg).getFixed();
}
// Subtract 8 to make room for R30 and R31, which are added above.
Offset -= 8;
@@ -1256,9 +1257,9 @@ static const char *getSpillFunctionFor(unsigned MaxReg, SpillKind SpillType,
return nullptr;
}
-int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF,
- int FI,
- Register &FrameReg) const {
+StackOffset
+HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const {
auto &MFI = MF.getFrameInfo();
auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
@@ -1354,7 +1355,7 @@ int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF,
int RealOffset = Offset;
if (!UseFP && !UseAP)
RealOffset = FrameSize+Offset;
- return RealOffset;
+ return StackOffset::getFixed(RealOffset);
}
bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB,
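
getFrameIndexReference now returns a StackOffset rather than a raw int; Hexagon only ever produces a fixed component, which callers unwrap with getFixed(), as the insertCFIInstructionsAt hunk above shows. A hedged sketch of the wrapper's behaviour, assuming the LLVM 12 StackOffset from llvm/Support/TypeSize.h:

#include "llvm/Support/TypeSize.h"
#include <cassert>
#include <cstdint>

using llvm::StackOffset;

// Fixed-only offsets round-trip through the wrapper; the scalable component
// (used for scalable-vector stacks on other targets) stays zero here.
static int64_t roundTripFixed(int64_t RealOffset) {
  StackOffset Off = StackOffset::getFixed(RealOffset);
  assert(Off.getScalable() == 0 && "Hexagon frames have no scalable part");
  return Off.getFixed();
}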
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonFrameLowering.h b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
index 87d385e1ce3c..4ffd31b670e4 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -11,6 +11,7 @@
#include "Hexagon.h"
#include "HexagonBlockRanges.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -82,8 +83,8 @@ public:
return true;
}
- int getFrameIndexReference(const MachineFunction &MF, int FI,
- Register &FrameReg) const override;
+ StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const override;
bool hasFP(const MachineFunction &MF) const override;
const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries)
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
index 2f29e88bc989..f2026877b22c 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -613,7 +613,7 @@ void HexagonGenInsert::buildOrderingMF(RegisterOrdering &RO) const {
if (MO.isReg() && MO.isDef()) {
Register R = MO.getReg();
assert(MO.getSubReg() == 0 && "Unexpected subregister in definition");
- if (Register::isVirtualRegister(R))
+ if (R.isVirtual())
RO.insert(std::make_pair(R, Index++));
}
}
@@ -730,7 +730,7 @@ void HexagonGenInsert::getInstrDefs(const MachineInstr *MI,
if (!MO.isReg() || !MO.isDef())
continue;
Register R = MO.getReg();
- if (!Register::isVirtualRegister(R))
+ if (!R.isVirtual())
continue;
Defs.insert(R);
}
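// A standalone sketch of the Register::isVirtual() convention relied on in
// the hunks above: virtual register numbers carry the top bit, so the member
// call replaces the older static Register::isVirtualRegister(R) check on a
// plain unsigned. This toy struct is illustrative, not llvm::Register.
#include <cassert>
#include <cstdint>

struct ToyRegister {
  uint32_t Id = 0;
  static constexpr uint32_t VirtualFlag = 0x80000000u;
  bool isVirtual() const { return (Id & VirtualFlag) != 0; }
  bool isPhysical() const { return Id != 0 && !isVirtual(); }
};

int main() {
  ToyRegister Phys{5};                            // a small physical register number
  ToyRegister Virt{ToyRegister::VirtualFlag | 7}; // a virtual register
  assert(Phys.isPhysical() && !Phys.isVirtual());
  assert(Virt.isVirtual());
  return 0;
}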
@@ -743,7 +743,7 @@ void HexagonGenInsert::getInstrUses(const MachineInstr *MI,
if (!MO.isReg() || !MO.isUse())
continue;
Register R = MO.getReg();
- if (!Register::isVirtualRegister(R))
+ if (!R.isVirtual())
continue;
Uses.insert(R);
}
@@ -1089,9 +1089,7 @@ void HexagonGenInsert::pruneCoveredSets(unsigned VR) {
auto IsEmpty = [] (const IFRecordWithRegSet &IR) -> bool {
return IR.second.empty();
};
- auto End = llvm::remove_if(LL, IsEmpty);
- if (End != LL.end())
- LL.erase(End, LL.end());
+ llvm::erase_if(LL, IsEmpty);
} else {
// The definition of VR is constant-extended, and all candidates have
// empty removable-register sets. Pick the maximum candidate, and remove
@@ -1179,9 +1177,7 @@ void HexagonGenInsert::pruneRegCopies(unsigned VR) {
auto IsCopy = [] (const IFRecordWithRegSet &IR) -> bool {
return IR.first.Wdh == 32 && (IR.first.Off == 0 || IR.first.Off == 32);
};
- auto End = llvm::remove_if(LL, IsCopy);
- if (End != LL.end())
- LL.erase(End, LL.end());
+ llvm::erase_if(LL, IsCopy);
}
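// The two hunks above replace the manual remove_if/erase sequence with
// llvm::erase_if; the old "if (End != LL.end())" guard was unnecessary since
// erasing an empty range is a no-op. A standalone sketch of the same idiom
// using the standard library (llvm::erase_if behaves like this on
// vector-like containers):
#include <algorithm>
#include <cassert>
#include <vector>

template <typename Container, typename Pred>
void erase_if_sketch(Container &C, Pred P) {
  C.erase(std::remove_if(C.begin(), C.end(), P), C.end());
}

int main() {
  std::vector<int> LL = {1, 0, 3, 0, 5};
  auto IsEmpty = [](int V) { return V == 0; }; // stand-in for the predicates above
  erase_if_sketch(LL, IsEmpty);
  assert((LL == std::vector<int>{1, 3, 5}));
  return 0;
}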
void HexagonGenInsert::pruneCandidates() {
@@ -1483,7 +1479,7 @@ bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) {
if (!MO.isReg() || !MO.isDef())
continue;
Register R = MO.getReg();
- if (!Register::isVirtualRegister(R) || !MRI->use_nodbg_empty(R)) {
+ if (!R.isVirtual() || !MRI->use_nodbg_empty(R)) {
AllDead = false;
break;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
index 903287e68c99..d8d2025c5d27 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
@@ -48,7 +48,8 @@ namespace {
// FIXME: Use TargetInstrInfo::RegSubRegPair
struct RegisterSubReg {
- unsigned R, S;
+ Register R;
+ unsigned S;
RegisterSubReg(unsigned r = 0, unsigned s = 0) : R(r), S(s) {}
RegisterSubReg(const MachineOperand &MO) : R(MO.getReg()), S(MO.getSubReg()) {}
@@ -111,7 +112,7 @@ namespace {
VectOfInst PUsers;
RegToRegMap G2P;
- bool isPredReg(unsigned R);
+ bool isPredReg(Register R);
void collectPredicateGPR(MachineFunction &MF);
void processPredicateGPR(const RegisterSubReg &Reg);
unsigned getPredForm(unsigned Opc);
@@ -133,8 +134,8 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(HexagonGenPredicate, "hexagon-gen-pred",
"Hexagon generate predicate operations", false, false)
-bool HexagonGenPredicate::isPredReg(unsigned R) {
- if (!Register::isVirtualRegister(R))
+bool HexagonGenPredicate::isPredReg(Register R) {
+ if (!R.isVirtual())
return false;
const TargetRegisterClass *RC = MRI->getRegClass(R);
return RC == &Hexagon::PredRegsRegClass;
@@ -214,7 +215,7 @@ void HexagonGenPredicate::collectPredicateGPR(MachineFunction &MF) {
case TargetOpcode::COPY:
if (isPredReg(MI->getOperand(1).getReg())) {
RegisterSubReg RD = MI->getOperand(0);
- if (Register::isVirtualRegister(RD.R))
+ if (RD.R.isVirtual())
PredGPRs.insert(RD);
}
break;
@@ -246,7 +247,7 @@ RegisterSubReg HexagonGenPredicate::getPredRegFor(const RegisterSubReg &Reg) {
// Create a predicate register for a given Reg. The newly created register
// will have its value copied from Reg, so that it can be later used as
// an operand in other instructions.
- assert(Register::isVirtualRegister(Reg.R));
+ assert(Reg.R.isVirtual());
RegToRegMap::iterator F = G2P.find(Reg);
if (F != G2P.end())
return F->second;
@@ -472,9 +473,9 @@ bool HexagonGenPredicate::eliminatePredCopies(MachineFunction &MF) {
continue;
RegisterSubReg DR = MI.getOperand(0);
RegisterSubReg SR = MI.getOperand(1);
- if (!Register::isVirtualRegister(DR.R))
+ if (!DR.R.isVirtual())
continue;
- if (!Register::isVirtualRegister(SR.R))
+ if (!SR.R.isVirtual())
continue;
if (MRI->getRegClass(DR.R) != PredRC)
continue;
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index 4833935f8d24..2f23e8643720 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -390,7 +390,7 @@ bool HexagonHardwareLoops::runOnMachineFunction(MachineFunction &MF) {
TRI = HST.getRegisterInfo();
for (auto &L : *MLI)
- if (!L->getParentLoop()) {
+ if (L->isOutermost()) {
bool L0Used = false;
bool L1Used = false;
Changed |= convertToHardwareLoop(L, L0Used, L1Used);
@@ -1432,7 +1432,7 @@ bool HexagonHardwareLoops::loopCountMayWrapOrUnderFlow(
Register Reg = InitVal->getReg();
// We don't know the value of a physical register.
- if (!Register::isVirtualRegister(Reg))
+ if (!Reg.isVirtual())
return true;
MachineInstr *Def = MRI->getVRegDef(Reg);
@@ -1510,7 +1510,7 @@ bool HexagonHardwareLoops::checkForImmediate(const MachineOperand &MO,
int64_t TV;
Register R = MO.getReg();
- if (!Register::isVirtualRegister(R))
+ if (!R.isVirtual())
return false;
MachineInstr *DI = MRI->getVRegDef(R);
unsigned DOpc = DI->getOpcode();
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index b4b389a7b956..bdd5c7dd151e 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -231,10 +231,10 @@ SDNode *HexagonDAGToDAGISel::StoreInstrForLoadIntrinsic(MachineSDNode *LoadN,
if (Size >= 4)
TS = CurDAG->getStore(SDValue(LoadN, 2), dl, SDValue(LoadN, 0), Loc, PI,
- Size);
+ Align(Size));
else
TS = CurDAG->getTruncStore(SDValue(LoadN, 2), dl, SDValue(LoadN, 0), Loc,
- PI, MVT::getIntegerVT(Size * 8), Size);
+ PI, MVT::getIntegerVT(Size * 8), Align(Size));
SDNode *StoreN;
{
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
index c0f92042e5da..29e76b53910e 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@@ -789,6 +789,12 @@ struct ShuffleMask {
OS << " }";
}
};
+
+LLVM_ATTRIBUTE_UNUSED
+raw_ostream &operator<<(raw_ostream &OS, const ShuffleMask &SM) {
+ SM.print(OS);
+ return OS;
+}
} // namespace
// --------------------------------------------------------------------
@@ -828,6 +834,7 @@ namespace llvm {
void selectVAlign(SDNode *N);
private:
+ void select(SDNode *ISelN);
void materialize(const ResultStack &Results);
SDValue getVectorConstant(ArrayRef<uint8_t> Data, const SDLoc &dl);
@@ -931,46 +938,19 @@ bool HvxSelector::selectVectorConstants(SDNode *N) {
SmallVector<SDNode*,4> Nodes;
SetVector<SDNode*> WorkQ;
- // The one-use test for VSPLATW's operand may fail due to dead nodes
- // left over in the DAG.
- DAG.RemoveDeadNodes();
-
// The DAG can change (due to CSE) during selection, so cache all the
// unselected nodes first to avoid traversing a mutating DAG.
-
- auto IsNodeToSelect = [] (SDNode *N) {
- if (N->isMachineOpcode())
- return false;
- switch (N->getOpcode()) {
- case HexagonISD::VZERO:
- case HexagonISD::VSPLATW:
- return true;
- case ISD::LOAD: {
- SDValue Addr = cast<LoadSDNode>(N)->getBasePtr();
- unsigned AddrOpc = Addr.getOpcode();
- if (AddrOpc == HexagonISD::AT_PCREL || AddrOpc == HexagonISD::CP)
- if (Addr.getOperand(0).getOpcode() == ISD::TargetConstantPool)
- return true;
- }
- break;
- }
- // Make sure to select the operand of VSPLATW.
- bool IsSplatOp = N->hasOneUse() &&
- N->use_begin()->getOpcode() == HexagonISD::VSPLATW;
- return IsSplatOp;
- };
-
WorkQ.insert(N);
for (unsigned i = 0; i != WorkQ.size(); ++i) {
SDNode *W = WorkQ[i];
- if (IsNodeToSelect(W))
+ if (!W->isMachineOpcode() && W->getOpcode() == HexagonISD::ISEL)
Nodes.push_back(W);
for (unsigned j = 0, f = W->getNumOperands(); j != f; ++j)
WorkQ.insert(W->getOperand(j).getNode());
}
for (SDNode *L : Nodes)
- ISel.Select(L);
+ select(L);
return !Nodes.empty();
}
@@ -1358,6 +1338,82 @@ namespace {
};
}
+void HvxSelector::select(SDNode *ISelN) {
+ // What's important here is to select the right set of nodes. The main
+ // selection algorithm loops over nodes in a topological order, i.e. users
+ // are visited before their operands.
+ //
+ // It is an error to have an unselected node with a selected operand, and
+ // there is an assertion in the main selector code to enforce that.
+ //
+ // Such a situation could occur if we selected a node, which is both a
+ // subnode of ISelN, and a subnode of an unrelated (and yet unselected)
+ // node in the DAG.
+ assert(ISelN->getOpcode() == HexagonISD::ISEL);
+ SDNode *N0 = ISelN->getOperand(0).getNode();
+ if (N0->isMachineOpcode()) {
+ ISel.ReplaceNode(ISelN, N0);
+ return;
+ }
+
+ // There could have been nodes created (i.e. inserted into the DAG)
+ // that are now dead. Remove them, in case they use any of the nodes
+ // to select (and make them look shared).
+ DAG.RemoveDeadNodes();
+
+ SetVector<SDNode*> SubNodes, TmpQ;
+ std::map<SDNode*,unsigned> NumOps;
+
+ // Don't want to select N0 if it's shared with another node, except if
+ // it's shared with other ISELs.
+ auto IsISelN = [](SDNode *T) { return T->getOpcode() == HexagonISD::ISEL; };
+ if (llvm::all_of(N0->uses(), IsISelN))
+ SubNodes.insert(N0);
+
+ auto InSubNodes = [&SubNodes](SDNode *T) { return SubNodes.count(T); };
+ for (unsigned I = 0; I != SubNodes.size(); ++I) {
+ SDNode *S = SubNodes[I];
+ unsigned OpN = 0;
+ // Only add subnodes that are only reachable from N0.
+ for (SDValue Op : S->ops()) {
+ SDNode *O = Op.getNode();
+ if (llvm::all_of(O->uses(), InSubNodes)) {
+ SubNodes.insert(O);
+ ++OpN;
+ }
+ }
+ NumOps.insert({S, OpN});
+ if (OpN == 0)
+ TmpQ.insert(S);
+ }
+
+ for (unsigned I = 0; I != TmpQ.size(); ++I) {
+ SDNode *S = TmpQ[I];
+ for (SDNode *U : S->uses()) {
+ if (U == ISelN)
+ continue;
+ auto F = NumOps.find(U);
+ assert(F != NumOps.end());
+ if (F->second > 0 && !--F->second)
+ TmpQ.insert(F->first);
+ }
+ }
+
+ // Remove the marker.
+ ISel.ReplaceNode(ISelN, N0);
+
+ assert(SubNodes.size() == TmpQ.size());
+ NullifyingVector<decltype(TmpQ)::vector_type> Queue(TmpQ.takeVector());
+
+ Deleter DUQ(DAG, Queue);
+ for (SDNode *S : reverse(Queue)) {
+ if (S == nullptr)
+ continue;
+ DEBUG_WITH_TYPE("isel", {dbgs() << "HVX selecting: "; S->dump(&DAG);});
+ ISel.Select(S);
+ }
+}
+
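// A standalone sketch of the ordering built in HvxSelector::select() above:
// count, for every node in the sub-DAG, how many of its operands are also in
// the sub-DAG, queue the nodes whose count is zero (the leaves), then release
// users as their operand counts drop to zero. Visiting the resulting queue in
// reverse yields users before operands, which is the order the main
// instruction selector expects. The node names and toy graph are illustrative
// only, not LLVM API.
#include <cassert>
#include <map>
#include <string>
#include <vector>

int main() {
  // Operand edges of a tiny DAG: root -> {a, b}, a -> {c}, b -> {c}.
  std::map<std::string, std::vector<std::string>> Ops = {
      {"root", {"a", "b"}}, {"a", {"c"}}, {"b", {"c"}}, {"c", {}}};
  std::map<std::string, std::vector<std::string>> Users;
  std::map<std::string, unsigned> NumOps;
  for (auto &[N, O] : Ops) {
    NumOps[N] = O.size();
    for (auto &Op : O)
      Users[Op].push_back(N);
  }
  std::vector<std::string> Queue;
  for (auto &[N, Cnt] : NumOps)
    if (Cnt == 0)
      Queue.push_back(N); // leaves first
  for (std::size_t I = 0; I != Queue.size(); ++I)
    for (auto &U : Users[Queue[I]])
      if (--NumOps[U] == 0)
        Queue.push_back(U);
  // Reverse order = users before their operands: root, then a/b, then c.
  assert(Queue.back() == "root" && Queue.front() == "c");
  return 0;
}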
bool HvxSelector::scalarizeShuffle(ArrayRef<int> Mask, const SDLoc &dl,
MVT ResTy, SDValue Va, SDValue Vb,
SDNode *N) {
@@ -1379,12 +1435,7 @@ bool HvxSelector::scalarizeShuffle(ArrayRef<int> Mask, const SDLoc &dl,
// nodes, these nodes would not be selected (since the "local" selection
// only visits nodes that are not in AllNodes).
// To avoid this issue, remove all dead nodes from the DAG now.
- DAG.RemoveDeadNodes();
- DenseSet<SDNode*> AllNodes;
- for (SDNode &S : DAG.allnodes())
- AllNodes.insert(&S);
-
- Deleter DUA(DAG, AllNodes);
+// DAG.RemoveDeadNodes();
SmallVector<SDValue,128> Ops;
LLVMContext &Ctx = *DAG.getContext();
@@ -1434,57 +1485,9 @@ bool HvxSelector::scalarizeShuffle(ArrayRef<int> Mask, const SDLoc &dl,
}
assert(!N->use_empty());
- ISel.ReplaceNode(N, LV.getNode());
-
- if (AllNodes.count(LV.getNode())) {
- DAG.RemoveDeadNodes();
- return true;
- }
-
- // The lowered build-vector node will now need to be selected. It needs
- // to be done here because this node and its submodes are not included
- // in the main selection loop.
- // Implement essentially the same topological ordering algorithm as is
- // used in SelectionDAGISel.
-
- SetVector<SDNode*> SubNodes, TmpQ;
- std::map<SDNode*,unsigned> NumOps;
-
- SubNodes.insert(LV.getNode());
- for (unsigned I = 0; I != SubNodes.size(); ++I) {
- unsigned OpN = 0;
- SDNode *S = SubNodes[I];
- for (SDValue Op : S->ops()) {
- if (AllNodes.count(Op.getNode()))
- continue;
- SubNodes.insert(Op.getNode());
- ++OpN;
- }
- NumOps.insert({S, OpN});
- if (OpN == 0)
- TmpQ.insert(S);
- }
-
- for (unsigned I = 0; I != TmpQ.size(); ++I) {
- SDNode *S = TmpQ[I];
- for (SDNode *U : S->uses()) {
- if (!SubNodes.count(U))
- continue;
- auto F = NumOps.find(U);
- assert(F != NumOps.end());
- assert(F->second > 0);
- if (!--F->second)
- TmpQ.insert(F->first);
- }
- }
- assert(SubNodes.size() == TmpQ.size());
- NullifyingVector<decltype(TmpQ)::vector_type> Queue(TmpQ.takeVector());
-
- Deleter DUQ(DAG, Queue);
- for (SDNode *S : reverse(Queue))
- if (S != nullptr)
- ISel.Select(S);
-
+ SDValue IS = DAG.getNode(HexagonISD::ISEL, dl, ResTy, LV);
+ ISel.ReplaceNode(N, IS.getNode());
+ select(IS.getNode());
DAG.RemoveDeadNodes();
return true;
}
@@ -1683,7 +1686,7 @@ OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) {
// The result length must be the same as the length of a single vector,
// or a vector pair.
assert(LogLen == HwLog || LogLen == HwLog+1);
- bool Extend = (LogLen == HwLog);
+ bool HavePairs = LogLen == HwLog+1;
if (!isPermutation(SM.Mask))
return OpRef::fail();
@@ -1767,6 +1770,22 @@ OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) {
// E 1 1 1 0 7 0 1 1 1 7 0 1 1 1 7 0 1 1 1
// F 1 1 1 1 F 1 1 1 1 F 1 1 1 1 F 1 1 1 1
+ // There is one special case that is not a perfect shuffle, but
+ // can be turned into one easily: when the shuffle operates on
+ // a vector pair, but the two vectors in the pair are swapped.
+ // The code below that identifies perfect shuffles will reject
+ // it, unless the order is reversed.
+ SmallVector<int,128> MaskStorage(SM.Mask.begin(), SM.Mask.end());
+ bool InvertedPair = false;
+ if (HavePairs && SM.Mask[0] >= int(HwLen)) {
+ for (int i = 0, e = SM.Mask.size(); i != e; ++i) {
+ int M = SM.Mask[i];
+ MaskStorage[i] = M >= int(HwLen) ? M-HwLen : M+HwLen;
+ }
+ InvertedPair = true;
+ }
+ ArrayRef<int> LocalMask(MaskStorage);
+
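// A standalone sketch of the "inverted pair" normalization added above: when
// a shuffle over a vector pair starts by reading the high vector, adding or
// subtracting HwLen swaps the two halves of the index space so the
// perfect-shuffle matcher can still recognize the mask. HwLen and the mask
// below are toy values, far smaller than a real HVX vector.
#include <cassert>
#include <vector>

int main() {
  const int HwLen = 4;
  // Mask reads the high vector first, then the low one: {V1, V0}.
  std::vector<int> Mask = {4, 5, 6, 7, 0, 1, 2, 3};
  std::vector<int> Local(Mask);
  bool InvertedPair = false;
  if (Mask[0] >= HwLen) {
    for (std::size_t i = 0; i != Mask.size(); ++i)
      Local[i] = Mask[i] >= HwLen ? Mask[i] - HwLen : Mask[i] + HwLen;
    InvertedPair = true;
  }
  // After normalization the mask is the identity, a trivially perfect
  // shuffle; the real code then swaps the input halves to compensate.
  assert(InvertedPair && (Local == std::vector<int>{0, 1, 2, 3, 4, 5, 6, 7}));
  return 0;
}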
auto XorPow2 = [] (ArrayRef<int> Mask, unsigned Num) {
unsigned X = Mask[0] ^ Mask[Num/2];
// Check that the first half has the X's bits clear.
@@ -1786,12 +1805,12 @@ OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) {
assert(VecLen > 2);
for (unsigned I = VecLen; I >= 2; I >>= 1) {
// Examine the initial segment of Mask of size I.
- unsigned X = XorPow2(SM.Mask, I);
+ unsigned X = XorPow2(LocalMask, I);
if (!isPowerOf2_32(X))
return OpRef::fail();
// Check the other segments of Mask.
for (int J = I; J < VecLen; J += I) {
- if (XorPow2(SM.Mask.slice(J, I), I) != X)
+ if (XorPow2(LocalMask.slice(J, I), I) != X)
return OpRef::fail();
}
Perm[Log2_32(X)] = Log2_32(I)-1;
@@ -1895,20 +1914,40 @@ OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) {
}
}
+ // From the cycles, construct the sequence of values that will
+ // then form the control values for vdealvdd/vshuffvdd, i.e.
+ // (M a1 a2)(M a3 a4 a5)... -> a1 a2 a3 a4 a5
+ // This essentially strips the M value from the cycles where
+ // it's present, and performs the insertion of M (then stripping)
+ // for cycles without M (as described in an earlier comment).
SmallVector<unsigned,8> SwapElems;
- if (HwLen == unsigned(VecLen))
+ // When the input is extended (i.e. single vector becomes a pair),
+ // this is done by using an "undef" vector as the second input.
+ // However, then we get
+ // input 1: GOODBITS
+ // input 2: ........
+ // but we need
+ // input 1: ....BITS
+ // input 2: ....GOOD
+ // Then at the end, this needs to be undone. To accomplish this,
+ // artificially add "LogLen-1" at both ends of the sequence.
+ if (!HavePairs)
SwapElems.push_back(LogLen-1);
-
for (const CycleType &C : Cycles) {
+ // Do the transformation: (a1..an) -> (M a1..an)(M a1).
unsigned First = (C[0] == LogLen-1) ? 1 : 0;
SwapElems.append(C.begin()+First, C.end());
if (First == 0)
SwapElems.push_back(C[0]);
}
+ if (!HavePairs)
+ SwapElems.push_back(LogLen-1);
const SDLoc &dl(Results.InpNode);
- OpRef Arg = !Extend ? Va
- : concat(Va, OpRef::undef(SingleTy), Results);
+ OpRef Arg = HavePairs ? Va
+ : concat(Va, OpRef::undef(SingleTy), Results);
+ if (InvertedPair)
+ Arg = concat(OpRef::hi(Arg), OpRef::lo(Arg), Results);
for (unsigned I = 0, E = SwapElems.size(); I != E; ) {
bool IsInc = I == E-1 || SwapElems[I] < SwapElems[I+1];
@@ -1932,7 +1971,7 @@ OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) {
Arg = OpRef::res(Results.top());
}
- return !Extend ? Arg : OpRef::lo(Arg);
+ return HavePairs ? Arg : OpRef::lo(Arg);
}
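// A standalone sketch of how the control sequence for vdealvdd/vshuffvdd is
// assembled from the permutation cycles in the function above: the M value
// (LogLen-1) is stripped from cycles that contain it, inserted and then
// stripped for cycles that do not, and when the input was widened from a
// single vector an extra LogLen-1 brackets the whole sequence. The cycles
// below are toy data, not a real HVX permutation.
#include <cassert>
#include <vector>

int main() {
  const unsigned LogLen = 4;    // log2 of a toy vector length
  const bool HavePairs = false; // single input widened with an undef half
  using Cycle = std::vector<unsigned>;
  std::vector<Cycle> Cycles = {{3, 0, 1}, {2, 1}}; // 3 == LogLen-1 here
  std::vector<unsigned> SwapElems;
  if (!HavePairs)
    SwapElems.push_back(LogLen - 1);
  for (const Cycle &C : Cycles) {
    unsigned First = (C[0] == LogLen - 1) ? 1 : 0; // strip a leading M
    SwapElems.insert(SwapElems.end(), C.begin() + First, C.end());
    if (First == 0) // (a1..an) -> (M a1..an)(M a1)
      SwapElems.push_back(C[0]);
  }
  if (!HavePairs)
    SwapElems.push_back(LogLen - 1);
  assert((SwapElems == std::vector<unsigned>{3, 0, 1, 2, 1, 2, 3}));
  return 0;
}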
OpRef HvxSelector::butterfly(ShuffleMask SM, OpRef Va, ResultStack &Results) {
@@ -1996,7 +2035,7 @@ SDValue HvxSelector::getVectorConstant(ArrayRef<uint8_t> Data,
SDValue BV = DAG.getBuildVector(VecTy, dl, Elems);
SDValue LV = Lower.LowerOperation(BV, DAG);
DAG.RemoveDeadNode(BV.getNode());
- return LV;
+ return DAG.getNode(HexagonISD::ISEL, dl, VecTy, LV);
}
void HvxSelector::selectShuffle(SDNode *N) {
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 768fea639cf9..c8994a3a28a3 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -1517,8 +1517,11 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setMinimumJumpTableEntries(std::numeric_limits<unsigned>::max());
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
- setOperationAction(ISD::ABS, MVT::i32, Legal);
- setOperationAction(ISD::ABS, MVT::i64, Legal);
+ for (unsigned LegalIntOp :
+ {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) {
+ setOperationAction(LegalIntOp, MVT::i32, Legal);
+ setOperationAction(LegalIntOp, MVT::i64, Legal);
+ }
// Hexagon has A4_addp_c and A4_subp_c that take and generate a carry bit,
// but they only operate on i64.
@@ -1620,7 +1623,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
ISD::BUILD_VECTOR, ISD::SCALAR_TO_VECTOR,
ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT,
ISD::EXTRACT_SUBVECTOR, ISD::INSERT_SUBVECTOR,
- ISD::CONCAT_VECTORS, ISD::VECTOR_SHUFFLE
+ ISD::CONCAT_VECTORS, ISD::VECTOR_SHUFFLE,
+ ISD::SPLAT_VECTOR,
};
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
@@ -1677,6 +1681,16 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::AND, NativeVT, Legal);
setOperationAction(ISD::OR, NativeVT, Legal);
setOperationAction(ISD::XOR, NativeVT, Legal);
+
+ if (NativeVT.getVectorElementType() != MVT::i1)
+ setOperationAction(ISD::SPLAT_VECTOR, NativeVT, Legal);
+ }
+
+ for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32}) {
+ setOperationAction(ISD::SMIN, VT, Legal);
+ setOperationAction(ISD::SMAX, VT, Legal);
+ setOperationAction(ISD::UMIN, VT, Legal);
+ setOperationAction(ISD::UMAX, VT, Legal);
}
// Custom lower unaligned loads.
@@ -1843,15 +1857,12 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
case HexagonISD::VASL: return "HexagonISD::VASL";
case HexagonISD::VASR: return "HexagonISD::VASR";
case HexagonISD::VLSR: return "HexagonISD::VLSR";
- case HexagonISD::VSPLAT: return "HexagonISD::VSPLAT";
case HexagonISD::VEXTRACTW: return "HexagonISD::VEXTRACTW";
case HexagonISD::VINSERTW0: return "HexagonISD::VINSERTW0";
case HexagonISD::VROR: return "HexagonISD::VROR";
case HexagonISD::READCYCLE: return "HexagonISD::READCYCLE";
case HexagonISD::PTRUE: return "HexagonISD::PTRUE";
case HexagonISD::PFALSE: return "HexagonISD::PFALSE";
- case HexagonISD::VZERO: return "HexagonISD::VZERO";
- case HexagonISD::VSPLATW: return "HexagonISD::VSPLATW";
case HexagonISD::D2P: return "HexagonISD::D2P";
case HexagonISD::P2D: return "HexagonISD::P2D";
case HexagonISD::V2Q: return "HexagonISD::V2Q";
@@ -1862,6 +1873,10 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
case HexagonISD::TYPECAST: return "HexagonISD::TYPECAST";
case HexagonISD::VALIGN: return "HexagonISD::VALIGN";
case HexagonISD::VALIGNADDR: return "HexagonISD::VALIGNADDR";
+ case HexagonISD::VPACKL: return "HexagonISD::VPACKL";
+ case HexagonISD::VUNPACK: return "HexagonISD::VUNPACK";
+ case HexagonISD::VUNPACKU: return "HexagonISD::VUNPACKU";
+ case HexagonISD::ISEL: return "HexagonISD::ISEL";
case HexagonISD::OP_END: break;
}
return nullptr;
@@ -2064,20 +2079,9 @@ HexagonTargetLowering::getPreferredVectorAction(MVT VT) const {
return TargetLoweringBase::TypeScalarizeVector;
if (Subtarget.useHVXOps()) {
- unsigned HwLen = Subtarget.getVectorLength();
- // If the size of VT is at least half of the vector length,
- // widen the vector. Note: the threshold was not selected in
- // any scientific way.
- ArrayRef<MVT> Tys = Subtarget.getHVXElementTypes();
- if (llvm::find(Tys, ElemTy) != Tys.end()) {
- unsigned HwWidth = 8*HwLen;
- unsigned VecWidth = VT.getSizeInBits();
- if (VecWidth >= HwWidth/2 && VecWidth < HwWidth)
- return TargetLoweringBase::TypeWidenVector;
- }
- // Split vectors of i1 that correspond to (byte) vector pairs.
- if (ElemTy == MVT::i1 && VecLen == 2*HwLen)
- return TargetLoweringBase::TypeSplitVector;
+ unsigned Action = getPreferredHvxVectorAction(VT);
+ if (Action != ~0u)
+ return static_cast<TargetLoweringBase::LegalizeTypeAction>(Action);
}
// Always widen (remaining) vectors of i1.
@@ -2229,26 +2233,33 @@ HexagonTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
SDValue
HexagonTargetLowering::getVectorShiftByInt(SDValue Op, SelectionDAG &DAG)
const {
- if (auto *BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode())) {
- if (SDValue S = BVN->getSplatValue()) {
- unsigned NewOpc;
- switch (Op.getOpcode()) {
- case ISD::SHL:
- NewOpc = HexagonISD::VASL;
- break;
- case ISD::SRA:
- NewOpc = HexagonISD::VASR;
- break;
- case ISD::SRL:
- NewOpc = HexagonISD::VLSR;
- break;
- default:
- llvm_unreachable("Unexpected shift opcode");
- }
- return DAG.getNode(NewOpc, SDLoc(Op), ty(Op), Op.getOperand(0), S);
- }
+ unsigned NewOpc;
+ switch (Op.getOpcode()) {
+ case ISD::SHL:
+ NewOpc = HexagonISD::VASL;
+ break;
+ case ISD::SRA:
+ NewOpc = HexagonISD::VASR;
+ break;
+ case ISD::SRL:
+ NewOpc = HexagonISD::VLSR;
+ break;
+ default:
+ llvm_unreachable("Unexpected shift opcode");
}
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ const SDLoc &dl(Op);
+
+ switch (Op1.getOpcode()) {
+ case ISD::BUILD_VECTOR:
+ if (SDValue S = cast<BuildVectorSDNode>(Op1)->getSplatValue())
+ return DAG.getNode(NewOpc, dl, ty(Op), Op0, S);
+ break;
+ case ISD::SPLAT_VECTOR:
+ return DAG.getNode(NewOpc, dl, ty(Op), Op0, Op1.getOperand(0));
+ }
return SDValue();
}
@@ -2325,9 +2336,10 @@ HexagonTargetLowering::buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl,
bool AllConst = getBuildVectorConstInts(Elem, VecTy, DAG, Consts);
unsigned First, Num = Elem.size();
- for (First = 0; First != Num; ++First)
+ for (First = 0; First != Num; ++First) {
if (!isUndef(Elem[First]))
break;
+ }
if (First == Num)
return DAG.getUNDEF(VecTy);
@@ -2359,18 +2371,16 @@ HexagonTargetLowering::buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl,
// Then try splat.
bool IsSplat = true;
- for (unsigned i = 0; i != Num; ++i) {
- if (i == First)
- continue;
+ for (unsigned i = First+1; i != Num; ++i) {
if (Elem[i] == Elem[First] || isUndef(Elem[i]))
continue;
IsSplat = false;
break;
}
if (IsSplat) {
- // Legalize the operand to VSPLAT.
+ // Legalize the operand of SPLAT_VECTOR.
SDValue Ext = DAG.getZExtOrTrunc(Elem[First], dl, MVT::i32);
- return DAG.getNode(HexagonISD::VSPLAT, dl, VecTy, Ext);
+ return DAG.getNode(ISD::SPLAT_VECTOR, dl, VecTy, Ext);
}
// Generate
@@ -2408,9 +2418,10 @@ HexagonTargetLowering::buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl,
bool AllConst = getBuildVectorConstInts(Elem, VecTy, DAG, Consts);
unsigned First, Num = Elem.size();
- for (First = 0; First != Num; ++First)
+ for (First = 0; First != Num; ++First) {
if (!isUndef(Elem[First]))
break;
+ }
if (First == Num)
return DAG.getUNDEF(VecTy);
@@ -2421,18 +2432,16 @@ HexagonTargetLowering::buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl,
// First try splat if possible.
if (ElemTy == MVT::i16) {
bool IsSplat = true;
- for (unsigned i = 0; i != Num; ++i) {
- if (i == First)
- continue;
+ for (unsigned i = First+1; i != Num; ++i) {
if (Elem[i] == Elem[First] || isUndef(Elem[i]))
continue;
IsSplat = false;
break;
}
if (IsSplat) {
- // Legalize the operand to VSPLAT.
+ // Legalize the operand of SPLAT_VECTOR
SDValue Ext = DAG.getZExtOrTrunc(Elem[First], dl, MVT::i32);
- return DAG.getNode(HexagonISD::VSPLAT, dl, VecTy, Ext);
+ return DAG.getNode(ISD::SPLAT_VECTOR, dl, VecTy, Ext);
}
}
@@ -2650,7 +2659,7 @@ HexagonTargetLowering::getZero(const SDLoc &dl, MVT Ty, SelectionDAG &DAG)
unsigned W = Ty.getSizeInBits();
if (W <= 64)
return DAG.getBitcast(Ty, DAG.getConstant(0, dl, MVT::getIntegerVT(W)));
- return DAG.getNode(HexagonISD::VZERO, dl, Ty);
+ return DAG.getNode(ISD::SPLAT_VECTOR, dl, Ty, getZero(dl, MVT::i32, DAG));
}
if (Ty.isInteger())
@@ -2661,6 +2670,28 @@ HexagonTargetLowering::getZero(const SDLoc &dl, MVT Ty, SelectionDAG &DAG)
}
SDValue
+HexagonTargetLowering::appendUndef(SDValue Val, MVT ResTy, SelectionDAG &DAG)
+ const {
+ MVT ValTy = ty(Val);
+ assert(ValTy.getVectorElementType() == ResTy.getVectorElementType());
+
+ unsigned ValLen = ValTy.getVectorNumElements();
+ unsigned ResLen = ResTy.getVectorNumElements();
+ if (ValLen == ResLen)
+ return Val;
+
+ const SDLoc &dl(Val);
+ assert(ValLen < ResLen);
+ assert(ResLen % ValLen == 0);
+
+ SmallVector<SDValue, 4> Concats = {Val};
+ for (unsigned i = 1, e = ResLen / ValLen; i < e; ++i)
+ Concats.push_back(DAG.getUNDEF(ValTy));
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResTy, Concats);
+}
+
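// A standalone sketch of the arithmetic behind appendUndef() above: the value
// keeps its element type and is followed by ResLen/ValLen - 1 undefined
// chunks of the same narrow type, so only the element counts matter here.
// The vector types in the comments are examples, not a claim about which
// types Hexagon actually widens.
#include <cassert>

int main() {
  unsigned ValLen = 2; // e.g. a v2i32 value
  unsigned ResLen = 8; // widened to v8i32
  assert(ResLen % ValLen == 0 && ValLen < ResLen);
  unsigned UndefChunks = ResLen / ValLen - 1; // operands appended after Val
  // CONCAT_VECTORS(Val, undef, undef, undef) -> 1 + UndefChunks operands.
  assert(UndefChunks == 3);
  return 0;
}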
+SDValue
HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
MVT VecTy = ty(Op);
unsigned BW = VecTy.getSizeInBits();
@@ -2910,8 +2941,10 @@ HexagonTargetLowering::LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG)
? DAG.getNode(HexagonISD::VALIGNADDR, dl, MVT::i32, BO.first,
DAG.getConstant(NeedAlign, dl, MVT::i32))
: BO.first;
- SDValue Base0 = DAG.getMemBasePlusOffset(BaseNoOff, BO.second, dl);
- SDValue Base1 = DAG.getMemBasePlusOffset(BaseNoOff, BO.second+LoadLen, dl);
+ SDValue Base0 =
+ DAG.getMemBasePlusOffset(BaseNoOff, TypeSize::Fixed(BO.second), dl);
+ SDValue Base1 = DAG.getMemBasePlusOffset(
+ BaseNoOff, TypeSize::Fixed(BO.second + LoadLen), dl);
MachineMemOperand *WideMMO = nullptr;
if (MachineMemOperand *MMO = LN->getMemOperand()) {
@@ -3023,7 +3056,7 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
if (Opc == ISD::INLINEASM || Opc == ISD::INLINEASM_BR)
return LowerINLINEASM(Op, DAG);
- if (isHvxOperation(Op)) {
+ if (isHvxOperation(Op.getNode(), DAG)) {
// If HVX lowering returns nothing, try the default lowering.
if (SDValue V = LowerHvxOperation(Op, DAG))
return V;
@@ -3084,7 +3117,7 @@ void
HexagonTargetLowering::LowerOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
- if (isHvxOperation(N)) {
+ if (isHvxOperation(N, DAG)) {
LowerHvxOperationWrapper(N, Results, DAG);
if (!Results.empty())
return;
@@ -3103,7 +3136,7 @@ void
HexagonTargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
- if (isHvxOperation(N)) {
+ if (isHvxOperation(N, DAG)) {
ReplaceHvxNodeResults(N, Results, DAG);
if (!Results.empty())
return;
@@ -3118,10 +3151,12 @@ HexagonTargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::BITCAST:
// Handle a bitcast from v8i1 to i8.
if (N->getValueType(0) == MVT::i8) {
- SDValue P = getInstr(Hexagon::C2_tfrpr, dl, MVT::i32,
- N->getOperand(0), DAG);
- SDValue T = DAG.getAnyExtOrTrunc(P, dl, MVT::i8);
- Results.push_back(T);
+ if (N->getOperand(0).getValueType() == MVT::v8i1) {
+ SDValue P = getInstr(Hexagon::C2_tfrpr, dl, MVT::i32,
+ N->getOperand(0), DAG);
+ SDValue T = DAG.getAnyExtOrTrunc(P, dl, MVT::i8);
+ Results.push_back(T);
+ }
}
break;
}
@@ -3130,13 +3165,16 @@ HexagonTargetLowering::ReplaceNodeResults(SDNode *N,
SDValue
HexagonTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
const {
- SDValue Op(N, 0);
- if (isHvxOperation(Op)) {
+ if (isHvxOperation(N, DCI.DAG)) {
if (SDValue V = PerformHvxDAGCombine(N, DCI))
return V;
return SDValue();
}
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDValue Op(N, 0);
const SDLoc &dl(Op);
unsigned Opc = Op.getOpcode();
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.h
index 7d6e6b6185c8..cfccb14a09c9 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -15,6 +15,7 @@
#define LLVM_LIB_TARGET_HEXAGON_HEXAGONISELLOWERING_H
#include "Hexagon.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -30,464 +31,478 @@ namespace llvm {
namespace HexagonISD {
- enum NodeType : unsigned {
- OP_BEGIN = ISD::BUILTIN_OP_END,
-
- CONST32 = OP_BEGIN,
- CONST32_GP, // For marking data present in GP.
- ADDC, // Add with carry: (X, Y, Cin) -> (X+Y, Cout).
- SUBC, // Sub with carry: (X, Y, Cin) -> (X+~Y+Cin, Cout).
- ALLOCA,
-
- AT_GOT, // Index in GOT.
- AT_PCREL, // Offset relative to PC.
-
- CALL, // Function call.
- CALLnr, // Function call that does not return.
- CALLR,
-
- RET_FLAG, // Return with a flag operand.
- BARRIER, // Memory barrier.
- JT, // Jump table.
- CP, // Constant pool.
-
- COMBINE,
- VSPLAT, // Generic splat, selection depends on argument/return
- // types.
- VASL,
- VASR,
- VLSR,
-
- TSTBIT,
- INSERT,
- EXTRACTU,
- VEXTRACTW,
- VINSERTW0,
- VROR,
- TC_RETURN,
- EH_RETURN,
- DCFETCH,
- READCYCLE,
- PTRUE,
- PFALSE,
- D2P, // Convert 8-byte value to 8-bit predicate register. [*]
- P2D, // Convert 8-bit predicate register to 8-byte value. [*]
- V2Q, // Convert HVX vector to a vector predicate reg. [*]
- Q2V, // Convert vector predicate to an HVX vector. [*]
- // [*] The equivalence is defined as "Q <=> (V != 0)",
- // where the != operation compares bytes.
- // Note: V != 0 is implemented as V >u 0.
- QCAT,
- QTRUE,
- QFALSE,
- VZERO,
- VSPLATW, // HVX splat of a 32-bit word with an arbitrary result type.
- TYPECAST, // No-op that's used to convert between different legal
- // types in a register.
- VALIGN, // Align two vectors (in Op0, Op1) to one that would have
- // been loaded from address in Op2.
- VALIGNADDR, // Align vector address: Op0 & -Op1, except when it is
- // an address in a vector load, then it's a no-op.
- OP_END
- };
+enum NodeType : unsigned {
+ OP_BEGIN = ISD::BUILTIN_OP_END,
+
+ CONST32 = OP_BEGIN,
+ CONST32_GP, // For marking data present in GP.
+ ADDC, // Add with carry: (X, Y, Cin) -> (X+Y, Cout).
+ SUBC, // Sub with carry: (X, Y, Cin) -> (X+~Y+Cin, Cout).
+ ALLOCA,
+
+ AT_GOT, // Index in GOT.
+ AT_PCREL, // Offset relative to PC.
+
+ CALL, // Function call.
+ CALLnr, // Function call that does not return.
+ CALLR,
+
+ RET_FLAG, // Return with a flag operand.
+ BARRIER, // Memory barrier.
+ JT, // Jump table.
+ CP, // Constant pool.
+
+ COMBINE,
+ VASL,
+ VASR,
+ VLSR,
+
+ TSTBIT,
+ INSERT,
+ EXTRACTU,
+ VEXTRACTW,
+ VINSERTW0,
+ VROR,
+ TC_RETURN,
+ EH_RETURN,
+ DCFETCH,
+ READCYCLE,
+ PTRUE,
+ PFALSE,
+ D2P, // Convert 8-byte value to 8-bit predicate register. [*]
+ P2D, // Convert 8-bit predicate register to 8-byte value. [*]
+ V2Q, // Convert HVX vector to a vector predicate reg. [*]
+ Q2V, // Convert vector predicate to an HVX vector. [*]
+ // [*] The equivalence is defined as "Q <=> (V != 0)",
+ // where the != operation compares bytes.
+ // Note: V != 0 is implemented as V >u 0.
+ QCAT,
+ QTRUE,
+ QFALSE,
+ TYPECAST, // No-op that's used to convert between different legal
+ // types in a register.
+ VALIGN, // Align two vectors (in Op0, Op1) to one that would have
+ // been loaded from address in Op2.
+ VALIGNADDR, // Align vector address: Op0 & -Op1, except when it is
+ // an address in a vector load, then it's a no-op.
+ VPACKL, // Pack low parts of the input vector to the front of the
+ // output. For example v64i16 VPACKL(v32i32) will pick
+ // the low halfwords and pack them into the first 32
+ // halfwords of the output. The rest of the output is
+ // unspecified.
+ VUNPACK, // Unpacking into low elements with sign extension.
+ VUNPACKU, // Unpacking into low elements with zero extension.
+ ISEL, // Marker for nodes that were created during ISel, and
+ // which need explicit selection (would have been left
+ // unselected otherwise).
+ OP_END
+};
} // end namespace HexagonISD
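// A standalone model of the VPACKL semantics described in the enum comment
// above: the low part of every input lane is packed to the front of the
// output and the tail is left unspecified. The lane counts below are toy
// sizes; a real v32i32 -> v64i16 VPACKL behaves the same way, just wider.
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  std::vector<uint32_t> In = {0x11112222, 0x33334444, 0x55556666, 0x77778888};
  std::vector<uint16_t> Out(2 * In.size(), 0xDEAD); // tail stays unspecified
  for (std::size_t i = 0; i != In.size(); ++i)
    Out[i] = static_cast<uint16_t>(In[i]);          // keep the low halfword
  assert(Out[0] == 0x2222 && Out[3] == 0x8888);
  assert(Out.size() == 2 * In.size());              // first half packed, rest junk
  return 0;
}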
- class HexagonSubtarget;
-
- class HexagonTargetLowering : public TargetLowering {
- int VarArgsFrameOffset; // Frame offset to start of varargs area.
- const HexagonTargetMachine &HTM;
- const HexagonSubtarget &Subtarget;
-
- bool CanReturnSmallStruct(const Function* CalleeFn, unsigned& RetSize)
- const;
-
- public:
- explicit HexagonTargetLowering(const TargetMachine &TM,
- const HexagonSubtarget &ST);
-
- bool isHVXVectorType(MVT Ty) const;
-
- /// IsEligibleForTailCallOptimization - Check whether the call is eligible
- /// for tail call optimization. Targets which want to do tail call
- /// optimization should implement this function.
- bool IsEligibleForTailCallOptimization(SDValue Callee,
- CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet,
- bool isCallerStructRet, const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG& DAG) const;
-
- bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
- MachineFunction &MF,
- unsigned Intrinsic) const override;
-
- bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
- bool isTruncateFree(EVT VT1, EVT VT2) const override;
-
- bool isCheapToSpeculateCttz() const override { return true; }
- bool isCheapToSpeculateCtlz() const override { return true; }
- bool isCtlzFast() const override { return true; }
-
- bool hasBitTest(SDValue X, SDValue Y) const override;
-
- bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
-
- /// Return true if an FMA operation is faster than a pair of mul and add
- /// instructions. fmuladd intrinsics will be expanded to FMAs when this
- /// method returns true (and FMAs are legal), otherwise fmuladd is
- /// expanded to mul + add.
- bool isFMAFasterThanFMulAndFAdd(const MachineFunction &,
- EVT) const override;
-
- // Should we expand the build vector with shuffles?
- bool shouldExpandBuildVectorWithShuffles(EVT VT,
- unsigned DefinedValues) const override;
-
- bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
- TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT)
- const override;
-
- SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
- void LowerOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG) const override;
- void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG) const override;
-
- const char *getTargetNodeName(unsigned Opcode) const override;
-
- SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerVECTOR_SHIFT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerANY_EXTEND(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerLoad(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerStore(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerUAddSubO(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerAddSubCarry(SDValue Op, SelectionDAG &DAG) const;
-
- SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerEH_LABEL(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
- SDValue
- LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- const SDLoc &dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const override;
- SDValue LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
- SelectionDAG &DAG) const;
- SDValue LowerToTLSInitialExecModel(GlobalAddressSDNode *GA,
- SelectionDAG &DAG) const;
- SDValue LowerToTLSLocalExecModel(GlobalAddressSDNode *GA,
- SelectionDAG &DAG) const;
- SDValue GetDynamicTLSAddr(SelectionDAG &DAG, SDValue Chain,
- GlobalAddressSDNode *GA, SDValue InFlag, EVT PtrVT,
- unsigned ReturnReg, unsigned char OperandFlags) const;
- SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const;
-
- SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
- SmallVectorImpl<SDValue> &InVals) const override;
- SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- const SDLoc &dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals,
- const SmallVectorImpl<SDValue> &OutVals,
- SDValue Callee) const;
-
- SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const;
- SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
-
- bool CanLowerReturn(CallingConv::ID CallConv,
- MachineFunction &MF, bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- LLVMContext &Context) const override;
-
- SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SDLoc &dl, SelectionDAG &DAG) const override;
-
- SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
-
- bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
-
- Register getRegisterByName(const char* RegName, LLT VT,
- const MachineFunction &MF) const override;
-
- /// If a physical register, this returns the register that receives the
- /// exception address on entry to an EH pad.
- Register
- getExceptionPointerRegister(const Constant *PersonalityFn) const override {
- return Hexagon::R0;
- }
-
- /// If a physical register, this returns the register that receives the
- /// exception typeid on entry to a landing pad.
- Register
- getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
- return Hexagon::R1;
- }
-
- SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
-
- EVT getSetCCResultType(const DataLayout &, LLVMContext &C,
- EVT VT) const override {
- if (!VT.isVector())
- return MVT::i1;
- else
- return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements());
- }
-
- bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
- SDValue &Base, SDValue &Offset,
- ISD::MemIndexedMode &AM,
- SelectionDAG &DAG) const override;
-
- ConstraintType getConstraintType(StringRef Constraint) const override;
-
- std::pair<unsigned, const TargetRegisterClass *>
- getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- StringRef Constraint, MVT VT) const override;
-
- unsigned
- getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
- if (ConstraintCode == "o")
- return InlineAsm::Constraint_o;
- return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
- }
-
- // Intrinsics
- SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
- /// isLegalAddressingMode - Return true if the addressing mode represented
- /// by AM is legal for this target, for a load/store of the specified type.
- /// The type may be VoidTy, in which case only return true if the addressing
- /// mode is legal for a load/store of any legal type.
- /// TODO: Handle pre/postinc as well.
- bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
- Type *Ty, unsigned AS,
- Instruction *I = nullptr) const override;
- /// Return true if folding a constant offset with the given GlobalAddress
- /// is legal. It is frequently not legal in PIC relocation models.
- bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
-
- bool isFPImmLegal(const APFloat &Imm, EVT VT,
- bool ForCodeSize) const override;
-
- /// isLegalICmpImmediate - Return true if the specified immediate is legal
- /// icmp immediate, that is the target has icmp instructions which can
- /// compare a register against the immediate without having to materialize
- /// the immediate into a register.
- bool isLegalICmpImmediate(int64_t Imm) const override;
-
- EVT getOptimalMemOpType(const MemOp &Op,
- const AttributeList &FuncAttributes) const override;
-
- bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT,
- unsigned AddrSpace, Align Alignment,
- MachineMemOperand::Flags Flags,
- bool *Fast) const override;
-
- bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
- unsigned Alignment, MachineMemOperand::Flags Flags, bool *Fast)
- const override;
-
- /// Returns relocation base for the given PIC jumptable.
- SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG)
- const override;
-
- bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
- EVT NewVT) const override;
-
- // Handling of atomic RMW instructions.
- Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
- AtomicOrdering Ord) const override;
- Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
- Value *Addr, AtomicOrdering Ord) const override;
- AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
- bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
- AtomicExpansionKind
- shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
-
- AtomicExpansionKind
- shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override {
- return AtomicExpansionKind::LLSC;
- }
-
- private:
- void initializeHVXLowering();
- void validateConstPtrAlignment(SDValue Ptr, const SDLoc &dl,
- unsigned NeedAlign) const;
-
- std::pair<SDValue,int> getBaseAndOffset(SDValue Addr) const;
-
- bool getBuildVectorConstInts(ArrayRef<SDValue> Values, MVT VecTy,
- SelectionDAG &DAG,
- MutableArrayRef<ConstantInt*> Consts) const;
- SDValue buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl, MVT VecTy,
+class HexagonSubtarget;
+
+class HexagonTargetLowering : public TargetLowering {
+ int VarArgsFrameOffset; // Frame offset to start of varargs area.
+ const HexagonTargetMachine &HTM;
+ const HexagonSubtarget &Subtarget;
+
+ bool CanReturnSmallStruct(const Function* CalleeFn, unsigned& RetSize)
+ const;
+
+public:
+ explicit HexagonTargetLowering(const TargetMachine &TM,
+ const HexagonSubtarget &ST);
+
+ bool isHVXVectorType(MVT Ty) const;
+
+ /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+ /// for tail call optimization. Targets which want to do tail call
+ /// optimization should implement this function.
+ bool IsEligibleForTailCallOptimization(SDValue Callee,
+ CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet,
+ bool isCallerStructRet, const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG& DAG) const;
+
+ bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+ MachineFunction &MF,
+ unsigned Intrinsic) const override;
+
+ bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+ bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+ bool isCheapToSpeculateCttz() const override { return true; }
+ bool isCheapToSpeculateCtlz() const override { return true; }
+ bool isCtlzFast() const override { return true; }
+
+ bool hasBitTest(SDValue X, SDValue Y) const override;
+
+ bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
+
+ /// Return true if an FMA operation is faster than a pair of mul and add
+ /// instructions. fmuladd intrinsics will be expanded to FMAs when this
+ /// method returns true (and FMAs are legal), otherwise fmuladd is
+ /// expanded to mul + add.
+ bool isFMAFasterThanFMulAndFAdd(const MachineFunction &,
+ EVT) const override;
+
+ // Should we expand the build vector with shuffles?
+ bool shouldExpandBuildVectorWithShuffles(EVT VT,
+ unsigned DefinedValues) const override;
+
+ bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
+ TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT)
+ const override;
+
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ void LowerOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECTOR_SHIFT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerANY_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerLoad(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerStore(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUAddSubO(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerAddSubCarry(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEH_LABEL(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue
+ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG) const;
+ SDValue LowerToTLSInitialExecModel(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG) const;
+ SDValue LowerToTLSLocalExecModel(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG) const;
+ SDValue GetDynamicTLSAddr(SelectionDAG &DAG, SDValue Chain,
+ GlobalAddressSDNode *GA, SDValue InFlag, EVT PtrVT,
+ unsigned ReturnReg, unsigned char OperandFlags) const;
+ SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals,
+ const SmallVectorImpl<SDValue> &OutVals,
+ SDValue Callee) const;
+
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+
+ bool CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &dl, SelectionDAG &DAG) const override;
+
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+ bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
+
+ Register getRegisterByName(const char* RegName, LLT VT,
+ const MachineFunction &MF) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ Register
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override {
+ return Hexagon::R0;
+ }
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ Register
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
+ return Hexagon::R1;
+ }
+
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+
+ EVT getSetCCResultType(const DataLayout &, LLVMContext &C,
+ EVT VT) const override {
+ if (!VT.isVector())
+ return MVT::i1;
+ else
+ return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements());
+ }
+
+ bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+ SDValue &Base, SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const override;
+
+ ConstraintType getConstraintType(StringRef Constraint) const override;
+
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+
+ unsigned
+ getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+ if (ConstraintCode == "o")
+ return InlineAsm::Constraint_o;
+ return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+ }
+
+ // Intrinsics
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+ /// isLegalAddressingMode - Return true if the addressing mode represented
+ /// by AM is legal for this target, for a load/store of the specified type.
+ /// The type may be VoidTy, in which case only return true if the addressing
+ /// mode is legal for a load/store of any legal type.
+ /// TODO: Handle pre/postinc as well.
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+ Type *Ty, unsigned AS,
+ Instruction *I = nullptr) const override;
+ /// Return true if folding a constant offset with the given GlobalAddress
+ /// is legal. It is frequently not legal in PIC relocation models.
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+ bool isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const override;
+
+ /// isLegalICmpImmediate - Return true if the specified immediate is legal
+ /// icmp immediate, that is the target has icmp instructions which can
+ /// compare a register against the immediate without having to materialize
+ /// the immediate into a register.
+ bool isLegalICmpImmediate(int64_t Imm) const override;
+
+ EVT getOptimalMemOpType(const MemOp &Op,
+ const AttributeList &FuncAttributes) const override;
+
+ bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT,
+ unsigned AddrSpace, Align Alignment,
+ MachineMemOperand::Flags Flags,
+ bool *Fast) const override;
+
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
+ unsigned Alignment, MachineMemOperand::Flags Flags, bool *Fast)
+ const override;
+
+ /// Returns relocation base for the given PIC jumptable.
+ SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG)
+ const override;
+
+ bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
+ EVT NewVT) const override;
+
+ // Handling of atomic RMW instructions.
+ Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+ AtomicOrdering Ord) const override;
+ Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
+ Value *Addr, AtomicOrdering Ord) const override;
+ AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
+ bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+ AtomicExpansionKind
+ shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
+
+ AtomicExpansionKind
+ shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override {
+ return AtomicExpansionKind::LLSC;
+ }
+
+private:
+ void initializeHVXLowering();
+ unsigned getPreferredHvxVectorAction(MVT VecTy) const;
+
+ void validateConstPtrAlignment(SDValue Ptr, const SDLoc &dl,
+ unsigned NeedAlign) const;
+
+ std::pair<SDValue,int> getBaseAndOffset(SDValue Addr) const;
+
+ bool getBuildVectorConstInts(ArrayRef<SDValue> Values, MVT VecTy,
+ SelectionDAG &DAG,
+ MutableArrayRef<ConstantInt*> Consts) const;
+ SDValue buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl, MVT VecTy,
+ SelectionDAG &DAG) const;
+ SDValue buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl, MVT VecTy,
+ SelectionDAG &DAG) const;
+ SDValue extractVector(SDValue VecV, SDValue IdxV, const SDLoc &dl,
+ MVT ValTy, MVT ResTy, SelectionDAG &DAG) const;
+ SDValue insertVector(SDValue VecV, SDValue ValV, SDValue IdxV,
+ const SDLoc &dl, MVT ValTy, SelectionDAG &DAG) const;
+ SDValue expandPredicate(SDValue Vec32, const SDLoc &dl,
SelectionDAG &DAG) const;
- SDValue buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl, MVT VecTy,
- SelectionDAG &DAG) const;
- SDValue extractVector(SDValue VecV, SDValue IdxV, const SDLoc &dl,
- MVT ValTy, MVT ResTy, SelectionDAG &DAG) const;
- SDValue insertVector(SDValue VecV, SDValue ValV, SDValue IdxV,
- const SDLoc &dl, MVT ValTy, SelectionDAG &DAG) const;
- SDValue expandPredicate(SDValue Vec32, const SDLoc &dl,
+ SDValue contractPredicate(SDValue Vec64, const SDLoc &dl,
SelectionDAG &DAG) const;
- SDValue contractPredicate(SDValue Vec64, const SDLoc &dl,
+ SDValue getVectorShiftByInt(SDValue Op, SelectionDAG &DAG) const;
+ SDValue appendUndef(SDValue Val, MVT ResTy, SelectionDAG &DAG) const;
+
+ bool isUndef(SDValue Op) const {
+ if (Op.isMachineOpcode())
+ return Op.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF;
+ return Op.getOpcode() == ISD::UNDEF;
+ }
+ SDValue getInstr(unsigned MachineOpc, const SDLoc &dl, MVT Ty,
+ ArrayRef<SDValue> Ops, SelectionDAG &DAG) const {
+ SDNode *N = DAG.getMachineNode(MachineOpc, dl, Ty, Ops);
+ return SDValue(N, 0);
+ }
+ SDValue getZero(const SDLoc &dl, MVT Ty, SelectionDAG &DAG) const;
+
+ using VectorPair = std::pair<SDValue, SDValue>;
+ using TypePair = std::pair<MVT, MVT>;
+
+ SDValue getInt(unsigned IntId, MVT ResTy, ArrayRef<SDValue> Ops,
+ const SDLoc &dl, SelectionDAG &DAG) const;
+
+ MVT ty(SDValue Op) const {
+ return Op.getValueType().getSimpleVT();
+ }
+ TypePair ty(const VectorPair &Ops) const {
+ return { Ops.first.getValueType().getSimpleVT(),
+ Ops.second.getValueType().getSimpleVT() };
+ }
+ MVT tyScalar(MVT Ty) const {
+ if (!Ty.isVector())
+ return Ty;
+ return MVT::getIntegerVT(Ty.getSizeInBits());
+ }
+ MVT tyVector(MVT Ty, MVT ElemTy) const {
+ if (Ty.isVector() && Ty.getVectorElementType() == ElemTy)
+ return Ty;
+ unsigned TyWidth = Ty.getSizeInBits();
+ unsigned ElemWidth = ElemTy.getSizeInBits();
+ assert((TyWidth % ElemWidth) == 0);
+ return MVT::getVectorVT(ElemTy, TyWidth/ElemWidth);
+ }
+
+ MVT typeJoin(const TypePair &Tys) const;
+ TypePair typeSplit(MVT Ty) const;
+ MVT typeExtElem(MVT VecTy, unsigned Factor) const;
+ MVT typeTruncElem(MVT VecTy, unsigned Factor) const;
+
+ SDValue opJoin(const VectorPair &Ops, const SDLoc &dl,
+ SelectionDAG &DAG) const;
+ VectorPair opSplit(SDValue Vec, const SDLoc &dl, SelectionDAG &DAG) const;
+ SDValue opCastElem(SDValue Vec, MVT ElemTy, SelectionDAG &DAG) const;
+
+ bool allowsHvxMemoryAccess(MVT VecTy, MachineMemOperand::Flags Flags,
+ bool *Fast) const;
+ bool allowsHvxMisalignedMemoryAccesses(MVT VecTy,
+ MachineMemOperand::Flags Flags,
+ bool *Fast) const;
+
+ bool isHvxSingleTy(MVT Ty) const;
+ bool isHvxPairTy(MVT Ty) const;
+ bool isHvxBoolTy(MVT Ty) const;
+ SDValue convertToByteIndex(SDValue ElemIdx, MVT ElemTy,
+ SelectionDAG &DAG) const;
+ SDValue getIndexInWord32(SDValue Idx, MVT ElemTy, SelectionDAG &DAG) const;
+ SDValue getByteShuffle(const SDLoc &dl, SDValue Op0, SDValue Op1,
+ ArrayRef<int> Mask, SelectionDAG &DAG) const;
+
+ SDValue buildHvxVectorReg(ArrayRef<SDValue> Values, const SDLoc &dl,
+ MVT VecTy, SelectionDAG &DAG) const;
+ SDValue buildHvxVectorPred(ArrayRef<SDValue> Values, const SDLoc &dl,
+ MVT VecTy, SelectionDAG &DAG) const;
+ SDValue createHvxPrefixPred(SDValue PredV, const SDLoc &dl,
+ unsigned BitBytes, bool ZeroFill,
SelectionDAG &DAG) const;
- SDValue getVectorShiftByInt(SDValue Op, SelectionDAG &DAG) const;
-
- bool isUndef(SDValue Op) const {
- if (Op.isMachineOpcode())
- return Op.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF;
- return Op.getOpcode() == ISD::UNDEF;
- }
- SDValue getInstr(unsigned MachineOpc, const SDLoc &dl, MVT Ty,
- ArrayRef<SDValue> Ops, SelectionDAG &DAG) const {
- SDNode *N = DAG.getMachineNode(MachineOpc, dl, Ty, Ops);
- return SDValue(N, 0);
- }
- SDValue getZero(const SDLoc &dl, MVT Ty, SelectionDAG &DAG) const;
-
- using VectorPair = std::pair<SDValue, SDValue>;
- using TypePair = std::pair<MVT, MVT>;
-
- SDValue getInt(unsigned IntId, MVT ResTy, ArrayRef<SDValue> Ops,
- const SDLoc &dl, SelectionDAG &DAG) const;
-
- MVT ty(SDValue Op) const {
- return Op.getValueType().getSimpleVT();
- }
- TypePair ty(const VectorPair &Ops) const {
- return { Ops.first.getValueType().getSimpleVT(),
- Ops.second.getValueType().getSimpleVT() };
- }
- MVT tyScalar(MVT Ty) const {
- if (!Ty.isVector())
- return Ty;
- return MVT::getIntegerVT(Ty.getSizeInBits());
- }
- MVT tyVector(MVT Ty, MVT ElemTy) const {
- if (Ty.isVector() && Ty.getVectorElementType() == ElemTy)
- return Ty;
- unsigned TyWidth = Ty.getSizeInBits();
- unsigned ElemWidth = ElemTy.getSizeInBits();
- assert((TyWidth % ElemWidth) == 0);
- return MVT::getVectorVT(ElemTy, TyWidth/ElemWidth);
- }
-
- MVT typeJoin(const TypePair &Tys) const;
- TypePair typeSplit(MVT Ty) const;
- MVT typeExtElem(MVT VecTy, unsigned Factor) const;
- MVT typeTruncElem(MVT VecTy, unsigned Factor) const;
-
- SDValue opJoin(const VectorPair &Ops, const SDLoc &dl,
- SelectionDAG &DAG) const;
- VectorPair opSplit(SDValue Vec, const SDLoc &dl, SelectionDAG &DAG) const;
- SDValue opCastElem(SDValue Vec, MVT ElemTy, SelectionDAG &DAG) const;
-
- bool allowsHvxMemoryAccess(MVT VecTy, MachineMemOperand::Flags Flags,
- bool *Fast) const;
- bool allowsHvxMisalignedMemoryAccesses(MVT VecTy,
- MachineMemOperand::Flags Flags,
- bool *Fast) const;
-
- bool isHvxSingleTy(MVT Ty) const;
- bool isHvxPairTy(MVT Ty) const;
- bool isHvxBoolTy(MVT Ty) const;
- SDValue convertToByteIndex(SDValue ElemIdx, MVT ElemTy,
- SelectionDAG &DAG) const;
- SDValue getIndexInWord32(SDValue Idx, MVT ElemTy, SelectionDAG &DAG) const;
- SDValue getByteShuffle(const SDLoc &dl, SDValue Op0, SDValue Op1,
- ArrayRef<int> Mask, SelectionDAG &DAG) const;
-
- SDValue buildHvxVectorReg(ArrayRef<SDValue> Values, const SDLoc &dl,
- MVT VecTy, SelectionDAG &DAG) const;
- SDValue buildHvxVectorPred(ArrayRef<SDValue> Values, const SDLoc &dl,
- MVT VecTy, SelectionDAG &DAG) const;
- SDValue createHvxPrefixPred(SDValue PredV, const SDLoc &dl,
- unsigned BitBytes, bool ZeroFill,
- SelectionDAG &DAG) const;
- SDValue extractHvxElementReg(SDValue VecV, SDValue IdxV, const SDLoc &dl,
+ SDValue extractHvxElementReg(SDValue VecV, SDValue IdxV, const SDLoc &dl,
+ MVT ResTy, SelectionDAG &DAG) const;
+ SDValue extractHvxElementPred(SDValue VecV, SDValue IdxV, const SDLoc &dl,
+ MVT ResTy, SelectionDAG &DAG) const;
+ SDValue insertHvxElementReg(SDValue VecV, SDValue IdxV, SDValue ValV,
+ const SDLoc &dl, SelectionDAG &DAG) const;
+ SDValue insertHvxElementPred(SDValue VecV, SDValue IdxV, SDValue ValV,
+ const SDLoc &dl, SelectionDAG &DAG) const;
+ SDValue extractHvxSubvectorReg(SDValue VecV, SDValue IdxV, const SDLoc &dl,
MVT ResTy, SelectionDAG &DAG) const;
- SDValue extractHvxElementPred(SDValue VecV, SDValue IdxV, const SDLoc &dl,
+ SDValue extractHvxSubvectorPred(SDValue VecV, SDValue IdxV, const SDLoc &dl,
MVT ResTy, SelectionDAG &DAG) const;
- SDValue insertHvxElementReg(SDValue VecV, SDValue IdxV, SDValue ValV,
+ SDValue insertHvxSubvectorReg(SDValue VecV, SDValue SubV, SDValue IdxV,
const SDLoc &dl, SelectionDAG &DAG) const;
- SDValue insertHvxElementPred(SDValue VecV, SDValue IdxV, SDValue ValV,
+ SDValue insertHvxSubvectorPred(SDValue VecV, SDValue SubV, SDValue IdxV,
const SDLoc &dl, SelectionDAG &DAG) const;
- SDValue extractHvxSubvectorReg(SDValue VecV, SDValue IdxV, const SDLoc &dl,
- MVT ResTy, SelectionDAG &DAG) const;
- SDValue extractHvxSubvectorPred(SDValue VecV, SDValue IdxV, const SDLoc &dl,
- MVT ResTy, SelectionDAG &DAG) const;
- SDValue insertHvxSubvectorReg(SDValue VecV, SDValue SubV, SDValue IdxV,
- const SDLoc &dl, SelectionDAG &DAG) const;
- SDValue insertHvxSubvectorPred(SDValue VecV, SDValue SubV, SDValue IdxV,
- const SDLoc &dl, SelectionDAG &DAG) const;
- SDValue extendHvxVectorPred(SDValue VecV, const SDLoc &dl, MVT ResTy,
- bool ZeroExt, SelectionDAG &DAG) const;
- SDValue compressHvxPred(SDValue VecQ, const SDLoc &dl, MVT ResTy,
- SelectionDAG &DAG) const;
+ SDValue extendHvxVectorPred(SDValue VecV, const SDLoc &dl, MVT ResTy,
+ bool ZeroExt, SelectionDAG &DAG) const;
+ SDValue compressHvxPred(SDValue VecQ, const SDLoc &dl, MVT ResTy,
+ SelectionDAG &DAG) const;
- SDValue LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerHvxConcatVectors(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerHvxExtractSubvector(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerHvxInsertSubvector(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerHvxBitcast(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerHvxAnyExt(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerHvxSignExt(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerHvxZeroExt(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerHvxCttz(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerHvxMul(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerHvxMulh(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerHvxSetCC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerHvxExtend(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerHvxShift(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerHvxStore(SDValue Op, SelectionDAG &DAG) const;
- SDValue HvxVecPredBitcastComputation(SDValue Op, SelectionDAG &DAG) const;
-
- SDValue SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const;
- SDValue SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const;
-
- std::pair<const TargetRegisterClass*, uint8_t>
- findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT)
- const override;
-
- bool isHvxOperation(SDValue Op) const;
- bool isHvxOperation(SDNode *N) const;
- SDValue LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const;
- void LowerHvxOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG) const;
- void ReplaceHvxNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG) const;
- SDValue PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- };
+ SDValue LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxConcatVectors(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxExtractSubvector(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxInsertSubvector(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxBitcast(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxAnyExt(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxSignExt(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxZeroExt(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxCttz(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxMul(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxMulh(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxSetCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxExtend(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxSelect(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxShift(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxMaskedOp(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const;
+ SDValue SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const;
+ SDValue WidenHvxLoad(SDValue Op, SelectionDAG &DAG) const;
+ SDValue WidenHvxStore(SDValue Op, SelectionDAG &DAG) const;
+ SDValue WidenHvxSetCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue WidenHvxExtend(SDValue Op, SelectionDAG &DAG) const;
+ SDValue WidenHvxTruncate(SDValue Op, SelectionDAG &DAG) const;
+
+ std::pair<const TargetRegisterClass*, uint8_t>
+ findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT)
+ const override;
+
+ bool shouldWidenToHvx(MVT Ty, SelectionDAG &DAG) const;
+ bool isHvxOperation(SDNode *N, SelectionDAG &DAG) const;
+ SDValue LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const;
+ void LowerHvxOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const;
+ void ReplaceHvxNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const;
+ SDValue PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+};
} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index 7cda915fffe9..29b75814da6e 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -14,6 +14,10 @@
using namespace llvm;
+static cl::opt<unsigned> HvxWidenThreshold("hexagon-hvx-widen",
+ cl::Hidden, cl::init(16),
+ cl::desc("Lower threshold (in bytes) for widening to HVX vectors"));
+
static const MVT LegalV64[] = { MVT::v64i8, MVT::v32i16, MVT::v16i32 };
static const MVT LegalW64[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 };
static const MVT LegalV128[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 };
@@ -87,17 +91,28 @@ HexagonTargetLowering::initializeHVXLowering() {
setOperationAction(ISD::XOR, T, Legal);
setOperationAction(ISD::ADD, T, Legal);
setOperationAction(ISD::SUB, T, Legal);
+ setOperationAction(ISD::MUL, T, Legal);
setOperationAction(ISD::CTPOP, T, Legal);
setOperationAction(ISD::CTLZ, T, Legal);
+ setOperationAction(ISD::SELECT, T, Legal);
+ setOperationAction(ISD::SPLAT_VECTOR, T, Legal);
if (T != ByteV) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Legal);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Legal);
setOperationAction(ISD::BSWAP, T, Legal);
}
+ setOperationAction(ISD::SMIN, T, Legal);
+ setOperationAction(ISD::SMAX, T, Legal);
+ if (T.getScalarType() != MVT::i32) {
+ setOperationAction(ISD::UMIN, T, Legal);
+ setOperationAction(ISD::UMAX, T, Legal);
+ }
+
setOperationAction(ISD::CTTZ, T, Custom);
setOperationAction(ISD::LOAD, T, Custom);
- setOperationAction(ISD::MUL, T, Custom);
+ setOperationAction(ISD::MLOAD, T, Custom);
+ setOperationAction(ISD::MSTORE, T, Custom);
setOperationAction(ISD::MULHS, T, Custom);
setOperationAction(ISD::MULHU, T, Custom);
setOperationAction(ISD::BUILD_VECTOR, T, Custom);
@@ -147,9 +162,12 @@ HexagonTargetLowering::initializeHVXLowering() {
setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, T, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Legal);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Legal);
+ setOperationAction(ISD::SPLAT_VECTOR, T, Custom);
setOperationAction(ISD::LOAD, T, Custom);
setOperationAction(ISD::STORE, T, Custom);
+ setOperationAction(ISD::MLOAD, T, Custom);
+ setOperationAction(ISD::MSTORE, T, Custom);
setOperationAction(ISD::CTLZ, T, Custom);
setOperationAction(ISD::CTTZ, T, Custom);
setOperationAction(ISD::CTPOP, T, Custom);
@@ -172,6 +190,13 @@ HexagonTargetLowering::initializeHVXLowering() {
// Promote all shuffles to operate on vectors of bytes.
setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteW);
}
+
+ setOperationAction(ISD::SMIN, T, Custom);
+ setOperationAction(ISD::SMAX, T, Custom);
+ if (T.getScalarType() != MVT::i32) {
+ setOperationAction(ISD::UMIN, T, Custom);
+ setOperationAction(ISD::UMAX, T, Custom);
+ }
}
// Boolean vectors.
@@ -188,6 +213,9 @@ HexagonTargetLowering::initializeHVXLowering() {
setOperationAction(ISD::AND, BoolW, Custom);
setOperationAction(ISD::OR, BoolW, Custom);
setOperationAction(ISD::XOR, BoolW, Custom);
+ // Masked load/store takes a mask that may need splitting.
+ setOperationAction(ISD::MLOAD, BoolW, Custom);
+ setOperationAction(ISD::MSTORE, BoolW, Custom);
}
for (MVT T : LegalV) {
@@ -198,6 +226,7 @@ HexagonTargetLowering::initializeHVXLowering() {
setOperationAction(ISD::INSERT_VECTOR_ELT, BoolV, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, BoolV, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, BoolV, Custom);
+ setOperationAction(ISD::SELECT, BoolV, Custom);
setOperationAction(ISD::AND, BoolV, Legal);
setOperationAction(ISD::OR, BoolV, Legal);
setOperationAction(ISD::XOR, BoolV, Legal);
@@ -211,16 +240,82 @@ HexagonTargetLowering::initializeHVXLowering() {
setOperationAction(ISD::SIGN_EXTEND_INREG, T, Legal);
}
+ // Handle store widening for short vectors.
+ unsigned HwLen = Subtarget.getVectorLength();
+ for (MVT ElemTy : Subtarget.getHVXElementTypes()) {
+ if (ElemTy == MVT::i1)
+ continue;
+ int ElemWidth = ElemTy.getFixedSizeInBits();
+ int MaxElems = (8*HwLen) / ElemWidth;
+ for (int N = 2; N < MaxElems; N *= 2) {
+ MVT VecTy = MVT::getVectorVT(ElemTy, N);
+ auto Action = getPreferredVectorAction(VecTy);
+ if (Action == TargetLoweringBase::TypeWidenVector) {
+ setOperationAction(ISD::LOAD, VecTy, Custom);
+ setOperationAction(ISD::STORE, VecTy, Custom);
+ setOperationAction(ISD::SETCC, VecTy, Custom);
+ setOperationAction(ISD::TRUNCATE, VecTy, Custom);
+ setOperationAction(ISD::ANY_EXTEND, VecTy, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, VecTy, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, VecTy, Custom);
+
+ MVT BoolTy = MVT::getVectorVT(MVT::i1, N);
+ if (!isTypeLegal(BoolTy))
+ setOperationAction(ISD::SETCC, BoolTy, Custom);
+ }
+ }
+ }
+
+ setTargetDAGCombine(ISD::SPLAT_VECTOR);
setTargetDAGCombine(ISD::VSELECT);
}
+unsigned
+HexagonTargetLowering::getPreferredHvxVectorAction(MVT VecTy) const {
+ MVT ElemTy = VecTy.getVectorElementType();
+ unsigned VecLen = VecTy.getVectorNumElements();
+ unsigned HwLen = Subtarget.getVectorLength();
+
+ // Split vectors of i1 that exceed byte vector length.
+ if (ElemTy == MVT::i1 && VecLen > HwLen)
+ return TargetLoweringBase::TypeSplitVector;
+
+ ArrayRef<MVT> Tys = Subtarget.getHVXElementTypes();
+ // For shorter vectors of i1, widen them if any of the corresponding
+ // vectors of integers needs to be widened.
+ if (ElemTy == MVT::i1) {
+ for (MVT T : Tys) {
+ assert(T != MVT::i1);
+ auto A = getPreferredHvxVectorAction(MVT::getVectorVT(T, VecLen));
+ if (A != ~0u)
+ return A;
+ }
+ return ~0u;
+ }
+
+ // If the size of VecTy is at least half of the vector length,
+ // widen the vector. Note: the threshold was not selected in
+ // any scientific way.
+ if (llvm::is_contained(Tys, ElemTy)) {
+ unsigned VecWidth = VecTy.getSizeInBits();
+ bool HaveThreshold = HvxWidenThreshold.getNumOccurrences() > 0;
+ if (HaveThreshold && 8*HvxWidenThreshold <= VecWidth)
+ return TargetLoweringBase::TypeWidenVector;
+ unsigned HwWidth = 8*HwLen;
+ if (VecWidth >= HwWidth/2 && VecWidth < HwWidth)
+ return TargetLoweringBase::TypeWidenVector;
+ }
+
+ // Defer to default.
+ return ~0u;
+}
+
SDValue
HexagonTargetLowering::getInt(unsigned IntId, MVT ResTy, ArrayRef<SDValue> Ops,
const SDLoc &dl, SelectionDAG &DAG) const {
SmallVector<SDValue,4> IntOps;
IntOps.push_back(DAG.getConstant(IntId, dl, MVT::i32));
- for (const SDValue &Op : Ops)
- IntOps.push_back(Op);
+ append_range(IntOps, Ops);
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ResTy, IntOps);
}
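A minimal standalone restatement of the widening rule that getPreferredHvxVectorAction introduces in the hunk above, for element types HVX supports: widen when an explicitly set -hexagon-hvx-widen threshold (in bytes) is met, or when the vector is at least half of, but smaller than, the hardware vector width. The helper name and the driver in main are illustrative, not taken from the patch.

#include <cstdio>

// Mirrors getPreferredHvxVectorAction: an explicit threshold (in bytes) wins;
// otherwise widen vectors that are at least half of, but smaller than, the
// hardware vector width. HwLenBytes is 64 or 128 depending on the HVX mode.
static bool widensToHvx(unsigned VecBits, unsigned HwLenBytes,
                        bool HaveThreshold, unsigned ThresholdBytes) {
  if (HaveThreshold && 8 * ThresholdBytes <= VecBits)
    return true;
  unsigned HwBits = 8 * HwLenBytes;
  return VecBits >= HwBits / 2 && VecBits < HwBits;
}

int main() {
  // 128-byte HVX mode, no explicit -hexagon-hvx-widen: v64i8 (512 bits) is
  // widened, v16i8 (128 bits) is left to default legalization, and v128i8
  // (1024 bits) is already a full vector.
  std::printf("%d %d %d\n", widensToHvx(512, 128, false, 16),
              widensToHvx(128, 128, false, 16),
              widensToHvx(1024, 128, false, 16));
}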
@@ -432,7 +527,9 @@ HexagonTargetLowering::buildHvxVectorReg(ArrayRef<SDValue> Values,
auto *IdxN = dyn_cast<ConstantSDNode>(SplatV.getNode());
if (IdxN && IdxN->isNullValue())
return getZero(dl, VecTy, DAG);
- return DAG.getNode(HexagonISD::VSPLATW, dl, VecTy, SplatV);
+ MVT WordTy = MVT::getVectorVT(MVT::i32, HwLen/4);
+ SDValue S = DAG.getNode(ISD::SPLAT_VECTOR, dl, WordTy, SplatV);
+ return DAG.getBitcast(VecTy, S);
}
// Delay recognizing constant vectors until here, so that we can generate
@@ -571,6 +668,9 @@ HexagonTargetLowering::createHvxPrefixPred(SDValue PredV, const SDLoc &dl,
if (!ZeroFill)
return S;
// Fill the bytes beyond BlockLen with 0s.
+ // V6_pred_scalar2 cannot fill the entire predicate, so it only works
+ // when BlockLen < HwLen.
+ assert(BlockLen < HwLen && "vsetq(v1) prerequisite");
MVT BoolTy = MVT::getVectorVT(MVT::i1, HwLen);
SDValue Q = getInstr(Hexagon::V6_pred_scalar2, dl, BoolTy,
{DAG.getConstant(BlockLen, dl, MVT::i32)}, DAG);
@@ -1034,6 +1134,7 @@ HexagonTargetLowering::insertHvxSubvectorPred(SDValue VecV, SDValue SubV,
// ByteVec is the target vector VecV rotated in such a way that the
// subvector should be inserted at index 0. Generate a predicate mask
// and use vmux to do the insertion.
+ assert(BlockLen < HwLen && "vsetq(v1) prerequisite");
MVT BoolTy = MVT::getVectorVT(MVT::i1, HwLen);
SDValue Q = getInstr(Hexagon::V6_pred_scalar2, dl, BoolTy,
{DAG.getConstant(BlockLen, dl, MVT::i32)}, DAG);
@@ -1058,7 +1159,7 @@ HexagonTargetLowering::extendHvxVectorPred(SDValue VecV, const SDLoc &dl,
return DAG.getNode(HexagonISD::Q2V, dl, ResTy, VecV);
assert(ty(VecV).getVectorNumElements() == ResTy.getVectorNumElements());
- SDValue True = DAG.getNode(HexagonISD::VSPLAT, dl, ResTy,
+ SDValue True = DAG.getNode(ISD::SPLAT_VECTOR, dl, ResTy,
DAG.getConstant(1, dl, MVT::i32));
SDValue False = getZero(dl, ResTy, DAG);
return DAG.getSelect(dl, ResTy, VecV, True, False);
@@ -1180,12 +1281,19 @@ HexagonTargetLowering::LowerHvxConcatVectors(SDValue Op, SelectionDAG &DAG)
continue;
}
// A few less complicated cases.
- if (V.getOpcode() == ISD::Constant)
- Elems[i] = DAG.getSExtOrTrunc(V, dl, NTy);
- else if (V.isUndef())
- Elems[i] = DAG.getUNDEF(NTy);
- else
- llvm_unreachable("Unexpected vector element");
+ switch (V.getOpcode()) {
+ case ISD::Constant:
+ Elems[i] = DAG.getSExtOrTrunc(V, dl, NTy);
+ break;
+ case ISD::UNDEF:
+ Elems[i] = DAG.getUNDEF(NTy);
+ break;
+ case ISD::TRUNCATE:
+ Elems[i] = V.getOperand(0);
+ break;
+ default:
+ llvm_unreachable("Unexpected vector element");
+ }
}
}
return DAG.getBuildVector(VecTy, dl, Elems);
@@ -1346,19 +1454,14 @@ HexagonTargetLowering::LowerHvxCttz(SDValue Op, SelectionDAG &DAG) const {
// Calculate the vectors of 1 and bitwidth(x).
MVT ElemTy = ty(InpV).getVectorElementType();
unsigned ElemWidth = ElemTy.getSizeInBits();
- // Using uint64_t because a shift by 32 can happen.
- uint64_t Splat1 = 0, SplatW = 0;
- assert(isPowerOf2_32(ElemWidth) && ElemWidth <= 32);
- for (unsigned i = 0; i != 32/ElemWidth; ++i) {
- Splat1 = (Splat1 << ElemWidth) | 1;
- SplatW = (SplatW << ElemWidth) | ElemWidth;
- }
- SDValue Vec1 = DAG.getNode(HexagonISD::VSPLATW, dl, ResTy,
- DAG.getConstant(uint32_t(Splat1), dl, MVT::i32));
- SDValue VecW = DAG.getNode(HexagonISD::VSPLATW, dl, ResTy,
- DAG.getConstant(uint32_t(SplatW), dl, MVT::i32));
- SDValue VecN1 = DAG.getNode(HexagonISD::VSPLATW, dl, ResTy,
+
+ SDValue Vec1 = DAG.getNode(ISD::SPLAT_VECTOR, dl, ResTy,
+ DAG.getConstant(1, dl, MVT::i32));
+ SDValue VecW = DAG.getNode(ISD::SPLAT_VECTOR, dl, ResTy,
+ DAG.getConstant(ElemWidth, dl, MVT::i32));
+ SDValue VecN1 = DAG.getNode(ISD::SPLAT_VECTOR, dl, ResTy,
DAG.getConstant(-1, dl, MVT::i32));
+
// Do not use DAG.getNOT, because that would create BUILD_VECTOR with
// a BITCAST. Here we can skip the BITCAST (so we don't have to handle
// it separately in custom combine or selection).
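The splats of 1, -1, and the element width above feed the usual trailing-zero rewrite; one standard identity consistent with those constants is cttz(x) = W - ctlz(~x & (x - 1)), with ctlz(0) taken as W. A scalar sketch of that identity follows; the exact DAG sequence emitted by LowerHvxCttz lies outside this hunk, so treat the formula as an assumption.

#include <cassert>
#include <cstdint>

// ctlz for 32-bit values, with ctlz(0) defined as 32.
static unsigned ctlz32(uint32_t X) {
  unsigned N = 0;
  for (uint32_t Bit = 0x80000000u; Bit != 0 && (X & Bit) == 0; Bit >>= 1)
    ++N;
  return N;
}

// cttz(x) = 32 - ctlz(~x & (x - 1)); the constants 1, -1 (for the NOT), and
// the element width correspond to Vec1, VecN1, and VecW in the hunk above.
static unsigned cttzViaCtlz(uint32_t X) {
  return 32 - ctlz32(~X & (X - 1));
}

int main() {
  for (uint32_t X : {1u, 2u, 8u, 0x80000000u, 0xFFFFFFFFu}) {
    unsigned Ref = 0;
    while (((X >> Ref) & 1) == 0)
      ++Ref;
    assert(cttzViaCtlz(X) == Ref);
  }
}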
@@ -1370,60 +1473,6 @@ HexagonTargetLowering::LowerHvxCttz(SDValue Op, SelectionDAG &DAG) const {
}
SDValue
-HexagonTargetLowering::LowerHvxMul(SDValue Op, SelectionDAG &DAG) const {
- MVT ResTy = ty(Op);
- assert(ResTy.isVector() && isHvxSingleTy(ResTy));
- const SDLoc &dl(Op);
- SmallVector<int,256> ShuffMask;
-
- MVT ElemTy = ResTy.getVectorElementType();
- unsigned VecLen = ResTy.getVectorNumElements();
- SDValue Vs = Op.getOperand(0);
- SDValue Vt = Op.getOperand(1);
-
- switch (ElemTy.SimpleTy) {
- case MVT::i8: {
- // For i8 vectors Vs = (a0, a1, ...), Vt = (b0, b1, ...),
- // V6_vmpybv Vs, Vt produces a pair of i16 vectors Hi:Lo,
- // where Lo = (a0*b0, a2*b2, ...), Hi = (a1*b1, a3*b3, ...).
- MVT ExtTy = typeExtElem(ResTy, 2);
- unsigned MpyOpc = ElemTy == MVT::i8 ? Hexagon::V6_vmpybv
- : Hexagon::V6_vmpyhv;
- SDValue M = getInstr(MpyOpc, dl, ExtTy, {Vs, Vt}, DAG);
-
- // Discard high halves of the resulting values, collect the low halves.
- for (unsigned I = 0; I < VecLen; I += 2) {
- ShuffMask.push_back(I); // Pick even element.
- ShuffMask.push_back(I+VecLen); // Pick odd element.
- }
- VectorPair P = opSplit(opCastElem(M, ElemTy, DAG), dl, DAG);
- SDValue BS = getByteShuffle(dl, P.first, P.second, ShuffMask, DAG);
- return DAG.getBitcast(ResTy, BS);
- }
- case MVT::i16:
- // For i16 there is V6_vmpyih, which acts exactly like the MUL opcode.
- // (There is also V6_vmpyhv, which behaves in an analogous way to
- // V6_vmpybv.)
- return getInstr(Hexagon::V6_vmpyih, dl, ResTy, {Vs, Vt}, DAG);
- case MVT::i32: {
- // Use the following sequence for signed word multiply:
- // T0 = V6_vmpyiowh Vs, Vt
- // T1 = V6_vaslw T0, 16
- // T2 = V6_vmpyiewuh_acc T1, Vs, Vt
- SDValue S16 = DAG.getConstant(16, dl, MVT::i32);
- SDValue T0 = getInstr(Hexagon::V6_vmpyiowh, dl, ResTy, {Vs, Vt}, DAG);
- SDValue T1 = getInstr(Hexagon::V6_vaslw, dl, ResTy, {T0, S16}, DAG);
- SDValue T2 = getInstr(Hexagon::V6_vmpyiewuh_acc, dl, ResTy,
- {T1, Vs, Vt}, DAG);
- return T2;
- }
- default:
- break;
- }
- return SDValue();
-}
-
-SDValue
HexagonTargetLowering::LowerHvxMulh(SDValue Op, SelectionDAG &DAG) const {
MVT ResTy = ty(Op);
assert(ResTy.isVector());
@@ -1462,7 +1511,7 @@ HexagonTargetLowering::LowerHvxMulh(SDValue Op, SelectionDAG &DAG) const {
assert(ElemTy == MVT::i32);
SDValue S16 = DAG.getConstant(16, dl, MVT::i32);
- if (IsSigned) {
+ auto MulHS_V60 = [&](SDValue Vs, SDValue Vt) {
// mulhs(Vs,Vt) =
// = [(Hi(Vs)*2^16 + Lo(Vs)) *s (Hi(Vt)*2^16 + Lo(Vt))] >> 32
// = [Hi(Vs)*2^16 *s Hi(Vt)*2^16 + Hi(Vs) *su Lo(Vt)*2^16
@@ -1489,6 +1538,20 @@ HexagonTargetLowering::LowerHvxMulh(SDValue Op, SelectionDAG &DAG) const {
// Add:
SDValue T3 = DAG.getNode(ISD::ADD, dl, ResTy, {S2, T2});
return T3;
+ };
+
+ auto MulHS_V62 = [&](SDValue Vs, SDValue Vt) {
+ MVT PairTy = typeJoin({ResTy, ResTy});
+ SDValue T0 = getInstr(Hexagon::V6_vmpyewuh_64, dl, PairTy, {Vs, Vt}, DAG);
+ SDValue T1 = getInstr(Hexagon::V6_vmpyowh_64_acc, dl, PairTy,
+ {T0, Vs, Vt}, DAG);
+ return opSplit(T1, dl, DAG).second;
+ };
+
+ if (IsSigned) {
+ if (Subtarget.useHVXV62Ops())
+ return MulHS_V62(Vs, Vt);
+ return MulHS_V60(Vs, Vt);
}
// Unsigned mulhw. (Would expansion using signed mulhw be better?)
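The MulHS_V60 lambda above rebuilds the signed 32x32 high product from 16-bit halves, following the commented derivation, while MulHS_V62 defers to the V6_vmpyewuh_64/V6_vmpyowh_64_acc pair. A scalar reference for what either path must compute, using the same half-word decomposition (the helper name is illustrative):

#include <cassert>
#include <cstdint>

// Half-word decomposition from the comment above:
//   s*t = Hi(s)*Hi(t)*2^32 + (Hi(s)*Lo(t) + Hi(t)*Lo(s))*2^16 + Lo(s)*Lo(t)
// and mulhs keeps bits 32..63 of that 64-bit product.
static int32_t mulhsRef(int32_t S, int32_t T) {
  int32_t HiS = S >> 16, HiT = T >> 16;           // signed high halves
  uint32_t LoS = uint16_t(S), LoT = uint16_t(T);  // unsigned low halves

  int64_t Cross = int64_t(HiS) * LoT + int64_t(HiT) * LoS;
  uint64_t Low = uint64_t(LoS) * LoT + ((uint64_t(Cross) & 0xFFFF) << 16);
  int64_t High = int64_t(HiS) * HiT + (Cross >> 16) + int64_t(Low >> 32);
  return int32_t(High);
}

int main() {
  for (long long S : {-7LL, 123456789LL, -2000000000LL})
    for (long long T : {3LL, -99999LL, 1900000000LL})
      assert(mulhsRef(int32_t(S), int32_t(T)) == int32_t((S * T) >> 32));
}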
@@ -1585,6 +1648,26 @@ HexagonTargetLowering::LowerHvxExtend(SDValue Op, SelectionDAG &DAG) const {
}
SDValue
+HexagonTargetLowering::LowerHvxSelect(SDValue Op, SelectionDAG &DAG) const {
+ MVT ResTy = ty(Op);
+ if (ResTy.getVectorElementType() != MVT::i1)
+ return Op;
+
+ const SDLoc &dl(Op);
+ unsigned HwLen = Subtarget.getVectorLength();
+ unsigned VecLen = ResTy.getVectorNumElements();
+ assert(HwLen % VecLen == 0);
+ unsigned ElemSize = HwLen / VecLen;
+
+ MVT VecTy = MVT::getVectorVT(MVT::getIntegerVT(ElemSize * 8), VecLen);
+ SDValue S =
+ DAG.getNode(ISD::SELECT, dl, VecTy, Op.getOperand(0),
+ DAG.getNode(HexagonISD::Q2V, dl, VecTy, Op.getOperand(1)),
+ DAG.getNode(HexagonISD::Q2V, dl, VecTy, Op.getOperand(2)));
+ return DAG.getNode(HexagonISD::V2Q, dl, ResTy, S);
+}
+
+SDValue
HexagonTargetLowering::LowerHvxShift(SDValue Op, SelectionDAG &DAG) const {
if (SDValue S = getVectorShiftByInt(Op, DAG))
return S;
@@ -1593,7 +1676,7 @@ HexagonTargetLowering::LowerHvxShift(SDValue Op, SelectionDAG &DAG) const {
SDValue
HexagonTargetLowering::LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const {
- const SDLoc &dl(Op);
+ const SDLoc &dl(Op);
MVT ResTy = ty(Op);
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -1614,6 +1697,76 @@ HexagonTargetLowering::LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const {
}
SDValue
+HexagonTargetLowering::LowerHvxMaskedOp(SDValue Op, SelectionDAG &DAG) const {
+ const SDLoc &dl(Op);
+ unsigned HwLen = Subtarget.getVectorLength();
+ MachineFunction &MF = DAG.getMachineFunction();
+ auto *MaskN = cast<MaskedLoadStoreSDNode>(Op.getNode());
+ SDValue Mask = MaskN->getMask();
+ SDValue Chain = MaskN->getChain();
+ SDValue Base = MaskN->getBasePtr();
+ auto *MemOp = MF.getMachineMemOperand(MaskN->getMemOperand(), 0, HwLen);
+
+ unsigned Opc = Op->getOpcode();
+ assert(Opc == ISD::MLOAD || Opc == ISD::MSTORE);
+
+ if (Opc == ISD::MLOAD) {
+ MVT ValTy = ty(Op);
+ SDValue Load = DAG.getLoad(ValTy, dl, Chain, Base, MemOp);
+ SDValue Thru = cast<MaskedLoadSDNode>(MaskN)->getPassThru();
+ if (isUndef(Thru))
+ return Load;
+ SDValue VSel = DAG.getNode(ISD::VSELECT, dl, ValTy, Mask, Load, Thru);
+ return DAG.getMergeValues({VSel, Load.getValue(1)}, dl);
+ }
+
+ // MSTORE
+ // HVX only has aligned masked stores.
+
+ // TODO: Fold negations of the mask into the store.
+ unsigned StoreOpc = Hexagon::V6_vS32b_qpred_ai;
+ SDValue Value = cast<MaskedStoreSDNode>(MaskN)->getValue();
+ SDValue Offset0 = DAG.getTargetConstant(0, dl, ty(Base));
+
+ if (MaskN->getAlign().value() % HwLen == 0) {
+ SDValue Store = getInstr(StoreOpc, dl, MVT::Other,
+ {Mask, Base, Offset0, Value, Chain}, DAG);
+ DAG.setNodeMemRefs(cast<MachineSDNode>(Store.getNode()), {MemOp});
+ return Store;
+ }
+
+ // Unaligned case.
+ auto StoreAlign = [&](SDValue V, SDValue A) {
+ SDValue Z = getZero(dl, ty(V), DAG);
+ // TODO: use funnel shifts?
+ // vlalign(Vu,Vv,Rt) rotates the pair Vu:Vv left by Rt and takes the
+ // upper half.
+ SDValue LoV = getInstr(Hexagon::V6_vlalignb, dl, ty(V), {V, Z, A}, DAG);
+ SDValue HiV = getInstr(Hexagon::V6_vlalignb, dl, ty(V), {Z, V, A}, DAG);
+ return std::make_pair(LoV, HiV);
+ };
+
+ MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
+ MVT BoolTy = MVT::getVectorVT(MVT::i1, HwLen);
+ SDValue MaskV = DAG.getNode(HexagonISD::Q2V, dl, ByteTy, Mask);
+ VectorPair Tmp = StoreAlign(MaskV, Base);
+ VectorPair MaskU = {DAG.getNode(HexagonISD::V2Q, dl, BoolTy, Tmp.first),
+ DAG.getNode(HexagonISD::V2Q, dl, BoolTy, Tmp.second)};
+ VectorPair ValueU = StoreAlign(Value, Base);
+
+ SDValue Offset1 = DAG.getTargetConstant(HwLen, dl, MVT::i32);
+ SDValue StoreLo =
+ getInstr(StoreOpc, dl, MVT::Other,
+ {MaskU.first, Base, Offset0, ValueU.first, Chain}, DAG);
+ SDValue StoreHi =
+ getInstr(StoreOpc, dl, MVT::Other,
+ {MaskU.second, Base, Offset1, ValueU.second, Chain}, DAG);
+ DAG.setNodeMemRefs(cast<MachineSDNode>(StoreLo.getNode()), {MemOp});
+ DAG.setNodeMemRefs(cast<MachineSDNode>(StoreHi.getNode()), {MemOp});
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, {StoreLo, StoreHi});
+}
+
+SDValue
HexagonTargetLowering::SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const {
assert(!Op.isMachineOpcode());
SmallVector<SDValue,2> OpsL, OpsH;
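For the aligned MLOAD case handled near the top of LowerHvxMaskedOp above, the lowering is simply a whole-register load followed by a vselect against the pass-through. An element-wise sketch of that contract (illustrative types; it assumes the full vector is readable, which holds for an aligned HVX register load):

#include <array>
#include <cassert>
#include <cstddef>

// mload(p, mask, thru) == vselect(mask, load(p), thru), element by element.
template <typename T, std::size_t N>
std::array<T, N> maskedLoadRef(const std::array<T, N> &Mem,
                               const std::array<bool, N> &Mask,
                               const std::array<T, N> &Thru) {
  std::array<T, N> Loaded = Mem;   // unconditional whole-vector load
  std::array<T, N> Out{};
  for (std::size_t I = 0; I != N; ++I)
    Out[I] = Mask[I] ? Loaded[I] : Thru[I];
  return Out;
}

int main() {
  std::array<int, 4> Mem{1, 2, 3, 4}, Thru{9, 9, 9, 9};
  std::array<bool, 4> Mask{true, false, true, false};
  auto R = maskedLoadRef(Mem, Mask, Thru);
  assert(R[0] == 1 && R[1] == 9 && R[2] == 3 && R[3] == 9);
}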
@@ -1648,45 +1801,252 @@ HexagonTargetLowering::SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const {
SDValue
HexagonTargetLowering::SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const {
- LSBaseSDNode *BN = cast<LSBaseSDNode>(Op.getNode());
- assert(BN->isUnindexed());
- MVT MemTy = BN->getMemoryVT().getSimpleVT();
+ auto *MemN = cast<MemSDNode>(Op.getNode());
+
+ MVT MemTy = MemN->getMemoryVT().getSimpleVT();
if (!isHvxPairTy(MemTy))
return Op;
const SDLoc &dl(Op);
unsigned HwLen = Subtarget.getVectorLength();
MVT SingleTy = typeSplit(MemTy).first;
- SDValue Chain = BN->getChain();
- SDValue Base0 = BN->getBasePtr();
- SDValue Base1 = DAG.getMemBasePlusOffset(Base0, HwLen, dl);
+ SDValue Chain = MemN->getChain();
+ SDValue Base0 = MemN->getBasePtr();
+ SDValue Base1 = DAG.getMemBasePlusOffset(Base0, TypeSize::Fixed(HwLen), dl);
MachineMemOperand *MOp0 = nullptr, *MOp1 = nullptr;
- if (MachineMemOperand *MMO = BN->getMemOperand()) {
+ if (MachineMemOperand *MMO = MemN->getMemOperand()) {
MachineFunction &MF = DAG.getMachineFunction();
MOp0 = MF.getMachineMemOperand(MMO, 0, HwLen);
MOp1 = MF.getMachineMemOperand(MMO, HwLen, HwLen);
}
- unsigned MemOpc = BN->getOpcode();
- SDValue NewOp;
+ unsigned MemOpc = MemN->getOpcode();
if (MemOpc == ISD::LOAD) {
+ assert(cast<LoadSDNode>(Op)->isUnindexed());
SDValue Load0 = DAG.getLoad(SingleTy, dl, Chain, Base0, MOp0);
SDValue Load1 = DAG.getLoad(SingleTy, dl, Chain, Base1, MOp1);
- NewOp = DAG.getMergeValues(
- { DAG.getNode(ISD::CONCAT_VECTORS, dl, MemTy, Load0, Load1),
- DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- Load0.getValue(1), Load1.getValue(1)) }, dl);
- } else {
- assert(MemOpc == ISD::STORE);
+ return DAG.getMergeValues(
+ { DAG.getNode(ISD::CONCAT_VECTORS, dl, MemTy, Load0, Load1),
+ DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Load0.getValue(1), Load1.getValue(1)) }, dl);
+ }
+ if (MemOpc == ISD::STORE) {
+ assert(cast<StoreSDNode>(Op)->isUnindexed());
VectorPair Vals = opSplit(cast<StoreSDNode>(Op)->getValue(), dl, DAG);
SDValue Store0 = DAG.getStore(Chain, dl, Vals.first, Base0, MOp0);
SDValue Store1 = DAG.getStore(Chain, dl, Vals.second, Base1, MOp1);
- NewOp = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store0, Store1);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store0, Store1);
}
- return NewOp;
+ assert(MemOpc == ISD::MLOAD || MemOpc == ISD::MSTORE);
+
+ auto MaskN = cast<MaskedLoadStoreSDNode>(Op);
+ assert(MaskN->isUnindexed());
+ VectorPair Masks = opSplit(MaskN->getMask(), dl, DAG);
+ SDValue Offset = DAG.getUNDEF(MVT::i32);
+
+ if (MemOpc == ISD::MLOAD) {
+ VectorPair Thru =
+ opSplit(cast<MaskedLoadSDNode>(Op)->getPassThru(), dl, DAG);
+ SDValue MLoad0 =
+ DAG.getMaskedLoad(SingleTy, dl, Chain, Base0, Offset, Masks.first,
+ Thru.first, SingleTy, MOp0, ISD::UNINDEXED,
+ ISD::NON_EXTLOAD, false);
+ SDValue MLoad1 =
+ DAG.getMaskedLoad(SingleTy, dl, Chain, Base1, Offset, Masks.second,
+ Thru.second, SingleTy, MOp1, ISD::UNINDEXED,
+ ISD::NON_EXTLOAD, false);
+ return DAG.getMergeValues(
+ { DAG.getNode(ISD::CONCAT_VECTORS, dl, MemTy, MLoad0, MLoad1),
+ DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ MLoad0.getValue(1), MLoad1.getValue(1)) }, dl);
+ }
+ if (MemOpc == ISD::MSTORE) {
+ VectorPair Vals = opSplit(cast<MaskedStoreSDNode>(Op)->getValue(), dl, DAG);
+ SDValue MStore0 = DAG.getMaskedStore(Chain, dl, Vals.first, Base0, Offset,
+ Masks.first, SingleTy, MOp0,
+ ISD::UNINDEXED, false, false);
+ SDValue MStore1 = DAG.getMaskedStore(Chain, dl, Vals.second, Base1, Offset,
+ Masks.second, SingleTy, MOp1,
+ ISD::UNINDEXED, false, false);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MStore0, MStore1);
+ }
+
+ std::string Name = "Unexpected operation: " + Op->getOperationName(&DAG);
+ llvm_unreachable(Name.c_str());
+}
+
+SDValue
+HexagonTargetLowering::WidenHvxLoad(SDValue Op, SelectionDAG &DAG) const {
+ const SDLoc &dl(Op);
+ auto *LoadN = cast<LoadSDNode>(Op.getNode());
+ assert(LoadN->isUnindexed() && "Not widening indexed loads yet");
+ assert(LoadN->getMemoryVT().getVectorElementType() != MVT::i1 &&
+ "Not widening loads of i1 yet");
+
+ SDValue Chain = LoadN->getChain();
+ SDValue Base = LoadN->getBasePtr();
+ SDValue Offset = DAG.getUNDEF(MVT::i32);
+
+ MVT ResTy = ty(Op);
+ unsigned HwLen = Subtarget.getVectorLength();
+ unsigned ResLen = ResTy.getStoreSize();
+ assert(ResLen < HwLen && "vsetq(v1) prerequisite");
+
+ MVT BoolTy = MVT::getVectorVT(MVT::i1, HwLen);
+ SDValue Mask = getInstr(Hexagon::V6_pred_scalar2, dl, BoolTy,
+ {DAG.getConstant(ResLen, dl, MVT::i32)}, DAG);
+
+ MVT LoadTy = MVT::getVectorVT(MVT::i8, HwLen);
+ MachineFunction &MF = DAG.getMachineFunction();
+ auto *MemOp = MF.getMachineMemOperand(LoadN->getMemOperand(), 0, HwLen);
+
+ SDValue Load = DAG.getMaskedLoad(LoadTy, dl, Chain, Base, Offset, Mask,
+ DAG.getUNDEF(LoadTy), LoadTy, MemOp,
+ ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
+ SDValue Value = opCastElem(Load, ResTy.getVectorElementType(), DAG);
+ return DAG.getMergeValues({Value, Chain}, dl);
+}
+
+SDValue
+HexagonTargetLowering::WidenHvxStore(SDValue Op, SelectionDAG &DAG) const {
+ const SDLoc &dl(Op);
+ auto *StoreN = cast<StoreSDNode>(Op.getNode());
+ assert(StoreN->isUnindexed() && "Not widening indexed stores yet");
+ assert(StoreN->getMemoryVT().getVectorElementType() != MVT::i1 &&
+ "Not widening stores of i1 yet");
+
+ SDValue Chain = StoreN->getChain();
+ SDValue Base = StoreN->getBasePtr();
+ SDValue Offset = DAG.getUNDEF(MVT::i32);
+
+ SDValue Value = opCastElem(StoreN->getValue(), MVT::i8, DAG);
+ MVT ValueTy = ty(Value);
+ unsigned ValueLen = ValueTy.getVectorNumElements();
+ unsigned HwLen = Subtarget.getVectorLength();
+ assert(isPowerOf2_32(ValueLen));
+
+ for (unsigned Len = ValueLen; Len < HwLen; ) {
+ Value = opJoin({DAG.getUNDEF(ty(Value)), Value}, dl, DAG);
+ Len = ty(Value).getVectorNumElements(); // This is Len *= 2
+ }
+ assert(ty(Value).getVectorNumElements() == HwLen); // Paranoia
+
+ assert(ValueLen < HwLen && "vsetq(v1) prerequisite");
+ MVT BoolTy = MVT::getVectorVT(MVT::i1, HwLen);
+ SDValue Mask = getInstr(Hexagon::V6_pred_scalar2, dl, BoolTy,
+ {DAG.getConstant(ValueLen, dl, MVT::i32)}, DAG);
+ MachineFunction &MF = DAG.getMachineFunction();
+ auto *MemOp = MF.getMachineMemOperand(StoreN->getMemOperand(), 0, HwLen);
+ return DAG.getMaskedStore(Chain, dl, Value, Base, Offset, Mask, ty(Value),
+ MemOp, ISD::UNINDEXED, false, false);
+}
+
+SDValue
+HexagonTargetLowering::WidenHvxSetCC(SDValue Op, SelectionDAG &DAG) const {
+ const SDLoc &dl(Op);
+ SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
+ MVT ElemTy = ty(Op0).getVectorElementType();
+ unsigned HwLen = Subtarget.getVectorLength();
+
+ unsigned WideOpLen = (8 * HwLen) / ElemTy.getSizeInBits();
+ assert(WideOpLen * ElemTy.getSizeInBits() == 8 * HwLen);
+ MVT WideOpTy = MVT::getVectorVT(ElemTy, WideOpLen);
+
+ SDValue WideOp0 = appendUndef(Op0, WideOpTy, DAG);
+ SDValue WideOp1 = appendUndef(Op1, WideOpTy, DAG);
+ EVT ResTy =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), WideOpTy);
+ SDValue SetCC = DAG.getNode(ISD::SETCC, dl, ResTy,
+ {WideOp0, WideOp1, Op.getOperand(2)});
+
+ EVT RetTy = getTypeToTransformTo(*DAG.getContext(), ty(Op));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RetTy,
+ {SetCC, getZero(dl, MVT::i32, DAG)});
+}
+
+SDValue
+HexagonTargetLowering::WidenHvxExtend(SDValue Op, SelectionDAG &DAG) const {
+ const SDLoc &dl(Op);
+ unsigned HwWidth = 8*Subtarget.getVectorLength();
+
+ SDValue Op0 = Op.getOperand(0);
+ MVT ResTy = ty(Op);
+ MVT OpTy = ty(Op0);
+ if (!Subtarget.isHVXElementType(OpTy) || !Subtarget.isHVXElementType(ResTy))
+ return SDValue();
+
+ // .-res, op-> ScalarVec Illegal HVX
+ // Scalar ok - -
+ // Illegal widen(insert) widen -
+ // HVX - widen ok
+
+ auto getFactor = [HwWidth](MVT Ty) {
+ unsigned Width = Ty.getSizeInBits();
+ return HwWidth > Width ? HwWidth / Width : 1;
+ };
+
+ auto getWideTy = [getFactor](MVT Ty) {
+ unsigned WideLen = Ty.getVectorNumElements() * getFactor(Ty);
+ return MVT::getVectorVT(Ty.getVectorElementType(), WideLen);
+ };
+
+ unsigned Opcode = Op.getOpcode() == ISD::SIGN_EXTEND ? HexagonISD::VUNPACK
+ : HexagonISD::VUNPACKU;
+ SDValue WideOp = appendUndef(Op0, getWideTy(OpTy), DAG);
+ SDValue WideRes = DAG.getNode(Opcode, dl, getWideTy(ResTy), WideOp);
+ return WideRes;
+}
+
+SDValue
+HexagonTargetLowering::WidenHvxTruncate(SDValue Op, SelectionDAG &DAG) const {
+ const SDLoc &dl(Op);
+ unsigned HwWidth = 8*Subtarget.getVectorLength();
+
+ SDValue Op0 = Op.getOperand(0);
+ MVT ResTy = ty(Op);
+ MVT OpTy = ty(Op0);
+ if (!Subtarget.isHVXElementType(OpTy) || !Subtarget.isHVXElementType(ResTy))
+ return SDValue();
+
+ // .-res, op-> ScalarVec Illegal HVX
+ // Scalar ok extract(widen) -
+ // Illegal - widen widen
+ // HVX - - ok
+
+ auto getFactor = [HwWidth](MVT Ty) {
+ unsigned Width = Ty.getSizeInBits();
+ assert(HwWidth % Width == 0);
+ return HwWidth / Width;
+ };
+
+ auto getWideTy = [getFactor](MVT Ty) {
+ unsigned WideLen = Ty.getVectorNumElements() * getFactor(Ty);
+ return MVT::getVectorVT(Ty.getVectorElementType(), WideLen);
+ };
+
+ if (Subtarget.isHVXVectorType(OpTy))
+ return DAG.getNode(HexagonISD::VPACKL, dl, getWideTy(ResTy), Op0);
+
+ assert(!isTypeLegal(OpTy) && "HVX-widening a truncate of scalar?");
+
+ SDValue WideOp = appendUndef(Op0, getWideTy(OpTy), DAG);
+ SDValue WideRes = DAG.getNode(HexagonISD::VPACKL, dl, getWideTy(ResTy),
+ WideOp);
+ // If the original result wasn't legal and was supposed to be widened,
+ // we're done.
+ if (shouldWidenToHvx(ResTy, DAG))
+ return WideRes;
+
+ // The original result type wasn't meant to be widened to HVX, so
+ // leave it as it is. Standard legalization should be able to deal
+  // with it (since now it's a result of a target-independent ISD
+ // node).
+ assert(ResTy.isVector());
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResTy,
+ {WideRes, getZero(dl, MVT::i32, DAG)});
}
SDValue
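WidenHvxExtend and WidenHvxTruncate above both pad a short vector with undef lanes until its total width matches the HVX register, using the getFactor/getWideTy lambdas. A small sketch of that computation with an illustrative stand-in for MVT:

#include <cstdio>

// Illustrative stand-in for MVT: element width in bits and lane count.
struct SimpleVT {
  unsigned ElemBits;
  unsigned NumElems;
};

// getWideTy from the hunk above: multiply the lane count by
// HwWidth / Ty.getSizeInBits() (or by 1 if the type is already wide enough).
static SimpleVT getWideTy(SimpleVT Ty, unsigned HwWidth) {
  unsigned Width = Ty.ElemBits * Ty.NumElems;
  unsigned Factor = HwWidth > Width ? HwWidth / Width : 1;
  return {Ty.ElemBits, Ty.NumElems * Factor};
}

int main() {
  SimpleVT V8i16 = {16, 8};                               // 128-bit input vector
  SimpleVT Wide = getWideTy(V8i16, /*HwWidth=*/8 * 128);  // 128-byte HVX mode
  std::printf("v%ui%u\n", Wide.NumElems, Wide.ElemBits);  // prints v64i16
}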
@@ -1703,6 +2063,8 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
break;
case ISD::LOAD:
case ISD::STORE:
+ case ISD::MLOAD:
+ case ISD::MSTORE:
return SplitHvxMemOp(Op, DAG);
case ISD::CTPOP:
case ISD::CTLZ:
@@ -1716,11 +2078,16 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRA:
case ISD::SHL:
case ISD::SRL:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX:
case ISD::SETCC:
case ISD::VSELECT:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::SIGN_EXTEND_INREG:
+ case ISD::SPLAT_VECTOR:
return SplitHvxPairOp(Op, DAG);
}
}
@@ -1739,16 +2106,18 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SIGN_EXTEND: return LowerHvxSignExt(Op, DAG);
case ISD::ZERO_EXTEND: return LowerHvxZeroExt(Op, DAG);
case ISD::CTTZ: return LowerHvxCttz(Op, DAG);
+ case ISD::SELECT: return LowerHvxSelect(Op, DAG);
case ISD::SRA:
case ISD::SHL:
case ISD::SRL: return LowerHvxShift(Op, DAG);
- case ISD::MUL: return LowerHvxMul(Op, DAG);
case ISD::MULHS:
case ISD::MULHU: return LowerHvxMulh(Op, DAG);
case ISD::ANY_EXTEND_VECTOR_INREG: return LowerHvxExtend(Op, DAG);
case ISD::SETCC:
case ISD::INTRINSIC_VOID: return Op;
case ISD::INTRINSIC_WO_CHAIN: return LowerHvxIntrinsic(Op, DAG);
+ case ISD::MLOAD:
+ case ISD::MSTORE: return LowerHvxMaskedOp(Op, DAG);
// Unaligned loads will be handled by the default lowering.
case ISD::LOAD: return SDValue();
}
@@ -1761,13 +2130,91 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
void
HexagonTargetLowering::LowerHvxOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
+ unsigned Opc = N->getOpcode();
+ SDValue Op(N, 0);
+
+ switch (Opc) {
+ case ISD::ANY_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ if (shouldWidenToHvx(ty(Op.getOperand(0)), DAG)) {
+ if (SDValue T = WidenHvxExtend(Op, DAG))
+ Results.push_back(T);
+ }
+ break;
+ case ISD::SETCC:
+ if (shouldWidenToHvx(ty(Op.getOperand(0)), DAG)) {
+ if (SDValue T = WidenHvxSetCC(Op, DAG))
+ Results.push_back(T);
+ }
+ break;
+ case ISD::TRUNCATE:
+ if (shouldWidenToHvx(ty(Op.getOperand(0)), DAG)) {
+ if (SDValue T = WidenHvxTruncate(Op, DAG))
+ Results.push_back(T);
+ }
+ break;
+ case ISD::STORE: {
+ if (shouldWidenToHvx(ty(cast<StoreSDNode>(N)->getValue()), DAG)) {
+ SDValue Store = WidenHvxStore(Op, DAG);
+ Results.push_back(Store);
+ }
+ break;
+ }
+ case ISD::MLOAD:
+ if (isHvxPairTy(ty(Op))) {
+ SDValue S = SplitHvxMemOp(Op, DAG);
+ assert(S->getOpcode() == ISD::MERGE_VALUES);
+ Results.push_back(S.getOperand(0));
+ Results.push_back(S.getOperand(1));
+ }
+ break;
+ case ISD::MSTORE:
+ if (isHvxPairTy(ty(Op->getOperand(1)))) { // Stored value
+ SDValue S = SplitHvxMemOp(Op, DAG);
+ Results.push_back(S);
+ }
+ break;
+ default:
+ break;
+ }
}
void
HexagonTargetLowering::ReplaceHvxNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
unsigned Opc = N->getOpcode();
+ SDValue Op(N, 0);
switch (Opc) {
+ case ISD::ANY_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ if (shouldWidenToHvx(ty(Op), DAG)) {
+ if (SDValue T = WidenHvxExtend(Op, DAG))
+ Results.push_back(T);
+ }
+ break;
+ case ISD::SETCC:
+ if (shouldWidenToHvx(ty(Op), DAG)) {
+ if (SDValue T = WidenHvxSetCC(Op, DAG))
+ Results.push_back(T);
+ }
+ break;
+ case ISD::TRUNCATE:
+ if (shouldWidenToHvx(ty(Op), DAG)) {
+ if (SDValue T = WidenHvxTruncate(Op, DAG))
+ Results.push_back(T);
+ }
+ break;
+ case ISD::LOAD: {
+ if (shouldWidenToHvx(ty(Op), DAG)) {
+ SDValue Load = WidenHvxLoad(Op, DAG);
+ assert(Load->getOpcode() == ISD::MERGE_VALUES);
+ Results.push_back(Load.getOperand(0));
+ Results.push_back(Load.getOperand(1));
+ }
+ break;
+ }
case ISD::BITCAST:
if (isHvxBoolTy(ty(N->getOperand(0)))) {
SDValue Op(N, 0);
@@ -1784,44 +2231,95 @@ SDValue
HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
const {
const SDLoc &dl(N);
+ SelectionDAG &DAG = DCI.DAG;
SDValue Op(N, 0);
-
unsigned Opc = Op.getOpcode();
- if (Opc == ISD::VSELECT) {
- // (vselect (xor x, qtrue), v0, v1) -> (vselect x, v1, v0)
- SDValue Cond = Op.getOperand(0);
- if (Cond->getOpcode() == ISD::XOR) {
- SDValue C0 = Cond.getOperand(0), C1 = Cond.getOperand(1);
- if (C1->getOpcode() == HexagonISD::QTRUE) {
- SDValue VSel = DCI.DAG.getNode(ISD::VSELECT, dl, ty(Op), C0,
- Op.getOperand(2), Op.getOperand(1));
- return VSel;
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SmallVector<SDValue, 4> Ops(N->ops().begin(), N->ops().end());
+
+ switch (Opc) {
+ case ISD::VSELECT: {
+ // (vselect (xor x, qtrue), v0, v1) -> (vselect x, v1, v0)
+ SDValue Cond = Ops[0];
+ if (Cond->getOpcode() == ISD::XOR) {
+ SDValue C0 = Cond.getOperand(0), C1 = Cond.getOperand(1);
+ if (C1->getOpcode() == HexagonISD::QTRUE)
+ return DAG.getNode(ISD::VSELECT, dl, ty(Op), C0, Ops[2], Ops[1]);
+ }
+ break;
+ }
+ case HexagonISD::V2Q:
+ if (Ops[0].getOpcode() == ISD::SPLAT_VECTOR) {
+ if (const auto *C = dyn_cast<ConstantSDNode>(Ops[0].getOperand(0)))
+ return C->isNullValue() ? DAG.getNode(HexagonISD::QFALSE, dl, ty(Op))
+ : DAG.getNode(HexagonISD::QTRUE, dl, ty(Op));
+ }
+ break;
+ case HexagonISD::Q2V:
+ if (Ops[0].getOpcode() == HexagonISD::QTRUE)
+ return DAG.getNode(ISD::SPLAT_VECTOR, dl, ty(Op),
+ DAG.getConstant(-1, dl, MVT::i32));
+ if (Ops[0].getOpcode() == HexagonISD::QFALSE)
+ return getZero(dl, ty(Op), DAG);
+ break;
+ case HexagonISD::VINSERTW0:
+ if (isUndef(Ops[1]))
+      return Ops[0];
+ break;
+ case HexagonISD::VROR: {
+ if (Ops[0].getOpcode() == HexagonISD::VROR) {
+ SDValue Vec = Ops[0].getOperand(0);
+ SDValue Rot0 = Ops[1], Rot1 = Ops[0].getOperand(1);
+ SDValue Rot = DAG.getNode(ISD::ADD, dl, ty(Rot0), {Rot0, Rot1});
+ return DAG.getNode(HexagonISD::VROR, dl, ty(Op), {Vec, Rot});
}
+ break;
}
}
+
return SDValue();
}
bool
-HexagonTargetLowering::isHvxOperation(SDValue Op) const {
- // If the type of the result, or any operand type are HVX vector types,
- // this is an HVX operation.
- return Subtarget.isHVXVectorType(ty(Op), true) ||
- llvm::any_of(Op.getNode()->ops(),
- [this] (SDValue V) {
- return Subtarget.isHVXVectorType(ty(V), true);
- });
+HexagonTargetLowering::shouldWidenToHvx(MVT Ty, SelectionDAG &DAG) const {
+ auto Action = getPreferredHvxVectorAction(Ty);
+ if (Action == TargetLoweringBase::TypeWidenVector) {
+ EVT WideTy = getTypeToTransformTo(*DAG.getContext(), Ty);
+ assert(WideTy.isSimple());
+ return Subtarget.isHVXVectorType(WideTy.getSimpleVT(), true);
+ }
+ return false;
}
bool
-HexagonTargetLowering::isHvxOperation(SDNode *N) const {
+HexagonTargetLowering::isHvxOperation(SDNode *N, SelectionDAG &DAG) const {
+ if (!Subtarget.useHVXOps())
+ return false;
// If the type of any result, or any operand type are HVX vector types,
// this is an HVX operation.
- auto IsHvxTy = [this] (EVT Ty) {
+ auto IsHvxTy = [this](EVT Ty) {
return Ty.isSimple() && Subtarget.isHVXVectorType(Ty.getSimpleVT(), true);
};
- auto IsHvxOp = [this] (SDValue Op) {
- return Subtarget.isHVXVectorType(ty(Op), true);
+ auto IsHvxOp = [this](SDValue Op) {
+ return Op.getValueType().isSimple() &&
+ Subtarget.isHVXVectorType(ty(Op), true);
+ };
+ if (llvm::any_of(N->values(), IsHvxTy) || llvm::any_of(N->ops(), IsHvxOp))
+ return true;
+
+ // Check if this could be an HVX operation after type widening.
+ auto IsWidenedToHvx = [this, &DAG](SDValue Op) {
+ if (!Op.getValueType().isSimple())
+ return false;
+ MVT ValTy = ty(Op);
+ return ValTy.isVector() && shouldWidenToHvx(ValTy, DAG);
};
- return llvm::any_of(N->values(), IsHvxTy) || llvm::any_of(N->ops(), IsHvxOp);
+
+ for (int i = 0, e = N->getNumValues(); i != e; ++i) {
+ if (IsWidenedToHvx(SDValue(N, i)))
+ return true;
+ }
+ return llvm::any_of(N->ops(), IsWidenedToHvx);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index d1cd23c3be3e..26fc093d15a7 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -1639,8 +1639,9 @@ bool HexagonInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
return false;
}
-bool HexagonInstrInfo::DefinesPredicate(MachineInstr &MI,
- std::vector<MachineOperand> &Pred) const {
+bool HexagonInstrInfo::ClobbersPredicate(MachineInstr &MI,
+ std::vector<MachineOperand> &Pred,
+ bool SkipDead) const {
const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo();
for (unsigned oper = 0; oper < MI.getNumOperands(); ++oper) {
@@ -2721,6 +2722,8 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
case Hexagon::PS_vloadrw_nt_ai:
case Hexagon::V6_vL32b_ai:
case Hexagon::V6_vS32b_ai:
+ case Hexagon::V6_vS32b_qpred_ai:
+ case Hexagon::V6_vS32b_nqpred_ai:
case Hexagon::V6_vL32b_nt_ai:
case Hexagon::V6_vS32b_nt_ai:
case Hexagon::V6_vL32Ub_ai:
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
index 847b9a672891..11717996935d 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -238,8 +238,8 @@ public:
/// If the specified instruction defines any predicate
/// or condition code register(s) used for predication, returns true as well
/// as the definition predicate(s) by reference.
- bool DefinesPredicate(MachineInstr &MI,
- std::vector<MachineOperand> &Pred) const override;
+ bool ClobbersPredicate(MachineInstr &MI, std::vector<MachineOperand> &Pred,
+ bool SkipDead) const override;
/// Return true if the specified instruction can be predicated.
/// By default, this returns true for every instruction with a
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td
index 1245ee7974b5..796979e59061 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td
@@ -1,4 +1,4 @@
-//=- HexagonIntrinsicsV60.td - Target Description for Hexagon -*- tablegen *-=//
+//===- HexagonIntrinsicsV60.td - V60 instruction intrinsics -*- tablegen *-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index 2c1e0cadd9ee..76cc8f402c5a 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+#include "HexagonLoopIdiomRecognition.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SetVector.h"
@@ -16,6 +17,7 @@
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemoryLocation.h"
@@ -40,6 +42,7 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsHexagon.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
@@ -108,136 +111,151 @@ static const char *HexagonVolatileMemcpyName
namespace llvm {
- void initializeHexagonLoopIdiomRecognizePass(PassRegistry&);
- Pass *createHexagonLoopIdiomPass();
+void initializeHexagonLoopIdiomRecognizeLegacyPassPass(PassRegistry &);
+Pass *createHexagonLoopIdiomPass();
} // end namespace llvm
namespace {
- class HexagonLoopIdiomRecognize : public LoopPass {
- public:
- static char ID;
-
- explicit HexagonLoopIdiomRecognize() : LoopPass(ID) {
- initializeHexagonLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
- }
+class HexagonLoopIdiomRecognize {
+public:
+ explicit HexagonLoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
+ LoopInfo *LF, const TargetLibraryInfo *TLI,
+ ScalarEvolution *SE)
+ : AA(AA), DT(DT), LF(LF), TLI(TLI), SE(SE) {}
+
+ bool run(Loop *L);
+
+private:
+ int getSCEVStride(const SCEVAddRecExpr *StoreEv);
+ bool isLegalStore(Loop *CurLoop, StoreInst *SI);
+ void collectStores(Loop *CurLoop, BasicBlock *BB,
+ SmallVectorImpl<StoreInst *> &Stores);
+ bool processCopyingStore(Loop *CurLoop, StoreInst *SI, const SCEV *BECount);
+ bool coverLoop(Loop *L, SmallVectorImpl<Instruction *> &Insts) const;
+ bool runOnLoopBlock(Loop *CurLoop, BasicBlock *BB, const SCEV *BECount,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks);
+ bool runOnCountableLoop(Loop *L);
+
+ AliasAnalysis *AA;
+ const DataLayout *DL;
+ DominatorTree *DT;
+ LoopInfo *LF;
+ const TargetLibraryInfo *TLI;
+ ScalarEvolution *SE;
+ bool HasMemcpy, HasMemmove;
+};
+
+class HexagonLoopIdiomRecognizeLegacyPass : public LoopPass {
+public:
+ static char ID;
+
+ explicit HexagonLoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
+ initializeHexagonLoopIdiomRecognizeLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
- StringRef getPassName() const override {
- return "Recognize Hexagon-specific loop idioms";
- }
+ StringRef getPassName() const override {
+ return "Recognize Hexagon-specific loop idioms";
+ }
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequiredID(LoopSimplifyID);
- AU.addRequiredID(LCSSAID);
- AU.addRequired<AAResultsWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addPreserved<TargetLibraryInfoWrapperPass>();
- }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<TargetLibraryInfoWrapperPass>();
+ }
- bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
- private:
- int getSCEVStride(const SCEVAddRecExpr *StoreEv);
- bool isLegalStore(Loop *CurLoop, StoreInst *SI);
- void collectStores(Loop *CurLoop, BasicBlock *BB,
- SmallVectorImpl<StoreInst*> &Stores);
- bool processCopyingStore(Loop *CurLoop, StoreInst *SI, const SCEV *BECount);
- bool coverLoop(Loop *L, SmallVectorImpl<Instruction*> &Insts) const;
- bool runOnLoopBlock(Loop *CurLoop, BasicBlock *BB, const SCEV *BECount,
- SmallVectorImpl<BasicBlock*> &ExitBlocks);
- bool runOnCountableLoop(Loop *L);
-
- AliasAnalysis *AA;
- const DataLayout *DL;
- DominatorTree *DT;
- LoopInfo *LF;
- const TargetLibraryInfo *TLI;
- ScalarEvolution *SE;
- bool HasMemcpy, HasMemmove;
+struct Simplifier {
+ struct Rule {
+ using FuncType = std::function<Value *(Instruction *, LLVMContext &)>;
+ Rule(StringRef N, FuncType F) : Name(N), Fn(F) {}
+ StringRef Name; // For debugging.
+ FuncType Fn;
};
- struct Simplifier {
- struct Rule {
- using FuncType = std::function<Value* (Instruction*, LLVMContext&)>;
- Rule(StringRef N, FuncType F) : Name(N), Fn(F) {}
- StringRef Name; // For debugging.
- FuncType Fn;
- };
-
- void addRule(StringRef N, const Rule::FuncType &F) {
- Rules.push_back(Rule(N, F));
- }
+ void addRule(StringRef N, const Rule::FuncType &F) {
+ Rules.push_back(Rule(N, F));
+ }
- private:
- struct WorkListType {
- WorkListType() = default;
+private:
+ struct WorkListType {
+ WorkListType() = default;
- void push_back(Value* V) {
- // Do not push back duplicates.
- if (!S.count(V)) { Q.push_back(V); S.insert(V); }
+ void push_back(Value *V) {
+ // Do not push back duplicates.
+ if (!S.count(V)) {
+ Q.push_back(V);
+ S.insert(V);
}
+ }
- Value *pop_front_val() {
- Value *V = Q.front(); Q.pop_front(); S.erase(V);
- return V;
- }
+ Value *pop_front_val() {
+ Value *V = Q.front();
+ Q.pop_front();
+ S.erase(V);
+ return V;
+ }
- bool empty() const { return Q.empty(); }
+ bool empty() const { return Q.empty(); }
- private:
- std::deque<Value*> Q;
- std::set<Value*> S;
- };
+ private:
+ std::deque<Value *> Q;
+ std::set<Value *> S;
+ };
- using ValueSetType = std::set<Value *>;
+ using ValueSetType = std::set<Value *>;
- std::vector<Rule> Rules;
+ std::vector<Rule> Rules;
- public:
- struct Context {
- using ValueMapType = DenseMap<Value *, Value *>;
+public:
+ struct Context {
+ using ValueMapType = DenseMap<Value *, Value *>;
- Value *Root;
- ValueSetType Used; // The set of all cloned values used by Root.
- ValueSetType Clones; // The set of all cloned values.
- LLVMContext &Ctx;
+ Value *Root;
+ ValueSetType Used; // The set of all cloned values used by Root.
+ ValueSetType Clones; // The set of all cloned values.
+ LLVMContext &Ctx;
- Context(Instruction *Exp)
+ Context(Instruction *Exp)
: Ctx(Exp->getParent()->getParent()->getContext()) {
- initialize(Exp);
- }
-
- ~Context() { cleanup(); }
+ initialize(Exp);
+ }
- void print(raw_ostream &OS, const Value *V) const;
- Value *materialize(BasicBlock *B, BasicBlock::iterator At);
+ ~Context() { cleanup(); }
- private:
- friend struct Simplifier;
+ void print(raw_ostream &OS, const Value *V) const;
+ Value *materialize(BasicBlock *B, BasicBlock::iterator At);
- void initialize(Instruction *Exp);
- void cleanup();
+ private:
+ friend struct Simplifier;
- template <typename FuncT> void traverse(Value *V, FuncT F);
- void record(Value *V);
- void use(Value *V);
- void unuse(Value *V);
+ void initialize(Instruction *Exp);
+ void cleanup();
- bool equal(const Instruction *I, const Instruction *J) const;
- Value *find(Value *Tree, Value *Sub) const;
- Value *subst(Value *Tree, Value *OldV, Value *NewV);
- void replace(Value *OldV, Value *NewV);
- void link(Instruction *I, BasicBlock *B, BasicBlock::iterator At);
- };
+ template <typename FuncT> void traverse(Value *V, FuncT F);
+ void record(Value *V);
+ void use(Value *V);
+ void unuse(Value *V);
- Value *simplify(Context &C);
+ bool equal(const Instruction *I, const Instruction *J) const;
+ Value *find(Value *Tree, Value *Sub) const;
+ Value *subst(Value *Tree, Value *OldV, Value *NewV);
+ void replace(Value *OldV, Value *NewV);
+ void link(Instruction *I, BasicBlock *B, BasicBlock::iterator At);
};
+ Value *simplify(Context &C);
+};
+
struct PE {
PE(const Simplifier::Context &c, Value *v = nullptr) : C(c), V(v) {}
@@ -253,10 +271,10 @@ namespace {
} // end anonymous namespace
-char HexagonLoopIdiomRecognize::ID = 0;
+char HexagonLoopIdiomRecognizeLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(HexagonLoopIdiomRecognize, "hexagon-loop-idiom",
- "Recognize Hexagon-specific loop idioms", false, false)
+INITIALIZE_PASS_BEGIN(HexagonLoopIdiomRecognizeLegacyPass, "hexagon-loop-idiom",
+ "Recognize Hexagon-specific loop idioms", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
@@ -264,8 +282,8 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(HexagonLoopIdiomRecognize, "hexagon-loop-idiom",
- "Recognize Hexagon-specific loop idioms", false, false)
+INITIALIZE_PASS_END(HexagonLoopIdiomRecognizeLegacyPass, "hexagon-loop-idiom",
+ "Recognize Hexagon-specific loop idioms", false, false)
template <typename FuncT>
void Simplifier::Context::traverse(Value *V, FuncT F) {
@@ -1973,7 +1991,7 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
// Get the location that may be stored across the loop. Since the access
// is strided positively through memory, we say that the modified location
// starts at the pointer and has infinite size.
- LocationSize AccessSize = LocationSize::unknown();
+ LocationSize AccessSize = LocationSize::afterPointer();
// If the loop iterates a fixed number of times, we can refine the access
// size to be exactly the size of the memset, which is (BECount+1)*StoreSize
@@ -2404,14 +2422,11 @@ bool HexagonLoopIdiomRecognize::runOnCountableLoop(Loop *L) {
return Changed;
}
-bool HexagonLoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
+bool HexagonLoopIdiomRecognize::run(Loop *L) {
const Module &M = *L->getHeader()->getParent()->getParent();
if (Triple(M.getTargetTriple()).getArch() != Triple::hexagon)
return false;
- if (skipLoop(L))
- return false;
-
// If the loop could not be converted to canonical form, it must have an
// indirectbr in it, just give up.
if (!L->getLoopPreheader())
@@ -2422,13 +2437,7 @@ bool HexagonLoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
if (Name == "memset" || Name == "memcpy" || Name == "memmove")
return false;
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
DL = &L->getHeader()->getModule()->getDataLayout();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LF = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
- *L->getHeader()->getParent());
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
HasMemcpy = TLI->has(LibFunc_memcpy);
HasMemmove = TLI->has(LibFunc_memmove);
@@ -2438,6 +2447,30 @@ bool HexagonLoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
return false;
}
+bool HexagonLoopIdiomRecognizeLegacyPass::runOnLoop(Loop *L,
+ LPPassManager &LPM) {
+ if (skipLoop(L))
+ return false;
+
+ auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *LF = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
+ *L->getHeader()->getParent());
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ return HexagonLoopIdiomRecognize(AA, DT, LF, TLI, SE).run(L);
+}
+
Pass *llvm::createHexagonLoopIdiomPass() {
- return new HexagonLoopIdiomRecognize();
+ return new HexagonLoopIdiomRecognizeLegacyPass();
+}
+
+PreservedAnalyses
+HexagonLoopIdiomRecognitionPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ return HexagonLoopIdiomRecognize(&AR.AA, &AR.DT, &AR.LI, &AR.TLI, &AR.SE)
+ .run(&L)
+ ? getLoopPassPreservedAnalyses()
+ : PreservedAnalyses::all();
}
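
The hunks above complete the usual recipe for porting a legacy LoopPass to the new pass manager: the transformation logic lives in a plain class that takes its analyses as constructor arguments, the legacy runOnLoop fetches those analyses with getAnalysis<> and forwards them, and the new-PM run() pulls them from LoopStandardAnalysisResults. Below is a minimal sketch of that split, assuming the LLVM 12-era loop-pass interfaces used in this patch; MyLoopOptImpl and MyLoopOptPass are illustrative names, not part of the commit, and the analysis headers are abbreviated.

    #include "llvm/Analysis/LoopAnalysisManager.h"
    #include "llvm/Transforms/Scalar/LoopPassManager.h"
    using namespace llvm;

    // Shared implementation: no pass-manager dependencies, analyses are injected.
    class MyLoopOptImpl {
      AliasAnalysis *AA;
      DominatorTree *DT;
      LoopInfo *LI;
      const TargetLibraryInfo *TLI;
      ScalarEvolution *SE;
    public:
      MyLoopOptImpl(AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI,
                    const TargetLibraryInfo *TLI, ScalarEvolution *SE)
          : AA(AA), DT(DT), LI(LI), TLI(TLI), SE(SE) {}
      bool run(Loop *L) { return false; } // placeholder for the real transform
    };

    // New-PM driver: analyses arrive precomputed in LoopStandardAnalysisResults.
    struct MyLoopOptPass : PassInfoMixin<MyLoopOptPass> {
      PreservedAnalyses run(Loop &L, LoopAnalysisManager &,
                            LoopStandardAnalysisResults &AR, LPMUpdater &) {
        bool Changed =
            MyLoopOptImpl(&AR.AA, &AR.DT, &AR.LI, &AR.TLI, &AR.SE).run(&L);
        return Changed ? getLoopPassPreservedAnalyses() : PreservedAnalyses::all();
      }
    };

The legacy wrapper (as in the hunk above) stays a LoopPass whose runOnLoop only gathers analyses and delegates, which keeps the two drivers from duplicating any transformation code.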
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.h b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.h
new file mode 100644
index 000000000000..28ec83b05dac
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.h
@@ -0,0 +1,24 @@
+//===- HexagonLoopIdiomRecognition.h --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONLOOPIDIOMRECOGNITION_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONLOOPIDIOMRECOGNITION_H
+
+#include "llvm/IR/PassManager.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+
+namespace llvm {
+
+struct HexagonLoopIdiomRecognitionPass
+ : PassInfoMixin<HexagonLoopIdiomRecognitionPass> {
+ PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &U);
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONLOOPIDIOMRECOGNITION_H
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp
index 188d91355a35..9507de95231f 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp
@@ -104,7 +104,7 @@ void llvm::HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI,
HexagonMCInstrInfo::setOuterLoop(MCB);
return;
}
- MCInst *MCI = new (AP.OutContext) MCInst;
+ MCInst *MCI = AP.OutContext.createMCInst();
MCI->setOpcode(MI->getOpcode());
assert(MCI->getOpcode() == static_cast<unsigned>(MI->getOpcode()) &&
"MCI opcode should have been set on construction");
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
index c718e5f2d9fb..2cdfbe7845b6 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
@@ -246,7 +246,7 @@ void HexagonOptAddrMode::getAllRealUses(NodeAddr<StmtNode *> SA,
for (NodeAddr<DefNode *> DA : SA.Addr->members_if(DFG->IsDef, *DFG)) {
LLVM_DEBUG(dbgs() << "\t\t[DefNode]: "
<< Print<NodeAddr<DefNode *>>(DA, *DFG) << "\n");
- RegisterRef DR = DFG->getPRI().normalize(DA.Addr->getRegRef(*DFG));
+ RegisterRef DR = DA.Addr->getRegRef(*DFG);
auto UseSet = LV->getAllReachedUses(DR, DA);
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp
index d818e0897f75..e026bb6d601d 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp
@@ -11,7 +11,9 @@
//
//===----------------------------------------------------------------------===//
+#include "Hexagon.h"
#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -19,8 +21,6 @@
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
-#include "Hexagon.h"
-
using namespace llvm;
namespace llvm {
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPatterns.td b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPatterns.td
index cc10627955fb..d216c511a994 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPatterns.td
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPatterns.td
@@ -1,4 +1,4 @@
-//==- HexagonPatterns.td - Target Description for Hexagon -*- tablegen -*-===//
+//===- HexagonPatterns.td - Selection Patterns for Hexagon -*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -229,6 +229,21 @@ def NegImm32: SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(NV, SDLoc(N), MVT::i32);
}]>;
+def SplatB: SDNodeXForm<imm, [{
+ uint32_t V = N->getZExtValue();
+ assert(isUInt<8>(V) || V >> 8 == 0xFFFFFF);
+ V &= 0xFF;
+ uint32_t S = V << 24 | V << 16 | V << 8 | V;
+ return CurDAG->getTargetConstant(S, SDLoc(N), MVT::i32);
+}]>;
+
+def SplatH: SDNodeXForm<imm, [{
+ uint32_t V = N->getZExtValue();
+ assert(isUInt<16>(V) || V >> 16 == 0xFFFF);
+ V &= 0xFFFF;
+ return CurDAG->getTargetConstant(V << 16 | V, SDLoc(N), MVT::i32);
+}]>;
+
// Helpers for type promotions/contractions.
def I1toI32: OutPatFrag<(ops node:$Rs), (C2_muxii (i1 $Rs), 1, 0)>;
@@ -351,12 +366,14 @@ multiclass NopCast_pat<ValueType Ty1, ValueType Ty2, RegisterClass RC> {
def: Pat<(Ty2 (bitconvert (Ty1 RC:$Val))), (Ty2 RC:$Val)>;
}
-
// Frags for commonly used SDNodes.
def Add: pf2<add>; def And: pf2<and>; def Sra: pf2<sra>;
def Sub: pf2<sub>; def Or: pf2<or>; def Srl: pf2<srl>;
def Mul: pf2<mul>; def Xor: pf2<xor>; def Shl: pf2<shl>;
+def Smin: pf2<smin>; def Smax: pf2<smax>;
+def Umin: pf2<umin>; def Umax: pf2<umax>;
+
def Rol: pf2<rotl>;
// --(1) Immediate -------------------------------------------------------
@@ -909,25 +926,14 @@ let AddedComplexity = 200 in {
defm: SelMinMax16_pats<setult, A2_minu, A2_maxu>;
}
-let AddedComplexity = 200 in {
- defm: MinMax_pats<A2_min, A2_max, select, setgt, i1, I32>;
- defm: MinMax_pats<A2_min, A2_max, select, setge, i1, I32>;
- defm: MinMax_pats<A2_max, A2_min, select, setlt, i1, I32>;
- defm: MinMax_pats<A2_max, A2_min, select, setle, i1, I32>;
- defm: MinMax_pats<A2_minu, A2_maxu, select, setugt, i1, I32>;
- defm: MinMax_pats<A2_minu, A2_maxu, select, setuge, i1, I32>;
- defm: MinMax_pats<A2_maxu, A2_minu, select, setult, i1, I32>;
- defm: MinMax_pats<A2_maxu, A2_minu, select, setule, i1, I32>;
-
- defm: MinMax_pats<A2_minp, A2_maxp, select, setgt, i1, I64>;
- defm: MinMax_pats<A2_minp, A2_maxp, select, setge, i1, I64>;
- defm: MinMax_pats<A2_maxp, A2_minp, select, setlt, i1, I64>;
- defm: MinMax_pats<A2_maxp, A2_minp, select, setle, i1, I64>;
- defm: MinMax_pats<A2_minup, A2_maxup, select, setugt, i1, I64>;
- defm: MinMax_pats<A2_minup, A2_maxup, select, setuge, i1, I64>;
- defm: MinMax_pats<A2_maxup, A2_minup, select, setult, i1, I64>;
- defm: MinMax_pats<A2_maxup, A2_minup, select, setule, i1, I64>;
-}
+def: OpR_RR_pat<A2_min, Smin, i32, I32, I32>;
+def: OpR_RR_pat<A2_max, Smax, i32, I32, I32>;
+def: OpR_RR_pat<A2_minu, Umin, i32, I32, I32>;
+def: OpR_RR_pat<A2_maxu, Umax, i32, I32, I32>;
+def: OpR_RR_pat<A2_minp, Smin, i64, I64, I64>;
+def: OpR_RR_pat<A2_maxp, Smax, i64, I64, I64>;
+def: OpR_RR_pat<A2_minup, Umin, i64, I64, I64>;
+def: OpR_RR_pat<A2_maxup, Umax, i64, I64, I64>;
let AddedComplexity = 100 in {
defm: MinMax_pats<F2_sfmin, F2_sfmax, select, setogt, i1, F32>;
@@ -943,18 +949,20 @@ let AddedComplexity = 100, Predicates = [HasV67] in {
defm: MinMax_pats<F2_dfmax, F2_dfmin, select, setole, i1, F64>;
}
-defm: MinMax_pats<A2_vminb, A2_vmaxb, vselect, setgt, v8i1, V8I8>;
-defm: MinMax_pats<A2_vminb, A2_vmaxb, vselect, setge, v8i1, V8I8>;
-defm: MinMax_pats<A2_vminh, A2_vmaxh, vselect, setgt, v4i1, V4I16>;
-defm: MinMax_pats<A2_vminh, A2_vmaxh, vselect, setge, v4i1, V4I16>;
-defm: MinMax_pats<A2_vminw, A2_vmaxw, vselect, setgt, v2i1, V2I32>;
-defm: MinMax_pats<A2_vminw, A2_vmaxw, vselect, setge, v2i1, V2I32>;
-defm: MinMax_pats<A2_vminub, A2_vmaxub, vselect, setugt, v8i1, V8I8>;
-defm: MinMax_pats<A2_vminub, A2_vmaxub, vselect, setuge, v8i1, V8I8>;
-defm: MinMax_pats<A2_vminuh, A2_vmaxuh, vselect, setugt, v4i1, V4I16>;
-defm: MinMax_pats<A2_vminuh, A2_vmaxuh, vselect, setuge, v4i1, V4I16>;
-defm: MinMax_pats<A2_vminuw, A2_vmaxuw, vselect, setugt, v2i1, V2I32>;
-defm: MinMax_pats<A2_vminuw, A2_vmaxuw, vselect, setuge, v2i1, V2I32>;
+def: OpR_RR_pat<A2_vminb, Smin, v8i8, V8I8>;
+def: OpR_RR_pat<A2_vmaxb, Smax, v8i8, V8I8>;
+def: OpR_RR_pat<A2_vminub, Umin, v8i8, V8I8>;
+def: OpR_RR_pat<A2_vmaxub, Umax, v8i8, V8I8>;
+
+def: OpR_RR_pat<A2_vminh, Smin, v4i16, V4I16>;
+def: OpR_RR_pat<A2_vmaxh, Smax, v4i16, V4I16>;
+def: OpR_RR_pat<A2_vminuh, Umin, v4i16, V4I16>;
+def: OpR_RR_pat<A2_vmaxuh, Umax, v4i16, V4I16>;
+
+def: OpR_RR_pat<A2_vminw, Smin, v2i32, V2I32>;
+def: OpR_RR_pat<A2_vmaxw, Smax, v2i32, V2I32>;
+def: OpR_RR_pat<A2_vminuw, Umin, v2i32, V2I32>;
+def: OpR_RR_pat<A2_vmaxuw, Umax, v2i32, V2I32>;
// --(7) Insert/extract --------------------------------------------------
//
@@ -991,21 +999,26 @@ def: Pat<(HexagonEXTRACTU I32:$Rs, I32:$Width, I32:$Off),
def: Pat<(HexagonEXTRACTU I64:$Rs, I32:$Width, I32:$Off),
(S2_extractup_rp I64:$Rs, (Combinew $Width, $Off))>;
-def SDTHexagonVSPLAT:
- SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
-
-def HexagonVSPLAT: SDNode<"HexagonISD::VSPLAT", SDTHexagonVSPLAT>;
-
-def: Pat<(v4i8 (HexagonVSPLAT I32:$Rs)), (S2_vsplatrb I32:$Rs)>;
-def: Pat<(v4i16 (HexagonVSPLAT I32:$Rs)), (S2_vsplatrh I32:$Rs)>;
-def: Pat<(v2i32 (HexagonVSPLAT s8_0ImmPred:$s8)),
+def: Pat<(v4i8 (splat_vector anyint:$V)), (ToI32 (SplatB $V))>;
+def: Pat<(v2i16 (splat_vector anyint:$V)), (ToI32 (SplatH $V))>;
+def: Pat<(v8i8 (splat_vector anyint:$V)),
+ (Combinew (ToI32 (SplatB $V)), (ToI32 (SplatB $V)))>;
+def: Pat<(v4i16 (splat_vector anyint:$V)),
+ (Combinew (ToI32 (SplatH $V)), (ToI32 (SplatH $V)))>;
+let AddedComplexity = 10 in
+def: Pat<(v2i32 (splat_vector s8_0ImmPred:$s8)),
(A2_combineii imm:$s8, imm:$s8)>;
-def: Pat<(v2i32 (HexagonVSPLAT I32:$Rs)), (Combinew I32:$Rs, I32:$Rs)>;
+def: Pat<(v2i32 (splat_vector anyimm:$V)), (Combinew (ToI32 $V), (ToI32 $V))>;
+
+def: Pat<(v4i8 (splat_vector I32:$Rs)), (S2_vsplatrb I32:$Rs)>;
+def: Pat<(v2i16 (splat_vector I32:$Rs)), (LoReg (S2_vsplatrh I32:$Rs))>;
+def: Pat<(v4i16 (splat_vector I32:$Rs)), (S2_vsplatrh I32:$Rs)>;
+def: Pat<(v2i32 (splat_vector I32:$Rs)), (Combinew I32:$Rs, I32:$Rs)>;
let AddedComplexity = 10 in
-def: Pat<(v8i8 (HexagonVSPLAT I32:$Rs)), (S6_vsplatrbp I32:$Rs)>,
+def: Pat<(v8i8 (splat_vector I32:$Rs)), (S6_vsplatrbp I32:$Rs)>,
Requires<[HasV62]>;
-def: Pat<(v8i8 (HexagonVSPLAT I32:$Rs)),
+def: Pat<(v8i8 (splat_vector I32:$Rs)),
(Combinew (S2_vsplatrb I32:$Rs), (S2_vsplatrb I32:$Rs))>;
@@ -1082,9 +1095,9 @@ def FShl32r: OutPatFrag<(ops node:$Rs, node:$Rt, node:$Ru),
(HiReg (S2_asl_r_p (Combinew $Rs, $Rt), $Ru))>;
def FShl64i: OutPatFrag<(ops node:$Rs, node:$Rt, node:$S),
- (S2_lsr_i_p_or (S2_asl_i_p $Rt, $S), $Rs, (Subi<64> $S))>;
+ (S2_lsr_i_p_or (S2_asl_i_p $Rs, $S), $Rt, (Subi<64> $S))>;
def FShl64r: OutPatFrag<(ops node:$Rs, node:$Rt, node:$Ru),
- (S2_lsr_r_p_or (S2_asl_r_p $Rt, $Ru), $Rs, (A2_subri 64, $Ru))>;
+ (S2_lsr_r_p_or (S2_asl_r_p $Rs, $Ru), $Rt, (A2_subri 64, $Ru))>;
// Combined SDNodeXForm: (Divu8 (Subi<64> $S))
def Divu64_8: SDNodeXForm<imm, [{
@@ -1307,17 +1320,17 @@ def: OpR_RR_pat<S2_asr_r_vh, pf2<HexagonVASR>, v4i16, V4I16, I32>;
def: OpR_RR_pat<S2_lsr_r_vw, pf2<HexagonVLSR>, v2i32, V2I32, I32>;
def: OpR_RR_pat<S2_lsr_r_vh, pf2<HexagonVLSR>, v4i16, V4I16, I32>;
-def: Pat<(sra V2I32:$b, (v2i32 (HexagonVSPLAT u5_0ImmPred:$c))),
+def: Pat<(sra V2I32:$b, (v2i32 (splat_vector u5_0ImmPred:$c))),
(S2_asr_i_vw V2I32:$b, imm:$c)>;
-def: Pat<(srl V2I32:$b, (v2i32 (HexagonVSPLAT u5_0ImmPred:$c))),
+def: Pat<(srl V2I32:$b, (v2i32 (splat_vector u5_0ImmPred:$c))),
(S2_lsr_i_vw V2I32:$b, imm:$c)>;
-def: Pat<(shl V2I32:$b, (v2i32 (HexagonVSPLAT u5_0ImmPred:$c))),
+def: Pat<(shl V2I32:$b, (v2i32 (splat_vector u5_0ImmPred:$c))),
(S2_asl_i_vw V2I32:$b, imm:$c)>;
-def: Pat<(sra V4I16:$b, (v4i16 (HexagonVSPLAT u4_0ImmPred:$c))),
+def: Pat<(sra V4I16:$b, (v4i16 (splat_vector u4_0ImmPred:$c))),
(S2_asr_i_vh V4I16:$b, imm:$c)>;
-def: Pat<(srl V4I16:$b, (v4i16 (HexagonVSPLAT u4_0ImmPred:$c))),
+def: Pat<(srl V4I16:$b, (v4i16 (splat_vector u4_0ImmPred:$c))),
(S2_lsr_i_vh V4I16:$b, imm:$c)>;
-def: Pat<(shl V4I16:$b, (v4i16 (HexagonVSPLAT u4_0ImmPred:$c))),
+def: Pat<(shl V4I16:$b, (v4i16 (splat_vector u4_0ImmPred:$c))),
(S2_asl_i_vh V4I16:$b, imm:$c)>;
def: Pat<(HexagonVASR V2I16:$Rs, u4_0ImmPred:$S),
@@ -1688,8 +1701,6 @@ def: Pat<(fma F32:$Rs, F32:$Rt, F32:$Rx),
(F2_sffma F32:$Rx, F32:$Rs, F32:$Rt)>;
def: Pat<(fma (fneg F32:$Rs), F32:$Rt, F32:$Rx),
(F2_sffms F32:$Rx, F32:$Rs, F32:$Rt)>;
-def: Pat<(fma F32:$Rs, (fneg F32:$Rt), F32:$Rx),
- (F2_sffms F32:$Rx, F32:$Rs, F32:$Rt)>;
def: Pat<(mul V2I32:$Rs, V2I32:$Rt),
(PS_vmulw V2I32:$Rs, V2I32:$Rt)>;
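
The new SplatB/SplatH transforms above build a 32-bit immediate by replicating the low byte or halfword, and their relaxed asserts also accept sign-extended inputs. A standalone check of that arithmetic (splatB/splatH below are illustrative helpers, not LLVM APIs):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Mirrors the SplatB/SplatH SDNodeXForms: replicate the low 8 (or 16) bits
    // across a 32-bit word, accepting sign-extended inputs as the asserts do.
    static uint32_t splatB(uint32_t V) {
      assert((V >> 8) == 0 || (V >> 8) == 0xFFFFFF); // u8 or sign-extended i8
      V &= 0xFF;
      return V << 24 | V << 16 | V << 8 | V;
    }
    static uint32_t splatH(uint32_t V) {
      assert((V >> 16) == 0 || (V >> 16) == 0xFFFF); // u16 or sign-extended i16
      V &= 0xFFFF;
      return V << 16 | V;
    }

    int main() {
      // Prints "5a5a5a5a beefbeef".
      std::printf("%08x %08x\n", splatB(0x5A), splatH(0xFFFFBEEF));
      return 0;
    }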
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
index 078a7135c55b..cd894c555adc 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
@@ -1,3 +1,15 @@
+//===- HexagonPatternsHVX.td - Selection Patterns for HVX --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+
+def SDTVecUnaryOp:
+ SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
+
def SDTVecBinOp:
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1,2>]>;
@@ -9,9 +21,6 @@ def SDTHexagonVINSERTW0: SDTypeProfile<1, 2,
[SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>]>;
def HexagonVINSERTW0: SDNode<"HexagonISD::VINSERTW0", SDTHexagonVINSERTW0>;
-def SDTHexagonVSPLATW: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
-def HexagonVSPLATW: SDNode<"HexagonISD::VSPLATW", SDTHexagonVSPLATW>;
-
def HwLen2: SDNodeXForm<imm, [{
const auto &ST = static_cast<const HexagonSubtarget&>(CurDAG->getSubtarget());
return CurDAG->getTargetConstant(ST.getVectorLength()/2, SDLoc(N), MVT::i32);
@@ -33,37 +42,29 @@ def Combineq: OutPatFrag<(ops node:$Qs, node:$Qt),
def LoVec: OutPatFrag<(ops node:$Vs), (EXTRACT_SUBREG $Vs, vsub_lo)>;
def HiVec: OutPatFrag<(ops node:$Vs), (EXTRACT_SUBREG $Vs, vsub_hi)>;
-def HexagonVZERO: SDNode<"HexagonISD::VZERO", SDTVecLeaf>;
def HexagonQCAT: SDNode<"HexagonISD::QCAT", SDTVecBinOp>;
def HexagonQTRUE: SDNode<"HexagonISD::QTRUE", SDTVecLeaf>;
def HexagonQFALSE: SDNode<"HexagonISD::QFALSE", SDTVecLeaf>;
+def HexagonVPACKL: SDNode<"HexagonISD::VPACKL", SDTVecUnaryOp>;
+def HexagonVUNPACK: SDNode<"HexagonISD::VUNPACK", SDTVecUnaryOp>;
+def HexagonVUNPACKU: SDNode<"HexagonISD::VUNPACKU", SDTVecUnaryOp>;
-def vzero: PatFrag<(ops), (HexagonVZERO)>;
+def vzero: PatFrag<(ops), (splat_vector (i32 0))>;
def qtrue: PatFrag<(ops), (HexagonQTRUE)>;
def qfalse: PatFrag<(ops), (HexagonQFALSE)>;
def qcat: PatFrag<(ops node:$Qs, node:$Qt),
(HexagonQCAT node:$Qs, node:$Qt)>;
-def qnot: PatFrag<(ops node:$Qs), (xor node:$Qs, qtrue)>;
+def qnot: PatFrag<(ops node:$Qs), (xor node:$Qs, qtrue)>;
+def vpackl: PatFrag<(ops node:$Vs), (HexagonVPACKL node:$Vs)>;
+def vunpack: PatFrag<(ops node:$Vs), (HexagonVUNPACK node:$Vs)>;
+def vunpacku: PatFrag<(ops node:$Vs), (HexagonVUNPACKU node:$Vs)>;
def VSxtb: OutPatFrag<(ops node:$Vs), (V6_vunpackb $Vs)>;
def VSxth: OutPatFrag<(ops node:$Vs), (V6_vunpackh $Vs)>;
def VZxtb: OutPatFrag<(ops node:$Vs), (V6_vunpackub $Vs)>;
def VZxth: OutPatFrag<(ops node:$Vs), (V6_vunpackuh $Vs)>;
-def SplatB: SDNodeXForm<imm, [{
- uint32_t V = N->getZExtValue();
- assert(isUInt<8>(V));
- uint32_t S = V << 24 | V << 16 | V << 8 | V;
- return CurDAG->getTargetConstant(S, SDLoc(N), MVT::i32);
-}]>;
-
-def SplatH: SDNodeXForm<imm, [{
- uint32_t V = N->getZExtValue();
- assert(isUInt<16>(V));
- return CurDAG->getTargetConstant(V << 16 | V, SDLoc(N), MVT::i32);
-}]>;
-
def IsVecOff : PatLeaf<(i32 imm), [{
int32_t V = N->getSExtValue();
int32_t VecSize = HRI->getSpillSize(Hexagon::HvxVRRegClass);
@@ -171,16 +172,19 @@ let Predicates = [UseHVX] in {
}
let Predicates = [UseHVX] in {
- def: Pat<(VecI8 vzero), (V6_vd0)>;
- def: Pat<(VecI16 vzero), (V6_vd0)>;
- def: Pat<(VecI32 vzero), (V6_vd0)>;
- def: Pat<(VecPI8 vzero), (PS_vdd0)>;
- def: Pat<(VecPI16 vzero), (PS_vdd0)>;
- def: Pat<(VecPI32 vzero), (PS_vdd0)>;
-
- def: Pat<(concat_vectors (VecI8 vzero), (VecI8 vzero)), (PS_vdd0)>;
- def: Pat<(concat_vectors (VecI16 vzero), (VecI16 vzero)), (PS_vdd0)>;
- def: Pat<(concat_vectors (VecI32 vzero), (VecI32 vzero)), (PS_vdd0)>;
+ let AddedComplexity = 100 in {
+ // These should be preferred over a vsplat of 0.
+ def: Pat<(VecI8 vzero), (V6_vd0)>;
+ def: Pat<(VecI16 vzero), (V6_vd0)>;
+ def: Pat<(VecI32 vzero), (V6_vd0)>;
+ def: Pat<(VecPI8 vzero), (PS_vdd0)>;
+ def: Pat<(VecPI16 vzero), (PS_vdd0)>;
+ def: Pat<(VecPI32 vzero), (PS_vdd0)>;
+
+ def: Pat<(concat_vectors (VecI8 vzero), (VecI8 vzero)), (PS_vdd0)>;
+ def: Pat<(concat_vectors (VecI16 vzero), (VecI16 vzero)), (PS_vdd0)>;
+ def: Pat<(concat_vectors (VecI32 vzero), (VecI32 vzero)), (PS_vdd0)>;
+ }
def: Pat<(VecPI8 (concat_vectors HVI8:$Vs, HVI8:$Vt)),
(Combinev HvxVR:$Vt, HvxVR:$Vs)>;
@@ -207,63 +211,70 @@ let Predicates = [UseHVX] in {
(V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
}
-def Vsplatib: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 (SplatB $V)))>;
-def Vsplatih: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 (SplatH $V)))>;
-def Vsplatiw: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 $V))>;
-
-def Vsplatrb: OutPatFrag<(ops node:$Rs), (V6_lvsplatw (S2_vsplatrb $Rs))>;
-def Vsplatrh: OutPatFrag<(ops node:$Rs),
- (V6_lvsplatw (A2_combine_ll $Rs, $Rs))>;
-def Vsplatrw: OutPatFrag<(ops node:$Rs), (V6_lvsplatw $Rs)>;
+// Splats for HvxV60
+def V60splatib: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 (SplatB $V)))>;
+def V60splatih: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 (SplatH $V)))>;
+def V60splatiw: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 $V))>;
+def V60splatrb: OutPatFrag<(ops node:$Rs), (V6_lvsplatw (S2_vsplatrb $Rs))>;
+def V60splatrh: OutPatFrag<(ops node:$Rs),
+ (V6_lvsplatw (A2_combine_ll $Rs, $Rs))>;
+def V60splatrw: OutPatFrag<(ops node:$Rs), (V6_lvsplatw $Rs)>;
+
+// Splats for HvxV62+
+def V62splatib: OutPatFrag<(ops node:$V), (V6_lvsplatb (ToI32 $V))>;
+def V62splatih: OutPatFrag<(ops node:$V), (V6_lvsplath (ToI32 $V))>;
+def V62splatiw: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 $V))>;
+def V62splatrb: OutPatFrag<(ops node:$Rs), (V6_lvsplatb $Rs)>;
+def V62splatrh: OutPatFrag<(ops node:$Rs), (V6_lvsplath $Rs)>;
+def V62splatrw: OutPatFrag<(ops node:$Rs), (V6_lvsplatw $Rs)>;
def Rep: OutPatFrag<(ops node:$N), (Combinev $N, $N)>;
-let Predicates = [UseHVX] in {
+let Predicates = [UseHVX,UseHVXV60] in {
let AddedComplexity = 10 in {
- def: Pat<(VecI8 (HexagonVSPLAT u8_0ImmPred:$V)), (Vsplatib $V)>;
- def: Pat<(VecI16 (HexagonVSPLAT u16_0ImmPred:$V)), (Vsplatih $V)>;
- def: Pat<(VecI32 (HexagonVSPLAT anyimm:$V)), (Vsplatiw $V)>;
- def: Pat<(VecPI8 (HexagonVSPLAT u8_0ImmPred:$V)), (Rep (Vsplatib $V))>;
- def: Pat<(VecPI16 (HexagonVSPLAT u16_0ImmPred:$V)), (Rep (Vsplatih $V))>;
- def: Pat<(VecPI32 (HexagonVSPLAT anyimm:$V)), (Rep (Vsplatiw $V))>;
+ def: Pat<(VecI8 (splat_vector u8_0ImmPred:$V)), (V60splatib $V)>;
+ def: Pat<(VecI16 (splat_vector u16_0ImmPred:$V)), (V60splatih $V)>;
+ def: Pat<(VecI32 (splat_vector anyimm:$V)), (V60splatiw $V)>;
+ def: Pat<(VecPI8 (splat_vector u8_0ImmPred:$V)), (Rep (V60splatib $V))>;
+ def: Pat<(VecPI16 (splat_vector u16_0ImmPred:$V)), (Rep (V60splatih $V))>;
+ def: Pat<(VecPI32 (splat_vector anyimm:$V)), (Rep (V60splatiw $V))>;
+ }
+ def: Pat<(VecI8 (splat_vector I32:$Rs)), (V60splatrb $Rs)>;
+ def: Pat<(VecI16 (splat_vector I32:$Rs)), (V60splatrh $Rs)>;
+ def: Pat<(VecI32 (splat_vector I32:$Rs)), (V60splatrw $Rs)>;
+ def: Pat<(VecPI8 (splat_vector I32:$Rs)), (Rep (V60splatrb $Rs))>;
+ def: Pat<(VecPI16 (splat_vector I32:$Rs)), (Rep (V60splatrh $Rs))>;
+ def: Pat<(VecPI32 (splat_vector I32:$Rs)), (Rep (V60splatrw $Rs))>;
+}
+let Predicates = [UseHVX,UseHVXV62] in {
+ let AddedComplexity = 30 in {
+ def: Pat<(VecI8 (splat_vector u8_0ImmPred:$V)), (V62splatib imm:$V)>;
+ def: Pat<(VecI16 (splat_vector u16_0ImmPred:$V)), (V62splatih imm:$V)>;
+ def: Pat<(VecI32 (splat_vector anyimm:$V)), (V62splatiw imm:$V)>;
+ def: Pat<(VecPI8 (splat_vector u8_0ImmPred:$V)),
+ (Rep (V62splatib imm:$V))>;
+ def: Pat<(VecPI16 (splat_vector u16_0ImmPred:$V)),
+ (Rep (V62splatih imm:$V))>;
+ def: Pat<(VecPI32 (splat_vector anyimm:$V)),
+ (Rep (V62splatiw imm:$V))>;
+ }
+ let AddedComplexity = 20 in {
+ def: Pat<(VecI8 (splat_vector I32:$Rs)), (V62splatrb $Rs)>;
+ def: Pat<(VecI16 (splat_vector I32:$Rs)), (V62splatrh $Rs)>;
+ def: Pat<(VecI32 (splat_vector I32:$Rs)), (V62splatrw $Rs)>;
+ def: Pat<(VecPI8 (splat_vector I32:$Rs)), (Rep (V62splatrb $Rs))>;
+ def: Pat<(VecPI16 (splat_vector I32:$Rs)), (Rep (V62splatrh $Rs))>;
+ def: Pat<(VecPI32 (splat_vector I32:$Rs)), (Rep (V62splatrw $Rs))>;
}
- def: Pat<(VecI8 (HexagonVSPLAT I32:$Rs)), (Vsplatrb $Rs)>;
- def: Pat<(VecI16 (HexagonVSPLAT I32:$Rs)), (Vsplatrh $Rs)>;
- def: Pat<(VecI32 (HexagonVSPLAT I32:$Rs)), (Vsplatrw $Rs)>;
- def: Pat<(VecPI8 (HexagonVSPLAT I32:$Rs)), (Rep (Vsplatrb $Rs))>;
- def: Pat<(VecPI16 (HexagonVSPLAT I32:$Rs)), (Rep (Vsplatrh $Rs))>;
- def: Pat<(VecPI32 (HexagonVSPLAT I32:$Rs)), (Rep (Vsplatrw $Rs))>;
-
- def: Pat<(VecI8 (HexagonVSPLATW I32:$Rs)), (Vsplatrw $Rs)>;
- def: Pat<(VecI16 (HexagonVSPLATW I32:$Rs)), (Vsplatrw $Rs)>;
- def: Pat<(VecI32 (HexagonVSPLATW I32:$Rs)), (Vsplatrw $Rs)>;
- def: Pat<(VecPI8 (HexagonVSPLATW I32:$Rs)), (Rep (Vsplatrw $Rs))>;
- def: Pat<(VecPI16 (HexagonVSPLATW I32:$Rs)), (Rep (Vsplatrw $Rs))>;
- def: Pat<(VecPI32 (HexagonVSPLATW I32:$Rs)), (Rep (Vsplatrw $Rs))>;
}
class Vneg1<ValueType VecTy>
- : PatFrag<(ops), (VecTy (HexagonVSPLATW (i32 -1)))>;
+ : PatFrag<(ops), (VecTy (splat_vector (i32 -1)))>;
class Vnot<ValueType VecTy>
: PatFrag<(ops node:$Vs), (xor $Vs, Vneg1<VecTy>)>;
let Predicates = [UseHVX] in {
- let AddedComplexity = 220 in {
- defm: MinMax_pats<V6_vminb, V6_vmaxb, vselect, setgt, VecQ8, HVI8>;
- defm: MinMax_pats<V6_vminb, V6_vmaxb, vselect, setge, VecQ8, HVI8>;
- defm: MinMax_pats<V6_vminub, V6_vmaxub, vselect, setugt, VecQ8, HVI8>;
- defm: MinMax_pats<V6_vminub, V6_vmaxub, vselect, setuge, VecQ8, HVI8>;
- defm: MinMax_pats<V6_vminh, V6_vmaxh, vselect, setgt, VecQ16, HVI16>;
- defm: MinMax_pats<V6_vminh, V6_vmaxh, vselect, setge, VecQ16, HVI16>;
- defm: MinMax_pats<V6_vminuh, V6_vmaxuh, vselect, setugt, VecQ16, HVI16>;
- defm: MinMax_pats<V6_vminuh, V6_vmaxuh, vselect, setuge, VecQ16, HVI16>;
- defm: MinMax_pats<V6_vminw, V6_vmaxw, vselect, setgt, VecQ32, HVI32>;
- defm: MinMax_pats<V6_vminw, V6_vmaxw, vselect, setge, VecQ32, HVI32>;
- }
-}
-
-let Predicates = [UseHVX] in {
let AddedComplexity = 200 in {
def: Pat<(Vnot<VecI8> HVI8:$Vs), (V6_vnot HvxVR:$Vs)>;
def: Pat<(Vnot<VecI16> HVI16:$Vs), (V6_vnot HvxVR:$Vs)>;
@@ -292,6 +303,17 @@ let Predicates = [UseHVX] in {
def: OpR_RR_pat<V6_vxor, Xor, VecI16, HVI16>;
def: OpR_RR_pat<V6_vxor, Xor, VecI32, HVI32>;
+ def: OpR_RR_pat<V6_vminb, Smin, VecI8, HVI8>;
+ def: OpR_RR_pat<V6_vmaxb, Smax, VecI8, HVI8>;
+ def: OpR_RR_pat<V6_vminub, Umin, VecI8, HVI8>;
+ def: OpR_RR_pat<V6_vmaxub, Umax, VecI8, HVI8>;
+ def: OpR_RR_pat<V6_vminh, Smin, VecI16, HVI16>;
+ def: OpR_RR_pat<V6_vmaxh, Smax, VecI16, HVI16>;
+ def: OpR_RR_pat<V6_vminuh, Umin, VecI16, HVI16>;
+ def: OpR_RR_pat<V6_vmaxuh, Umax, VecI16, HVI16>;
+ def: OpR_RR_pat<V6_vminw, Smin, VecI32, HVI32>;
+ def: OpR_RR_pat<V6_vmaxw, Smax, VecI32, HVI32>;
+
def: Pat<(vselect HQ8:$Qu, HVI8:$Vs, HVI8:$Vt),
(V6_vmux HvxQR:$Qu, HvxVR:$Vs, HvxVR:$Vt)>;
def: Pat<(vselect HQ16:$Qu, HVI16:$Vs, HVI16:$Vt),
@@ -308,6 +330,20 @@ let Predicates = [UseHVX] in {
}
let Predicates = [UseHVX] in {
+ // For i8 vectors Vs = (a0, a1, ...), Vt = (b0, b1, ...),
+ // V6_vmpybv Vs, Vt produces a pair of i16 vectors Hi:Lo,
+ // where Lo = (a0*b0, a2*b2, ...), Hi = (a1*b1, a3*b3, ...).
+ def: Pat<(mul HVI8:$Vs, HVI8:$Vt),
+ (V6_vshuffeb (HiVec (V6_vmpybv HvxVR:$Vs, HvxVR:$Vt)),
+ (LoVec (V6_vmpybv HvxVR:$Vs, HvxVR:$Vt)))>;
+ def: Pat<(mul HVI16:$Vs, HVI16:$Vt),
+ (V6_vmpyih HvxVR:$Vs, HvxVR:$Vt)>;
+ def: Pat<(mul HVI32:$Vs, HVI32:$Vt),
+ (V6_vmpyiewuh_acc (V6_vmpyieoh HvxVR:$Vs, HvxVR:$Vt),
+ HvxVR:$Vs, HvxVR:$Vt)>;
+}
+
+let Predicates = [UseHVX] in {
def: Pat<(VecPI16 (sext HVI8:$Vs)), (VSxtb $Vs)>;
def: Pat<(VecPI32 (sext HVI16:$Vs)), (VSxth $Vs)>;
def: Pat<(VecPI16 (zext HVI8:$Vs)), (VZxtb $Vs)>;
@@ -364,6 +400,14 @@ let Predicates = [UseHVX] in {
(V6_vasrw (V6_vaslw HVI32:$Vs, (A2_tfrsi 16)), (A2_tfrsi 16))>;
}
+ // Take a pair of vectors Vt:Vs and shift them towards LSB by (Rt & HwLen).
+ def: Pat<(VecI8 (valign HVI8:$Vt, HVI8:$Vs, I32:$Rt)),
+ (LoVec (V6_valignb HvxVR:$Vt, HvxVR:$Vs, I32:$Rt))>;
+ def: Pat<(VecI16 (valign HVI16:$Vt, HVI16:$Vs, I32:$Rt)),
+ (LoVec (V6_valignb HvxVR:$Vt, HvxVR:$Vs, I32:$Rt))>;
+ def: Pat<(VecI32 (valign HVI32:$Vt, HVI32:$Vs, I32:$Rt)),
+ (LoVec (V6_valignb HvxVR:$Vt, HvxVR:$Vs, I32:$Rt))>;
+
def: Pat<(HexagonVASL HVI8:$Vs, I32:$Rt),
(V6_vpackeb (V6_vaslh (HiVec (VZxtb HvxVR:$Vs)), I32:$Rt),
(V6_vaslh (LoVec (VZxtb HvxVR:$Vs)), I32:$Rt))>;
@@ -393,10 +437,43 @@ let Predicates = [UseHVX] in {
def: Pat<(srl HVI16:$Vs, HVI16:$Vt), (V6_vlsrhv HvxVR:$Vs, HvxVR:$Vt)>;
def: Pat<(srl HVI32:$Vs, HVI32:$Vt), (V6_vlsrwv HvxVR:$Vs, HvxVR:$Vt)>;
- def: Pat<(VecI16 (bswap HVI16:$Vs)),
- (V6_vdelta HvxVR:$Vs, (V6_lvsplatw (A2_tfrsi 0x01010101)))>;
- def: Pat<(VecI32 (bswap HVI32:$Vs)),
- (V6_vdelta HvxVR:$Vs, (V6_lvsplatw (A2_tfrsi 0x03030303)))>;
+ // Vpackl is a pseudo-op that is used when legalizing widened truncates.
+ // It should never be produced with a register pair in the output, but
+ // it can happen to have a pair as an input.
+ def: Pat<(VecI8 (vpackl HVI16:$Vs)), (V6_vdealb HvxVR:$Vs)>;
+ def: Pat<(VecI8 (vpackl HVI32:$Vs)), (V6_vdealb4w (IMPLICIT_DEF), HvxVR:$Vs)>;
+ def: Pat<(VecI16 (vpackl HVI32:$Vs)), (V6_vdealh HvxVR:$Vs)>;
+ def: Pat<(VecI8 (vpackl HWI16:$Vs)), (V6_vpackeb (HiVec $Vs), (LoVec $Vs))>;
+ def: Pat<(VecI8 (vpackl HWI32:$Vs)),
+ (V6_vpackeb (IMPLICIT_DEF), (V6_vpackeh (HiVec $Vs), (LoVec $Vs)))>;
+ def: Pat<(VecI16 (vpackl HWI32:$Vs)), (V6_vpackeh (HiVec $Vs), (LoVec $Vs))>;
+
+ def: Pat<(VecI16 (vunpack HVI8:$Vs)), (LoVec (VSxtb $Vs))>;
+ def: Pat<(VecI32 (vunpack HVI8:$Vs)), (LoVec (VSxth (LoVec (VSxtb $Vs))))>;
+ def: Pat<(VecI32 (vunpack HVI16:$Vs)), (LoVec (VSxth $Vs))>;
+ def: Pat<(VecPI16 (vunpack HVI8:$Vs)), (VSxtb $Vs)>;
+ def: Pat<(VecPI32 (vunpack HVI8:$Vs)), (VSxth (LoVec (VSxtb $Vs)))>;
+ def: Pat<(VecPI32 (vunpack HVI32:$Vs)), (VSxth $Vs)>;
+
+ def: Pat<(VecI16 (vunpacku HVI8:$Vs)), (LoVec (VZxtb $Vs))>;
+ def: Pat<(VecI32 (vunpacku HVI8:$Vs)), (LoVec (VZxth (LoVec (VZxtb $Vs))))>;
+ def: Pat<(VecI32 (vunpacku HVI16:$Vs)), (LoVec (VZxth $Vs))>;
+ def: Pat<(VecPI16 (vunpacku HVI8:$Vs)), (VZxtb $Vs)>;
+ def: Pat<(VecPI32 (vunpacku HVI8:$Vs)), (VZxth (LoVec (VZxtb $Vs)))>;
+ def: Pat<(VecPI32 (vunpacku HVI32:$Vs)), (VZxth $Vs)>;
+
+ let Predicates = [UseHVX,UseHVXV60] in {
+ def: Pat<(VecI16 (bswap HVI16:$Vs)),
+ (V6_vdelta HvxVR:$Vs, (V60splatib (i32 0x01)))>;
+ def: Pat<(VecI32 (bswap HVI32:$Vs)),
+ (V6_vdelta HvxVR:$Vs, (V60splatib (i32 0x03)))>;
+ }
+ let Predicates = [UseHVX,UseHVXV62], AddedComplexity = 10 in {
+ def: Pat<(VecI16 (bswap HVI16:$Vs)),
+ (V6_vdelta HvxVR:$Vs, (V62splatib (i32 0x01)))>;
+ def: Pat<(VecI32 (bswap HVI32:$Vs)),
+ (V6_vdelta HvxVR:$Vs, (V62splatib (i32 0x03)))>;
+ }
def: Pat<(VecI8 (ctpop HVI8:$Vs)),
(V6_vpackeb (V6_vpopcounth (HiVec (V6_vunpackub HvxVR:$Vs))),
@@ -406,10 +483,17 @@ let Predicates = [UseHVX] in {
(V6_vaddw (LoVec (V6_vzh (V6_vpopcounth HvxVR:$Vs))),
(HiVec (V6_vzh (V6_vpopcounth HvxVR:$Vs))))>;
+ let Predicates = [UseHVX,UseHVXV60] in
def: Pat<(VecI8 (ctlz HVI8:$Vs)),
(V6_vsubb (V6_vpackeb (V6_vcl0h (HiVec (V6_vunpackub HvxVR:$Vs))),
(V6_vcl0h (LoVec (V6_vunpackub HvxVR:$Vs)))),
- (V6_lvsplatw (A2_tfrsi 0x08080808)))>;
+ (V60splatib (i32 0x08)))>;
+ let Predicates = [UseHVX,UseHVXV62], AddedComplexity = 10 in
+ def: Pat<(VecI8 (ctlz HVI8:$Vs)),
+ (V6_vsubb (V6_vpackeb (V6_vcl0h (HiVec (V6_vunpackub HvxVR:$Vs))),
+ (V6_vcl0h (LoVec (V6_vunpackub HvxVR:$Vs)))),
+ (V62splatib (i32 0x08)))>;
+
def: Pat<(VecI16 (ctlz HVI16:$Vs)), (V6_vcl0h HvxVR:$Vs)>;
def: Pat<(VecI32 (ctlz HVI32:$Vs)), (V6_vcl0w HvxVR:$Vs)>;
}
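
Among the HVX changes above, the integer multiply patterns are the least obvious: per the comment accompanying them, V6_vmpybv forms 16-bit products with the even lanes in Lo and the odd lanes in Hi, and V6_vshuffeb re-interleaves the low bytes of those products into the i8 result. A scalar model of that lowering, using an illustrative 8-lane width (real HVX registers are 64 or 128 bytes):

    #include <array>
    #include <cstdint>
    #include <cstdio>

    int main() {
      constexpr int N = 8; // stand-in for one HVX register of bytes
      std::array<int8_t, N> Vs{1, -2, 3, 4, 5, 6, 7, 8};
      std::array<int8_t, N> Vt{9, 10, -11, 12, 13, 14, 15, 16};

      std::array<int16_t, N / 2> Lo, Hi; // V6_vmpybv output pair Hi:Lo
      for (int i = 0; i < N / 2; ++i) {
        Lo[i] = int16_t(Vs[2 * i]) * Vt[2 * i];         // even lanes
        Hi[i] = int16_t(Vs[2 * i + 1]) * Vt[2 * i + 1]; // odd lanes
      }

      std::array<int8_t, N> Res; // V6_vshuffeb: keep the low byte of each product
      for (int i = 0; i < N / 2; ++i) {
        Res[2 * i] = int8_t(Lo[i] & 0xFF);
        Res[2 * i + 1] = int8_t(Hi[i] & 0xFF);
      }

      // Each comparison prints 1: the result matches plain i8 multiplication.
      for (int i = 0; i < N; ++i)
        std::printf("%d ", Res[i] == int8_t(Vs[i] * Vt[i]));
      std::printf("\n");
      return 0;
    }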
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPeephole.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPeephole.cpp
index d0b02f035d1e..fc31139e13ce 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPeephole.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -139,8 +139,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
Register DstReg = Dst.getReg();
Register SrcReg = Src.getReg();
// Just handle virtual registers.
- if (Register::isVirtualRegister(DstReg) &&
- Register::isVirtualRegister(SrcReg)) {
+ if (DstReg.isVirtual() && SrcReg.isVirtual()) {
// Map the following:
// %170 = SXTW %166
// PeepholeMap[170] = %166
@@ -188,8 +187,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
Register DstReg = Dst.getReg();
Register SrcReg = Src.getReg();
// Just handle virtual registers.
- if (Register::isVirtualRegister(DstReg) &&
- Register::isVirtualRegister(SrcReg)) {
+ if (DstReg.isVirtual() && SrcReg.isVirtual()) {
// Map the following:
// %170 = NOT_xx %166
// PeepholeMap[170] = %166
@@ -210,8 +208,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
Register DstReg = Dst.getReg();
Register SrcReg = Src.getReg();
- if (Register::isVirtualRegister(DstReg) &&
- Register::isVirtualRegister(SrcReg)) {
+ if (DstReg.isVirtual() && SrcReg.isVirtual()) {
// Try to find in the map.
if (unsigned PeepholeSrc = PeepholeMap.lookup(SrcReg)) {
// Change the 1st operand.
@@ -242,7 +239,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
if (RC0->getID() == Hexagon::PredRegsRegClassID) {
// Handle instructions that have a predicate register in op0
// (most cases of predicable instructions).
- if (Register::isVirtualRegister(Reg0)) {
+ if (Reg0.isVirtual()) {
// Try to find in the map.
if (unsigned PeepholeSrc = PeepholeMap.lookup(Reg0)) {
// Change the 1st operand and flip the opcode.
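
The isVirtual() cleanups in this file (and in the register-related files below) rely on Register being a thin wrapper around the raw register number, with member predicates replacing the older static helpers. A minimal sketch, assuming the llvm/CodeGen/Register.h interface the patch uses:

    #include "llvm/CodeGen/Register.h"

    // Register converts implicitly from/to the raw unsigned number; the member
    // predicate is equivalent to the static Register::isVirtualRegister(unsigned).
    static bool bothVirtual(llvm::Register Dst, llvm::Register Src) {
      return Dst.isVirtual() && Src.isVirtual();
    }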
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 52f247977094..5ece577e8285 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -207,7 +207,7 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int FI = MI.getOperand(FIOp).getIndex();
// Select the base pointer (BP) and calculate the actual offset from BP
// to the beginning of the object at index FI.
- int Offset = HFI.getFrameIndexReference(MF, FI, BP);
+ int Offset = HFI.getFrameIndexReference(MF, FI, BP).getFixed();
// Add the offset from the instruction.
int RealOffset = Offset + MI.getOperand(FIOp+1).getImm();
bool IsKill = false;
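
The added .getFixed() reflects the frame-index query now returning a StackOffset, which carries separate fixed and scalable components; Hexagon has no scalable stack objects, so only the fixed part is consumed. A minimal sketch, with the header location taken from the LLVM 12-era tree and best treated as an assumption:

    #include "llvm/Support/TypeSize.h" // StackOffset (fixed + scalable parts)

    // Targets without scalable vectors on the stack only populate the fixed part.
    static int64_t frameOffsetBytes() {
      llvm::StackOffset Off = llvm::StackOffset::getFixed(-16);
      return Off.getFixed(); // what eliminateFrameIndex consumes above
    }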
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
index b45d871e04d6..c8c66ebb69cd 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
@@ -97,7 +97,7 @@ namespace {
bool isFixedInstr(const MachineInstr *MI) const;
void partitionRegisters(UUSetMap &P2Rs);
int32_t profit(const MachineInstr *MI) const;
- int32_t profit(unsigned Reg) const;
+ int32_t profit(Register Reg) const;
bool isProfitable(const USet &Part, LoopRegMap &IRM) const;
void collectIndRegsForLoop(const MachineLoop *L, USet &Rs);
@@ -211,7 +211,7 @@ bool HexagonSplitDoubleRegs::isFixedInstr(const MachineInstr *MI) const {
if (!Op.isReg())
continue;
Register R = Op.getReg();
- if (!Register::isVirtualRegister(R))
+ if (!R.isVirtual())
return true;
}
return false;
@@ -259,7 +259,7 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) {
if (&MO == &Op || !MO.isReg() || MO.getSubReg())
continue;
Register T = MO.getReg();
- if (!Register::isVirtualRegister(T)) {
+ if (!T.isVirtual()) {
FixedRegs.set(x);
continue;
}
@@ -399,8 +399,8 @@ int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const {
return 0;
}
-int32_t HexagonSplitDoubleRegs::profit(unsigned Reg) const {
- assert(Register::isVirtualRegister(Reg));
+int32_t HexagonSplitDoubleRegs::profit(Register Reg) const {
+ assert(Reg.isVirtual());
const MachineInstr *DefI = MRI->getVRegDef(Reg);
switch (DefI->getOpcode()) {
@@ -574,12 +574,9 @@ void HexagonSplitDoubleRegs::collectIndRegs(LoopRegMap &IRM) {
LoopVector WorkQ;
- for (auto I : *MLI)
- WorkQ.push_back(I);
- for (unsigned i = 0; i < WorkQ.size(); ++i) {
- for (auto I : *WorkQ[i])
- WorkQ.push_back(I);
- }
+ append_range(WorkQ, *MLI);
+ for (unsigned i = 0; i < WorkQ.size(); ++i)
+ append_range(WorkQ, *WorkQ[i]);
USet Rs;
for (unsigned i = 0, n = WorkQ.size(); i < n; ++i) {
@@ -605,7 +602,7 @@ void HexagonSplitDoubleRegs::createHalfInstr(unsigned Opc, MachineInstr *MI,
// For register operands, set the subregister.
Register R = Op.getReg();
unsigned SR = Op.getSubReg();
- bool isVirtReg = Register::isVirtualRegister(R);
+ bool isVirtReg = R.isVirtual();
bool isKill = Op.isKill();
if (isVirtReg && MRI->getRegClass(R) == DoubleRC) {
isKill = false;
@@ -1106,7 +1103,7 @@ void HexagonSplitDoubleRegs::collapseRegPairs(MachineInstr *MI,
if (!Op.isReg() || !Op.isUse())
continue;
Register R = Op.getReg();
- if (!Register::isVirtualRegister(R))
+ if (!R.isVirtual())
continue;
if (MRI->getRegClass(R) != DoubleRC || Op.getSubReg())
continue;
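
The collectIndRegs change above replaces hand-written push_back loops with append_range, which appends a whole range to a container; iterating with a growing index then flattens the loop nest breadth-first. A sketch of the idiom, assuming a loop type that exposes getSubLoops() as MachineLoop does:

    #include "llvm/ADT/STLExtras.h" // llvm::append_range
    #include <vector>

    // append_range(C, R) is equivalent to C.insert(C.end(), R.begin(), R.end()).
    template <typename LoopT>
    std::vector<LoopT *> flattenLoopNest(const std::vector<LoopT *> &TopLevel) {
      std::vector<LoopT *> WorkQ;
      llvm::append_range(WorkQ, TopLevel);
      for (unsigned i = 0; i < WorkQ.size(); ++i) // WorkQ grows as we iterate
        llvm::append_range(WorkQ, WorkQ[i]->getSubLoops());
      return WorkQ;
    }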
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
index 2b7e1bcba9a3..87b1c43961d7 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -10,10 +10,10 @@
//
//===----------------------------------------------------------------------===//
+#include "HexagonSubtarget.h"
#include "Hexagon.h"
#include "HexagonInstrInfo.h"
#include "HexagonRegisterInfo.h"
-#include "HexagonSubtarget.h"
#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
@@ -26,6 +26,7 @@
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <map>
@@ -38,7 +39,6 @@ using namespace llvm;
#define GET_SUBTARGETINFO_TARGET_DESC
#include "HexagonGenSubtargetInfo.inc"
-
static cl::opt<bool> EnableBSBSched("enable-bsb-sched",
cl::Hidden, cl::ZeroOrMore, cl::init(true));
@@ -77,7 +77,8 @@ static cl::opt<bool> EnableCheckBankConflict("hexagon-check-bank-conflict",
HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU,
StringRef FS, const TargetMachine &TM)
- : HexagonGenSubtargetInfo(TT, CPU, FS), OptLevel(TM.getOptLevel()),
+ : HexagonGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
+ OptLevel(TM.getOptLevel()),
CPUString(std::string(Hexagon_MC::selectHexagonCPU(CPU))),
TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
RegInfo(getHwMode()), TLInfo(TM, *this),
@@ -104,7 +105,7 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
UseBSBScheduling = hasV60Ops() && EnableBSBSched;
- ParseSubtargetFeatures(CPUString, FS);
+ ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, FS);
if (OverrideLongCalls.getPosition())
UseLongCalls = OverrideLongCalls;
@@ -124,6 +125,76 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
return *this;
}
+bool HexagonSubtarget::isHVXElementType(MVT Ty, bool IncludeBool) const {
+ if (!useHVXOps())
+ return false;
+ if (Ty.isVector())
+ Ty = Ty.getVectorElementType();
+ if (IncludeBool && Ty == MVT::i1)
+ return true;
+ ArrayRef<MVT> ElemTypes = getHVXElementTypes();
+ return llvm::is_contained(ElemTypes, Ty);
+}
+
+bool HexagonSubtarget::isHVXVectorType(MVT VecTy, bool IncludeBool) const {
+ if (!VecTy.isVector() || !useHVXOps() || VecTy.isScalableVector())
+ return false;
+ MVT ElemTy = VecTy.getVectorElementType();
+ if (!IncludeBool && ElemTy == MVT::i1)
+ return false;
+
+ unsigned HwLen = getVectorLength();
+ unsigned NumElems = VecTy.getVectorNumElements();
+ ArrayRef<MVT> ElemTypes = getHVXElementTypes();
+
+ if (IncludeBool && ElemTy == MVT::i1) {
+ // Boolean HVX vector types are formed from regular HVX vector types
+ // by replacing the element type with i1.
+ for (MVT T : ElemTypes)
+ if (NumElems * T.getSizeInBits() == 8 * HwLen)
+ return true;
+ return false;
+ }
+
+ unsigned VecWidth = VecTy.getSizeInBits();
+ if (VecWidth != 8 * HwLen && VecWidth != 16 * HwLen)
+ return false;
+ return llvm::is_contained(ElemTypes, ElemTy);
+}
+
+bool HexagonSubtarget::isTypeForHVX(Type *VecTy, bool IncludeBool) const {
+ if (!VecTy->isVectorTy() || isa<ScalableVectorType>(VecTy))
+ return false;
+ // Avoid types like <2 x i32*>.
+ if (!cast<VectorType>(VecTy)->getElementType()->isIntegerTy())
+ return false;
+ // The given type may be something like <17 x i32>, which is not MVT,
+ // but can be represented as (non-simple) EVT.
+ EVT Ty = EVT::getEVT(VecTy, /*HandleUnknown*/false);
+ if (Ty.getSizeInBits() <= 64 || !Ty.getVectorElementType().isSimple())
+ return false;
+
+ auto isHvxTy = [this, IncludeBool](MVT SimpleTy) {
+ if (isHVXVectorType(SimpleTy, IncludeBool))
+ return true;
+ auto Action = getTargetLowering()->getPreferredVectorAction(SimpleTy);
+ return Action == TargetLoweringBase::TypeWidenVector;
+ };
+
+ // Round up EVT to have power-of-2 elements, and keep checking if it
+ // qualifies for HVX, dividing it in half after each step.
+ MVT ElemTy = Ty.getVectorElementType().getSimpleVT();
+ unsigned VecLen = PowerOf2Ceil(Ty.getVectorNumElements());
+ while (ElemTy.getSizeInBits() * VecLen > 64) {
+ MVT SimpleTy = MVT::getVectorVT(ElemTy, VecLen);
+ if (SimpleTy.isValid() && isHvxTy(SimpleTy))
+ return true;
+ VecLen /= 2;
+ }
+
+ return false;
+}
+
void HexagonSubtarget::UsrOverflowMutation::apply(ScheduleDAGInstrs *DAG) {
for (SUnit &SU : DAG->SUnits) {
if (!SU.isInstr())
@@ -420,14 +491,14 @@ void HexagonSubtarget::restoreLatency(SUnit *Src, SUnit *Dst) const {
for (auto &I : Src->Succs) {
if (!I.isAssignedRegDep() || I.getSUnit() != Dst)
continue;
- unsigned DepR = I.getReg();
+ Register DepR = I.getReg();
int DefIdx = -1;
for (unsigned OpNum = 0; OpNum < SrcI->getNumOperands(); OpNum++) {
const MachineOperand &MO = SrcI->getOperand(OpNum);
bool IsSameOrSubReg = false;
if (MO.isReg()) {
- unsigned MOReg = MO.getReg();
- if (Register::isVirtualRegister(DepR)) {
+ Register MOReg = MO.getReg();
+ if (DepR.isVirtual()) {
IsSameOrSubReg = (MOReg == DepR);
} else {
IsSameOrSubReg = getRegisterInfo()->isSubRegisterEq(DepR, MOReg);
@@ -456,7 +527,7 @@ void HexagonSubtarget::restoreLatency(SUnit *Src, SUnit *Dst) const {
// Update the latency of opposite edge too.
T.setSUnit(Src);
- auto F = std::find(Dst->Preds.begin(), Dst->Preds.end(), T);
+ auto F = find(Dst->Preds, T);
assert(F != Dst->Preds.end());
F->setLatency(I.getLatency());
}
@@ -473,7 +544,7 @@ void HexagonSubtarget::changeLatency(SUnit *Src, SUnit *Dst, unsigned Lat)
// Update the latency of opposite edge too.
T.setSUnit(Src);
- auto F = std::find(Dst->Preds.begin(), Dst->Preds.end(), T);
+ auto F = find(Dst->Preds, T);
assert(F != Dst->Preds.end());
F->setLatency(Lat);
}
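
The new isTypeForHVX above accepts non-simple vector types such as <17 x i32> by rounding the lane count up to a power of two and halving until the candidate either qualifies for HVX or falls to 64 bits or below. A standalone trace of just the size test, assuming a hypothetical 128-byte HVX configuration and ignoring the TypeWidenVector escape hatch in isHvxTy:

    #include <cstdio>

    static unsigned nextPow2(unsigned N) {
      unsigned P = 1;
      while (P < N) P <<= 1;
      return P;
    }

    int main() {
      const unsigned ElemBits = 32, NumElems = 17, HwLenBits = 128 * 8;
      // <17 x i32> rounds up to 32 lanes = 1024 bits, a full HVX register.
      for (unsigned Len = nextPow2(NumElems); ElemBits * Len > 64; Len /= 2) {
        unsigned Bits = ElemBits * Len;
        bool Qualifies = (Bits == HwLenBits || Bits == 2 * HwLenBits);
        std::printf("<%u x i32> = %u bits -> %s\n", Len, Bits,
                    Qualifies ? "HVX-sized" : "keep halving");
        if (Qualifies) break;
      }
      return 0;
    }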
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSubtarget.h
index de4f245519e4..7b7fb8d04f47 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSubtarget.h
@@ -135,7 +135,7 @@ public:
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
bool hasV5Ops() const {
return getHexagonArchVersion() >= Hexagon::ArchEnum::V5;
@@ -275,31 +275,9 @@ public:
return makeArrayRef(Types);
}
- bool isHVXVectorType(MVT VecTy, bool IncludeBool = false) const {
- if (!VecTy.isVector() || !useHVXOps() || VecTy.isScalableVector())
- return false;
- MVT ElemTy = VecTy.getVectorElementType();
- if (!IncludeBool && ElemTy == MVT::i1)
- return false;
-
- unsigned HwLen = getVectorLength();
- unsigned NumElems = VecTy.getVectorNumElements();
- ArrayRef<MVT> ElemTypes = getHVXElementTypes();
-
- if (IncludeBool && ElemTy == MVT::i1) {
- // Boolean HVX vector types are formed from regular HVX vector types
- // by replacing the element type with i1.
- for (MVT T : ElemTypes)
- if (NumElems * T.getSizeInBits() == 8*HwLen)
- return true;
- return false;
- }
-
- unsigned VecWidth = VecTy.getSizeInBits();
- if (VecWidth != 8*HwLen && VecWidth != 16*HwLen)
- return false;
- return llvm::any_of(ElemTypes, [ElemTy] (MVT T) { return ElemTy == T; });
- }
+ bool isHVXElementType(MVT Ty, bool IncludeBool = false) const;
+ bool isHVXVectorType(MVT VecTy, bool IncludeBool = false) const;
+ bool isTypeForHVX(Type *VecTy, bool IncludeBool = false) const;
unsigned getTypeAlignment(MVT Ty) const {
if (isHVXVectorType(Ty, true))
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 3fe42ea13f51..9195bb3dc725 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -13,14 +13,17 @@
#include "HexagonTargetMachine.h"
#include "Hexagon.h"
#include "HexagonISelLowering.h"
+#include "HexagonLoopIdiomRecognition.h"
#include "HexagonMachineScheduler.h"
#include "HexagonTargetObjectFile.h"
#include "HexagonTargetTransformInfo.h"
+#include "HexagonVectorLoopCarriedReuse.h"
#include "TargetInfo/HexagonTargetInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
+#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
@@ -97,10 +100,17 @@ static cl::opt<bool> EnableVectorPrint("enable-hexagon-vector-print",
static cl::opt<bool> EnableVExtractOpt("hexagon-opt-vextract", cl::Hidden,
cl::ZeroOrMore, cl::init(true), cl::desc("Enable vextract optimization"));
+static cl::opt<bool> EnableVectorCombine("hexagon-vector-combine", cl::Hidden,
+ cl::ZeroOrMore, cl::init(true), cl::desc("Enable HVX vector combining"));
+
static cl::opt<bool> EnableInitialCFGCleanup("hexagon-initial-cfg-cleanup",
cl::Hidden, cl::ZeroOrMore, cl::init(true),
cl::desc("Simplify the CFG after atomic expansion pass"));
+static cl::opt<bool> EnableInstSimplify("hexagon-instsimplify", cl::Hidden,
+ cl::ZeroOrMore, cl::init(true),
+ cl::desc("Enable instsimplify"));
+
/// HexagonTargetMachineModule - Note that this is used on hosts that
/// cannot link in a library unless there are references into the
/// library. In particular, it seems that it is not possible to get
@@ -132,16 +142,17 @@ namespace llvm {
void initializeHexagonExpandCondsetsPass(PassRegistry&);
void initializeHexagonGenMuxPass(PassRegistry&);
void initializeHexagonHardwareLoopsPass(PassRegistry&);
- void initializeHexagonLoopIdiomRecognizePass(PassRegistry&);
- void initializeHexagonVectorLoopCarriedReusePass(PassRegistry&);
+ void initializeHexagonLoopIdiomRecognizeLegacyPassPass(PassRegistry &);
void initializeHexagonNewValueJumpPass(PassRegistry&);
void initializeHexagonOptAddrModePass(PassRegistry&);
void initializeHexagonPacketizerPass(PassRegistry&);
void initializeHexagonRDFOptPass(PassRegistry&);
void initializeHexagonSplitDoubleRegsPass(PassRegistry&);
+ void initializeHexagonVectorCombineLegacyPass(PassRegistry&);
+ void initializeHexagonVectorLoopCarriedReuseLegacyPassPass(PassRegistry &);
void initializeHexagonVExtractPass(PassRegistry&);
Pass *createHexagonLoopIdiomPass();
- Pass *createHexagonVectorLoopCarriedReusePass();
+ Pass *createHexagonVectorLoopCarriedReuseLegacyPass();
FunctionPass *createHexagonBitSimplify();
FunctionPass *createHexagonBranchRelaxation();
@@ -162,22 +173,21 @@ namespace llvm {
CodeGenOpt::Level OptLevel);
FunctionPass *createHexagonLoopRescheduling();
FunctionPass *createHexagonNewValueJump();
- FunctionPass *createHexagonOptimizeSZextends();
FunctionPass *createHexagonOptAddrMode();
+ FunctionPass *createHexagonOptimizeSZextends();
FunctionPass *createHexagonPacketizer(bool Minimal);
FunctionPass *createHexagonPeephole();
FunctionPass *createHexagonRDFOpt();
FunctionPass *createHexagonSplitConst32AndConst64();
FunctionPass *createHexagonSplitDoubleRegs();
FunctionPass *createHexagonStoreWidening();
+ FunctionPass *createHexagonVectorCombineLegacyPass();
FunctionPass *createHexagonVectorPrint();
FunctionPass *createHexagonVExtract();
} // end namespace llvm;
static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
- if (!RM.hasValue())
- return Reloc::Static;
- return *RM;
+ return RM.getValueOr(Reloc::Static);
}
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTarget() {
@@ -191,13 +201,14 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTarget() {
initializeHexagonEarlyIfConversionPass(PR);
initializeHexagonGenMuxPass(PR);
initializeHexagonHardwareLoopsPass(PR);
- initializeHexagonLoopIdiomRecognizePass(PR);
- initializeHexagonVectorLoopCarriedReusePass(PR);
+ initializeHexagonLoopIdiomRecognizeLegacyPassPass(PR);
initializeHexagonNewValueJumpPass(PR);
initializeHexagonOptAddrModePass(PR);
initializeHexagonPacketizerPass(PR);
initializeHexagonRDFOptPass(PR);
initializeHexagonSplitDoubleRegsPass(PR);
+ initializeHexagonVectorCombineLegacyPass(PR);
+ initializeHexagonVectorLoopCarriedReuseLegacyPassPass(PR);
initializeHexagonVExtractPass(PR);
}
@@ -231,12 +242,10 @@ HexagonTargetMachine::getSubtargetImpl(const Function &F) const {
Attribute FSAttr =
FnAttrs.getAttribute(AttributeList::FunctionIndex, "target-features");
- std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
- ? CPUAttr.getValueAsString().str()
- : TargetCPU;
- std::string FS = !FSAttr.hasAttribute(Attribute::None)
- ? FSAttr.getValueAsString().str()
- : TargetFS;
+ std::string CPU =
+ CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU;
+ std::string FS =
+ FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS;
// Append the preexisting target features last, so that +mattr overrides
// the "unsafe-fp-math" function attribute.
// Creating a separate target feature is not strictly necessary, it only
@@ -264,10 +273,22 @@ void HexagonTargetMachine::adjustPassManager(PassManagerBuilder &PMB) {
PM.add(createHexagonLoopIdiomPass());
});
PMB.addExtension(
- PassManagerBuilder::EP_LoopOptimizerEnd,
- [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
- PM.add(createHexagonVectorLoopCarriedReusePass());
- });
+ PassManagerBuilder::EP_LoopOptimizerEnd,
+ [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ PM.add(createHexagonVectorLoopCarriedReuseLegacyPass());
+ });
+}
+
+void HexagonTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
+ bool DebugPassManager) {
+ PB.registerLateLoopOptimizationsEPCallback(
+ [=](LoopPassManager &LPM, PassBuilder::OptimizationLevel Level) {
+ LPM.addPass(HexagonLoopIdiomRecognitionPass());
+ });
+ PB.registerLoopOptimizerEndEPCallback(
+ [=](LoopPassManager &LPM, PassBuilder::OptimizationLevel Level) {
+ LPM.addPass(HexagonVectorLoopCarriedReusePass());
+ });
}
TargetTransformInfo
@@ -312,7 +333,8 @@ void HexagonPassConfig::addIRPasses() {
bool NoOpt = (getOptLevel() == CodeGenOpt::None);
if (!NoOpt) {
- addPass(createConstantPropagationPass());
+ if (EnableInstSimplify)
+ addPass(createInstSimplifyLegacyPass());
addPass(createDeadCodeEliminationPass());
}
@@ -320,9 +342,16 @@ void HexagonPassConfig::addIRPasses() {
if (!NoOpt) {
if (EnableInitialCFGCleanup)
- addPass(createCFGSimplificationPass(1, true, true, false, true));
+ addPass(createCFGSimplificationPass(SimplifyCFGOptions()
+ .forwardSwitchCondToPhi(true)
+ .convertSwitchToLookupTable(true)
+ .needCanonicalLoops(false)
+ .hoistCommonInsts(true)
+ .sinkCommonInsts(true)));
if (EnableLoopPrefetch)
addPass(createLoopDataPrefetchPass());
+ if (EnableVectorCombine)
+ addPass(createHexagonVectorCombineLegacyPass());
if (EnableCommGEP)
addPass(createHexagonCommonGEP());
// Replace certain combinations of shifts and ands with extracts.
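
registerPassBuilderCallbacks is the new-PM counterpart of adjustPassManager: whoever builds the optimization pipeline hands its PassBuilder to the target, and the two callbacks added above splice the Hexagon loop passes into every default pipeline. A sketch of that hand-off; it assumes TM points at a constructed HexagonTargetMachine, and the pipeline-builder calls follow the LLVM 12-era API in this import and should be treated as assumptions:

    #include "llvm/IR/Module.h"
    #include "llvm/Passes/PassBuilder.h"
    using namespace llvm;

    // Run a default O2 pipeline that picks up the target's pass-builder callbacks.
    void runO2(Module &M, TargetMachine *TM) {
      LoopAnalysisManager LAM;
      FunctionAnalysisManager FAM;
      CGSCCAnalysisManager CGAM;
      ModuleAnalysisManager MAM;
      PassBuilder PB; // a real driver would also pass TM to the constructor
      TM->registerPassBuilderCallbacks(PB, /*DebugPassManager=*/false);
      PB.registerModuleAnalyses(MAM);
      PB.registerCGSCCAnalyses(CGAM);
      PB.registerFunctionAnalyses(FAM);
      PB.registerLoopAnalyses(LAM);
      PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
      ModulePassManager MPM =
          PB.buildPerModuleDefaultPipeline(PassBuilder::OptimizationLevel::O2);
      MPM.run(M, MAM);
    }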
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
index 7ee4474e90e3..fa174128f708 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -37,6 +37,8 @@ public:
static unsigned getModuleMatchQuality(const Module &M);
void adjustPassManager(PassManagerBuilder &PMB) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB,
+ bool DebugPassManager) override;
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
TargetTransformInfo getTargetTransformInfo(const Function &F) override;
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index cfc8ed813c92..595cf94e3f1d 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -331,6 +331,7 @@ unsigned HexagonTargetObjectFile::getSmallestAddressableSize(const Type *Ty,
case Type::LabelTyID:
case Type::MetadataTyID:
case Type::X86_MMXTyID:
+ case Type::X86_AMXTyID:
case Type::TokenTyID:
return 0;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 80c8736cb74a..1cefa6a04640 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -21,6 +21,7 @@
#include "llvm/IR/User.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/LoopPeel.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
using namespace llvm;
@@ -34,6 +35,9 @@ static cl::opt<bool> EmitLookupTables("hexagon-emit-lookup-tables",
cl::init(true), cl::Hidden,
cl::desc("Control lookup table emission on Hexagon target"));
+static cl::opt<bool> HexagonMaskedVMem("hexagon-masked-vmem", cl::init(true),
+ cl::Hidden, cl::desc("Enable masked loads/stores for HVX"));
+
// Constant "cost factor" to make floating point operations more expensive
// in terms of vectorization cost. This isn't the best way, but it should
// do. Ultimately, the cost should use cycles.
@@ -43,22 +47,6 @@ bool HexagonTTIImpl::useHVX() const {
return ST.useHVXOps() && HexagonAutoHVX;
}
-bool HexagonTTIImpl::isTypeForHVX(Type *VecTy) const {
- assert(VecTy->isVectorTy());
- if (isa<ScalableVectorType>(VecTy))
- return false;
- // Avoid types like <2 x i32*>.
- if (!cast<VectorType>(VecTy)->getElementType()->isIntegerTy())
- return false;
- EVT VecVT = EVT::getEVT(VecTy);
- if (!VecVT.isSimple() || VecVT.getSizeInBits() <= 64)
- return false;
- if (ST.isHVXVectorType(VecVT.getSimpleVT()))
- return true;
- auto Action = TLI.getPreferredVectorAction(VecVT.getSimpleVT());
- return Action == TargetLoweringBase::TypeWidenVector;
-}
-
unsigned HexagonTTIImpl::getTypeNumElements(Type *Ty) const {
if (auto *VTy = dyn_cast<FixedVectorType>(Ty))
return VTy->getNumElements();
@@ -84,7 +72,7 @@ void HexagonTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP) {
BaseT::getPeelingPreferences(L, SE, PP);
// Only try to peel innermost loops with small runtime trip counts.
- if (L && L->empty() && canPeel(L) &&
+ if (L && L->isInnermost() && canPeel(L) &&
SE.getSmallConstantTripCount(L) == 0 &&
SE.getSmallConstantMaxTripCount(L) > 0 &&
SE.getSmallConstantMaxTripCount(L) <= 5) {
@@ -105,7 +93,7 @@ unsigned HexagonTTIImpl::getNumberOfRegisters(bool Vector) const {
}
unsigned HexagonTTIImpl::getMaxInterleaveFactor(unsigned VF) {
- return useHVX() ? 2 : 0;
+ return useHVX() ? 2 : 1;
}
unsigned HexagonTTIImpl::getRegisterBitWidth(bool Vector) const {
@@ -113,7 +101,7 @@ unsigned HexagonTTIImpl::getRegisterBitWidth(bool Vector) const {
}
unsigned HexagonTTIImpl::getMinVectorRegisterBitWidth() const {
- return useHVX() ? ST.getVectorLength()*8 : 0;
+ return useHVX() ? ST.getVectorLength()*8 : 32;
}
unsigned HexagonTTIImpl::getMinimumVF(unsigned ElemWidth) const {
@@ -168,7 +156,7 @@ unsigned HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
if (Src->isVectorTy()) {
VectorType *VecTy = cast<VectorType>(Src);
unsigned VecWidth = VecTy->getPrimitiveSizeInBits().getFixedSize();
- if (useHVX() && isTypeForHVX(VecTy)) {
+ if (useHVX() && ST.isTypeForHVX(VecTy)) {
unsigned RegWidth = getRegisterBitWidth(true);
assert(RegWidth && "Non-zero vector register width expected");
// Cost of HVX loads.
@@ -239,13 +227,16 @@ unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(
}
unsigned HexagonTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy, TTI::TargetCostKind CostKind, const Instruction *I) {
+ Type *CondTy,
+ CmpInst::Predicate VecPred,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
if (ValTy->isVectorTy() && CostKind == TTI::TCK_RecipThroughput) {
std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, ValTy);
if (Opcode == Instruction::FCmp)
return LT.first + FloatFactor * getTypeNumElements(ValTy);
}
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}
unsigned HexagonTTIImpl::getArithmeticInstrCost(
@@ -270,7 +261,9 @@ unsigned HexagonTTIImpl::getArithmeticInstrCost(
}
unsigned HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy,
- Type *SrcTy, TTI::TargetCostKind CostKind, const Instruction *I) {
+ Type *SrcTy, TTI::CastContextHint CCH,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
if (SrcTy->isFPOrFPVectorTy() || DstTy->isFPOrFPVectorTy()) {
unsigned SrcN = SrcTy->isFPOrFPVectorTy() ? getTypeNumElements(SrcTy) : 0;
unsigned DstN = DstTy->isFPOrFPVectorTy() ? getTypeNumElements(DstTy) : 0;
@@ -305,6 +298,14 @@ unsigned HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
return 1;
}
+bool HexagonTTIImpl::isLegalMaskedStore(Type *DataType, Align /*Alignment*/) {
+ return HexagonMaskedVMem && ST.isTypeForHVX(DataType);
+}
+
+bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/) {
+ return HexagonMaskedVMem && ST.isTypeForHVX(DataType);
+}
+
/// --- Vector TTI end ---
unsigned HexagonTTIImpl::getPrefetchDistance() const {
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index 5fe397486402..835358d3fed0 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -43,7 +43,6 @@ class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> {
const HexagonTargetLowering *getTLI() const { return &TLI; }
bool useHVX() const;
- bool isTypeForHVX(Type *VecTy) const;
// Returns the number of vector elements of Ty, if Ty is a vector type,
// or 1 if Ty is a scalar type. It is incorrect to call this function
@@ -134,6 +133,8 @@ public:
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+
+ CmpInst::Predicate VecPred,
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
unsigned getArithmeticInstrCost(
@@ -146,14 +147,18 @@ public:
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
- TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr);
+ TTI::CastContextHint CCH,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
return 1;
}
+ bool isLegalMaskedStore(Type *DataType, Align Alignment);
+ bool isLegalMaskedLoad(Type *DataType, Align Alignment);
+
/// @}
int getUserCost(const User *U, ArrayRef<const Value *> Operands,
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
new file mode 100644
index 000000000000..a605fdfcf100
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
@@ -0,0 +1,1487 @@
+//===-- HexagonVectorCombine.cpp ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// HexagonVectorCombine is a utility class implementing a variety of functions
+// that assist in vector-based optimizations.
+//
+// AlignVectors: replace unaligned vector loads and stores with aligned ones.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsHexagon.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+
+#include <algorithm>
+#include <deque>
+#include <map>
+#include <set>
+#include <utility>
+#include <vector>
+
+#define DEBUG_TYPE "hexagon-vc"
+
+using namespace llvm;
+
+namespace {
+class HexagonVectorCombine {
+public:
+ HexagonVectorCombine(Function &F_, AliasAnalysis &AA_, AssumptionCache &AC_,
+ DominatorTree &DT_, TargetLibraryInfo &TLI_,
+ const TargetMachine &TM_)
+ : F(F_), DL(F.getParent()->getDataLayout()), AA(AA_), AC(AC_), DT(DT_),
+ TLI(TLI_),
+ HST(static_cast<const HexagonSubtarget &>(*TM_.getSubtargetImpl(F))) {}
+
+ bool run();
+
+ // Common integer type.
+ IntegerType *getIntTy() const;
+ // Byte type: either scalar (when ElemCount = 0), or vector with given
+ // element count.
+ Type *getByteTy(int ElemCount = 0) const;
+ // Boolean type: either scalar (when ElemCount = 0), or vector with given
+ // element count.
+ Type *getBoolTy(int ElemCount = 0) const;
+ // Create a ConstantInt of type returned by getIntTy with the value Val.
+ ConstantInt *getConstInt(int Val) const;
+ // Get the integer value of V, if it exists.
+ Optional<APInt> getIntValue(const Value *Val) const;
+ // Is V a constant 0, or a vector of 0s?
+ bool isZero(const Value *Val) const;
+ // Is V an undef value?
+ bool isUndef(const Value *Val) const;
+
+ int getSizeOf(const Value *Val) const;
+ int getSizeOf(const Type *Ty) const;
+ int getTypeAlignment(Type *Ty) const;
+
+ VectorType *getByteVectorTy(int ScLen) const;
+ Constant *getNullValue(Type *Ty) const;
+ Constant *getFullValue(Type *Ty) const;
+
+ Value *insertb(IRBuilder<> &Builder, Value *Dest, Value *Src, int Start,
+ int Length, int Where) const;
+ Value *vlalignb(IRBuilder<> &Builder, Value *Lo, Value *Hi, Value *Amt) const;
+ Value *vralignb(IRBuilder<> &Builder, Value *Lo, Value *Hi, Value *Amt) const;
+ Value *concat(IRBuilder<> &Builder, ArrayRef<Value *> Vecs) const;
+ Value *vresize(IRBuilder<> &Builder, Value *Val, int NewSize,
+ Value *Pad) const;
+ Value *rescale(IRBuilder<> &Builder, Value *Mask, Type *FromTy,
+ Type *ToTy) const;
+ Value *vlsb(IRBuilder<> &Builder, Value *Val) const;
+ Value *vbytes(IRBuilder<> &Builder, Value *Val) const;
+
+ Value *createHvxIntrinsic(IRBuilder<> &Builder, Intrinsic::ID IntID,
+ Type *RetTy, ArrayRef<Value *> Args) const;
+
+ Optional<int> calculatePointerDifference(Value *Ptr0, Value *Ptr1) const;
+
+ template <typename T = std::vector<Instruction *>>
+ bool isSafeToMoveBeforeInBB(const Instruction &In,
+ BasicBlock::const_iterator To,
+ const T &Ignore = {}) const;
+
+ Function &F;
+ const DataLayout &DL;
+ AliasAnalysis &AA;
+ AssumptionCache &AC;
+ DominatorTree &DT;
+ TargetLibraryInfo &TLI;
+ const HexagonSubtarget &HST;
+
+private:
+#ifndef NDEBUG
+ // These two functions are only used for assertions at the moment.
+ bool isByteVecTy(Type *Ty) const;
+ bool isSectorTy(Type *Ty) const;
+#endif
+ Value *getElementRange(IRBuilder<> &Builder, Value *Lo, Value *Hi, int Start,
+ int Length) const;
+};
+
+class AlignVectors {
+public:
+ AlignVectors(HexagonVectorCombine &HVC_) : HVC(HVC_) {}
+
+ bool run();
+
+private:
+ using InstList = std::vector<Instruction *>;
+
+ struct Segment {
+ void *Data;
+ int Start;
+ int Size;
+ };
+
+ struct AddrInfo {
+ AddrInfo(const AddrInfo &) = default;
+ AddrInfo(const HexagonVectorCombine &HVC, Instruction *I, Value *A, Type *T,
+ Align H)
+ : Inst(I), Addr(A), ValTy(T), HaveAlign(H),
+ NeedAlign(HVC.getTypeAlignment(ValTy)) {}
+
+ // XXX: add Size member?
+ Instruction *Inst;
+ Value *Addr;
+ Type *ValTy;
+ Align HaveAlign;
+ Align NeedAlign;
+ int Offset = 0; // Offset (in bytes) from the first member of the
+ // containing AddrList.
+ };
+ using AddrList = std::vector<AddrInfo>;
+
+ struct InstrLess {
+ bool operator()(const Instruction *A, const Instruction *B) const {
+ return A->comesBefore(B);
+ }
+ };
+ using DepList = std::set<Instruction *, InstrLess>;
+
+ struct MoveGroup {
+ MoveGroup(const AddrInfo &AI, Instruction *B, bool Hvx, bool Load)
+ : Base(B), Main{AI.Inst}, IsHvx(Hvx), IsLoad(Load) {}
+ Instruction *Base; // Base instruction of the parent address group.
+ InstList Main; // Main group of instructions.
+ InstList Deps; // List of dependencies.
+ bool IsHvx; // Is this a group of HVX instructions?
+ bool IsLoad; // Is this a load group?
+ };
+ using MoveList = std::vector<MoveGroup>;
+
+ struct ByteSpan {
+ struct Segment {
+ Segment(Value *Val, int Begin, int Len)
+ : Val(Val), Start(Begin), Size(Len) {}
+ Segment(const Segment &Seg) = default;
+ Value *Val;
+ int Start;
+ int Size;
+ };
+
+ struct Block {
+ Block(Value *Val, int Len, int Pos) : Seg(Val, 0, Len), Pos(Pos) {}
+ Block(Value *Val, int Off, int Len, int Pos)
+ : Seg(Val, Off, Len), Pos(Pos) {}
+ Block(const Block &Blk) = default;
+ Segment Seg;
+ int Pos;
+ };
+
+ int extent() const;
+ ByteSpan section(int Start, int Length) const;
+ ByteSpan &shift(int Offset);
+
+ int size() const { return Blocks.size(); }
+ Block &operator[](int i) { return Blocks[i]; }
+
+ std::vector<Block> Blocks;
+
+ using iterator = decltype(Blocks)::iterator;
+ iterator begin() { return Blocks.begin(); }
+ iterator end() { return Blocks.end(); }
+ using const_iterator = decltype(Blocks)::const_iterator;
+ const_iterator begin() const { return Blocks.begin(); }
+ const_iterator end() const { return Blocks.end(); }
+ };
+
+ Align getAlignFromValue(const Value *V) const;
+ Optional<MemoryLocation> getLocation(const Instruction &In) const;
+ Optional<AddrInfo> getAddrInfo(Instruction &In) const;
+ bool isHvx(const AddrInfo &AI) const;
+
+ Value *getPayload(Value *Val) const;
+ Value *getMask(Value *Val) const;
+ Value *getPassThrough(Value *Val) const;
+
+ Value *createAdjustedPointer(IRBuilder<> &Builder, Value *Ptr, Type *ValTy,
+ int Adjust) const;
+ Value *createAlignedPointer(IRBuilder<> &Builder, Value *Ptr, Type *ValTy,
+ int Alignment) const;
+ Value *createAlignedLoad(IRBuilder<> &Builder, Type *ValTy, Value *Ptr,
+ int Alignment, Value *Mask, Value *PassThru) const;
+ Value *createAlignedStore(IRBuilder<> &Builder, Value *Val, Value *Ptr,
+ int Alignment, Value *Mask) const;
+
+ bool createAddressGroups();
+ MoveList createLoadGroups(const AddrList &Group) const;
+ MoveList createStoreGroups(const AddrList &Group) const;
+ bool move(const MoveGroup &Move) const;
+ bool realignGroup(const MoveGroup &Move) const;
+
+ friend raw_ostream &operator<<(raw_ostream &OS, const AddrInfo &AI);
+ friend raw_ostream &operator<<(raw_ostream &OS, const MoveGroup &MG);
+ friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan &BS);
+
+ std::map<Instruction *, AddrList> AddrGroups;
+ HexagonVectorCombine &HVC;
+};
+
+LLVM_ATTRIBUTE_UNUSED
+raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::AddrInfo &AI) {
+ OS << "Inst: " << AI.Inst << " " << *AI.Inst << '\n';
+ OS << "Addr: " << *AI.Addr << '\n';
+ OS << "Type: " << *AI.ValTy << '\n';
+ OS << "HaveAlign: " << AI.HaveAlign.value() << '\n';
+ OS << "NeedAlign: " << AI.NeedAlign.value() << '\n';
+ OS << "Offset: " << AI.Offset;
+ return OS;
+}
+
+LLVM_ATTRIBUTE_UNUSED
+raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::MoveGroup &MG) {
+ OS << "Main\n";
+ for (Instruction *I : MG.Main)
+ OS << " " << *I << '\n';
+ OS << "Deps\n";
+ for (Instruction *I : MG.Deps)
+ OS << " " << *I << '\n';
+ return OS;
+}
+
+LLVM_ATTRIBUTE_UNUSED
+raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::ByteSpan &BS) {
+ OS << "ByteSpan[size=" << BS.size() << ", extent=" << BS.extent() << '\n';
+ for (const AlignVectors::ByteSpan::Block &B : BS) {
+ OS << " @" << B.Pos << " [" << B.Seg.Start << ',' << B.Seg.Size << "] "
+ << *B.Seg.Val << '\n';
+ }
+ OS << ']';
+ return OS;
+}
+
+} // namespace
+
+namespace {
+
+template <typename T> T *getIfUnordered(T *MaybeT) {
+ return MaybeT && MaybeT->isUnordered() ? MaybeT : nullptr;
+}
+template <typename T> T *isCandidate(Instruction *In) {
+ return dyn_cast<T>(In);
+}
+template <> LoadInst *isCandidate<LoadInst>(Instruction *In) {
+ return getIfUnordered(dyn_cast<LoadInst>(In));
+}
+template <> StoreInst *isCandidate<StoreInst>(Instruction *In) {
+ return getIfUnordered(dyn_cast<StoreInst>(In));
+}
+
+#if !defined(_MSC_VER) || _MSC_VER >= 1924
+// VS2017 has trouble compiling this:
+// error C2976: 'std::map': too few template arguments
+template <typename Pred, typename... Ts>
+void erase_if(std::map<Ts...> &map, Pred p)
+#else
+template <typename Pred, typename T, typename U>
+void erase_if(std::map<T, U> &map, Pred p)
+#endif
+{
+ for (auto i = map.begin(), e = map.end(); i != e;) {
+ if (p(*i))
+ i = map.erase(i);
+ else
+ i = std::next(i);
+ }
+}
+
+// Forward other erase_ifs to the LLVM implementations.
+template <typename Pred, typename T> void erase_if(T &&container, Pred p) {
+ llvm::erase_if(std::forward<T>(container), p);
+}
+
+} // namespace
+
+// --- Begin AlignVectors
+
+auto AlignVectors::ByteSpan::extent() const -> int {
+ if (size() == 0)
+ return 0;
+ int Min = Blocks[0].Pos;
+ int Max = Blocks[0].Pos + Blocks[0].Seg.Size;
+ for (int i = 1, e = size(); i != e; ++i) {
+ Min = std::min(Min, Blocks[i].Pos);
+ Max = std::max(Max, Blocks[i].Pos + Blocks[i].Seg.Size);
+ }
+ return Max - Min;
+}
+
+auto AlignVectors::ByteSpan::section(int Start, int Length) const -> ByteSpan {
+ ByteSpan Section;
+ for (const ByteSpan::Block &B : Blocks) {
+ int L = std::max(B.Pos, Start); // Left end.
+ int R = std::min(B.Pos + B.Seg.Size, Start + Length); // Right end+1.
+ if (L < R) {
+ // How much to chop off the beginning of the segment:
+ int Off = L > B.Pos ? L - B.Pos : 0;
+ Section.Blocks.emplace_back(B.Seg.Val, B.Seg.Start + Off, R - L, L);
+ }
+ }
+ return Section;
+}
+
+auto AlignVectors::ByteSpan::shift(int Offset) -> ByteSpan & {
+ for (Block &B : Blocks)
+ B.Pos += Offset;
+ return *this;
+}
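+
+// Illustrative example (editorial sketch, not part of the original source):
+// for a ByteSpan with blocks {Val=A, Pos=0, Size=8} and {Val=B, Pos=12,
+// Size=4}, extent() returns 16. section(4, 10) keeps the parts overlapping
+// byte positions [4,14): block A contributes (A, Start=4, Size=4, Pos=4)
+// and block B contributes (B, Start=0, Size=2, Pos=12). A subsequent
+// shift(-4) rebases those positions to 0 and 8.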
+
+auto AlignVectors::getAlignFromValue(const Value *V) const -> Align {
+ const auto *C = dyn_cast<ConstantInt>(V);
+ assert(C && "Alignment must be a compile-time constant integer");
+ return C->getAlignValue();
+}
+
+auto AlignVectors::getAddrInfo(Instruction &In) const -> Optional<AddrInfo> {
+ if (auto *L = isCandidate<LoadInst>(&In))
+ return AddrInfo(HVC, L, L->getPointerOperand(), L->getType(),
+ L->getAlign());
+ if (auto *S = isCandidate<StoreInst>(&In))
+ return AddrInfo(HVC, S, S->getPointerOperand(),
+ S->getValueOperand()->getType(), S->getAlign());
+ if (auto *II = isCandidate<IntrinsicInst>(&In)) {
+ Intrinsic::ID ID = II->getIntrinsicID();
+ switch (ID) {
+ case Intrinsic::masked_load:
+ return AddrInfo(HVC, II, II->getArgOperand(0), II->getType(),
+ getAlignFromValue(II->getArgOperand(1)));
+ case Intrinsic::masked_store:
+ return AddrInfo(HVC, II, II->getArgOperand(1),
+ II->getArgOperand(0)->getType(),
+ getAlignFromValue(II->getArgOperand(2)));
+ }
+ }
+ return Optional<AddrInfo>();
+}
+
+auto AlignVectors::isHvx(const AddrInfo &AI) const -> bool {
+ return HVC.HST.isTypeForHVX(AI.ValTy);
+}
+
+auto AlignVectors::getPayload(Value *Val) const -> Value * {
+ if (auto *In = dyn_cast<Instruction>(Val)) {
+ Intrinsic::ID ID = 0;
+ if (auto *II = dyn_cast<IntrinsicInst>(In))
+ ID = II->getIntrinsicID();
+ if (isa<StoreInst>(In) || ID == Intrinsic::masked_store)
+ return In->getOperand(0);
+ }
+ return Val;
+}
+
+auto AlignVectors::getMask(Value *Val) const -> Value * {
+ if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::masked_load:
+ return II->getArgOperand(2);
+ case Intrinsic::masked_store:
+ return II->getArgOperand(3);
+ }
+ }
+
+ Type *ValTy = getPayload(Val)->getType();
+ if (auto *VecTy = dyn_cast<VectorType>(ValTy)) {
+ int ElemCount = VecTy->getElementCount().getFixedValue();
+ return HVC.getFullValue(HVC.getBoolTy(ElemCount));
+ }
+ return HVC.getFullValue(HVC.getBoolTy());
+}
+
+auto AlignVectors::getPassThrough(Value *Val) const -> Value * {
+ if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
+ if (II->getIntrinsicID() == Intrinsic::masked_load)
+ return II->getArgOperand(3);
+ }
+ return UndefValue::get(getPayload(Val)->getType());
+}
+
+auto AlignVectors::createAdjustedPointer(IRBuilder<> &Builder, Value *Ptr,
+ Type *ValTy, int Adjust) const
+ -> Value * {
+ // The adjustment is in bytes, but if it's a multiple of the type size,
+ // we don't need to do pointer casts.
+ Type *ElemTy = cast<PointerType>(Ptr->getType())->getElementType();
+ int ElemSize = HVC.getSizeOf(ElemTy);
+ if (Adjust % ElemSize == 0) {
+ Value *Tmp0 = Builder.CreateGEP(Ptr, HVC.getConstInt(Adjust / ElemSize));
+ return Builder.CreatePointerCast(Tmp0, ValTy->getPointerTo());
+ }
+
+ PointerType *CharPtrTy = Type::getInt8PtrTy(HVC.F.getContext());
+ Value *Tmp0 = Builder.CreatePointerCast(Ptr, CharPtrTy);
+ Value *Tmp1 = Builder.CreateGEP(Tmp0, HVC.getConstInt(Adjust));
+ return Builder.CreatePointerCast(Tmp1, ValTy->getPointerTo());
+}
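+
+// Illustrative example (editorial sketch, not part of the original source):
+// for a pointer of type <16 x i32>* (element store size 64 bytes) and
+// Adjust = 64, the fast path above emits a single gep by one element
+// followed by a pointer cast to ValTy*. For Adjust = 4 the byte-wise path
+// is taken instead: bitcast to i8*, gep by 4 bytes, then cast back to
+// ValTy*.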
+
+auto AlignVectors::createAlignedPointer(IRBuilder<> &Builder, Value *Ptr,
+ Type *ValTy, int Alignment) const
+ -> Value * {
+ Value *AsInt = Builder.CreatePtrToInt(Ptr, HVC.getIntTy());
+ Value *Mask = HVC.getConstInt(-Alignment);
+ Value *And = Builder.CreateAnd(AsInt, Mask);
+ return Builder.CreateIntToPtr(And, ValTy->getPointerTo());
+}
+
+auto AlignVectors::createAlignedLoad(IRBuilder<> &Builder, Type *ValTy,
+ Value *Ptr, int Alignment, Value *Mask,
+ Value *PassThru) const -> Value * {
+ assert(!HVC.isUndef(Mask)); // Should this be allowed?
+ if (HVC.isZero(Mask))
+ return PassThru;
+ if (Mask == ConstantInt::getTrue(Mask->getType()))
+ return Builder.CreateAlignedLoad(ValTy, Ptr, Align(Alignment));
+ return Builder.CreateMaskedLoad(Ptr, Align(Alignment), Mask, PassThru);
+}
+
+auto AlignVectors::createAlignedStore(IRBuilder<> &Builder, Value *Val,
+ Value *Ptr, int Alignment,
+ Value *Mask) const -> Value * {
+ if (HVC.isZero(Mask) || HVC.isUndef(Val) || HVC.isUndef(Mask))
+ return UndefValue::get(Val->getType());
+ if (Mask == ConstantInt::getTrue(Mask->getType()))
+ return Builder.CreateAlignedStore(Val, Ptr, Align(Alignment));
+ return Builder.CreateMaskedStore(Val, Ptr, Align(Alignment), Mask);
+}
+
+auto AlignVectors::createAddressGroups() -> bool {
+ // An address group created here may contain instructions spanning
+ // multiple basic blocks.
+ AddrList WorkStack;
+
+ auto findBaseAndOffset = [&](AddrInfo &AI) -> std::pair<Instruction *, int> {
+ for (AddrInfo &W : WorkStack) {
+ if (auto D = HVC.calculatePointerDifference(AI.Addr, W.Addr))
+ return std::make_pair(W.Inst, *D);
+ }
+ return std::make_pair(nullptr, 0);
+ };
+
+ auto traverseBlock = [&](DomTreeNode *DomN, auto Visit) -> void {
+ BasicBlock &Block = *DomN->getBlock();
+ for (Instruction &I : Block) {
+ auto AI = this->getAddrInfo(I); // Use this-> for gcc6.
+ if (!AI)
+ continue;
+ auto F = findBaseAndOffset(*AI);
+ Instruction *GroupInst;
+ if (Instruction *BI = F.first) {
+ AI->Offset = F.second;
+ GroupInst = BI;
+ } else {
+ WorkStack.push_back(*AI);
+ GroupInst = AI->Inst;
+ }
+ AddrGroups[GroupInst].push_back(*AI);
+ }
+
+ for (DomTreeNode *C : DomN->children())
+ Visit(C, Visit);
+
+ while (!WorkStack.empty() && WorkStack.back().Inst->getParent() == &Block)
+ WorkStack.pop_back();
+ };
+
+ traverseBlock(HVC.DT.getRootNode(), traverseBlock);
+ assert(WorkStack.empty());
+
+ // AddrGroups are formed.
+
+ // Remove groups of size 1.
+ erase_if(AddrGroups, [](auto &G) { return G.second.size() == 1; });
+ // Remove groups that don't use HVX types.
+ erase_if(AddrGroups, [&](auto &G) {
+ return !llvm::any_of(
+ G.second, [&](auto &I) { return HVC.HST.isTypeForHVX(I.ValTy); });
+ });
+ // Remove groups where everything is properly aligned.
+ erase_if(AddrGroups, [&](auto &G) {
+ return llvm::all_of(G.second,
+ [&](auto &I) { return I.HaveAlign >= I.NeedAlign; });
+ });
+
+ return !AddrGroups.empty();
+}
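+
+// Illustrative example (editorial sketch, not part of the original source):
+// three loads from p, p+64 and p+128 (byte offsets from the same base) end
+// up in one address group keyed by the first load, with Offsets 0, 64 and
+// 128. A group survives the filtering above only if it has more than one
+// member, at least one member has an HVX type, and at least one member is
+// under-aligned (HaveAlign < NeedAlign).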
+
+auto AlignVectors::createLoadGroups(const AddrList &Group) const -> MoveList {
+ // Form load groups.
+ // To avoid complications with moving code across basic blocks, only form
+ // groups that are contained within a single basic block.
+
+ auto getUpwardDeps = [](Instruction *In, Instruction *Base) {
+ BasicBlock *Parent = Base->getParent();
+ assert(In->getParent() == Parent &&
+ "Base and In should be in the same block");
+ assert(Base->comesBefore(In) && "Base should come before In");
+
+ DepList Deps;
+ std::deque<Instruction *> WorkQ = {In};
+ while (!WorkQ.empty()) {
+ Instruction *D = WorkQ.front();
+ WorkQ.pop_front();
+ Deps.insert(D);
+ for (Value *Op : D->operands()) {
+ if (auto *I = dyn_cast<Instruction>(Op)) {
+ if (I->getParent() == Parent && Base->comesBefore(I))
+ WorkQ.push_back(I);
+ }
+ }
+ }
+ return Deps;
+ };
+
+ auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
+ assert(!Move.Main.empty() && "Move group should have non-empty Main");
+ // Don't mix HVX and non-HVX instructions.
+ if (Move.IsHvx != isHvx(Info))
+ return false;
+ // Leading instruction in the load group.
+ Instruction *Base = Move.Main.front();
+ if (Base->getParent() != Info.Inst->getParent())
+ return false;
+
+ auto isSafeToMoveToBase = [&](const Instruction *I) {
+ return HVC.isSafeToMoveBeforeInBB(*I, Base->getIterator());
+ };
+ DepList Deps = getUpwardDeps(Info.Inst, Base);
+ if (!llvm::all_of(Deps, isSafeToMoveToBase))
+ return false;
+
+ // The dependencies will be moved together with the load, so make sure
+ // that none of them could be moved independently in another group.
+ Deps.erase(Info.Inst);
+ auto inAddrMap = [&](Instruction *I) { return AddrGroups.count(I) > 0; };
+ if (llvm::any_of(Deps, inAddrMap))
+ return false;
+ Move.Main.push_back(Info.Inst);
+ llvm::append_range(Move.Deps, Deps);
+ return true;
+ };
+
+ MoveList LoadGroups;
+
+ for (const AddrInfo &Info : Group) {
+ if (!Info.Inst->mayReadFromMemory())
+ continue;
+ if (LoadGroups.empty() || !tryAddTo(Info, LoadGroups.back()))
+ LoadGroups.emplace_back(Info, Group.front().Inst, isHvx(Info), true);
+ }
+
+ // Erase singleton groups.
+ erase_if(LoadGroups, [](const MoveGroup &G) { return G.Main.size() <= 1; });
+ return LoadGroups;
+}
+
+auto AlignVectors::createStoreGroups(const AddrList &Group) const -> MoveList {
+ // Form store groups.
+ // To avoid complications with moving code across basic blocks, only form
+ // groups that are contained within a single basic block.
+
+ auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
+ assert(!Move.Main.empty() && "Move group should have non-empty Main");
+ // For stores with return values we'd have to collect downward dependencies.
+ // There are no such stores that we handle at the moment, so omit that.
+ assert(Info.Inst->getType()->isVoidTy() &&
+ "Not handling stores with return values");
+ // Don't mix HVX and non-HVX instructions.
+ if (Move.IsHvx != isHvx(Info))
+ return false;
+ // For stores we need to be careful whether it's safe to move them.
+ // Stores that are otherwise safe to move together may not appear safe
+ // to move over one another (i.e. isSafeToMoveBefore may return false).
+ Instruction *Base = Move.Main.front();
+ if (Base->getParent() != Info.Inst->getParent())
+ return false;
+ if (!HVC.isSafeToMoveBeforeInBB(*Info.Inst, Base->getIterator(), Move.Main))
+ return false;
+ Move.Main.push_back(Info.Inst);
+ return true;
+ };
+
+ MoveList StoreGroups;
+
+ for (auto I = Group.rbegin(), E = Group.rend(); I != E; ++I) {
+ const AddrInfo &Info = *I;
+ if (!Info.Inst->mayWriteToMemory())
+ continue;
+ if (StoreGroups.empty() || !tryAddTo(Info, StoreGroups.back()))
+ StoreGroups.emplace_back(Info, Group.front().Inst, isHvx(Info), false);
+ }
+
+ // Erase singleton groups.
+ erase_if(StoreGroups, [](const MoveGroup &G) { return G.Main.size() <= 1; });
+ return StoreGroups;
+}
+
+auto AlignVectors::move(const MoveGroup &Move) const -> bool {
+ assert(!Move.Main.empty() && "Move group should have non-empty Main");
+ Instruction *Where = Move.Main.front();
+
+ if (Move.IsLoad) {
+ // Move all deps to before Where, keeping order.
+ for (Instruction *D : Move.Deps)
+ D->moveBefore(Where);
+ // Move all main instructions to after Where, keeping order.
+ ArrayRef<Instruction *> Main(Move.Main);
+ for (Instruction *M : Main.drop_front(1)) {
+ M->moveAfter(Where);
+ Where = M;
+ }
+ } else {
+ // NOTE: Deps are empty for "store" groups. If they need to be
+ // non-empty, decide on the order.
+ assert(Move.Deps.empty());
+ // Move all main instructions to before Where, inverting order.
+ ArrayRef<Instruction *> Main(Move.Main);
+ for (Instruction *M : Main.drop_front(1)) {
+ M->moveBefore(Where);
+ Where = M;
+ }
+ }
+
+ return Move.Main.size() + Move.Deps.size() > 1;
+}
+
+auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool {
+ // TODO: Needs support for masked loads/stores of "scalar" vectors.
+ if (!Move.IsHvx)
+ return false;
+
+ // Return the element with the maximum alignment from Range,
+ // where GetValue obtains the value to compare from an element.
+ auto getMaxOf = [](auto Range, auto GetValue) {
+ return *std::max_element(
+ Range.begin(), Range.end(),
+ [&GetValue](auto &A, auto &B) { return GetValue(A) < GetValue(B); });
+ };
+
+ const AddrList &BaseInfos = AddrGroups.at(Move.Base);
+
+ // Conceptually, there is a vector of N bytes covering the addresses
+ // starting from the minimum offset (i.e. Base.Addr+Start). This vector
+ // represents a contiguous memory region that spans all accessed memory
+ // locations.
+ // The correspondence between loaded or stored values will be expressed
+ // in terms of this vector. For example, the 0th element of the vector
+ // from the Base address info will start at byte Start from the beginning
+ // of this conceptual vector.
+ //
+ // This vector will be loaded/stored starting at the nearest down-aligned
+ // address and the amount of the down-alignment will be AlignVal:
+ // valign(load_vector(align_down(Base+Start)), AlignVal)
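+ //
+ // Illustrative example (editorial sketch, not part of the original source):
+ // in 64-byte HVX mode, two v64i8 loads at Base+4 and Base+68 give a
+ // conceptual vector covering bytes [4..132) relative to Base. The rewritten
+ // code accesses whole 64-byte sectors starting at an address aligned down
+ // to 64 bytes, and AlignVal captures the distance (modulo the sector size)
+ // between that aligned-down address and the lowest original access, which
+ // the valign step uses to line the sectors back up.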
+
+ std::set<Instruction *> TestSet(Move.Main.begin(), Move.Main.end());
+ AddrList MoveInfos;
+ llvm::copy_if(
+ BaseInfos, std::back_inserter(MoveInfos),
+ [&TestSet](const AddrInfo &AI) { return TestSet.count(AI.Inst); });
+
+ // Maximum alignment present in the whole address group.
+ const AddrInfo &WithMaxAlign =
+ getMaxOf(BaseInfos, [](const AddrInfo &AI) { return AI.HaveAlign; });
+ Align MaxGiven = WithMaxAlign.HaveAlign;
+
+ // Member of the move address group with the minimum offset (lowest address).
+ const AddrInfo &WithMinOffset =
+ getMaxOf(MoveInfos, [](const AddrInfo &AI) { return -AI.Offset; });
+
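+ // The largest NeedAlign in the move group; this is the minimum alignment
+ // that the realigned accesses must provide.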
+ const AddrInfo &WithMaxNeeded =
+ getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.NeedAlign; });
+ Align MinNeeded = WithMaxNeeded.NeedAlign;
+
+ // Set the builder at the top instruction in the move group.
+ Instruction *TopIn = Move.IsLoad ? Move.Main.front() : Move.Main.back();
+ IRBuilder<> Builder(TopIn);
+ Value *AlignAddr = nullptr; // Actual aligned address.
+ Value *AlignVal = nullptr; // Right-shift amount (for valign).
+
+ if (MinNeeded <= MaxGiven) {
+ int Start = WithMinOffset.Offset;
+ int OffAtMax = WithMaxAlign.Offset;
+ // Shift the offset of the maximally aligned instruction (OffAtMax)
+ // back by just enough multiples of the required alignment to cover the
+ // distance from Start to OffAtMax.
+ // Calculate the address adjustment amount based on the address with the
+ // maximum alignment. This is to allow a simple gep instruction instead
+ // of potential bitcasts to i8*.
+ int Adjust = -alignTo(OffAtMax - Start, MinNeeded.value());
+ AlignAddr = createAdjustedPointer(Builder, WithMaxAlign.Addr,
+ WithMaxAlign.ValTy, Adjust);
+ int Diff = Start - (OffAtMax + Adjust);
+ AlignVal = HVC.getConstInt(Diff);
+ // Sanity.
+ assert(Diff >= 0);
+ assert(static_cast<decltype(MinNeeded.value())>(Diff) < MinNeeded.value());
+ } else {
+ // WithMinOffset is the lowest address in the group,
+ // WithMinOffset.Addr = Base+Start.
+ // Align instructions for both HVX (V6_valign) and scalar (S2_valignrb)
+ // mask off unnecessary bits, so it's ok to just use the original pointer as
+ // the alignment amount.
+ // Do an explicit down-alignment of the address to avoid creating an
+ // aligned instruction with an address that is not really aligned.
+ AlignAddr = createAlignedPointer(Builder, WithMinOffset.Addr,
+ WithMinOffset.ValTy, MinNeeded.value());
+ AlignVal = Builder.CreatePtrToInt(WithMinOffset.Addr, HVC.getIntTy());
+ }
+
+ ByteSpan VSpan;
+ for (const AddrInfo &AI : MoveInfos) {
+ VSpan.Blocks.emplace_back(AI.Inst, HVC.getSizeOf(AI.ValTy),
+ AI.Offset - WithMinOffset.Offset);
+ }
+
+ // The aligned loads/stores will use blocks that are either scalars,
+ // or HVX vectors. Let "sector" be the unified term for such a block.
+ // blend(scalar, vector) -> sector...
+ int ScLen = Move.IsHvx ? HVC.HST.getVectorLength()
+ : std::max<int>(MinNeeded.value(), 4);
+ assert(!Move.IsHvx || ScLen == 64 || ScLen == 128);
+ assert(Move.IsHvx || ScLen == 4 || ScLen == 8);
+
+ Type *SecTy = HVC.getByteTy(ScLen);
+ int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
+
+ if (Move.IsLoad) {
+ ByteSpan ASpan;
+ auto *True = HVC.getFullValue(HVC.getBoolTy(ScLen));
+ auto *Undef = UndefValue::get(SecTy);
+
+ for (int i = 0; i != NumSectors + 1; ++i) {
+ Value *Ptr = createAdjustedPointer(Builder, AlignAddr, SecTy, i * ScLen);
+ // FIXME: generate a predicated load?
+ Value *Load = createAlignedLoad(Builder, SecTy, Ptr, ScLen, True, Undef);
+ ASpan.Blocks.emplace_back(Load, ScLen, i * ScLen);
+ }
+
+ for (int j = 0; j != NumSectors; ++j) {
+ ASpan[j].Seg.Val = HVC.vralignb(Builder, ASpan[j].Seg.Val,
+ ASpan[j + 1].Seg.Val, AlignVal);
+ }
+
+ for (ByteSpan::Block &B : VSpan) {
+ ByteSpan Section = ASpan.section(B.Pos, B.Seg.Size).shift(-B.Pos);
+ Value *Accum = UndefValue::get(HVC.getByteTy(B.Seg.Size));
+ for (ByteSpan::Block &S : Section) {
+ Value *Pay = HVC.vbytes(Builder, getPayload(S.Seg.Val));
+ Accum =
+ HVC.insertb(Builder, Accum, Pay, S.Seg.Start, S.Seg.Size, S.Pos);
+ }
+ // Instead of casting everything to bytes for the vselect, cast to the
+ // original value type. This will avoid complications with casting masks.
+ // For example, in cases when the original mask applied to i32, it could
+ // be converted to a mask applicable to i8 via pred_typecast intrinsic,
+ // but if the mask is not exactly of HVX length, extra handling would be
+ // needed to make it work.
+ Type *ValTy = getPayload(B.Seg.Val)->getType();
+ Value *Cast = Builder.CreateBitCast(Accum, ValTy);
+ Value *Sel = Builder.CreateSelect(getMask(B.Seg.Val), Cast,
+ getPassThrough(B.Seg.Val));
+ B.Seg.Val->replaceAllUsesWith(Sel);
+ }
+ } else {
+ // Stores.
+ ByteSpan ASpanV, ASpanM;
+
+ // Return a vector value corresponding to the input value Val:
+ // either <1 x Val> for scalar Val, or Val itself for vector Val.
+ auto MakeVec = [](IRBuilder<> &Builder, Value *Val) -> Value * {
+ Type *Ty = Val->getType();
+ if (Ty->isVectorTy())
+ return Val;
+ auto *VecTy = VectorType::get(Ty, 1, /*Scalable*/ false);
+ return Builder.CreateBitCast(Val, VecTy);
+ };
+
+ // Create an extra "undef" sector at the beginning and at the end.
+ // They will be used as the left/right filler in the vlalign step.
+ for (int i = -1; i != NumSectors + 1; ++i) {
+ // For stores, the size of each section is an aligned vector length.
+ // Adjust the store offsets relative to the section start offset.
+ ByteSpan Section = VSpan.section(i * ScLen, ScLen).shift(-i * ScLen);
+ Value *AccumV = UndefValue::get(SecTy);
+ Value *AccumM = HVC.getNullValue(SecTy);
+ for (ByteSpan::Block &S : Section) {
+ Value *Pay = getPayload(S.Seg.Val);
+ Value *Mask = HVC.rescale(Builder, MakeVec(Builder, getMask(S.Seg.Val)),
+ Pay->getType(), HVC.getByteTy());
+ AccumM = HVC.insertb(Builder, AccumM, HVC.vbytes(Builder, Mask),
+ S.Seg.Start, S.Seg.Size, S.Pos);
+ AccumV = HVC.insertb(Builder, AccumV, HVC.vbytes(Builder, Pay),
+ S.Seg.Start, S.Seg.Size, S.Pos);
+ }
+ ASpanV.Blocks.emplace_back(AccumV, ScLen, i * ScLen);
+ ASpanM.Blocks.emplace_back(AccumM, ScLen, i * ScLen);
+ }
+
+ // vlalign
+ for (int j = 1; j != NumSectors + 2; ++j) {
+ ASpanV[j - 1].Seg.Val = HVC.vlalignb(Builder, ASpanV[j - 1].Seg.Val,
+ ASpanV[j].Seg.Val, AlignVal);
+ ASpanM[j - 1].Seg.Val = HVC.vlalignb(Builder, ASpanM[j - 1].Seg.Val,
+ ASpanM[j].Seg.Val, AlignVal);
+ }
+
+ for (int i = 0; i != NumSectors + 1; ++i) {
+ Value *Ptr = createAdjustedPointer(Builder, AlignAddr, SecTy, i * ScLen);
+ Value *Val = ASpanV[i].Seg.Val;
+ Value *Mask = ASpanM[i].Seg.Val; // bytes
+ if (!HVC.isUndef(Val) && !HVC.isZero(Mask))
+ createAlignedStore(Builder, Val, Ptr, ScLen, HVC.vlsb(Builder, Mask));
+ }
+ }
+
+ for (auto *Inst : Move.Main)
+ Inst->eraseFromParent();
+
+ return true;
+}
+
+auto AlignVectors::run() -> bool {
+ if (!createAddressGroups())
+ return false;
+
+ bool Changed = false;
+ MoveList LoadGroups, StoreGroups;
+
+ for (auto &G : AddrGroups) {
+ llvm::append_range(LoadGroups, createLoadGroups(G.second));
+ llvm::append_range(StoreGroups, createStoreGroups(G.second));
+ }
+
+ for (auto &M : LoadGroups)
+ Changed |= move(M);
+ for (auto &M : StoreGroups)
+ Changed |= move(M);
+
+ for (auto &M : LoadGroups)
+ Changed |= realignGroup(M);
+ for (auto &M : StoreGroups)
+ Changed |= realignGroup(M);
+
+ return Changed;
+}
+
+// --- End AlignVectors
+
+auto HexagonVectorCombine::run() -> bool {
+ if (!HST.useHVXOps())
+ return false;
+
+ bool Changed = AlignVectors(*this).run();
+ return Changed;
+}
+
+auto HexagonVectorCombine::getIntTy() const -> IntegerType * {
+ return Type::getInt32Ty(F.getContext());
+}
+
+auto HexagonVectorCombine::getByteTy(int ElemCount) const -> Type * {
+ assert(ElemCount >= 0);
+ IntegerType *ByteTy = Type::getInt8Ty(F.getContext());
+ if (ElemCount == 0)
+ return ByteTy;
+ return VectorType::get(ByteTy, ElemCount, /*Scalable*/ false);
+}
+
+auto HexagonVectorCombine::getBoolTy(int ElemCount) const -> Type * {
+ assert(ElemCount >= 0);
+ IntegerType *BoolTy = Type::getInt1Ty(F.getContext());
+ if (ElemCount == 0)
+ return BoolTy;
+ return VectorType::get(BoolTy, ElemCount, /*Scalable*/ false);
+}
+
+auto HexagonVectorCombine::getConstInt(int Val) const -> ConstantInt * {
+ return ConstantInt::getSigned(getIntTy(), Val);
+}
+
+auto HexagonVectorCombine::isZero(const Value *Val) const -> bool {
+ if (auto *C = dyn_cast<Constant>(Val))
+ return C->isZeroValue();
+ return false;
+}
+
+auto HexagonVectorCombine::getIntValue(const Value *Val) const
+ -> Optional<APInt> {
+ if (auto *CI = dyn_cast<ConstantInt>(Val))
+ return CI->getValue();
+ return None;
+}
+
+auto HexagonVectorCombine::isUndef(const Value *Val) const -> bool {
+ return isa<UndefValue>(Val);
+}
+
+auto HexagonVectorCombine::getSizeOf(const Value *Val) const -> int {
+ return getSizeOf(Val->getType());
+}
+
+auto HexagonVectorCombine::getSizeOf(const Type *Ty) const -> int {
+ return DL.getTypeStoreSize(const_cast<Type *>(Ty)).getFixedValue();
+}
+
+auto HexagonVectorCombine::getTypeAlignment(Type *Ty) const -> int {
+ // The actual type may be shorter than the HVX vector, so determine
+ // the alignment based on subtarget info.
+ if (HST.isTypeForHVX(Ty))
+ return HST.getVectorLength();
+ return DL.getABITypeAlign(Ty).value();
+}
+
+auto HexagonVectorCombine::getNullValue(Type *Ty) const -> Constant * {
+ assert(Ty->isIntOrIntVectorTy());
+ auto Zero = ConstantInt::get(Ty->getScalarType(), 0);
+ if (auto *VecTy = dyn_cast<VectorType>(Ty))
+ return ConstantVector::getSplat(VecTy->getElementCount(), Zero);
+ return Zero;
+}
+
+auto HexagonVectorCombine::getFullValue(Type *Ty) const -> Constant * {
+ assert(Ty->isIntOrIntVectorTy());
+ auto Minus1 = ConstantInt::get(Ty->getScalarType(), -1);
+ if (auto *VecTy = dyn_cast<VectorType>(Ty))
+ return ConstantVector::getSplat(VecTy->getElementCount(), Minus1);
+ return Minus1;
+}
+
+// Insert bytes [Start..Start+Length) of Src into Dst at byte Where.
+auto HexagonVectorCombine::insertb(IRBuilder<> &Builder, Value *Dst, Value *Src,
+ int Start, int Length, int Where) const
+ -> Value * {
+ assert(isByteVecTy(Dst->getType()) && isByteVecTy(Src->getType()));
+ int SrcLen = getSizeOf(Src);
+ int DstLen = getSizeOf(Dst);
+ assert(0 <= Start && Start + Length <= SrcLen);
+ assert(0 <= Where && Where + Length <= DstLen);
+
+ int P2Len = PowerOf2Ceil(SrcLen | DstLen);
+ auto *Undef = UndefValue::get(getByteTy());
+ Value *P2Src = vresize(Builder, Src, P2Len, Undef);
+ Value *P2Dst = vresize(Builder, Dst, P2Len, Undef);
+
+ SmallVector<int, 256> SMask(P2Len);
+ for (int i = 0; i != P2Len; ++i) {
+ // If i is in [Where, Where+Length), pick Src[Start+(i-Where)].
+ // Otherwise, pick Dst[i].
+ SMask[i] =
+ (Where <= i && i < Where + Length) ? P2Len + Start + (i - Where) : i;
+ }
+
+ Value *P2Insert = Builder.CreateShuffleVector(P2Dst, P2Src, SMask);
+ return vresize(Builder, P2Insert, DstLen, Undef);
+}
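+
+// Illustrative example (editorial sketch, not part of the original source):
+// with an 8-byte Dst, a 4-byte Src, Start = 1, Length = 2 and Where = 5,
+// P2Len is 8 and SMask becomes {0,1,2,3,4,9,10,7}: indices below 8 select
+// Dst bytes and indices 8..15 select bytes of Src (resized to 8 bytes with
+// undef padding), so bytes 5 and 6 of the result are Src[1] and Src[2]
+// while the rest come from Dst.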
+
+auto HexagonVectorCombine::vlalignb(IRBuilder<> &Builder, Value *Lo, Value *Hi,
+ Value *Amt) const -> Value * {
+ assert(Lo->getType() == Hi->getType() && "Argument type mismatch");
+ assert(isSectorTy(Hi->getType()));
+ if (isZero(Amt))
+ return Hi;
+ int VecLen = getSizeOf(Hi);
+ if (auto IntAmt = getIntValue(Amt))
+ return getElementRange(Builder, Lo, Hi, VecLen - IntAmt->getSExtValue(),
+ VecLen);
+
+ if (HST.isTypeForHVX(Hi->getType())) {
+ int HwLen = HST.getVectorLength();
+ assert(VecLen == HwLen && "Expecting an exact HVX type");
+ Intrinsic::ID V6_vlalignb = HwLen == 64
+ ? Intrinsic::hexagon_V6_vlalignb
+ : Intrinsic::hexagon_V6_vlalignb_128B;
+ return createHvxIntrinsic(Builder, V6_vlalignb, Hi->getType(),
+ {Hi, Lo, Amt});
+ }
+
+ if (VecLen == 4) {
+ Value *Pair = concat(Builder, {Lo, Hi});
+ Value *Shift = Builder.CreateLShr(Builder.CreateShl(Pair, Amt), 32);
+ Value *Trunc = Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()));
+ return Builder.CreateBitCast(Trunc, Hi->getType());
+ }
+ if (VecLen == 8) {
+ Value *Sub = Builder.CreateSub(getConstInt(VecLen), Amt);
+ return vralignb(Builder, Lo, Hi, Sub);
+ }
+ llvm_unreachable("Unexpected vector length");
+}
+
+auto HexagonVectorCombine::vralignb(IRBuilder<> &Builder, Value *Lo, Value *Hi,
+ Value *Amt) const -> Value * {
+ assert(Lo->getType() == Hi->getType() && "Argument type mismatch");
+ assert(isSectorTy(Lo->getType()));
+ if (isZero(Amt))
+ return Lo;
+ int VecLen = getSizeOf(Lo);
+ if (auto IntAmt = getIntValue(Amt))
+ return getElementRange(Builder, Lo, Hi, IntAmt->getSExtValue(), VecLen);
+
+ if (HST.isTypeForHVX(Lo->getType())) {
+ int HwLen = HST.getVectorLength();
+ assert(VecLen == HwLen && "Expecting an exact HVX type");
+ Intrinsic::ID V6_valignb = HwLen == 64 ? Intrinsic::hexagon_V6_valignb
+ : Intrinsic::hexagon_V6_valignb_128B;
+ return createHvxIntrinsic(Builder, V6_valignb, Lo->getType(),
+ {Hi, Lo, Amt});
+ }
+
+ if (VecLen == 4) {
+ Value *Pair = concat(Builder, {Lo, Hi});
+ Value *Shift = Builder.CreateLShr(Pair, Amt);
+ Value *Trunc = Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()));
+ return Builder.CreateBitCast(Trunc, Lo->getType());
+ }
+ if (VecLen == 8) {
+ Type *Int64Ty = Type::getInt64Ty(F.getContext());
+ Value *Lo64 = Builder.CreateBitCast(Lo, Int64Ty);
+ Value *Hi64 = Builder.CreateBitCast(Hi, Int64Ty);
+ Function *FI = Intrinsic::getDeclaration(F.getParent(),
+ Intrinsic::hexagon_S2_valignrb);
+ Value *Call = Builder.CreateCall(FI, {Hi64, Lo64, Amt});
+ return Builder.CreateBitCast(Call, Lo->getType());
+ }
+ llvm_unreachable("Unexpected vector length");
+}
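+
+// Illustrative note (editorial sketch, not part of the original source):
+// both helpers select a VecLen-byte window from the 2*VecLen-byte sequence
+// formed by Lo followed by Hi. With VecLen = 4, Lo = {a,b,c,d},
+// Hi = {e,f,g,h} and Amt = 1, vralignb returns {b,c,d,e} (bytes
+// [Amt, Amt+VecLen) of the pair) and vlalignb returns {d,e,f,g} (bytes
+// [VecLen-Amt, 2*VecLen-Amt)).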
+
+// Concatenates a sequence of vectors of the same type.
+auto HexagonVectorCombine::concat(IRBuilder<> &Builder,
+ ArrayRef<Value *> Vecs) const -> Value * {
+ assert(!Vecs.empty());
+ SmallVector<int, 256> SMask;
+ std::vector<Value *> Work[2];
+ int ThisW = 0, OtherW = 1;
+
+ Work[ThisW].assign(Vecs.begin(), Vecs.end());
+ while (Work[ThisW].size() > 1) {
+ auto *Ty = cast<VectorType>(Work[ThisW].front()->getType());
+ int ElemCount = Ty->getElementCount().getFixedValue();
+ SMask.resize(ElemCount * 2);
+ std::iota(SMask.begin(), SMask.end(), 0);
+
+ Work[OtherW].clear();
+ if (Work[ThisW].size() % 2 != 0)
+ Work[ThisW].push_back(UndefValue::get(Ty));
+ for (int i = 0, e = Work[ThisW].size(); i < e; i += 2) {
+ Value *Joined = Builder.CreateShuffleVector(Work[ThisW][i],
+ Work[ThisW][i + 1], SMask);
+ Work[OtherW].push_back(Joined);
+ }
+ std::swap(ThisW, OtherW);
+ }
+
+ // Since there may have been some undefs appended to make shuffle operands
+ // have the same type, perform the last shuffle to only pick the original
+ // elements.
+ SMask.resize(Vecs.size() * getSizeOf(Vecs.front()->getType()));
+ std::iota(SMask.begin(), SMask.end(), 0);
+ Value *Total = Work[OtherW].front();
+ return Builder.CreateShuffleVector(Total, SMask);
+}
+
+auto HexagonVectorCombine::vresize(IRBuilder<> &Builder, Value *Val,
+ int NewSize, Value *Pad) const -> Value * {
+ assert(isa<VectorType>(Val->getType()));
+ auto *ValTy = cast<VectorType>(Val->getType());
+ assert(ValTy->getElementType() == Pad->getType());
+
+ int CurSize = ValTy->getElementCount().getFixedValue();
+ if (CurSize == NewSize)
+ return Val;
+ // Truncate?
+ if (CurSize > NewSize)
+ return getElementRange(Builder, Val, /*Unused*/ Val, 0, NewSize);
+ // Extend.
+ SmallVector<int, 128> SMask(NewSize);
+ std::iota(SMask.begin(), SMask.begin() + CurSize, 0);
+ std::fill(SMask.begin() + CurSize, SMask.end(), CurSize);
+ Value *PadVec = Builder.CreateVectorSplat(CurSize, Pad);
+ return Builder.CreateShuffleVector(Val, PadVec, SMask);
+}
+
+auto HexagonVectorCombine::rescale(IRBuilder<> &Builder, Value *Mask,
+ Type *FromTy, Type *ToTy) const -> Value * {
+ // Mask is a vector <N x i1>, where each element corresponds to an
+ // element of FromTy. Remap it so that each element will correspond
+ // to an element of ToTy.
+ assert(isa<VectorType>(Mask->getType()));
+
+ Type *FromSTy = FromTy->getScalarType();
+ Type *ToSTy = ToTy->getScalarType();
+ if (FromSTy == ToSTy)
+ return Mask;
+
+ int FromSize = getSizeOf(FromSTy);
+ int ToSize = getSizeOf(ToSTy);
+ assert(FromSize % ToSize == 0 || ToSize % FromSize == 0);
+
+ auto *MaskTy = cast<VectorType>(Mask->getType());
+ int FromCount = MaskTy->getElementCount().getFixedValue();
+ int ToCount = (FromCount * FromSize) / ToSize;
+ assert((FromCount * FromSize) % ToSize == 0);
+
+ // Mask <N x i1> -> sext to <N x FromTy> -> bitcast to <M x ToTy> ->
+ // -> trunc to <M x i1>.
+ Value *Ext = Builder.CreateSExt(
+ Mask, VectorType::get(FromSTy, FromCount, /*Scalable*/ false));
+ Value *Cast = Builder.CreateBitCast(
+ Ext, VectorType::get(ToSTy, ToCount, /*Scalable*/ false));
+ return Builder.CreateTrunc(
+ Cast, VectorType::get(getBoolTy(), ToCount, /*Scalable*/ false));
+}
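+
+// Illustrative example (editorial sketch, not part of the original source):
+// rescaling a <4 x i1> mask that guarded a <4 x i32> value to byte
+// granularity (ToTy = i8) goes through <4 x i32> (sext), <16 x i8>
+// (bitcast) and <16 x i1> (trunc), so each original mask bit ends up
+// replicated across the four bytes of its i32 element.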
+
+// Bitcast to bytes, and return least significant bits.
+auto HexagonVectorCombine::vlsb(IRBuilder<> &Builder, Value *Val) const
+ -> Value * {
+ Type *ScalarTy = Val->getType()->getScalarType();
+ if (ScalarTy == getBoolTy())
+ return Val;
+
+ Value *Bytes = vbytes(Builder, Val);
+ if (auto *VecTy = dyn_cast<VectorType>(Bytes->getType()))
+ return Builder.CreateTrunc(Bytes, getBoolTy(getSizeOf(VecTy)));
+ // If Bytes is a scalar (i.e. Val was a scalar byte), return i1, not
+ // <1 x i1>.
+ return Builder.CreateTrunc(Bytes, getBoolTy());
+}
+
+// Bitcast to bytes for non-bool. For bool, convert i1 -> i8.
+auto HexagonVectorCombine::vbytes(IRBuilder<> &Builder, Value *Val) const
+ -> Value * {
+ Type *ScalarTy = Val->getType()->getScalarType();
+ if (ScalarTy == getByteTy())
+ return Val;
+
+ if (ScalarTy != getBoolTy())
+ return Builder.CreateBitCast(Val, getByteTy(getSizeOf(Val)));
+ // For bool, return a sext from i1 to i8.
+ if (auto *VecTy = dyn_cast<VectorType>(Val->getType()))
+ return Builder.CreateSExt(Val, VectorType::get(getByteTy(), VecTy));
+ return Builder.CreateSExt(Val, getByteTy());
+}
+
+auto HexagonVectorCombine::createHvxIntrinsic(IRBuilder<> &Builder,
+ Intrinsic::ID IntID, Type *RetTy,
+ ArrayRef<Value *> Args) const
+ -> Value * {
+ int HwLen = HST.getVectorLength();
+ Type *BoolTy = Type::getInt1Ty(F.getContext());
+ Type *Int32Ty = Type::getInt32Ty(F.getContext());
+ // HVX vector -> v16i32/v32i32
+ // HVX vector predicate -> v512i1/v1024i1
+ auto getTypeForIntrin = [&](Type *Ty) -> Type * {
+ if (HST.isTypeForHVX(Ty, /*IncludeBool*/ true)) {
+ Type *ElemTy = cast<VectorType>(Ty)->getElementType();
+ if (ElemTy == Int32Ty)
+ return Ty;
+ if (ElemTy == BoolTy)
+ return VectorType::get(BoolTy, 8 * HwLen, /*Scalable*/ false);
+ return VectorType::get(Int32Ty, HwLen / 4, /*Scalable*/ false);
+ }
+ // Non-HVX type. It should be a scalar.
+ assert(Ty == Int32Ty || Ty->isIntegerTy(64));
+ return Ty;
+ };
+
+ auto getCast = [&](IRBuilder<> &Builder, Value *Val,
+ Type *DestTy) -> Value * {
+ Type *SrcTy = Val->getType();
+ if (SrcTy == DestTy)
+ return Val;
+ if (HST.isTypeForHVX(SrcTy, /*IncludeBool*/ true)) {
+ if (cast<VectorType>(SrcTy)->getElementType() == BoolTy) {
+ // This should take care of casts the other way too, for example
+ // v1024i1 -> v32i1.
+ Intrinsic::ID TC = HwLen == 64
+ ? Intrinsic::hexagon_V6_pred_typecast
+ : Intrinsic::hexagon_V6_pred_typecast_128B;
+ Function *FI = Intrinsic::getDeclaration(F.getParent(), TC,
+ {DestTy, Val->getType()});
+ return Builder.CreateCall(FI, {Val});
+ }
+ // Non-predicate HVX vector.
+ return Builder.CreateBitCast(Val, DestTy);
+ }
+ // Non-HVX type. It should be a scalar, and it should already have
+ // a valid type.
+ llvm_unreachable("Unexpected type");
+ };
+
+ SmallVector<Value *, 4> IntOps;
+ for (Value *A : Args)
+ IntOps.push_back(getCast(Builder, A, getTypeForIntrin(A->getType())));
+ Function *FI = Intrinsic::getDeclaration(F.getParent(), IntID);
+ Value *Call = Builder.CreateCall(FI, IntOps);
+
+ Type *CallTy = Call->getType();
+ if (CallTy == RetTy)
+ return Call;
+ // Scalar types should have RetTy matching the call return type.
+ assert(HST.isTypeForHVX(CallTy, /*IncludeBool*/ true));
+ if (cast<VectorType>(CallTy)->getElementType() == BoolTy)
+ return getCast(Builder, Call, RetTy);
+ return Builder.CreateBitCast(Call, RetTy);
+}
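+
+// Illustrative usage sketch (editorial note, not part of the original
+// source): in 128-byte HVX mode, a call such as
+// createHvxIntrinsic(Builder, Intrinsic::hexagon_V6_valignb_128B, LoTy,
+// {Hi, Lo, Amt}) bitcasts the v128i8 vector operands to v32i32, emits the
+// intrinsic call, and bitcasts the v32i32 result back to LoTy; HVX
+// predicate vectors are instead converted through the
+// hexagon_V6_pred_typecast intrinsics.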
+
+auto HexagonVectorCombine::calculatePointerDifference(Value *Ptr0,
+ Value *Ptr1) const
+ -> Optional<int> {
+ struct Builder : IRBuilder<> {
+ Builder(BasicBlock *B) : IRBuilder<>(B) {}
+ ~Builder() {
+ for (Instruction *I : llvm::reverse(ToErase))
+ I->eraseFromParent();
+ }
+ SmallVector<Instruction *, 8> ToErase;
+ };
+
+#define CallBuilder(B, F) \
+ [&](auto &B_) { \
+ Value *V = B_.F; \
+ if (auto *I = dyn_cast<Instruction>(V)) \
+ B_.ToErase.push_back(I); \
+ return V; \
+ }(B)
+
+ auto Simplify = [&](Value *V) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ SimplifyQuery Q(DL, &TLI, &DT, &AC, I);
+ if (Value *S = SimplifyInstruction(I, Q))
+ return S;
+ }
+ return V;
+ };
+
+ auto StripBitCast = [](Value *V) {
+ while (auto *C = dyn_cast<BitCastInst>(V))
+ V = C->getOperand(0);
+ return V;
+ };
+
+ Ptr0 = StripBitCast(Ptr0);
+ Ptr1 = StripBitCast(Ptr1);
+ if (!isa<GetElementPtrInst>(Ptr0) || !isa<GetElementPtrInst>(Ptr1))
+ return None;
+
+ auto *Gep0 = cast<GetElementPtrInst>(Ptr0);
+ auto *Gep1 = cast<GetElementPtrInst>(Ptr1);
+ if (Gep0->getPointerOperand() != Gep1->getPointerOperand())
+ return None;
+
+ Builder B(Gep0->getParent());
+ Value *BasePtr = Gep0->getPointerOperand();
+ int Scale = DL.getTypeStoreSize(BasePtr->getType()->getPointerElementType());
+
+ // FIXME: for now only check GEPs with a single index.
+ if (Gep0->getNumOperands() != 2 || Gep1->getNumOperands() != 2)
+ return None;
+
+ Value *Idx0 = Gep0->getOperand(1);
+ Value *Idx1 = Gep1->getOperand(1);
+
+ // First, try to simplify the subtraction directly.
+ if (auto *Diff = dyn_cast<ConstantInt>(
+ Simplify(CallBuilder(B, CreateSub(Idx0, Idx1)))))
+ return Diff->getSExtValue() * Scale;
+
+ KnownBits Known0 = computeKnownBits(Idx0, DL, 0, &AC, Gep0, &DT);
+ KnownBits Known1 = computeKnownBits(Idx1, DL, 0, &AC, Gep1, &DT);
+ APInt Unknown = ~(Known0.Zero | Known0.One) | ~(Known1.Zero | Known1.One);
+ if (Unknown.isAllOnesValue())
+ return None;
+
+ Value *MaskU = ConstantInt::get(Idx0->getType(), Unknown);
+ Value *AndU0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskU)));
+ Value *AndU1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskU)));
+ Value *SubU = Simplify(CallBuilder(B, CreateSub(AndU0, AndU1)));
+ int Diff0 = 0;
+ if (auto *C = dyn_cast<ConstantInt>(SubU)) {
+ Diff0 = C->getSExtValue();
+ } else {
+ return None;
+ }
+
+ Value *MaskK = ConstantInt::get(MaskU->getType(), ~Unknown);
+ Value *AndK0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskK)));
+ Value *AndK1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskK)));
+ Value *SubK = Simplify(CallBuilder(B, CreateSub(AndK0, AndK1)));
+ int Diff1 = 0;
+ if (auto *C = dyn_cast<ConstantInt>(SubK)) {
+ Diff1 = C->getSExtValue();
+ } else {
+ return None;
+ }
+
+ return (Diff0 + Diff1) * Scale;
+
+#undef CallBuilder
+}
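+
+// Illustrative example (editorial sketch, not part of the original source):
+// if Gep0 and Gep1 index the same base with (n << 2) | 3 and (n << 2) | 1,
+// the direct subtraction may not fold, but known-bits analysis shows the
+// low two bits of each index (11 and 01). When instruction simplification
+// can cancel the identical "unknown" upper portions, their masked
+// difference is 0, the known low bits subtract to 2, and the result is
+// (0 + 2) * Scale: a constant distance even though neither index is a
+// compile-time constant.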
+
+template <typename T>
+auto HexagonVectorCombine::isSafeToMoveBeforeInBB(const Instruction &In,
+ BasicBlock::const_iterator To,
+ const T &Ignore) const
+ -> bool {
+ auto getLocOrNone = [this](const Instruction &I) -> Optional<MemoryLocation> {
+ if (const auto *II = dyn_cast<IntrinsicInst>(&I)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::masked_load:
+ return MemoryLocation::getForArgument(II, 0, TLI);
+ case Intrinsic::masked_store:
+ return MemoryLocation::getForArgument(II, 1, TLI);
+ }
+ }
+ return MemoryLocation::getOrNone(&I);
+ };
+
+ // The source and the destination must be in the same basic block.
+ const BasicBlock &Block = *In.getParent();
+ assert(Block.begin() == To || Block.end() == To || To->getParent() == &Block);
+ // No PHIs.
+ if (isa<PHINode>(In) || (To != Block.end() && isa<PHINode>(*To)))
+ return false;
+
+ if (!mayBeMemoryDependent(In))
+ return true;
+ bool MayWrite = In.mayWriteToMemory();
+ auto MaybeLoc = getLocOrNone(In);
+
+ auto From = In.getIterator();
+ if (From == To)
+ return true;
+ bool MoveUp = (To != Block.end() && To->comesBefore(&In));
+ auto Range =
+ MoveUp ? std::make_pair(To, From) : std::make_pair(std::next(From), To);
+ for (auto It = Range.first; It != Range.second; ++It) {
+ const Instruction &I = *It;
+ if (llvm::is_contained(Ignore, &I))
+ continue;
+ // Parts based on isSafeToMoveBefore from CodeMoverUtils.cpp.
+ if (I.mayThrow())
+ return false;
+ if (auto *CB = dyn_cast<CallBase>(&I)) {
+ if (!CB->hasFnAttr(Attribute::WillReturn))
+ return false;
+ if (!CB->hasFnAttr(Attribute::NoSync))
+ return false;
+ }
+ if (I.mayReadOrWriteMemory()) {
+ auto MaybeLocI = getLocOrNone(I);
+ if (MayWrite || I.mayWriteToMemory()) {
+ if (!MaybeLoc || !MaybeLocI)
+ return false;
+ if (!AA.isNoAlias(*MaybeLoc, *MaybeLocI))
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+#ifndef NDEBUG
+auto HexagonVectorCombine::isByteVecTy(Type *Ty) const -> bool {
+ if (auto *VecTy = dyn_cast<VectorType>(Ty))
+ return VecTy->getElementType() == getByteTy();
+ return false;
+}
+
+auto HexagonVectorCombine::isSectorTy(Type *Ty) const -> bool {
+ if (!isByteVecTy(Ty))
+ return false;
+ int Size = getSizeOf(Ty);
+ if (HST.isTypeForHVX(Ty))
+ return Size == static_cast<int>(HST.getVectorLength());
+ return Size == 4 || Size == 8;
+}
+#endif
+
+auto HexagonVectorCombine::getElementRange(IRBuilder<> &Builder, Value *Lo,
+ Value *Hi, int Start,
+ int Length) const -> Value * {
+ assert(0 <= Start && Start < Length);
+ SmallVector<int, 128> SMask(Length);
+ std::iota(SMask.begin(), SMask.end(), Start);
+ return Builder.CreateShuffleVector(Lo, Hi, SMask);
+}
+
+// Pass management.
+
+namespace llvm {
+void initializeHexagonVectorCombineLegacyPass(PassRegistry &);
+FunctionPass *createHexagonVectorCombineLegacyPass();
+} // namespace llvm
+
+namespace {
+class HexagonVectorCombineLegacy : public FunctionPass {
+public:
+ static char ID;
+
+ HexagonVectorCombineLegacy() : FunctionPass(ID) {}
+
+ StringRef getPassName() const override { return "Hexagon Vector Combine"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnFunction(Function &F) override {
+ AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ AssumptionCache &AC =
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ auto &TM = getAnalysis<TargetPassConfig>().getTM<HexagonTargetMachine>();
+ HexagonVectorCombine HVC(F, AA, AC, DT, TLI, TM);
+ return HVC.run();
+ }
+};
+} // namespace
+
+char HexagonVectorCombineLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(HexagonVectorCombineLegacy, DEBUG_TYPE,
+ "Hexagon Vector Combine", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(HexagonVectorCombineLegacy, DEBUG_TYPE,
+ "Hexagon Vector Combine", false, false)
+
+FunctionPass *llvm::createHexagonVectorCombineLegacyPass() {
+ return new HexagonVectorCombineLegacy();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
index 42451e02ba36..310536458de9 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
@@ -11,110 +11,9 @@
// to identify loop carried dependences. This is scalar replacement for vector
// types.
//
-//-----------------------------------------------------------------------------
-// Motivation: Consider the case where we have the following loop structure.
-//
-// Loop:
-// t0 = a[i];
-// t1 = f(t0);
-// t2 = g(t1);
-// ...
-// t3 = a[i+1];
-// t4 = f(t3);
-// t5 = g(t4);
-// t6 = op(t2, t5)
-// cond_branch <Loop>
-//
-// This can be converted to
-// t00 = a[0];
-// t10 = f(t00);
-// t20 = g(t10);
-// Loop:
-// t2 = t20;
-// t3 = a[i+1];
-// t4 = f(t3);
-// t5 = g(t4);
-// t6 = op(t2, t5)
-// t20 = t5
-// cond_branch <Loop>
-//
-// SROA does a good job of reusing a[i+1] as a[i] in the next iteration.
-// Such a loop comes to this pass in the following form.
-//
-// LoopPreheader:
-// X0 = a[0];
-// Loop:
-// X2 = PHI<(X0, LoopPreheader), (X1, Loop)>
-// t1 = f(X2) <-- I1
-// t2 = g(t1)
-// ...
-// X1 = a[i+1]
-// t4 = f(X1) <-- I2
-// t5 = g(t4)
-// t6 = op(t2, t5)
-// cond_branch <Loop>
-//
-// In this pass, we look for PHIs such as X2 whose incoming values come only
-// from the Loop Preheader and over the backedge and additionaly, both these
-// values are the results of the same operation in terms of opcode. We call such
-// a PHI node a dependence chain or DepChain. In this case, the dependence of X2
-// over X1 is carried over only one iteration and so the DepChain is only one
-// PHI node long.
-//
-// Then, we traverse the uses of the PHI (X2) and the uses of the value of the
-// PHI coming over the backedge (X1). We stop at the first pair of such users
-// I1 (of X2) and I2 (of X1) that meet the following conditions.
-// 1. I1 and I2 are the same operation, but with different operands.
-// 2. X2 and X1 are used at the same operand number in the two instructions.
-// 3. All other operands Op1 of I1 and Op2 of I2 are also such that there is a
-// a DepChain from Op1 to Op2 of the same length as that between X2 and X1.
-//
-// We then make the following transformation
-// LoopPreheader:
-// X0 = a[0];
-// Y0 = f(X0);
-// Loop:
-// X2 = PHI<(X0, LoopPreheader), (X1, Loop)>
-// Y2 = PHI<(Y0, LoopPreheader), (t4, Loop)>
-// t1 = f(X2) <-- Will be removed by DCE.
-// t2 = g(Y2)
-// ...
-// X1 = a[i+1]
-// t4 = f(X1)
-// t5 = g(t4)
-// t6 = op(t2, t5)
-// cond_branch <Loop>
-//
-// We proceed until we cannot find any more such instructions I1 and I2.
-//
-// --- DepChains & Loop carried dependences ---
-// Consider a single basic block loop such as
-//
-// LoopPreheader:
-// X0 = ...
-// Y0 = ...
-// Loop:
-// X2 = PHI<(X0, LoopPreheader), (X1, Loop)>
-// Y2 = PHI<(Y0, LoopPreheader), (X2, Loop)>
-// ...
-// X1 = ...
-// ...
-// cond_branch <Loop>
-//
-// Then there is a dependence between X2 and X1 that goes back one iteration,
-// i.e. X1 is used as X2 in the very next iteration. We represent this as a
-// DepChain from X2 to X1 (X2->X1).
-// Similarly, there is a dependence between Y2 and X1 that goes back two
-// iterations. X1 is used as Y2 two iterations after it is computed. This is
-// represented by a DepChain as (Y2->X2->X1).
-//
-// A DepChain has the following properties.
-// 1. Num of edges in DepChain = Number of Instructions in DepChain = Number of
-// iterations of carried dependence + 1.
-// 2. All instructions in the DepChain except the last are PHIs.
-//
//===----------------------------------------------------------------------===//
+#include "HexagonVectorLoopCarriedReuse.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
@@ -161,8 +60,8 @@ static cl::opt<int> HexagonVLCRIterationLim("hexagon-vlcr-iteration-lim",
namespace llvm {
-void initializeHexagonVectorLoopCarriedReusePass(PassRegistry&);
-Pass *createHexagonVectorLoopCarriedReusePass();
+void initializeHexagonVectorLoopCarriedReuseLegacyPassPass(PassRegistry &);
+Pass *createHexagonVectorLoopCarriedReuseLegacyPass();
} // end namespace llvm
@@ -262,13 +161,13 @@ namespace {
return OS;
}
- class HexagonVectorLoopCarriedReuse : public LoopPass {
+ class HexagonVectorLoopCarriedReuseLegacyPass : public LoopPass {
public:
static char ID;
- explicit HexagonVectorLoopCarriedReuse() : LoopPass(ID) {
+ explicit HexagonVectorLoopCarriedReuseLegacyPass() : LoopPass(ID) {
PassRegistry *PR = PassRegistry::getPassRegistry();
- initializeHexagonVectorLoopCarriedReusePass(*PR);
+ initializeHexagonVectorLoopCarriedReuseLegacyPassPass(*PR);
}
StringRef getPassName() const override {
@@ -276,7 +175,6 @@ namespace {
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<LoopInfoWrapperPass>();
AU.addRequiredID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
AU.addPreservedID(LCSSAID);
@@ -284,6 +182,13 @@ namespace {
}
bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+ };
+
+ class HexagonVectorLoopCarriedReuse {
+ public:
+    HexagonVectorLoopCarriedReuse(Loop *L) : CurLoop(L) {}
+
+ bool run();
private:
SetVector<DepChain *> Dependences;
@@ -305,33 +210,49 @@ namespace {
} // end anonymous namespace
-char HexagonVectorLoopCarriedReuse::ID = 0;
+char HexagonVectorLoopCarriedReuseLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(HexagonVectorLoopCarriedReuse, "hexagon-vlcr",
- "Hexagon-specific predictive commoning for HVX vectors", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_BEGIN(HexagonVectorLoopCarriedReuseLegacyPass, "hexagon-vlcr",
+ "Hexagon-specific predictive commoning for HVX vectors",
+ false, false)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
-INITIALIZE_PASS_END(HexagonVectorLoopCarriedReuse, "hexagon-vlcr",
- "Hexagon-specific predictive commoning for HVX vectors", false, false)
+INITIALIZE_PASS_END(HexagonVectorLoopCarriedReuseLegacyPass, "hexagon-vlcr",
+ "Hexagon-specific predictive commoning for HVX vectors",
+ false, false)
+
+PreservedAnalyses
+HexagonVectorLoopCarriedReusePass::run(Loop &L, LoopAnalysisManager &LAM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ HexagonVectorLoopCarriedReuse Vlcr(&L);
+ if (!Vlcr.run())
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
-bool HexagonVectorLoopCarriedReuse::runOnLoop(Loop *L, LPPassManager &LPM) {
+bool HexagonVectorLoopCarriedReuseLegacyPass::runOnLoop(Loop *L,
+ LPPassManager &LPM) {
if (skipLoop(L))
return false;
+ HexagonVectorLoopCarriedReuse Vlcr(L);
+ return Vlcr.run();
+}
- if (!L->getLoopPreheader())
+bool HexagonVectorLoopCarriedReuse::run() {
+ if (!CurLoop->getLoopPreheader())
return false;
// Work only on innermost loops.
- if (!L->getSubLoops().empty())
+ if (!CurLoop->getSubLoops().empty())
return false;
// Work only on single basic blocks loops.
- if (L->getNumBlocks() != 1)
+ if (CurLoop->getNumBlocks() != 1)
return false;
- CurLoop = L;
-
return doVLCR();
}
@@ -745,6 +666,6 @@ void HexagonVectorLoopCarriedReuse::findLoopCarriedDeps() {
++i) { dbgs() << *Dependences[i] << "\n"; });
}
-Pass *llvm::createHexagonVectorLoopCarriedReusePass() {
- return new HexagonVectorLoopCarriedReuse();
+Pass *llvm::createHexagonVectorLoopCarriedReuseLegacyPass() {
+ return new HexagonVectorLoopCarriedReuseLegacyPass();
}
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.h b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.h
new file mode 100644
index 000000000000..f1e0c5804ace
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.h
@@ -0,0 +1,139 @@
+//===- HexagonVectorLoopCarriedReuse.h ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass removes the computation of provably redundant expressions that have
+// been computed earlier in a previous iteration. It relies on the use of PHIs
+// to identify loop carried dependences. This is scalar replacement for vector
+// types.
+//
+//-----------------------------------------------------------------------------
+// Motivation: Consider the case where we have the following loop structure.
+//
+// Loop:
+// t0 = a[i];
+// t1 = f(t0);
+// t2 = g(t1);
+// ...
+// t3 = a[i+1];
+// t4 = f(t3);
+// t5 = g(t4);
+// t6 = op(t2, t5)
+// cond_branch <Loop>
+//
+// This can be converted to
+// t00 = a[0];
+// t10 = f(t00);
+// t20 = g(t10);
+// Loop:
+// t2 = t20;
+// t3 = a[i+1];
+// t4 = f(t3);
+// t5 = g(t4);
+// t6 = op(t2, t5)
+// t20 = t5
+// cond_branch <Loop>
+//
+// SROA does a good job of reusing a[i+1] as a[i] in the next iteration.
+// Such a loop comes to this pass in the following form.
+//
+// LoopPreheader:
+// X0 = a[0];
+// Loop:
+// X2 = PHI<(X0, LoopPreheader), (X1, Loop)>
+// t1 = f(X2) <-- I1
+// t2 = g(t1)
+// ...
+// X1 = a[i+1]
+// t4 = f(X1) <-- I2
+// t5 = g(t4)
+// t6 = op(t2, t5)
+// cond_branch <Loop>
+//
+// In this pass, we look for PHIs such as X2 whose incoming values come only
+// from the Loop Preheader and over the backedge and additionally, both these
+// values are the results of the same operation in terms of opcode. We call such
+// a PHI node a dependence chain or DepChain. In this case, the dependence of X2
+// over X1 is carried over only one iteration and so the DepChain is only one
+// PHI node long.
+//
+// Then, we traverse the uses of the PHI (X2) and the uses of the value of the
+// PHI coming over the backedge (X1). We stop at the first pair of such users
+// I1 (of X2) and I2 (of X1) that meet the following conditions.
+// 1. I1 and I2 are the same operation, but with different operands.
+// 2. X2 and X1 are used at the same operand number in the two instructions.
+// 3. All other operands Op1 of I1 and Op2 of I2 are also such that there is
+//    a DepChain from Op1 to Op2 of the same length as that between X2 and X1.
+//
+// We then make the following transformation
+// LoopPreheader:
+// X0 = a[0];
+// Y0 = f(X0);
+// Loop:
+// X2 = PHI<(X0, LoopPreheader), (X1, Loop)>
+// Y2 = PHI<(Y0, LoopPreheader), (t4, Loop)>
+// t1 = f(X2) <-- Will be removed by DCE.
+// t2 = g(Y2)
+// ...
+// X1 = a[i+1]
+// t4 = f(X1)
+// t5 = g(t4)
+// t6 = op(t2, t5)
+// cond_branch <Loop>
+//
+// We proceed until we cannot find any more such instructions I1 and I2.
+//
+// --- DepChains & Loop carried dependences ---
+// Consider a single basic block loop such as
+//
+// LoopPreheader:
+// X0 = ...
+// Y0 = ...
+// Loop:
+// X2 = PHI<(X0, LoopPreheader), (X1, Loop)>
+// Y2 = PHI<(Y0, LoopPreheader), (X2, Loop)>
+// ...
+// X1 = ...
+// ...
+// cond_branch <Loop>
+//
+// Then there is a dependence between X2 and X1 that goes back one iteration,
+// i.e. X1 is used as X2 in the very next iteration. We represent this as a
+// DepChain from X2 to X1 (X2->X1).
+// Similarly, there is a dependence between Y2 and X1 that goes back two
+// iterations. X1 is used as Y2 two iterations after it is computed. This is
+// represented by a DepChain as (Y2->X2->X1).
+//
+// A DepChain has the following properties.
+// 1. Num of edges in DepChain = Number of Instructions in DepChain = Number of
+// iterations of carried dependence + 1.
+// 2. All instructions in the DepChain except the last are PHIs.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONVLCR_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONVLCR_H
+
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+
+namespace llvm {
+
+class Loop;
+
+/// Hexagon Vector Loop Carried Reuse Pass
+struct HexagonVectorLoopCarriedReusePass
+ : public PassInfoMixin<HexagonVectorLoopCarriedReusePass> {
+ HexagonVectorLoopCarriedReusePass() {}
+
+ /// Run pass over the Loop.
+ PreservedAnalyses run(Loop &L, LoopAnalysisManager &LAM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &U);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONVLCR_H
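
Editor's note, not part of the patch: a minimal sketch of how the new-PM wrapper declared in this header could be scheduled, assuming a caller that already owns a FunctionPassManager. The helper name addVLCR is illustrative.

#include "HexagonVectorLoopCarriedReuse.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"

using namespace llvm;

static void addVLCR(FunctionPassManager &FPM) {
  LoopPassManager LPM;
  LPM.addPass(HexagonVectorLoopCarriedReusePass());
  // Wrap the loop pipeline so it can run as a function pass; the default
  // adaptor arguments (no MemorySSA) are assumed to be sufficient here.
  FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM)));
}
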
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index e7069819fa57..627c53cadd84 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -74,7 +74,7 @@ public:
void setExtender(MCContext &Context) const {
if (Extender == nullptr)
- const_cast<HexagonAsmBackend *>(this)->Extender = new (Context) MCInst;
+ const_cast<HexagonAsmBackend *>(this)->Extender = Context.createMCInst();
}
MCInst *takeExtender() const {
@@ -736,7 +736,7 @@ public:
auto &Inst = const_cast<MCInst &>(RF.getInst());
while (Size > 0 &&
HexagonMCInstrInfo::bundleSize(Inst) < MaxPacketSize) {
- MCInst *Nop = new (Context) MCInst;
+ MCInst *Nop = Context.createMCInst();
Nop->setOpcode(Hexagon::A2_nop);
Inst.addOperand(MCOperand::createInst(Nop));
Size -= 4;
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
index cd96a23e1b94..76658378c0cd 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
@@ -34,6 +34,7 @@ public:
static char const *getRegisterName(unsigned RegNo);
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
void printOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
void printBrtarget(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
index 2b0bbdafa381..e7ade7834a9f 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
@@ -14,6 +14,7 @@
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "MCTargetDesc/HexagonMCShuffler.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -209,7 +210,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
case Hexagon::A2_tfrsi:
Rt = L.getOperand(0);
compoundOpcode = J4_jumpseti;
- CompoundInsn = new (Context) MCInst;
+ CompoundInsn = Context.createMCInst();
CompoundInsn->setOpcode(compoundOpcode);
CompoundInsn->addOperand(Rt);
@@ -222,7 +223,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
Rs = L.getOperand(1);
compoundOpcode = J4_jumpsetr;
- CompoundInsn = new (Context) MCInst;
+ CompoundInsn = Context.createMCInst();
CompoundInsn->setOpcode(compoundOpcode);
CompoundInsn->addOperand(Rt);
CompoundInsn->addOperand(Rs);
@@ -236,7 +237,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
Rt = L.getOperand(2);
compoundOpcode = cmpeqBitOpcode[getCompoundOp(R)];
- CompoundInsn = new (Context) MCInst;
+ CompoundInsn = Context.createMCInst();
CompoundInsn->setOpcode(compoundOpcode);
CompoundInsn->addOperand(Rs);
CompoundInsn->addOperand(Rt);
@@ -249,7 +250,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
Rt = L.getOperand(2);
compoundOpcode = cmpgtBitOpcode[getCompoundOp(R)];
- CompoundInsn = new (Context) MCInst;
+ CompoundInsn = Context.createMCInst();
CompoundInsn->setOpcode(compoundOpcode);
CompoundInsn->addOperand(Rs);
CompoundInsn->addOperand(Rt);
@@ -262,7 +263,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
Rt = L.getOperand(2);
compoundOpcode = cmpgtuBitOpcode[getCompoundOp(R)];
- CompoundInsn = new (Context) MCInst;
+ CompoundInsn = Context.createMCInst();
CompoundInsn->setOpcode(compoundOpcode);
CompoundInsn->addOperand(Rs);
CompoundInsn->addOperand(Rt);
@@ -280,7 +281,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
compoundOpcode = cmpeqiBitOpcode[getCompoundOp(R)];
Rs = L.getOperand(1);
- CompoundInsn = new (Context) MCInst;
+ CompoundInsn = Context.createMCInst();
CompoundInsn->setOpcode(compoundOpcode);
CompoundInsn->addOperand(Rs);
CompoundInsn->addOperand(L.getOperand(2));
@@ -298,7 +299,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
compoundOpcode = cmpgtiBitOpcode[getCompoundOp(R)];
Rs = L.getOperand(1);
- CompoundInsn = new (Context) MCInst;
+ CompoundInsn = Context.createMCInst();
CompoundInsn->setOpcode(compoundOpcode);
CompoundInsn->addOperand(Rs);
CompoundInsn->addOperand(L.getOperand(2));
@@ -309,7 +310,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
LLVM_DEBUG(dbgs() << "CX: C2_cmpgtui\n");
Rs = L.getOperand(1);
compoundOpcode = cmpgtuiBitOpcode[getCompoundOp(R)];
- CompoundInsn = new (Context) MCInst;
+ CompoundInsn = Context.createMCInst();
CompoundInsn->setOpcode(compoundOpcode);
CompoundInsn->addOperand(Rs);
CompoundInsn->addOperand(L.getOperand(2));
@@ -320,7 +321,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
LLVM_DEBUG(dbgs() << "CX: S2_tstbit_i\n");
Rs = L.getOperand(1);
compoundOpcode = tstBitOpcode[getCompoundOp(R)];
- CompoundInsn = new (Context) MCInst;
+ CompoundInsn = Context.createMCInst();
CompoundInsn->setOpcode(compoundOpcode);
CompoundInsn->addOperand(Rs);
CompoundInsn->addOperand(R.getOperand(1));
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
index f9f342a07f6d..fa12fe1da448 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
@@ -110,7 +110,7 @@ HexagonMCInstrInfo::bundleInstructions(MCInstrInfo const &MCII,
iterator_range<MCInst::const_iterator>
HexagonMCInstrInfo::bundleInstructions(MCInst const &MCI) {
assert(isBundle(MCI));
- return make_range(MCI.begin() + bundleInstructionsOffset, MCI.end());
+ return drop_begin(MCI, bundleInstructionsOffset);
}
size_t HexagonMCInstrInfo::bundleSize(MCInst const &MCI) {
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 7514d0e67744..5e4138ae6e09 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -468,7 +468,8 @@ MCSubtargetInfo *Hexagon_MC::createHexagonMCSubtargetInfo(const Triple &TT,
StringRef CPUName = Features.first;
StringRef ArchFS = Features.second;
- MCSubtargetInfo *X = createHexagonMCSubtargetInfoImpl(TT, CPUName, ArchFS);
+ MCSubtargetInfo *X = createHexagonMCSubtargetInfoImpl(
+ TT, CPUName, /*TuneCPU*/ CPUName, ArchFS);
if (X != nullptr && (CPUName == "hexagonv67t"))
addArchSubtarget(X, ArchFS);
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
index 2788b86181e2..8a44ba32606e 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
@@ -326,7 +326,7 @@ bool HexagonShuffler::ValidResourceUsage(HexagonPacketSummary const &Summary) {
}
// Verify the CVI slot subscriptions.
- std::stable_sort(begin(), end(), HexagonInstr::lessCVI);
+ llvm::stable_sort(*this, HexagonInstr::lessCVI);
// create vector of hvx instructions to check
HVXInstsT hvxInsts;
hvxInsts.clear();
@@ -609,8 +609,7 @@ llvm::Optional<HexagonShuffler::HexagonPacket>
HexagonShuffler::tryAuction(HexagonPacketSummary const &Summary) const {
HexagonPacket PacketResult = Packet;
HexagonUnitAuction AuctionCore(Summary.ReservedSlotMask);
- std::stable_sort(PacketResult.begin(), PacketResult.end(),
- HexagonInstr::lessCore);
+ llvm::stable_sort(PacketResult, HexagonInstr::lessCore);
const bool ValidSlots =
llvm::all_of(insts(PacketResult), [&AuctionCore](HexagonInstr const &I) {
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFDeadCode.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFDeadCode.cpp
index 5a98debd3c00..894bdf38fe17 100644
--- a/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFDeadCode.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/RDFDeadCode.cpp
@@ -195,8 +195,7 @@ bool DeadCodeElimination::erase(const SetVector<NodeId> &Nodes) {
// If it's a code node, add all ref nodes from it.
uint16_t Kind = BA.Addr->getKind();
if (Kind == NodeAttrs::Stmt || Kind == NodeAttrs::Phi) {
- for (auto N : NodeAddr<CodeNode*>(BA).Addr->members(DFG))
- DRNs.push_back(N);
+ append_range(DRNs, NodeAddr<CodeNode*>(BA).Addr->members(DFG));
DINs.push_back(DFG.addr<InstrNode*>(I));
} else {
llvm_unreachable("Unexpected code node");
diff --git a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
index 32ccf7172594..b96e178109d0 100644
--- a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
@@ -1020,7 +1020,7 @@ SDValue LanaiTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue Size = Op.getOperand(1);
SDLoc DL(Op);
- unsigned SPReg = getStackPointerRegisterToSaveRestore();
+ Register SPReg = getStackPointerRegisterToSaveRestore();
// Get a reference to the stack pointer.
SDValue StackPointer = DAG.getCopyFromReg(Chain, DL, SPReg, MVT::i32);
@@ -1500,8 +1500,7 @@ void LanaiTargetLowering::computeKnownBitsForTargetNode(
KnownBits Known2;
Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
- Known.Zero &= Known2.Zero;
- Known.One &= Known2.One;
+ Known = KnownBits::commonBits(Known, Known2);
break;
}
}
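
Editor's illustration, not part of the patch: KnownBits::commonBits keeps only the bits known in both inputs, which is what the two replaced "&=" lines computed by hand. The example values below are arbitrary.

#include "llvm/Support/KnownBits.h"
#include <cassert>

using namespace llvm;

static void commonBitsExample() {
  KnownBits A(8), B(8);
  A.Zero = 0xF0; A.One = 0x01; // A is known to look like 0000???1
  B.Zero = 0xC0; B.One = 0x03; // B is known to look like 00????11
  KnownBits C = KnownBits::commonBits(A, B);
  // Only the jointly known bits survive: top two bits zero, lowest bit one.
  assert(C.Zero == 0xC0 && C.One == 0x01);
}
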
diff --git a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiSubtarget.cpp
index ebf91e08fbc8..d9d7847a0c5a 100644
--- a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiSubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiSubtarget.cpp
@@ -27,7 +27,7 @@ void LanaiSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
if (CPUName.empty())
CPUName = "generic";
- ParseSubtargetFeatures(CPUName, FS);
+ ParseSubtargetFeatures(CPUName, /*TuneCPU*/ CPUName, FS);
}
LanaiSubtarget &LanaiSubtarget::initializeSubtargetDependencies(StringRef CPU,
@@ -41,6 +41,6 @@ LanaiSubtarget::LanaiSubtarget(const Triple &TargetTriple, StringRef Cpu,
const TargetOptions & /*Options*/,
CodeModel::Model /*CodeModel*/,
CodeGenOpt::Level /*OptLevel*/)
- : LanaiGenSubtargetInfo(TargetTriple, Cpu, FeatureString),
+ : LanaiGenSubtargetInfo(TargetTriple, Cpu, /*TuneCPU*/ Cpu, FeatureString),
FrameLowering(initializeSubtargetDependencies(Cpu, FeatureString)),
InstrInfo(), TLInfo(TM, *this), TSInfo() {}
diff --git a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiSubtarget.h b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiSubtarget.h
index 116c83a4df91..7955bfe0d8b9 100644
--- a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiSubtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiSubtarget.h
@@ -17,7 +17,6 @@
#include "LanaiISelLowering.h"
#include "LanaiInstrInfo.h"
#include "LanaiSelectionDAGInfo.h"
-#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
@@ -38,7 +37,7 @@ public:
// ParseSubtargetFeatures - Parses features string setting specified
// subtarget options. Definition of function is auto generated by tblgen.
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
LanaiSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
diff --git a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
index 69387119f1f4..a31f59214ec7 100644
--- a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
@@ -48,9 +48,7 @@ static std::string computeDataLayout() {
}
static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
- if (!RM.hasValue())
- return Reloc::PIC_;
- return *RM;
+ return RM.getValueOr(Reloc::PIC_);
}
LanaiTargetMachine::LanaiTargetMachine(const Target &T, const Triple &TT,
diff --git a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiTargetMachine.h
index fb2bc0644fe8..00922f44f33a 100644
--- a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiTargetMachine.h
+++ b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiTargetMachine.h
@@ -13,12 +13,10 @@
#ifndef LLVM_LIB_TARGET_LANAI_LANAITARGETMACHINE_H
#define LLVM_LIB_TARGET_LANAI_LANAITARGETMACHINE_H
-#include "LanaiFrameLowering.h"
#include "LanaiISelLowering.h"
#include "LanaiInstrInfo.h"
#include "LanaiSelectionDAGInfo.h"
#include "LanaiSubtarget.h"
-#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
diff --git a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
index 7366d5059c9f..263f838e44a6 100644
--- a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
@@ -67,7 +67,8 @@ public:
}
int getIntImmCostInst(unsigned Opc, unsigned Idx, const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind) {
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst = nullptr) {
return getIntImmCost(Imm, Ty, CostKind);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
index 0fb27a926003..a17afe5e62f6 100644
--- a/contrib/llvm-project/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
@@ -69,11 +69,6 @@ public:
return Lanai::NumTargetFixupKinds;
}
- bool mayNeedRelaxation(const MCInst & /*Inst*/,
- const MCSubtargetInfo &STI) const override {
- return false;
- }
-
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
};
diff --git a/contrib/llvm-project/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h
index ce6df2969d73..f0d287c858d8 100644
--- a/contrib/llvm-project/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h
@@ -43,6 +43,7 @@ public:
void printMemImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
// Autogenerated by tblgen.
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
bool printAliasInstr(const MCInst *MI, uint64_t Address, raw_ostream &OS);
void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
diff --git a/contrib/llvm-project/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
index 2ff893273c92..e850b98de806 100644
--- a/contrib/llvm-project/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
@@ -56,7 +56,7 @@ createLanaiMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
if (CPUName.empty())
CPUName = "generic";
- return createLanaiMCSubtargetInfoImpl(TT, CPUName, FS);
+ return createLanaiMCSubtargetInfoImpl(TT, CPUName, /*TuneCPU*/ CPUName, FS);
}
static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
diff --git a/contrib/llvm-project/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
index 9529b5e802d5..f32418c5be55 100644
--- a/contrib/llvm-project/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
@@ -12,7 +12,6 @@
#include "TargetInfo/MSP430TargetInfo.h"
#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/StringSwitch.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
diff --git a/contrib/llvm-project/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
index 958212dc77c9..071e1484196b 100644
--- a/contrib/llvm-project/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
@@ -90,11 +90,6 @@ public:
return Infos[Kind - FirstTargetFixupKind];
}
- bool mayNeedRelaxation(const MCInst &Inst,
- const MCSubtargetInfo &STI) const override {
- return false;
- }
-
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
};
diff --git a/contrib/llvm-project/llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.h b/contrib/llvm-project/llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.h
index 6a6b07f2eba0..08c466377ee3 100644
--- a/contrib/llvm-project/llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.h
@@ -26,6 +26,7 @@ namespace llvm {
const MCSubtargetInfo &STI, raw_ostream &O) override;
// Autogenerated by tblgen.
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
bool printAliasInstr(const MCInst *MI, uint64_t Address, raw_ostream &O);
void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
diff --git a/contrib/llvm-project/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
index f207d24ce04b..c352ea563454 100644
--- a/contrib/llvm-project/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
@@ -44,7 +44,7 @@ static MCRegisterInfo *createMSP430MCRegisterInfo(const Triple &TT) {
static MCSubtargetInfo *
createMSP430MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
- return createMSP430MCSubtargetInfoImpl(TT, CPU, FS);
+ return createMSP430MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS);
}
static MCInstPrinter *createMSP430MCInstPrinter(const Triple &T,
diff --git a/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
index 821339f50355..9c6d44bf92de 100644
--- a/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -20,7 +20,6 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallingConv.h"
@@ -1209,9 +1208,8 @@ SDValue MSP430TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return SR;
} else {
SDValue Zero = DAG.getConstant(0, dl, VT);
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
SDValue Ops[] = {One, Zero, TargetCC, Flag};
- return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, Ops);
+ return DAG.getNode(MSP430ISD::SELECT_CC, dl, Op.getValueType(), Ops);
}
}
@@ -1227,10 +1225,9 @@ SDValue MSP430TargetLowering::LowerSELECT_CC(SDValue Op,
SDValue TargetCC;
SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
SDValue Ops[] = {TrueV, FalseV, TargetCC, Flag};
- return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, Ops);
+ return DAG.getNode(MSP430ISD::SELECT_CC, dl, Op.getValueType(), Ops);
}
SDValue MSP430TargetLowering::LowerSIGN_EXTEND(SDValue Op,
@@ -1392,14 +1389,15 @@ bool MSP430TargetLowering::isTruncateFree(Type *Ty1,
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
- return (Ty1->getPrimitiveSizeInBits() > Ty2->getPrimitiveSizeInBits());
+ return (Ty1->getPrimitiveSizeInBits().getFixedSize() >
+ Ty2->getPrimitiveSizeInBits().getFixedSize());
}
bool MSP430TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
if (!VT1.isInteger() || !VT2.isInteger())
return false;
- return (VT1.getSizeInBits() > VT2.getSizeInBits());
+ return (VT1.getFixedSizeInBits() > VT2.getFixedSizeInBits());
}
bool MSP430TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
diff --git a/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430Subtarget.cpp b/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
index 1f3c1d34f76f..5a117404d772 100644
--- a/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
@@ -47,7 +47,7 @@ MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
if (CPUName.empty())
CPUName = "msp430";
- ParseSubtargetFeatures(CPUName, FS);
+ ParseSubtargetFeatures(CPUName, /*TuneCPU*/ CPUName, FS);
if (HWMultModeOption != NoHWMult)
HWMultMode = HWMultModeOption;
@@ -57,5 +57,5 @@ MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
MSP430Subtarget::MSP430Subtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const TargetMachine &TM)
- : MSP430GenSubtargetInfo(TT, CPU, FS), FrameLowering(),
+ : MSP430GenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), FrameLowering(),
InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this) {}
diff --git a/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430Subtarget.h b/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430Subtarget.h
index 2348d984d7e2..079af2c75ec1 100644
--- a/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430Subtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430Subtarget.h
@@ -54,7 +54,7 @@ public:
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
bool hasHWMult16() const { return HWMultMode == HWMult16; }
bool hasHWMult32() const { return HWMultMode == HWMult32; }
diff --git a/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430TargetMachine.h b/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430TargetMachine.h
index 96fbc3ba0377..ef757dc7cb78 100644
--- a/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430TargetMachine.h
+++ b/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430TargetMachine.h
@@ -15,10 +15,10 @@
#define LLVM_LIB_TARGET_MSP430_MSP430TARGETMACHINE_H
#include "MSP430Subtarget.h"
-#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
+class StringRef;
/// MSP430TargetMachine
///
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 9dbbdeb34dba..e4d61f8c210e 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -352,8 +352,8 @@ class MipsAsmParser : public MCTargetAsmParser {
bool expandSaaAddr(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
- bool reportParseError(Twine ErrorMsg);
- bool reportParseError(SMLoc Loc, Twine ErrorMsg);
+ bool reportParseError(const Twine &ErrorMsg);
+ bool reportParseError(SMLoc Loc, const Twine &ErrorMsg);
bool parseMemOffset(const MCExpr *&Res, bool isParenExpr);
@@ -6982,12 +6982,12 @@ bool MipsAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// FIXME: Given that these have the same name, these should both be
// consistent on affecting the Parser.
-bool MipsAsmParser::reportParseError(Twine ErrorMsg) {
+bool MipsAsmParser::reportParseError(const Twine &ErrorMsg) {
SMLoc Loc = getLexer().getLoc();
return Error(Loc, ErrorMsg);
}
-bool MipsAsmParser::reportParseError(SMLoc Loc, Twine ErrorMsg) {
+bool MipsAsmParser::reportParseError(SMLoc Loc, const Twine &ErrorMsg) {
return Error(Loc, ErrorMsg);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
index 37e970f2f15b..3315a8ba18d6 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
@@ -9,7 +9,6 @@
#include "MipsABIInfo.h"
#include "MipsRegisterInfo.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSwitch.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/Support/CommandLine.h"
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/contrib/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index 1126b871cb11..16c7befb2670 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -53,15 +53,6 @@ public:
/// @name Target Relaxation Interfaces
/// @{
- /// MayNeedRelaxation - Check whether the given instruction may need
- /// relaxation.
- ///
- /// \param Inst - The instruction to test.
- bool mayNeedRelaxation(const MCInst &Inst,
- const MCSubtargetInfo &STI) const override {
- return false;
- }
-
/// fixupNeedsRelaxation - Target specific predicate for whether a given
/// fixup requires the associated instruction to be relaxed.
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h
index 3f534a2f1843..68b13bf1fcc3 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h
@@ -79,6 +79,7 @@ public:
: MCInstPrinter(MAI, MII, MRI) {}
// Autogenerated by tblgen.
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index de582bd60cbf..454f79926dd0 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -77,7 +77,7 @@ static MCRegisterInfo *createMipsMCRegisterInfo(const Triple &TT) {
static MCSubtargetInfo *createMipsMCSubtargetInfo(const Triple &TT,
StringRef CPU, StringRef FS) {
CPU = MIPS_MC::selectMipsCPU(TT, CPU);
- return createMipsMCSubtargetInfoImpl(TT, CPU, FS);
+ return createMipsMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS);
}
static MCAsmInfo *createMipsMCAsmInfo(const MCRegisterInfo &MRI,
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/Mips.td b/contrib/llvm-project/llvm/lib/Target/Mips/Mips.td
index 7fe750249c58..792960332bcc 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/Mips.td
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/Mips.td
@@ -191,7 +191,7 @@ def FeatureUseTCCInDIV : SubtargetFeature<
"UseTCCInDIV", "false",
"Force the assembler to use trapping">;
-def FeatureMadd4
+def FeatureNoMadd4
: SubtargetFeature<"nomadd4", "DisableMadd4", "true",
"Disable 4-operand madd.fmt and related instructions">;
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index cc073fbf5231..b460bc71b11f 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -1233,7 +1233,7 @@ void MipsAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) {
.addImm(0x34));
}
- recordSled(CurSled, MI, Kind);
+ recordSled(CurSled, MI, Kind, 2);
}
void MipsAsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI) {
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsCallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsCallLowering.cpp
index cffd99affac1..377aa4825b43 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsCallLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsCallLowering.cpp
@@ -87,9 +87,10 @@ bool MipsCallLowering::MipsHandler::handle(
}
namespace {
-class IncomingValueHandler : public MipsCallLowering::MipsHandler {
+class MipsIncomingValueHandler : public MipsCallLowering::MipsHandler {
public:
- IncomingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
+ MipsIncomingValueHandler(MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI)
: MipsHandler(MIRBuilder, MRI) {}
private:
@@ -117,11 +118,11 @@ private:
}
};
-class CallReturnHandler : public IncomingValueHandler {
+class CallReturnHandler : public MipsIncomingValueHandler {
public:
CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
MachineInstrBuilder &MIB)
- : IncomingValueHandler(MIRBuilder, MRI), MIB(MIB) {}
+ : MipsIncomingValueHandler(MIRBuilder, MRI), MIB(MIB) {}
private:
void markPhysRegUsed(unsigned PhysReg) override {
@@ -133,9 +134,9 @@ private:
} // end anonymous namespace
-void IncomingValueHandler::assignValueToReg(Register ValVReg,
- const CCValAssign &VA,
- const EVT &VT) {
+void MipsIncomingValueHandler::assignValueToReg(Register ValVReg,
+ const CCValAssign &VA,
+ const EVT &VT) {
Register PhysReg = VA.getLocReg();
if (VT == MVT::f64 && PhysReg >= Mips::A0 && PhysReg <= Mips::A3) {
const MipsSubtarget &STI =
@@ -167,8 +168,8 @@ void IncomingValueHandler::assignValueToReg(Register ValVReg,
}
}
-Register IncomingValueHandler::getStackAddress(const CCValAssign &VA,
- MachineMemOperand *&MMO) {
+Register MipsIncomingValueHandler::getStackAddress(const CCValAssign &VA,
+ MachineMemOperand *&MMO) {
MachineFunction &MF = MIRBuilder.getMF();
unsigned Size = alignTo(VA.getValVT().getSizeInBits(), 8) / 8;
unsigned Offset = VA.getLocMemOffset();
@@ -186,8 +187,8 @@ Register IncomingValueHandler::getStackAddress(const CCValAssign &VA,
return MIRBuilder.buildFrameIndex(LLT::pointer(0, 32), FI).getReg(0);
}
-void IncomingValueHandler::assignValueToAddress(Register ValVReg,
- const CCValAssign &VA) {
+void MipsIncomingValueHandler::assignValueToAddress(Register ValVReg,
+ const CCValAssign &VA) {
if (VA.getLocInfo() == CCValAssign::SExt ||
VA.getLocInfo() == CCValAssign::ZExt ||
VA.getLocInfo() == CCValAssign::AExt) {
@@ -197,10 +198,10 @@ void IncomingValueHandler::assignValueToAddress(Register ValVReg,
buildLoad(ValVReg, VA);
}
-bool IncomingValueHandler::handleSplit(SmallVectorImpl<Register> &VRegs,
- ArrayRef<CCValAssign> ArgLocs,
- unsigned ArgLocsStartIndex,
- Register ArgsReg, const EVT &VT) {
+bool MipsIncomingValueHandler::handleSplit(SmallVectorImpl<Register> &VRegs,
+ ArrayRef<CCValAssign> ArgLocs,
+ unsigned ArgLocsStartIndex,
+ Register ArgsReg, const EVT &VT) {
if (!assignVRegs(VRegs, ArgLocs, ArgLocsStartIndex, VT))
return false;
setLeastSignificantFirst(VRegs);
@@ -209,10 +210,10 @@ bool IncomingValueHandler::handleSplit(SmallVectorImpl<Register> &VRegs,
}
namespace {
-class OutgoingValueHandler : public MipsCallLowering::MipsHandler {
+class MipsOutgoingValueHandler : public MipsCallLowering::MipsHandler {
public:
- OutgoingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- MachineInstrBuilder &MIB)
+ MipsOutgoingValueHandler(MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI, MachineInstrBuilder &MIB)
: MipsHandler(MIRBuilder, MRI), MIB(MIB) {}
private:
@@ -234,9 +235,9 @@ private:
};
} // end anonymous namespace
-void OutgoingValueHandler::assignValueToReg(Register ValVReg,
- const CCValAssign &VA,
- const EVT &VT) {
+void MipsOutgoingValueHandler::assignValueToReg(Register ValVReg,
+ const CCValAssign &VA,
+ const EVT &VT) {
Register PhysReg = VA.getLocReg();
if (VT == MVT::f64 && PhysReg >= Mips::A0 && PhysReg <= Mips::A3) {
const MipsSubtarget &STI =
@@ -254,8 +255,8 @@ void OutgoingValueHandler::assignValueToReg(Register ValVReg,
}
}
-Register OutgoingValueHandler::getStackAddress(const CCValAssign &VA,
- MachineMemOperand *&MMO) {
+Register MipsOutgoingValueHandler::getStackAddress(const CCValAssign &VA,
+ MachineMemOperand *&MMO) {
MachineFunction &MF = MIRBuilder.getMF();
const TargetFrameLowering *TFL = MF.getSubtarget().getFrameLowering();
@@ -278,16 +279,16 @@ Register OutgoingValueHandler::getStackAddress(const CCValAssign &VA,
return AddrReg.getReg(0);
}
-void OutgoingValueHandler::assignValueToAddress(Register ValVReg,
- const CCValAssign &VA) {
+void MipsOutgoingValueHandler::assignValueToAddress(Register ValVReg,
+ const CCValAssign &VA) {
MachineMemOperand *MMO;
Register Addr = getStackAddress(VA, MMO);
Register ExtReg = extendRegister(ValVReg, VA);
MIRBuilder.buildStore(ExtReg, Addr, *MMO);
}
-Register OutgoingValueHandler::extendRegister(Register ValReg,
- const CCValAssign &VA) {
+Register MipsOutgoingValueHandler::extendRegister(Register ValReg,
+ const CCValAssign &VA) {
LLT LocTy{VA.getLocVT()};
switch (VA.getLocInfo()) {
case CCValAssign::SExt: {
@@ -308,10 +309,10 @@ Register OutgoingValueHandler::extendRegister(Register ValReg,
llvm_unreachable("unable to extend register");
}
-bool OutgoingValueHandler::handleSplit(SmallVectorImpl<Register> &VRegs,
- ArrayRef<CCValAssign> ArgLocs,
- unsigned ArgLocsStartIndex,
- Register ArgsReg, const EVT &VT) {
+bool MipsOutgoingValueHandler::handleSplit(SmallVectorImpl<Register> &VRegs,
+ ArrayRef<CCValAssign> ArgLocs,
+ unsigned ArgLocsStartIndex,
+ Register ArgsReg, const EVT &VT) {
MIRBuilder.buildUnmerge(VRegs, ArgsReg);
setLeastSignificantFirst(VRegs);
if (!assignVRegs(VRegs, ArgLocs, ArgLocsStartIndex, VT))
@@ -346,7 +347,7 @@ static CCValAssign::LocInfo determineLocInfo(const MVT RegisterVT, const EVT VT,
const ISD::ArgFlagsTy &Flags) {
// > does not mean loss of information as type RegisterVT can't hold type VT,
// it means that type VT is split into multiple registers of type RegisterVT
- if (VT.getSizeInBits() >= RegisterVT.getSizeInBits())
+ if (VT.getFixedSizeInBits() >= RegisterVT.getFixedSizeInBits())
return CCValAssign::LocInfo::Full;
if (Flags.isSExt())
return CCValAssign::LocInfo::SExt;
@@ -373,8 +374,8 @@ static void setLocInfo(SmallVectorImpl<CCValAssign> &ArgLocs,
}
bool MipsCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
- const Value *Val,
- ArrayRef<Register> VRegs) const {
+ const Value *Val, ArrayRef<Register> VRegs,
+ FunctionLoweringInfo &FLI) const {
MachineInstrBuilder Ret = MIRBuilder.buildInstrNoInsert(Mips::RetRA);
@@ -403,7 +404,7 @@ bool MipsCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
CCInfo.AnalyzeReturn(Outs, TLI.CCAssignFnForReturn());
setLocInfo(ArgLocs, Outs);
- OutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret);
+ MipsOutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret);
if (!RetHandler.handle(ArgLocs, RetInfos)) {
return false;
}
@@ -412,9 +413,10 @@ bool MipsCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
return true;
}
-bool MipsCallLowering::lowerFormalArguments(
- MachineIRBuilder &MIRBuilder, const Function &F,
- ArrayRef<ArrayRef<Register>> VRegs) const {
+bool MipsCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+ const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs,
+ FunctionLoweringInfo &FLI) const {
// Quick exit if there aren't any args.
if (F.arg_empty())
@@ -455,7 +457,7 @@ bool MipsCallLowering::lowerFormalArguments(
CCInfo.AnalyzeFormalArguments(Ins, TLI.CCAssignFnForCall());
setLocInfo(ArgLocs, Ins);
- IncomingValueHandler Handler(MIRBuilder, MF.getRegInfo());
+ MipsIncomingValueHandler Handler(MIRBuilder, MF.getRegInfo());
if (!Handler.handle(ArgLocs, ArgInfos))
return false;
@@ -579,7 +581,7 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
CCInfo.AnalyzeCallOperands(Outs, TLI.CCAssignFnForCall(), FuncOrigArgs, Call);
setLocInfo(ArgLocs, Outs);
- OutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), MIB);
+ MipsOutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), MIB);
if (!RetHandler.handle(ArgLocs, ArgInfos)) {
return false;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsCallLowering.h b/contrib/llvm-project/llvm/lib/Target/Mips/MipsCallLowering.h
index a284cf5e26cf..1c1c2080a76a 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsCallLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsCallLowering.h
@@ -18,6 +18,7 @@
namespace llvm {
+class MachineMemOperand;
class MipsTargetLowering;
class MipsCallLowering : public CallLowering {
@@ -63,10 +64,12 @@ public:
MipsCallLowering(const MipsTargetLowering &TLI);
bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
- ArrayRef<Register> VRegs) const override;
+ ArrayRef<Register> VRegs,
+ FunctionLoweringInfo &FLI) const override;
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
- ArrayRef<ArrayRef<Register>> VRegs) const override;
+ ArrayRef<ArrayRef<Register>> VRegs,
+ FunctionLoweringInfo &FLI) const override;
bool lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const override;
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
index faf7160e63e2..8e619549f01c 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -552,7 +552,7 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
const DataLayout &TD = MF->getDataLayout();
for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
- unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
+ unsigned Size = CPs[i].getSizeInBytes(TD);
assert(Size >= 4 && "Too small constant pool entry");
Align Alignment = CPs[i].getAlign();
// Verify that all constant pool entries are a multiple of their alignment.
@@ -593,12 +593,7 @@ static bool BBHasFallthrough(MachineBasicBlock *MBB) {
return false;
MachineBasicBlock *NextBB = &*std::next(MBBI);
- for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
- E = MBB->succ_end(); I != E; ++I)
- if (*I == NextBB)
- return true;
-
- return false;
+ return llvm::is_contained(MBB->successors(), NextBB);
}
/// findConstPoolEntry - Given the constpool index and CONSTPOOL_ENTRY MI,
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
index 155d19ba6959..797d81204305 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -182,7 +182,7 @@ namespace {
/// memory instruction can be moved to a delay slot.
class MemDefsUses : public InspectMemInstr {
public:
- MemDefsUses(const DataLayout &DL, const MachineFrameInfo *MFI);
+ explicit MemDefsUses(const MachineFrameInfo *MFI);
private:
using ValueType = PointerUnion<const Value *, const PseudoSourceValue *>;
@@ -200,7 +200,6 @@ namespace {
const MachineFrameInfo *MFI;
SmallPtrSet<ValueType, 4> Uses, Defs;
- const DataLayout &DL;
/// Flags indicating whether loads or stores with no underlying objects have
/// been seen.
@@ -492,8 +491,8 @@ bool LoadFromStackOrConst::hasHazard_(const MachineInstr &MI) {
return true;
}
-MemDefsUses::MemDefsUses(const DataLayout &DL, const MachineFrameInfo *MFI_)
- : InspectMemInstr(false), MFI(MFI_), DL(DL) {}
+MemDefsUses::MemDefsUses(const MachineFrameInfo *MFI_)
+ : InspectMemInstr(false), MFI(MFI_) {}
bool MemDefsUses::hasHazard_(const MachineInstr &MI) {
bool HasHazard = false;
@@ -542,7 +541,7 @@ getUnderlyingObjects(const MachineInstr &MI,
if (const Value *V = MMO.getValue()) {
SmallVector<const Value *, 4> Objs;
- GetUnderlyingObjects(V, Objs, DL);
+ ::getUnderlyingObjects(V, Objs);
for (const Value *UValue : Objs) {
if (!isIdentifiedObject(V))
@@ -566,7 +565,11 @@ Iter MipsDelaySlotFiller::replaceWithCompactBranch(MachineBasicBlock &MBB,
unsigned NewOpcode = TII->getEquivalentCompactForm(Branch);
Branch = TII->genInstrWithNewOpc(NewOpcode, Branch);
- std::next(Branch)->eraseFromParent();
+ auto *ToErase = cast<MachineInstr>(&*std::next(Branch));
+ // Update call site info for the Branch.
+ if (ToErase->shouldUpdateCallSiteInfo())
+ ToErase->getMF()->moveCallSiteInfo(ToErase, cast<MachineInstr>(&*Branch));
+ ToErase->eraseFromParent();
return Branch;
}
@@ -775,7 +778,7 @@ bool MipsDelaySlotFiller::searchBackward(MachineBasicBlock &MBB,
auto *Fn = MBB.getParent();
RegDefsUses RegDU(*Fn->getSubtarget().getRegisterInfo());
- MemDefsUses MemDU(Fn->getDataLayout(), &Fn->getFrameInfo());
+ MemDefsUses MemDU(&Fn->getFrameInfo());
ReverseIter Filler;
RegDU.init(Slot);
@@ -851,7 +854,7 @@ bool MipsDelaySlotFiller::searchSuccBBs(MachineBasicBlock &MBB,
IM.reset(new LoadFromStackOrConst());
} else {
const MachineFrameInfo &MFI = Fn->getFrameInfo();
- IM.reset(new MemDefsUses(Fn->getDataLayout(), &MFI));
+ IM.reset(new MemDefsUses(&MFI));
}
if (!searchRange(MBB, SuccBB->begin(), SuccBB->end(), RegDU, *IM, Slot,
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsExpandPseudo.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsExpandPseudo.cpp
index b1abf4a33717..f72dc1da4131 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsExpandPseudo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsExpandPseudo.cpp
@@ -733,10 +733,10 @@ bool MipsExpandPseudo::expandAtomicBinOp(MachineBasicBlock &BB,
assert(I->getNumOperands() == 5 &&
"Atomics min|max|umin|umax use an additional register");
- Register Scratch2 = I->getOperand(4).getReg();
+ MCRegister Scratch2 = I->getOperand(4).getReg().asMCReg();
// On Mips64 result of slt is GPR32.
- Register Scratch2_32 =
+ MCRegister Scratch2_32 =
(Size == 8) ? STI->getRegisterInfo()->getSubReg(Scratch2, Mips::sub_32)
: Scratch2;
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 2da35020006e..8b599bca3915 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -134,7 +134,7 @@ unsigned MipsTargetLowering::getVectorTypeBreakdownForCallingConv(
// Break down vector types to either 2 i64s or 4 i32s.
RegisterVT = getRegisterTypeForCallingConv(Context, CC, VT);
IntermediateVT = RegisterVT;
- NumIntermediates = VT.getSizeInBits() < RegisterVT.getSizeInBits()
+ NumIntermediates = VT.getFixedSizeInBits() < RegisterVT.getFixedSizeInBits()
? VT.getVectorNumElements()
: VT.getSizeInBits() / RegisterVT.getSizeInBits();
@@ -1198,17 +1198,6 @@ bool MipsTargetLowering::shouldFoldConstantShiftPairToMask(
}
void
-MipsTargetLowering::LowerOperationWrapper(SDNode *N,
- SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG) const {
- SDValue Res = LowerOperation(SDValue(N, 0), DAG);
-
- if (Res)
- for (unsigned I = 0, E = Res->getNumValues(); I != E; ++I)
- Results.push_back(Res.getValue(I));
-}
-
-void
MipsTargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
@@ -3025,8 +3014,8 @@ SDValue MipsTargetLowering::passArgOnStack(SDValue StackPtr, unsigned Offset,
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
int FI = MFI.CreateFixedObject(Arg.getValueSizeInBits() / 8, Offset, false);
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
- return DAG.getStore(Chain, DL, Arg, FIN, MachinePointerInfo(),
- /* Alignment = */ 0, MachineMemOperand::MOVolatile);
+ return DAG.getStore(Chain, DL, Arg, FIN, MachinePointerInfo(), MaybeAlign(),
+ MachineMemOperand::MOVolatile);
}
void MipsTargetLowering::
@@ -4404,7 +4393,7 @@ void MipsTargetLowering::passByValArg(
SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
DAG.getConstant(OffsetInBytes, DL, PtrTy));
SDValue LoadVal = DAG.getLoad(RegTy, DL, Chain, LoadPtr,
- MachinePointerInfo(), Alignment.value());
+ MachinePointerInfo(), Alignment);
MemOpChains.push_back(LoadVal.getValue(1));
unsigned ArgReg = ArgRegs[FirstReg + I];
RegsToPass.push_back(std::make_pair(ArgReg, LoadVal));
@@ -4431,7 +4420,7 @@ void MipsTargetLowering::passByValArg(
PtrTy));
SDValue LoadVal = DAG.getExtLoad(
ISD::ZEXTLOAD, DL, RegTy, Chain, LoadPtr, MachinePointerInfo(),
- MVT::getIntegerVT(LoadSizeInBytes * 8), Alignment.value());
+ MVT::getIntegerVT(LoadSizeInBytes * 8), Alignment);
MemOpChains.push_back(LoadVal.getValue(1));
// Shift the loaded value.
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsISelLowering.h b/contrib/llvm-project/llvm/lib/Target/Mips/MipsISelLowering.h
index 16b4d51d3ca6..3820c42ba8aa 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsISelLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsISelLowering.h
@@ -40,8 +40,6 @@
namespace llvm {
class Argument;
-class CCState;
-class CCValAssign;
class FastISel;
class FunctionLoweringInfo;
class MachineBasicBlock;
@@ -316,10 +314,6 @@ class TargetRegisterClass;
return ISD::SIGN_EXTEND;
}
- void LowerOperationWrapper(SDNode *N,
- SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG) const override;
-
/// LowerOperation - Provide custom lowering hooks for some operations.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
@@ -365,14 +359,6 @@ class TargetRegisterClass;
return ABI.IsN64() ? Mips::A1_64 : Mips::A1;
}
- /// Returns true if a cast between SrcAS and DestAS is a noop.
- bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
- // Mips doesn't have any special address spaces so we just reserve
- // the first 256 for software use (e.g. OpenCL) and treat casts
- // between them as noops.
- return SrcAS < 256 && DestAS < 256;
- }
-
bool isJumpTableRelative() const override {
return getTargetMachine().isPositionIndependent();
}
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsInstrFPU.td b/contrib/llvm-project/llvm/lib/Target/Mips/MipsInstrFPU.td
index 5696df96e798..14590ddacfcb 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsInstrFPU.td
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsInstrFPU.td
@@ -452,6 +452,12 @@ let AdditionalPredicates = [NotInMicroMips] in {
let DecoderNamespace = "MipsFP64" in {
let AdditionalPredicates = [NotInMicroMips] in {
+ def FADD_PS64 : ADDS_FT<"add.ps", FGR64Opnd, II_ADD_PS, 0>,
+ ADDS_FM<0x0, 22>,
+ ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
+ def FMUL_PS64 : ADDS_FT<"mul.ps", FGR64Opnd, II_MUL_PS, 0>,
+ ADDS_FM<0x2, 22>,
+ ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
def PLL_PS64 : ADDS_FT<"pll.ps", FGR64Opnd, II_CVT, 0>,
ADDS_FM<0x2C, 22>,
ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
@@ -464,6 +470,9 @@ let DecoderNamespace = "MipsFP64" in {
def PUU_PS64 : ADDS_FT<"puu.ps", FGR64Opnd, II_CVT, 0>,
ADDS_FM<0x2F, 22>,
ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
+ def FSUB_PS64 : ADDS_FT<"sub.ps", FGR64Opnd, II_SUB_PS, 0>,
+ ADDS_FM<0x1, 22>,
+ ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
def CVT_S_PU64 : ABSS_FT<"cvt.s.pu", FGR32Opnd, FGR64Opnd, II_CVT>,
ABSS_FM<0x20, 22>,
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsInstrInfo.cpp
index 0c6080258a3a..94828a976695 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsInstrInfo.cpp
@@ -894,4 +894,4 @@ Optional<RegImmPair> MipsInstrInfo::isAddImmediate(const MachineInstr &MI,
}
}
return None;
-} \ No newline at end of file
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/Mips/MipsInstrInfo.td
index a3b928870f3f..089fed9ec0bf 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsInstrInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsInstrInfo.td
@@ -242,7 +242,7 @@ def HasEVA : Predicate<"Subtarget->hasEVA()">,
def HasMSA : Predicate<"Subtarget->hasMSA()">,
AssemblerPredicate<(all_of FeatureMSA)>;
def HasMadd4 : Predicate<"!Subtarget->disableMadd4()">,
- AssemblerPredicate<(all_of (not FeatureMadd4))>;
+ AssemblerPredicate<(all_of (not FeatureNoMadd4))>;
def HasMT : Predicate<"Subtarget->hasMT()">,
AssemblerPredicate<(all_of FeatureMT)>;
def UseIndirectJumpsHazard : Predicate<"Subtarget->useIndirectJumpsHazard()">,
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp
index b489c8137769..2692c08b93de 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp
@@ -322,6 +322,8 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
getActionDefinitionsBuilder(G_SEXT_INREG).lower();
+ getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall();
+
computeTables();
verify(*ST.getInstrInfo());
}
@@ -500,7 +502,6 @@ static bool MSA2OpIntrinsicToGeneric(MachineInstr &MI, unsigned Opcode,
bool MipsLegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI) const {
MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
- MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
const MipsSubtarget &ST =
static_cast<const MipsSubtarget &>(MI.getMF()->getSubtarget());
const MipsInstrInfo &TII = *ST.getInstrInfo();
@@ -508,14 +509,6 @@ bool MipsLegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
const RegisterBankInfo &RBI = *ST.getRegBankInfo();
switch (MI.getIntrinsicID()) {
- case Intrinsic::memcpy:
- case Intrinsic::memset:
- case Intrinsic::memmove:
- if (createMemLibcall(MIRBuilder, MRI, MI) ==
- LegalizerHelper::UnableToLegalize)
- return false;
- MI.eraseFromParent();
- return true;
case Intrinsic::trap: {
MachineInstr *Trap = MIRBuilder.buildInstr(Mips::TRAP);
MI.eraseFromParent();
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
index 6325e513f9f8..3101820d476e 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
@@ -716,10 +716,10 @@ void MipsRegisterBankInfo::setRegBank(MachineInstr &MI,
static void
combineAwayG_UNMERGE_VALUES(LegalizationArtifactCombiner &ArtCombiner,
- MachineInstr &MI, GISelObserverWrapper &Observer) {
+ MachineInstr &MI, GISelChangeObserver &Observer) {
SmallVector<Register, 4> UpdatedDefs;
SmallVector<MachineInstr *, 2> DeadInstrs;
- ArtCombiner.tryCombineMerges(MI, DeadInstrs, UpdatedDefs, Observer);
+ ArtCombiner.tryCombineUnmergeValues(MI, DeadInstrs, UpdatedDefs, Observer);
for (MachineInstr *DeadMI : DeadInstrs)
DeadMI->eraseFromParent();
}
@@ -728,14 +728,13 @@ void MipsRegisterBankInfo::applyMappingImpl(
const OperandsMapper &OpdMapper) const {
MachineInstr &MI = OpdMapper.getMI();
InstListTy NewInstrs;
- MachineIRBuilder B(MI);
MachineFunction *MF = MI.getMF();
MachineRegisterInfo &MRI = OpdMapper.getMRI();
const LegalizerInfo &LegInfo = *MF->getSubtarget().getLegalizerInfo();
InstManager NewInstrObserver(NewInstrs);
- GISelObserverWrapper WrapperObserver(&NewInstrObserver);
- LegalizerHelper Helper(*MF, WrapperObserver, B);
+ MachineIRBuilder B(MI, NewInstrObserver);
+ LegalizerHelper Helper(*MF, NewInstrObserver, B);
LegalizationArtifactCombiner ArtCombiner(B, MF->getRegInfo(), LegInfo);
switch (MI.getOpcode()) {
@@ -752,7 +751,7 @@ void MipsRegisterBankInfo::applyMappingImpl(
// not be considered for regbank selection. RegBankSelect for mips
// visits/makes corresponding G_MERGE first. Combine them here.
if (NewMI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES)
- combineAwayG_UNMERGE_VALUES(ArtCombiner, *NewMI, WrapperObserver);
+ combineAwayG_UNMERGE_VALUES(ArtCombiner, *NewMI, NewInstrObserver);
// This G_MERGE will be combined away when its corresponding G_UNMERGE
// gets regBankSelected.
else if (NewMI->getOpcode() == TargetOpcode::G_MERGE_VALUES)
@@ -764,7 +763,7 @@ void MipsRegisterBankInfo::applyMappingImpl(
return;
}
case TargetOpcode::G_UNMERGE_VALUES:
- combineAwayG_UNMERGE_VALUES(ArtCombiner, MI, WrapperObserver);
+ combineAwayG_UNMERGE_VALUES(ArtCombiner, MI, NewInstrObserver);
return;
default:
break;
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsRegisterBankInfo.h b/contrib/llvm-project/llvm/lib/Target/Mips/MipsRegisterBankInfo.h
index 55eeaf096b14..df51606e1e8a 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsRegisterBankInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsRegisterBankInfo.h
@@ -150,7 +150,7 @@ private:
class TypeInfoForMF {
/// MachineFunction name is used to recognise when MF changes.
- std::string MFName = "";
+ std::string MFName;
/// <key, value> : value is vector of all MachineInstrs that are waiting for
/// key to figure out type of some of its ambiguous operands.
DenseMap<const MachineInstr *, SmallVector<const MachineInstr *, 2>>
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
index a657bb44ac78..f31ba06a1e7c 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -774,9 +774,9 @@ void MipsSEFrameLowering::emitInterruptEpilogueStub(
.addImm(0);
}
-int MipsSEFrameLowering::getFrameIndexReference(const MachineFunction &MF,
- int FI,
- Register &FrameReg) const {
+StackOffset
+MipsSEFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
MipsABIInfo ABI = STI.getABI();
@@ -785,8 +785,9 @@ int MipsSEFrameLowering::getFrameIndexReference(const MachineFunction &MF,
else
FrameReg = hasBP(MF) ? ABI.GetBasePtr() : ABI.GetStackPtr();
- return MFI.getObjectOffset(FI) + MFI.getStackSize() -
- getOffsetOfLocalArea() + MFI.getOffsetAdjustment();
+ return StackOffset::getFixed(MFI.getObjectOffset(FI) + MFI.getStackSize() -
+ getOffsetOfLocalArea() +
+ MFI.getOffsetAdjustment());
}
bool MipsSEFrameLowering::spillCalleeSavedRegisters(
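
getFrameIndexReference now hands back a StackOffset (declared in llvm/Support/TypeSize.h, included in the MipsSEFrameLowering.h hunk that follows) instead of a raw int; only the fixed component is populated, since Mips has no scalable stack objects. A rough standalone mimic of the pattern, with a deliberately simplified, hypothetical FrameOffset type in place of the real class:

#include <cassert>
#include <cstdint>

// Hypothetical, simplified stand-in for llvm::StackOffset; the real class
// also carries a scalable component, which is not modelled here.
class FrameOffset {
  int64_t Fixed = 0;
  explicit FrameOffset(int64_t F) : Fixed(F) {}

public:
  static FrameOffset getFixed(int64_t F) { return FrameOffset(F); }
  int64_t getFixed() const { return Fixed; }
};

// Returning a typed offset rather than a bare int keeps callers from mixing
// up fixed and (elsewhere) scalable parts of a frame offset.
static FrameOffset getFrameIndexReference(int64_t ObjectOffset,
                                          int64_t StackSize,
                                          int64_t LocalAreaOffset,
                                          int64_t Adjustment) {
  return FrameOffset::getFixed(ObjectOffset + StackSize - LocalAreaOffset +
                               Adjustment);
}

int main() {
  assert(getFrameIndexReference(-8, 32, 0, 0).getFixed() == 24);
  return 0;
}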
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.h b/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.h
index c818a65f5b14..bed2776c28da 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.h
@@ -10,6 +10,7 @@
#define LLVM_LIB_TARGET_MIPS_MIPSSEFRAMELOWERING_H
#include "MipsFrameLowering.h"
+#include "llvm/Support/TypeSize.h"
#include <vector>
namespace llvm {
@@ -27,8 +28,8 @@ public:
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
- int getFrameIndexReference(const MachineFunction &MF, int FI,
- Register &FrameReg) const override;
+ StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const override;
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
index bdf29c53cbd5..4a448a5f7c68 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -2307,7 +2307,7 @@ static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr,
Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset);
return DAG.getLoad(ResTy, DL, ChainIn, Address, MachinePointerInfo(),
- /* Alignment = */ 16);
+ Align(16));
}
SDValue MipsSETargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
@@ -2382,7 +2382,7 @@ static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr,
Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset);
return DAG.getStore(ChainIn, DL, Value, Address, MachinePointerInfo(),
- /* Alignment = */ 16);
+ Align(16));
}
SDValue MipsSETargetLowering::lowerINTRINSIC_VOID(SDValue Op,
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsSchedule.td b/contrib/llvm-project/llvm/lib/Target/Mips/MipsSchedule.td
index 568c85af655d..3a5b3fe3b34b 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsSchedule.td
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsSchedule.td
@@ -26,6 +26,7 @@ def II_ADDIUPC : InstrItinClass;
def II_ADD : InstrItinClass;
def II_ADDU : InstrItinClass;
def II_ADD_D : InstrItinClass;
+def II_ADD_PS : InstrItinClass;
def II_ADD_S : InstrItinClass;
def II_ADDR_PS : InstrItinClass;
def II_ALIGN : InstrItinClass;
@@ -279,6 +280,7 @@ def II_MUL : InstrItinClass;
def II_MUH : InstrItinClass;
def II_MUHU : InstrItinClass;
def II_MULU : InstrItinClass;
+def II_MUL_PS : InstrItinClass;
def II_MULR_PS : InstrItinClass;
def II_MULT : InstrItinClass;
def II_MULTU : InstrItinClass;
@@ -341,6 +343,7 @@ def II_SRLV : InstrItinClass;
def II_SUB : InstrItinClass;
def II_SUBU : InstrItinClass;
def II_SUB_D : InstrItinClass;
+def II_SUB_PS : InstrItinClass;
def II_SUB_S : InstrItinClass;
def II_SUXC1 : InstrItinClass;
def II_SW : InstrItinClass;
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsScheduleGeneric.td b/contrib/llvm-project/llvm/lib/Target/Mips/MipsScheduleGeneric.td
index 3888ca4e82f5..f076f2f9cf10 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsScheduleGeneric.td
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsScheduleGeneric.td
@@ -829,10 +829,11 @@ def : InstRW<[GenericWriteFPUL], (instrs ADDR_PS64,
CVT_L_S, CVT_S_D32, CVT_S_D64, CVT_S_L,
CVT_S_W, CVT_W_D32, CVT_W_D64, CVT_W_S,
CVT_PS_S64, CVT_S_PL64, CVT_S_PU64,
- CVT_PS_PW64, CVT_PW_PS64,
+ CVT_PS_PW64, CVT_PW_PS64, FADD_PS64,
FLOOR_L_D64, FLOOR_L_S, FLOOR_W_D32,
FLOOR_W_D64, FLOOR_W_S, FMUL_D32, FMUL_D64,
- MADD_D32, MADD_D64, MSUB_D32, MSUB_D64, MULR_PS64,
+ FMUL_PS64, FSUB_PS64, MADD_D32, MADD_D64,
+ MSUB_D32, MSUB_D64, MULR_PS64,
NMADD_D32, NMADD_D64, NMSUB_D32, NMSUB_D64,
PLL_PS64, PLU_PS64, PUL_PS64, PUU_PS64,
ROUND_L_D64, ROUND_L_S, ROUND_W_D32,
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsScheduleP5600.td b/contrib/llvm-project/llvm/lib/Target/Mips/MipsScheduleP5600.td
index 3d159d412489..466b5c6af696 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsScheduleP5600.td
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsScheduleP5600.td
@@ -449,8 +449,8 @@ def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(NLOC|NLZC)_[BHWD]$")>;
// cvt.ps.[sw], cvt.s.(pl|pu), c.<cc>.[ds], c.<cc>.ps, mul.[ds], mul.ps,
// pl[lu].ps, sub.[ds], sub.ps, trunc.w.[ds], trunc.w.ps
def : InstRW<[P5600WriteFPUL],
- (instrs FADD_D32, FADD_D64, FADD_S, FMUL_D32, FMUL_D64, FMUL_S,
- FSUB_D32, FSUB_D64, FSUB_S)>;
+ (instrs FADD_D32, FADD_D64, FADD_PS64, FADD_S, FMUL_D32, FMUL_D64,
+ FMUL_PS64, FMUL_S, FSUB_D32, FSUB_D64, FSUB_PS64, FSUB_S)>;
def : InstRW<[P5600WriteFPUL], (instregex "^TRUNC_(L|W)_(S|D32|D64)$")>;
def : InstRW<[P5600WriteFPUL],
(instregex "^CVT_(S|D32|D64|L|W)_(S|D32|D64|L|W)$")>;
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsSubtarget.cpp
index ef4191cec3df..8bb9d75e9173 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsSubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsSubtarget.cpp
@@ -70,21 +70,21 @@ void MipsSubtarget::anchor() {}
MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
bool little, const MipsTargetMachine &TM,
MaybeAlign StackAlignOverride)
- : MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(MipsDefault),
- IsLittle(little), IsSoftFloat(false), IsSingleFloat(false), IsFPXX(false),
- NoABICalls(false), Abs2008(false), IsFP64bit(false), UseOddSPReg(true),
- IsNaN2008bit(false), IsGP64bit(false), HasVFPU(false), HasCnMips(false),
- HasCnMipsP(false), HasMips3_32(false), HasMips3_32r2(false),
- HasMips4_32(false), HasMips4_32r2(false), HasMips5_32r2(false),
- InMips16Mode(false), InMips16HardFloat(Mips16HardFloat),
- InMicroMipsMode(false), HasDSP(false), HasDSPR2(false), HasDSPR3(false),
- AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16), HasMSA(false),
- UseTCCInDIV(false), HasSym32(false), HasEVA(false), DisableMadd4(false),
- HasMT(false), HasCRC(false), HasVirt(false), HasGINV(false),
- UseIndirectJumpsHazard(false), StackAlignOverride(StackAlignOverride),
- TM(TM), TargetTriple(TT), TSInfo(),
- InstrInfo(
- MipsInstrInfo::create(initializeSubtargetDependencies(CPU, FS, TM))),
+ : MipsGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
+ MipsArchVersion(MipsDefault), IsLittle(little), IsSoftFloat(false),
+ IsSingleFloat(false), IsFPXX(false), NoABICalls(false), Abs2008(false),
+ IsFP64bit(false), UseOddSPReg(true), IsNaN2008bit(false),
+ IsGP64bit(false), HasVFPU(false), HasCnMips(false), HasCnMipsP(false),
+ HasMips3_32(false), HasMips3_32r2(false), HasMips4_32(false),
+ HasMips4_32r2(false), HasMips5_32r2(false), InMips16Mode(false),
+ InMips16HardFloat(Mips16HardFloat), InMicroMipsMode(false), HasDSP(false),
+ HasDSPR2(false), HasDSPR3(false), AllowMixed16_32(Mixed16_32 | Mips_Os16),
+ Os16(Mips_Os16), HasMSA(false), UseTCCInDIV(false), HasSym32(false),
+ HasEVA(false), DisableMadd4(false), HasMT(false), HasCRC(false),
+ HasVirt(false), HasGINV(false), UseIndirectJumpsHazard(false),
+ StackAlignOverride(StackAlignOverride), TM(TM), TargetTriple(TT),
+ TSInfo(), InstrInfo(MipsInstrInfo::create(
+ initializeSubtargetDependencies(CPU, FS, TM))),
FrameLowering(MipsFrameLowering::create(*this)),
TLInfo(MipsTargetLowering::create(TM, *this)) {
@@ -240,7 +240,7 @@ MipsSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS,
StringRef CPUName = MIPS_MC::selectMipsCPU(TM.getTargetTriple(), CPU);
// Parse features string.
- ParseSubtargetFeatures(CPUName, FS);
+ ParseSubtargetFeatures(CPUName, /*TuneCPU*/ CPUName, FS);
// Initialize scheduling itinerary for the specified CPU.
InstrItins = getInstrItineraryForCPU(CPUName);
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsSubtarget.h b/contrib/llvm-project/llvm/lib/Target/Mips/MipsSubtarget.h
index 26ee961fc95d..2b4c2b19a95d 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsSubtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsSubtarget.h
@@ -240,7 +240,7 @@ public:
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
bool hasMips1() const { return MipsArchVersion >= Mips1; }
bool hasMips2() const { return MipsArchVersion >= Mips2; }
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsTargetMachine.cpp
index 80cb6ce7ac0c..7e2c43164d52 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsTargetMachine.cpp
@@ -163,21 +163,15 @@ MipsTargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
Attribute FSAttr = F.getFnAttribute("target-features");
- std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
- ? CPUAttr.getValueAsString().str()
- : TargetCPU;
- std::string FS = !FSAttr.hasAttribute(Attribute::None)
- ? FSAttr.getValueAsString().str()
- : TargetFS;
- bool hasMips16Attr =
- !F.getFnAttribute("mips16").hasAttribute(Attribute::None);
- bool hasNoMips16Attr =
- !F.getFnAttribute("nomips16").hasAttribute(Attribute::None);
-
- bool HasMicroMipsAttr =
- !F.getFnAttribute("micromips").hasAttribute(Attribute::None);
- bool HasNoMicroMipsAttr =
- !F.getFnAttribute("nomicromips").hasAttribute(Attribute::None);
+ std::string CPU =
+ CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU;
+ std::string FS =
+ FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS;
+ bool hasMips16Attr = F.getFnAttribute("mips16").isValid();
+ bool hasNoMips16Attr = F.getFnAttribute("nomips16").isValid();
+
+ bool HasMicroMipsAttr = F.getFnAttribute("micromips").isValid();
+ bool HasNoMicroMipsAttr = F.getFnAttribute("nomicromips").isValid();
// FIXME: This is related to the code below to reset the target options,
// we need to know whether or not the soft float flag is set on the
@@ -295,8 +289,7 @@ MipsTargetMachine::getTargetTransformInfo(const Function &F) {
}
// Implemented by targets that want to run passes immediately before
-// machine code is emitted. return true if -print-machineinstrs should
-// print out the code after the passes.
+// machine code is emitted.
void MipsPassConfig::addPreEmitPass() {
// Expand pseudo instructions that are sensitive to register allocation.
addPass(createMipsExpandPseudoPass());
@@ -323,7 +316,7 @@ void MipsPassConfig::addPreEmitPass() {
}
bool MipsPassConfig::addIRTranslator() {
- addPass(new IRTranslator());
+ addPass(new IRTranslator(getOptLevel()));
return false;
}
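
getSubtargetImpl above reads the per-function "target-cpu" and "target-features" strings and falls back to the module-level TargetCPU/TargetFS when the attribute is absent; the rewrite only trades the old hasAttribute(Attribute::None) test for Attribute::isValid(). A small standalone sketch of that fallback shape, using std::optional in place of a hypothetical attribute query:

#include <cassert>
#include <optional>
#include <string>

// Hypothetical helper standing in for the function-attribute lookup:
// returns the attribute string when it is present (isValid), else nothing.
static std::optional<std::string> getFnAttribute(bool Present,
                                                 const std::string &Val) {
  if (Present)
    return Val;
  return std::nullopt;
}

int main() {
  const std::string TargetCPU = "mips32r2"; // module-level defaults
  const std::string TargetFS = "+fp64";

  // A valid per-function attribute overrides the default, mirroring the
  // CPUAttr.isValid() ? ... : TargetCPU shape used above.
  std::string CPU = getFnAttribute(true, "mips64r6").value_or(TargetCPU);
  std::string FS = getFnAttribute(false, "").value_or(TargetFS);

  assert(CPU == "mips64r6");
  assert(FS == TargetFS);
  return 0;
}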
diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/Mips/MipsTargetMachine.h
index 25300504a02d..e0de924be4fd 100644
--- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsTargetMachine.h
+++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsTargetMachine.h
@@ -63,6 +63,14 @@ public:
return TLOF.get();
}
+ /// Returns true if a cast between SrcAS and DestAS is a noop.
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
+ // Mips doesn't have any special address spaces so we just reserve
+ // the first 256 for software use (e.g. OpenCL) and treat casts
+ // between them as noops.
+ return SrcAS < 256 && DestAS < 256;
+ }
+
bool isLittleEndian() const { return isLittle; }
const MipsABIInfo &getABI() const { return ABI; }
};
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
index cee0e7eec54a..503f0497b6f0 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
@@ -29,6 +29,7 @@ public:
const MCSubtargetInfo &STI, raw_ostream &OS) override;
// Autogenerated by tblgen.
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
// End
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
index aef0eed6ab9a..f275011018a3 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -47,6 +47,7 @@ NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple,
AscizDirective = nullptr; // not supported
SupportsQuotedNames = false;
SupportsExtendedDwarfLocDirective = false;
+ SupportsSignedData = false;
// @TODO: Can we just disable this?
WeakDirective = "\t// .weak\t";
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
index d758c2c86959..d69166feb042 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -46,7 +46,7 @@ static MCRegisterInfo *createNVPTXMCRegisterInfo(const Triple &TT) {
static MCSubtargetInfo *
createNVPTXMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
- return createNVPTXMCSubtargetInfoImpl(TT, CPU, FS);
+ return createNVPTXMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS);
}
static MCInstPrinter *createNVPTXMCInstPrinter(const Triple &T,
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTX.h b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTX.h
index dfe0b9cb5ee6..c2fd090da084 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTX.h
@@ -14,6 +14,7 @@
#ifndef LLVM_LIB_TARGET_NVPTX_NVPTX_H
#define LLVM_LIB_TARGET_NVPTX_NVPTX_H
+#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
@@ -47,6 +48,24 @@ FunctionPass *createNVPTXLowerAllocaPass();
MachineFunctionPass *createNVPTXPeephole();
MachineFunctionPass *createNVPTXProxyRegErasurePass();
+struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> {
+ NVVMIntrRangePass();
+ NVVMIntrRangePass(unsigned SmVersion) : SmVersion(SmVersion) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+ unsigned SmVersion;
+};
+
+struct NVVMReflectPass : PassInfoMixin<NVVMReflectPass> {
+ NVVMReflectPass();
+ NVVMReflectPass(unsigned SmVersion) : SmVersion(SmVersion) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+ unsigned SmVersion;
+};
+
namespace NVPTX {
enum DrvInterface {
NVCL,
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index da1a398a68f0..38844ff4ddf9 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1272,9 +1272,6 @@ void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace,
std::string
NVPTXAsmPrinter::getPTXFundamentalTypeStr(Type *Ty, bool useB4PTR) const {
switch (Ty->getTypeID()) {
- default:
- llvm_unreachable("unexpected type");
- break;
case Type::IntegerTyID: {
unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
if (NumBits == 1)
@@ -1305,9 +1302,10 @@ NVPTXAsmPrinter::getPTXFundamentalTypeStr(Type *Ty, bool useB4PTR) const {
return "b32";
else
return "u32";
+ default:
+ break;
}
llvm_unreachable("unexpected type");
- return nullptr;
}
void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
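
In getPTXFundamentalTypeStr the default case no longer bails out inside the switch; every unhandled type now falls through to a single llvm_unreachable placed after it, which also removes the dead return nullptr. A compact sketch of that shape in plain C++, with a toy enum and std::abort() standing in for llvm_unreachable:

#include <cassert>
#include <cstdlib>
#include <string>

enum class Kind { Integer, Float, Pointer, Aggregate };

static std::string typeStr(Kind K) {
  switch (K) {
  case Kind::Integer:
    return "u32";
  case Kind::Float:
    return "f32";
  case Kind::Pointer:
    return "u64";
  default:
    break; // unhandled kinds fall through to the single trap below
  }
  // One "unreachable" after the switch replaces a default that returned
  // nullptr inside it.
  std::abort();
}

int main() {
  assert(typeStr(Kind::Float) == "f32");
  return 0;
}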
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index c533921842e4..024e51e5f488 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -63,12 +63,13 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF,
}
}
-int NVPTXFrameLowering::getFrameIndexReference(const MachineFunction &MF,
- int FI,
- Register &FrameReg) const {
+StackOffset
+NVPTXFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
FrameReg = NVPTX::VRDepot;
- return MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
+ return StackOffset::getFixed(MFI.getObjectOffset(FI) -
+ getOffsetOfLocalArea());
}
void NVPTXFrameLowering::emitEpilogue(MachineFunction &MF,
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h
index e4c2b9e77f70..a5d49ac3ab29 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h
@@ -14,6 +14,7 @@
#define LLVM_LIB_TARGET_NVPTX_NVPTXFRAMELOWERING_H
#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/Support/TypeSize.h"
namespace llvm {
@@ -24,8 +25,8 @@ public:
bool hasFP(const MachineFunction &MF) const override;
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
- int getFrameIndexReference(const MachineFunction &MF, int FI,
- Register &FrameReg) const override;
+ StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const override;
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 4296eca6a8df..08f4ab87c68d 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -700,12 +700,11 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
bool IsKernelFn = isKernelFunction(F->getFunction());
- // We use GetUnderlyingObjects() here instead of GetUnderlyingObject() mainly
+ // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
// because the former looks through phi nodes while the latter does not. We
// need to look through phi nodes to handle pointer induction variables.
SmallVector<const Value *, 8> Objs;
- GetUnderlyingObjects(N->getMemOperand()->getValue(),
- Objs, F->getDataLayout());
+ getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);
return all_of(Objs, [&](const Value *V) {
if (auto *A = dyn_cast<const Argument>(V))
@@ -2855,7 +2854,7 @@ bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
}
// Copy over operands
- SmallVector<SDValue, 8> Ops(N->op_begin() + 1, N->op_end());
+ SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
Ops.push_back(N->getOperand(0)); // Move chain to the back.
ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
@@ -3364,7 +3363,7 @@ bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
}
// Copy over operands
- SmallVector<SDValue, 8> Ops(N->op_begin() + 1, N->op_end());
+ SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
Ops.push_back(N->getOperand(0)); // Move chain to the back.
ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index f45cc06e0a0a..8860e90f2806 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -19,6 +19,7 @@
#include "NVPTXTargetObjectFile.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/Analysis.h"
@@ -64,7 +65,7 @@
using namespace llvm;
-static unsigned int uniqueCallSite = 0;
+static std::atomic<unsigned> GlobalUniqueCallSite;
static cl::opt<bool> sched4reg(
"nvptx-sched4reg",
@@ -1242,7 +1243,7 @@ NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
std::string NVPTXTargetLowering::getPrototype(
const DataLayout &DL, Type *retTy, const ArgListTy &Args,
const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
- const CallBase &CB) const {
+ const CallBase &CB, unsigned UniqueCallSite) const {
auto PtrVT = getPointerTy(DL);
bool isABI = (STI.getSmVersion() >= 20);
@@ -1251,7 +1252,7 @@ std::string NVPTXTargetLowering::getPrototype(
return "";
std::stringstream O;
- O << "prototype_" << uniqueCallSite << " : .callprototype ";
+ O << "prototype_" << UniqueCallSite << " : .callprototype ";
if (retTy->getTypeID() == Type::VoidTyID) {
O << "()";
@@ -1421,8 +1422,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (!isABI)
return Chain;
+ unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1);
SDValue tempChain = Chain;
- Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl);
+ Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
SDValue InFlag = Chain.getValue(1);
unsigned paramCount = 0;
@@ -1677,7 +1679,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// The prototype is embedded in a string and put as the operand for a
// CallPrototype SDNode which will print out to the value of the string.
SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, *CB);
+ std::string Proto =
+ getPrototype(DL, RetTy, Args, Outs, retAlignment, *CB, UniqueCallSite);
const char *ProtoStr =
nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
SDValue ProtoOps[] = {
@@ -1733,9 +1736,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (isIndirectCall) {
SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue PrototypeOps[] = { Chain,
- DAG.getConstant(uniqueCallSite, dl, MVT::i32),
- InFlag };
+ SDValue PrototypeOps[] = {
+ Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InFlag};
Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
InFlag = Chain.getValue(1);
}
@@ -1831,13 +1833,10 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
}
- Chain = DAG.getCALLSEQ_END(Chain,
- DAG.getIntPtrConstant(uniqueCallSite, dl, true),
- DAG.getIntPtrConstant(uniqueCallSite + 1, dl,
- true),
- InFlag, dl);
+ Chain = DAG.getCALLSEQ_END(
+ Chain, DAG.getIntPtrConstant(UniqueCallSite, dl, true),
+ DAG.getIntPtrConstant(UniqueCallSite + 1, dl, true), InFlag, dl);
InFlag = Chain.getValue(1);
- uniqueCallSite++;
// Append ProxyReg instructions to the chain to make sure that `callseq_end`
// will not get lost. Otherwise, during libcalls expansion, the nodes can become
@@ -2438,8 +2437,7 @@ static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
if (!STy || STy->isLiteral())
return false;
- return std::find(std::begin(specialTypes), std::end(specialTypes),
- STy->getName()) != std::end(specialTypes);
+ return llvm::is_contained(specialTypes, STy->getName());
}
SDValue NVPTXTargetLowering::LowerFormalArguments(
@@ -2590,7 +2588,8 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
// Extend the element if necessary (e.g. an i8 is loaded
// into an i16 register)
if (Ins[InsIdx].VT.isInteger() &&
- Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) {
+ Ins[InsIdx].VT.getFixedSizeInBits() >
+ LoadVT.getFixedSizeInBits()) {
unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
: ISD::ZERO_EXTEND;
Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
@@ -4564,13 +4563,13 @@ static bool IsMulWideOperandDemotable(SDValue Op,
if (Op.getOpcode() == ISD::SIGN_EXTEND ||
Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
EVT OrigVT = Op.getOperand(0).getValueType();
- if (OrigVT.getSizeInBits() <= OptSize) {
+ if (OrigVT.getFixedSizeInBits() <= OptSize) {
S = Signed;
return true;
}
} else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
EVT OrigVT = Op.getOperand(0).getValueType();
- if (OrigVT.getSizeInBits() <= OptSize) {
+ if (OrigVT.getFixedSizeInBits() <= OptSize) {
S = Unsigned;
return true;
}
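
The prototype counter above becomes a std::atomic bumped with fetch_add, so concurrent lowerings each observe a distinct call-site ID instead of racing on a plain static. A minimal sketch of that pattern in portable C++ (the surrounding NVPTX lowering code is not reproduced):

#include <atomic>
#include <cassert>
#include <string>

// fetch_add returns the previous value, so every caller gets a unique,
// monotonically increasing ID even when invoked from several threads.
static std::atomic<unsigned> GlobalUniqueCallSite{0};

static std::string makePrototypeLabel() {
  unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1);
  return "prototype_" + std::to_string(UniqueCallSite);
}

int main() {
  assert(makePrototypeLabel() == "prototype_0");
  assert(makePrototypeLabel() == "prototype_1");
  return 0;
}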
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index df9cd4159962..13829b924d4b 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -491,7 +491,8 @@ public:
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &,
const SmallVectorImpl<ISD::OutputArg> &,
- MaybeAlign retAlignment, const CallBase &CB) const;
+ MaybeAlign retAlignment, const CallBase &CB,
+ unsigned UniqueCallSite) const;
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td
index 77961c386827..9220f4766d92 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td
@@ -31,14 +31,14 @@ class NVPTXInst<dag outs, dag ins, string asmstr, list<dag> pattern>
// TSFlagFields
bits<4> VecInstType = VecNOP.Value;
- bit IsSimpleMove = 0;
- bit IsLoad = 0;
- bit IsStore = 0;
+ bit IsSimpleMove = false;
+ bit IsLoad = false;
+ bit IsStore = false;
- bit IsTex = 0;
- bit IsSust = 0;
- bit IsSurfTexQuery = 0;
- bit IsTexModeUnified = 0;
+ bit IsTex = false;
+ bit IsSust = false;
+ bit IsSurfTexQuery = false;
+ bit IsTexModeUnified = false;
// The following field is encoded as log2 of the vector size minus one,
// with 0 meaning the operation is not a surface instruction. For example,
@@ -46,13 +46,13 @@ class NVPTXInst<dag outs, dag ins, string asmstr, list<dag> pattern>
// 2**(2-1) = 2.
bits<2> IsSuld = 0;
- let TSFlags{3-0} = VecInstType;
- let TSFlags{4-4} = IsSimpleMove;
- let TSFlags{5-5} = IsLoad;
- let TSFlags{6-6} = IsStore;
- let TSFlags{7} = IsTex;
- let TSFlags{9-8} = IsSuld;
- let TSFlags{10} = IsSust;
- let TSFlags{11} = IsSurfTexQuery;
- let TSFlags{12} = IsTexModeUnified;
+ let TSFlags{3...0} = VecInstType;
+ let TSFlags{4...4} = IsSimpleMove;
+ let TSFlags{5...5} = IsLoad;
+ let TSFlags{6...6} = IsStore;
+ let TSFlags{7} = IsTex;
+ let TSFlags{9...8} = IsSuld;
+ let TSFlags{10} = IsSust;
+ let TSFlags{11} = IsSurfTexQuery;
+ let TSFlags{12} = IsTexModeUnified;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index fe7a84f9a361..381ed4dd6887 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -13,7 +13,7 @@
include "NVPTXInstrFormats.td"
// A NOP instruction
-let hasSideEffects = 0 in {
+let hasSideEffects = false in {
def NOP : NVPTXInst<(outs), (ins), "", []>;
}
@@ -137,7 +137,7 @@ def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
-def true : Predicate<"true">;
+def True : Predicate<"true">;
def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
@@ -407,7 +407,7 @@ multiclass F2<string OpcStr, SDNode OpNode> {
// Type Conversion
//-----------------------------------
-let hasSideEffects = 0 in {
+let hasSideEffects = false in {
// Generate a cvt to the given type from all possible types. Each instance
// takes a CvtMode immediate that defines the conversion mode to use. It can
// be CvtNONE to omit a conversion mode.
@@ -1022,12 +1022,12 @@ multiclass FMA_F16<string OpcStr, RegisterClass RC, Predicate Pred> {
}
defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", Float16Regs, doF32FTZ>;
-defm FMA16 : FMA_F16<"fma.rn.f16", Float16Regs, true>;
+defm FMA16 : FMA_F16<"fma.rn.f16", Float16Regs, True>;
defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", Float16x2Regs, doF32FTZ>;
-defm FMA16x2 : FMA_F16<"fma.rn.f16x2", Float16x2Regs, true>;
+defm FMA16x2 : FMA_F16<"fma.rn.f16x2", Float16x2Regs, True>;
defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
-defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, true>;
-defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, true>;
+defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, True>;
+defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, True>;
// sin/cos
def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
@@ -1367,7 +1367,7 @@ multiclass BFE<string TyStr, RegisterClass RC> {
!strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
}
-let hasSideEffects = 0 in {
+let hasSideEffects = false in {
defm BFE_S32 : BFE<"s32", Int32Regs>;
defm BFE_U32 : BFE<"u32", Int32Regs>;
defm BFE_S64 : BFE<"s64", Int64Regs>;
@@ -1381,7 +1381,7 @@ let hasSideEffects = 0 in {
// FIXME: This doesn't cover versions of set and setp that combine with a
// boolean predicate, e.g. setp.eq.and.b16.
-let hasSideEffects = 0 in {
+let hasSideEffects = false in {
multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
def rr :
NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
@@ -1427,7 +1427,7 @@ def SETP_f16x2rr :
// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
// reg, either u32, s32, or f32. Anyway these aren't used at the moment.
-let hasSideEffects = 0 in {
+let hasSideEffects = false in {
multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
def rr : NVPTXInst<(outs Int32Regs:$dst),
(ins RC:$a, RC:$b, CmpMode:$cmp),
@@ -1462,7 +1462,7 @@ defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
// selp instructions that don't have any pattern matches; we explicitly use
// them within this file.
-let hasSideEffects = 0 in {
+let hasSideEffects = false in {
multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
def rr : NVPTXInst<(outs RC:$dst),
(ins RC:$a, RC:$b, Int1Regs:$p),
@@ -1572,7 +1572,7 @@ def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
[(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
// Get pointer to local stack.
-let hasSideEffects = 0 in {
+let hasSideEffects = false in {
def MOV_DEPOT_ADDR : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
"mov.u32 \t$d, __local_depot$num;", []>;
def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
@@ -1988,7 +1988,7 @@ def ProxyReg :
SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile,
[SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-let mayLoad = 1 in {
+let mayLoad = true in {
class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
!strconcat("ld.param", opstr, " \t$dst, [retval0+$b];"),
@@ -2013,7 +2013,7 @@ class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
!strconcat("mov", opstr, " \t$dst, retval$b;"),
[(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
-let mayStore = 1 in {
+let mayStore = true in {
class StoreParamInst<NVPTXRegClass regclass, string opstr> :
NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
!strconcat("st.param", opstr, " \t[param$a+$b], $val;"),
@@ -2823,7 +2823,7 @@ def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
(SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
-let hasSideEffects = 0 in {
+let hasSideEffects = false in {
// pack a set of smaller int registers to a larger int register
def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
(ins Int16Regs:$s1, Int16Regs:$s2,
@@ -2856,7 +2856,7 @@ let hasSideEffects = 0 in {
}
-let hasSideEffects = 0 in {
+let hasSideEffects = false in {
// Extract element of f16x2 register. PTX does not provide any way
// to access elements of f16x2 vector directly, so we need to
// extract it using a temporary register.
@@ -2899,7 +2899,7 @@ let hasSideEffects = 0 in {
}
// Count leading zeros
-let hasSideEffects = 0 in {
+let hasSideEffects = false in {
def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
"clz.b32 \t$d, $a;", []>;
def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
@@ -2937,7 +2937,7 @@ def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))),
(SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
// Population count
-let hasSideEffects = 0 in {
+let hasSideEffects = false in {
def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
"popc.b32 \t$d, $a;", []>;
def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 76a4a1d4030a..8ccd47c0fcfd 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -51,19 +51,19 @@ def ptx : PTX;
// Generates list of n sequential register names.
// E.g. RegNames<3,"r">.ret -> ["r0", "r1", "r2" ]
class RegSeq<int n, string prefix> {
- list<string> ret = !if(n, !listconcat(RegSeq<!add(n,-1), prefix>.ret,
- [prefix # !add(n, -1)]),
+ list<string> ret = !if(n, !listconcat(RegSeq<!sub(n, 1), prefix>.ret,
+ [prefix # !sub(n, 1)]),
[]);
}
class THREADMASK_INFO<bit sync> {
- list<bit> ret = !if(sync, [0,1], [0]);
+ list<bit> ret = !if(sync, [0, 1], [0]);
}
//-----------------------------------
// Synchronization and shuffle functions
//-----------------------------------
-let isConvergent = 1 in {
+let isConvergent = true in {
def INT_BARRIER0 : NVPTXInst<(outs), (ins),
"bar.sync \t0;",
[(int_nvvm_barrier0)]>;
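
RegSeq, touched in the hunk above, recursively builds the n register names prefix0 .. prefix(n-1); the header comment gives RegNames<3,"r">.ret -> ["r0", "r1", "r2"], and the change merely swaps !add(n, -1) for the clearer !sub(n, 1). The same recursion written out as plain C++, purely to illustrate what the TableGen class computes:

#include <cassert>
#include <string>
#include <vector>

// C++ rendering of RegSeq<n, prefix>: names for registers 0 .. n-1, built
// by recursing on n-1 and appending prefix#(n-1), as the .td helper does.
static std::vector<std::string> regSeq(int N, const std::string &Prefix) {
  if (N == 0)
    return {};
  std::vector<std::string> Ret = regSeq(N - 1, Prefix);
  Ret.push_back(Prefix + std::to_string(N - 1));
  return Ret;
}

int main() {
  const std::vector<std::string> Expected = {"r0", "r1", "r2"};
  assert(regSeq(3, "r") == Expected);
  return 0;
}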
@@ -173,12 +173,12 @@ class SHFL_INSTR<bit sync, string mode, string reg, bit return_pred,
)];
}
-foreach sync = [0, 1] in {
+foreach sync = [false, true] in {
foreach mode = ["up", "down", "bfly", "idx"] in {
foreach regclass = ["i32", "f32"] in {
- foreach return_pred = [0, 1] in {
- foreach offset_imm = [0, 1] in {
- foreach mask_imm = [0, 1] in {
+ foreach return_pred = [false, true] in {
+ foreach offset_imm = [false, true] in {
+ foreach mask_imm = [false, true] in {
foreach threadmask_imm = THREADMASK_INFO<sync>.ret in {
def : SHFL_INSTR<sync, mode, regclass, return_pred,
offset_imm, mask_imm, threadmask_imm>,
@@ -274,7 +274,7 @@ defm MATCH_ALLP_SYNC_32 : MATCH_ALLP_SYNC<Int32Regs, "b32", int_nvvm_match_all_s
defm MATCH_ALLP_SYNC_64 : MATCH_ALLP_SYNC<Int64Regs, "b64", int_nvvm_match_all_sync_i64p,
i64imm>;
-} // isConvergent = 1
+} // isConvergent = true
//-----------------------------------
// Explicit Memory Fence Functions
@@ -1548,7 +1548,7 @@ multiclass ATOM2N_impl<string OpStr, string IntTypeStr, string TypeStr,
!cast<Intrinsic>(
"int_nvvm_atomic_" # OpStr
# "_" # SpaceStr # "_" # IntTypeStr
- # !if(!eq(ScopeStr,""), "", "_" # ScopeStr)),
+ # !if(!empty(ScopeStr), "", "_" # ScopeStr)),
regclass, ImmType, Imm, ImmTy, Preds>;
}
multiclass ATOM3N_impl<string OpStr, string IntTypeStr, string TypeStr,
@@ -1562,7 +1562,7 @@ multiclass ATOM3N_impl<string OpStr, string IntTypeStr, string TypeStr,
!cast<Intrinsic>(
"int_nvvm_atomic_" # OpStr
# "_" # SpaceStr # "_" # IntTypeStr
- # !if(!eq(ScopeStr,""), "", "_" # ScopeStr)),
+ # !if(!empty(ScopeStr), "", "_" # ScopeStr)),
regclass, ImmType, Imm, ImmTy, Preds>;
}
@@ -2131,7 +2131,7 @@ def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt),
(ROTL32reg_sw Int32Regs:$src, Int32Regs:$amt)>,
Requires<[noHWROT32]> ;
-let hasSideEffects = 0 in {
+let hasSideEffects = false in {
def GET_LO_INT64 : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
!strconcat("{{\n\t",
".reg .b32 %dummy;\n\t",
@@ -2147,7 +2147,7 @@ let hasSideEffects = 0 in {
[]> ;
}
-let hasSideEffects = 0 in {
+let hasSideEffects = false in {
def PACK_TWO_INT32
: NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi),
"mov.b64 \t$dst, {{$lo, $hi}};", []> ;
@@ -2159,7 +2159,7 @@ def : Pat<(int_nvvm_swap_lo_hi_b64 Int64Regs:$src),
// Funnel shift, requires >= sm_32. Does not trap if amt is out of range, so
// no side effects.
-let hasSideEffects = 0 in {
+let hasSideEffects = false in {
def SHF_L_WRAP_B32_IMM
: NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
@@ -2242,7 +2242,7 @@ def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt),
// also defined in NVPTXReplaceImageHandles.cpp
// texmode_independent
-let IsTex = 1, IsTexModeUnified = 0 in {
+let IsTex = true, IsTexModeUnified = false in {
// Texture fetch instructions using handles
def TEX_1D_F32_S32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
@@ -2925,7 +2925,7 @@ def TLD4_A_2D_U32_F32
// texmode_unified
-let IsTex = 1, IsTexModeUnified = 1 in {
+let IsTex = true, IsTexModeUnified = true in {
// Texture fetch instructions using handles
def TEX_UNIFIED_1D_F32_S32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
@@ -3610,7 +3610,7 @@ def TLD4_UNIFIED_A_2D_U32_F32
//=== Surface load instructions
// .clamp variant
-let IsSuld = 1 in {
+let IsSuld = true in {
def SULD_1D_I8_CLAMP
: NVPTXInst<(outs Int16Regs:$r),
(ins Int64Regs:$s, Int32Regs:$x),
@@ -3922,7 +3922,7 @@ def SULD_3D_V4I32_CLAMP
// .trap variant
-let IsSuld = 1 in {
+let IsSuld = true in {
def SULD_1D_I8_TRAP
: NVPTXInst<(outs Int16Regs:$r),
(ins Int64Regs:$s, Int32Regs:$x),
@@ -4233,7 +4233,7 @@ def SULD_3D_V4I32_TRAP
}
// .zero variant
-let IsSuld = 1 in {
+let IsSuld = true in {
def SULD_1D_I8_ZERO
: NVPTXInst<(outs Int16Regs:$r),
(ins Int64Regs:$s, Int32Regs:$x),
@@ -4547,7 +4547,7 @@ def SULD_3D_V4I32_ZERO
// Texture Query Intrinsics
//-----------------------------------
-let IsSurfTexQuery = 1 in {
+let IsSurfTexQuery = true in {
def TXQ_CHANNEL_ORDER
: NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
"txq.channel_order.b32 \t$d, [$a];",
@@ -4604,7 +4604,7 @@ def : Pat<(int_nvvm_txq_num_mipmap_levels Int64Regs:$a),
// Surface Query Intrinsics
//-----------------------------------
-let IsSurfTexQuery = 1 in {
+let IsSurfTexQuery = true in {
def SUQ_CHANNEL_ORDER
: NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
"suq.channel_order.b32 \t$d, [$a];",
@@ -4663,7 +4663,7 @@ def ISTYPEP_TEXTURE
//===- Surface Stores -----------------------------------------------------===//
-let IsSust = 1 in {
+let IsSust = true in {
// Unformatted
// .clamp variant
def SUST_B_1D_B8_CLAMP
@@ -7361,16 +7361,13 @@ class WMMA_REGINFO<WMMA_REGS r>
!eq(ptx_elt_type, "b1") : Int32Regs);
// Instruction input/output arguments for the fragment.
- list<NVPTXRegClass> ptx_regs = !foreach(tmp, regs, regclass);
+ list<NVPTXRegClass> ptx_regs = !listsplat(regclass, !size(regs));
// List of register names for the fragment -- ["ra0", "ra1",...]
list<string> reg_names = RegSeq<!size(ptx_regs), "r"#frag>.ret;
// Generates "{{$r0, $r1,.... $rN-1}}" for use in asm string construction.
- string regstring = "{{$" # !head(reg_names)
- # !foldl("", !tail(reg_names), a, b,
- !strconcat(a, ", $", b))
- # "}}";
+ string regstring = "{{$" # !interleave(reg_names, ", $") # "}}";
// Predicates for particular fragment variant. Technically those are
// per-instruction predicates, but currently all fragments that can be used in
@@ -7453,12 +7450,13 @@ class WMMA_LOAD<WMMA_REGINFO Frag, string Layout, string Space, bit WithStride,
// To match the right intrinsic, we need to build AS-constrained PatFrag.
// Operands is a dag equivalent in shape to Args, but using (ops node:$name, .....).
dag PFOperands = !if(WithStride, (ops node:$src, node:$ldm), (ops node:$src));
+ dag PFOperandsIntr = !if(WithStride, (Intr node:$src, node:$ldm), (Intr node:$src));
// Build PatFrag that only matches particular address space.
PatFrag IntrFrag = PatFrag<PFOperands,
- !foreach(tmp, PFOperands, !subst(ops, Intr, tmp)),
+ PFOperandsIntr,
!cond(!eq(Space, ".shared"): AS_match.shared,
!eq(Space, ".global"): AS_match.global,
- 1: AS_match.generic)>;
+ true: AS_match.generic)>;
// Build AS-constrained pattern.
let IntrinsicPattern = BuildPatternPF<IntrFrag, Args>.ret;
@@ -7493,14 +7491,14 @@ class WMMA_STORE_D<WMMA_REGINFO Frag, string Layout, string Space,
// To match the right intrinsic, we need to build AS-constrained PatFrag.
// Operands is a dag equivalent in shape to Args, but using (ops node:$name, .....).
dag PFOperands = !con((ops node:$dst),
- !dag(ops, !foreach(tmp, Frag.regs, node), Frag.reg_names),
+ !dag(ops, !listsplat(node, !size(Frag.regs)), Frag.reg_names),
!if(WithStride, (ops node:$ldm), (ops)));
// Build PatFrag that only matches particular address space.
PatFrag IntrFrag = PatFrag<PFOperands,
!foreach(tmp, PFOperands, !subst(ops, Intr, tmp)),
!cond(!eq(Space, ".shared"): AS_match.shared,
!eq(Space, ".global"): AS_match.global,
- 1: AS_match.generic)>;
+ true: AS_match.generic)>;
// Build AS-constrained pattern.
let IntrinsicPattern = BuildPatternPF<IntrFrag, Args>.ret;
@@ -7521,14 +7519,14 @@ class WMMA_STORE_D<WMMA_REGINFO Frag, string Layout, string Space,
// Create all load/store variants
defset list<WMMA_INSTR> MMA_LDSTs = {
foreach layout = ["row", "col"] in {
- foreach stride = [0, 1] in {
+ foreach stride = [false, true] in {
foreach space = [".global", ".shared", ""] in {
foreach addr = [imem, Int32Regs, Int64Regs, MEMri, MEMri64] in {
foreach frag = NVVM_MMA_OPS.all_ld_ops in
- foreach _ = NVVM_MMA_SUPPORTED<[frag], layout>.ret in
+ if NVVM_MMA_SUPPORTED<[frag], layout>.ret then
def : WMMA_LOAD<WMMA_REGINFO<frag>, layout, space, stride, addr>;
foreach frag = NVVM_MMA_OPS.all_st_ops in
- foreach _ = NVVM_MMA_SUPPORTED<[frag], layout>.ret in
+ if NVVM_MMA_SUPPORTED<[frag], layout>.ret then
def : WMMA_STORE_D<WMMA_REGINFO<frag>, layout, space, stride, addr>;
} // addr
} // space
@@ -7586,7 +7584,7 @@ defset list<WMMA_INSTR> MMAs = {
foreach layout_b = ["row", "col"] in {
foreach satf = [0, 1] in {
foreach op = NVVM_MMA_OPS.all_mma_ops in {
- foreach _ = NVVM_MMA_SUPPORTED<op, layout_a, layout_b, satf>.ret in {
+ if NVVM_MMA_SUPPORTED<op, layout_a, layout_b, satf>.ret then {
def : WMMA_MMA<WMMA_REGINFO<op[0]>,
WMMA_REGINFO<op[1]>,
WMMA_REGINFO<op[2]>,
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index e60b5eeacdae..fd58ff13788d 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -172,8 +172,12 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
Value *ArgInParam = new AddrSpaceCastInst(
Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
FirstInst);
+ // Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX
+ // addrspacecast preserves alignment. Since params are constant, this load is
+ // definitely not volatile.
LoadInst *LI =
- new LoadInst(StructType, ArgInParam, Arg->getName(), FirstInst);
+ new LoadInst(StructType, ArgInParam, Arg->getName(),
+ /*isVolatile=*/false, AllocA->getAlign(), FirstInst);
new StoreInst(LI, AllocA, FirstInst);
}
@@ -214,8 +218,7 @@ bool NVPTXLowerArgs::runOnKernelFunction(Function &F) {
for (auto &I : B) {
if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
if (LI->getType()->isPointerTy()) {
- Value *UO = GetUnderlyingObject(LI->getPointerOperand(),
- F.getParent()->getDataLayout());
+ Value *UO = getUnderlyingObject(LI->getPointerOperand());
if (Argument *Arg = dyn_cast<Argument>(UO)) {
if (Arg->hasByValAttr()) {
// LI is a load from a pointer within a byval kernel parameter.
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index ea2274f394e6..756355f75e3d 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -69,7 +69,8 @@ bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) {
"operand of a DBG_VALUE machine instruction");
Register Reg;
int64_t Offset =
- TFI.getFrameIndexReference(MF, MI.getOperand(0).getIndex(), Reg);
+ TFI.getFrameIndexReference(MF, MI.getOperand(0).getIndex(), Reg)
+ .getFixed();
MI.getOperand(0).ChangeToRegister(Reg, /*isDef=*/false);
MI.getOperand(0).setIsDebug();
auto *DIExpr = DIExpression::prepend(
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
index 4b755dcb55ff..19895a20bacf 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -30,7 +30,7 @@ def VRDepot : NVPTXReg<"%Depot">;
// We use virtual registers, but define a few physical registers here to keep
// SDAG and the MachineInstr layers happy.
-foreach i = 0-4 in {
+foreach i = 0...4 in {
def P#i : NVPTXReg<"%p"#i>; // Predicate
def RS#i : NVPTXReg<"%rs"#i>; // 16-bit
def R#i : NVPTXReg<"%r"#i>; // 32-bit
@@ -47,7 +47,7 @@ foreach i = 0-4 in {
def da#i : NVPTXReg<"%da"#i>;
}
-foreach i = 0-31 in {
+foreach i = 0...31 in {
def ENVREG#i : NVPTXReg<"%envreg"#i>;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index f1fa6416f15f..05c20369abf4 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -35,7 +35,7 @@ NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
// Provide the default CPU if we don't have one.
TargetName = std::string(CPU.empty() ? "sm_20" : CPU);
- ParseSubtargetFeatures(TargetName, FS);
+ ParseSubtargetFeatures(TargetName, /*TuneCPU*/ TargetName, FS);
// Set default to PTX 3.2 (CUDA 5.5)
if (PTXVersion == 0) {
@@ -48,9 +48,9 @@ NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
NVPTXSubtarget::NVPTXSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const NVPTXTargetMachine &TM)
- : NVPTXGenSubtargetInfo(TT, CPU, FS), PTXVersion(0), SmVersion(20), TM(TM),
- InstrInfo(), TLInfo(TM, initializeSubtargetDependencies(CPU, FS)),
- FrameLowering() {}
+ : NVPTXGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), PTXVersion(0),
+ SmVersion(20), TM(TM), InstrInfo(),
+ TLInfo(TM, initializeSubtargetDependencies(CPU, FS)), FrameLowering() {}
bool NVPTXSubtarget::hasImageHandles() const {
// Enable handles for Kepler+, where CUDA supports indirect surfaces and
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 0e9fa1fd3e56..9a249d3da3d5 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -83,7 +83,7 @@ public:
unsigned getPTXVersion() const { return PTXVersion; }
NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
};
} // End llvm namespace
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 85709eb731e2..f1a82f1cf607 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"
+#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
@@ -170,11 +171,11 @@ public:
void addFastRegAlloc() override;
void addOptimizedRegAlloc() override;
- bool addRegAssignmentFast() override {
+ bool addRegAssignAndRewriteFast() override {
llvm_unreachable("should not be used");
}
- bool addRegAssignmentOptimized() override {
+ bool addRegAssignAndRewriteOptimized() override {
llvm_unreachable("should not be used");
}
@@ -205,6 +206,32 @@ void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
});
}
+void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
+ bool DebugPassManager) {
+ PB.registerPipelineParsingCallback(
+ [](StringRef PassName, FunctionPassManager &PM,
+ ArrayRef<PassBuilder::PipelineElement>) {
+ if (PassName == "nvvm-reflect") {
+ PM.addPass(NVVMReflectPass());
+ return true;
+ }
+ if (PassName == "nvvm-intr-range") {
+ PM.addPass(NVVMIntrRangePass());
+ return true;
+ }
+ return false;
+ });
+
+ PB.registerPipelineStartEPCallback(
+ [this, DebugPassManager](ModulePassManager &PM,
+ PassBuilder::OptimizationLevel Level) {
+ FunctionPassManager FPM(DebugPassManager);
+ FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
+ FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion()));
+ PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+ });
+}
+
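The callbacks added here do two things: they let a textual pipeline refer to these passes by the names "nvvm-reflect" and "nvvm-intr-range", and they prepend both passes to the start of the default pipeline. A rough standalone sketch of the name-to-pass dispatch idea, in plain C++ rather than the real PassBuilder API (ToyPipelineParser, Pass and the string payload are illustrative stand-ins, not part of the patch):

#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Toy stand-ins: a "pass" runs over a function name, a "pipeline" is a list.
using Pass = std::function<void(const std::string &)>;
using Pipeline = std::vector<Pass>;
using ParsingCallback = std::function<bool(const std::string &, Pipeline &)>;

struct ToyPipelineParser {
  std::vector<ParsingCallback> Callbacks;
  // Try each registered callback until one recognizes the pass name.
  bool addPassByName(const std::string &Name, Pipeline &PM) {
    for (const auto &CB : Callbacks)
      if (CB(Name, PM))
        return true;
    return false;
  }
};

int main() {
  ToyPipelineParser Parser;
  // Analogue of registerPipelineParsingCallback: map names to pass objects.
  Parser.Callbacks.push_back([](const std::string &Name, Pipeline &PM) {
    if (Name == "nvvm-reflect" || Name == "nvvm-intr-range") {
      PM.push_back([Name](const std::string &F) {
        std::cout << Name << " on " << F << "\n";
      });
      return true;
    }
    return false;
  });

  Pipeline PM;
  Parser.addPassByName("nvvm-reflect", PM); // recognized by the callback
  for (const auto &P : PM)
    P("kernel_fn");
}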
TargetTransformInfo
NVPTXTargetMachine::getTargetTransformInfo(const Function &F) {
return TargetTransformInfo(NVPTXTTIImpl(this, F));
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
index d84600c74e29..bef541c2b28d 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -15,8 +15,6 @@
#include "ManagedStringPool.h"
#include "NVPTXSubtarget.h"
-#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
-#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -64,6 +62,8 @@ public:
}
void adjustPassManager(PassManagerBuilder &) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB,
+ bool DebugPassManager) override;
TargetTransformInfo getTargetTransformInfo(const Function &F) override;
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 3873c73fb2e0..d4b2ae384068 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -111,6 +111,263 @@ bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) {
return false;
}
+// Convert NVVM intrinsics to target-generic LLVM code where possible.
+static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
+ // Each NVVM intrinsic we can simplify can be replaced with one of:
+ //
+ // * an LLVM intrinsic,
+ // * an LLVM cast operation,
+ // * an LLVM binary operation, or
+ // * ad-hoc LLVM IR for the particular operation.
+
+ // Some transformations are only valid when the module's
+ // flush-denormals-to-zero (ftz) setting is true/false, whereas other
+ // transformations are valid regardless of the module's ftz setting.
+ enum FtzRequirementTy {
+ FTZ_Any, // Any ftz setting is ok.
+ FTZ_MustBeOn, // Transformation is valid only if ftz is on.
+ FTZ_MustBeOff, // Transformation is valid only if ftz is off.
+ };
+ // Classes of NVVM intrinsics that can't be replaced one-to-one with a
+ // target-generic intrinsic, cast op, or binary op but that we can nonetheless
+ // simplify.
+ enum SpecialCase {
+ SPC_Reciprocal,
+ };
+
+ // SimplifyAction is a poor-man's variant (plus an additional flag) that
+ // represents how to replace an NVVM intrinsic with target-generic LLVM IR.
+ struct SimplifyAction {
+ // Invariant: At most one of these Optionals has a value.
+ Optional<Intrinsic::ID> IID;
+ Optional<Instruction::CastOps> CastOp;
+ Optional<Instruction::BinaryOps> BinaryOp;
+ Optional<SpecialCase> Special;
+
+ FtzRequirementTy FtzRequirement = FTZ_Any;
+
+ SimplifyAction() = default;
+
+ SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq)
+ : IID(IID), FtzRequirement(FtzReq) {}
+
+ // Cast operations don't have anything to do with FTZ, so we skip that
+ // argument.
+ SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}
+
+ SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
+ : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}
+
+ SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
+ : Special(Special), FtzRequirement(FtzReq) {}
+ };
+
+ // Try to generate a SimplifyAction describing how to replace our
+ // IntrinsicInstr with target-generic LLVM IR.
+ const SimplifyAction Action = [II]() -> SimplifyAction {
+ switch (II->getIntrinsicID()) {
+ // NVVM intrinsics that map directly to LLVM intrinsics.
+ case Intrinsic::nvvm_ceil_d:
+ return {Intrinsic::ceil, FTZ_Any};
+ case Intrinsic::nvvm_ceil_f:
+ return {Intrinsic::ceil, FTZ_MustBeOff};
+ case Intrinsic::nvvm_ceil_ftz_f:
+ return {Intrinsic::ceil, FTZ_MustBeOn};
+ case Intrinsic::nvvm_fabs_d:
+ return {Intrinsic::fabs, FTZ_Any};
+ case Intrinsic::nvvm_fabs_f:
+ return {Intrinsic::fabs, FTZ_MustBeOff};
+ case Intrinsic::nvvm_fabs_ftz_f:
+ return {Intrinsic::fabs, FTZ_MustBeOn};
+ case Intrinsic::nvvm_floor_d:
+ return {Intrinsic::floor, FTZ_Any};
+ case Intrinsic::nvvm_floor_f:
+ return {Intrinsic::floor, FTZ_MustBeOff};
+ case Intrinsic::nvvm_floor_ftz_f:
+ return {Intrinsic::floor, FTZ_MustBeOn};
+ case Intrinsic::nvvm_fma_rn_d:
+ return {Intrinsic::fma, FTZ_Any};
+ case Intrinsic::nvvm_fma_rn_f:
+ return {Intrinsic::fma, FTZ_MustBeOff};
+ case Intrinsic::nvvm_fma_rn_ftz_f:
+ return {Intrinsic::fma, FTZ_MustBeOn};
+ case Intrinsic::nvvm_fmax_d:
+ return {Intrinsic::maxnum, FTZ_Any};
+ case Intrinsic::nvvm_fmax_f:
+ return {Intrinsic::maxnum, FTZ_MustBeOff};
+ case Intrinsic::nvvm_fmax_ftz_f:
+ return {Intrinsic::maxnum, FTZ_MustBeOn};
+ case Intrinsic::nvvm_fmin_d:
+ return {Intrinsic::minnum, FTZ_Any};
+ case Intrinsic::nvvm_fmin_f:
+ return {Intrinsic::minnum, FTZ_MustBeOff};
+ case Intrinsic::nvvm_fmin_ftz_f:
+ return {Intrinsic::minnum, FTZ_MustBeOn};
+ case Intrinsic::nvvm_round_d:
+ return {Intrinsic::round, FTZ_Any};
+ case Intrinsic::nvvm_round_f:
+ return {Intrinsic::round, FTZ_MustBeOff};
+ case Intrinsic::nvvm_round_ftz_f:
+ return {Intrinsic::round, FTZ_MustBeOn};
+ case Intrinsic::nvvm_sqrt_rn_d:
+ return {Intrinsic::sqrt, FTZ_Any};
+ case Intrinsic::nvvm_sqrt_f:
+ // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the
+ // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts
+ // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are
+ // the versions with explicit ftz-ness.
+ return {Intrinsic::sqrt, FTZ_Any};
+ case Intrinsic::nvvm_sqrt_rn_f:
+ return {Intrinsic::sqrt, FTZ_MustBeOff};
+ case Intrinsic::nvvm_sqrt_rn_ftz_f:
+ return {Intrinsic::sqrt, FTZ_MustBeOn};
+ case Intrinsic::nvvm_trunc_d:
+ return {Intrinsic::trunc, FTZ_Any};
+ case Intrinsic::nvvm_trunc_f:
+ return {Intrinsic::trunc, FTZ_MustBeOff};
+ case Intrinsic::nvvm_trunc_ftz_f:
+ return {Intrinsic::trunc, FTZ_MustBeOn};
+
+ // NVVM intrinsics that map to LLVM cast operations.
+ //
+ // Note that llvm's target-generic conversion operators correspond to the rz
+ // (round to zero) versions of the nvvm conversion intrinsics, even though
+ // most everything else here uses the rn (round to nearest even) nvvm ops.
+ case Intrinsic::nvvm_d2i_rz:
+ case Intrinsic::nvvm_f2i_rz:
+ case Intrinsic::nvvm_d2ll_rz:
+ case Intrinsic::nvvm_f2ll_rz:
+ return {Instruction::FPToSI};
+ case Intrinsic::nvvm_d2ui_rz:
+ case Intrinsic::nvvm_f2ui_rz:
+ case Intrinsic::nvvm_d2ull_rz:
+ case Intrinsic::nvvm_f2ull_rz:
+ return {Instruction::FPToUI};
+ case Intrinsic::nvvm_i2d_rz:
+ case Intrinsic::nvvm_i2f_rz:
+ case Intrinsic::nvvm_ll2d_rz:
+ case Intrinsic::nvvm_ll2f_rz:
+ return {Instruction::SIToFP};
+ case Intrinsic::nvvm_ui2d_rz:
+ case Intrinsic::nvvm_ui2f_rz:
+ case Intrinsic::nvvm_ull2d_rz:
+ case Intrinsic::nvvm_ull2f_rz:
+ return {Instruction::UIToFP};
+
+ // NVVM intrinsics that map to LLVM binary ops.
+ case Intrinsic::nvvm_add_rn_d:
+ return {Instruction::FAdd, FTZ_Any};
+ case Intrinsic::nvvm_add_rn_f:
+ return {Instruction::FAdd, FTZ_MustBeOff};
+ case Intrinsic::nvvm_add_rn_ftz_f:
+ return {Instruction::FAdd, FTZ_MustBeOn};
+ case Intrinsic::nvvm_mul_rn_d:
+ return {Instruction::FMul, FTZ_Any};
+ case Intrinsic::nvvm_mul_rn_f:
+ return {Instruction::FMul, FTZ_MustBeOff};
+ case Intrinsic::nvvm_mul_rn_ftz_f:
+ return {Instruction::FMul, FTZ_MustBeOn};
+ case Intrinsic::nvvm_div_rn_d:
+ return {Instruction::FDiv, FTZ_Any};
+ case Intrinsic::nvvm_div_rn_f:
+ return {Instruction::FDiv, FTZ_MustBeOff};
+ case Intrinsic::nvvm_div_rn_ftz_f:
+ return {Instruction::FDiv, FTZ_MustBeOn};
+
+ // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but
+ // need special handling.
+ //
+ // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just
+ // as well.
+ case Intrinsic::nvvm_rcp_rn_d:
+ return {SPC_Reciprocal, FTZ_Any};
+ case Intrinsic::nvvm_rcp_rn_f:
+ return {SPC_Reciprocal, FTZ_MustBeOff};
+ case Intrinsic::nvvm_rcp_rn_ftz_f:
+ return {SPC_Reciprocal, FTZ_MustBeOn};
+
+ // We do not currently simplify intrinsics that give an approximate
+ // answer. These include:
+ //
+ // - nvvm_cos_approx_{f,ftz_f}
+ // - nvvm_ex2_approx_{d,f,ftz_f}
+ // - nvvm_lg2_approx_{d,f,ftz_f}
+ // - nvvm_sin_approx_{f,ftz_f}
+ // - nvvm_sqrt_approx_{f,ftz_f}
+ // - nvvm_rsqrt_approx_{d,f,ftz_f}
+ // - nvvm_div_approx_{ftz_d,ftz_f,f}
+ // - nvvm_rcp_approx_ftz_d
+ //
+ // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
+ // means that fastmath is enabled in the intrinsic. Unfortunately only
+ // binary operators (currently) have a fastmath bit in SelectionDAG, so
+ // this information gets lost and we can't select on it.
+ //
+  // TODO: div and rcp are lowered to a binary op, so we could in theory
+  // lower them to "fast fdiv".
+
+ default:
+ return {};
+ }
+ }();
+
+ // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we
+ // can bail out now. (Notice that in the case that IID is not an NVVM
+ // intrinsic, we don't have to look up any module metadata, as
+ // FtzRequirementTy will be FTZ_Any.)
+ if (Action.FtzRequirement != FTZ_Any) {
+ StringRef Attr = II->getFunction()
+ ->getFnAttribute("denormal-fp-math-f32")
+ .getValueAsString();
+ DenormalMode Mode = parseDenormalFPAttribute(Attr);
+ bool FtzEnabled = Mode.Output != DenormalMode::IEEE;
+
+ if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
+ return nullptr;
+ }
+
+ // Simplify to target-generic intrinsic.
+ if (Action.IID) {
+ SmallVector<Value *, 4> Args(II->arg_operands());
+ // All the target-generic intrinsics currently of interest to us have one
+ // type argument, equal to that of the nvvm intrinsic's argument.
+ Type *Tys[] = {II->getArgOperand(0)->getType()};
+ return CallInst::Create(
+ Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args);
+ }
+
+ // Simplify to target-generic binary op.
+ if (Action.BinaryOp)
+ return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
+ II->getArgOperand(1), II->getName());
+
+ // Simplify to target-generic cast op.
+ if (Action.CastOp)
+ return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
+ II->getName());
+
+ // All that's left are the special cases.
+ if (!Action.Special)
+ return nullptr;
+
+ switch (*Action.Special) {
+ case SPC_Reciprocal:
+ // Simplify reciprocal.
+ return BinaryOperator::Create(
+ Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
+ II->getArgOperand(0), II->getName());
+ }
+ llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
+}
+
+Optional<Instruction *>
+NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
+ if (Instruction *I = simplifyNvvmIntrinsic(&II, IC)) {
+ return I;
+ }
+ return None;
+}
+
int NVPTXTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueKind Opd1Info,
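The gate near the end of simplifyNvvmIntrinsic bails out of any simplification whose FTZ requirement does not match the function's "denormal-fp-math-f32" state, and skips the attribute lookup entirely when the requirement is FTZ_Any. A minimal standalone sketch of that predicate (plain C++; ftzRequirementSatisfied is an illustrative name, not part of the patch):

#include <cassert>

enum FtzRequirementTy { FTZ_Any, FTZ_MustBeOn, FTZ_MustBeOff };

// True if a transformation with requirement Req may fire given whether
// flush-to-zero is enabled for f32 in the surrounding function.
static bool ftzRequirementSatisfied(FtzRequirementTy Req, bool FtzEnabled) {
  if (Req == FTZ_Any)
    return true; // no constraint, no need to inspect function attributes
  return FtzEnabled == (Req == FTZ_MustBeOn);
}

int main() {
  assert(ftzRequirementSatisfied(FTZ_Any, false));
  assert(!ftzRequirementSatisfied(FTZ_MustBeOn, false));
  assert(ftzRequirementSatisfied(FTZ_MustBeOff, false));
  assert(!ftzRequirementSatisfied(FTZ_MustBeOff, true));
}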
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index cb832031f1ad..6f071040dd9d 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -48,6 +48,9 @@ public:
return AddressSpace::ADDRESS_SPACE_GENERIC;
}
+ Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
+ IntrinsicInst &II) const;
+
// Loads and stores can be vectorized if the alignment is at least as big as
// the load/store we want to vectorize.
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
index baaedc7ac87c..5381646434eb 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
@@ -17,6 +17,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
@@ -32,21 +33,13 @@ static cl::opt<unsigned> NVVMIntrRangeSM("nvvm-intr-range-sm", cl::init(20),
namespace {
class NVVMIntrRange : public FunctionPass {
private:
- struct {
- unsigned x, y, z;
- } MaxBlockSize, MaxGridSize;
+ unsigned SmVersion;
public:
static char ID;
NVVMIntrRange() : NVVMIntrRange(NVVMIntrRangeSM) {}
- NVVMIntrRange(unsigned int SmVersion) : FunctionPass(ID) {
- MaxBlockSize.x = 1024;
- MaxBlockSize.y = 1024;
- MaxBlockSize.z = 64;
-
- MaxGridSize.x = SmVersion >= 30 ? 0x7fffffff : 0xffff;
- MaxGridSize.y = 0xffff;
- MaxGridSize.z = 0xffff;
+ NVVMIntrRange(unsigned int SmVersion)
+ : FunctionPass(ID), SmVersion(SmVersion) {
initializeNVVMIntrRangePass(*PassRegistry::getPassRegistry());
}
@@ -79,7 +72,18 @@ static bool addRangeMetadata(uint64_t Low, uint64_t High, CallInst *C) {
return true;
}
-bool NVVMIntrRange::runOnFunction(Function &F) {
+static bool runNVVMIntrRange(Function &F, unsigned SmVersion) {
+ struct {
+ unsigned x, y, z;
+ } MaxBlockSize, MaxGridSize;
+ MaxBlockSize.x = 1024;
+ MaxBlockSize.y = 1024;
+ MaxBlockSize.z = 64;
+
+ MaxGridSize.x = SmVersion >= 30 ? 0x7fffffff : 0xffff;
+ MaxGridSize.y = 0xffff;
+ MaxGridSize.z = 0xffff;
+
// Go through the calls in this function.
bool Changed = false;
for (Instruction &I : instructions(F)) {
@@ -151,3 +155,15 @@ bool NVVMIntrRange::runOnFunction(Function &F) {
return Changed;
}
+
+bool NVVMIntrRange::runOnFunction(Function &F) {
+ return runNVVMIntrRange(F, SmVersion);
+}
+
+NVVMIntrRangePass::NVVMIntrRangePass() : NVVMIntrRangePass(NVVMIntrRangeSM) {}
+
+PreservedAnalyses NVVMIntrRangePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ return runNVVMIntrRange(F, SmVersion) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
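Moving the block and grid limits into runNVVMIntrRange means they are recomputed from the SM version on every run instead of being cached in pass members; only the grid X limit actually varies. A standalone sketch of the constants involved (plain C++; Dim3 and the helper names are illustrative):

#include <cassert>
#include <cstdint>

// Launch-bound limits used to derive the range metadata above; only grid.x
// depends on the SM version.
struct Dim3 { uint32_t x, y, z; };

static Dim3 maxBlockSize() { return {1024, 1024, 64}; }

static Dim3 maxGridSize(unsigned SmVersion) {
  // sm_30 raised the X grid-dimension limit from 2^16-1 to 2^31-1.
  return {SmVersion >= 30 ? 0x7fffffffu : 0xffffu, 0xffffu, 0xffffu};
}

int main() {
  assert(maxGridSize(20).x == 0xffffu);
  assert(maxGridSize(70).x == 0x7fffffffu);
  assert(maxBlockSize().z == 64u);
}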
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index ae166dc5a8d5..339f51d21087 100644
--- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -29,6 +29,7 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -73,7 +74,7 @@ INITIALIZE_PASS(NVVMReflect, "nvvm-reflect",
"Replace occurrences of __nvvm_reflect() calls with 0/1", false,
false)
-bool NVVMReflect::runOnFunction(Function &F) {
+static bool runNVVMReflect(Function &F, unsigned SmVersion) {
if (!NVVMReflectEnabled)
return false;
@@ -179,3 +180,15 @@ bool NVVMReflect::runOnFunction(Function &F) {
return ToRemove.size() > 0;
}
+
+bool NVVMReflect::runOnFunction(Function &F) {
+ return runNVVMReflect(F, SmVersion);
+}
+
+NVVMReflectPass::NVVMReflectPass() : NVVMReflectPass(0) {}
+
+PreservedAnalyses NVVMReflectPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ return runNVVMReflect(F, SmVersion) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
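NVVMReflect and NVVMIntrRange now share the same shape: a file-local helper does the work, the legacy runOnFunction forwards to it, and a new-pass-manager class forwards to it from run(). A compact sketch of that shape for a hypothetical pass (it assumes the usual PassInfoMixin base from llvm/IR/PassManager.h; the real NVVMReflectPass and NVVMIntrRangePass declarations live in NVPTX.h, which is not shown in this diff):

#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"

using namespace llvm;

// Shared worker; both the legacy and the new-PM wrappers would call this.
static bool runToyNVVMHelper(Function &F, unsigned SmVersion) {
  (void)F;
  (void)SmVersion;
  return false; // report "no change" in this sketch
}

// New-pass-manager wrapper (hypothetical name).
class ToyNVVMPass : public PassInfoMixin<ToyNVVMPass> {
  unsigned SmVersion;

public:
  explicit ToyNVVMPass(unsigned SV = 0) : SmVersion(SV) {}
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &) {
    return runToyNVVMHelper(F, SmVersion) ? PreservedAnalyses::none()
                                          : PreservedAnalyses::all();
  }
};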
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index 13fd7d05ab9f..197fd3c7aa74 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -11,7 +11,6 @@
#include "PPCTargetStreamer.h"
#include "TargetInfo/PowerPCTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -99,12 +98,10 @@ struct PPCOperand;
class PPCAsmParser : public MCTargetAsmParser {
bool IsPPC64;
- bool IsDarwin;
void Warning(SMLoc L, const Twine &Msg) { getParser().Warning(L, Msg); }
bool isPPC64() const { return IsPPC64; }
- bool isDarwin() const { return IsDarwin; }
bool MatchRegisterName(unsigned &RegNo, int64_t &IntVal);
@@ -116,14 +113,12 @@ class PPCAsmParser : public MCTargetAsmParser {
PPCMCExpr::VariantKind &Variant);
const MCExpr *FixupVariantKind(const MCExpr *E);
bool ParseExpression(const MCExpr *&EVal);
- bool ParseDarwinExpression(const MCExpr *&EVal);
bool ParseOperand(OperandVector &Operands);
bool ParseDirectiveWord(unsigned Size, AsmToken ID);
bool ParseDirectiveTC(unsigned Size, AsmToken ID);
bool ParseDirectiveMachine(SMLoc L);
- bool ParseDarwinDirectiveMachine(SMLoc L);
bool ParseDirectiveAbiVersion(SMLoc L);
bool ParseDirectiveLocalEntry(SMLoc L);
@@ -150,7 +145,6 @@ public:
// Check for 64-bit vs. 32-bit pointer mode.
const Triple &TheTriple = STI.getTargetTriple();
IsPPC64 = TheTriple.isPPC64();
- IsDarwin = TheTriple.isMacOSX();
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
}
@@ -290,6 +284,16 @@ public:
return (unsigned) Imm.Val;
}
+ unsigned getACCReg() const {
+ assert(isACCRegNumber() && "Invalid access!");
+ return (unsigned) Imm.Val;
+ }
+
+ unsigned getVSRpEvenReg() const {
+ assert(isVSRpEvenRegNumber() && "Invalid access!");
+ return (unsigned) Imm.Val >> 1;
+ }
+
unsigned getCCReg() const {
assert(isCCRegNumber() && "Invalid access!");
return (unsigned) (Kind == Immediate ? Imm.Val : Expr.CRVal);
@@ -402,6 +406,12 @@ public:
(getImm() & 3) == 0); }
bool isImmZero() const { return Kind == Immediate && getImm() == 0; }
bool isRegNumber() const { return Kind == Immediate && isUInt<5>(getImm()); }
+ bool isACCRegNumber() const {
+ return Kind == Immediate && isUInt<3>(getImm());
+ }
+ bool isVSRpEvenRegNumber() const {
+ return Kind == Immediate && isUInt<6>(getImm()) && ((getImm() & 1) == 0);
+ }
bool isVSRegNumber() const {
return Kind == Immediate && isUInt<6>(getImm());
}
@@ -492,29 +502,29 @@ public:
Inst.addOperand(MCOperand::createReg(VSSRegs[getVSReg()]));
}
- void addRegQFRCOperands(MCInst &Inst, unsigned N) const {
+ void addRegSPE4RCOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createReg(QFRegs[getReg()]));
+ Inst.addOperand(MCOperand::createReg(RRegs[getReg()]));
}
- void addRegQSRCOperands(MCInst &Inst, unsigned N) const {
+ void addRegSPERCOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createReg(QFRegs[getReg()]));
+ Inst.addOperand(MCOperand::createReg(SPERegs[getReg()]));
}
- void addRegQBRCOperands(MCInst &Inst, unsigned N) const {
+ void addRegACCRCOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createReg(QFRegs[getReg()]));
+ Inst.addOperand(MCOperand::createReg(ACCRegs[getACCReg()]));
}
- void addRegSPE4RCOperands(MCInst &Inst, unsigned N) const {
+ void addRegVSRpRCOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createReg(RRegs[getReg()]));
+ Inst.addOperand(MCOperand::createReg(VSRpRegs[getVSRpEvenReg()]));
}
- void addRegSPERCOperands(MCInst &Inst, unsigned N) const {
+ void addRegVSRpEvenRCOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createReg(SPERegs[getReg()]));
+ Inst.addOperand(MCOperand::createReg(VSRpRegs[getVSRpEvenReg()]));
}
void addRegCRBITRCOperands(MCInst &Inst, unsigned N) const {
@@ -666,7 +676,8 @@ public:
return CreateImm(CE->getValue(), S, E, IsPPC64);
if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Val))
- if (SRE->getKind() == MCSymbolRefExpr::VK_PPC_TLS)
+ if (SRE->getKind() == MCSymbolRefExpr::VK_PPC_TLS ||
+ SRE->getKind() == MCSymbolRefExpr::VK_PPC_TLS_PCREL)
return CreateTLSReg(SRE, S, E, IsPPC64);
if (const PPCMCExpr *TE = dyn_cast<PPCMCExpr>(Val)) {
@@ -762,12 +773,18 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst,
}
case PPC::DCBFx:
case PPC::DCBFL:
- case PPC::DCBFLP: {
+ case PPC::DCBFLP:
+ case PPC::DCBFPS:
+ case PPC::DCBSTPS: {
int L = 0;
if (Opcode == PPC::DCBFL)
L = 1;
else if (Opcode == PPC::DCBFLP)
L = 3;
+ else if (Opcode == PPC::DCBFPS)
+ L = 4;
+ else if (Opcode == PPC::DCBSTPS)
+ L = 6;
MCInst TmpInst;
TmpInst.setOpcode(PPC::DCBF);
@@ -1184,41 +1201,41 @@ bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
}
bool PPCAsmParser::MatchRegisterName(unsigned &RegNo, int64_t &IntVal) {
- if (getParser().getTok().is(AsmToken::Identifier)) {
- StringRef Name = getParser().getTok().getString();
- if (Name.equals_lower("lr")) {
- RegNo = isPPC64()? PPC::LR8 : PPC::LR;
- IntVal = 8;
- } else if (Name.equals_lower("ctr")) {
- RegNo = isPPC64()? PPC::CTR8 : PPC::CTR;
- IntVal = 9;
- } else if (Name.equals_lower("vrsave")) {
- RegNo = PPC::VRSAVE;
- IntVal = 256;
- } else if (Name.startswith_lower("r") &&
- !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
- RegNo = isPPC64()? XRegs[IntVal] : RRegs[IntVal];
- } else if (Name.startswith_lower("f") &&
- !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
- RegNo = FRegs[IntVal];
- } else if (Name.startswith_lower("vs") &&
- !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 64) {
- RegNo = VSRegs[IntVal];
- } else if (Name.startswith_lower("v") &&
- !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
- RegNo = VRegs[IntVal];
- } else if (Name.startswith_lower("q") &&
- !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
- RegNo = QFRegs[IntVal];
- } else if (Name.startswith_lower("cr") &&
- !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 8) {
- RegNo = CRRegs[IntVal];
- } else
- return true;
- getParser().Lex();
- return false;
- }
- return true;
+ if (getParser().getTok().is(AsmToken::Percent))
+ getParser().Lex(); // Eat the '%'.
+
+ if (!getParser().getTok().is(AsmToken::Identifier))
+ return true;
+
+ StringRef Name = getParser().getTok().getString();
+ if (Name.equals_lower("lr")) {
+ RegNo = isPPC64() ? PPC::LR8 : PPC::LR;
+ IntVal = 8;
+ } else if (Name.equals_lower("ctr")) {
+ RegNo = isPPC64() ? PPC::CTR8 : PPC::CTR;
+ IntVal = 9;
+ } else if (Name.equals_lower("vrsave")) {
+ RegNo = PPC::VRSAVE;
+ IntVal = 256;
+ } else if (Name.startswith_lower("r") &&
+ !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
+ RegNo = isPPC64() ? XRegs[IntVal] : RRegs[IntVal];
+ } else if (Name.startswith_lower("f") &&
+ !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
+ RegNo = FRegs[IntVal];
+ } else if (Name.startswith_lower("vs") &&
+ !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 64) {
+ RegNo = VSRegs[IntVal];
+ } else if (Name.startswith_lower("v") &&
+ !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
+ RegNo = VRegs[IntVal];
+ } else if (Name.startswith_lower("cr") &&
+ !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 8) {
+ RegNo = CRRegs[IntVal];
+ } else
+ return true;
+ getParser().Lex();
+ return false;
}
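The rewritten matcher now optionally eats a leading '%' itself and otherwise recognizes the same spellings (lr, ctr, vrsave, rN, fN, vsN, vN, crN) case-insensitively, with the QPX qN form dropped. A rough standalone sketch of the name-to-number part only (plain C++, no MC plumbing; parseRegName and its bounds are illustrative):

#include <cassert>
#include <cctype>
#include <cstdint>
#include <string>

// Parse a register spelling such as "r31", "f5", "vs63", "v7", "cr6", "lr",
// "ctr" or "vrsave" (case-insensitive, optional leading '%') into its number.
static bool parseRegName(std::string Name, int64_t &IntVal) {
  for (char &C : Name)
    C = static_cast<char>(std::tolower(static_cast<unsigned char>(C)));
  if (!Name.empty() && Name[0] == '%')
    Name.erase(0, 1); // the '%' sigil is optional
  auto number = [&](size_t Skip, int64_t Limit) {
    std::string Digits = Name.substr(Skip);
    if (Digits.empty() || Digits.size() > 10)
      return false;
    int64_t V = 0;
    for (char C : Digits) {
      if (!std::isdigit(static_cast<unsigned char>(C)))
        return false;
      V = V * 10 + (C - '0');
    }
    if (V >= Limit)
      return false;
    IntVal = V;
    return true;
  };
  if (Name == "lr")     { IntVal = 8;   return true; }
  if (Name == "ctr")    { IntVal = 9;   return true; }
  if (Name == "vrsave") { IntVal = 256; return true; }
  if (Name.rfind("vs", 0) == 0) return number(2, 64);
  if (Name.rfind("cr", 0) == 0) return number(2, 8);
  if (Name.rfind("r", 0) == 0)  return number(1, 32);
  if (Name.rfind("f", 0) == 0)  return number(1, 32);
  if (Name.rfind("v", 0) == 0)  return number(1, 32);
  return false; // QPX "qN" names are no longer accepted
}

int main() {
  int64_t V = 0;
  assert(parseRegName("%r31", V) && V == 31);
  assert(parseRegName("VS63", V) && V == 63);
  assert(parseRegName("cr7", V) && V == 7);
  assert(!parseRegName("q5", V));
}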
bool PPCAsmParser::
@@ -1387,10 +1404,6 @@ FixupVariantKind(const MCExpr *E) {
/// it handles modifiers.
bool PPCAsmParser::
ParseExpression(const MCExpr *&EVal) {
-
- if (isDarwin())
- return ParseDarwinExpression(EVal);
-
// (ELF Platforms)
// Handle \code @l/@ha \endcode
if (getParser().parseExpression(EVal))
@@ -1406,53 +1419,6 @@ ParseExpression(const MCExpr *&EVal) {
return false;
}
-/// ParseDarwinExpression. (MachO Platforms)
-/// This differs from the default "parseExpression" in that it handles detection
-/// of the \code hi16(), ha16() and lo16() \endcode modifiers. At present,
-/// parseExpression() doesn't recognise the modifiers when in the Darwin/MachO
-/// syntax form so it is done here. TODO: Determine if there is merit in
-/// arranging for this to be done at a higher level.
-bool PPCAsmParser::
-ParseDarwinExpression(const MCExpr *&EVal) {
- MCAsmParser &Parser = getParser();
- PPCMCExpr::VariantKind Variant = PPCMCExpr::VK_PPC_None;
- switch (getLexer().getKind()) {
- default:
- break;
- case AsmToken::Identifier:
- // Compiler-generated Darwin identifiers begin with L,l,_ or "; thus
- // something starting with any other char should be part of the
- // asm syntax. If handwritten asm includes an identifier like lo16,
- // then all bets are off - but no-one would do that, right?
- StringRef poss = Parser.getTok().getString();
- if (poss.equals_lower("lo16")) {
- Variant = PPCMCExpr::VK_PPC_LO;
- } else if (poss.equals_lower("hi16")) {
- Variant = PPCMCExpr::VK_PPC_HI;
- } else if (poss.equals_lower("ha16")) {
- Variant = PPCMCExpr::VK_PPC_HA;
- }
- if (Variant != PPCMCExpr::VK_PPC_None) {
- Parser.Lex(); // Eat the xx16
- if (getLexer().isNot(AsmToken::LParen))
- return Error(Parser.getTok().getLoc(), "expected '('");
- Parser.Lex(); // Eat the '('
- }
- break;
- }
-
- if (getParser().parseExpression(EVal))
- return true;
-
- if (Variant != PPCMCExpr::VK_PPC_None) {
- if (getLexer().isNot(AsmToken::RParen))
- return Error(Parser.getTok().getLoc(), "expected ')'");
- Parser.Lex(); // Eat the ')'
- EVal = PPCMCExpr::create(Variant, EVal, getParser().getContext());
- }
- return false;
-}
-
/// ParseOperand
/// This handles registers in the form 'NN', '%rNN' for ELF platforms and
/// rNN for MachO.
@@ -1466,8 +1432,7 @@ bool PPCAsmParser::ParseOperand(OperandVector &Operands) {
switch (getLexer().getKind()) {
// Special handling for register names. These are interpreted
// as immediates corresponding to the register number.
- case AsmToken::Percent:
- Parser.Lex(); // Eat the '%'.
+ case AsmToken::Percent: {
unsigned RegNo;
int64_t IntVal;
if (MatchRegisterName(RegNo, IntVal))
@@ -1475,7 +1440,7 @@ bool PPCAsmParser::ParseOperand(OperandVector &Operands) {
Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64()));
return false;
-
+ }
case AsmToken::Identifier:
case AsmToken::LParen:
case AsmToken::Plus:
@@ -1485,20 +1450,6 @@ bool PPCAsmParser::ParseOperand(OperandVector &Operands) {
case AsmToken::Dollar:
case AsmToken::Exclaim:
case AsmToken::Tilde:
- // Note that non-register-name identifiers from the compiler will begin
- // with '_', 'L'/'l' or '"'. Of course, handwritten asm could include
- // identifiers like r31foo - so we fall through in the event that parsing
- // a register name fails.
- if (isDarwin()) {
- unsigned RegNo;
- int64_t IntVal;
- if (!MatchRegisterName(RegNo, IntVal)) {
- Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64()));
- return false;
- }
- }
- // All other expressions
-
if (!ParseExpression(EVal))
break;
// Fall-through
@@ -1537,29 +1488,18 @@ bool PPCAsmParser::ParseOperand(OperandVector &Operands) {
int64_t IntVal;
switch (getLexer().getKind()) {
- case AsmToken::Percent:
- Parser.Lex(); // Eat the '%'.
+ case AsmToken::Percent: {
unsigned RegNo;
if (MatchRegisterName(RegNo, IntVal))
return Error(S, "invalid register name");
break;
-
+ }
case AsmToken::Integer:
- if (isDarwin())
- return Error(S, "unexpected integer value");
- else if (getParser().parseAbsoluteExpression(IntVal) || IntVal < 0 ||
- IntVal > 31)
+ if (getParser().parseAbsoluteExpression(IntVal) || IntVal < 0 ||
+ IntVal > 31)
return Error(S, "invalid register number");
break;
- case AsmToken::Identifier:
- if (isDarwin()) {
- unsigned RegNo;
- if (!MatchRegisterName(RegNo, IntVal)) {
- break;
- }
- }
- LLVM_FALLTHROUGH;
-
+ case AsmToken::Identifier:
default:
return Error(S, "invalid memory operand");
}
@@ -1643,12 +1583,7 @@ bool PPCAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
/// ParseDirective parses the PPC specific directives
bool PPCAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getIdentifier();
- if (isDarwin()) {
- if (IDVal == ".machine")
- ParseDarwinDirectiveMachine(DirectiveID.getLoc());
- else
- return true;
- } else if (IDVal == ".word")
+ if (IDVal == ".word")
ParseDirectiveWord(2, DirectiveID);
else if (IDVal == ".llong")
ParseDirectiveWord(8, DirectiveID);
@@ -1720,11 +1655,7 @@ bool PPCAsmParser::ParseDirectiveMachine(SMLoc L) {
// FIXME: Right now, the parser always allows any available
// instruction, so the .machine directive is not useful.
- // Implement ".machine any" (by doing nothing) for the benefit
- // of existing assembler code. Likewise, we can then implement
- // ".machine push" and ".machine pop" as no-op.
- if (CPU != "any" && CPU != "push" && CPU != "pop")
- return TokError("unrecognized machine type");
+ // In the wild, any/push/pop/ppc64/altivec/power[4-9] are seen.
Parser.Lex();
@@ -1739,31 +1670,6 @@ bool PPCAsmParser::ParseDirectiveMachine(SMLoc L) {
return false;
}
-/// ParseDarwinDirectiveMachine (Mach-o platforms)
-/// ::= .machine cpu-identifier
-bool PPCAsmParser::ParseDarwinDirectiveMachine(SMLoc L) {
- MCAsmParser &Parser = getParser();
- if (Parser.getTok().isNot(AsmToken::Identifier) &&
- Parser.getTok().isNot(AsmToken::String))
- return Error(L, "unexpected token in directive");
-
- StringRef CPU = Parser.getTok().getIdentifier();
- Parser.Lex();
-
- // FIXME: this is only the 'default' set of cpu variants.
- // However we don't act on this information at present, this is simply
- // allowing parsing to proceed with minimal sanity checking.
- if (check(CPU != "ppc7400" && CPU != "ppc" && CPU != "ppc64", L,
- "unrecognized cpu type") ||
- check(isPPC64() && (CPU == "ppc7400" || CPU == "ppc"), L,
- "wrong cpu type specified for 64bit") ||
- check(!isPPC64() && CPU == "ppc64", L,
- "wrong cpu type specified for 32bit") ||
- parseToken(AsmToken::EndOfStatement))
- return addErrorSuffix(" in '.machine' directive");
- return false;
-}
-
/// ParseDirectiveAbiVersion
/// ::= .abiversion constant-expression
bool PPCAsmParser::ParseDirectiveAbiVersion(SMLoc L) {
@@ -1809,8 +1715,9 @@ bool PPCAsmParser::ParseDirectiveLocalEntry(SMLoc L) {
/// Force static initialization.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCAsmParser() {
RegisterMCAsmParser<PPCAsmParser> A(getThePPC32Target());
- RegisterMCAsmParser<PPCAsmParser> B(getThePPC64Target());
- RegisterMCAsmParser<PPCAsmParser> C(getThePPC64LETarget());
+ RegisterMCAsmParser<PPCAsmParser> B(getThePPC32LETarget());
+ RegisterMCAsmParser<PPCAsmParser> C(getThePPC64Target());
+ RegisterMCAsmParser<PPCAsmParser> D(getThePPC64LETarget());
}
#define GET_REGISTER_MATCHER
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index 74c6fd3733f0..3e9286fb0b30 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -54,6 +54,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCDisassembler() {
// Register the disassembler for each target.
TargetRegistry::RegisterMCDisassembler(getThePPC32Target(),
createPPCDisassembler);
+ TargetRegistry::RegisterMCDisassembler(getThePPC32LETarget(),
+ createPPCLEDisassembler);
TargetRegistry::RegisterMCDisassembler(getThePPC64Target(),
createPPCDisassembler);
TargetRegistry::RegisterMCDisassembler(getThePPC64LETarget(),
@@ -167,18 +169,24 @@ static DecodeStatus DecodeG8RC_NOX0RegisterClass(MCInst &Inst, uint64_t RegNo,
#define DecodePointerLikeRegClass0 DecodeGPRCRegisterClass
#define DecodePointerLikeRegClass1 DecodeGPRC_NOR0RegisterClass
-static DecodeStatus DecodeQFRCRegisterClass(MCInst &Inst, uint64_t RegNo,
- uint64_t Address,
- const void *Decoder) {
- return decodeRegisterClass(Inst, RegNo, QFRegs);
-}
-
static DecodeStatus DecodeSPERCRegisterClass(MCInst &Inst, uint64_t RegNo,
uint64_t Address,
const void *Decoder) {
return decodeRegisterClass(Inst, RegNo, SPERegs);
}
+static DecodeStatus DecodeACCRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, ACCRegs);
+}
+
+static DecodeStatus DecodeVSRpRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, VSRpRegs);
+}
+
#define DecodeQSRCRegisterClass DecodeQFRCRegisterClass
#define DecodeQBRCRegisterClass DecodeQFRCRegisterClass
@@ -206,6 +214,15 @@ static DecodeStatus decodeImmZeroOperand(MCInst &Inst, uint64_t Imm,
return MCDisassembler::Success;
}
+static DecodeStatus decodeVSRpEvenOperands(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo & 1)
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::createReg(VSRpRegs[RegNo >> 1]));
+ return MCDisassembler::Success;
+}
+
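Both the new assembler operand (getVSRpEvenReg in PPCAsmParser.cpp above) and this decoder treat a VSR pair as an even 6-bit VSR number whose pair index is the number shifted right by one, with odd encodings rejected. A tiny standalone sketch (plain C++; vsrPairIndex is an illustrative name):

#include <cassert>
#include <cstdint>
#include <optional>

// Map an even VSR number (0, 2, ..., 62) to its VSR-pair index; odd or
// out-of-range encodings are rejected, mirroring the checks above.
static std::optional<unsigned> vsrPairIndex(uint64_t RegNo) {
  if (RegNo > 63 || (RegNo & 1))
    return std::nullopt;
  return static_cast<unsigned>(RegNo >> 1);
}

int main() {
  assert(vsrPairIndex(0) == 0u);
  assert(vsrPairIndex(62) == 31u);
  assert(!vsrPairIndex(3).has_value());
}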
static DecodeStatus decodeMemRIOperands(MCInst &Inst, uint64_t Imm,
int64_t Address, const void *Decoder) {
// Decode the memri field (imm, reg), which has the low 16-bits as the
@@ -401,14 +418,9 @@ DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// Read the instruction in the proper endianness.
uint64_t Inst = ReadFunc(Bytes.data());
- if (STI.getFeatureBits()[PPC::FeatureQPX]) {
- DecodeStatus result =
- decodeInstruction(DecoderTableQPX32, MI, Inst, Address, this, STI);
- if (result != MCDisassembler::Fail)
- return result;
- } else if (STI.getFeatureBits()[PPC::FeatureSPE]) {
+ if (STI.getFeatureBits()[PPC::FeatureSPE]) {
DecodeStatus result =
- decodeInstruction(DecoderTableSPE32, MI, Inst, Address, this, STI);
+ decodeInstruction(DecoderTableSPE32, MI, Inst, Address, this, STI);
if (result != MCDisassembler::Fail)
return result;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp
new file mode 100644
index 000000000000..e8f8cbfee6ee
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp
@@ -0,0 +1,53 @@
+//===-- PPCCallLowering.cpp - Call lowering for GlobalISel -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the lowering of LLVM calls to machine code calls for
+/// GlobalISel.
+///
+//===----------------------------------------------------------------------===//
+
+#include "PPCCallLowering.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "ppc-call-lowering"
+
+using namespace llvm;
+
+PPCCallLowering::PPCCallLowering(const PPCTargetLowering &TLI)
+ : CallLowering(&TLI) {}
+
+bool PPCCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+ const Value *Val, ArrayRef<Register> VRegs,
+ FunctionLoweringInfo &FLI,
+ Register SwiftErrorVReg) const {
+ assert(((Val && !VRegs.empty()) || (!Val && VRegs.empty())) &&
+ "Return value without a vreg");
+ if (VRegs.size() > 0)
+ return false;
+
+ MIRBuilder.buildInstr(PPC::BLR8);
+ return true;
+}
+
+bool PPCCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+ const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs,
+ FunctionLoweringInfo &FLI) const {
+
+  // If VRegs is empty, then there are no formal arguments to lower, and we can
+  // always return true. If there are formal arguments, we currently do not
+ // handle them and thus return false.
+ return VRegs.empty();
+}
+
+bool PPCCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info) const {
+ return false;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.h
new file mode 100644
index 000000000000..5a449f4cab1b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.h
@@ -0,0 +1,40 @@
+//===-- PPCCallLowering.h - Call lowering for GlobalISel -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes how to lower LLVM calls to machine code calls.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_GISEL_PPCCALLLOWERING_H
+#define LLVM_LIB_TARGET_POWERPC_GISEL_PPCCALLLOWERING_H
+
+#include "PPCISelLowering.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/IR/CallingConv.h"
+
+namespace llvm {
+
+class PPCTargetLowering;
+
+class PPCCallLowering : public CallLowering {
+public:
+ PPCCallLowering(const PPCTargetLowering &TLI);
+
+ bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
+ ArrayRef<Register> VRegs, FunctionLoweringInfo &FLI,
+ Register SwiftErrorVReg) const override;
+ bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs,
+ FunctionLoweringInfo &FLI) const override;
+ bool lowerCall(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info) const override;
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp
new file mode 100644
index 000000000000..7d64816ed6c7
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp
@@ -0,0 +1,92 @@
+//===- PPCInstructionSelector.cpp --------------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the InstructionSelector class for
+/// PowerPC.
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "PPCRegisterBankInfo.h"
+#include "PPCSubtarget.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/IntrinsicsPowerPC.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "ppc-gisel"
+
+using namespace llvm;
+
+namespace {
+
+#define GET_GLOBALISEL_PREDICATE_BITSET
+#include "PPCGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATE_BITSET
+
+class PPCInstructionSelector : public InstructionSelector {
+public:
+ PPCInstructionSelector(const PPCTargetMachine &TM, const PPCSubtarget &STI,
+ const PPCRegisterBankInfo &RBI);
+
+ bool select(MachineInstr &I) override;
+ static const char *getName() { return DEBUG_TYPE; }
+
+private:
+ /// tblgen generated 'select' implementation that is used as the initial
+ /// selector for the patterns that do not require complex C++.
+ bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
+
+ const PPCInstrInfo &TII;
+ const PPCRegisterInfo &TRI;
+ const PPCRegisterBankInfo &RBI;
+
+#define GET_GLOBALISEL_PREDICATES_DECL
+#include "PPCGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_DECL
+
+#define GET_GLOBALISEL_TEMPORARIES_DECL
+#include "PPCGenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_DECL
+};
+
+} // end anonymous namespace
+
+#define GET_GLOBALISEL_IMPL
+#include "PPCGenGlobalISel.inc"
+#undef GET_GLOBALISEL_IMPL
+
+PPCInstructionSelector::PPCInstructionSelector(const PPCTargetMachine &TM,
+ const PPCSubtarget &STI,
+ const PPCRegisterBankInfo &RBI)
+ : InstructionSelector(), TII(*STI.getInstrInfo()),
+ TRI(*STI.getRegisterInfo()), RBI(RBI),
+#define GET_GLOBALISEL_PREDICATES_INIT
+#include "PPCGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_INIT
+#define GET_GLOBALISEL_TEMPORARIES_INIT
+#include "PPCGenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_INIT
+{
+}
+
+bool PPCInstructionSelector::select(MachineInstr &I) {
+ if (selectImpl(I, *CoverageInfo))
+ return true;
+ return false;
+}
+
+namespace llvm {
+InstructionSelector *
+createPPCInstructionSelector(const PPCTargetMachine &TM,
+ const PPCSubtarget &Subtarget,
+ const PPCRegisterBankInfo &RBI) {
+ return new PPCInstructionSelector(TM, Subtarget, RBI);
+}
+} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.cpp
new file mode 100644
index 000000000000..c16bcaea592b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.cpp
@@ -0,0 +1,20 @@
+//===- PPCLegalizerInfo.cpp --------------------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the Machinelegalizer class for PowerPC
+//===----------------------------------------------------------------------===//
+
+#include "PPCLegalizerInfo.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "ppc-legalinfo"
+
+using namespace llvm;
+using namespace LegalizeActions;
+
+PPCLegalizerInfo::PPCLegalizerInfo(const PPCSubtarget &ST) { computeTables(); }
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.h
new file mode 100644
index 000000000000..c73186d3d0c1
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.h
@@ -0,0 +1,28 @@
+//===- PPCLegalizerInfo.h ----------------------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the Machinelegalizer class for PowerPC
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_GISEL_PPCMACHINELEGALIZER_H
+#define LLVM_LIB_TARGET_POWERPC_GISEL_PPCMACHINELEGALIZER_H
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+
+namespace llvm {
+
+class PPCSubtarget;
+
+/// This class provides the information for the PowerPC target legalizer for
+/// GlobalISel.
+class PPCLegalizerInfo : public LegalizerInfo {
+public:
+ PPCLegalizerInfo(const PPCSubtarget &ST);
+};
+} // namespace llvm
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp
new file mode 100644
index 000000000000..6af79324919c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp
@@ -0,0 +1,27 @@
+//===- PPCRegisterBankInfo.cpp --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the RegisterBankInfo class for
+/// PowerPC.
+//===----------------------------------------------------------------------===//
+
+#include "PPCRegisterBankInfo.h"
+#include "PPCRegisterInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "ppc-reg-bank-info"
+
+#define GET_TARGET_REGBANK_IMPL
+#include "PPCGenRegisterBank.inc"
+
+using namespace llvm;
+
+PPCRegisterBankInfo::PPCRegisterBankInfo(const TargetRegisterInfo &TRI)
+ : PPCGenRegisterBankInfo() {}
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h
new file mode 100644
index 000000000000..358d5ed3cf14
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h
@@ -0,0 +1,39 @@
+//===-- PPCRegisterBankInfo.h -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the targeting of the RegisterBankInfo class for PowerPC.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_PPC_GISEL_PPCREGISTERBANKINFO_H
+#define LLVM_LIB_TARGET_PPC_GISEL_PPCREGISTERBANKINFO_H
+
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+
+#define GET_REGBANK_DECLARATIONS
+#include "PPCGenRegisterBank.inc"
+
+namespace llvm {
+class TargetRegisterInfo;
+
+class PPCGenRegisterBankInfo : public RegisterBankInfo {
+protected:
+#define GET_TARGET_REGBANK_CLASS
+#include "PPCGenRegisterBank.inc"
+};
+
+class PPCRegisterBankInfo final : public PPCGenRegisterBankInfo {
+public:
+ PPCRegisterBankInfo(const TargetRegisterInfo &TRI);
+};
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCRegisterBanks.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCRegisterBanks.td
new file mode 100644
index 000000000000..0e8a4b7061c5
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/GISel/PPCRegisterBanks.td
@@ -0,0 +1,15 @@
+//===-- PPCRegisterBanks.td - Describe the PPC Banks -------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Define the PPC register banks used for GlobalISel.
+///
+//===----------------------------------------------------------------------===//
+
+/// General Purpose Registers
+def GPRRegBank : RegisterBank<"GPR", [G8RC]>;
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index dbaf221db9fc..72401668c8d0 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -46,6 +46,7 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
case PPC::fixup_ppc_half16ds:
return Value & 0xfffc;
case PPC::fixup_ppc_pcrel34:
+ case PPC::fixup_ppc_imm34:
return Value & 0x3ffffffff;
}
}
@@ -68,6 +69,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
case PPC::fixup_ppc_br24_notoc:
return 4;
case PPC::fixup_ppc_pcrel34:
+ case PPC::fixup_ppc_imm34:
case FK_Data_8:
return 8;
case PPC::fixup_ppc_nofixup:
@@ -100,6 +102,7 @@ public:
{ "fixup_ppc_half16", 0, 16, 0 },
{ "fixup_ppc_half16ds", 0, 14, 0 },
{ "fixup_ppc_pcrel34", 0, 34, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_ppc_imm34", 0, 34, 0 },
{ "fixup_ppc_nofixup", 0, 0, 0 }
};
const static MCFixupKindInfo InfosLE[PPC::NumTargetFixupKinds] = {
@@ -112,6 +115,7 @@ public:
{ "fixup_ppc_half16", 0, 16, 0 },
{ "fixup_ppc_half16ds", 2, 14, 0 },
{ "fixup_ppc_pcrel34", 0, 34, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_ppc_imm34", 0, 34, 0 },
{ "fixup_ppc_nofixup", 0, 0, 0 }
};
@@ -178,12 +182,6 @@ public:
}
}
- bool mayNeedRelaxation(const MCInst &Inst,
- const MCSubtargetInfo &STI) const override {
- // FIXME.
- return false;
- }
-
bool fixupNeedsRelaxation(const MCFixup &Fixup,
uint64_t Value,
const MCRelaxableFragment *DF,
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index d8b3301e97f1..94ef7b45434f 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -138,6 +138,15 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
case MCSymbolRefExpr::VK_PPC_GOT_PCREL:
Type = ELF::R_PPC64_GOT_PCREL34;
break;
+ case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_PCREL:
+ Type = ELF::R_PPC64_GOT_TLSGD_PCREL34;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_PCREL:
+ Type = ELF::R_PPC64_GOT_TLSLD_PCREL34;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_TPREL_PCREL:
+ Type = ELF::R_PPC64_GOT_TPREL_PCREL34;
+ break;
}
break;
case FK_Data_4:
@@ -407,6 +416,21 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
else
Type = ELF::R_PPC_TLS;
break;
+ case MCSymbolRefExpr::VK_PPC_TLS_PCREL:
+ Type = ELF::R_PPC64_TLS;
+ break;
+ }
+ break;
+ case PPC::fixup_ppc_imm34:
+ switch (Modifier) {
+ default:
+ report_fatal_error("Unsupported Modifier for fixup_ppc_imm34.");
+ case MCSymbolRefExpr::VK_DTPREL:
+ Type = ELF::R_PPC64_DTPREL34;
+ break;
+ case MCSymbolRefExpr::VK_TPREL:
+ Type = ELF::R_PPC64_TPREL34;
+ break;
}
break;
case FK_Data_8:
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
index 4373778cc96c..386d59266096 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
@@ -20,6 +20,7 @@
#include "PPCELFStreamer.h"
+#include "PPCFixupKinds.h"
#include "PPCInstrInfo.h"
#include "PPCMCCodeEmitter.h"
#include "llvm/BinaryFormat/ELF.h"
@@ -89,12 +90,33 @@ void PPCELFStreamer::emitInstruction(const MCInst &Inst,
PPCMCCodeEmitter *Emitter =
static_cast<PPCMCCodeEmitter*>(getAssembler().getEmitterPtr());
+ // If the instruction is a part of the GOT to PC-Rel link time optimization
+ // instruction pair, return a value, otherwise return None. A true returned
+ // value means the instruction is the PLDpc and a false value means it is
+ // the user instruction.
+ Optional<bool> IsPartOfGOTToPCRelPair = isPartOfGOTToPCRelPair(Inst, STI);
+
+ // User of the GOT-indirect address.
+ // For example, the load that will get the relocation as follows:
+ // .reloc .Lpcrel1-8,R_PPC64_PCREL_OPT,.-(.Lpcrel1-8)
+ // lwa 3, 4(3)
+ if (IsPartOfGOTToPCRelPair.hasValue() && !IsPartOfGOTToPCRelPair.getValue())
+ emitGOTToPCRelReloc(Inst);
+
// Special handling is only for prefixed instructions.
if (!Emitter->isPrefixedInstruction(Inst)) {
MCELFStreamer::emitInstruction(Inst, STI);
return;
}
emitPrefixedInstruction(Inst, STI);
+
+ // Producer of the GOT-indirect address.
+ // For example, the prefixed load from the got that will get the label as
+ // follows:
+ // pld 3, vec@got@pcrel(0), 1
+ // .Lpcrel1:
+ if (IsPartOfGOTToPCRelPair.hasValue() && IsPartOfGOTToPCRelPair.getValue())
+ emitGOTToPCRelLabel(Inst);
}
void PPCELFStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
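emitInstruction above classifies each instruction three ways: not part of a GOT-to-PC-relative pair (None), the producer PLDpc (true, which gets the trailing label), or the consumer load/store (false, which gets the R_PPC64_PCREL_OPT .reloc emitted just before it). A standalone sketch of that three-way answer (plain C++; ToyInst and the flag standing in for VK_PPC_PCREL_OPT are illustrative):

#include <iostream>
#include <optional>
#include <string>

// Toy model of one emitted instruction: its opcode and whether its last
// operand is a symbol with the VK_PPC_PCREL_OPT variant kind.
struct ToyInst {
  std::string Opcode;
  bool LastOperandIsPCRelOptSym;
};

// std::nullopt: not part of a pair; true: producer (PLDpc, label goes after
// it); false: consumer (user load/store, .reloc goes before it).
static std::optional<bool> classifyGOTToPCRelPair(const ToyInst &I) {
  if (!I.LastOperandIsPCRelOptSym)
    return std::nullopt;
  return I.Opcode == "PLDpc";
}

int main() {
  ToyInst Pld{"PLDpc", true}, Lwa{"LWA", true}, Add{"ADD8", false};
  std::cout << classifyGOTToPCRelPair(Pld).value() << "\n";     // 1: producer
  std::cout << classifyGOTToPCRelPair(Lwa).value() << "\n";     // 0: consumer
  std::cout << classifyGOTToPCRelPair(Add).has_value() << "\n"; // 0: unrelated
}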
@@ -103,6 +125,102 @@ void PPCELFStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
MCELFStreamer::emitLabel(Symbol);
}
+// This link-time GOT PC-relative optimization sequence will look like this:
+// pld <reg> symbol@got@pcrel
+// <Label###>:
+// .reloc Label###-8,R_PPC64_PCREL_OPT,.-(Label###-8)
+// load <loadedreg>, 0(<reg>)
+// The reason we place the label after the PLDpc instruction is that there
+// may be an alignment nop before it since prefixed instructions must not
+// cross a 64-byte boundary (please see
+// PPCELFStreamer::emitPrefixedInstruction()). When referring to the
+// label, we subtract the width of a prefixed instruction (8 bytes) to ensure
+// we refer to the PLDpc.
+void PPCELFStreamer::emitGOTToPCRelReloc(const MCInst &Inst) {
+ // Get the last operand which contains the symbol.
+ const MCOperand &Operand = Inst.getOperand(Inst.getNumOperands() - 1);
+ assert(Operand.isExpr() && "Expecting an MCExpr.");
+ // Cast the last operand to MCSymbolRefExpr to get the symbol.
+ const MCExpr *Expr = Operand.getExpr();
+ const MCSymbolRefExpr *SymExpr = static_cast<const MCSymbolRefExpr *>(Expr);
+ assert(SymExpr->getKind() == MCSymbolRefExpr::VK_PPC_PCREL_OPT &&
+ "Expecting a symbol of type VK_PPC_PCREL_OPT");
+ MCSymbol *LabelSym =
+ getContext().getOrCreateSymbol(SymExpr->getSymbol().getName());
+ const MCExpr *LabelExpr = MCSymbolRefExpr::create(LabelSym, getContext());
+ const MCExpr *Eight = MCConstantExpr::create(8, getContext());
+ // SubExpr is just Label###-8
+ const MCExpr *SubExpr =
+ MCBinaryExpr::createSub(LabelExpr, Eight, getContext());
+ MCSymbol *CurrentLocation = getContext().createTempSymbol();
+ const MCExpr *CurrentLocationExpr =
+ MCSymbolRefExpr::create(CurrentLocation, getContext());
+ // SubExpr2 is .-(Label###-8)
+ const MCExpr *SubExpr2 =
+ MCBinaryExpr::createSub(CurrentLocationExpr, SubExpr, getContext());
+
+ MCDataFragment *DF = static_cast<MCDataFragment *>(LabelSym->getFragment());
+ assert(DF && "Expecting a valid data fragment.");
+ MCFixupKind FixupKind = static_cast<MCFixupKind>(FirstLiteralRelocationKind +
+ ELF::R_PPC64_PCREL_OPT);
+ DF->getFixups().push_back(
+ MCFixup::create(LabelSym->getOffset() - 8, SubExpr2,
+ FixupKind, Inst.getLoc()));
+ emitLabel(CurrentLocation, Inst.getLoc());
+}
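To make the Label###-8 arithmetic above concrete, here is a small standalone sketch (plain C++, not LLVM code; the addresses are invented for illustration). Because the label is emitted immediately after the 8-byte prefixed PLDpc, subtracting 8 always lands on the PLDpc itself, regardless of any alignment nop placed before it.

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t PldStart = 0x1003c;      // PLDpc start; an alignment nop may precede it
  uint64_t Label = PldStart + 8;    // label emitted right after the 8-byte PLDpc
  uint64_t RelocTarget = Label - 8; // .reloc Label-8 therefore marks the PLDpc
  std::printf("R_PPC64_PCREL_OPT applies at 0x%llx\n",
              (unsigned long long)RelocTarget);
  return 0;
}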
+
+// Emit the label that immediately follows the PLDpc for a link time GOT PC Rel
+// optimization.
+void PPCELFStreamer::emitGOTToPCRelLabel(const MCInst &Inst) {
+ // Get the last operand which contains the symbol.
+ const MCOperand &Operand = Inst.getOperand(Inst.getNumOperands() - 1);
+ assert(Operand.isExpr() && "Expecting an MCExpr.");
+ // Cast the last operand to MCSymbolRefExpr to get the symbol.
+ const MCExpr *Expr = Operand.getExpr();
+ const MCSymbolRefExpr *SymExpr = static_cast<const MCSymbolRefExpr *>(Expr);
+ assert(SymExpr->getKind() == MCSymbolRefExpr::VK_PPC_PCREL_OPT &&
+ "Expecting a symbol of type VK_PPC_PCREL_OPT");
+ MCSymbol *LabelSym =
+ getContext().getOrCreateSymbol(SymExpr->getSymbol().getName());
+ emitLabel(LabelSym, Inst.getLoc());
+}
+
+// This function checks if the parameter Inst is part of the setup for a link
+// time GOT PC Relative optimization. For example in this situation:
+// <MCInst PLDpc <MCOperand Reg:282> <MCOperand Expr:(glob_double@got@pcrel)>
+// <MCOperand Imm:0> <MCOperand Expr:(.Lpcrel@<<invalid>>)>>
+// <MCInst SOME_LOAD <MCOperand Reg:22> <MCOperand Imm:0> <MCOperand Reg:282>
+// <MCOperand Expr:(.Lpcrel@<<invalid>>)>>
+// The above is a pair of such instructions and this function will not return
+// None for either one of them. In both cases we are looking for the last
+// operand <MCOperand Expr:(.Lpcrel@<<invalid>>)> which needs to be an MCExpr
+// and has the flag MCSymbolRefExpr::VK_PPC_PCREL_OPT. After that we just look
+// at the opcode and in the case of PLDpc we will return true. For the load
+// (or store) this function will return false, indicating it has found the
+// second instruction in the pair.
+Optional<bool> llvm::isPartOfGOTToPCRelPair(const MCInst &Inst,
+ const MCSubtargetInfo &STI) {
+ // Need at least two operands.
+ if (Inst.getNumOperands() < 2)
+ return None;
+
+ unsigned LastOp = Inst.getNumOperands() - 1;
+ // The last operand needs to be an MCExpr and it needs to have a variant kind
+ // of VK_PPC_PCREL_OPT. If it does not satisfy these conditions it is not a
+ // link time GOT PC Rel opt instruction and we can ignore it and return None.
+ const MCOperand &Operand = Inst.getOperand(LastOp);
+ if (!Operand.isExpr())
+ return None;
+
+ // Check for the variant kind VK_PPC_PCREL_OPT in this expression.
+ const MCExpr *Expr = Operand.getExpr();
+ const MCSymbolRefExpr *SymExpr = static_cast<const MCSymbolRefExpr *>(Expr);
+ if (!SymExpr || SymExpr->getKind() != MCSymbolRefExpr::VK_PPC_PCREL_OPT)
+ return None;
+
+ return (Inst.getOpcode() == PPC::PLDpc);
+}
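As a rough standalone illustration of the tri-state contract described above (a hedged sketch using std::optional and invented types, not the LLVM API): a present true marks the producer PLDpc, a present false marks the consuming load or store, and an empty optional means the instruction is unrelated to the pair.

#include <cstdio>
#include <optional>

enum class Opcode { PLDpc, LWA, ADD };

struct ToyInst {
  Opcode Op;
  bool LastOpIsPCRelOptSym; // stands in for the VK_PPC_PCREL_OPT check
};

// Hypothetical helper mirroring isPartOfGOTToPCRelPair's contract.
static std::optional<bool> classify(const ToyInst &I) {
  if (!I.LastOpIsPCRelOptSym)
    return std::nullopt;        // not part of a GOT->PC-rel pair
  return I.Op == Opcode::PLDpc; // true: producer, false: consumer
}

int main() {
  for (ToyInst I : {ToyInst{Opcode::PLDpc, true}, ToyInst{Opcode::LWA, true},
                    ToyInst{Opcode::ADD, false}}) {
    auto R = classify(I);
    if (!R)
      std::puts("unrelated instruction");
    else
      std::puts(*R ? "producer: emit the label after the PLDpc"
                   : "consumer: emit the R_PPC64_PCREL_OPT .reloc first");
  }
  return 0;
}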
+
MCELFStreamer *llvm::createPPCELFStreamer(
MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
std::unique_ptr<MCObjectWriter> OW,
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h
index 51863232d071..f44200104f32 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h
@@ -43,8 +43,15 @@ public:
void emitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
private:
void emitPrefixedInstruction(const MCInst &Inst, const MCSubtargetInfo &STI);
+ void emitGOTToPCRelReloc(const MCInst &Inst);
+ void emitGOTToPCRelLabel(const MCInst &Inst);
};
+// Check if the instruction Inst is part of a pair of instructions that make up
+// a link time GOT PC Rel optimization.
+Optional<bool> isPartOfGOTToPCRelPair(const MCInst &Inst,
+ const MCSubtargetInfo &STI);
+
MCELFStreamer *createPPCELFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> MAB,
std::unique_ptr<MCObjectWriter> OW,
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
index 2fb8947fd4e0..73292f7b7938 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
@@ -43,6 +43,9 @@ enum Fixups {
// A 34-bit fixup corresponding to PC-relative paddi.
fixup_ppc_pcrel34,
+ // A 34-bit fixup corresponding to non-PC-relative paddi.
+ fixup_ppc_imm34,
+
/// Not a true fixup, but ties a symbol to a call to __tls_get_addr for the
/// TLS general and local dynamic models, or inserts the thread-pointer
/// register number.
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
index 16da62a74b8c..a291a34d4c52 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
@@ -49,18 +49,6 @@ FullRegNamesWithPercent("ppc-reg-with-percent-prefix", cl::Hidden,
void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
const char *RegName = getRegisterName(RegNo);
- if (RegName[0] == 'q' /* QPX */) {
- // The system toolchain on the BG/Q does not understand QPX register names
- // in .cfi_* directives, so print the name of the floating-point
- // subregister instead.
- std::string RN(RegName);
-
- RN[0] = 'f';
- OS << RN;
-
- return;
- }
-
OS << RegName;
}
@@ -83,15 +71,45 @@ void PPCInstPrinter::printInst(const MCInst *MI, uint64_t Address,
"reference expression if it is an expression at all.");
O << "\taddis ";
- printOperand(MI, 0, O);
+ printOperand(MI, 0, STI, O);
O << ", ";
- printOperand(MI, 2, O);
+ printOperand(MI, 2, STI, O);
O << "(";
- printOperand(MI, 1, O);
+ printOperand(MI, 1, STI, O);
O << ")";
return;
}
+ // Check if the last operand is an expression with the variant kind
+ // VK_PPC_PCREL_OPT. If this is the case then this is a linker optimization
+ // relocation and the .reloc directive needs to be added.
+ unsigned LastOp = MI->getNumOperands() - 1;
+ if (MI->getNumOperands() > 1) {
+ const MCOperand &Operand = MI->getOperand(LastOp);
+ if (Operand.isExpr()) {
+ const MCExpr *Expr = Operand.getExpr();
+ const MCSymbolRefExpr *SymExpr =
+ static_cast<const MCSymbolRefExpr *>(Expr);
+
+ if (SymExpr && SymExpr->getKind() == MCSymbolRefExpr::VK_PPC_PCREL_OPT) {
+ const MCSymbol &Symbol = SymExpr->getSymbol();
+ if (MI->getOpcode() == PPC::PLDpc) {
+ printInstruction(MI, Address, STI, O);
+ O << "\n";
+ Symbol.print(O, &MAI);
+ O << ":";
+ return;
+ } else {
+ O << "\t.reloc ";
+ Symbol.print(O, &MAI);
+ O << "-8,R_PPC64_PCREL_OPT,.-(";
+ Symbol.print(O, &MAI);
+ O << "-8)\n";
+ }
+ }
+ }
+ }
+
// Check for slwi/srwi mnemonics.
if (MI->getOpcode() == PPC::RLWINM) {
unsigned char SH = MI->getOperand(2).getImm();
@@ -106,9 +124,9 @@ void PPCInstPrinter::printInst(const MCInst *MI, uint64_t Address,
SH = 32-SH;
}
if (useSubstituteMnemonic) {
- printOperand(MI, 0, O);
+ printOperand(MI, 0, STI, O);
O << ", ";
- printOperand(MI, 1, O);
+ printOperand(MI, 1, STI, O);
O << ", " << (unsigned int)SH;
printAnnotation(O, Annot);
@@ -123,9 +141,9 @@ void PPCInstPrinter::printInst(const MCInst *MI, uint64_t Address,
// rldicr RA, RS, SH, 63-SH == sldi RA, RS, SH
if (63-SH == ME) {
O << "\tsldi ";
- printOperand(MI, 0, O);
+ printOperand(MI, 0, STI, O);
O << ", ";
- printOperand(MI, 1, O);
+ printOperand(MI, 1, STI, O);
O << ", " << (unsigned int)SH;
printAnnotation(O, Annot);
return;
@@ -153,9 +171,9 @@ void PPCInstPrinter::printInst(const MCInst *MI, uint64_t Address,
if (IsBookE && TH != 0 && TH != 16)
O << (unsigned int) TH << ", ";
- printOperand(MI, 1, O);
+ printOperand(MI, 1, STI, O);
O << ", ";
- printOperand(MI, 2, O);
+ printOperand(MI, 2, STI, O);
if (!IsBookE && TH != 0 && TH != 16)
O << ", " << (unsigned int) TH;
@@ -166,29 +184,36 @@ void PPCInstPrinter::printInst(const MCInst *MI, uint64_t Address,
if (MI->getOpcode() == PPC::DCBF) {
unsigned char L = MI->getOperand(0).getImm();
- if (!L || L == 1 || L == 3) {
- O << "\tdcbf";
- if (L == 1 || L == 3)
+ if (!L || L == 1 || L == 3 || L == 4 || L == 6) {
+ O << "\tdcb";
+ if (L != 6)
+ O << "f";
+ if (L == 1)
O << "l";
if (L == 3)
- O << "p";
+ O << "lp";
+ if (L == 4)
+ O << "ps";
+ if (L == 6)
+ O << "stps";
O << " ";
- printOperand(MI, 1, O);
+ printOperand(MI, 1, STI, O);
O << ", ";
- printOperand(MI, 2, O);
+ printOperand(MI, 2, STI, O);
printAnnotation(O, Annot);
return;
}
}
- if (!printAliasInstr(MI, Address, O))
- printInstruction(MI, Address, O);
+ if (!printAliasInstr(MI, Address, STI, O))
+ printInstruction(MI, Address, STI, O);
printAnnotation(O, Annot);
}
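The dcbf handling above packs several mnemonics into one opcode via the L field. A small standalone sketch of the same mapping (plain C++, not printer code) for reference:

#include <cstdio>
#include <string>

// L=0 -> dcbf, L=1 -> dcbfl, L=3 -> dcbflp, L=4 -> dcbfps, L=6 -> dcbstps.
static std::string dcbfMnemonic(unsigned L) {
  std::string M = "dcb";
  if (L != 6)
    M += "f";
  if (L == 1)
    M += "l";
  if (L == 3)
    M += "lp";
  if (L == 4)
    M += "ps";
  if (L == 6)
    M += "stps";
  return M;
}

int main() {
  for (unsigned L : {0u, 1u, 3u, 4u, 6u})
    std::printf("L=%u -> %s\n", L, dcbfMnemonic(L).c_str());
  return 0;
}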
void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O,
const char *Modifier) {
unsigned Code = MI->getOperand(OpNo).getImm();
@@ -282,10 +307,11 @@ void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
assert(StringRef(Modifier) == "reg" &&
"Need to specify 'cc', 'pm' or 'reg' as predicate op modifier!");
- printOperand(MI, OpNo+1, O);
+ printOperand(MI, OpNo + 1, STI, O);
}
void PPCInstPrinter::printATBitsAsHint(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned Code = MI->getOperand(OpNo).getImm();
if (Code == 2)
@@ -295,6 +321,7 @@ void PPCInstPrinter::printATBitsAsHint(const MCInst *MI, unsigned OpNo,
}
void PPCInstPrinter::printU1ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned int Value = MI->getOperand(OpNo).getImm();
assert(Value <= 1 && "Invalid u1imm argument!");
@@ -302,6 +329,7 @@ void PPCInstPrinter::printU1ImmOperand(const MCInst *MI, unsigned OpNo,
}
void PPCInstPrinter::printU2ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned int Value = MI->getOperand(OpNo).getImm();
assert(Value <= 3 && "Invalid u2imm argument!");
@@ -309,6 +337,7 @@ void PPCInstPrinter::printU2ImmOperand(const MCInst *MI, unsigned OpNo,
}
void PPCInstPrinter::printU3ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned int Value = MI->getOperand(OpNo).getImm();
assert(Value <= 8 && "Invalid u3imm argument!");
@@ -316,6 +345,7 @@ void PPCInstPrinter::printU3ImmOperand(const MCInst *MI, unsigned OpNo,
}
void PPCInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned int Value = MI->getOperand(OpNo).getImm();
assert(Value <= 15 && "Invalid u4imm argument!");
@@ -323,6 +353,7 @@ void PPCInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo,
}
void PPCInstPrinter::printS5ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
int Value = MI->getOperand(OpNo).getImm();
Value = SignExtend32<5>(Value);
@@ -330,6 +361,7 @@ void PPCInstPrinter::printS5ImmOperand(const MCInst *MI, unsigned OpNo,
}
void PPCInstPrinter::printImmZeroOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned int Value = MI->getOperand(OpNo).getImm();
assert(Value == 0 && "Operand must be zero");
@@ -337,6 +369,7 @@ void PPCInstPrinter::printImmZeroOperand(const MCInst *MI, unsigned OpNo,
}
void PPCInstPrinter::printU5ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned int Value = MI->getOperand(OpNo).getImm();
assert(Value <= 31 && "Invalid u5imm argument!");
@@ -344,6 +377,7 @@ void PPCInstPrinter::printU5ImmOperand(const MCInst *MI, unsigned OpNo,
}
void PPCInstPrinter::printU6ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned int Value = MI->getOperand(OpNo).getImm();
assert(Value <= 63 && "Invalid u6imm argument!");
@@ -351,6 +385,7 @@ void PPCInstPrinter::printU6ImmOperand(const MCInst *MI, unsigned OpNo,
}
void PPCInstPrinter::printU7ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned int Value = MI->getOperand(OpNo).getImm();
assert(Value <= 127 && "Invalid u7imm argument!");
@@ -361,12 +396,14 @@ void PPCInstPrinter::printU7ImmOperand(const MCInst *MI, unsigned OpNo,
// of XXSPLTIB which are unsigned. So we simply truncate to 8 bits and
// print as unsigned.
void PPCInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned char Value = MI->getOperand(OpNo).getImm();
O << (unsigned int)Value;
}
void PPCInstPrinter::printU10ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned short Value = MI->getOperand(OpNo).getImm();
assert(Value <= 1023 && "Invalid u10imm argument!");
@@ -374,6 +411,7 @@ void PPCInstPrinter::printU10ImmOperand(const MCInst *MI, unsigned OpNo,
}
void PPCInstPrinter::printU12ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned short Value = MI->getOperand(OpNo).getImm();
assert(Value <= 4095 && "Invalid u12imm argument!");
@@ -381,14 +419,16 @@ void PPCInstPrinter::printU12ImmOperand(const MCInst *MI, unsigned OpNo,
}
void PPCInstPrinter::printS16ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
if (MI->getOperand(OpNo).isImm())
O << (short)MI->getOperand(OpNo).getImm();
else
- printOperand(MI, OpNo, O);
+ printOperand(MI, OpNo, STI, O);
}
void PPCInstPrinter::printS34ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
if (MI->getOperand(OpNo).isImm()) {
long long Value = MI->getOperand(OpNo).getImm();
@@ -396,21 +436,24 @@ void PPCInstPrinter::printS34ImmOperand(const MCInst *MI, unsigned OpNo,
O << (long long)Value;
}
else
- printOperand(MI, OpNo, O);
+ printOperand(MI, OpNo, STI, O);
}
void PPCInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
if (MI->getOperand(OpNo).isImm())
O << (unsigned short)MI->getOperand(OpNo).getImm();
else
- printOperand(MI, OpNo, O);
+ printOperand(MI, OpNo, STI, O);
}
void PPCInstPrinter::printBranchOperand(const MCInst *MI, uint64_t Address,
- unsigned OpNo, raw_ostream &O) {
+ unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
if (!MI->getOperand(OpNo).isImm())
- return printOperand(MI, OpNo, O);
+ return printOperand(MI, OpNo, STI, O);
int32_t Imm = SignExtend32<32>((unsigned)MI->getOperand(OpNo).getImm() << 2);
if (PrintBranchImmAsAddress) {
uint64_t Target = Address + Imm;
@@ -433,16 +476,16 @@ void PPCInstPrinter::printBranchOperand(const MCInst *MI, uint64_t Address,
}
void PPCInstPrinter::printAbsBranchOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
if (!MI->getOperand(OpNo).isImm())
- return printOperand(MI, OpNo, O);
+ return printOperand(MI, OpNo, STI, O);
O << SignExtend32<32>((unsigned)MI->getOperand(OpNo).getImm() << 2);
}
-
void PPCInstPrinter::printcrbitm(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
unsigned CCReg = MI->getOperand(OpNo).getReg();
unsigned RegNo;
switch (CCReg) {
@@ -460,33 +503,37 @@ void PPCInstPrinter::printcrbitm(const MCInst *MI, unsigned OpNo,
}
void PPCInstPrinter::printMemRegImm(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
- printS16ImmOperand(MI, OpNo, O);
+ printS16ImmOperand(MI, OpNo, STI, O);
O << '(';
if (MI->getOperand(OpNo+1).getReg() == PPC::R0)
O << "0";
else
- printOperand(MI, OpNo+1, O);
+ printOperand(MI, OpNo + 1, STI, O);
O << ')';
}
void PPCInstPrinter::printMemRegImm34PCRel(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
- printS34ImmOperand(MI, OpNo, O);
+ printS34ImmOperand(MI, OpNo, STI, O);
O << '(';
- printImmZeroOperand(MI, OpNo + 1, O);
+ printImmZeroOperand(MI, OpNo + 1, STI, O);
O << ')';
}
void PPCInstPrinter::printMemRegImm34(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- printS34ImmOperand(MI, OpNo, O);
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printS34ImmOperand(MI, OpNo, STI, O);
O << '(';
- printOperand(MI, OpNo + 1, O);
+ printOperand(MI, OpNo + 1, STI, O);
O << ')';
}
void PPCInstPrinter::printMemRegReg(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
raw_ostream &O) {
// When used as the base register, r0 reads constant zero rather than
// the value contained in the register. For this reason, the darwin
@@ -494,13 +541,13 @@ void PPCInstPrinter::printMemRegReg(const MCInst *MI, unsigned OpNo,
if (MI->getOperand(OpNo).getReg() == PPC::R0)
O << "0";
else
- printOperand(MI, OpNo, O);
+ printOperand(MI, OpNo, STI, O);
O << ", ";
- printOperand(MI, OpNo+1, O);
+ printOperand(MI, OpNo + 1, STI, O);
}
void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
// On PPC64, VariantKind is VK_None, but on PPC32, it's VK_PLT, and it must
// come at the _end_ of the expression.
const MCOperand &Op = MI->getOperand(OpNo);
@@ -513,10 +560,17 @@ void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo,
RefExp = cast<MCSymbolRefExpr>(Op.getExpr());
O << RefExp->getSymbol().getName();
+ // The variant kind VK_PPC_NOTOC needs to be handled as a special case
+ // because we do not want the assembly to print out the @notoc at the
+ // end like __tls_get_addr(x@tlsgd)@notoc. Instead we want it to look
+ // like __tls_get_addr@notoc(x@tlsgd).
+ if (RefExp->getKind() == MCSymbolRefExpr::VK_PPC_NOTOC)
+ O << '@' << MCSymbolRefExpr::getVariantKindName(RefExp->getKind());
O << '(';
- printOperand(MI, OpNo+1, O);
+ printOperand(MI, OpNo + 1, STI, O);
O << ')';
- if (RefExp->getKind() != MCSymbolRefExpr::VK_None)
+ if (RefExp->getKind() != MCSymbolRefExpr::VK_None &&
+ RefExp->getKind() != MCSymbolRefExpr::VK_PPC_NOTOC)
O << '@' << MCSymbolRefExpr::getVariantKindName(RefExp->getKind());
if (ConstExp != nullptr)
O << '+' << ConstExp->getValue();
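A rough standalone sketch of the printing rule added above (plain C++ with an invented helper, not the printer API): the @notoc variant kind moves in front of the parenthesized operand, while other variant kinds keep their usual trailing position.

#include <cstdio>
#include <string>

// VKName is the textual variant kind ("notoc", "plt", or "" for VK_None).
static std::string printTLSCallToy(const std::string &Sym,
                                   const std::string &VKName) {
  if (VKName == "notoc")
    return "__tls_get_addr@notoc(" + Sym + ")";
  std::string S = "__tls_get_addr(" + Sym + ")";
  if (!VKName.empty())
    S += "@" + VKName;
  return S;
}

int main() {
  std::printf("%s\n", printTLSCallToy("x@tlsgd", "notoc").c_str()); // __tls_get_addr@notoc(x@tlsgd)
  std::printf("%s\n", printTLSCallToy("x@tlsgd", "plt").c_str());   // __tls_get_addr(x@tlsgd)@plt
  std::printf("%s\n", printTLSCallToy("x@tlsgd", "").c_str());      // __tls_get_addr(x@tlsgd)
  return 0;
}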
@@ -525,7 +579,7 @@ void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo,
/// showRegistersWithPercentPrefix - Check if this register name should be
/// printed with a percentage symbol as prefix.
bool PPCInstPrinter::showRegistersWithPercentPrefix(const char *RegName) const {
- if (!FullRegNamesWithPercent || TT.isOSDarwin() || TT.getOS() == Triple::AIX)
+ if (!FullRegNamesWithPercent || TT.getOS() == Triple::AIX)
return false;
switch (RegName[0]) {
@@ -545,7 +599,7 @@ bool PPCInstPrinter::showRegistersWithPercentPrefix(const char *RegName) const {
const char *PPCInstPrinter::getVerboseConditionRegName(unsigned RegNum,
unsigned RegEncoding)
const {
- if (!TT.isOSDarwin() && !FullRegNames)
+ if (!FullRegNames)
return nullptr;
if (RegNum < PPC::CR0EQ || RegNum > PPC::CR7UN)
return nullptr;
@@ -567,11 +621,11 @@ const char *PPCInstPrinter::getVerboseConditionRegName(unsigned RegNum,
bool PPCInstPrinter::showRegistersWithPrefix() const {
if (TT.getOS() == Triple::AIX)
return false;
- return TT.isOSDarwin() || FullRegNamesWithPercent || FullRegNames;
+ return FullRegNamesWithPercent || FullRegNames;
}
void PPCInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ const MCSubtargetInfo &STI, raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isReg()) {
unsigned Reg = Op.getReg();
@@ -600,4 +654,3 @@ void PPCInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
assert(Op.isExpr() && "unknown operand kind in printOperand");
Op.getExpr()->print(O, &MAI);
}
-
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h
index 9763aeceef94..5e9b01494416 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h
@@ -36,45 +36,73 @@ public:
const MCSubtargetInfo &STI, raw_ostream &O) override;
// Autogenerated by tblgen.
- void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
+ void printInstruction(const MCInst *MI, uint64_t Address,
+ const MCSubtargetInfo &STI, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
- bool printAliasInstr(const MCInst *MI, uint64_t Address, raw_ostream &OS);
+ bool printAliasInstr(const MCInst *MI, uint64_t Address,
+ const MCSubtargetInfo &STI, raw_ostream &OS);
void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
unsigned OpIdx, unsigned PrintMethodIdx,
- raw_ostream &OS);
+ const MCSubtargetInfo &STI, raw_ostream &OS);
- void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printPredicateOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O, const char *Modifier = nullptr);
- void printATBitsAsHint(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ const MCSubtargetInfo &STI, raw_ostream &O,
+ const char *Modifier = nullptr);
+ void printATBitsAsHint(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
- void printU1ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printU2ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printU3ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printU4ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printS5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printU5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printU6ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printU7ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printU10ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printU12ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printS16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printS34ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printImmZeroOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU1ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printU2ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printU3ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printU4ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printS5ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printU5ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printU6ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printU7ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printU8ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printU10ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printU12ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printS16ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printS34ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printU16ImmOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printImmZeroOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printBranchOperand(const MCInst *MI, uint64_t Address, unsigned OpNo,
- raw_ostream &O);
- void printAbsBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printTLSCall(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printAbsBranchOperand(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printTLSCall(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
- void printcrbitm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printcrbitm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
- void printMemRegImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printMemRegImm34PCRel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printMemRegImm34(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printMemRegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printMemRegImm(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printMemRegImm34PCRel(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printMemRegImm34(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printMemRegReg(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
};
} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index 593dc2843c3d..2b76af279ce6 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -26,7 +26,8 @@ PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) {
if (is64Bit) {
CodePointerSize = CalleeSaveStackSlotSize = 8;
}
- IsLittleEndian = T.getArch() == Triple::ppc64le;
+ IsLittleEndian =
+ T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppcle;
// ".comm align is in bytes but .align is pow-2."
AlignmentIsInBytes = false;
@@ -56,7 +57,7 @@ PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) {
void PPCXCOFFMCAsmInfo::anchor() {}
PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) {
- if (T.getArch() == Triple::ppc64le)
+ if (T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppcle)
report_fatal_error("XCOFF is not supported for little-endian targets");
CodePointerSize = CalleeSaveStackSlotSize = Is64Bit ? 8 : 4;
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
index 27c687686641..48806051f581 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
@@ -13,7 +13,6 @@
#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCASMINFO_H
#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCASMINFO_H
-#include "llvm/MC/MCAsmInfoDarwin.h"
#include "llvm/MC/MCAsmInfoELF.h"
#include "llvm/MC/MCAsmInfoXCOFF.h"
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index fb65e7320f2b..5f0769fd21f9 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -44,11 +44,13 @@ getDirectBrEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpNo);
- if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI);
+ if (MO.isReg() || MO.isImm())
+ return getMachineOpValue(MI, MO, Fixups, STI);
// Add a fixup for the branch target.
Fixups.push_back(MCFixup::create(0, MO.getExpr(),
- ((MI.getOpcode() == PPC::BL8_NOTOC)
+ ((MI.getOpcode() == PPC::BL8_NOTOC ||
+ MI.getOpcode() == PPC::BL8_NOTOC_TLS)
? (MCFixupKind)PPC::fixup_ppc_br24_notoc
: (MCFixupKind)PPC::fixup_ppc_br24)));
return 0;
@@ -92,6 +94,16 @@ getAbsCondBrEncoding(const MCInst &MI, unsigned OpNo,
return 0;
}
+unsigned
+PPCMCCodeEmitter::getVSRpEvenEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ assert(MI.getOperand(OpNo).isReg() && "Operand should be a register");
+ unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI)
+ << 1;
+ return RegBits;
+}
+
unsigned PPCMCCodeEmitter::getImm16Encoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
@@ -104,20 +116,36 @@ unsigned PPCMCCodeEmitter::getImm16Encoding(const MCInst &MI, unsigned OpNo,
return 0;
}
-uint64_t
-PPCMCCodeEmitter::getImm34Encoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+uint64_t PPCMCCodeEmitter::getImm34Encoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI,
+ MCFixupKind Fixup) const {
const MCOperand &MO = MI.getOperand(OpNo);
- if (MO.isReg() || MO.isImm())
+ assert(!MO.isReg() && "Not expecting a register for this operand.");
+ if (MO.isImm())
return getMachineOpValue(MI, MO, Fixups, STI);
// Add a fixup for the immediate field.
- Fixups.push_back(MCFixup::create(0, MO.getExpr(),
- (MCFixupKind)PPC::fixup_ppc_pcrel34));
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(), Fixup));
return 0;
}
+uint64_t
+PPCMCCodeEmitter::getImm34EncodingNoPCRel(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ return getImm34Encoding(MI, OpNo, Fixups, STI,
+ (MCFixupKind)PPC::fixup_ppc_imm34);
+}
+
+uint64_t
+PPCMCCodeEmitter::getImm34EncodingPCRel(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ return getImm34Encoding(MI, OpNo, Fixups, STI,
+ (MCFixupKind)PPC::fixup_ppc_pcrel34);
+}
+
unsigned PPCMCCodeEmitter::getMemRIEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
@@ -213,8 +241,13 @@ PPCMCCodeEmitter::getMemRI34PCRelEncoding(const MCInst &MI, unsigned OpNo,
(void)SRE;
// Currently these are the only valid PCRelative Relocations.
assert((SRE->getKind() == MCSymbolRefExpr::VK_PCREL ||
- SRE->getKind() == MCSymbolRefExpr::VK_PPC_GOT_PCREL) &&
- "VariantKind must be VK_PCREL or VK_PPC_GOT_PCREL");
+ SRE->getKind() == MCSymbolRefExpr::VK_PPC_GOT_PCREL ||
+ SRE->getKind() == MCSymbolRefExpr::VK_PPC_GOT_TLSGD_PCREL ||
+ SRE->getKind() == MCSymbolRefExpr::VK_PPC_GOT_TLSLD_PCREL ||
+ SRE->getKind() == MCSymbolRefExpr::VK_PPC_GOT_TPREL_PCREL) &&
+ "VariantKind must be VK_PCREL or VK_PPC_GOT_PCREL or "
+ "VK_PPC_GOT_TLSGD_PCREL or VK_PPC_GOT_TLSLD_PCREL or "
+ "VK_PPC_GOT_TPREL_PCREL.");
// Generate the fixup for the relocation.
Fixups.push_back(
MCFixup::create(0, Expr,
@@ -326,8 +359,12 @@ unsigned PPCMCCodeEmitter::getTLSRegEncoding(const MCInst &MI, unsigned OpNo,
// Add a fixup for the TLS register, which simply provides a relocation
// hint to the linker that this statement is part of a relocation sequence.
- // Return the thread-pointer register's encoding.
- Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+ // Return the thread-pointer register's encoding. Add a one-byte displacement
+ // if using PC-relative memops.
+ const MCExpr *Expr = MO.getExpr();
+ const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(Expr);
+ bool IsPCRel = SRE->getKind() == MCSymbolRefExpr::VK_PPC_TLS_PCREL;
+ Fixups.push_back(MCFixup::create(IsPCRel ? 1 : 0, Expr,
(MCFixupKind)PPC::fixup_ppc_nofixup));
const Triple &TT = STI.getTargetTriple();
bool isPPC64 = TT.isPPC64();
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h
index 588aa76bd806..347e163c9515 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h
@@ -52,7 +52,14 @@ public:
const MCSubtargetInfo &STI) const;
uint64_t getImm34Encoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
+ const MCSubtargetInfo &STI,
+ MCFixupKind Fixup) const;
+ uint64_t getImm34EncodingNoPCRel(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint64_t getImm34EncodingPCRel(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getMemRIEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
@@ -86,6 +93,9 @@ public:
unsigned get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ unsigned getVSRpEvenEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getMachineOpValue - Return binary encoding of operand. If the machine
/// operand requires relocation, record the relocation and return zero.
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 3092d56da1c5..bf9c6feb541e 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -20,8 +20,8 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDwarf.h"
@@ -30,6 +30,7 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionXCOFF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
@@ -77,7 +78,17 @@ static MCRegisterInfo *createPPCMCRegisterInfo(const Triple &TT) {
static MCSubtargetInfo *createPPCMCSubtargetInfo(const Triple &TT,
StringRef CPU, StringRef FS) {
- return createPPCMCSubtargetInfoImpl(TT, CPU, FS);
+ // Set some default features for the MC layer.
+ std::string FullFS = std::string(FS);
+
+ if (TT.isOSAIX()) {
+ if (!FullFS.empty())
+ FullFS = "+aix," + FullFS;
+ else
+ FullFS = "+aix";
+ }
+
+ return createPPCMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FullFS);
}
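A minimal standalone sketch of the feature-string handling added above, assuming only that the AIX default is spliced in front of any caller-supplied features (plain C++, not the MC API):

#include <cstdio>
#include <string>

static std::string buildFeatureString(bool IsAIX, const std::string &FS) {
  std::string FullFS = FS;
  if (IsAIX)
    FullFS = FullFS.empty() ? std::string("+aix") : "+aix," + FullFS;
  return FullFS;
}

int main() {
  std::printf("%s\n", buildFeatureString(true, "").c_str());          // +aix
  std::printf("%s\n", buildFeatureString(true, "+altivec").c_str());  // +aix,+altivec
  std::printf("%s\n", buildFeatureString(false, "+altivec").c_str()); // +altivec
  return 0;
}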
static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI,
@@ -122,11 +133,12 @@ public:
void emitTCEntry(const MCSymbol &S) override {
if (const MCSymbolXCOFF *XSym = dyn_cast<MCSymbolXCOFF>(&S)) {
MCSymbolXCOFF *TCSym =
- cast<MCSymbolXCOFF>(Streamer.getContext().getOrCreateSymbol(
- XSym->getSymbolTableName() + "[TC]"));
+ cast<MCSectionXCOFF>(Streamer.getCurrentSectionOnly())
+ ->getQualNameSymbol();
+ OS << "\t.tc " << TCSym->getName() << "," << XSym->getName() << '\n';
+
if (TCSym->hasRename())
Streamer.emitXCOFFRenameDirective(TCSym, TCSym->getSymbolTableName());
- OS << "\t.tc " << TCSym->getName() << "," << XSym->getName() << '\n';
return;
}
@@ -334,8 +346,8 @@ static MCInstPrinter *createPPCMCInstPrinter(const Triple &T,
}
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTargetMC() {
- for (Target *T :
- {&getThePPC32Target(), &getThePPC64Target(), &getThePPC64LETarget()}) {
+ for (Target *T : {&getThePPC32Target(), &getThePPC32LETarget(),
+ &getThePPC64Target(), &getThePPC64LETarget()}) {
// Register the MC asm info.
RegisterMCAsmInfoFn C(*T, createPPCMCAsmInfo);
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
index 719e005d9813..03b316341717 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -124,6 +124,11 @@ static inline bool isRunOfOnes64(uint64_t Val, unsigned &MB, unsigned &ME) {
#define GET_SUBTARGETINFO_ENUM
#include "PPCGenSubtargetInfo.inc"
+#define PPC_REGS0_7(X) \
+ { \
+ X##0, X##1, X##2, X##3, X##4, X##5, X##6, X##7 \
+ }
+
#define PPC_REGS0_31(X) \
{ \
X##0, X##1, X##2, X##3, X##4, X##5, X##6, X##7, X##8, X##9, X##10, X##11, \
@@ -156,10 +161,10 @@ using llvm::MCPhysReg;
static const MCPhysReg RRegs[32] = PPC_REGS0_31(PPC::R); \
static const MCPhysReg XRegs[32] = PPC_REGS0_31(PPC::X); \
static const MCPhysReg FRegs[32] = PPC_REGS0_31(PPC::F); \
+ static const MCPhysReg VSRpRegs[32] = PPC_REGS0_31(PPC::VSRp); \
static const MCPhysReg SPERegs[32] = PPC_REGS0_31(PPC::S); \
static const MCPhysReg VFRegs[32] = PPC_REGS0_31(PPC::VF); \
static const MCPhysReg VRegs[32] = PPC_REGS0_31(PPC::V); \
- static const MCPhysReg QFRegs[32] = PPC_REGS0_31(PPC::QF); \
static const MCPhysReg RRegsNoR0[32] = \
PPC_REGS_NO0_31(PPC::ZERO, PPC::R); \
static const MCPhysReg XRegsNoX0[32] = \
@@ -179,8 +184,6 @@ using llvm::MCPhysReg;
PPC::CR5LT, PPC::CR5GT, PPC::CR5EQ, PPC::CR5UN, \
PPC::CR6LT, PPC::CR6GT, PPC::CR6EQ, PPC::CR6UN, \
PPC::CR7LT, PPC::CR7GT, PPC::CR7EQ, PPC::CR7UN}; \
- static const MCPhysReg CRRegs[8] = { \
- PPC::CR0, PPC::CR1, PPC::CR2, PPC::CR3, \
- PPC::CR4, PPC::CR5, PPC::CR6, PPC::CR7}
-
+ static const MCPhysReg CRRegs[8] = PPC_REGS0_7(PPC::CR); \
+ static const MCPhysReg ACCRegs[8] = PPC_REGS0_7(PPC::ACC)
#endif // LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCTARGETDESC_H
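For reference, a standalone sketch of what the new PPC_REGS0_7 token-pasting macro expands to (the enum here is a placeholder, not the real PPC register enumeration):

#include <cstdio>

#define PPC_REGS0_7(X)                                                         \
  { X##0, X##1, X##2, X##3, X##4, X##5, X##6, X##7 }

enum ToyReg { CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7 };
static const ToyReg CRRegs[8] = PPC_REGS0_7(CR); // {CR0, CR1, ..., CR7}

int main() {
  std::printf("CRRegs[0]=%d CRRegs[7]=%d\n", CRRegs[0], CRRegs[7]); // 0 and 7
  return 0;
}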
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
index d672d54772e0..77b0331bb14c 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
@@ -58,14 +58,19 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize(
switch ((unsigned)Fixup.getKind()) {
default:
report_fatal_error("Unimplemented fixup kind.");
- case PPC::fixup_ppc_half16:
+ case PPC::fixup_ppc_half16: {
+ const uint8_t SignAndSizeForHalf16 = EncodedSignednessIndicator | 15;
switch (Modifier) {
default:
report_fatal_error("Unsupported modifier for half16 fixup.");
case MCSymbolRefExpr::VK_None:
- return {XCOFF::RelocationType::R_TOC, EncodedSignednessIndicator | 15};
+ return {XCOFF::RelocationType::R_TOC, SignAndSizeForHalf16};
+ case MCSymbolRefExpr::VK_PPC_U:
+ return {XCOFF::RelocationType::R_TOCU, SignAndSizeForHalf16};
+ case MCSymbolRefExpr::VK_PPC_L:
+ return {XCOFF::RelocationType::R_TOCL, SignAndSizeForHalf16};
}
- break;
+ } break;
case PPC::fixup_ppc_br24:
// Branches are 4 byte aligned, so the 24 bits we encode in
// the instruction actually represents a 26 bit offset.
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/P9InstrResources.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/P9InstrResources.td
index d7e3519d5539..63531f72adfb 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/P9InstrResources.td
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/P9InstrResources.td
@@ -94,7 +94,7 @@ def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_3SLOTS_1C],
(instregex "CMPRB(8)?$"),
(instregex "TD(I)?$"),
(instregex "TW(I)?$"),
- (instregex "FCMPU(S|D)$"),
+ (instregex "FCMP(O|U)(S|D)$"),
(instregex "XSTSTDC(S|D)P$"),
FTDIV,
FTSQRT,
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPC.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPC.h
index 7e0aa2c6061d..264582b244a7 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPC.h
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPC.h
@@ -20,17 +20,20 @@
#undef PPC
namespace llvm {
- class PPCTargetMachine;
- class PassRegistry;
- class FunctionPass;
- class MachineInstr;
- class MachineOperand;
- class AsmPrinter;
- class MCInst;
- class MCOperand;
- class ModulePass;
-
- FunctionPass *createPPCCTRLoops();
+class PPCRegisterBankInfo;
+class PPCSubtarget;
+class PPCTargetMachine;
+class PassRegistry;
+class FunctionPass;
+class InstructionSelector;
+class MachineInstr;
+class MachineOperand;
+class AsmPrinter;
+class MCInst;
+class MCOperand;
+class ModulePass;
+
+FunctionPass *createPPCCTRLoops();
#ifndef NDEBUG
FunctionPass *createPPCCTRLoopsVerify();
#endif
@@ -44,7 +47,6 @@ namespace llvm {
FunctionPass *createPPCMIPeepholePass();
FunctionPass *createPPCBranchSelectionPass();
FunctionPass *createPPCBranchCoalescingPass();
- FunctionPass *createPPCQPXLoadSplatPass();
FunctionPass *createPPCISelDag(PPCTargetMachine &TM, CodeGenOpt::Level OL);
FunctionPass *createPPCTLSDynamicCallPass();
FunctionPass *createPPCBoolRetToIntPass();
@@ -68,7 +70,6 @@ namespace llvm {
void initializePPCReduceCRLogicalsPass(PassRegistry&);
void initializePPCBSelPass(PassRegistry&);
void initializePPCBranchCoalescingPass(PassRegistry&);
- void initializePPCQPXLoadSplatPass(PassRegistry&);
void initializePPCBoolRetToIntPass(PassRegistry&);
void initializePPCExpandISELPass(PassRegistry &);
void initializePPCPreEmitPeepholePass(PassRegistry &);
@@ -80,7 +81,10 @@ namespace llvm {
ModulePass *createPPCLowerMASSVEntriesPass();
void initializePPCLowerMASSVEntriesPass(PassRegistry &);
extern char &PPCLowerMASSVEntriesID;
-
+
+ InstructionSelector *
+ createPPCInstructionSelector(const PPCTargetMachine &, const PPCSubtarget &,
+ const PPCRegisterBankInfo &);
namespace PPCII {
/// Target Operand Flag enum.
@@ -107,6 +111,37 @@ namespace llvm {
/// produce the relocation @got@pcrel. Fixup is VK_PPC_GOT_PCREL.
MO_GOT_FLAG = 8,
+ // MO_PCREL_OPT_FLAG - If this bit is set the operand is part of a
+ // PC Relative linker optimization.
+ MO_PCREL_OPT_FLAG = 16,
+
+ /// MO_TLSGD_FLAG - If this bit is set the symbol reference is relative to
+ /// TLS General Dynamic model.
+ MO_TLSGD_FLAG = 32,
+
+ /// MO_TPREL_FLAG - If this bit is set the symbol reference is relative to
+ /// TLS Initial Exec model.
+ MO_TPREL_FLAG = 64,
+
+ /// MO_TLSLD_FLAG - If this bit is set the symbol reference is relative to
+ /// TLS Local Dynamic model.
+ MO_TLSLD_FLAG = 128,
+
+ /// MO_GOT_TLSGD_PCREL_FLAG - A combination of flags; if these bits are set
+ /// they should produce the relocation @got@tlsgd@pcrel.
+ /// Fixup is VK_PPC_GOT_TLSGD_PCREL.
+ MO_GOT_TLSGD_PCREL_FLAG = MO_PCREL_FLAG | MO_GOT_FLAG | MO_TLSGD_FLAG,
+
+ /// MO_GOT_TLSLD_PCREL_FLAG - A combination of flags; if these bits are set
+ /// they should produce the relocation @got@tlsld@pcrel.
+ /// Fixup is VK_PPC_GOT_TLSLD_PCREL.
+ MO_GOT_TLSLD_PCREL_FLAG = MO_PCREL_FLAG | MO_GOT_FLAG | MO_TLSLD_FLAG,
+
+ /// MO_GOT_TPREL_PCREL_FLAG - A combination of flags; if these bits are set
+ /// they should produce the relocation @got@tprel@pcrel.
+ /// Fixup is VK_PPC_GOT_TPREL_PCREL.
+ MO_GOT_TPREL_PCREL_FLAG = MO_GOT_FLAG | MO_TPREL_FLAG | MO_PCREL_FLAG,
+
/// The next are not flags but distinct values.
MO_ACCESS_MASK = 0xf00,
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPC.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPC.td
index 9ad78bf67fe6..1e6ded231585 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPC.td
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPC.td
@@ -57,6 +57,10 @@ def DirectivePwrFuture
def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true",
"Enable 64-bit instructions">;
+def AIXOS: SubtargetFeature<"aix", "IsAIX", "true", "AIX OS">;
+def FeatureModernAIXAs
+ : SubtargetFeature<"modern-aix-as", "HasModernAIXAs", "true",
+ "AIX system assembler is modern enough to support new mnes">;
def FeatureHardFloat : SubtargetFeature<"hard-float", "HasHardFloat", "true",
"Enable floating-point instructions">;
def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true",
@@ -72,6 +76,9 @@ def FeatureAltivec : SubtargetFeature<"altivec","HasAltivec", "true",
def FeatureSPE : SubtargetFeature<"spe","HasSPE", "true",
"Enable SPE instructions",
[FeatureHardFloat]>;
+def FeatureEFPU2 : SubtargetFeature<"efpu2", "HasEFPU2", "true",
+ "Enable Embedded Floating-Point APU 2 instructions",
+ [FeatureSPE]>;
def FeatureMFOCRF : SubtargetFeature<"mfocrf","HasMFOCRF", "true",
"Enable the MFOCRF instruction">;
def FeatureFSqrt : SubtargetFeature<"fsqrt","HasFSQRT", "true",
@@ -132,9 +139,6 @@ def FeaturePPC4xx : SubtargetFeature<"ppc4xx", "IsPPC4xx", "true",
"Enable PPC 4xx instructions">;
def FeaturePPC6xx : SubtargetFeature<"ppc6xx", "IsPPC6xx", "true",
"Enable PPC 6xx instructions">;
-def FeatureQPX : SubtargetFeature<"qpx","HasQPX", "true",
- "Enable QPX instructions",
- [FeatureFPU]>;
def FeatureVSX : SubtargetFeature<"vsx","HasVSX", "true",
"Enable VSX instructions",
[FeatureAltivec]>;
@@ -177,6 +181,9 @@ def FeatureAddisLoadFusion : SubtargetFeature<"fuse-addis-load",
"HasAddisLoadFusion", "true",
"Power8 Addis-Load fusion",
[FeatureFusion]>;
+def FeatureStoreFusion : SubtargetFeature<"fuse-store", "HasStoreFusion", "true",
+ "Target supports store clustering",
+ [FeatureFusion]>;
def FeatureUnalignedFloats :
SubtargetFeature<"allow-unaligned-fp-access", "AllowsUnalignedFPAccess",
"true", "CPU does not trap on unaligned FP access">;
@@ -193,7 +200,7 @@ def FeatureFloat128 :
def FeaturePOPCNTD : SubtargetFeature<"popcntd","HasPOPCNTD",
"POPCNTD_Fast",
"Enable the popcnt[dw] instructions">;
-// Note that for the a2/a2q processor models we should not use popcnt[dw] by
+// Note that for the a2 processor model we should not use popcnt[dw] by
// default. These processors do support the instructions, but they're
// microcoded, and the software emulation is about twice as fast.
def FeatureSlowPOPCNTD : SubtargetFeature<"slow-popcntd","HasPOPCNTD",
@@ -236,7 +243,15 @@ def FeaturePrefixInstrs : SubtargetFeature<"prefix-instrs", "HasPrefixInstrs",
def FeaturePCRelativeMemops :
SubtargetFeature<"pcrelative-memops", "HasPCRelativeMemops", "true",
"Enable PC relative Memory Ops",
+ [FeatureISA3_0, FeaturePrefixInstrs]>;
+def FeaturePairedVectorMemops:
+ SubtargetFeature<"paired-vector-memops", "PairedVectorMemops", "true",
+ "32Byte load and store instructions",
[FeatureISA3_0]>;
+def FeatureMMA : SubtargetFeature<"mma", "HasMMA", "true",
+ "Enable MMA instructions",
+ [FeatureP8Vector, FeatureP9Altivec,
+ FeaturePairedVectorMemops]>;
def FeaturePredictableSelectIsExpensive :
SubtargetFeature<"predictable-select-expensive",
@@ -320,6 +335,8 @@ def ProcessorFeatures {
[DirectivePwr9,
FeatureP9Altivec,
FeatureP9Vector,
+ FeaturePPCPreRASched,
+ FeaturePPCPostRASched,
FeatureISA3_0,
FeaturePredictableSelectIsExpensive
];
@@ -329,9 +346,7 @@ def ProcessorFeatures {
// dispatch for vector operations than scalar ones. For the time being,
// this list also includes scheduling-related features since we do not have
// enough info to create custom scheduling strategies for future CPUs.
- list<SubtargetFeature> P9SpecificFeatures = [FeatureVectorsUseTwoUnits,
- FeaturePPCPreRASched,
- FeaturePPCPostRASched];
+ list<SubtargetFeature> P9SpecificFeatures = [FeatureVectorsUseTwoUnits];
list<SubtargetFeature> P9InheritableFeatures =
!listconcat(P8InheritableFeatures, P9AdditionalFeatures);
list<SubtargetFeature> P9Features =
@@ -340,9 +355,12 @@ def ProcessorFeatures {
// Power10
// For P10 CPU we assume that all of the existing features from Power9
// still exist with the exception of those we know are Power9 specific.
+ list<SubtargetFeature> FusionFeatures = [FeatureStoreFusion];
list<SubtargetFeature> P10AdditionalFeatures =
- [DirectivePwr10, FeatureISA3_1, FeaturePrefixInstrs,
- FeaturePCRelativeMemops, FeatureP10Vector];
+ !listconcat(FusionFeatures, [
+ DirectivePwr10, FeatureISA3_1, FeaturePrefixInstrs,
+ FeaturePCRelativeMemops, FeatureP10Vector, FeatureMMA,
+ FeaturePairedVectorMemops]);
list<SubtargetFeature> P10SpecificFeatures = [];
list<SubtargetFeature> P10InheritableFeatures =
!listconcat(P9InheritableFeatures, P10AdditionalFeatures);
@@ -427,6 +445,7 @@ def getAltVSXFMAOpcode : InstrMapping {
include "PPCRegisterInfo.td"
include "PPCSchedule.td"
+include "GISel/PPCRegisterBanks.td"
//===----------------------------------------------------------------------===//
// PowerPC processors supported.
@@ -514,15 +533,6 @@ def : ProcessorModel<"a2", PPCA2Model,
FeatureFPRND, FeatureFPCVT, FeatureISEL,
FeatureSlowPOPCNTD, FeatureCMPB, FeatureLDBRX,
Feature64Bit /*, Feature64BitRegs */, FeatureMFTB]>;
-def : ProcessorModel<"a2q", PPCA2Model,
- [DirectiveA2, FeatureICBT, FeatureBookE, FeatureMFOCRF,
- FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
- FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
- FeatureSTFIWX, FeatureLFIWAX,
- FeatureFPRND, FeatureFPCVT, FeatureISEL,
- FeatureSlowPOPCNTD, FeatureCMPB, FeatureLDBRX,
- Feature64Bit /*, Feature64BitRegs */, FeatureQPX,
- FeatureMFTB]>;
def : ProcessorModel<"pwr3", G5Model,
[DirectivePwr3, FeatureAltivec,
FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF,
@@ -561,7 +571,7 @@ def : ProcessorModel<"pwr7", P7Model, ProcessorFeatures.P7Features>;
def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.P8Features>;
def : ProcessorModel<"pwr9", P9Model, ProcessorFeatures.P9Features>;
// No scheduler model yet.
-def : ProcessorModel<"pwr10", NoSchedModel, ProcessorFeatures.P10Features>;
+def : ProcessorModel<"pwr10", P9Model, ProcessorFeatures.P10Features>;
// No scheduler model for future CPU.
def : ProcessorModel<"future", NoSchedModel,
ProcessorFeatures.FutureFeatures>;
@@ -592,6 +602,13 @@ def PPCInstrInfo : InstrInfo {
let noNamedPositionallyEncodedOperands = 1;
}
+def PPCAsmWriter : AsmWriter {
+ string AsmWriterClassName = "InstPrinter";
+ int PassSubtarget = 1;
+ int Variant = 0;
+ bit isMCAsmWriter = 1;
+}
+
def PPCAsmParser : AsmParser {
let ShouldEmitMatchRegisterName = 0;
}
@@ -610,6 +627,7 @@ def PPC : Target {
// Information about the instructions.
let InstructionSet = PPCInstrInfo;
+ let AssemblyWriters = [PPCAsmWriter];
let AssemblyParsers = [PPCAsmParser];
let AssemblyParserVariants = [PPCAsmParserVariant];
let AllowRegisterRenaming = 1;
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index bf5fe741bac8..cce21f32414a 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -27,11 +27,11 @@
#include "PPCTargetStreamer.h"
#include "TargetInfo/PowerPCTargetInfo.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/BinaryFormat/MachO.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -47,11 +47,11 @@
#include "llvm/IR/Module.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCSectionELF.h"
-#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCSectionXCOFF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
@@ -62,9 +62,11 @@
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Process.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -72,6 +74,7 @@
#include <new>
using namespace llvm;
+using namespace llvm::XCOFF;
#define DEBUG_TYPE "asmprinter"
@@ -147,7 +150,21 @@ public:
class PPCAIXAsmPrinter : public PPCAsmPrinter {
private:
+ /// Symbols lowered from ExternalSymbolSDNodes; we will need to emit extern
+ /// linkage for them on AIX.
+ SmallPtrSet<MCSymbol *, 8> ExtSymSDNodeSymbols;
+
+ /// A format indicator and unique trailing identifier to form part of the
+ /// sinit/sterm function names.
+ std::string FormatIndicatorAndUniqueModId;
+
static void ValidateGV(const GlobalVariable *GV);
+ // Record a list of GlobalAlias associated with a GlobalObject.
+ // This is used for AIX's extra-label-at-definition aliasing strategy.
+ DenseMap<const GlobalObject *, SmallVector<const GlobalAlias *, 1>>
+ GOAliasMap;
+
+ void emitTracebackTable();
public:
PPCAIXAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
@@ -161,15 +178,28 @@ public:
bool doInitialization(Module &M) override;
+ void emitXXStructorList(const DataLayout &DL, const Constant *List,
+ bool IsCtor) override;
+
void SetupMachineFunction(MachineFunction &MF) override;
void emitGlobalVariable(const GlobalVariable *GV) override;
void emitFunctionDescriptor() override;
+ void emitFunctionEntryLabel() override;
+
+ void emitFunctionBodyEnd() override;
+
void emitEndOfAsmFile(Module &) override;
void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const override;
+
+ void emitInstruction(const MachineInstr *MI) override;
+
+ bool doFinalization(Module &M) override;
+
+ void emitTTypeReference(const GlobalValue *GV, unsigned Encoding) override;
};
} // end anonymous namespace
@@ -463,6 +493,14 @@ void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI,
StringRef Name = "__tls_get_addr";
MCSymbol *TlsGetAddr = OutContext.getOrCreateSymbol(Name);
MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;
+ unsigned Opcode = PPC::BL8_NOP_TLS;
+
+ assert(MI->getNumOperands() >= 3 && "Expecting at least 3 operands from MI");
+ if (MI->getOperand(2).getTargetFlags() == PPCII::MO_GOT_TLSGD_PCREL_FLAG ||
+ MI->getOperand(2).getTargetFlags() == PPCII::MO_GOT_TLSLD_PCREL_FLAG) {
+ Kind = MCSymbolRefExpr::VK_PPC_NOTOC;
+ Opcode = PPC::BL8_NOTOC_TLS;
+ }
const Module *M = MF->getFunction().getParent();
assert(MI->getOperand(0).isReg() &&
@@ -490,10 +528,10 @@ void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI,
MCSymbol *MOSymbol = getSymbol(GValue);
const MCExpr *SymVar = MCSymbolRefExpr::create(MOSymbol, VK, OutContext);
EmitToStreamer(*OutStreamer,
- MCInstBuilder(Subtarget->isPPC64() ?
- PPC::BL8_NOP_TLS : PPC::BL_TLS)
- .addExpr(TlsRef)
- .addExpr(SymVar));
+ MCInstBuilder(Subtarget->isPPC64() ? Opcode
+ : (unsigned)PPC::BL_TLS)
+ .addExpr(TlsRef)
+ .addExpr(SymVar));
}
/// Map a machine operand for a TOC pseudo-machine instruction to its
@@ -533,9 +571,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
if (Subtarget->hasSPE()) {
if (PPC::F4RCRegClass.contains(Reg) ||
PPC::F8RCRegClass.contains(Reg) ||
- PPC::QBRCRegClass.contains(Reg) ||
- PPC::QFRCRegClass.contains(Reg) ||
- PPC::QSRCRegClass.contains(Reg) ||
PPC::VFRCRegClass.contains(Reg) ||
PPC::VRRCRegClass.contains(Reg) ||
PPC::VSFRCRegClass.contains(Reg) ||
@@ -550,6 +585,38 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
}
}
#endif
+
+ auto getTOCRelocAdjustedExprForXCOFF = [this](const MCExpr *Expr,
+ ptrdiff_t OriginalOffset) {
+ // Apply an offset to the TOC-based expression such that the adjusted
+ // notional offset from the TOC base (to be encoded into the instruction's D
+ // or DS field) is the signed 16-bit truncation of the original notional
+ // offset from the TOC base.
+ // This is consistent with the treatment used both by XL C/C++ and
+ // by AIX ld -r.
+ ptrdiff_t Adjustment =
+ OriginalOffset - llvm::SignExtend32<16>(OriginalOffset);
+ return MCBinaryExpr::createAdd(
+ Expr, MCConstantExpr::create(-Adjustment, OutContext), OutContext);
+ };
+
+ auto getTOCEntryLoadingExprForXCOFF =
+ [IsPPC64, getTOCRelocAdjustedExprForXCOFF,
+ this](const MCSymbol *MOSymbol, const MCExpr *Expr) -> const MCExpr * {
+ const unsigned EntryByteSize = IsPPC64 ? 8 : 4;
+ const auto TOCEntryIter = TOC.find(MOSymbol);
+ assert(TOCEntryIter != TOC.end() &&
+ "Could not find the TOC entry for this symbol.");
+ const ptrdiff_t EntryDistanceFromTOCBase =
+ (TOCEntryIter - TOC.begin()) * EntryByteSize;
+ constexpr int16_t PositiveTOCRange = INT16_MAX;
+
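+    // The TC entries are emitted in map order by emitEndOfAsmFile, so the
+    // iterator distance times the entry size is the entry's notional offset
+    // from the TOC base; only entries beyond the positive 16-bit range need
+    // the truncation adjustment above.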
+ if (EntryDistanceFromTOCBase > PositiveTOCRange)
+ return getTOCRelocAdjustedExprForXCOFF(Expr, EntryDistanceFromTOCBase);
+
+ return Expr;
+ };
+
// Lower multi-instruction pseudo operations.
switch (MI->getOpcode()) {
default: break;
@@ -696,6 +763,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
assert(
TM.getCodeModel() == CodeModel::Small &&
"This pseudo should only be selected for 32-bit small code model.");
+ Exp = getTOCEntryLoadingExprForXCOFF(MOSymbol, Exp);
TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
EmitToStreamer(*OutStreamer, TmpInst);
return;
@@ -724,17 +792,20 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) &&
"Invalid operand!");
+ // Map the operand to its corresponding MCSymbol.
+ const MCSymbol *const MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
+
// Map the machine operand to its corresponding MCSymbol, then map the
// global address operand to be a reference to the TOC entry we will
// synthesize later.
- MCSymbol *TOCEntry =
- lookUpOrCreateTOCEntry(getMCSymbolForTOCPseudoMO(MO, *this));
+ MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
const MCSymbolRefExpr::VariantKind VK =
IsAIX ? MCSymbolRefExpr::VK_None : MCSymbolRefExpr::VK_PPC_TOC;
const MCExpr *Exp =
MCSymbolRefExpr::create(TOCEntry, VK, OutContext);
- TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
+ TmpInst.getOperand(1) = MCOperand::createExpr(
+ IsAIX ? getTOCEntryLoadingExprForXCOFF(MOSymbol, Exp) : Exp);
EmitToStreamer(*OutStreamer, TmpInst);
return;
}
@@ -1010,6 +1081,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
case PPC::GETtlsADDR:
// Transform: %x3 = GETtlsADDR %x3, @sym
// Into: BL8_NOP_TLS __tls_get_addr(sym at tlsgd)
+ case PPC::GETtlsADDRPCREL:
case PPC::GETtlsADDR32: {
// Transform: %r3 = GETtlsADDR32 %r3, @sym
// Into: BL_TLS __tls_get_addr(sym at tlsgd)@PLT
@@ -1055,6 +1127,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
case PPC::GETtlsldADDR:
// Transform: %x3 = GETtlsldADDR %x3, @sym
// Into: BL8_NOP_TLS __tls_get_addr(sym at tlsld)
+ case PPC::GETtlsldADDRPCREL:
case PPC::GETtlsldADDR32: {
// Transform: %r3 = GETtlsldADDR32 %r3, @sym
// Into: BL_TLS __tls_get_addr(sym at tlsld)@PLT
@@ -1081,6 +1154,21 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
.addExpr(SymDtprel));
return;
}
+ case PPC::PADDIdtprel: {
+ // Transform: %rd = PADDIdtprel %rs, @sym
+ // Into: %rd = PADDI8 %rs, sym@dtprel
+ const MachineOperand &MO = MI->getOperand(2);
+ const GlobalValue *GValue = MO.getGlobal();
+ MCSymbol *MOSymbol = getSymbol(GValue);
+ const MCExpr *SymDtprel = MCSymbolRefExpr::create(
+ MOSymbol, MCSymbolRefExpr::VK_DTPREL, OutContext);
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::PADDI8)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addExpr(SymDtprel));
+ return;
+ }
+
case PPC::ADDIdtprelL:
// Transform: %xd = ADDIdtprelL %xs, @sym
// Into: %xd = ADDI8 %xs, sym@dtprel@l
@@ -1137,10 +1225,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
case PPC::LWA: {
// Verify alignment is legal, so we don't create relocations
// that can't be supported.
- // FIXME: This test is currently disabled for Darwin. The test
- // suite shows a handful of test cases that fail this check for
- // Darwin. Those need to be investigated before this sanity test
- // can be enabled for those subtargets.
unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1;
const MachineOperand &MO = MI->getOperand(OpNum);
if (MO.isGlobal()) {
@@ -1621,17 +1705,19 @@ void PPCAIXAsmPrinter::emitLinkage(const GlobalValue *GV,
assert(LinkageAttr != MCSA_Invalid && "LinkageAttr should not MCSA_Invalid.");
MCSymbolAttr VisibilityAttr = MCSA_Invalid;
- switch (GV->getVisibility()) {
+ if (!TM.getIgnoreXCOFFVisibility()) {
+ switch (GV->getVisibility()) {
- // TODO: "exported" and "internal" Visibility needs to go here.
- case GlobalValue::DefaultVisibility:
- break;
- case GlobalValue::HiddenVisibility:
- VisibilityAttr = MAI->getHiddenVisibilityAttr();
- break;
- case GlobalValue::ProtectedVisibility:
- VisibilityAttr = MAI->getProtectedVisibilityAttr();
- break;
+ // TODO: "exported" and "internal" Visibility needs to go here.
+ case GlobalValue::DefaultVisibility:
+ break;
+ case GlobalValue::HiddenVisibility:
+ VisibilityAttr = MAI->getHiddenVisibilityAttr();
+ break;
+ case GlobalValue::ProtectedVisibility:
+ VisibilityAttr = MAI->getProtectedVisibilityAttr();
+ break;
+ }
}
OutStreamer->emitXCOFFSymbolLinkageWithVisibility(GVSym, LinkageAttr,
@@ -1650,18 +1736,305 @@ void PPCAIXAsmPrinter::SetupMachineFunction(MachineFunction &MF) {
return AsmPrinter::SetupMachineFunction(MF);
}
+void PPCAIXAsmPrinter::emitFunctionBodyEnd() {
+
+ if (!TM.getXCOFFTracebackTable())
+ return;
+
+ emitTracebackTable();
+}
+
+void PPCAIXAsmPrinter::emitTracebackTable() {
+
+ // Create a symbol for the end of function.
+ MCSymbol *FuncEnd = createTempSymbol(MF->getName());
+ OutStreamer->emitLabel(FuncEnd);
+
+ OutStreamer->AddComment("Traceback table begin");
+ // Begin with a fullword of zero.
+ OutStreamer->emitIntValueInHexWithPadding(0, 4 /*size*/);
+
+ SmallString<128> CommentString;
+ raw_svector_ostream CommentOS(CommentString);
+
+ auto EmitComment = [&]() {
+ OutStreamer->AddComment(CommentOS.str());
+ CommentString.clear();
+ };
+
+ auto EmitCommentAndValue = [&](uint64_t Value, int Size) {
+ EmitComment();
+ OutStreamer->emitIntValueInHexWithPadding(Value, Size);
+ };
+
+ unsigned int Version = 0;
+ CommentOS << "Version = " << Version;
+ EmitCommentAndValue(Version, 1);
+
+  // The IR does not carry enough information to determine the source language.
+  // The AIX exception handling mechanism only searches for the personality
+  // routine and the LSDA area when the language supports exception handling,
+  // so to be conservatively correct and let the runtime do its job, we set it
+  // to C++ for now.
+ TracebackTable::LanguageID LanguageIdentifier =
+ TracebackTable::CPlusPlus; // C++
+
+ CommentOS << "Language = "
+ << getNameForTracebackTableLanguageId(LanguageIdentifier);
+ EmitCommentAndValue(LanguageIdentifier, 1);
+
+ // This is only populated for the third and fourth bytes.
+ uint32_t FirstHalfOfMandatoryField = 0;
+
+ // Emit the 3rd byte of the mandatory field.
+
+  // We always set the traceback table offset bit.
+ FirstHalfOfMandatoryField |= TracebackTable::HasTraceBackTableOffsetMask;
+
+ const PPCFunctionInfo *FI = MF->getInfo<PPCFunctionInfo>();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  // Check whether the function uses floating-point processor instructions.
+ for (unsigned Reg = PPC::F0; Reg <= PPC::F31; ++Reg) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ FirstHalfOfMandatoryField |= TracebackTable::IsFloatingPointPresentMask;
+ break;
+ }
+ }
+
+#define GENBOOLCOMMENT(Prefix, V, Field) \
+ CommentOS << (Prefix) << ((V) & (TracebackTable::Field##Mask) ? "+" : "-") \
+ << #Field
+
+#define GENVALUECOMMENT(PrefixAndName, V, Field) \
+ CommentOS << (PrefixAndName) << " = " \
+ << static_cast<unsigned>(((V) & (TracebackTable::Field##Mask)) >> \
+ (TracebackTable::Field##Shift))
+
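+  // For example, GENBOOLCOMMENT("", V, IsLRSaved) appends "+IsLRSaved" or
+  // "-IsLRSaved" to the comment stream depending on whether the IsLRSaved bit
+  // is set in V, and GENVALUECOMMENT("NumberOfFixedParms", V,
+  // NumberOfFixedParms) appends "NumberOfFixedParms = <n>" using the field's
+  // mask and shift.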
+ GENBOOLCOMMENT("", FirstHalfOfMandatoryField, IsGlobaLinkage);
+ GENBOOLCOMMENT(", ", FirstHalfOfMandatoryField, IsOutOfLineEpilogOrPrologue);
+ EmitComment();
+
+ GENBOOLCOMMENT("", FirstHalfOfMandatoryField, HasTraceBackTableOffset);
+ GENBOOLCOMMENT(", ", FirstHalfOfMandatoryField, IsInternalProcedure);
+ EmitComment();
+
+ GENBOOLCOMMENT("", FirstHalfOfMandatoryField, HasControlledStorage);
+ GENBOOLCOMMENT(", ", FirstHalfOfMandatoryField, IsTOCless);
+ EmitComment();
+
+ GENBOOLCOMMENT("", FirstHalfOfMandatoryField, IsFloatingPointPresent);
+ EmitComment();
+ GENBOOLCOMMENT("", FirstHalfOfMandatoryField,
+ IsFloatingPointOperationLogOrAbortEnabled);
+ EmitComment();
+
+ OutStreamer->emitIntValueInHexWithPadding(
+ (FirstHalfOfMandatoryField & 0x0000ff00) >> 8, 1);
+
+ // Set the 4th byte of the mandatory field.
+ FirstHalfOfMandatoryField |= TracebackTable::IsFunctionNamePresentMask;
+
+ static_assert(XCOFF::AllocRegNo == 31, "Unexpected register usage!");
+ if (MRI.isPhysRegUsed(Subtarget->isPPC64() ? PPC::X31 : PPC::R31))
+ FirstHalfOfMandatoryField |= TracebackTable::IsAllocaUsedMask;
+
+ const SmallVectorImpl<Register> &MustSaveCRs = FI->getMustSaveCRs();
+ if (!MustSaveCRs.empty())
+ FirstHalfOfMandatoryField |= TracebackTable::IsCRSavedMask;
+
+ if (FI->mustSaveLR())
+ FirstHalfOfMandatoryField |= TracebackTable::IsLRSavedMask;
+
+ GENBOOLCOMMENT("", FirstHalfOfMandatoryField, IsInterruptHandler);
+ GENBOOLCOMMENT(", ", FirstHalfOfMandatoryField, IsFunctionNamePresent);
+ GENBOOLCOMMENT(", ", FirstHalfOfMandatoryField, IsAllocaUsed);
+ EmitComment();
+ GENVALUECOMMENT("OnConditionDirective", FirstHalfOfMandatoryField,
+ OnConditionDirective);
+ GENBOOLCOMMENT(", ", FirstHalfOfMandatoryField, IsCRSaved);
+ GENBOOLCOMMENT(", ", FirstHalfOfMandatoryField, IsLRSaved);
+ EmitComment();
+ OutStreamer->emitIntValueInHexWithPadding((FirstHalfOfMandatoryField & 0xff),
+ 1);
+
+  // Set the 5th byte of the mandatory field.
+ uint32_t SecondHalfOfMandatoryField = 0;
+
+ // Always store back chain.
+ SecondHalfOfMandatoryField |= TracebackTable::IsBackChainStoredMask;
+
+ uint32_t FPRSaved = 0;
+ for (unsigned Reg = PPC::F14; Reg <= PPC::F31; ++Reg) {
+ if (MRI.isPhysRegModified(Reg)) {
+ FPRSaved = PPC::F31 - Reg + 1;
+ break;
+ }
+ }
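+  // FPRSaved is the length of the run of saved nonvolatile FPRs ending at F31;
+  // e.g. if F29 is the lowest-numbered modified register in F14-F31, FPRSaved
+  // is 3 (F29-F31). The GPR count below is computed the same way.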
+ SecondHalfOfMandatoryField |= (FPRSaved << TracebackTable::FPRSavedShift) &
+ TracebackTable::FPRSavedMask;
+ GENBOOLCOMMENT("", SecondHalfOfMandatoryField, IsBackChainStored);
+ GENBOOLCOMMENT(", ", SecondHalfOfMandatoryField, IsFixup);
+ GENVALUECOMMENT(", NumOfFPRsSaved", SecondHalfOfMandatoryField, FPRSaved);
+ EmitComment();
+ OutStreamer->emitIntValueInHexWithPadding(
+ (SecondHalfOfMandatoryField & 0xff000000) >> 24, 1);
+
+  // Set the 6th byte of the mandatory field.
+ bool ShouldEmitEHBlock = TargetLoweringObjectFileXCOFF::ShouldEmitEHBlock(MF);
+ if (ShouldEmitEHBlock)
+ SecondHalfOfMandatoryField |= TracebackTable::HasExtensionTableMask;
+
+ uint32_t GPRSaved = 0;
+
+  // X13 is reserved in the 64-bit environment.
+ unsigned GPRBegin = Subtarget->isPPC64() ? PPC::X14 : PPC::R13;
+ unsigned GPREnd = Subtarget->isPPC64() ? PPC::X31 : PPC::R31;
+
+ for (unsigned Reg = GPRBegin; Reg <= GPREnd; ++Reg) {
+ if (MRI.isPhysRegModified(Reg)) {
+ GPRSaved = GPREnd - Reg + 1;
+ break;
+ }
+ }
+
+ SecondHalfOfMandatoryField |= (GPRSaved << TracebackTable::GPRSavedShift) &
+ TracebackTable::GPRSavedMask;
+
+ GENBOOLCOMMENT("", SecondHalfOfMandatoryField, HasVectorInfo);
+ GENBOOLCOMMENT(", ", SecondHalfOfMandatoryField, HasExtensionTable);
+ GENVALUECOMMENT(", NumOfGPRsSaved", SecondHalfOfMandatoryField, GPRSaved);
+ EmitComment();
+ OutStreamer->emitIntValueInHexWithPadding(
+ (SecondHalfOfMandatoryField & 0x00ff0000) >> 16, 1);
+
+  // Set the 7th byte of the mandatory field.
+ uint32_t NumberOfFixedPara = FI->getFixedParamNum();
+ SecondHalfOfMandatoryField |=
+ (NumberOfFixedPara << TracebackTable::NumberOfFixedParmsShift) &
+ TracebackTable::NumberOfFixedParmsMask;
+ GENVALUECOMMENT("NumberOfFixedParms", SecondHalfOfMandatoryField,
+ NumberOfFixedParms);
+ EmitComment();
+ OutStreamer->emitIntValueInHexWithPadding(
+ (SecondHalfOfMandatoryField & 0x0000ff00) >> 8, 1);
+
+  // Set the 8th byte of the mandatory field.
+
+  // Always set the parameters-on-stack bit.
+ SecondHalfOfMandatoryField |= TracebackTable::HasParmsOnStackMask;
+
+ uint32_t NumberOfFPPara = FI->getFloatingPointParamNum();
+ SecondHalfOfMandatoryField |=
+ (NumberOfFPPara << TracebackTable::NumberOfFloatingPointParmsShift) &
+ TracebackTable::NumberOfFloatingPointParmsMask;
+
+ GENVALUECOMMENT("NumberOfFPParms", SecondHalfOfMandatoryField,
+ NumberOfFloatingPointParms);
+ GENBOOLCOMMENT(", ", SecondHalfOfMandatoryField, HasParmsOnStack);
+ EmitComment();
+ OutStreamer->emitIntValueInHexWithPadding(SecondHalfOfMandatoryField & 0xff,
+ 1);
+
+ // Generate the optional fields of traceback table.
+
+ // Parameter type.
+ if (NumberOfFixedPara || NumberOfFPPara) {
+ assert((SecondHalfOfMandatoryField & TracebackTable::HasVectorInfoMask) ==
+ 0 &&
+ "VectorInfo has not been implemented.");
+ uint32_t ParaType = FI->getParameterType();
+ CommentOS << "Parameter type = "
+ << XCOFF::parseParmsType(ParaType,
+ NumberOfFixedPara + NumberOfFPPara);
+ EmitComment();
+ OutStreamer->emitIntValueInHexWithPadding(ParaType, sizeof(ParaType));
+ }
+
+ // Traceback table offset.
+ OutStreamer->AddComment("Function size");
+ if (FirstHalfOfMandatoryField & TracebackTable::HasTraceBackTableOffsetMask) {
+ MCSymbol *FuncSectSym = getObjFileLowering().getFunctionEntryPointSymbol(
+ &(MF->getFunction()), TM);
+ OutStreamer->emitAbsoluteSymbolDiff(FuncEnd, FuncSectSym, 4);
+ }
+
+  // We never set the interrupt handler bit, so the handler info field is not
+  // implemented yet.
+  if (FirstHalfOfMandatoryField & TracebackTable::IsInterruptHandlerMask)
+    report_fatal_error("Hand_Mask not implemented yet");
+
+ if (FirstHalfOfMandatoryField & TracebackTable::HasControlledStorageMask)
+    report_fatal_error("Ctl_Info not implemented yet");
+
+ if (FirstHalfOfMandatoryField & TracebackTable::IsFunctionNamePresentMask) {
+ StringRef Name = MF->getName().substr(0, INT16_MAX);
+ int16_t NameLength = Name.size();
+ CommentOS << "Function name len = "
+ << static_cast<unsigned int>(NameLength);
+ EmitCommentAndValue(NameLength, 2);
+ OutStreamer->AddComment("Function Name");
+ OutStreamer->emitBytes(Name);
+ }
+
+ if (FirstHalfOfMandatoryField & TracebackTable::IsAllocaUsedMask) {
+ uint8_t AllocReg = XCOFF::AllocRegNo;
+ OutStreamer->AddComment("AllocaUsed");
+ OutStreamer->emitIntValueInHex(AllocReg, sizeof(AllocReg));
+ }
+
+ uint8_t ExtensionTableFlag = 0;
+ if (SecondHalfOfMandatoryField & TracebackTable::HasExtensionTableMask) {
+ if (ShouldEmitEHBlock)
+ ExtensionTableFlag |= ExtendedTBTableFlag::TB_EH_INFO;
+
+ CommentOS << "ExtensionTableFlag = "
+ << getExtendedTBTableFlagString(ExtensionTableFlag);
+ EmitCommentAndValue(ExtensionTableFlag, sizeof(ExtensionTableFlag));
+ }
+
+ if (ExtensionTableFlag & ExtendedTBTableFlag::TB_EH_INFO) {
+ auto &Ctx = OutStreamer->getContext();
+ MCSymbol *EHInfoSym =
+ TargetLoweringObjectFileXCOFF::getEHInfoTableSymbol(MF);
+ MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(EHInfoSym);
+ const MCSymbol *TOCBaseSym =
+ cast<MCSectionXCOFF>(getObjFileLowering().getTOCBaseSection())
+ ->getQualNameSymbol();
+ const MCExpr *Exp =
+ MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCEntry, Ctx),
+ MCSymbolRefExpr::create(TOCBaseSym, Ctx), Ctx);
+
+ const DataLayout &DL = getDataLayout();
+ OutStreamer->emitValueToAlignment(4);
+ OutStreamer->AddComment("EHInfo Table");
+ OutStreamer->emitValue(Exp, DL.getPointerSize());
+ }
+
+#undef GENBOOLCOMMENT
+#undef GENVALUECOMMENT
+}
+
void PPCAIXAsmPrinter::ValidateGV(const GlobalVariable *GV) {
// Early error checking limiting what is supported.
if (GV->isThreadLocal())
report_fatal_error("Thread local not yet supported on AIX.");
- if (GV->hasSection())
- report_fatal_error("Custom section for Data not yet supported.");
-
if (GV->hasComdat())
report_fatal_error("COMDAT not yet supported by AIX.");
}
+static bool isSpecialLLVMGlobalArrayToSkip(const GlobalVariable *GV) {
+ return GV->hasAppendingLinkage() &&
+ StringSwitch<bool>(GV->getName())
+             // TODO: The linker could still eliminate the GV if we just skip
+             // handling the llvm.used array. Skip them for now until we or the
+             // AIX OS team come up with a good solution.
+ .Case("llvm.used", true)
+ // It's correct to just skip llvm.compiler.used array here.
+ .Case("llvm.compiler.used", true)
+ .Default(false);
+}
+
static bool isSpecialLLVMGlobalArrayForStaticInit(const GlobalVariable *GV) {
return StringSwitch<bool>(GV->getName())
.Cases("llvm.global_ctors", "llvm.global_dtors", true)
@@ -1669,19 +2042,15 @@ static bool isSpecialLLVMGlobalArrayForStaticInit(const GlobalVariable *GV) {
}
void PPCAIXAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
- ValidateGV(GV);
-
- // TODO: Update the handling of global arrays for static init when we support
- // the ".ref" directive.
- // Otherwise, we can skip these arrays, because the AIX linker collects
- // static init functions simply based on their name.
- if (isSpecialLLVMGlobalArrayForStaticInit(GV))
+  // Special LLVM global arrays have been handled during initialization.
+  if (isSpecialLLVMGlobalArrayToSkip(GV) ||
+      isSpecialLLVMGlobalArrayForStaticInit(GV))
return;
- // Create the symbol, set its storage class.
+ assert(!GV->getName().startswith("llvm.") &&
+ "Unhandled intrinsic global variable.");
+ ValidateGV(GV);
+
MCSymbolXCOFF *GVSym = cast<MCSymbolXCOFF>(getSymbol(GV));
- GVSym->setStorageClass(
- TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GV));
if (GV->isDeclarationForLinker()) {
emitLinkage(GV, GVSym);
@@ -1705,10 +2074,12 @@ void PPCAIXAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
if (GVKind.isCommon() || GVKind.isBSSLocal()) {
Align Alignment = GV->getAlign().getValueOr(DL.getPreferredAlign(GV));
uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());
+ GVSym->setStorageClass(
+ TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GV));
if (GVKind.isBSSLocal())
OutStreamer->emitXCOFFLocalCommonSymbol(
- OutContext.getOrCreateSymbol(GVSym->getUnqualifiedName()), Size,
+ OutContext.getOrCreateSymbol(GVSym->getSymbolTableName()), Size,
GVSym, Alignment.value());
else
OutStreamer->emitCommonSymbol(GVSym, Size, Alignment.value());
@@ -1718,7 +2089,18 @@ void PPCAIXAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
MCSymbol *EmittedInitSym = GVSym;
emitLinkage(GV, EmittedInitSym);
emitAlignment(getGVAlignment(GV, DL), GV);
- OutStreamer->emitLabel(EmittedInitSym);
+
+  // When -fdata-sections is enabled, every GlobalVariable will
+  // be put into its own csect; therefore, a label is not necessary here.
+ if (!TM.getDataSections() || GV->hasSection()) {
+ OutStreamer->emitLabel(EmittedInitSym);
+ }
+
+ // Emit aliasing label for global variable.
+ llvm::for_each(GOAliasMap[GV], [this](const GlobalAlias *Alias) {
+ OutStreamer->emitLabel(getSymbol(Alias));
+ });
+
emitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer());
}
@@ -1730,6 +2112,13 @@ void PPCAIXAsmPrinter::emitFunctionDescriptor() {
// Emit function descriptor.
OutStreamer->SwitchSection(
cast<MCSymbolXCOFF>(CurrentFnDescSym)->getRepresentedCsect());
+
+ // Emit aliasing label for function descriptor csect.
+ llvm::for_each(GOAliasMap[&MF->getFunction()],
+ [this](const GlobalAlias *Alias) {
+ OutStreamer->emitLabel(getSymbol(Alias));
+ });
+
// Emit function entry point address.
OutStreamer->emitValue(MCSymbolRefExpr::create(CurrentFnSym, OutContext),
PointerSize);
@@ -1745,6 +2134,20 @@ void PPCAIXAsmPrinter::emitFunctionDescriptor() {
OutStreamer->SwitchSection(Current.first, Current.second);
}
+void PPCAIXAsmPrinter::emitFunctionEntryLabel() {
+  // It's not necessary to emit the label when each function is
+  // in its own csect.
+ if (!TM.getFunctionSections())
+ PPCAsmPrinter::emitFunctionEntryLabel();
+
+ // Emit aliasing label for function entry point label.
+ llvm::for_each(
+ GOAliasMap[&MF->getFunction()], [this](const GlobalAlias *Alias) {
+ OutStreamer->emitLabel(
+ getObjFileLowering().getFunctionEntryPointSymbol(Alias, TM));
+ });
+}
+
void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) {
// If there are no functions in this module, we will never need to reference
// the TOC base.
@@ -1757,20 +2160,10 @@ void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) {
PPCTargetStreamer *TS =
static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer());
- const unsigned EntryByteSize = Subtarget->isPPC64() ? 8 : 4;
- const unsigned TOCEntriesByteSize = TOC.size() * EntryByteSize;
- // TODO: If TOC entries' size is larger than 32768, then we run out of
- // positive displacement to reach the TOC entry. We need to decide how to
- // handle entries' size larger than that later.
- if (TOCEntriesByteSize > 32767) {
- report_fatal_error("Handling of TOC entry displacement larger than 32767 "
- "is not yet implemented.");
- }
-
for (auto &I : TOC) {
// Setup the csect for the current TC entry.
MCSectionXCOFF *TCEntry = cast<MCSectionXCOFF>(
- getObjFileLowering().getSectionForTOCEntry(I.first));
+ getObjFileLowering().getSectionForTOCEntry(I.first, TM));
OutStreamer->SwitchSection(TCEntry);
OutStreamer->emitLabel(I.second);
@@ -1780,10 +2173,6 @@ void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) {
}
bool PPCAIXAsmPrinter::doInitialization(Module &M) {
- if (M.alias_size() > 0u)
- report_fatal_error(
- "module has aliases, which LLVM does not yet support for AIX");
-
const bool Result = PPCAsmPrinter::doInitialization(M);
auto setCsectAlignment = [this](const GlobalObject *GO) {
@@ -1803,19 +2192,174 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) {
// We need to know, up front, the alignment of csects for the assembly path,
// because once a .csect directive gets emitted, we could not change the
// alignment value on it.
- for (const auto &G : M.globals())
+ for (const auto &G : M.globals()) {
+ if (isSpecialLLVMGlobalArrayToSkip(&G))
+ continue;
+
+ if (isSpecialLLVMGlobalArrayForStaticInit(&G)) {
+ // Generate a format indicator and a unique module id to be a part of
+ // the sinit and sterm function names.
+ if (FormatIndicatorAndUniqueModId.empty()) {
+ std::string UniqueModuleId = getUniqueModuleId(&M);
+ if (UniqueModuleId != "")
+          // TODO: Use the source file's full path to generate the unique
+          // module id, and add a format indicator as part of the function name
+          // in case we ever support more than one format.
+ FormatIndicatorAndUniqueModId = "clang_" + UniqueModuleId.substr(1);
+ else
+ // Use the Pid and current time as the unique module id when we cannot
+ // generate one based on a module's strong external symbols.
+ // FIXME: Adjust the comment accordingly after we use source file full
+ // path instead.
+ FormatIndicatorAndUniqueModId =
+ "clangPidTime_" + llvm::itostr(sys::Process::getProcessId()) +
+ "_" + llvm::itostr(time(nullptr));
+ }
+
+ emitSpecialLLVMGlobal(&G);
+ continue;
+ }
+
setCsectAlignment(&G);
+ }
for (const auto &F : M)
setCsectAlignment(&F);
+ // Construct an aliasing list for each GlobalObject.
+ for (const auto &Alias : M.aliases()) {
+ const GlobalObject *Base = Alias.getBaseObject();
+ if (!Base)
+ report_fatal_error(
+ "alias without a base object is not yet supported on AIX");
+ GOAliasMap[Base].push_back(&Alias);
+ }
+
return Result;
}
-/// createPPCAsmPrinterPass - Returns a pass that prints the PPC assembly code
-/// for a MachineFunction to the given output stream, in a format that the
-/// Darwin assembler can deal with.
-///
+void PPCAIXAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case PPC::BL8:
+ case PPC::BL:
+ case PPC::BL8_NOP:
+ case PPC::BL_NOP: {
+ const MachineOperand &MO = MI->getOperand(0);
+ if (MO.isSymbol()) {
+ MCSymbolXCOFF *S =
+ cast<MCSymbolXCOFF>(OutContext.getOrCreateSymbol(MO.getSymbolName()));
+ ExtSymSDNodeSymbols.insert(S);
+ }
+ } break;
+ case PPC::BL_TLS:
+ case PPC::BL8_TLS:
+ case PPC::BL8_TLS_:
+ case PPC::BL8_NOP_TLS:
+ report_fatal_error("TLS call not yet implemented");
+ case PPC::TAILB:
+ case PPC::TAILB8:
+ case PPC::TAILBA:
+ case PPC::TAILBA8:
+ case PPC::TAILBCTR:
+ case PPC::TAILBCTR8:
+ if (MI->getOperand(0).isSymbol())
+ report_fatal_error("Tail call for extern symbol not yet supported.");
+ break;
+ }
+ return PPCAsmPrinter::emitInstruction(MI);
+}
+
+bool PPCAIXAsmPrinter::doFinalization(Module &M) {
+ for (MCSymbol *Sym : ExtSymSDNodeSymbols)
+ OutStreamer->emitSymbolAttribute(Sym, MCSA_Extern);
+ return PPCAsmPrinter::doFinalization(M);
+}
+
+static unsigned mapToSinitPriority(int P) {
+ if (P < 0 || P > 65535)
+ report_fatal_error("invalid init priority");
+
+ if (P <= 20)
+ return P;
+
+ if (P < 81)
+ return 20 + (P - 20) * 16;
+
+ if (P <= 1124)
+ return 1004 + (P - 81);
+
+ if (P < 64512)
+ return 2047 + (P - 1124) * 33878;
+
+ return 2147482625u + (P - 64512);
+}
+
+static std::string convertToSinitPriority(int Priority) {
+ // This helper function converts clang init priority to values used in sinit
+ // and sterm functions.
+ //
+  // The conversion strategy is:
+  // We map the reserved clang/gnu priority range [0, 100] into the sinit/sterm
+  // reserved priority range [0, 1023] by
+  // - directly mapping the first 21 and the last 20 elements of the ranges
+  // - linearly interpolating the intermediate values with a step size of 16.
+  //
+  // We map the non-reserved clang/gnu priority range of [101, 65535] into the
+  // sinit/sterm priority range [1024, 2147483648] by:
+  // - directly mapping the first and the last 1024 elements of the ranges
+  // - linearly interpolating the intermediate values with a step size of 33878.
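+  //
+  // For example, clang priorities 0, 100, 101 and 65535 map to 0, 1023, 1024
+  // and 2147483648 (0x80000000) respectively, so the default priority 65535
+  // produces the suffix "80000000".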
+ unsigned int P = mapToSinitPriority(Priority);
+
+ std::string PrioritySuffix;
+ llvm::raw_string_ostream os(PrioritySuffix);
+ os << llvm::format_hex_no_prefix(P, 8);
+ os.flush();
+ return PrioritySuffix;
+}
+
+void PPCAIXAsmPrinter::emitXXStructorList(const DataLayout &DL,
+ const Constant *List, bool IsCtor) {
+ SmallVector<Structor, 8> Structors;
+ preprocessXXStructorList(DL, List, Structors);
+ if (Structors.empty())
+ return;
+
+ unsigned Index = 0;
+ for (Structor &S : Structors) {
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(S.Func))
+ S.Func = CE->getOperand(0);
+
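+    // This creates aliases named "__sinit" or "__sterm" followed by the
+    // eight-digit hex priority, the format indicator and unique module id, and
+    // the entry index, e.g. "__sinit80000000_clang_<unique module id>_0" for
+    // the first constructor at the default priority 65535.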
+ llvm::GlobalAlias::create(
+ GlobalValue::ExternalLinkage,
+ (IsCtor ? llvm::Twine("__sinit") : llvm::Twine("__sterm")) +
+ llvm::Twine(convertToSinitPriority(S.Priority)) +
+ llvm::Twine("_", FormatIndicatorAndUniqueModId) +
+ llvm::Twine("_", llvm::utostr(Index++)),
+ cast<Function>(S.Func));
+ }
+}
+
+void PPCAIXAsmPrinter::emitTTypeReference(const GlobalValue *GV,
+ unsigned Encoding) {
+ if (GV) {
+ MCSymbol *TypeInfoSym = TM.getSymbol(GV);
+ MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(TypeInfoSym);
+ const MCSymbol *TOCBaseSym =
+ cast<MCSectionXCOFF>(getObjFileLowering().getTOCBaseSection())
+ ->getQualNameSymbol();
+ auto &Ctx = OutStreamer->getContext();
+ const MCExpr *Exp =
+ MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCEntry, Ctx),
+ MCSymbolRefExpr::create(TOCBaseSym, Ctx), Ctx);
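+    // The emitted value is the distance from the TOC base to the TOC entry
+    // that holds the typeinfo address.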
+ OutStreamer->emitValue(Exp, GetSizeOfEncodedValue(Encoding));
+ } else
+ OutStreamer->emitIntValue(0, GetSizeOfEncodedValue(Encoding));
+}
+
+// Return a pass that prints the PPC assembly code for a MachineFunction to the
+// given output stream.
static AsmPrinter *
createPPCAsmPrinterPass(TargetMachine &tm,
std::unique_ptr<MCStreamer> &&Streamer) {
@@ -1829,6 +2373,8 @@ createPPCAsmPrinterPass(TargetMachine &tm,
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCAsmPrinter() {
TargetRegistry::RegisterAsmPrinter(getThePPC32Target(),
createPPCAsmPrinterPass);
+ TargetRegistry::RegisterAsmPrinter(getThePPC32LETarget(),
+ createPPCAsmPrinterPass);
TargetRegistry::RegisterAsmPrinter(getThePPC64Target(),
createPPCAsmPrinterPass);
TargetRegistry::RegisterAsmPrinter(getThePPC64LETarget(),
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
index f125ca011cd2..3c6b1f84b821 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
@@ -59,7 +59,7 @@ using namespace llvm;
namespace {
-#define DEBUG_TYPE "bool-ret-to-int"
+#define DEBUG_TYPE "ppc-bool-ret-to-int"
STATISTIC(NumBoolRetPromotion,
"Number of times a bool feeding a RetInst was promoted to an int");
@@ -75,8 +75,7 @@ class PPCBoolRetToInt : public FunctionPass {
WorkList.push_back(V);
Defs.insert(V);
while (!WorkList.empty()) {
- Value *Curr = WorkList.back();
- WorkList.pop_back();
+ Value *Curr = WorkList.pop_back_val();
auto *CurrUser = dyn_cast<User>(Curr);
// Operands of CallInst/Constant are skipped because they may not be Bool
// type. For CallInst, their positions are defined by ABI.
@@ -283,8 +282,8 @@ private:
} // end anonymous namespace
char PPCBoolRetToInt::ID = 0;
-INITIALIZE_PASS(PPCBoolRetToInt, "bool-ret-to-int",
- "Convert i1 constants to i32/i64 if they are returned",
- false, false)
+INITIALIZE_PASS(PPCBoolRetToInt, "ppc-bool-ret-to-int",
+ "Convert i1 constants to i32/i64 if they are returned", false,
+ false)
FunctionPass *llvm::createPPCBoolRetToIntPass() { return new PPCBoolRetToInt(); }
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCCState.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCCState.cpp
index 5116f0d121f4..79ffc6627a61 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCCState.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCCState.cpp
@@ -32,4 +32,4 @@ void PPCCCState::PreAnalyzeFormalArguments(
OriginalArgWasPPCF128.push_back(false);
}
}
-}
\ No newline at end of file
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
index bb12e05173a6..b9518d6d7064 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -1,4 +1,4 @@
-//===-- PPCCTRLoops.cpp - Identify and generate CTR loops -----------------===//
+//===-- PPCCTRLoops.cpp - Verify CTR loops -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,74 +6,48 @@
//
//===----------------------------------------------------------------------===//
//
-// This pass identifies loops where we can generate the PPC branch instructions
-// that decrement and test the count register (CTR) (bdnz and friends).
-//
-// The pattern that defines the induction variable can changed depending on
-// prior optimizations. For example, the IndVarSimplify phase run by 'opt'
-// normalizes induction variables, and the Loop Strength Reduction pass
-// run by 'llc' may also make changes to the induction variable.
-//
-// Criteria for CTR loops:
-// - Countable loops (w/ ind. var for a trip count)
-// - Try inner-most loops first
-// - No nested CTR loops.
-// - No function calls in loops.
+// This pass verifies that all bdnz/bdz instructions are dominated by a loop
+// mtctr before any other instructions that might clobber the ctr register.
//
//===----------------------------------------------------------------------===//
+// CTR loops are produced by the HardwareLoops pass and this pass is simply a
+// verification that no invalid CTR loops are produced. As such, it isn't
+// something that needs to be run (or even defined) for Release builds so the
+// entire file is guarded by NDEBUG.
+#ifndef NDEBUG
+#include <vector>
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "PPC.h"
-#include "PPCSubtarget.h"
-#include "PPCTargetMachine.h"
-#include "PPCTargetTransformInfo.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/CodeGen/TargetSchedule.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/ValueHandle.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/ilist_iterator.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBundleIterator.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/Register.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GenericDomTreeConstruction.h"
+#include "llvm/Support/Printable.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-
-#ifndef NDEBUG
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#endif
using namespace llvm;
-#define DEBUG_TYPE "ctrloops"
-
-#ifndef NDEBUG
-static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1));
-#endif
+#define DEBUG_TYPE "ppc-ctrloops-verify"
namespace {
-#ifndef NDEBUG
struct PPCCTRLoopsVerify : public MachineFunctionPass {
public:
static char ID;
@@ -94,10 +68,8 @@ namespace {
};
char PPCCTRLoopsVerify::ID = 0;
-#endif // NDEBUG
} // end anonymous namespace
-#ifndef NDEBUG
INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
"PowerPC CTR Loops Verify", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
@@ -107,9 +79,7 @@ INITIALIZE_PASS_END(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
FunctionPass *llvm::createPPCCTRLoopsVerify() {
return new PPCCTRLoopsVerify();
}
-#endif // NDEBUG
-#ifndef NDEBUG
static bool clobbersCTR(const MachineInstr &MI) {
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI.getOperand(i);
@@ -178,9 +148,7 @@ queue_preds:
return false;
}
- for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
- PIE = MBB->pred_end(); PI != PIE; ++PI)
- Preds.push_back(*PI);
+ append_range(Preds, MBB->predecessors());
}
do {
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCallingConv.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCallingConv.td
index 1eaa7f7a44b3..cc3486718179 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCallingConv.td
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCallingConv.td
@@ -59,10 +59,7 @@ def RetCC_PPC_Cold : CallingConv<[
CCIfType<[f32], CCAssignToReg<[F1]>>,
CCIfType<[f64], CCAssignToReg<[F1]>>,
- CCIfType<[f128], CCIfSubtarget<"hasP9Vector()", CCAssignToReg<[V2]>>>,
-
- CCIfType<[v4f64, v4f32, v4i1],
- CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1]>>>,
+ CCIfType<[f128], CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2]>>>,
CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64],
CCIfSubtarget<"hasAltivec()",
@@ -95,13 +92,9 @@ def RetCC_PPC : CallingConv<[
// For P9, f128 are passed in vector registers.
CCIfType<[f128],
- CCIfSubtarget<"hasP9Vector()",
+ CCIfSubtarget<"hasAltivec()",
CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>,
- // QPX vectors are returned in QF1 and QF2.
- CCIfType<[v4f64, v4f32, v4i1],
- CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>,
-
// Vector types returned as "direct" go into V2 .. V9; note that only the
// ELFv2 ABI fully utilizes all these registers.
CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64],
@@ -156,10 +149,8 @@ def RetCC_PPC64_ELF_FIS : CallingConv<[
CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
CCIfType<[f128],
- CCIfSubtarget<"hasP9Vector()",
+ CCIfSubtarget<"hasAltivec()",
CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>,
- CCIfType<[v4f64, v4f32, v4i1],
- CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>,
CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64],
CCIfSubtarget<"hasAltivec()",
CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>
@@ -223,12 +214,9 @@ def CC_PPC32_SVR4_Common : CallingConv<[
CCIfType<[f32], CCIfSubtarget<"hasSPE()", CCAssignToStack<4, 4>>>,
CCIfType<[f64], CCIfSubtarget<"hasSPE()", CCAssignToStack<8, 8>>>,
- // QPX vectors that are stored in double precision need 32-byte alignment.
- CCIfType<[v4f64, v4i1], CCAssignToStack<32, 32>>,
-
// Vectors and float128 get 16-byte stack slots that are 16-byte aligned.
CCIfType<[v16i8, v8i16, v4i32, v4f32, v2f64, v2i64], CCAssignToStack<16, 16>>,
- CCIfType<[f128], CCIfSubtarget<"hasP9Vector()", CCAssignToStack<16, 16>>>
+ CCIfType<[f128], CCIfSubtarget<"hasAltivec()", CCAssignToStack<16, 16>>>
]>;
// This calling convention puts vector arguments always on the stack. It is used
@@ -243,10 +231,6 @@ def CC_PPC32_SVR4_VarArg : CallingConv<[
// put vector arguments in vector registers before putting them on the stack.
let Entry = 1 in
def CC_PPC32_SVR4 : CallingConv<[
- // QPX vectors mirror the scalar FP convention.
- CCIfType<[v4f64, v4f32, v4i1], CCIfSubtarget<"hasQPX()",
- CCAssignToReg<[QF1, QF2, QF3, QF4, QF5, QF6, QF7, QF8]>>>,
-
// The first 12 Vector arguments are passed in AltiVec registers.
CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64],
CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7,
@@ -254,7 +238,7 @@ def CC_PPC32_SVR4 : CallingConv<[
// Float128 types treated as vector arguments.
CCIfType<[f128],
- CCIfSubtarget<"hasP9Vector()", CCAssignToReg<[V2, V3, V4, V5, V6, V7,
+ CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7,
V8, V9, V10, V11, V12, V13]>>>,
CCDelegateTo<CC_PPC32_SVR4_Common>
@@ -307,6 +291,8 @@ def CSR_AIX32 : CalleeSavedRegs<(add R13, R14, R15, R16, R17, R18, R19, R20,
F27, F28, F29, F30, F31, CR2, CR3, CR4
)>;
+def CSR_AIX32_Altivec : CalleeSavedRegs<(add CSR_AIX32, CSR_Altivec)>;
+
// Common CalleeSavedRegs for SVR4 and AIX.
def CSR_PPC64 : CalleeSavedRegs<(add X14, X15, X16, X17, X18, X19, X20,
X21, X22, X23, X24, X25, X26, X27, X28,
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp
index c9f74bbf861c..08b7bdb3ac1e 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp
@@ -77,8 +77,9 @@ protected:
if (J->getOperand(0).getMBB() == &ReturnMBB) {
// This is an unconditional branch to the return. Replace the
// branch with a blr.
- BuildMI(**PI, J, J->getDebugLoc(), TII->get(I->getOpcode()))
- .copyImplicitOps(*I);
+ MachineInstr *MI = ReturnMBB.getParent()->CloneMachineInstr(&*I);
+ (*PI)->insert(J, MI);
+
MachineBasicBlock::iterator K = J--;
K->eraseFromParent();
BlockChanged = true;
@@ -89,10 +90,13 @@ protected:
if (J->getOperand(2).getMBB() == &ReturnMBB) {
// This is a conditional branch to the return. Replace the branch
// with a bclr.
- BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BCCLR))
+ MachineInstr *MI = ReturnMBB.getParent()->CloneMachineInstr(&*I);
+ MI->setDesc(TII->get(PPC::BCCLR));
+ MachineInstrBuilder(*ReturnMBB.getParent(), MI)
.add(J->getOperand(0))
- .add(J->getOperand(1))
- .copyImplicitOps(*I);
+ .add(J->getOperand(1));
+ (*PI)->insert(J, MI);
+
MachineBasicBlock::iterator K = J--;
K->eraseFromParent();
BlockChanged = true;
@@ -103,11 +107,13 @@ protected:
if (J->getOperand(1).getMBB() == &ReturnMBB) {
// This is a conditional branch to the return. Replace the branch
// with a bclr.
- BuildMI(
- **PI, J, J->getDebugLoc(),
- TII->get(J->getOpcode() == PPC::BC ? PPC::BCLR : PPC::BCLRn))
- .add(J->getOperand(0))
- .copyImplicitOps(*I);
+ MachineInstr *MI = ReturnMBB.getParent()->CloneMachineInstr(&*I);
+ MI->setDesc(
+ TII->get(J->getOpcode() == PPC::BC ? PPC::BCLR : PPC::BCLRn));
+ MachineInstrBuilder(*ReturnMBB.getParent(), MI)
+ .add(J->getOperand(0));
+ (*PI)->insert(J, MI);
+
MachineBasicBlock::iterator K = J--;
K->eraseFromParent();
BlockChanged = true;
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCFastISel.cpp
index 39790ac9a8aa..c181816e31c6 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCFastISel.cpp
@@ -86,7 +86,6 @@ typedef struct Address {
class PPCFastISel final : public FastISel {
const TargetMachine &TM;
- const PPCSubtarget *PPCSubTarget;
const PPCSubtarget *Subtarget;
PPCFunctionInfo *PPCFuncInfo;
const TargetInstrInfo &TII;
@@ -97,7 +96,6 @@ class PPCFastISel final : public FastISel {
explicit PPCFastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo)
: FastISel(FuncInfo, LibInfo), TM(FuncInfo.MF->getTarget()),
- PPCSubTarget(&FuncInfo.MF->getSubtarget<PPCSubtarget>()),
Subtarget(&FuncInfo.MF->getSubtarget<PPCSubtarget>()),
PPCFuncInfo(FuncInfo.MF->getInfo<PPCFunctionInfo>()),
TII(*Subtarget->getInstrInfo()), TLI(*Subtarget->getTargetLowering()),
@@ -1567,6 +1565,10 @@ bool PPCFastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (IsVarArg)
return false;
+ // If this is a PC-Rel function, let SDISel handle the call.
+ if (Subtarget->isUsingPCRelativeCalls())
+ return false;
+
// Handle simple calls for now, with legal return types and
// those that can be extended.
Type *RetTy = CLI.RetTy;
@@ -1622,7 +1624,10 @@ bool PPCFastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8)
return false;
- if (ArgVT.isVector())
+    // FIXME: FastISel cannot handle non-simple types yet, including 128-bit FP
+    // types, which are passed through vector registers. Skip these types and
+    // fall back to the default SelectionDAG-based selection.
+ if (ArgVT.isVector() || ArgVT == MVT::f128)
return false;
unsigned Arg = getRegForValue(ArgValue);
@@ -1991,6 +1996,10 @@ bool PPCFastISel::fastSelectInstruction(const Instruction *I) {
// Materialize a floating-point constant into a register, and return
// the register number (or zero if we failed to handle it).
unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
+ // If this is a PC-Rel function, let SDISel handle constant pool.
+ if (Subtarget->isUsingPCRelativeCalls())
+ return false;
+
// No plans to handle long double here.
if (VT != MVT::f32 && VT != MVT::f64)
return 0;
@@ -2055,6 +2064,10 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
// Materialize the address of a global value into a register, and return
// the register number (or zero if we failed to handle it).
unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
+ // If this is a PC-Rel function, let SDISel handle GV materialization.
+ if (Subtarget->isUsingPCRelativeCalls())
+ return false;
+
assert(VT == MVT::i64 && "Non-address!");
const TargetRegisterClass *RC = &PPC::G8RC_and_G8RC_NOX0RegClass;
unsigned DestReg = createResultReg(RC);
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index 2ee394e9259d..50ce11b8374f 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -39,15 +39,6 @@ EnablePEVectorSpills("ppc-enable-pe-vector-spills",
cl::desc("Enable spills in prologue to vector registers."),
cl::init(false), cl::Hidden);
-/// VRRegNo - Map from a numbered VR register to its enum value.
-///
-static const MCPhysReg VRRegNo[] = {
- PPC::V0 , PPC::V1 , PPC::V2 , PPC::V3 , PPC::V4 , PPC::V5 , PPC::V6 , PPC::V7 ,
- PPC::V8 , PPC::V9 , PPC::V10, PPC::V11, PPC::V12, PPC::V13, PPC::V14, PPC::V15,
- PPC::V16, PPC::V17, PPC::V18, PPC::V19, PPC::V20, PPC::V21, PPC::V22, PPC::V23,
- PPC::V24, PPC::V25, PPC::V26, PPC::V27, PPC::V28, PPC::V29, PPC::V30, PPC::V31
-};
-
static unsigned computeReturnSaveOffset(const PPCSubtarget &STI) {
if (STI.isAIXABI())
return STI.isPPC64() ? 16 : 8;
@@ -227,19 +218,14 @@ const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots(
CALLEE_SAVED_VRS
};
- static const SpillSlot AIXOffsets32[] = {
- CALLEE_SAVED_FPRS,
- CALLEE_SAVED_GPRS32,
- // Add AIX's extra CSR.
- {PPC::R13, -76},
- // TODO: Update when we add vector support for AIX.
- };
+ static const SpillSlot AIXOffsets32[] = {CALLEE_SAVED_FPRS,
+ CALLEE_SAVED_GPRS32,
+ // Add AIX's extra CSR.
+ {PPC::R13, -76},
+ CALLEE_SAVED_VRS};
static const SpillSlot AIXOffsets64[] = {
- CALLEE_SAVED_FPRS,
- CALLEE_SAVED_GPRS64,
- // TODO: Update when we add vector support for AIX.
- };
+ CALLEE_SAVED_FPRS, CALLEE_SAVED_GPRS64, CALLEE_SAVED_VRS};
if (Subtarget.is64BitELFABI()) {
NumEntries = array_lengthof(ELFOffsets64);
@@ -262,153 +248,11 @@ const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots(
return AIXOffsets32;
}
-/// RemoveVRSaveCode - We have found that this function does not need any code
-/// to manipulate the VRSAVE register, even though it uses vector registers.
-/// This can happen when the only registers used are known to be live in or out
-/// of the function. Remove all of the VRSAVE related code from the function.
-/// FIXME: The removal of the code results in a compile failure at -O0 when the
-/// function contains a function call, as the GPR containing original VRSAVE
-/// contents is spilled and reloaded around the call. Without the prolog code,
-/// the spill instruction refers to an undefined register. This code needs
-/// to account for all uses of that GPR.
-static void RemoveVRSaveCode(MachineInstr &MI) {
- MachineBasicBlock *Entry = MI.getParent();
- MachineFunction *MF = Entry->getParent();
-
- // We know that the MTVRSAVE instruction immediately follows MI. Remove it.
- MachineBasicBlock::iterator MBBI = MI;
- ++MBBI;
- assert(MBBI != Entry->end() && MBBI->getOpcode() == PPC::MTVRSAVE);
- MBBI->eraseFromParent();
-
- bool RemovedAllMTVRSAVEs = true;
- // See if we can find and remove the MTVRSAVE instruction from all of the
- // epilog blocks.
- for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) {
- // If last instruction is a return instruction, add an epilogue
- if (I->isReturnBlock()) {
- bool FoundIt = false;
- for (MBBI = I->end(); MBBI != I->begin(); ) {
- --MBBI;
- if (MBBI->getOpcode() == PPC::MTVRSAVE) {
- MBBI->eraseFromParent(); // remove it.
- FoundIt = true;
- break;
- }
- }
- RemovedAllMTVRSAVEs &= FoundIt;
- }
- }
-
- // If we found and removed all MTVRSAVE instructions, remove the read of
- // VRSAVE as well.
- if (RemovedAllMTVRSAVEs) {
- MBBI = MI;
- assert(MBBI != Entry->begin() && "UPDATE_VRSAVE is first instr in block?");
- --MBBI;
- assert(MBBI->getOpcode() == PPC::MFVRSAVE && "VRSAVE instrs wandered?");
- MBBI->eraseFromParent();
- }
-
- // Finally, nuke the UPDATE_VRSAVE.
- MI.eraseFromParent();
-}
-
-// HandleVRSaveUpdate - MI is the UPDATE_VRSAVE instruction introduced by the
-// instruction selector. Based on the vector registers that have been used,
-// transform this into the appropriate ORI instruction.
-static void HandleVRSaveUpdate(MachineInstr &MI, const TargetInstrInfo &TII) {
- MachineFunction *MF = MI.getParent()->getParent();
- const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
- DebugLoc dl = MI.getDebugLoc();
-
- const MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned UsedRegMask = 0;
- for (unsigned i = 0; i != 32; ++i)
- if (MRI.isPhysRegModified(VRRegNo[i]))
- UsedRegMask |= 1 << (31-i);
-
- // Live in and live out values already must be in the mask, so don't bother
- // marking them.
- for (std::pair<unsigned, unsigned> LI : MF->getRegInfo().liveins()) {
- unsigned RegNo = TRI->getEncodingValue(LI.first);
- if (VRRegNo[RegNo] == LI.first) // If this really is a vector reg.
- UsedRegMask &= ~(1 << (31-RegNo)); // Doesn't need to be marked.
- }
-
- // Live out registers appear as use operands on return instructions.
- for (MachineFunction::const_iterator BI = MF->begin(), BE = MF->end();
- UsedRegMask != 0 && BI != BE; ++BI) {
- const MachineBasicBlock &MBB = *BI;
- if (!MBB.isReturnBlock())
- continue;
- const MachineInstr &Ret = MBB.back();
- for (unsigned I = 0, E = Ret.getNumOperands(); I != E; ++I) {
- const MachineOperand &MO = Ret.getOperand(I);
- if (!MO.isReg() || !PPC::VRRCRegClass.contains(MO.getReg()))
- continue;
- unsigned RegNo = TRI->getEncodingValue(MO.getReg());
- UsedRegMask &= ~(1 << (31-RegNo));
- }
- }
-
- // If no registers are used, turn this into a copy.
- if (UsedRegMask == 0) {
- // Remove all VRSAVE code.
- RemoveVRSaveCode(MI);
- return;
- }
-
- Register SrcReg = MI.getOperand(1).getReg();
- Register DstReg = MI.getOperand(0).getReg();
-
- if ((UsedRegMask & 0xFFFF) == UsedRegMask) {
- if (DstReg != SrcReg)
- BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORI), DstReg)
- .addReg(SrcReg)
- .addImm(UsedRegMask);
- else
- BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORI), DstReg)
- .addReg(SrcReg, RegState::Kill)
- .addImm(UsedRegMask);
- } else if ((UsedRegMask & 0xFFFF0000) == UsedRegMask) {
- if (DstReg != SrcReg)
- BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
- .addReg(SrcReg)
- .addImm(UsedRegMask >> 16);
- else
- BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
- .addReg(SrcReg, RegState::Kill)
- .addImm(UsedRegMask >> 16);
- } else {
- if (DstReg != SrcReg)
- BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
- .addReg(SrcReg)
- .addImm(UsedRegMask >> 16);
- else
- BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
- .addReg(SrcReg, RegState::Kill)
- .addImm(UsedRegMask >> 16);
-
- BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORI), DstReg)
- .addReg(DstReg, RegState::Kill)
- .addImm(UsedRegMask & 0xFFFF);
- }
-
- // Remove the old UPDATE_VRSAVE instruction.
- MI.eraseFromParent();
-}
-
static bool spillsCR(const MachineFunction &MF) {
const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
return FuncInfo->isCRSpilled();
}
-static bool spillsVRSAVE(const MachineFunction &MF) {
- const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
- return FuncInfo->isVRSAVESpilled();
-}
-
static bool hasSpills(const MachineFunction &MF) {
const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
return FuncInfo->hasSpills();
@@ -474,7 +318,7 @@ PPCFrameLowering::determineFrameLayout(const MachineFunction &MF,
!FI->mustSaveTOC() && // No need to save TOC.
!RegInfo->hasBasePointer(MF); // No special alignment.
- // Note: for PPC32 SVR4ABI (Non-DarwinABI), we can still generate stackless
+ // Note: for PPC32 SVR4ABI, we can still generate stackless
// code if all local vars are reg-allocated.
bool FitsInRedZone = FrameSize <= Subtarget.getRedZoneSize();
@@ -531,9 +375,10 @@ bool PPCFrameLowering::needsFP(const MachineFunction &MF) const {
return false;
return MF.getTarget().Options.DisableFramePointerElim(MF) ||
- MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint() ||
- (MF.getTarget().Options.GuaranteedTailCallOpt &&
- MF.getInfo<PPCFunctionInfo>()->hasFastCall());
+ MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint() ||
+ MF.exposesReturnsTwice() ||
+ (MF.getTarget().Options.GuaranteedTailCallOpt &&
+ MF.getInfo<PPCFunctionInfo>()->hasFastCall());
}
void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const {
@@ -681,6 +526,8 @@ PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB,
// register is available, we can adjust for that by not overlapping the spill
// code. However, if we need to realign the stack (i.e. have a base pointer)
// and the stack frame is large, we need two scratch registers.
+// Also, the stack probe requires two scratch registers: one for the old SP,
+// and one for large frames and large probe sizes.
bool
PPCFrameLowering::twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const {
const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
@@ -692,8 +539,10 @@ PPCFrameLowering::twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const {
MachineFrameInfo &MFI = MF.getFrameInfo();
Align MaxAlign = MFI.getMaxAlign();
bool HasRedZone = Subtarget.isPPC64() || !Subtarget.isSVR4ABI();
+ const PPCTargetLowering &TLI = *Subtarget.getTargetLowering();
- return (IsLargeFrame || !HasRedZone) && HasBP && MaxAlign > 1;
+ return ((IsLargeFrame || !HasRedZone) && HasBP && MaxAlign > 1) ||
+ TLI.hasInlineStackProbe(MF);
}
bool PPCFrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const {
@@ -736,8 +585,8 @@ bool PPCFrameLowering::stackUpdateCanBeMoved(MachineFunction &MF) const {
// Frame pointers and base pointers complicate matters so don't do anything
// if we have them. For example having a frame pointer will sometimes require
// a copy of r1 into r31 and that makes keeping track of updates to r1 more
- // difficult.
- if (hasFP(MF) || RegInfo->hasBasePointer(MF))
+  // difficult. A similar situation exists with setjmp.
+ if (hasFP(MF) || RegInfo->hasBasePointer(MF) || MF.exposesReturnsTwice())
return false;
// Calls to fast_cc functions use different rules for passing parameters on
@@ -771,24 +620,8 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
bool isPPC64 = Subtarget.isPPC64();
// Get the ABI.
bool isSVR4ABI = Subtarget.isSVR4ABI();
- bool isAIXABI = Subtarget.isAIXABI();
bool isELFv2ABI = Subtarget.isELFv2ABI();
- assert((isSVR4ABI || isAIXABI) && "Unsupported PPC ABI.");
-
- // Scan the prolog, looking for an UPDATE_VRSAVE instruction. If we find it,
- // process it.
- if (!isSVR4ABI)
- for (unsigned i = 0; MBBI != MBB.end(); ++i, ++MBBI) {
- if (MBBI->getOpcode() == PPC::UPDATE_VRSAVE) {
- if (isAIXABI)
- report_fatal_error("UPDATE_VRSAVE is unexpected on AIX.");
- HandleVRSaveUpdate(*MBBI, TII);
- break;
- }
- }
-
- // Move MBBI back to the beginning of the prologue block.
- MBBI = MBB.begin();
+ assert((isSVR4ABI || Subtarget.isAIXABI()) && "Unsupported PPC ABI.");
// Work out frame sizes.
unsigned FrameSize = determineFrameLayoutAndUpdate(MF);
@@ -848,12 +681,8 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
"FrameSize must be >0 to save/restore the FP or LR for 32-bit SVR4.");
// Using the same bool variable as below to suppress compiler warnings.
- // Stack probe requires two scratch registers, one for old sp, one for large
- // frame and large probe size.
bool SingleScratchReg = findScratchRegister(
- &MBB, false,
- twoUniqueScratchRegsRequired(&MBB) || TLI.hasInlineStackProbe(MF),
- &ScratchReg, &TempReg);
+ &MBB, false, twoUniqueScratchRegsRequired(&MBB), &ScratchReg, &TempReg);
assert(SingleScratchReg &&
"Required number of registers not available in this block");
@@ -863,26 +692,18 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
int FPOffset = 0;
if (HasFP) {
- if (isSVR4ABI) {
- MachineFrameInfo &MFI = MF.getFrameInfo();
- int FPIndex = FI->getFramePointerSaveIndex();
- assert(FPIndex && "No Frame Pointer Save Slot!");
- FPOffset = MFI.getObjectOffset(FPIndex);
- } else {
- FPOffset = getFramePointerSaveOffset();
- }
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ int FPIndex = FI->getFramePointerSaveIndex();
+ assert(FPIndex && "No Frame Pointer Save Slot!");
+ FPOffset = MFI.getObjectOffset(FPIndex);
}
int BPOffset = 0;
if (HasBP) {
- if (isSVR4ABI) {
- MachineFrameInfo &MFI = MF.getFrameInfo();
- int BPIndex = FI->getBasePointerSaveIndex();
- assert(BPIndex && "No Base Pointer Save Slot!");
- BPOffset = MFI.getObjectOffset(BPIndex);
- } else {
- BPOffset = getBasePointerSaveOffset();
- }
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ int BPIndex = FI->getBasePointerSaveIndex();
+ assert(BPIndex && "No Base Pointer Save Slot!");
+ BPOffset = MFI.getObjectOffset(BPIndex);
}
int PBPOffset = 0;
@@ -1382,10 +1203,12 @@ void PPCFrameLowering::inlineStackProbe(MachineFunction &MF,
if (StackAllocMIPos == PrologMBB.end())
return;
const BasicBlock *ProbedBB = PrologMBB.getBasicBlock();
+ MachineBasicBlock *CurrentMBB = &PrologMBB;
DebugLoc DL = PrologMBB.findDebugLoc(StackAllocMIPos);
MachineInstr &MI = *StackAllocMIPos;
int64_t NegFrameSize = MI.getOperand(2).getImm();
- int64_t NegProbeSize = -(int64_t)TLI.getStackProbeSize(MF);
+ unsigned ProbeSize = TLI.getStackProbeSize(MF);
+ int64_t NegProbeSize = -(int64_t)ProbeSize;
assert(isInt<32>(NegProbeSize) && "Unhandled probe size");
int64_t NumBlocks = NegFrameSize / NegProbeSize;
int64_t NegResidualSize = NegFrameSize % NegProbeSize;
@@ -1394,10 +1217,9 @@ void PPCFrameLowering::inlineStackProbe(MachineFunction &MF,
Register FPReg = MI.getOperand(1).getReg();
const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
bool HasBP = RegInfo->hasBasePointer(MF);
+ Register BPReg = RegInfo->getBaseRegister(MF);
Align MaxAlign = MFI.getMaxAlign();
- // Initialize current frame pointer.
const MCInstrDesc &CopyInst = TII.get(isPPC64 ? PPC::OR8 : PPC::OR);
- BuildMI(PrologMBB, {MI}, DL, CopyInst, FPReg).addReg(SPReg).addReg(SPReg);
// Subroutines to generate .cfi_* directives.
auto buildDefCFAReg = [&](MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, Register Reg) {
@@ -1437,89 +1259,218 @@ void PPCFrameLowering::inlineStackProbe(MachineFunction &MF,
// Subroutine to store frame pointer and decrease stack pointer by probe size.
auto allocateAndProbe = [&](MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, int64_t NegSize,
- Register NegSizeReg, bool UseDForm) {
+ Register NegSizeReg, bool UseDForm,
+ Register StoreReg) {
if (UseDForm)
BuildMI(MBB, MBBI, DL, TII.get(isPPC64 ? PPC::STDU : PPC::STWU), SPReg)
- .addReg(FPReg)
+ .addReg(StoreReg)
.addImm(NegSize)
.addReg(SPReg);
else
BuildMI(MBB, MBBI, DL, TII.get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg)
- .addReg(FPReg)
+ .addReg(StoreReg)
.addReg(SPReg)
.addReg(NegSizeReg);
};
- // Use FPReg to calculate CFA.
- if (needsCFI)
- buildDefCFA(PrologMBB, {MI}, FPReg, 0);
- // For case HasBP && MaxAlign > 1, we have to align the SP by performing
+  // Used to probe the realignment gap [stackptr - (stackptr % align), stackptr)
+  // when HasBP && isPPC64. In that scenario we normally have r0, r1, r12 and
+  // r30 available, and r1 has already been copied to r30, which is BPReg. So
+  // BPReg holds the value of stackptr.
+  // First we probe the tail interval, whose size is less than probesize, i.e.
+  // [stackptr - (stackptr % align) % probesize, stackptr). At this stage,
+  // ScratchReg holds the value of ((stackptr % align) % probesize). Then we
+  // probe one block of size probesize at a time until stackptr reaches
+  // (stackptr - (stackptr % align)). At that stage, ScratchReg is materialized
+  // as negprobesize. In both stages, TempReg holds the value of
+  // (stackptr - (stackptr % align)).
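+  // For illustration only (values assumed): with MaxAlign = 0x10000 and
+  // ProbeSize = 0x1000, a stackptr whose low 16 bits are 0x2345 gives
+  // (stackptr % align) = 0x2345, so the tail interval is
+  // 0x2345 % 0x1000 = 0x345 bytes; after probing it, the loop probes two
+  // full 0x1000 blocks until stackptr reaches TempReg.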
+ auto dynamicProbe = [&](MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, Register ScratchReg,
+ Register TempReg) {
+ assert(HasBP && isPPC64 && "Probe alignment part not available");
+ assert(isPowerOf2_64(ProbeSize) && "Probe size should be power of 2");
+ // ScratchReg = stackptr % align
+ BuildMI(MBB, MBBI, DL, TII.get(PPC::RLDICL), ScratchReg)
+ .addReg(BPReg)
+ .addImm(0)
+ .addImm(64 - Log2(MaxAlign));
+ // TempReg = stackptr - (stackptr % align)
+ BuildMI(MBB, MBBI, DL, TII.get(PPC::SUBFC8), TempReg)
+ .addReg(ScratchReg)
+ .addReg(BPReg);
+ // ScratchReg = (stackptr % align) % probesize
+ BuildMI(MBB, MBBI, DL, TII.get(PPC::RLDICL), ScratchReg)
+ .addReg(ScratchReg)
+ .addImm(0)
+ .addImm(64 - Log2(ProbeSize));
+ Register CRReg = PPC::CR0;
+ // If (stackptr % align) % probesize == 0, we should not generate probe
+    // code. The generated assembly looks roughly like:
+ // bb.0:
+ // ...
+ // cmpldi $scratchreg, 0
+ // beq bb.2
+ // bb.1: # Probe tail interval
+ // neg $scratchreg, $scratchreg
+ // stdux $bpreg, r1, $scratchreg
+ // bb.2:
+ // <materialize negprobesize into $scratchreg>
+ // cmpd r1, $tempreg
+ // beq bb.4
+ // bb.3: # Loop to probe each block
+ // stdux $bpreg, r1, $scratchreg
+ // cmpd r1, $tempreg
+ // bne bb.3
+ // bb.4:
+ // ...
+ MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+ MachineBasicBlock *ProbeResidualMBB = MF.CreateMachineBasicBlock(ProbedBB);
+ MF.insert(MBBInsertPoint, ProbeResidualMBB);
+ MachineBasicBlock *ProbeLoopPreHeaderMBB =
+ MF.CreateMachineBasicBlock(ProbedBB);
+ MF.insert(MBBInsertPoint, ProbeLoopPreHeaderMBB);
+ MachineBasicBlock *ProbeLoopBodyMBB = MF.CreateMachineBasicBlock(ProbedBB);
+ MF.insert(MBBInsertPoint, ProbeLoopBodyMBB);
+ MachineBasicBlock *ProbeExitMBB = MF.CreateMachineBasicBlock(ProbedBB);
+ MF.insert(MBBInsertPoint, ProbeExitMBB);
+ // bb.4
+ ProbeExitMBB->splice(ProbeExitMBB->end(), &MBB, MBBI, MBB.end());
+ ProbeExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+ // bb.0
+ BuildMI(&MBB, DL, TII.get(PPC::CMPDI), CRReg).addReg(ScratchReg).addImm(0);
+ BuildMI(&MBB, DL, TII.get(PPC::BCC))
+ .addImm(PPC::PRED_EQ)
+ .addReg(CRReg)
+ .addMBB(ProbeLoopPreHeaderMBB);
+ MBB.addSuccessor(ProbeResidualMBB);
+ MBB.addSuccessor(ProbeLoopPreHeaderMBB);
+ // bb.1
+ BuildMI(ProbeResidualMBB, DL, TII.get(PPC::NEG8), ScratchReg)
+ .addReg(ScratchReg);
+ allocateAndProbe(*ProbeResidualMBB, ProbeResidualMBB->end(), 0, ScratchReg,
+ false, BPReg);
+ ProbeResidualMBB->addSuccessor(ProbeLoopPreHeaderMBB);
+ // bb.2
+ MaterializeImm(*ProbeLoopPreHeaderMBB, ProbeLoopPreHeaderMBB->end(),
+ NegProbeSize, ScratchReg);
+ BuildMI(ProbeLoopPreHeaderMBB, DL, TII.get(PPC::CMPD), CRReg)
+ .addReg(SPReg)
+ .addReg(TempReg);
+ BuildMI(ProbeLoopPreHeaderMBB, DL, TII.get(PPC::BCC))
+ .addImm(PPC::PRED_EQ)
+ .addReg(CRReg)
+ .addMBB(ProbeExitMBB);
+ ProbeLoopPreHeaderMBB->addSuccessor(ProbeLoopBodyMBB);
+ ProbeLoopPreHeaderMBB->addSuccessor(ProbeExitMBB);
+ // bb.3
+ allocateAndProbe(*ProbeLoopBodyMBB, ProbeLoopBodyMBB->end(), 0, ScratchReg,
+ false, BPReg);
+ BuildMI(ProbeLoopBodyMBB, DL, TII.get(PPC::CMPD), CRReg)
+ .addReg(SPReg)
+ .addReg(TempReg);
+ BuildMI(ProbeLoopBodyMBB, DL, TII.get(PPC::BCC))
+ .addImm(PPC::PRED_NE)
+ .addReg(CRReg)
+ .addMBB(ProbeLoopBodyMBB);
+ ProbeLoopBodyMBB->addSuccessor(ProbeExitMBB);
+ ProbeLoopBodyMBB->addSuccessor(ProbeLoopBodyMBB);
+ // Update liveins.
+ recomputeLiveIns(*ProbeResidualMBB);
+ recomputeLiveIns(*ProbeLoopPreHeaderMBB);
+ recomputeLiveIns(*ProbeLoopBodyMBB);
+ recomputeLiveIns(*ProbeExitMBB);
+ return ProbeExitMBB;
+ };
+ // For case HasBP && MaxAlign > 1, we have to realign the SP by performing
// SP = SP - SP % MaxAlign.
if (HasBP && MaxAlign > 1) {
- if (isPPC64)
- BuildMI(PrologMBB, {MI}, DL, TII.get(PPC::RLDICL), ScratchReg)
- .addReg(FPReg)
- .addImm(0)
- .addImm(64 - Log2(MaxAlign));
- else
- BuildMI(PrologMBB, {MI}, DL, TII.get(PPC::RLWINM), ScratchReg)
+    // FIXME: We currently only probe the gap [stackptr & alignmask, stackptr)
+    // in 64-bit mode.
+ if (isPPC64) {
+ // Use BPReg to calculate CFA.
+ if (needsCFI)
+ buildDefCFA(*CurrentMBB, {MI}, BPReg, 0);
+ // Since we have SPReg copied to BPReg at the moment, FPReg can be used as
+ // TempReg.
+ Register TempReg = FPReg;
+ CurrentMBB = dynamicProbe(*CurrentMBB, {MI}, ScratchReg, TempReg);
+ // Copy BPReg to FPReg to meet the definition of PROBED_STACKALLOC_64.
+ BuildMI(*CurrentMBB, {MI}, DL, CopyInst, FPReg)
+ .addReg(BPReg)
+ .addReg(BPReg);
+ } else {
+ // Initialize current frame pointer.
+ BuildMI(*CurrentMBB, {MI}, DL, CopyInst, FPReg)
+ .addReg(SPReg)
+ .addReg(SPReg);
+ // Use FPReg to calculate CFA.
+ if (needsCFI)
+ buildDefCFA(*CurrentMBB, {MI}, FPReg, 0);
+ BuildMI(*CurrentMBB, {MI}, DL, TII.get(PPC::RLWINM), ScratchReg)
.addReg(FPReg)
.addImm(0)
.addImm(32 - Log2(MaxAlign))
.addImm(31);
- BuildMI(PrologMBB, {MI}, DL, TII.get(isPPC64 ? PPC::SUBFC8 : PPC::SUBFC),
- SPReg)
- .addReg(ScratchReg)
- .addReg(SPReg);
+ BuildMI(*CurrentMBB, {MI}, DL, TII.get(PPC::SUBFC), SPReg)
+ .addReg(ScratchReg)
+ .addReg(SPReg);
+ }
+ } else {
+ // Initialize current frame pointer.
+ BuildMI(*CurrentMBB, {MI}, DL, CopyInst, FPReg).addReg(SPReg).addReg(SPReg);
+ // Use FPReg to calculate CFA.
+ if (needsCFI)
+ buildDefCFA(*CurrentMBB, {MI}, FPReg, 0);
}
// Probe residual part.
if (NegResidualSize) {
bool ResidualUseDForm = CanUseDForm(NegResidualSize);
if (!ResidualUseDForm)
- MaterializeImm(PrologMBB, {MI}, NegResidualSize, ScratchReg);
- allocateAndProbe(PrologMBB, {MI}, NegResidualSize, ScratchReg,
- ResidualUseDForm);
+ MaterializeImm(*CurrentMBB, {MI}, NegResidualSize, ScratchReg);
+ allocateAndProbe(*CurrentMBB, {MI}, NegResidualSize, ScratchReg,
+ ResidualUseDForm, FPReg);
}
bool UseDForm = CanUseDForm(NegProbeSize);
// If number of blocks is small, just probe them directly.
if (NumBlocks < 3) {
if (!UseDForm)
- MaterializeImm(PrologMBB, {MI}, NegProbeSize, ScratchReg);
+ MaterializeImm(*CurrentMBB, {MI}, NegProbeSize, ScratchReg);
for (int i = 0; i < NumBlocks; ++i)
- allocateAndProbe(PrologMBB, {MI}, NegProbeSize, ScratchReg, UseDForm);
+ allocateAndProbe(*CurrentMBB, {MI}, NegProbeSize, ScratchReg, UseDForm,
+ FPReg);
if (needsCFI) {
// Restore using SPReg to calculate CFA.
- buildDefCFAReg(PrologMBB, {MI}, SPReg);
+ buildDefCFAReg(*CurrentMBB, {MI}, SPReg);
}
} else {
// Since CTR is a volatile register and current shrinkwrap implementation
// won't choose an MBB in a loop as the PrologMBB, it's safe to synthesize a
// CTR loop to probe.
// Calculate trip count and stores it in CTRReg.
- MaterializeImm(PrologMBB, {MI}, NumBlocks, ScratchReg);
- BuildMI(PrologMBB, {MI}, DL, TII.get(isPPC64 ? PPC::MTCTR8 : PPC::MTCTR))
+ MaterializeImm(*CurrentMBB, {MI}, NumBlocks, ScratchReg);
+ BuildMI(*CurrentMBB, {MI}, DL, TII.get(isPPC64 ? PPC::MTCTR8 : PPC::MTCTR))
.addReg(ScratchReg, RegState::Kill);
if (!UseDForm)
- MaterializeImm(PrologMBB, {MI}, NegProbeSize, ScratchReg);
+ MaterializeImm(*CurrentMBB, {MI}, NegProbeSize, ScratchReg);
// Create MBBs of the loop.
MachineFunction::iterator MBBInsertPoint =
- std::next(PrologMBB.getIterator());
+ std::next(CurrentMBB->getIterator());
MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(ProbedBB);
MF.insert(MBBInsertPoint, LoopMBB);
MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(ProbedBB);
MF.insert(MBBInsertPoint, ExitMBB);
// Synthesize the loop body.
allocateAndProbe(*LoopMBB, LoopMBB->end(), NegProbeSize, ScratchReg,
- UseDForm);
+ UseDForm, FPReg);
BuildMI(LoopMBB, DL, TII.get(isPPC64 ? PPC::BDNZ8 : PPC::BDNZ))
.addMBB(LoopMBB);
LoopMBB->addSuccessor(ExitMBB);
LoopMBB->addSuccessor(LoopMBB);
// Synthesize the exit MBB.
- ExitMBB->splice(ExitMBB->end(), &PrologMBB,
+ ExitMBB->splice(ExitMBB->end(), CurrentMBB,
std::next(MachineBasicBlock::iterator(MI)),
- PrologMBB.end());
- ExitMBB->transferSuccessorsAndUpdatePHIs(&PrologMBB);
- PrologMBB.addSuccessor(LoopMBB);
+ CurrentMBB->end());
+ ExitMBB->transferSuccessorsAndUpdatePHIs(CurrentMBB);
+ CurrentMBB->addSuccessor(LoopMBB);
if (needsCFI) {
// Restore using SPReg to calculate CFA.
buildDefCFAReg(*ExitMBB, ExitMBB->begin(), SPReg);
@@ -1551,8 +1502,6 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
// Get processor type.
bool isPPC64 = Subtarget.isPPC64();
- // Get the ABI.
- bool isSVR4ABI = Subtarget.isSVR4ABI();
// Check if the link register (LR) has been saved.
PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
@@ -1600,24 +1549,16 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
SingleScratchReg = ScratchReg == TempReg;
if (HasFP) {
- if (isSVR4ABI) {
- int FPIndex = FI->getFramePointerSaveIndex();
- assert(FPIndex && "No Frame Pointer Save Slot!");
- FPOffset = MFI.getObjectOffset(FPIndex);
- } else {
- FPOffset = getFramePointerSaveOffset();
- }
+ int FPIndex = FI->getFramePointerSaveIndex();
+ assert(FPIndex && "No Frame Pointer Save Slot!");
+ FPOffset = MFI.getObjectOffset(FPIndex);
}
int BPOffset = 0;
if (HasBP) {
- if (isSVR4ABI) {
int BPIndex = FI->getBasePointerSaveIndex();
assert(BPIndex && "No Base Pointer Save Slot!");
BPOffset = MFI.getObjectOffset(BPIndex);
- } else {
- BPOffset = getBasePointerSaveOffset();
- }
}
int PBPOffset = 0;
@@ -1703,11 +1644,18 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
// offset by the STDU/STDUX/STWU/STWUX instruction. For targets with red
// zone add this offset back now.
+ // If the function has a base pointer, the stack pointer has been copied
+ // to it so we can restore it by copying in the other direction.
+ if (HasRedZone && HasBP) {
+ BuildMI(MBB, MBBI, dl, OrInst, RBReg).
+ addReg(BPReg).
+ addReg(BPReg);
+ }
// If this function contained a fastcc call and GuaranteedTailCallOpt is
// enabled (=> hasFastCall()==true) the fastcc call might contain a tail
// call which invalidates the stack pointer value in SP(0). So we use the
- // value of R31 in this case.
- if (FI->hasFastCall()) {
+  // value of R31 in this case. A similar situation exists with setjmp.
+ else if (FI->hasFastCall() || MF.exposesReturnsTwice()) {
assert(HasFP && "Expecting a valid frame pointer.");
if (!HasRedZone)
RBReg = FPReg;
@@ -2053,7 +2001,6 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
bool HasGPSaveArea = false;
bool HasG8SaveArea = false;
bool HasFPSaveArea = false;
- bool HasVRSAVESaveArea = false;
bool HasVRSaveArea = false;
SmallVector<CalleeSavedInfo, 18> GPRegs;
@@ -2093,8 +2040,6 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
} else if (PPC::CRBITRCRegClass.contains(Reg) ||
PPC::CRRCRegClass.contains(Reg)) {
; // do nothing, as we already know whether CRs are spilled
- } else if (PPC::VRSAVERCRegClass.contains(Reg)) {
- HasVRSAVESaveArea = true;
} else if (PPC::VRRCRegClass.contains(Reg) ||
PPC::SPERCRegClass.contains(Reg)) {
// Altivec and SPE are mutually exclusive, but have the same stack
@@ -2217,23 +2162,6 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
LowerBound -= 4; // The CR save area is always 4 bytes long.
}
- if (HasVRSAVESaveArea) {
- // FIXME SVR4: Is it actually possible to have multiple elements in CSI
- // which have the VRSAVE register class?
- // Adjust the frame index of the VRSAVE spill slot.
- for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
- unsigned Reg = CSI[i].getReg();
-
- if (PPC::VRSAVERCRegClass.contains(Reg)) {
- int FI = CSI[i].getFrameIdx();
-
- MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
- }
- }
-
- LowerBound -= 4; // The VRSAVE save area is always 4 bytes long.
- }
-
// Both Altivec and SPE have the same alignment and padding requirements
// within the stack frame.
if (HasVRSaveArea) {
@@ -2273,8 +2201,8 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF,
// needed alignment padding.
unsigned StackSize = determineFrameLayout(MF, true);
MachineFrameInfo &MFI = MF.getFrameInfo();
- if (MFI.hasVarSizedObjects() || spillsCR(MF) || spillsVRSAVE(MF) ||
- hasNonRISpills(MF) || (hasSpills(MF) && !isInt<16>(StackSize))) {
+ if (MFI.hasVarSizedObjects() || spillsCR(MF) || hasNonRISpills(MF) ||
+ (hasSpills(MF) && !isInt<16>(StackSize))) {
const TargetRegisterClass &GPRC = PPC::GPRCRegClass;
const TargetRegisterClass &G8RC = PPC::G8RCRegClass;
const TargetRegisterClass &RC = Subtarget.isPPC64() ? G8RC : GPRC;
@@ -2288,7 +2216,7 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF,
MFI.hasVarSizedObjects() && MFI.getMaxAlign() > getStackAlign();
// These kinds of spills might need two registers.
- if (spillsCR(MF) || spillsVRSAVE(MF) || HasAlVars)
+ if (spillsCR(MF) || HasAlVars)
RS->addScavengingFrameIndex(
MFI.CreateStackObject(Size, Alignment, false));
}
@@ -2365,9 +2293,6 @@ bool PPCFrameLowering::spillCalleeSavedRegisters(
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned Reg = CSI[i].getReg();
- // VRSAVE can appear here if, for example, @llvm.eh.unwind.init() is used.
- if (Reg == PPC::VRSAVE)
- continue;
// CR2 through CR4 are the nonvolatile CR fields.
bool IsCRField = PPC::CR2 <= Reg && Reg <= PPC::CR4;
@@ -2532,10 +2457,6 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters(
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned Reg = CSI[i].getReg();
- // VRSAVE can appear here if, for example, @llvm.eh.unwind.init() is used.
- if (Reg == PPC::VRSAVE)
- continue;
-
if ((Reg == PPC::X2 || Reg == PPC::R2) && MustSaveTOC)
continue;
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 8ffd89ef5ccd..693b0adaede4 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -43,6 +43,7 @@
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
@@ -138,7 +139,6 @@ namespace {
///
class PPCDAGToDAGISel : public SelectionDAGISel {
const PPCTargetMachine &TM;
- const PPCSubtarget *PPCSubTarget = nullptr;
const PPCSubtarget *Subtarget = nullptr;
const PPCTargetLowering *PPCLowering = nullptr;
unsigned GlobalBaseReg = 0;
@@ -150,14 +150,10 @@ namespace {
bool runOnMachineFunction(MachineFunction &MF) override {
// Make sure we re-emit a set of the global base reg if necessary
GlobalBaseReg = 0;
- PPCSubTarget = &MF.getSubtarget<PPCSubtarget>();
Subtarget = &MF.getSubtarget<PPCSubtarget>();
PPCLowering = Subtarget->getTargetLowering();
SelectionDAGISel::runOnMachineFunction(MF);
- if (!Subtarget->isSVR4ABI())
- InsertVRSaveCode(MF);
-
return true;
}
@@ -218,7 +214,7 @@ namespace {
/// SelectCC - Select a comparison of the specified values with the
/// specified condition code, returning the CR# of the expression.
SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- const SDLoc &dl);
+ const SDLoc &dl, SDValue Chain = SDValue());
/// SelectAddrImmOffs - Return true if the operand is valid for a preinc
/// immediate field. Note that the operand at this point is already the
@@ -295,6 +291,13 @@ namespace {
Align(16));
}
+ /// SelectAddrImmX34 - Returns true if the address N can be represented by
+ /// a base register plus a signed 34-bit displacement. Suitable for use by
+ /// PSTXVP and friends.
+ bool SelectAddrImmX34(SDValue N, SDValue &Disp, SDValue &Base) {
+ return PPCLowering->SelectAddressRegImm34(N, Disp, Base, *CurDAG);
+ }
+
// Select an address into a single register.
bool SelectAddr(SDValue N, SDValue &Base) {
Base = N;
@@ -340,8 +343,6 @@ namespace {
return true;
}
- void InsertVRSaveCode(MachineFunction &MF);
-
StringRef getPassName() const override {
return "PowerPC DAG->DAG Pattern Instruction Selection";
}
@@ -351,6 +352,7 @@ namespace {
private:
bool trySETCC(SDNode *N);
+ bool tryFoldSWTestBRCC(SDNode *N);
bool tryAsSingleRLDICL(SDNode *N);
bool tryAsSingleRLDICR(SDNode *N);
bool tryAsSingleRLWINM(SDNode *N);
@@ -375,70 +377,6 @@ private:
} // end anonymous namespace
-/// InsertVRSaveCode - Once the entire function has been instruction selected,
-/// all virtual registers are created and all machine instructions are built,
-/// check to see if we need to save/restore VRSAVE. If so, do it.
-void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) {
- // Check to see if this function uses vector registers, which means we have to
- // save and restore the VRSAVE register and update it with the regs we use.
- //
- // In this case, there will be virtual registers of vector type created
- // by the scheduler. Detect them now.
- bool HasVectorVReg = false;
- for (unsigned i = 0, e = RegInfo->getNumVirtRegs(); i != e; ++i) {
- unsigned Reg = Register::index2VirtReg(i);
- if (RegInfo->getRegClass(Reg) == &PPC::VRRCRegClass) {
- HasVectorVReg = true;
- break;
- }
- }
- if (!HasVectorVReg) return; // nothing to do.
-
- // If we have a vector register, we want to emit code into the entry and exit
- // blocks to save and restore the VRSAVE register. We do this here (instead
- // of marking all vector instructions as clobbering VRSAVE) for two reasons:
- //
- // 1. This (trivially) reduces the load on the register allocator, by not
- // having to represent the live range of the VRSAVE register.
- // 2. This (more significantly) allows us to create a temporary virtual
- // register to hold the saved VRSAVE value, allowing this temporary to be
- // register allocated, instead of forcing it to be spilled to the stack.
-
- // Create two vregs - one to hold the VRSAVE register that is live-in to the
- // function and one for the value after having bits or'd into it.
- Register InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
- Register UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
-
- const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
- MachineBasicBlock &EntryBB = *Fn.begin();
- DebugLoc dl;
- // Emit the following code into the entry block:
- // InVRSAVE = MFVRSAVE
- // UpdatedVRSAVE = UPDATE_VRSAVE InVRSAVE
- // MTVRSAVE UpdatedVRSAVE
- MachineBasicBlock::iterator IP = EntryBB.begin(); // Insert Point
- BuildMI(EntryBB, IP, dl, TII.get(PPC::MFVRSAVE), InVRSAVE);
- BuildMI(EntryBB, IP, dl, TII.get(PPC::UPDATE_VRSAVE),
- UpdatedVRSAVE).addReg(InVRSAVE);
- BuildMI(EntryBB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(UpdatedVRSAVE);
-
- // Find all return blocks, outputting a restore in each epilog.
- for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
- if (BB->isReturnBlock()) {
- IP = BB->end(); --IP;
-
- // Skip over all terminator instructions, which are part of the return
- // sequence.
- MachineBasicBlock::iterator I2 = IP;
- while (I2 != BB->begin() && (--I2)->isTerminator())
- IP = I2;
-
- // Emit: MTVRSAVE InVRSave
- BuildMI(*BB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(InVRSAVE);
- }
- }
-}
-
/// getGlobalBaseReg - Output the instructions required to put the
/// base address to use for accessing globals into a register.
///
@@ -648,6 +586,8 @@ bool PPCDAGToDAGISel::tryTLSXFormStore(StoreSDNode *ST) {
SDValue Offset = ST->getOffset();
if (!Offset.isUndef())
return false;
+ if (Base.getOperand(1).getOpcode() == PPCISD::TLS_LOCAL_EXEC_MAT_ADDR)
+ return false;
SDLoc dl(ST);
EVT MemVT = ST->getMemoryVT();
@@ -691,6 +631,8 @@ bool PPCDAGToDAGISel::tryTLSXFormLoad(LoadSDNode *LD) {
SDValue Offset = LD->getOffset();
if (!Offset.isUndef())
return false;
+ if (Base.getOperand(1).getOpcode() == PPCISD::TLS_LOCAL_EXEC_MAT_ADDR)
+ return false;
SDLoc dl(LD);
EVT MemVT = LD->getMemoryVT();
@@ -800,251 +742,6 @@ bool PPCDAGToDAGISel::tryBitfieldInsert(SDNode *N) {
return false;
}
-// Predict the number of instructions that would be generated by calling
-// selectI64Imm(N).
-static unsigned selectI64ImmInstrCountDirect(int64_t Imm) {
- // Assume no remaining bits.
- unsigned Remainder = 0;
- // Assume no shift required.
- unsigned Shift = 0;
-
- // If it can't be represented as a 32 bit value.
- if (!isInt<32>(Imm)) {
- Shift = countTrailingZeros<uint64_t>(Imm);
- int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
-
- // If the shifted value fits 32 bits.
- if (isInt<32>(ImmSh)) {
- // Go with the shifted value.
- Imm = ImmSh;
- } else {
- // Still stuck with a 64 bit value.
- Remainder = Imm;
- Shift = 32;
- Imm >>= 32;
- }
- }
-
- // Intermediate operand.
- unsigned Result = 0;
-
- // Handle first 32 bits.
- unsigned Lo = Imm & 0xFFFF;
-
- // Simple value.
- if (isInt<16>(Imm)) {
- // Just the Lo bits.
- ++Result;
- } else if (Lo) {
- // Handle the Hi bits and Lo bits.
- Result += 2;
- } else {
- // Just the Hi bits.
- ++Result;
- }
-
- // If no shift, we're done.
- if (!Shift) return Result;
-
- // If Hi word == Lo word,
- // we can use rldimi to insert the Lo word into Hi word.
- if ((unsigned)(Imm & 0xFFFFFFFF) == Remainder) {
- ++Result;
- return Result;
- }
-
- // Shift for next step if the upper 32-bits were not zero.
- if (Imm)
- ++Result;
-
- // Add in the last bits as required.
- if ((Remainder >> 16) & 0xFFFF)
- ++Result;
- if (Remainder & 0xFFFF)
- ++Result;
-
- return Result;
-}
-
-static uint64_t Rot64(uint64_t Imm, unsigned R) {
- return (Imm << R) | (Imm >> (64 - R));
-}
-
-static unsigned selectI64ImmInstrCount(int64_t Imm) {
- unsigned Count = selectI64ImmInstrCountDirect(Imm);
-
- // If the instruction count is 1 or 2, we do not need further analysis
- // since rotate + load constant requires at least 2 instructions.
- if (Count <= 2)
- return Count;
-
- for (unsigned r = 1; r < 63; ++r) {
- uint64_t RImm = Rot64(Imm, r);
- unsigned RCount = selectI64ImmInstrCountDirect(RImm) + 1;
- Count = std::min(Count, RCount);
-
- // See comments in selectI64Imm for an explanation of the logic below.
- unsigned LS = findLastSet(RImm);
- if (LS != r-1)
- continue;
-
- uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1));
- uint64_t RImmWithOnes = RImm | OnesMask;
-
- RCount = selectI64ImmInstrCountDirect(RImmWithOnes) + 1;
- Count = std::min(Count, RCount);
- }
-
- return Count;
-}
-
-// Select a 64-bit constant. For cost-modeling purposes, selectI64ImmInstrCount
-// (above) needs to be kept in sync with this function.
-static SDNode *selectI64ImmDirect(SelectionDAG *CurDAG, const SDLoc &dl,
- int64_t Imm) {
- // Assume no remaining bits.
- unsigned Remainder = 0;
- // Assume no shift required.
- unsigned Shift = 0;
-
- // If it can't be represented as a 32 bit value.
- if (!isInt<32>(Imm)) {
- Shift = countTrailingZeros<uint64_t>(Imm);
- int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
-
- // If the shifted value fits 32 bits.
- if (isInt<32>(ImmSh)) {
- // Go with the shifted value.
- Imm = ImmSh;
- } else {
- // Still stuck with a 64 bit value.
- Remainder = Imm;
- Shift = 32;
- Imm >>= 32;
- }
- }
-
- // Intermediate operand.
- SDNode *Result;
-
- // Handle first 32 bits.
- unsigned Lo = Imm & 0xFFFF;
- unsigned Hi = (Imm >> 16) & 0xFFFF;
-
- auto getI32Imm = [CurDAG, dl](unsigned Imm) {
- return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
- };
-
- // Simple value.
- if (isInt<16>(Imm)) {
- uint64_t SextImm = SignExtend64(Lo, 16);
- SDValue SDImm = CurDAG->getTargetConstant(SextImm, dl, MVT::i64);
- // Just the Lo bits.
- Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, SDImm);
- } else if (Lo) {
- // Handle the Hi bits.
- unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8;
- Result = CurDAG->getMachineNode(OpC, dl, MVT::i64, getI32Imm(Hi));
- // And Lo bits.
- Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
- SDValue(Result, 0), getI32Imm(Lo));
- } else {
- // Just the Hi bits.
- Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi));
- }
-
- // If no shift, we're done.
- if (!Shift) return Result;
-
- // If Hi word == Lo word,
- // we can use rldimi to insert the Lo word into Hi word.
- if ((unsigned)(Imm & 0xFFFFFFFF) == Remainder) {
- SDValue Ops[] =
- { SDValue(Result, 0), SDValue(Result, 0), getI32Imm(Shift), getI32Imm(0)};
- return CurDAG->getMachineNode(PPC::RLDIMI, dl, MVT::i64, Ops);
- }
-
- // Shift for next step if the upper 32-bits were not zero.
- if (Imm) {
- Result = CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64,
- SDValue(Result, 0),
- getI32Imm(Shift),
- getI32Imm(63 - Shift));
- }
-
- // Add in the last bits as required.
- if ((Hi = (Remainder >> 16) & 0xFFFF)) {
- Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64,
- SDValue(Result, 0), getI32Imm(Hi));
- }
- if ((Lo = Remainder & 0xFFFF)) {
- Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
- SDValue(Result, 0), getI32Imm(Lo));
- }
-
- return Result;
-}
-
-static SDNode *selectI64Imm(SelectionDAG *CurDAG, const SDLoc &dl,
- int64_t Imm) {
- unsigned Count = selectI64ImmInstrCountDirect(Imm);
-
- // If the instruction count is 1 or 2, we do not need further analysis
- // since rotate + load constant requires at least 2 instructions.
- if (Count <= 2)
- return selectI64ImmDirect(CurDAG, dl, Imm);
-
- unsigned RMin = 0;
-
- int64_t MatImm;
- unsigned MaskEnd;
-
- for (unsigned r = 1; r < 63; ++r) {
- uint64_t RImm = Rot64(Imm, r);
- unsigned RCount = selectI64ImmInstrCountDirect(RImm) + 1;
- if (RCount < Count) {
- Count = RCount;
- RMin = r;
- MatImm = RImm;
- MaskEnd = 63;
- }
-
- // If the immediate to generate has many trailing zeros, it might be
- // worthwhile to generate a rotated value with too many leading ones
- // (because that's free with li/lis's sign-extension semantics), and then
- // mask them off after rotation.
-
- unsigned LS = findLastSet(RImm);
- // We're adding (63-LS) higher-order ones, and we expect to mask them off
- // after performing the inverse rotation by (64-r). So we need that:
- // 63-LS == 64-r => LS == r-1
- if (LS != r-1)
- continue;
-
- uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1));
- uint64_t RImmWithOnes = RImm | OnesMask;
-
- RCount = selectI64ImmInstrCountDirect(RImmWithOnes) + 1;
- if (RCount < Count) {
- Count = RCount;
- RMin = r;
- MatImm = RImmWithOnes;
- MaskEnd = LS;
- }
- }
-
- if (!RMin)
- return selectI64ImmDirect(CurDAG, dl, Imm);
-
- auto getI32Imm = [CurDAG, dl](unsigned Imm) {
- return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
- };
-
- SDValue Val = SDValue(selectI64ImmDirect(CurDAG, dl, MatImm), 0);
- return CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Val,
- getI32Imm(64 - RMin), getI32Imm(MaskEnd));
-}
-
static unsigned allUsesTruncate(SelectionDAG *CurDAG, SDNode *N) {
unsigned MaxTruncation = 0;
// Cannot use range-based for loop here as we need the actual use (i.e. we
@@ -1101,6 +798,274 @@ static unsigned allUsesTruncate(SelectionDAG *CurDAG, SDNode *N) {
return MaxTruncation;
}
+// For any 32 < Num < 64, check whether Imm contains at least Num consecutive
+// zeros straddling the 32-bit boundary and, if so, return the bit position
+// just above that run of zeros (used as a rotate amount); otherwise return 0.
+static int findContiguousZerosAtLeast(uint64_t Imm, unsigned Num) {
+ unsigned HiTZ = countTrailingZeros<uint32_t>(Hi_32(Imm));
+ unsigned LoLZ = countLeadingZeros<uint32_t>(Lo_32(Imm));
+ if ((HiTZ + LoLZ) >= Num)
+ return (32 + HiTZ);
+ return 0;
+}
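+// For illustration only (value assumed): Imm = 0xABCD000000000123 has
+// HiTZ = 16 and LoLZ = 23, a run of 39 zeros across the 32-bit boundary, so
+// findContiguousZerosAtLeast(Imm, 33) returns 32 + 16 = 48.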
+
+// Direct materialization of 64-bit constants by enumerated patterns.
+static SDNode *selectI64ImmDirect(SelectionDAG *CurDAG, const SDLoc &dl,
+ uint64_t Imm, unsigned &InstCnt) {
+ unsigned TZ = countTrailingZeros<uint64_t>(Imm);
+ unsigned LZ = countLeadingZeros<uint64_t>(Imm);
+ unsigned TO = countTrailingOnes<uint64_t>(Imm);
+ unsigned LO = countLeadingOnes<uint64_t>(Imm);
+ unsigned Hi32 = Hi_32(Imm);
+ unsigned Lo32 = Lo_32(Imm);
+ SDNode *Result = nullptr;
+ unsigned Shift = 0;
+
+ auto getI32Imm = [CurDAG, dl](unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
+ };
+
+  // The following patterns use 1 instruction to materialize the Imm.
+ InstCnt = 1;
+  // 1-1) Patterns : {zeros}{15-bit value}
+  //                 {ones}{15-bit value}
+ if (isInt<16>(Imm)) {
+ SDValue SDImm = CurDAG->getTargetConstant(Imm, dl, MVT::i64);
+ return CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, SDImm);
+ }
+  // 1-2) Patterns : {zeros}{15-bit value}{16 zeros}
+  //                 {ones}{15-bit value}{16 zeros}
+ if (TZ > 15 && (LZ > 32 || LO > 32))
+ return CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64,
+ getI32Imm((Imm >> 16) & 0xffff));
+
+  // The following patterns use 2 instructions to materialize the Imm.
+ InstCnt = 2;
+ assert(LZ < 64 && "Unexpected leading zeros here.");
+  // Count of ones following the leading zeros.
+ unsigned FO = countLeadingOnes<uint64_t>(Imm << LZ);
+ // 2-1) Patterns : {zeros}{31-bit value}
+ // {ones}{31-bit value}
+ if (isInt<32>(Imm)) {
+ uint64_t ImmHi16 = (Imm >> 16) & 0xffff;
+ unsigned Opcode = ImmHi16 ? PPC::LIS8 : PPC::LI8;
+ Result = CurDAG->getMachineNode(Opcode, dl, MVT::i64, getI32Imm(ImmHi16));
+ return CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, SDValue(Result, 0),
+ getI32Imm(Imm & 0xffff));
+ }
+ // 2-2) Patterns : {zeros}{ones}{15-bit value}{zeros}
+ // {zeros}{15-bit value}{zeros}
+ // {zeros}{ones}{15-bit value}
+ // {ones}{15-bit value}{zeros}
+ // We can take advantage of LI's sign-extension semantics to generate leading
+  // ones, and then use RLDIC to mask off the ones on both sides after rotation.
+ if ((LZ + FO + TZ) > 48) {
+ Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64,
+ getI32Imm((Imm >> TZ) & 0xffff));
+ return CurDAG->getMachineNode(PPC::RLDIC, dl, MVT::i64, SDValue(Result, 0),
+ getI32Imm(TZ), getI32Imm(LZ));
+ }
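+  // For illustration only (value assumed): Imm = 0x0000000555500000 has
+  // TZ = 20, LZ = 29 and FO = 1, so LZ + FO + TZ = 50 > 48; it is built as
+  // LI of 0x5555 followed by RLDIC with shift 20 and clear-left 29.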
+ // 2-3) Pattern : {zeros}{15-bit value}{ones}
+  // Shift right the Imm by (48 - LZ) bits to construct a negative 16-bit value,
+ // therefore we can take advantage of LI's sign-extension semantics, and then
+ // mask them off after rotation.
+ //
+ // +--LZ--||-15-bit-||--TO--+ +-------------|--16-bit--+
+ // |00000001bbbbbbbbb1111111| -> |00000000000001bbbbbbbbb1|
+ // +------------------------+ +------------------------+
+ // 63 0 63 0
+ // Imm (Imm >> (48 - LZ) & 0xffff)
+ // +----sext-----|--16-bit--+ +clear-|-----------------+
+ // |11111111111111bbbbbbbbb1| -> |00000001bbbbbbbbb1111111|
+ // +------------------------+ +------------------------+
+ // 63 0 63 0
+ // LI8: sext many leading zeros RLDICL: rotate left (48 - LZ), clear left LZ
+ if ((LZ + TO) > 48) {
+ // Since the immediates with (LZ > 32) have been handled by previous
+ // patterns, here we have (LZ <= 32) to make sure we will not shift right
+ // the Imm by a negative value.
+ assert(LZ <= 32 && "Unexpected shift value.");
+ Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64,
+ getI32Imm((Imm >> (48 - LZ) & 0xffff)));
+ return CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, SDValue(Result, 0),
+ getI32Imm(48 - LZ), getI32Imm(LZ));
+ }
+ // 2-4) Patterns : {zeros}{ones}{15-bit value}{ones}
+ // {ones}{15-bit value}{ones}
+ // We can take advantage of LI's sign-extension semantics to generate leading
+  // ones, and then use RLDICL to mask off the ones on the left side (if required)
+ // after rotation.
+ //
+ // +-LZ-FO||-15-bit-||--TO--+ +-------------|--16-bit--+
+ // |00011110bbbbbbbbb1111111| -> |000000000011110bbbbbbbbb|
+ // +------------------------+ +------------------------+
+ // 63 0 63 0
+ // Imm (Imm >> TO) & 0xffff
+ // +----sext-----|--16-bit--+ +LZ|---------------------+
+ // |111111111111110bbbbbbbbb| -> |00011110bbbbbbbbb1111111|
+ // +------------------------+ +------------------------+
+ // 63 0 63 0
+ // LI8: sext many leading zeros RLDICL: rotate left TO, clear left LZ
+ if ((LZ + FO + TO) > 48) {
+ Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64,
+ getI32Imm((Imm >> TO) & 0xffff));
+ return CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, SDValue(Result, 0),
+ getI32Imm(TO), getI32Imm(LZ));
+ }
+ // 2-5) Pattern : {32 zeros}{****}{0}{15-bit value}
+  // If Hi32 is zero and the Lo16 (in Lo32) can be represented as a positive
+  // 16-bit value, we can use LI for Lo16 without generating leading ones and
+  // then add the Hi16 (in Lo32).
+ if (LZ == 32 && ((Lo32 & 0x8000) == 0)) {
+ Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64,
+ getI32Imm(Lo32 & 0xffff));
+ return CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64, SDValue(Result, 0),
+ getI32Imm(Lo32 >> 16));
+ }
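+  // For illustration only (value assumed): Imm = 0x0000000080005678 has
+  // LZ == 32 and bit 15 of Lo32 clear, so it is built as LI of 0x5678
+  // followed by ORIS of 0x8000.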
+ // 2-6) Patterns : {******}{49 zeros}{******}
+ // {******}{49 ones}{******}
+ // If the Imm contains 49 consecutive zeros/ones, it means that a total of 15
+ // bits remain on both sides. Rotate right the Imm to construct an int<16>
+  // value, use LI for the int<16> value and then use RLDICL without a mask to
+  // rotate it back.
+ //
+ // 1) findContiguousZerosAtLeast(Imm, 49)
+ // +------|--zeros-|------+ +---ones--||---15 bit--+
+ // |bbbbbb0000000000aaaaaa| -> |0000000000aaaaaabbbbbb|
+ // +----------------------+ +----------------------+
+ // 63 0 63 0
+ //
+ // 2) findContiguousZerosAtLeast(~Imm, 49)
+ // +------|--ones--|------+ +---ones--||---15 bit--+
+ // |bbbbbb1111111111aaaaaa| -> |1111111111aaaaaabbbbbb|
+ // +----------------------+ +----------------------+
+ // 63 0 63 0
+ if ((Shift = findContiguousZerosAtLeast(Imm, 49)) ||
+ (Shift = findContiguousZerosAtLeast(~Imm, 49))) {
+ uint64_t RotImm = (Imm >> Shift) | (Imm << (64 - Shift));
+ Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64,
+ getI32Imm(RotImm & 0xffff));
+ return CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, SDValue(Result, 0),
+ getI32Imm(Shift), getI32Imm(0));
+ }
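+  // For illustration only (value assumed): Imm = 0x7E00000000000003 has a run
+  // of 55 zeros across the 32-bit boundary, so Shift = 57; the rotated value
+  // 0x1BF is loaded with LI and rotated back with RLDICL by 57.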
+
+  // The following patterns use 3 instructions to materialize the Imm.
+ InstCnt = 3;
+ // 3-1) Patterns : {zeros}{ones}{31-bit value}{zeros}
+ // {zeros}{31-bit value}{zeros}
+ // {zeros}{ones}{31-bit value}
+ // {ones}{31-bit value}{zeros}
+ // We can take advantage of LIS's sign-extension semantics to generate leading
+ // ones, add the remaining bits with ORI, and then use RLDIC to mask off the
+  // ones on both sides after rotation.
+ if ((LZ + FO + TZ) > 32) {
+ uint64_t ImmHi16 = (Imm >> (TZ + 16)) & 0xffff;
+ unsigned Opcode = ImmHi16 ? PPC::LIS8 : PPC::LI8;
+ Result = CurDAG->getMachineNode(Opcode, dl, MVT::i64, getI32Imm(ImmHi16));
+ Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, SDValue(Result, 0),
+ getI32Imm((Imm >> TZ) & 0xffff));
+ return CurDAG->getMachineNode(PPC::RLDIC, dl, MVT::i64, SDValue(Result, 0),
+ getI32Imm(TZ), getI32Imm(LZ));
+ }
+ // 3-2) Pattern : {zeros}{31-bit value}{ones}
+  // Shift right the Imm by (32 - LZ) bits to construct a negative 32-bit value,
+ // therefore we can take advantage of LIS's sign-extension semantics, add
+ // the remaining bits with ORI, and then mask them off after rotation.
+ // This is similar to Pattern 2-3, please refer to the diagram there.
+ if ((LZ + TO) > 32) {
+ // Since the immediates with (LZ > 32) have been handled by previous
+ // patterns, here we have (LZ <= 32) to make sure we will not shift right
+ // the Imm by a negative value.
+ assert(LZ <= 32 && "Unexpected shift value.");
+ Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64,
+ getI32Imm((Imm >> (48 - LZ)) & 0xffff));
+ Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, SDValue(Result, 0),
+ getI32Imm((Imm >> (32 - LZ)) & 0xffff));
+ return CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, SDValue(Result, 0),
+ getI32Imm(32 - LZ), getI32Imm(LZ));
+ }
+ // 3-3) Patterns : {zeros}{ones}{31-bit value}{ones}
+ // {ones}{31-bit value}{ones}
+ // We can take advantage of LIS's sign-extension semantics to generate leading
+ // ones, add the remaining bits with ORI, and then use RLDICL to mask off the
+  // ones on the left side (if required) after rotation.
+ // This is similar to Pattern 2-4, please refer to the diagram there.
+ if ((LZ + FO + TO) > 32) {
+ Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64,
+ getI32Imm((Imm >> (TO + 16)) & 0xffff));
+ Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, SDValue(Result, 0),
+ getI32Imm((Imm >> TO) & 0xffff));
+ return CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, SDValue(Result, 0),
+ getI32Imm(TO), getI32Imm(LZ));
+ }
+ // 3-4) Patterns : High word == Low word
+ if (Hi32 == Lo32) {
+ // Handle the first 32 bits.
+ uint64_t ImmHi16 = (Lo32 >> 16) & 0xffff;
+ unsigned Opcode = ImmHi16 ? PPC::LIS8 : PPC::LI8;
+ Result = CurDAG->getMachineNode(Opcode, dl, MVT::i64, getI32Imm(ImmHi16));
+ Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, SDValue(Result, 0),
+ getI32Imm(Lo32 & 0xffff));
+ // Use rldimi to insert the Low word into High word.
+ SDValue Ops[] = {SDValue(Result, 0), SDValue(Result, 0), getI32Imm(32),
+ getI32Imm(0)};
+ return CurDAG->getMachineNode(PPC::RLDIMI, dl, MVT::i64, Ops);
+ }
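+  // For illustration only (value assumed): Imm = 0x1234567812345678 is built
+  // as LIS of 0x1234, ORI of 0x5678, then RLDIMI to copy the low word into
+  // the high word.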
+ // 3-5) Patterns : {******}{33 zeros}{******}
+ // {******}{33 ones}{******}
+ // If the Imm contains 33 consecutive zeros/ones, it means that a total of 31
+ // bits remain on both sides. Rotate right the Imm to construct an int<32>
+  // value, use LIS + ORI for the int<32> value and then use RLDICL without a
+  // mask to rotate it back.
+ // This is similar to Pattern 2-6, please refer to the diagram there.
+ if ((Shift = findContiguousZerosAtLeast(Imm, 33)) ||
+ (Shift = findContiguousZerosAtLeast(~Imm, 33))) {
+ uint64_t RotImm = (Imm >> Shift) | (Imm << (64 - Shift));
+ uint64_t ImmHi16 = (RotImm >> 16) & 0xffff;
+ unsigned Opcode = ImmHi16 ? PPC::LIS8 : PPC::LI8;
+ Result = CurDAG->getMachineNode(Opcode, dl, MVT::i64, getI32Imm(ImmHi16));
+ Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, SDValue(Result, 0),
+ getI32Imm(RotImm & 0xffff));
+ return CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, SDValue(Result, 0),
+ getI32Imm(Shift), getI32Imm(0));
+ }
+
+ InstCnt = 0;
+ return nullptr;
+}
+
+static SDNode *selectI64Imm(SelectionDAG *CurDAG, const SDLoc &dl, uint64_t Imm,
+ unsigned *InstCnt = nullptr) {
+ unsigned InstCntDirect = 0;
+  // No more than 3 instructions are used if we can select the i64 immediate
+ // directly.
+ SDNode *Result = selectI64ImmDirect(CurDAG, dl, Imm, InstCntDirect);
+ if (Result) {
+ if (InstCnt)
+ *InstCnt = InstCntDirect;
+ return Result;
+ }
+ auto getI32Imm = [CurDAG, dl](unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
+ };
+ // Handle the upper 32 bit value.
+ Result =
+ selectI64ImmDirect(CurDAG, dl, Imm & 0xffffffff00000000, InstCntDirect);
+ // Add in the last bits as required.
+ if (uint32_t Hi16 = (Lo_32(Imm) >> 16) & 0xffff) {
+ Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64,
+ SDValue(Result, 0), getI32Imm(Hi16));
+ ++InstCntDirect;
+ }
+ if (uint32_t Lo16 = Lo_32(Imm) & 0xffff) {
+ Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, SDValue(Result, 0),
+ getI32Imm(Lo16));
+ ++InstCntDirect;
+ }
+ if (InstCnt)
+ *InstCnt = InstCntDirect;
+ return Result;
+}
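+// For illustration only (value assumed): for Imm = 0x123456789ABCDEF0 no
+// direct pattern applies, so the high doubleword 0x1234567800000000 is
+// materialized first (LIS, ORI, RLDIC) and the low word is then OR'd in with
+// ORIS of 0x9abc and ORI of 0xdef0, five instructions in total.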
+
// Select a 64-bit constant.
static SDNode *selectI64Imm(SelectionDAG *CurDAG, SDNode *N) {
SDLoc dl(N);
@@ -1253,6 +1218,7 @@ class BitPermutationSelector {
}
break;
case ISD::SHL:
+ case PPCISD::SHL:
if (isa<ConstantSDNode>(V.getOperand(1))) {
unsigned ShiftAmt = V.getConstantOperandVal(1);
@@ -1268,6 +1234,7 @@ class BitPermutationSelector {
}
break;
case ISD::SRL:
+ case PPCISD::SRL:
if (isa<ConstantSDNode>(V.getOperand(1))) {
unsigned ShiftAmt = V.getConstantOperandVal(1);
@@ -2147,11 +2114,14 @@ class BitPermutationSelector {
unsigned NumAndInsts = (unsigned) NeedsRotate +
(unsigned) (bool) Res;
+ unsigned NumOfSelectInsts = 0;
+ selectI64Imm(CurDAG, dl, Mask, &NumOfSelectInsts);
+ assert(NumOfSelectInsts > 0 && "Failed to select an i64 constant.");
if (Use32BitInsts)
NumAndInsts += (unsigned) (ANDIMask != 0) + (unsigned) (ANDISMask != 0) +
(unsigned) (ANDIMask != 0 && ANDISMask != 0);
else
- NumAndInsts += selectI64ImmInstrCount(Mask) + /* and */ 1;
+ NumAndInsts += NumOfSelectInsts + /* and */ 1;
unsigned NumRLInsts = 0;
bool FirstBG = true;
@@ -2375,12 +2345,14 @@ class BitPermutationSelector {
Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
ExtendToInt64(ANDIVal, dl), ANDISVal), 0);
} else {
- if (InstCnt) *InstCnt += selectI64ImmInstrCount(Mask) + /* and */ 1;
-
- SDValue MaskVal = SDValue(selectI64Imm(CurDAG, dl, Mask), 0);
- Res =
- SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
- ExtendToInt64(Res, dl), MaskVal), 0);
+ unsigned NumOfSelectInsts = 0;
+ SDValue MaskVal =
+ SDValue(selectI64Imm(CurDAG, dl, Mask, &NumOfSelectInsts), 0);
+ Res = SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
+ ExtendToInt64(Res, dl), MaskVal),
+ 0);
+ if (InstCnt)
+ *InstCnt += NumOfSelectInsts + /* and */ 1;
}
}
@@ -2411,7 +2383,7 @@ class BitPermutationSelector {
}
void eraseMatchingBitGroups(function_ref<bool(const BitGroup &)> F) {
- BitGroups.erase(remove_if(BitGroups, F), BitGroups.end());
+ erase_if(BitGroups, F);
}
SmallVector<ValueBit, 64> Bits;
@@ -3661,6 +3633,12 @@ bool PPCDAGToDAGISel::tryIntCompareInGPR(SDNode *N) {
if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64())
return false;
+ // For POWER10, it is more profitable to use the set boolean extension
+ // instructions rather than the integer compare elimination codegen.
+  // Users can override this via the command-line option `--ppc-gpr-icmps`.
+ if (!(CmpInGPR.getNumOccurrences() > 0) && Subtarget->isISA3_1())
+ return false;
+
switch (N->getOpcode()) {
default: break;
case ISD::ZERO_EXTEND:
@@ -3708,7 +3686,7 @@ bool PPCDAGToDAGISel::tryBitPermutation(SDNode *N) {
/// SelectCC - Select a comparison of the specified values with the specified
/// condition code, returning the CR# of the expression.
SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- const SDLoc &dl) {
+ const SDLoc &dl, SDValue Chain) {
// Always select the LHS.
unsigned Opc;
@@ -3861,7 +3839,12 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
assert(Subtarget->hasVSX() && "__float128 requires VSX");
Opc = PPC::XSCMPUQP;
}
- return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0);
+ if (Chain)
+ return SDValue(
+ CurDAG->getMachineNode(Opc, dl, MVT::i32, MVT::Other, LHS, RHS, Chain),
+ 0);
+ else
+ return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0);
}
static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC, const EVT &VT,
@@ -3936,7 +3919,8 @@ static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert) {
// getVCmpInst: return the vector compare instruction for the specified
// vector type and condition code. Since this is for altivec specific code,
-// only support the altivec types (v16i8, v8i16, v4i32, v2i64, and v4f32).
+// only support the altivec types (v16i8, v8i16, v4i32, v2i64, v1i128,
+// and v4f32).
static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC,
bool HasVSX, bool &Swap, bool &Negate) {
Swap = false;
@@ -4017,6 +4001,8 @@ static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC,
return PPC::VCMPEQUW;
else if (VecVT == MVT::v2i64)
return PPC::VCMPEQUD;
+ else if (VecVT == MVT::v1i128)
+ return PPC::VCMPEQUQ;
break;
case ISD::SETGT:
if (VecVT == MVT::v16i8)
@@ -4027,6 +4013,8 @@ static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC,
return PPC::VCMPGTSW;
else if (VecVT == MVT::v2i64)
return PPC::VCMPGTSD;
+ else if (VecVT == MVT::v1i128)
+ return PPC::VCMPGTSQ;
break;
case ISD::SETUGT:
if (VecVT == MVT::v16i8)
@@ -4037,6 +4025,8 @@ static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC,
return PPC::VCMPGTUW;
else if (VecVT == MVT::v2i64)
return PPC::VCMPGTUD;
+ else if (VecVT == MVT::v1i128)
+ return PPC::VCMPGTUQ;
break;
default:
break;
@@ -4048,17 +4038,23 @@ static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC,
bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
SDLoc dl(N);
unsigned Imm;
- ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ bool IsStrict = N->isStrictFPOpcode();
+ ISD::CondCode CC =
+ cast<CondCodeSDNode>(N->getOperand(IsStrict ? 3 : 2))->get();
EVT PtrVT =
CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout());
bool isPPC64 = (PtrVT == MVT::i64);
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+
+ SDValue LHS = N->getOperand(IsStrict ? 1 : 0);
+ SDValue RHS = N->getOperand(IsStrict ? 2 : 1);
- if (!Subtarget->useCRBits() && isInt32Immediate(N->getOperand(1), Imm)) {
+ if (!IsStrict && !Subtarget->useCRBits() && isInt32Immediate(RHS, Imm)) {
// We can codegen setcc op, imm very efficiently compared to a brcond.
// Check for those cases here.
// setcc op, 0
if (Imm == 0) {
- SDValue Op = N->getOperand(0);
+ SDValue Op = LHS;
switch (CC) {
default: break;
case ISD::SETEQ: {
@@ -4093,7 +4089,7 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
}
}
} else if (Imm == ~0U) { // setcc op, -1
- SDValue Op = N->getOperand(0);
+ SDValue Op = LHS;
switch (CC) {
default: break;
case ISD::SETEQ:
@@ -4136,13 +4132,10 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
}
}
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
-
// Altivec Vector compare instructions do not set any CR register by default and
// vector compare operations return the same type as the operands.
- if (LHS.getValueType().isVector()) {
- if (Subtarget->hasQPX() || Subtarget->hasSPE())
+ if (!IsStrict && LHS.getValueType().isVector()) {
+ if (Subtarget->hasSPE())
return false;
EVT VecVT = LHS.getValueType();
@@ -4169,7 +4162,9 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
bool Inv;
unsigned Idx = getCRIdxForSetCC(CC, Inv);
- SDValue CCReg = SelectCC(LHS, RHS, CC, dl);
+ SDValue CCReg = SelectCC(LHS, RHS, CC, dl, Chain);
+ if (IsStrict)
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), CCReg.getValue(1));
SDValue IntCR;
// SPE e*cmp* instructions only set the 'gt' bit, so hard-code that
@@ -4272,8 +4267,10 @@ static bool mayUseP9Setb(SDNode *N, const ISD::CondCode &CC, SelectionDAG *DAG,
(FalseRes.getOpcode() != ISD::SELECT_CC || CC != ISD::SETEQ)))
return false;
- bool InnerIsSel = FalseRes.getOpcode() == ISD::SELECT_CC;
- SDValue SetOrSelCC = InnerIsSel ? FalseRes : FalseRes.getOperand(0);
+ SDValue SetOrSelCC = FalseRes.getOpcode() == ISD::SELECT_CC
+ ? FalseRes
+ : FalseRes.getOperand(0);
+ bool InnerIsSel = SetOrSelCC.getOpcode() == ISD::SELECT_CC;
if (SetOrSelCC.getOpcode() != ISD::SETCC &&
SetOrSelCC.getOpcode() != ISD::SELECT_CC)
return false;
@@ -4382,6 +4379,81 @@ static bool mayUseP9Setb(SDNode *N, const ISD::CondCode &CC, SelectionDAG *DAG,
return true;
}
+// Return true if it's a software square-root/divide operand.
+static bool isSWTestOp(SDValue N) {
+ if (N.getOpcode() == PPCISD::FTSQRT)
+ return true;
+ if (N.getNumOperands() < 1 || !isa<ConstantSDNode>(N.getOperand(0)))
+ return false;
+ switch (N.getConstantOperandVal(0)) {
+ case Intrinsic::ppc_vsx_xvtdivdp:
+ case Intrinsic::ppc_vsx_xvtdivsp:
+ case Intrinsic::ppc_vsx_xvtsqrtdp:
+ case Intrinsic::ppc_vsx_xvtsqrtsp:
+ return true;
+ }
+ return false;
+}
+
+bool PPCDAGToDAGISel::tryFoldSWTestBRCC(SDNode *N) {
+ assert(N->getOpcode() == ISD::BR_CC && "ISD::BR_CC is expected.");
+  // We are looking for the following patterns, where `truncate to i1` has the
+  // same semantics as `and 1`.
+ // (br_cc seteq, (truncateToi1 SWTestOp), 0) -> (BCC PRED_NU, SWTestOp)
+ // (br_cc seteq, (and SWTestOp, 2), 0) -> (BCC PRED_NE, SWTestOp)
+ // (br_cc seteq, (and SWTestOp, 4), 0) -> (BCC PRED_LE, SWTestOp)
+ // (br_cc seteq, (and SWTestOp, 8), 0) -> (BCC PRED_GE, SWTestOp)
+ // (br_cc setne, (truncateToi1 SWTestOp), 0) -> (BCC PRED_UN, SWTestOp)
+ // (br_cc setne, (and SWTestOp, 2), 0) -> (BCC PRED_EQ, SWTestOp)
+ // (br_cc setne, (and SWTestOp, 4), 0) -> (BCC PRED_GT, SWTestOp)
+ // (br_cc setne, (and SWTestOp, 8), 0) -> (BCC PRED_LT, SWTestOp)
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
+ if (CC != ISD::SETEQ && CC != ISD::SETNE)
+ return false;
+
+ SDValue CmpRHS = N->getOperand(3);
+ if (!isa<ConstantSDNode>(CmpRHS) ||
+ cast<ConstantSDNode>(CmpRHS)->getSExtValue() != 0)
+ return false;
+
+ SDValue CmpLHS = N->getOperand(2);
+ if (CmpLHS.getNumOperands() < 1 || !isSWTestOp(CmpLHS.getOperand(0)))
+ return false;
+
+ unsigned PCC = 0;
+ bool IsCCNE = CC == ISD::SETNE;
+ if (CmpLHS.getOpcode() == ISD::AND &&
+ isa<ConstantSDNode>(CmpLHS.getOperand(1)))
+ switch (CmpLHS.getConstantOperandVal(1)) {
+ case 1:
+ PCC = IsCCNE ? PPC::PRED_UN : PPC::PRED_NU;
+ break;
+ case 2:
+ PCC = IsCCNE ? PPC::PRED_EQ : PPC::PRED_NE;
+ break;
+ case 4:
+ PCC = IsCCNE ? PPC::PRED_GT : PPC::PRED_LE;
+ break;
+ case 8:
+ PCC = IsCCNE ? PPC::PRED_LT : PPC::PRED_GE;
+ break;
+ default:
+ return false;
+ }
+ else if (CmpLHS.getOpcode() == ISD::TRUNCATE &&
+ CmpLHS.getValueType() == MVT::i1)
+ PCC = IsCCNE ? PPC::PRED_UN : PPC::PRED_NU;
+
+ if (PCC) {
+ SDLoc dl(N);
+ SDValue Ops[] = {getI32Imm(PCC, dl), CmpLHS.getOperand(0), N->getOperand(4),
+ N->getOperand(0)};
+ CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops);
+ return true;
+ }
+ return false;
+}
+
bool PPCDAGToDAGISel::tryAsSingleRLWINM(SDNode *N) {
assert(N->getOpcode() == ISD::AND && "ISD::AND SDNode expected");
unsigned Imm;
@@ -4661,7 +4733,48 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
}
break;
+ case ISD::INTRINSIC_WO_CHAIN: {
+ if (!Subtarget->isISA3_1())
+ break;
+ unsigned Opcode = 0;
+ switch (N->getConstantOperandVal(0)) {
+ default:
+ break;
+ case Intrinsic::ppc_altivec_vstribr_p:
+ Opcode = PPC::VSTRIBR_rec;
+ break;
+ case Intrinsic::ppc_altivec_vstribl_p:
+ Opcode = PPC::VSTRIBL_rec;
+ break;
+ case Intrinsic::ppc_altivec_vstrihr_p:
+ Opcode = PPC::VSTRIHR_rec;
+ break;
+ case Intrinsic::ppc_altivec_vstrihl_p:
+ Opcode = PPC::VSTRIHL_rec;
+ break;
+ }
+ if (!Opcode)
+ break;
+
+    // Generate the machine instruction that matches the vector string isolate
+    // intrinsic.
+ EVT VTs[] = {MVT::v16i8, MVT::Glue};
+ SDValue VecStrOp =
+ SDValue(CurDAG->getMachineNode(Opcode, dl, VTs, N->getOperand(2)), 0);
+ // Vector string isolate instructions update the EQ bit of CR6.
+ // Generate a SETBC instruction to extract the bit and place it in a GPR.
+ SDValue SubRegIdx = CurDAG->getTargetConstant(PPC::sub_eq, dl, MVT::i32);
+ SDValue CR6Reg = CurDAG->getRegister(PPC::CR6, MVT::i32);
+ SDValue CRBit = SDValue(
+ CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::i1,
+ CR6Reg, SubRegIdx, VecStrOp.getValue(1)),
+ 0);
+ CurDAG->SelectNodeTo(N, PPC::SETBC, MVT::i32, CRBit);
+ return;
+ }
+
case ISD::SETCC:
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS:
if (trySETCC(N))
return;
break;
@@ -4813,8 +4926,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
switch (LoadedVT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Invalid PPC load type!");
- case MVT::v4f64: Opcode = PPC::QVLFDUX; break; // QPX
- case MVT::v4f32: Opcode = PPC::QVLFSUX; break; // QPX
case MVT::f64: Opcode = PPC::LFDUX; break;
case MVT::f32: Opcode = PPC::LFSUX; break;
case MVT::i32: Opcode = PPC::LWZUX; break;
@@ -4961,6 +5072,32 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
// Other cases are autogenerated.
break;
}
+ case ISD::MUL: {
+ SDValue Op1 = N->getOperand(1);
+ if (Op1.getOpcode() != ISD::Constant || Op1.getValueType() != MVT::i64)
+ break;
+
+    // If the multiplier fits in an int16, we can handle it with mulli alone.
+ int64_t Imm = cast<ConstantSDNode>(Op1)->getZExtValue();
+ unsigned Shift = countTrailingZeros<uint64_t>(Imm);
+ if (isInt<16>(Imm) || !Shift)
+ break;
+
+ // If the shifted value fits int16, we can do this transformation:
+    // (mul X, c1 << c2) -> (rldicr (mulli X, c1), c2). We do this in ISel
+    // because DAGCombiner prefers (shl (mul X, c1), c2) -> (mul X, c1 << c2).
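+    // For illustration only (value assumed): for a multiply by 0x50000
+    // (5 << 16) we emit MULLI by 5 followed by RLDICR with shift 16 and
+    // mask end 63 - 16 = 47, which performs the remaining shift by 16.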
+ uint64_t ImmSh = Imm >> Shift;
+ if (isInt<16>(ImmSh)) {
+ uint64_t SextImm = SignExtend64(ImmSh & 0xFFFF, 16);
+ SDValue SDImm = CurDAG->getTargetConstant(SextImm, dl, MVT::i64);
+ SDNode *MulNode = CurDAG->getMachineNode(PPC::MULLI8, dl, MVT::i64,
+ N->getOperand(0), SDImm);
+ CurDAG->SelectNodeTo(N, PPC::RLDICR, MVT::i64, SDValue(MulNode, 0),
+ getI32Imm(Shift, dl), getI32Imm(63 - Shift, dl));
+ return;
+ }
+ break;
+ }
// FIXME: Remove this once the ANDI glue bug is fixed:
case PPCISD::ANDI_rec_1_EQ_BIT:
case PPCISD::ANDI_rec_1_GT_BIT: {
@@ -5095,12 +5232,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
SelectCCOp = PPC::SELECT_CC_F16;
else if (Subtarget->hasSPE())
SelectCCOp = PPC::SELECT_CC_SPE;
- else if (Subtarget->hasQPX() && N->getValueType(0) == MVT::v4f64)
- SelectCCOp = PPC::SELECT_CC_QFRC;
- else if (Subtarget->hasQPX() && N->getValueType(0) == MVT::v4f32)
- SelectCCOp = PPC::SELECT_CC_QSRC;
- else if (Subtarget->hasQPX() && N->getValueType(0) == MVT::v4i1)
- SelectCCOp = PPC::SELECT_CC_QBRC;
else if (N->getValueType(0) == MVT::v2f64 ||
N->getValueType(0) == MVT::v2i64)
SelectCCOp = PPC::SELECT_CC_VSRC;
@@ -5192,6 +5323,8 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
return;
}
case ISD::BR_CC: {
+ if (tryFoldSWTestBRCC(N))
+ return;
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
unsigned PCC =
getPredicateForSetCC(CC, N->getOperand(2).getValueType(), Subtarget);
@@ -5856,9 +5989,6 @@ void PPCDAGToDAGISel::PeepholeCROps() {
case PPC::SELECT_I8:
case PPC::SELECT_F4:
case PPC::SELECT_F8:
- case PPC::SELECT_QFRC:
- case PPC::SELECT_QSRC:
- case PPC::SELECT_QBRC:
case PPC::SELECT_SPE:
case PPC::SELECT_SPE4:
case PPC::SELECT_VRRC:
@@ -6177,9 +6307,6 @@ void PPCDAGToDAGISel::PeepholeCROps() {
case PPC::SELECT_I8:
case PPC::SELECT_F4:
case PPC::SELECT_F8:
- case PPC::SELECT_QFRC:
- case PPC::SELECT_QSRC:
- case PPC::SELECT_QBRC:
case PPC::SELECT_SPE:
case PPC::SELECT_SPE4:
case PPC::SELECT_VRRC:
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 641b2facdc41..9215c17cb94b 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -74,6 +74,7 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionXCOFF.h"
#include "llvm/MC/MCSymbolXCOFF.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
@@ -120,6 +121,11 @@ cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden);
static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables",
cl::desc("use absolute jump tables on ppc"), cl::Hidden);
+// TODO - Remove this option once soft fp128 is fully supported.
+static cl::opt<bool>
+ EnableSoftFP128("enable-soft-fp128",
+ cl::desc("temp option to enable soft fp128"), cl::Hidden);
+
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");
STATISTIC(ShufflesHandledWithVPERM, "Number of shuffles lowered to a VPERM");
@@ -145,7 +151,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
if (!useSoftFloat()) {
if (hasSPE()) {
addRegisterClass(MVT::f32, &PPC::GPRCRegClass);
- addRegisterClass(MVT::f64, &PPC::SPERCRegClass);
+ // EFPU2 APU only supports f32
+ if (!Subtarget.hasEFPU2())
+ addRegisterClass(MVT::f64, &PPC::SPERCRegClass);
} else {
addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
@@ -215,13 +223,36 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
if (isPPC64 || Subtarget.hasFPCVT()) {
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i1, Promote);
+ AddPromotedToType(ISD::STRICT_SINT_TO_FP, MVT::i1,
+ isPPC64 ? MVT::i64 : MVT::i32);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i1, Promote);
+ AddPromotedToType(ISD::STRICT_UINT_TO_FP, MVT::i1,
+ isPPC64 ? MVT::i64 : MVT::i32);
+
setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
AddPromotedToType (ISD::SINT_TO_FP, MVT::i1,
isPPC64 ? MVT::i64 : MVT::i32);
setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
AddPromotedToType(ISD::UINT_TO_FP, MVT::i1,
isPPC64 ? MVT::i64 : MVT::i32);
+
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i1, Promote);
+ AddPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::i1,
+ isPPC64 ? MVT::i64 : MVT::i32);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i1, Promote);
+ AddPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::i1,
+ isPPC64 ? MVT::i64 : MVT::i32);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
+ AddPromotedToType(ISD::FP_TO_SINT, MVT::i1,
+ isPPC64 ? MVT::i64 : MVT::i32);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
+ AddPromotedToType(ISD::FP_TO_UINT, MVT::i1,
+ isPPC64 ? MVT::i64 : MVT::i32);
} else {
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i1, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i1, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom);
}
@@ -247,6 +278,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// PPC (the libcall is not available).
setOperationAction(ISD::FP_TO_SINT, MVT::ppcf128, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::ppcf128, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::ppcf128, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::ppcf128, Custom);
// We do not currently implement these libm ops for PowerPC.
setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand);
@@ -299,8 +332,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FMA, MVT::f64, Legal);
- if (Subtarget.hasVSX())
- setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f64, Legal);
+ if (Subtarget.hasVSX()) {
+ setOperationAction(ISD::STRICT_FRINT, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FRINT, MVT::f64, Legal);
+ }
if (Subtarget.hasFSQRT()) {
setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
@@ -338,6 +373,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FMA , MVT::f32, Legal);
}
+ if (Subtarget.hasSPE())
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+
setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
// If we're enabling GP optimizations, use hardware square root
@@ -415,6 +453,16 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
if (!Subtarget.useCRBits())
setOperationAction(ISD::SETCC, MVT::i32, Custom);
+ if (Subtarget.hasFPU()) {
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Legal);
+
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Legal);
+ }
+
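STRICT_FSETCC is the quiet constrained compare and STRICT_FSETCCS the signaling one; marking both Legal lets the hardware FP compares be used directly under strict FP semantics. A hedged source-level illustration, assuming a compiler that honors strict floating-point mode (for example clang with -ffp-model=strict):

  // Relational compares raise Invalid on any NaN and map to STRICT_FSETCCS;
  // equality is quiet and maps to STRICT_FSETCC. Sketch only.
  #pragma STDC FENV_ACCESS ON
  bool lessThan(double A, double B) { return A < B; }  // signaling compare
  bool equals(double A, double B) { return A == B; }   // quiet compare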
// PowerPC does not have BRCOND which requires SetCC
if (!Subtarget.useCRBits())
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
@@ -431,9 +479,12 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
} else {
// PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
// PowerPC does not have [U|S]INT_TO_FP
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Expand);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Expand);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
}
@@ -561,36 +612,56 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
+
if (Subtarget.has64BitSupport()) {
// They also have instructions for converting between i64 and fp.
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Expand);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Expand);
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
// This is just the low 32 bits of a (signed) fp->i64 conversion.
// We cannot do this with Promote because i64 is not a legal type.
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
- if (Subtarget.hasLFIWAX() || Subtarget.isPPC64())
+ if (Subtarget.hasLFIWAX() || Subtarget.isPPC64()) {
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
+ }
} else {
// PowerPC does not have FP_TO_UINT on 32-bit implementations.
if (Subtarget.hasSPE()) {
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
- } else
+ } else {
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Expand);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+ }
}
// With the instructions enabled under FPCVT, we can do everything.
if (Subtarget.hasFPCVT()) {
if (Subtarget.has64BitSupport()) {
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
}
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
@@ -613,6 +684,15 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
}
+ // PowerPC has better expansions for funnel shifts than the generic
+ // TargetLowering::expandFunnelShift.
+ if (Subtarget.has64BitSupport()) {
+ setOperationAction(ISD::FSHL, MVT::i64, Custom);
+ setOperationAction(ISD::FSHR, MVT::i64, Custom);
+ }
+ setOperationAction(ISD::FSHL, MVT::i32, Custom);
+ setOperationAction(ISD::FSHR, MVT::i32, Custom);
+
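ISD::FSHL/FSHR are the DAG forms of the llvm.fshl/llvm.fshr funnel-shift intrinsics, which PowerPC can serve with its rotate-and-mask/insert instructions instead of the generic shift-and-or expansion. A reference model of the 32-bit left funnel shift, as a sketch:

  #include <cstdint>

  // fshl(Hi, Lo, Amt) is the high word of the 64-bit concatenation Hi:Lo
  // shifted left by Amt modulo 32; when Amt % 32 == 0 the result is Hi.
  uint32_t fshl32(uint32_t Hi, uint32_t Lo, uint32_t Amt) {
    Amt &= 31;
    if (Amt == 0)
      return Hi;                        // avoid the out-of-range shift of Lo
    return (Hi << Amt) | (Lo >> (32 - Amt));
  }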
if (Subtarget.hasVSX()) {
setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
@@ -745,9 +825,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::UMIN, MVT::v2i64, Expand);
}
- for (auto VT : {MVT::v2i64, MVT::v4i32, MVT::v8i16, MVT::v16i8})
- setOperationAction(ISD::ABS, VT, Custom);
-
// We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
// with merges, splats, etc.
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
@@ -767,6 +844,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v4i32,
Subtarget.useCRBits() ? Legal : Expand);
setOperationAction(ISD::STORE , MVT::v4i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
@@ -776,11 +857,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
- // Without hasP8Altivec set, v2i64 SMAX isn't available.
- // But ABS custom lowering requires SMAX support.
- if (!Subtarget.hasP8Altivec())
- setOperationAction(ISD::ABS, MVT::v2i64, Expand);
-
// Custom lowering ROTL v1i128 to VECTOR_SHUFFLE v16i8.
setOperationAction(ISD::ROTL, MVT::v1i128, Custom);
// With hasAltivec set, we can lower ISD::ROTL to vrl(b|h|w).
@@ -809,6 +885,27 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
else
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+ if (Subtarget.isISA3_1()) {
+ setOperationAction(ISD::MUL, MVT::v2i64, Legal);
+ setOperationAction(ISD::MULHS, MVT::v2i64, Legal);
+ setOperationAction(ISD::MULHU, MVT::v2i64, Legal);
+ setOperationAction(ISD::MULHS, MVT::v4i32, Legal);
+ setOperationAction(ISD::MULHU, MVT::v4i32, Legal);
+ setOperationAction(ISD::UDIV, MVT::v2i64, Legal);
+ setOperationAction(ISD::SDIV, MVT::v2i64, Legal);
+ setOperationAction(ISD::UDIV, MVT::v4i32, Legal);
+ setOperationAction(ISD::SDIV, MVT::v4i32, Legal);
+ setOperationAction(ISD::UREM, MVT::v2i64, Legal);
+ setOperationAction(ISD::SREM, MVT::v2i64, Legal);
+ setOperationAction(ISD::UREM, MVT::v4i32, Legal);
+ setOperationAction(ISD::SREM, MVT::v4i32, Legal);
+ setOperationAction(ISD::UREM, MVT::v1i128, Legal);
+ setOperationAction(ISD::SREM, MVT::v1i128, Legal);
+ setOperationAction(ISD::UDIV, MVT::v1i128, Legal);
+ setOperationAction(ISD::SDIV, MVT::v1i128, Legal);
+ setOperationAction(ISD::ROTL, MVT::v1i128, Legal);
+ }
+
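ISA 3.1 (Power10) adds native vector doubleword and quadword multiply, divide and modulo instructions, which is why these operations become Legal instead of being expanded or scalarized. A hedged example of code that benefits, assuming a toolchain targeting -mcpu=power10 and the GNU vector_size extension:

  // With ISA 3.1 this selects a native vector divide (e.g. vdivsd) rather
  // than being split into two scalar 64-bit divisions. Sketch only.
  typedef long long v2i64 __attribute__((vector_size(16)));

  v2i64 divEach(v2i64 A, v2i64 B) {
    return A / B; // element-wise signed division
  }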
setOperationAction(ISD::MUL, MVT::v8i16, Legal);
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
@@ -920,7 +1017,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SUB, MVT::v2i64, Expand);
}
- setOperationAction(ISD::SETCC, MVT::v1i128, Expand);
+ if (Subtarget.isISA3_1())
+ setOperationAction(ISD::SETCC, MVT::v1i128, Legal);
+ else
+ setOperationAction(ISD::SETCC, MVT::v1i128, Expand);
setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64);
@@ -929,6 +1029,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i64, Legal);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i64, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i64, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i64, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
@@ -937,6 +1041,14 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// Custom handling for partial vectors of integers converted to
// floating point. We already have optimal handling for v2i32 through
// the DAG combine, so those aren't necessary.
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i8, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i8, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i16, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i16, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i8, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i8, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i16, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i8, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i16, Custom);
@@ -968,7 +1080,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FMAXNUM, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FMINNUM, MVT::v4f32, Legal);
- setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FRINT, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FFLOOR, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FCEIL, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FTRUNC, MVT::v4f32, Legal);
@@ -982,7 +1094,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FMAXNUM, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FMINNUM, MVT::v2f64, Legal);
- setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FRINT, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FFLOOR, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FCEIL, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FTRUNC, MVT::v2f64, Legal);
@@ -1065,6 +1177,48 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::BSWAP, MVT::v4i32, Legal);
setOperationAction(ISD::BSWAP, MVT::v2i64, Legal);
setOperationAction(ISD::BSWAP, MVT::v1i128, Legal);
+ } else if (Subtarget.hasAltivec() && EnableSoftFP128) {
+ addRegisterClass(MVT::f128, &PPC::VRRCRegClass);
+
+ for (MVT FPT : MVT::fp_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand);
+
+ setOperationAction(ISD::LOAD, MVT::f128, Promote);
+ setOperationAction(ISD::STORE, MVT::f128, Promote);
+
+ AddPromotedToType(ISD::LOAD, MVT::f128, MVT::v4i32);
+ AddPromotedToType(ISD::STORE, MVT::f128, MVT::v4i32);
+
+ // Set FADD/FSUB as libcall to keep the legalizer from expanding the
+ // fp_to_uint and int_to_fp.
+ setOperationAction(ISD::FADD, MVT::f128, LibCall);
+ setOperationAction(ISD::FSUB, MVT::f128, LibCall);
+
+ setOperationAction(ISD::FMUL, MVT::f128, Expand);
+ setOperationAction(ISD::FDIV, MVT::f128, Expand);
+ setOperationAction(ISD::FNEG, MVT::f128, Expand);
+ setOperationAction(ISD::FABS, MVT::f128, Expand);
+ setOperationAction(ISD::FSIN, MVT::f128, Expand);
+ setOperationAction(ISD::FCOS, MVT::f128, Expand);
+ setOperationAction(ISD::FPOW, MVT::f128, Expand);
+ setOperationAction(ISD::FPOWI, MVT::f128, Expand);
+ setOperationAction(ISD::FREM, MVT::f128, Expand);
+ setOperationAction(ISD::FSQRT, MVT::f128, Expand);
+ setOperationAction(ISD::FMA, MVT::f128, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
+
+ setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+
+ // Expand the fp_extend if the target type is fp128.
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Expand);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Expand);
+
+ // Custom-lower the fp_round when the source type is fp128.
+ for (MVT VT : {MVT::f32, MVT::f64}) {
+ setOperationAction(ISD::FP_ROUND, VT, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
+ }
}
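The block above is the EnableSoftFP128 path: f128 values live in Altivec registers (loads and stores are promoted to v4i32) while every arithmetic operation becomes a runtime-library call or is expanded. A hedged source-level illustration, assuming a toolchain that provides __float128 (e.g. -mfloat128); the exact helper routine names are not guaranteed here:

  // Without hardware quad-precision support, an fp128 add is carried in
  // vector registers but computed by a runtime library call.
  __float128 softAdd(__float128 A, __float128 B) {
    return A + B; // lowered as a libcall under the soft-fp128 path
  }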
if (Subtarget.hasP9Altivec()) {
@@ -1081,164 +1235,24 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
}
}
- if (Subtarget.hasQPX()) {
- setOperationAction(ISD::FADD, MVT::v4f64, Legal);
- setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
- setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
- setOperationAction(ISD::FREM, MVT::v4f64, Expand);
-
- setOperationAction(ISD::FCOPYSIGN, MVT::v4f64, Legal);
- setOperationAction(ISD::FGETSIGN, MVT::v4f64, Expand);
-
- setOperationAction(ISD::LOAD , MVT::v4f64, Custom);
- setOperationAction(ISD::STORE , MVT::v4f64, Custom);
-
- setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Custom);
-
- if (!Subtarget.useCRBits())
- setOperationAction(ISD::SELECT, MVT::v4f64, Expand);
- setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
-
- setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f64, Legal);
- setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f64, Expand);
- setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f64, Expand);
- setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f64, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f64, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f64, Legal);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
-
- setOperationAction(ISD::FP_TO_SINT , MVT::v4f64, Legal);
- setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand);
-
- setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal);
- setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);
-
- setOperationAction(ISD::FNEG , MVT::v4f64, Legal);
- setOperationAction(ISD::FABS , MVT::v4f64, Legal);
- setOperationAction(ISD::FSIN , MVT::v4f64, Expand);
- setOperationAction(ISD::FCOS , MVT::v4f64, Expand);
- setOperationAction(ISD::FPOW , MVT::v4f64, Expand);
- setOperationAction(ISD::FLOG , MVT::v4f64, Expand);
- setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand);
- setOperationAction(ISD::FLOG10 , MVT::v4f64, Expand);
- setOperationAction(ISD::FEXP , MVT::v4f64, Expand);
- setOperationAction(ISD::FEXP2 , MVT::v4f64, Expand);
-
- setOperationAction(ISD::FMINNUM, MVT::v4f64, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::v4f64, Legal);
-
- setIndexedLoadAction(ISD::PRE_INC, MVT::v4f64, Legal);
- setIndexedStoreAction(ISD::PRE_INC, MVT::v4f64, Legal);
-
- addRegisterClass(MVT::v4f64, &PPC::QFRCRegClass);
-
- setOperationAction(ISD::FADD, MVT::v4f32, Legal);
- setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
- setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
- setOperationAction(ISD::FREM, MVT::v4f32, Expand);
-
- setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal);
- setOperationAction(ISD::FGETSIGN, MVT::v4f32, Expand);
-
- setOperationAction(ISD::LOAD , MVT::v4f32, Custom);
- setOperationAction(ISD::STORE , MVT::v4f32, Custom);
-
- if (!Subtarget.useCRBits())
- setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
- setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
-
- setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f32, Legal);
- setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f32, Expand);
- setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f32, Expand);
- setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f32, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f32, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
-
- setOperationAction(ISD::FP_TO_SINT , MVT::v4f32, Legal);
- setOperationAction(ISD::FP_TO_UINT , MVT::v4f32, Expand);
-
- setOperationAction(ISD::FNEG , MVT::v4f32, Legal);
- setOperationAction(ISD::FABS , MVT::v4f32, Legal);
- setOperationAction(ISD::FSIN , MVT::v4f32, Expand);
- setOperationAction(ISD::FCOS , MVT::v4f32, Expand);
- setOperationAction(ISD::FPOW , MVT::v4f32, Expand);
- setOperationAction(ISD::FLOG , MVT::v4f32, Expand);
- setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand);
- setOperationAction(ISD::FLOG10 , MVT::v4f32, Expand);
- setOperationAction(ISD::FEXP , MVT::v4f32, Expand);
- setOperationAction(ISD::FEXP2 , MVT::v4f32, Expand);
-
- setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
-
- setIndexedLoadAction(ISD::PRE_INC, MVT::v4f32, Legal);
- setIndexedStoreAction(ISD::PRE_INC, MVT::v4f32, Legal);
-
- addRegisterClass(MVT::v4f32, &PPC::QSRCRegClass);
-
- setOperationAction(ISD::AND , MVT::v4i1, Legal);
- setOperationAction(ISD::OR , MVT::v4i1, Legal);
- setOperationAction(ISD::XOR , MVT::v4i1, Legal);
-
- if (!Subtarget.useCRBits())
- setOperationAction(ISD::SELECT, MVT::v4i1, Expand);
- setOperationAction(ISD::VSELECT, MVT::v4i1, Legal);
-
- setOperationAction(ISD::LOAD , MVT::v4i1, Custom);
- setOperationAction(ISD::STORE , MVT::v4i1, Custom);
-
- setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4i1, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4i1, Expand);
- setOperationAction(ISD::CONCAT_VECTORS , MVT::v4i1, Expand);
- setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4i1, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4i1, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i1, Expand);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom);
-
- setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
-
- addRegisterClass(MVT::v4i1, &PPC::QBRCRegClass);
-
- setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
- setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
- setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
- setOperationAction(ISD::FROUND, MVT::v4f64, Legal);
-
- setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
- setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
- setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
- setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
-
- setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Expand);
- setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
-
- // These need to set FE_INEXACT, and so cannot be vectorized here.
- setOperationAction(ISD::FRINT, MVT::v4f64, Expand);
- setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
-
- if (TM.Options.UnsafeFPMath) {
- setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
- setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
-
- setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
- setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
- } else {
- setOperationAction(ISD::FDIV, MVT::v4f64, Expand);
- setOperationAction(ISD::FSQRT, MVT::v4f64, Expand);
-
- setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
- setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
- }
-
- // TODO: Handle constrained floating-point operations of v4f64
+ if (Subtarget.pairedVectorMemops()) {
+ addRegisterClass(MVT::v256i1, &PPC::VSRpRCRegClass);
+ setOperationAction(ISD::LOAD, MVT::v256i1, Custom);
+ setOperationAction(ISD::STORE, MVT::v256i1, Custom);
+ }
+ if (Subtarget.hasMMA()) {
+ addRegisterClass(MVT::v512i1, &PPC::UACCRCRegClass);
+ setOperationAction(ISD::LOAD, MVT::v512i1, Custom);
+ setOperationAction(ISD::STORE, MVT::v512i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v512i1, Custom);
}
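v256i1 and v512i1 are not used as mask types here: a vector pair is 256 bits (two VSRs) and an MMA accumulator is 512 bits (four VSRs), and these MVTs are simply the containers LLVM uses for them. A hedged usage sketch, assuming clang's Power10 MMA builtins and the __vector_quad type (-mcpu=power10); verify the builtin names against your toolchain:

  // Zero a 512-bit accumulator and perform one fp32 rank-1 update into it.
  void ger(__vector_quad *Acc, __vector unsigned char A,
           __vector unsigned char B) {
    __builtin_mma_xxsetaccz(Acc);        // Acc <- 0
    __builtin_mma_xvf32gerpp(Acc, A, B); // Acc += outer product of A and B
  }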
if (Subtarget.has64BitSupport())
setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
+ if (Subtarget.isISA3_1())
+ setOperationAction(ISD::SRA, MVT::v1i128, Legal);
+
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom);
if (!isPPC64) {
@@ -1315,8 +1329,19 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setLibcallName(RTLIB::POW_F128, "powf128");
setLibcallName(RTLIB::FMIN_F128, "fminf128");
setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
- setLibcallName(RTLIB::POWI_F128, "__powikf2");
setLibcallName(RTLIB::REM_F128, "fmodf128");
+ setLibcallName(RTLIB::SQRT_F128, "sqrtf128");
+ setLibcallName(RTLIB::CEIL_F128, "ceilf128");
+ setLibcallName(RTLIB::FLOOR_F128, "floorf128");
+ setLibcallName(RTLIB::TRUNC_F128, "truncf128");
+ setLibcallName(RTLIB::ROUND_F128, "roundf128");
+ setLibcallName(RTLIB::LROUND_F128, "lroundf128");
+ setLibcallName(RTLIB::LLROUND_F128, "llroundf128");
+ setLibcallName(RTLIB::RINT_F128, "rintf128");
+ setLibcallName(RTLIB::LRINT_F128, "lrintf128");
+ setLibcallName(RTLIB::LLRINT_F128, "llrintf128");
+ setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128");
+ setLibcallName(RTLIB::FMA_F128, "fmaf128");
// With 32 condition bits, we don't need to sink (and duplicate) compares
// aggressively in CodeGenPrep.
@@ -1379,6 +1404,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
MaxLoadsPerMemcmpOptSize = 4;
}
+ IsStrictFPEnabled = true;
+
// Let the subtarget (CPU) decide if a predictable select is more expensive
// than the corresponding branch. This information is used in CGP to decide
// when to convert selects into branches.
@@ -1421,8 +1448,8 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty,
// 16byte and wider vectors are passed on 16byte boundary.
// The rest is 8 on PPC64 and 4 on PPC32 boundary.
Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4);
- if (Subtarget.hasAltivec() || Subtarget.hasQPX())
- getMaxByValAlign(Ty, Alignment, Subtarget.hasQPX() ? Align(32) : Align(16));
+ if (Subtarget.hasAltivec())
+ getMaxByValAlign(Ty, Alignment, Align(16));
return Alignment.value();
}
@@ -1438,16 +1465,6 @@ bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
return VT.isScalarInteger();
}
-/// isMulhCheaperThanMulShift - Return true if a mulh[s|u] node for a specific
-/// type is cheaper than a multiply followed by a shift.
-/// This is true for words and doublewords on 64-bit PowerPC.
-bool PPCTargetLowering::isMulhCheaperThanMulShift(EVT Type) const {
- if (Subtarget.isPPC64() && (isOperationLegal(ISD::MULHS, Type) ||
- isOperationLegal(ISD::MULHU, Type)))
- return true;
- return TargetLowering::isMulhCheaperThanMulShift(Type);
-}
-
const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((PPCISD::NodeType)Opcode) {
case PPCISD::FIRST_NUMBER: break;
@@ -1468,6 +1485,10 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
return "PPCISD::FP_TO_SINT_IN_VSR";
case PPCISD::FRE: return "PPCISD::FRE";
case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE";
+ case PPCISD::FTSQRT:
+ return "PPCISD::FTSQRT";
+ case PPCISD::FSQRT:
+ return "PPCISD::FSQRT";
case PPCISD::STFIWX: return "PPCISD::STFIWX";
case PPCISD::VPERM: return "PPCISD::VPERM";
case PPCISD::XXSPLT: return "PPCISD::XXSPLT";
@@ -1515,7 +1536,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::ANDI_rec_1_GT_BIT:
return "PPCISD::ANDI_rec_1_GT_BIT";
case PPCISD::VCMP: return "PPCISD::VCMP";
- case PPCISD::VCMPo: return "PPCISD::VCMPo";
+ case PPCISD::VCMP_rec: return "PPCISD::VCMP_rec";
case PPCISD::LBRX: return "PPCISD::LBRX";
case PPCISD::STBRX: return "PPCISD::STBRX";
case PPCISD::LFIWAX: return "PPCISD::LFIWAX";
@@ -1552,6 +1573,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR";
case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";
case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L";
+ case PPCISD::PADDI_DTPREL:
+ return "PPCISD::PADDI_DTPREL";
case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT";
case PPCISD::SC: return "PPCISD::SC";
case PPCISD::CLRBHRB: return "PPCISD::CLRBHRB";
@@ -1560,12 +1583,6 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD";
case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN";
case PPCISD::VABSD: return "PPCISD::VABSD";
- case PPCISD::QVFPERM: return "PPCISD::QVFPERM";
- case PPCISD::QVGPCI: return "PPCISD::QVGPCI";
- case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI";
- case PPCISD::QVESPLATI: return "PPCISD::QVESPLATI";
- case PPCISD::QBFLT: return "PPCISD::QBFLT";
- case PPCISD::QVLFSb: return "PPCISD::QVLFSb";
case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128";
case PPCISD::BUILD_SPE64: return "PPCISD::BUILD_SPE64";
case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE";
@@ -1573,8 +1590,35 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH";
case PPCISD::FP_EXTEND_HALF: return "PPCISD::FP_EXTEND_HALF";
case PPCISD::MAT_PCREL_ADDR: return "PPCISD::MAT_PCREL_ADDR";
+ case PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR:
+ return "PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR";
+ case PPCISD::TLS_LOCAL_EXEC_MAT_ADDR:
+ return "PPCISD::TLS_LOCAL_EXEC_MAT_ADDR";
+ case PPCISD::ACC_BUILD: return "PPCISD::ACC_BUILD";
+ case PPCISD::PAIR_BUILD: return "PPCISD::PAIR_BUILD";
+ case PPCISD::EXTRACT_VSX_REG: return "PPCISD::EXTRACT_VSX_REG";
+ case PPCISD::XXMFACC: return "PPCISD::XXMFACC";
case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT";
case PPCISD::FNMSUB: return "PPCISD::FNMSUB";
+ case PPCISD::STRICT_FADDRTZ:
+ return "PPCISD::STRICT_FADDRTZ";
+ case PPCISD::STRICT_FCTIDZ:
+ return "PPCISD::STRICT_FCTIDZ";
+ case PPCISD::STRICT_FCTIWZ:
+ return "PPCISD::STRICT_FCTIWZ";
+ case PPCISD::STRICT_FCTIDUZ:
+ return "PPCISD::STRICT_FCTIDUZ";
+ case PPCISD::STRICT_FCTIWUZ:
+ return "PPCISD::STRICT_FCTIWUZ";
+ case PPCISD::STRICT_FCFID:
+ return "PPCISD::STRICT_FCFID";
+ case PPCISD::STRICT_FCFIDU:
+ return "PPCISD::STRICT_FCFIDU";
+ case PPCISD::STRICT_FCFIDS:
+ return "PPCISD::STRICT_FCFIDS";
+ case PPCISD::STRICT_FCFIDUS:
+ return "PPCISD::STRICT_FCFIDUS";
+ case PPCISD::LXVRZX: return "PPCISD::LXVRZX";
}
return nullptr;
}
@@ -1584,9 +1628,6 @@ EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C,
if (!VT.isVector())
return Subtarget.useCRBits() ? MVT::i1 : MVT::i32;
- if (Subtarget.hasQPX())
- return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements());
-
return VT.changeVectorElementTypeToInteger();
}
@@ -2360,36 +2401,6 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
return SDValue();
}
-/// isQVALIGNIShuffleMask - If this is a qvaligni shuffle mask, return the shift
-/// amount, otherwise return -1.
-int PPC::isQVALIGNIShuffleMask(SDNode *N) {
- EVT VT = N->getValueType(0);
- if (VT != MVT::v4f64 && VT != MVT::v4f32 && VT != MVT::v4i1)
- return -1;
-
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
-
- // Find the first non-undef value in the shuffle mask.
- unsigned i;
- for (i = 0; i != 4 && SVOp->getMaskElt(i) < 0; ++i)
- /*search*/;
-
- if (i == 4) return -1; // all undef.
-
- // Otherwise, check to see if the rest of the elements are consecutively
- // numbered from this value.
- unsigned ShiftAmt = SVOp->getMaskElt(i);
- if (ShiftAmt < i) return -1;
- ShiftAmt -= i;
-
- // Check the rest of the elements to see if they are consecutive.
- for (++i; i != 4; ++i)
- if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
- return -1;
-
- return ShiftAmt;
-}
-
//===----------------------------------------------------------------------===//
// Addressing Mode Selection
//===----------------------------------------------------------------------===//
@@ -2431,6 +2442,20 @@ bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base,
return false;
}
+/// isIntS34Immediate - This method tests whether the value of the given node
+/// can be accurately represented as a sign extension from a 34-bit value. If
+/// so, this returns true and sets Imm to the immediate.
+bool llvm::isIntS34Immediate(SDNode *N, int64_t &Imm) {
+ if (!isa<ConstantSDNode>(N))
+ return false;
+
+ Imm = (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
+ return isInt<34>(Imm);
+}
+bool llvm::isIntS34Immediate(SDValue Op, int64_t &Imm) {
+ return isIntS34Immediate(Op.getNode(), Imm);
+}
+
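These helpers mirror the existing isIntS16Immediate and feed the ISA 3.1 prefixed addressing forms, whose displacement field is a 34-bit signed value. A minimal sketch of the range test isInt<34> performs:

  #include <cstdint>

  // A value fits a signed 34-bit immediate iff it lies in [-2^33, 2^33).
  bool fitsSigned34(int64_t V) {
    const int64_t Bound = int64_t(1) << 33;
    return V >= -Bound && V < Bound;
  }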
/// SelectAddressRegReg - Given the specified addressed, check to see if it
/// can be represented as an indexed [r+r] operation. Returns false if it
/// can be more efficiently represented as [r+imm]. If \p EncodingAlignment is
@@ -2631,6 +2656,55 @@ bool PPCTargetLowering::SelectAddressRegImm(
return true; // [r+0]
}
+/// Similar to the 16-bit case but for instructions that take a 34-bit
+/// displacement field (prefixed loads/stores).
+bool PPCTargetLowering::SelectAddressRegImm34(SDValue N, SDValue &Disp,
+ SDValue &Base,
+ SelectionDAG &DAG) const {
+ // Only on 64-bit targets.
+ if (N.getValueType() != MVT::i64)
+ return false;
+
+ SDLoc dl(N);
+ int64_t Imm = 0;
+
+ if (N.getOpcode() == ISD::ADD) {
+ if (!isIntS34Immediate(N.getOperand(1), Imm))
+ return false;
+ Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0)))
+ Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+ else
+ Base = N.getOperand(0);
+ return true;
+ }
+
+ if (N.getOpcode() == ISD::OR) {
+ if (!isIntS34Immediate(N.getOperand(1), Imm))
+ return false;
+ // If this is an or of disjoint bitfields, we can codegen this as an add
+ // (for better address arithmetic) if the LHS and RHS of the OR are
+ // provably disjoint.
+ KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));
+ if ((LHSKnown.Zero.getZExtValue() | ~(uint64_t)Imm) != ~0ULL)
+ return false;
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0)))
+ Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+ else
+ Base = N.getOperand(0);
+ Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());
+ return true;
+ }
+
+ if (isIntS34Immediate(N, Imm)) { // If the address is a 34-bit const.
+ Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());
+ Base = DAG.getRegister(PPC::ZERO8, N.getValueType());
+ return true;
+ }
+
+ return false;
+}
+
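The ISD::OR case above relies on the usual disjoint-bits trick: when computeKnownBits proves that every bit set in the 34-bit immediate is known to be zero in the other operand, the OR behaves exactly like an ADD, so the immediate can be folded into the displacement. A sketch of that test (names hypothetical):

  #include <cstdint>

  // OR equals ADD when the operands share no set bits. KnownZeroLHS stands in
  // for DAG.computeKnownBits(LHS).Zero; Imm is the candidate displacement.
  bool orActsLikeAdd(uint64_t KnownZeroLHS, uint64_t Imm) {
    return (KnownZeroLHS | ~Imm) == ~0ULL; // every set bit of Imm is known zero in LHS
  }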
/// SelectAddressRegRegOnly - Given the specified addressed, force it to be
/// represented as an indexed [r+r] operation.
bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
@@ -2760,16 +2834,9 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
return false;
}
- // PowerPC doesn't have preinc load/store instructions for vectors (except
- // for QPX, which does have preinc r+r forms).
- if (VT.isVector()) {
- if (!Subtarget.hasQPX() || (VT != MVT::v4f64 && VT != MVT::v4f32)) {
- return false;
- } else if (SelectAddressRegRegOnly(Ptr, Offset, Base, DAG)) {
- AM = ISD::PRE_INC;
- return true;
- }
- }
+ // PowerPC doesn't have preinc load/store instructions for vectors
+ if (VT.isVector())
+ return false;
if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) {
// Common code will reject creating a pre-inc form if the base pointer
@@ -3064,6 +3131,15 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
TLSModel::Model Model = TM.getTLSModel(GV);
if (Model == TLSModel::LocalExec) {
+ if (Subtarget.isUsingPCRelativeCalls()) {
+ SDValue TLSReg = DAG.getRegister(PPC::X13, MVT::i64);
+ SDValue TGA = DAG.getTargetGlobalAddress(
+ GV, dl, PtrVT, 0, (PPCII::MO_PCREL_FLAG | PPCII::MO_TPREL_FLAG));
+ SDValue MatAddr =
+ DAG.getNode(PPCISD::TLS_LOCAL_EXEC_MAT_ADDR, dl, PtrVT, TGA);
+ return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TLSReg, MatAddr);
+ }
+
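In the local-exec model the address of a thread-local variable is just the thread pointer (X13 on 64-bit ELF, as the code above uses) plus a constant offset; the new path materializes that offset via TLS_LOCAL_EXEC_MAT_ADDR and adds it with ADD_TLS instead of the two-instruction HA/LO sequence. A hedged source-level example of a variable eligible for this model:

  // A TLS variable defined in the current module; when the local-exec model
  // applies (for instance in the main executable), &Counter lowers to X13
  // plus a constant thread-pointer-relative offset.
  static thread_local int Counter = 0;

  int *counterAddr() { return &Counter; }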
SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
PPCII::MO_TPREL_HA);
SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
@@ -3076,29 +3152,44 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
}
if (Model == TLSModel::InitialExec) {
- SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
- SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
- PPCII::MO_TLS);
- SDValue GOTPtr;
- if (is64bit) {
- setUsesTOCBasePtr(DAG);
- SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
- GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl,
- PtrVT, GOTReg, TGA);
+ bool IsPCRel = Subtarget.isUsingPCRelativeCalls();
+ SDValue TGA = DAG.getTargetGlobalAddress(
+ GV, dl, PtrVT, 0, IsPCRel ? PPCII::MO_GOT_TPREL_PCREL_FLAG : 0);
+ SDValue TGATLS = DAG.getTargetGlobalAddress(
+ GV, dl, PtrVT, 0,
+ IsPCRel ? (PPCII::MO_TLS | PPCII::MO_PCREL_FLAG) : PPCII::MO_TLS);
+ SDValue TPOffset;
+ if (IsPCRel) {
+ SDValue MatPCRel = DAG.getNode(PPCISD::MAT_PCREL_ADDR, dl, PtrVT, TGA);
+ TPOffset = DAG.getLoad(MVT::i64, dl, DAG.getEntryNode(), MatPCRel,
+ MachinePointerInfo());
} else {
- if (!TM.isPositionIndependent())
- GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT);
- else if (picLevel == PICLevel::SmallPIC)
- GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
- else
- GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
+ SDValue GOTPtr;
+ if (is64bit) {
+ setUsesTOCBasePtr(DAG);
+ SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
+ GOTPtr =
+ DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl, PtrVT, GOTReg, TGA);
+ } else {
+ if (!TM.isPositionIndependent())
+ GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT);
+ else if (picLevel == PICLevel::SmallPIC)
+ GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
+ else
+ GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
+ }
+ TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl, PtrVT, TGA, GOTPtr);
}
- SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl,
- PtrVT, TGA, GOTPtr);
return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS);
}
if (Model == TLSModel::GeneralDynamic) {
+ if (Subtarget.isUsingPCRelativeCalls()) {
+ SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+ PPCII::MO_GOT_TLSGD_PCREL_FLAG);
+ return DAG.getNode(PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR, dl, PtrVT, TGA);
+ }
+
SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
SDValue GOTPtr;
if (is64bit) {
@@ -3117,6 +3208,14 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
}
if (Model == TLSModel::LocalDynamic) {
+ if (Subtarget.isUsingPCRelativeCalls()) {
+ SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+ PPCII::MO_GOT_TLSLD_PCREL_FLAG);
+ SDValue MatPCRel =
+ DAG.getNode(PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR, dl, PtrVT, TGA);
+ return DAG.getNode(PPCISD::PADDI_DTPREL, dl, PtrVT, MatPCRel, TGA);
+ }
+
SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
SDValue GOTPtr;
if (is64bit) {
@@ -3491,11 +3590,6 @@ static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5,
PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10,
PPC::F11, PPC::F12, PPC::F13};
-/// QFPR - The set of QPX registers that should be allocated for arguments.
-static const MCPhysReg QFPR[] = {
- PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7,
- PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13};
-
/// CalculateStackSlotSize - Calculates the size reserved for this argument on
/// the stack.
static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
@@ -3525,10 +3619,6 @@ static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
Alignment = Align(16);
- // QPX vector types stored in double-precision are padded to a 32 byte
- // boundary.
- else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1)
- Alignment = Align(32);
// ByVal parameters are aligned as requested.
if (Flags.isByVal()) {
@@ -3560,14 +3650,11 @@ static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
/// stack slot (instead of being passed in registers). ArgOffset,
/// AvailableFPRs, and AvailableVRs must hold the current argument
/// position, and will be updated to account for this argument.
-static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
- ISD::ArgFlagsTy Flags,
- unsigned PtrByteSize,
- unsigned LinkageSize,
- unsigned ParamAreaSize,
- unsigned &ArgOffset,
+static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, ISD::ArgFlagsTy Flags,
+ unsigned PtrByteSize, unsigned LinkageSize,
+ unsigned ParamAreaSize, unsigned &ArgOffset,
unsigned &AvailableFPRs,
- unsigned &AvailableVRs, bool HasQPX) {
+ unsigned &AvailableVRs) {
bool UseMemory = false;
// Respect alignment of argument on the stack.
@@ -3591,11 +3678,7 @@ static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
// However, if the argument is actually passed in an FPR or a VR,
// we don't use memory after all.
if (!Flags.isByVal()) {
- if (ArgVT == MVT::f32 || ArgVT == MVT::f64 ||
- // QPX registers overlap with the scalar FP registers.
- (HasQPX && (ArgVT == MVT::v4f32 ||
- ArgVT == MVT::v4f64 ||
- ArgVT == MVT::v4i1)))
+ if (ArgVT == MVT::f32 || ArgVT == MVT::f64)
if (AvailableFPRs > 0) {
--AvailableFPRs;
return false;
@@ -3630,11 +3713,8 @@ SDValue PPCTargetLowering::LowerFormalArguments(
if (Subtarget.is64BitELFABI())
return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
InVals);
- if (Subtarget.is32BitELFABI())
- return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
- InVals);
-
- return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins, dl, DAG,
+ assert(Subtarget.is32BitELFABI());
+ return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
InVals);
}
@@ -3734,18 +3814,12 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
RC = &PPC::VRRCRegClass;
break;
case MVT::v4f32:
- RC = Subtarget.hasQPX() ? &PPC::QSRCRegClass : &PPC::VRRCRegClass;
+ RC = &PPC::VRRCRegClass;
break;
case MVT::v2f64:
case MVT::v2i64:
RC = &PPC::VRRCRegClass;
break;
- case MVT::v4f64:
- RC = &PPC::QFRCRegClass;
- break;
- case MVT::v4i1:
- RC = &PPC::QBRCRegClass;
- break;
}
SDValue ArgValue;
@@ -3944,7 +4018,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
const unsigned Num_GPR_Regs = array_lengthof(GPR);
const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
const unsigned Num_VR_Regs = array_lengthof(VR);
- const unsigned Num_QFPR_Regs = Num_FPR_Regs;
// Do a first pass over the arguments to determine whether the ABI
// guarantees that our caller has allocated the parameter save area
@@ -3963,8 +4036,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags,
PtrByteSize, LinkageSize, ParamAreaSize,
- NumBytes, AvailableFPRs, AvailableVRs,
- Subtarget.hasQPX()))
+ NumBytes, AvailableFPRs, AvailableVRs))
HasParameterArea = true;
}
@@ -3974,7 +4046,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
unsigned ArgOffset = LinkageSize;
unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
- unsigned &QFPR_idx = FPR_idx;
SmallVector<SDValue, 8> MemOps;
Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
unsigned CurArgIdx = 0;
@@ -4217,51 +4288,20 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
case MVT::v2i64:
case MVT::v1i128:
case MVT::f128:
- if (!Subtarget.hasQPX()) {
- // These can be scalar arguments or elements of a vector array type
- // passed directly. The latter are used to implement ELFv2 homogenous
- // vector aggregates.
- if (VR_idx != Num_VR_Regs) {
- unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
- ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
- ++VR_idx;
- } else {
- if (CallConv == CallingConv::Fast)
- ComputeArgOffset();
- needsLoad = true;
- }
- if (CallConv != CallingConv::Fast || needsLoad)
- ArgOffset += 16;
- break;
- } // not QPX
-
- assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 &&
- "Invalid QPX parameter type");
- LLVM_FALLTHROUGH;
-
- case MVT::v4f64:
- case MVT::v4i1:
- // QPX vectors are treated like their scalar floating-point subregisters
- // (except that they're larger).
- unsigned Sz = ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 ? 16 : 32;
- if (QFPR_idx != Num_QFPR_Regs) {
- const TargetRegisterClass *RC;
- switch (ObjectVT.getSimpleVT().SimpleTy) {
- case MVT::v4f64: RC = &PPC::QFRCRegClass; break;
- case MVT::v4f32: RC = &PPC::QSRCRegClass; break;
- default: RC = &PPC::QBRCRegClass; break;
- }
-
- unsigned VReg = MF.addLiveIn(QFPR[QFPR_idx], RC);
+ // These can be scalar arguments or elements of a vector array type
+ // passed directly. The latter are used to implement ELFv2 homogenous
+ // vector aggregates.
+ if (VR_idx != Num_VR_Regs) {
+ unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
- ++QFPR_idx;
+ ++VR_idx;
} else {
if (CallConv == CallingConv::Fast)
ComputeArgOffset();
needsLoad = true;
}
if (CallConv != CallingConv::Fast || needsLoad)
- ArgOffset += Sz;
+ ArgOffset += 16;
break;
}
@@ -4328,366 +4368,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
return Chain;
}
-SDValue PPCTargetLowering::LowerFormalArguments_Darwin(
- SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
- SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
- // TODO: add description of PPC stack frame format, or at least some docs.
- //
- MachineFunction &MF = DAG.getMachineFunction();
- MachineFrameInfo &MFI = MF.getFrameInfo();
- PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
-
- EVT PtrVT = getPointerTy(MF.getDataLayout());
- bool isPPC64 = PtrVT == MVT::i64;
- // Potential tail calls could cause overwriting of argument stack slots.
- bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
- (CallConv == CallingConv::Fast));
- unsigned PtrByteSize = isPPC64 ? 8 : 4;
- unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
- unsigned ArgOffset = LinkageSize;
- // Area that is at least reserved in caller of this function.
- unsigned MinReservedArea = ArgOffset;
-
- static const MCPhysReg GPR_32[] = { // 32-bit registers.
- PPC::R3, PPC::R4, PPC::R5, PPC::R6,
- PPC::R7, PPC::R8, PPC::R9, PPC::R10,
- };
- static const MCPhysReg GPR_64[] = { // 64-bit registers.
- PPC::X3, PPC::X4, PPC::X5, PPC::X6,
- PPC::X7, PPC::X8, PPC::X9, PPC::X10,
- };
- static const MCPhysReg VR[] = {
- PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
- PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
- };
-
- const unsigned Num_GPR_Regs = array_lengthof(GPR_32);
- const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
- const unsigned Num_VR_Regs = array_lengthof( VR);
-
- unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
-
- const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;
-
- // In 32-bit non-varargs functions, the stack space for vectors is after the
- // stack space for non-vectors. We do not use this space unless we have
- // too many vectors to fit in registers, something that only occurs in
- // constructed examples:), but we have to walk the arglist to figure
- // that out...for the pathological case, compute VecArgOffset as the
- // start of the vector parameter area. Computing VecArgOffset is the
- // entire point of the following loop.
- unsigned VecArgOffset = ArgOffset;
- if (!isVarArg && !isPPC64) {
- for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e;
- ++ArgNo) {
- EVT ObjectVT = Ins[ArgNo].VT;
- ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
-
- if (Flags.isByVal()) {
- // ObjSize is the true size, ArgSize rounded up to multiple of regs.
- unsigned ObjSize = Flags.getByValSize();
- unsigned ArgSize =
- ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
- VecArgOffset += ArgSize;
- continue;
- }
-
- switch(ObjectVT.getSimpleVT().SimpleTy) {
- default: llvm_unreachable("Unhandled argument type!");
- case MVT::i1:
- case MVT::i32:
- case MVT::f32:
- VecArgOffset += 4;
- break;
- case MVT::i64: // PPC64
- case MVT::f64:
- // FIXME: We are guaranteed to be !isPPC64 at this point.
- // Does MVT::i64 apply?
- VecArgOffset += 8;
- break;
- case MVT::v4f32:
- case MVT::v4i32:
- case MVT::v8i16:
- case MVT::v16i8:
- // Nothing to do, we're only looking at Nonvector args here.
- break;
- }
- }
- }
- // We've found where the vector parameter area in memory is. Skip the
- // first 12 parameters; these don't use that memory.
- VecArgOffset = ((VecArgOffset+15)/16)*16;
- VecArgOffset += 12*16;
-
- // Add DAG nodes to load the arguments or copy them out of registers. On
- // entry to a function on PPC, the arguments start after the linkage area,
- // although the first ones are often in registers.
-
- SmallVector<SDValue, 8> MemOps;
- unsigned nAltivecParamsAtEnd = 0;
- Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
- unsigned CurArgIdx = 0;
- for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
- SDValue ArgVal;
- bool needsLoad = false;
- EVT ObjectVT = Ins[ArgNo].VT;
- unsigned ObjSize = ObjectVT.getSizeInBits()/8;
- unsigned ArgSize = ObjSize;
- ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
- if (Ins[ArgNo].isOrigArg()) {
- std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
- CurArgIdx = Ins[ArgNo].getOrigArgIndex();
- }
- unsigned CurArgOffset = ArgOffset;
-
- // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary.
- if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 ||
- ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) {
- if (isVarArg || isPPC64) {
- MinReservedArea = ((MinReservedArea+15)/16)*16;
- MinReservedArea += CalculateStackSlotSize(ObjectVT,
- Flags,
- PtrByteSize);
- } else nAltivecParamsAtEnd++;
- } else
- // Calculate min reserved area.
- MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT,
- Flags,
- PtrByteSize);
-
- // FIXME the codegen can be much improved in some cases.
- // We do not have to keep everything in memory.
- if (Flags.isByVal()) {
- assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");
-
- // ObjSize is the true size, ArgSize rounded up to multiple of registers.
- ObjSize = Flags.getByValSize();
- ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
- // Objects of size 1 and 2 are right justified, everything else is
- // left justified. This means the memory address is adjusted forwards.
- if (ObjSize==1 || ObjSize==2) {
- CurArgOffset = CurArgOffset + (4 - ObjSize);
- }
- // The value of the object is its address.
- int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, false, true);
- SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- InVals.push_back(FIN);
- if (ObjSize==1 || ObjSize==2) {
- if (GPR_idx != Num_GPR_Regs) {
- unsigned VReg;
- if (isPPC64)
- VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
- else
- VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
- SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
- EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16;
- SDValue Store =
- DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(&*FuncArg), ObjType);
- MemOps.push_back(Store);
- ++GPR_idx;
- }
-
- ArgOffset += PtrByteSize;
-
- continue;
- }
- for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
- // Store whatever pieces of the object are in registers
- // to memory. ArgOffset will be the address of the beginning
- // of the object.
- if (GPR_idx != Num_GPR_Regs) {
- unsigned VReg;
- if (isPPC64)
- VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
- else
- VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
- int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true);
- SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
- SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(&*FuncArg, j));
- MemOps.push_back(Store);
- ++GPR_idx;
- ArgOffset += PtrByteSize;
- } else {
- ArgOffset += ArgSize - (ArgOffset-CurArgOffset);
- break;
- }
- }
- continue;
- }
-
- switch (ObjectVT.getSimpleVT().SimpleTy) {
- default: llvm_unreachable("Unhandled argument type!");
- case MVT::i1:
- case MVT::i32:
- if (!isPPC64) {
- if (GPR_idx != Num_GPR_Regs) {
- unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
- ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
-
- if (ObjectVT == MVT::i1)
- ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgVal);
-
- ++GPR_idx;
- } else {
- needsLoad = true;
- ArgSize = PtrByteSize;
- }
- // All int arguments reserve stack space in the Darwin ABI.
- ArgOffset += PtrByteSize;
- break;
- }
- LLVM_FALLTHROUGH;
- case MVT::i64: // PPC64
- if (GPR_idx != Num_GPR_Regs) {
- unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
- ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
-
- if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
- // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
- // value to MVT::i64 and then truncate to the correct register size.
- ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
-
- ++GPR_idx;
- } else {
- needsLoad = true;
- ArgSize = PtrByteSize;
- }
- // All int arguments reserve stack space in the Darwin ABI.
- ArgOffset += 8;
- break;
-
- case MVT::f32:
- case MVT::f64:
- // Every 4 bytes of argument space consumes one of the GPRs available for
- // argument passing.
- if (GPR_idx != Num_GPR_Regs) {
- ++GPR_idx;
- if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64)
- ++GPR_idx;
- }
- if (FPR_idx != Num_FPR_Regs) {
- unsigned VReg;
-
- if (ObjectVT == MVT::f32)
- VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
- else
- VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass);
-
- ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
- ++FPR_idx;
- } else {
- needsLoad = true;
- }
-
- // All FP arguments reserve stack space in the Darwin ABI.
- ArgOffset += isPPC64 ? 8 : ObjSize;
- break;
- case MVT::v4f32:
- case MVT::v4i32:
- case MVT::v8i16:
- case MVT::v16i8:
- // Note that vector arguments in registers don't reserve stack space,
- // except in varargs functions.
- if (VR_idx != Num_VR_Regs) {
- unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
- ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
- if (isVarArg) {
- while ((ArgOffset % 16) != 0) {
- ArgOffset += PtrByteSize;
- if (GPR_idx != Num_GPR_Regs)
- GPR_idx++;
- }
- ArgOffset += 16;
- GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64?
- }
- ++VR_idx;
- } else {
- if (!isVarArg && !isPPC64) {
- // Vectors go after all the nonvectors.
- CurArgOffset = VecArgOffset;
- VecArgOffset += 16;
- } else {
- // Vectors are aligned.
- ArgOffset = ((ArgOffset+15)/16)*16;
- CurArgOffset = ArgOffset;
- ArgOffset += 16;
- }
- needsLoad = true;
- }
- break;
- }
-
- // We need to load the argument to a virtual register if we determined above
- // that we ran out of physical registers of the appropriate type.
- if (needsLoad) {
- int FI = MFI.CreateFixedObject(ObjSize,
- CurArgOffset + (ArgSize - ObjSize),
- isImmutable);
- SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo());
- }
-
- InVals.push_back(ArgVal);
- }
-
- // Allow for Altivec parameters at the end, if needed.
- if (nAltivecParamsAtEnd) {
- MinReservedArea = ((MinReservedArea+15)/16)*16;
- MinReservedArea += 16*nAltivecParamsAtEnd;
- }
-
- // Area that is at least reserved in the caller of this function.
- MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize);
-
- // Set the size that is at least reserved in caller of this function. Tail
- // call optimized functions' reserved stack space needs to be aligned so that
- // taking the difference between two stack areas will result in an aligned
- // stack.
- MinReservedArea =
- EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
- FuncInfo->setMinReservedArea(MinReservedArea);
-
- // If the function takes variable number of arguments, make a frame index for
- // the start of the first vararg value... for expansion of llvm.va_start.
- if (isVarArg) {
- int Depth = ArgOffset;
-
- FuncInfo->setVarArgsFrameIndex(
- MFI.CreateFixedObject(PtrVT.getSizeInBits()/8,
- Depth, true));
- SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
-
- // If this function is vararg, store any remaining integer argument regs
- // to their spots on the stack so that they may be loaded by dereferencing
- // the result of va_next.
- for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
- unsigned VReg;
-
- if (isPPC64)
- VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
- else
- VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
-
- SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
- SDValue Store =
- DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
- MemOps.push_back(Store);
- // Increment the address by four for the next argument to store
- SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);
- FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
- }
- }
-
- if (!MemOps.empty())
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
-
- return Chain;
-}
-
/// CalculateTailCallSPDiff - Get the amount the stack pointer has to be
/// adjusted to accommodate the arguments for the tailcall.
static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
@@ -4758,6 +4438,13 @@ static bool callsShareTOCBase(const Function *Caller, SDValue Callee,
if (STICallee->isUsingPCRelativeCalls())
return false;
+ // If the GV is not a strong definition then we need to assume it can be
+ // replaced by another function at link time. The replacement may not share
+ // the same TOC as the caller, since the callee could be swapped for a
+ // PC-relative version of the same function.
+ if (!GV->isStrongDefinitionForLinker())
+ return false;
+
// The medium and large code models are expected to provide a sufficiently
// large TOC to provide all data addressing needs of a module with a
// single TOC.
@@ -4765,12 +4452,6 @@ static bool callsShareTOCBase(const Function *Caller, SDValue Callee,
CodeModel::Large == TM.getCodeModel())
return true;
- // Otherwise we need to ensure callee and caller are in the same section,
- // since the linker may allocate multiple TOCs, and we don't know which
- // sections will belong to the same TOC base.
- if (!GV->isStrongDefinitionForLinker())
- return false;
-
// Any explicitly-specified sections and section prefixes must also match.
// Also, if we're using -ffunction-sections, then each function is always in
// a different section (the same is true for COMDAT functions).
@@ -4814,10 +4495,9 @@ needStackSlotPassParameters(const PPCSubtarget &Subtarget,
for (const ISD::OutputArg& Param : Outs) {
if (Param.Flags.isNest()) continue;
- if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags,
- PtrByteSize, LinkageSize, ParamAreaSize,
- NumBytes, AvailableFPRs, AvailableVRs,
- Subtarget.hasQPX()))
+ if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags, PtrByteSize,
+ LinkageSize, ParamAreaSize, NumBytes,
+ AvailableFPRs, AvailableVRs))
return true;
}
return false;
@@ -5331,66 +5011,53 @@ static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
Subtarget.is32BitELFABI() && !isLocalCallee() &&
Subtarget.getTargetMachine().getRelocationModel() == Reloc::PIC_;
- // On AIX, direct function calls reference the symbol for the function's
- // entry point, which is named by prepending a "." before the function's
- // C-linkage name.
- const auto getAIXFuncEntryPointSymbolSDNode =
- [&](StringRef FuncName, bool IsDeclaration,
- const XCOFF::StorageClass &SC) {
- auto &Context = DAG.getMachineFunction().getMMI().getContext();
-
- MCSymbolXCOFF *S = cast<MCSymbolXCOFF>(
- Context.getOrCreateSymbol(Twine(".") + Twine(FuncName)));
-
- if (IsDeclaration && !S->hasRepresentedCsectSet()) {
- // On AIX, an undefined symbol needs to be associated with a
- // MCSectionXCOFF to get the correct storage mapping class.
- // In this case, XCOFF::XMC_PR.
- MCSectionXCOFF *Sec = Context.getXCOFFSection(
- S->getSymbolTableName(), XCOFF::XMC_PR, XCOFF::XTY_ER, SC,
- SectionKind::getMetadata());
- S->setRepresentedCsect(Sec);
- }
+ const auto getAIXFuncEntryPointSymbolSDNode = [&](const GlobalValue *GV) {
+ const TargetMachine &TM = Subtarget.getTargetMachine();
+ const TargetLoweringObjectFile *TLOF = TM.getObjFileLowering();
+ MCSymbolXCOFF *S =
+ cast<MCSymbolXCOFF>(TLOF->getFunctionEntryPointSymbol(GV, TM));
- MVT PtrVT =
- DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
- return DAG.getMCSymbol(S, PtrVT);
- };
+ MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ return DAG.getMCSymbol(S, PtrVT);
+ };
if (isFunctionGlobalAddress(Callee)) {
- const GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee);
- const GlobalValue *GV = G->getGlobal();
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
- if (!Subtarget.isAIXABI())
- return DAG.getTargetGlobalAddress(GV, dl, Callee.getValueType(), 0,
- UsePlt ? PPCII::MO_PLT : 0);
-
- assert(!isa<GlobalIFunc>(GV) && "IFunc is not supported on AIX.");
- const GlobalObject *GO = cast<GlobalObject>(GV);
- const XCOFF::StorageClass SC =
- TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GO);
- return getAIXFuncEntryPointSymbolSDNode(GO->getName(), GO->isDeclaration(),
- SC);
+ if (Subtarget.isAIXABI()) {
+ assert(!isa<GlobalIFunc>(GV) && "IFunc is not supported on AIX.");
+ return getAIXFuncEntryPointSymbolSDNode(GV);
+ }
+ return DAG.getTargetGlobalAddress(GV, dl, Callee.getValueType(), 0,
+ UsePlt ? PPCII::MO_PLT : 0);
}
if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
const char *SymName = S->getSymbol();
- if (!Subtarget.isAIXABI())
- return DAG.getTargetExternalSymbol(SymName, Callee.getValueType(),
- UsePlt ? PPCII::MO_PLT : 0);
+ if (Subtarget.isAIXABI()) {
+ // If there exists a user-declared function whose name is the same as the
+ // ExternalSymbol's, then we pick up the user-declared version.
+ const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
+ if (const Function *F =
+ dyn_cast_or_null<Function>(Mod->getNamedValue(SymName)))
+ return getAIXFuncEntryPointSymbolSDNode(F);
+
+ // On AIX, direct function calls reference the symbol for the function's
+ // entry point, which is named by prepending a "." before the function's
+ // C-linkage name. A Qualname is returned here because an external
+ // function entry point is a csect with XTY_ER property.
+ const auto getExternalFunctionEntryPointSymbol = [&](StringRef SymName) {
+ auto &Context = DAG.getMachineFunction().getMMI().getContext();
+ MCSectionXCOFF *Sec = Context.getXCOFFSection(
+ (Twine(".") + Twine(SymName)).str(), XCOFF::XMC_PR, XCOFF::XTY_ER,
+ SectionKind::getMetadata());
+ return Sec->getQualNameSymbol();
+ };
- // If there exists a user-declared function whose name is the same as the
- // ExternalSymbol's, then we pick up the user-declared version.
- const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
- if (const Function *F =
- dyn_cast_or_null<Function>(Mod->getNamedValue(SymName))) {
- const XCOFF::StorageClass SC =
- TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(F);
- return getAIXFuncEntryPointSymbolSDNode(F->getName(), F->isDeclaration(),
- SC);
+ SymName = getExternalFunctionEntryPointSymbol(SymName)->getName().data();
}
-
- return getAIXFuncEntryPointSymbolSDNode(SymName, true, XCOFF::C_EXT);
+ return DAG.getTargetExternalSymbol(SymName, Callee.getValueType(),
+ UsePlt ? PPCII::MO_PLT : 0);
}
// No transformation needed.
@@ -5735,19 +5402,15 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
any_of(Outs, [](ISD::OutputArg Arg) { return Arg.Flags.isNest(); }),
CLI.NoMerge);
- if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
- return LowerCall_64SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
- InVals, CB);
-
- if (Subtarget.isSVR4ABI())
- return LowerCall_32SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
- InVals, CB);
-
if (Subtarget.isAIXABI())
return LowerCall_AIX(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
InVals, CB);
- return LowerCall_Darwin(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
+ assert(Subtarget.isSVR4ABI());
+ if (Subtarget.isPPC64())
+ return LowerCall_64SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
+ InVals, CB);
+ return LowerCall_32SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
InVals, CB);
}
@@ -6044,7 +5707,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
unsigned NumBytes = LinkageSize;
unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
- unsigned &QFPR_idx = FPR_idx;
static const MCPhysReg GPR[] = {
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
@@ -6058,7 +5720,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
const unsigned NumGPRs = array_lengthof(GPR);
const unsigned NumFPRs = useSoftFloat() ? 0 : 13;
const unsigned NumVRs = array_lengthof(VR);
- const unsigned NumQFPRs = NumFPRs;
// On ELFv2, we can avoid allocating the parameter area if all the arguments
// can be passed to the callee in registers.
@@ -6073,9 +5734,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
for (unsigned i = 0; i != NumOps; ++i) {
if (Outs[i].Flags.isNest()) continue;
if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags,
- PtrByteSize, LinkageSize, ParamAreaSize,
- NumBytesTmp, AvailableFPRs, AvailableVRs,
- Subtarget.hasQPX()))
+ PtrByteSize, LinkageSize, ParamAreaSize,
+ NumBytesTmp, AvailableFPRs, AvailableVRs))
HasParameterArea = true;
}
}
@@ -6123,20 +5783,11 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
continue;
break;
case MVT::v4f32:
- // When using QPX, this is handled like a FP register, otherwise, it
- // is an Altivec register.
- if (Subtarget.hasQPX()) {
- if (++NumFPRsUsed <= NumFPRs)
- continue;
- } else {
- if (++NumVRsUsed <= NumVRs)
- continue;
- }
+ if (++NumVRsUsed <= NumVRs)
+ continue;
break;
case MVT::f32:
case MVT::f64:
- case MVT::v4f64: // QPX
- case MVT::v4i1: // QPX
if (++NumFPRsUsed <= NumFPRs)
continue;
break;
@@ -6498,7 +6149,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
case MVT::v2i64:
case MVT::v1i128:
case MVT::f128:
- if (!Subtarget.hasQPX()) {
// These can be scalar arguments or elements of a vector array type
// passed directly. The latter are used to implement ELFv2 homogenous
// vector aggregates.
@@ -6554,63 +6204,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
if (!IsFastCall)
ArgOffset += 16;
break;
- } // not QPX
-
- assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 &&
- "Invalid QPX parameter type");
-
- LLVM_FALLTHROUGH;
- case MVT::v4f64:
- case MVT::v4i1: {
- bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32;
- if (CFlags.IsVarArg) {
- assert(HasParameterArea &&
- "Parameter area must exist if we have a varargs call.");
- // We could elide this store in the case where the object fits
- // entirely in R registers. Maybe later.
- SDValue Store =
- DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
- MemOpChains.push_back(Store);
- if (QFPR_idx != NumQFPRs) {
- SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl, Store,
- PtrOff, MachinePointerInfo());
- MemOpChains.push_back(Load.getValue(1));
- RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load));
- }
- ArgOffset += (IsF32 ? 16 : 32);
- for (unsigned i = 0; i < (IsF32 ? 16U : 32U); i += PtrByteSize) {
- if (GPR_idx == NumGPRs)
- break;
- SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
- DAG.getConstant(i, dl, PtrVT));
- SDValue Load =
- DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
- MemOpChains.push_back(Load.getValue(1));
- RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
- }
- break;
- }
-
- // Non-varargs QPX params go into registers or on the stack.
- if (QFPR_idx != NumQFPRs) {
- RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg));
- } else {
- if (IsFastCall)
- ComputePtrOff();
-
- assert(HasParameterArea &&
- "Parameter area must exist to pass an argument in memory.");
- LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
- true, CFlags.IsTailCall, true, MemOpChains,
- TailCallArguments, dl);
- if (IsFastCall)
- ArgOffset += (IsF32 ? 16 : 32);
- }
-
- if (!IsFastCall)
- ArgOffset += (IsF32 ? 16 : 32);
- break;
- }
}
}
@@ -6664,384 +6257,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
Callee, SPDiff, NumBytes, Ins, InVals, CB);
}
-SDValue PPCTargetLowering::LowerCall_Darwin(
- SDValue Chain, SDValue Callee, CallFlags CFlags,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
- SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
- const CallBase *CB) const {
- unsigned NumOps = Outs.size();
-
- EVT PtrVT = getPointerTy(DAG.getDataLayout());
- bool isPPC64 = PtrVT == MVT::i64;
- unsigned PtrByteSize = isPPC64 ? 8 : 4;
-
- MachineFunction &MF = DAG.getMachineFunction();
-
- // Mark this function as potentially containing a function that contains a
- // tail call. As a consequence the frame pointer will be used for dynamicalloc
- // and restoring the callers stack pointer in this functions epilog. This is
- // done because by tail calling the called function might overwrite the value
- // in this function's (MF) stack pointer stack slot 0(SP).
- if (getTargetMachine().Options.GuaranteedTailCallOpt &&
- CFlags.CallConv == CallingConv::Fast)
- MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
-
- // Count how many bytes are to be pushed on the stack, including the linkage
- // area, and parameter passing area. We start with 24/48 bytes, which is
- // prereserved space for [SP][CR][LR][3 x unused].
- unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
- unsigned NumBytes = LinkageSize;
-
- // Add up all the space actually used.
- // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
- // they all go in registers, but we must reserve stack space for them for
- // possible use by the caller. In varargs or 64-bit calls, parameters are
- // assigned stack space in order, with padding so Altivec parameters are
- // 16-byte aligned.
- unsigned nAltivecParamsAtEnd = 0;
- for (unsigned i = 0; i != NumOps; ++i) {
- ISD::ArgFlagsTy Flags = Outs[i].Flags;
- EVT ArgVT = Outs[i].VT;
- // Varargs Altivec parameters are padded to a 16 byte boundary.
- if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
- ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
- ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) {
- if (!CFlags.IsVarArg && !isPPC64) {
- // Non-varargs Altivec parameters go after all the non-Altivec
- // parameters; handle those later so we know how much padding we need.
- nAltivecParamsAtEnd++;
- continue;
- }
- // Varargs and 64-bit Altivec parameters are padded to 16 byte boundary.
- NumBytes = ((NumBytes+15)/16)*16;
- }
- NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
- }
-
- // Allow for Altivec parameters at the end, if needed.
- if (nAltivecParamsAtEnd) {
- NumBytes = ((NumBytes+15)/16)*16;
- NumBytes += 16*nAltivecParamsAtEnd;
- }
-
- // The prolog code of the callee may store up to 8 GPR argument registers to
- // the stack, allowing va_start to index over them in memory if its varargs.
- // Because we cannot tell if this is needed on the caller side, we have to
- // conservatively assume that it is needed. As such, make sure we have at
- // least enough stack space for the caller to store the 8 GPRs.
- NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
-
- // Tail call needs the stack to be aligned.
- if (getTargetMachine().Options.GuaranteedTailCallOpt &&
- CFlags.CallConv == CallingConv::Fast)
- NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
-
- // Calculate by how many bytes the stack has to be adjusted in case of tail
- // call optimization.
- int SPDiff = CalculateTailCallSPDiff(DAG, CFlags.IsTailCall, NumBytes);
-
- // To protect arguments on the stack from being clobbered in a tail call,
- // force all the loads to happen before doing any other lowering.
- if (CFlags.IsTailCall)
- Chain = DAG.getStackArgumentTokenFactor(Chain);
-
- // Adjust the stack pointer for the new arguments...
- // These operations are automatically eliminated by the prolog/epilog pass
- Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
- SDValue CallSeqStart = Chain;
-
- // Load the return address and frame pointer so it can be move somewhere else
- // later.
- SDValue LROp, FPOp;
- Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
-
- // Set up a copy of the stack pointer for use loading and storing any
- // arguments that may not fit in the registers available for argument
- // passing.
- SDValue StackPtr;
- if (isPPC64)
- StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
- else
- StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
-
- // Figure out which arguments are going to go in registers, and which in
- // memory. Also, if this is a vararg function, floating point operations
- // must be stored to our stack, and loaded into integer regs as well, if
- // any integer regs are available for argument passing.
- unsigned ArgOffset = LinkageSize;
- unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
-
- static const MCPhysReg GPR_32[] = { // 32-bit registers.
- PPC::R3, PPC::R4, PPC::R5, PPC::R6,
- PPC::R7, PPC::R8, PPC::R9, PPC::R10,
- };
- static const MCPhysReg GPR_64[] = { // 64-bit registers.
- PPC::X3, PPC::X4, PPC::X5, PPC::X6,
- PPC::X7, PPC::X8, PPC::X9, PPC::X10,
- };
- static const MCPhysReg VR[] = {
- PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
- PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
- };
- const unsigned NumGPRs = array_lengthof(GPR_32);
- const unsigned NumFPRs = 13;
- const unsigned NumVRs = array_lengthof(VR);
-
- const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;
-
- SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
- SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
-
- SmallVector<SDValue, 8> MemOpChains;
- for (unsigned i = 0; i != NumOps; ++i) {
- SDValue Arg = OutVals[i];
- ISD::ArgFlagsTy Flags = Outs[i].Flags;
-
- // PtrOff will be used to store the current argument to the stack if a
- // register cannot be found for it.
- SDValue PtrOff;
-
- PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());
-
- PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
-
- // On PPC64, promote integers to 64-bit values.
- if (isPPC64 && Arg.getValueType() == MVT::i32) {
- // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
- unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
- Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
- }
-
- // FIXME memcpy is used way more than necessary. Correctness first.
- // Note: "by value" is code for passing a structure by value, not
- // basic types.
- if (Flags.isByVal()) {
- unsigned Size = Flags.getByValSize();
- // Very small objects are passed right-justified. Everything else is
- // passed left-justified.
- if (Size==1 || Size==2) {
- EVT VT = (Size==1) ? MVT::i8 : MVT::i16;
- if (GPR_idx != NumGPRs) {
- SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
- MachinePointerInfo(), VT);
- MemOpChains.push_back(Load.getValue(1));
- RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
-
- ArgOffset += PtrByteSize;
- } else {
- SDValue Const = DAG.getConstant(PtrByteSize - Size, dl,
- PtrOff.getValueType());
- SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
- Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
- CallSeqStart,
- Flags, DAG, dl);
- ArgOffset += PtrByteSize;
- }
- continue;
- }
- // Copy entire object into memory. There are cases where gcc-generated
- // code assumes it is there, even if it could be put entirely into
- // registers. (This is not what the doc says.)
- Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
- CallSeqStart,
- Flags, DAG, dl);
-
- // For small aggregates (Darwin only) and aggregates >= PtrByteSize,
- // copy the pieces of the object that fit into registers from the
- // parameter save area.
- for (unsigned j=0; j<Size; j+=PtrByteSize) {
- SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType());
- SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
- if (GPR_idx != NumGPRs) {
- SDValue Load =
- DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo());
- MemOpChains.push_back(Load.getValue(1));
- RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
- ArgOffset += PtrByteSize;
- } else {
- ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
- break;
- }
- }
- continue;
- }
-
- switch (Arg.getSimpleValueType().SimpleTy) {
- default: llvm_unreachable("Unexpected ValueType for argument!");
- case MVT::i1:
- case MVT::i32:
- case MVT::i64:
- if (GPR_idx != NumGPRs) {
- if (Arg.getValueType() == MVT::i1)
- Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, PtrVT, Arg);
-
- RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
- } else {
- LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
- isPPC64, CFlags.IsTailCall, false, MemOpChains,
- TailCallArguments, dl);
- }
- ArgOffset += PtrByteSize;
- break;
- case MVT::f32:
- case MVT::f64:
- if (FPR_idx != NumFPRs) {
- RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
-
- if (CFlags.IsVarArg) {
- SDValue Store =
- DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
- MemOpChains.push_back(Store);
-
- // Float varargs are always shadowed in available integer registers
- if (GPR_idx != NumGPRs) {
- SDValue Load =
- DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo());
- MemOpChains.push_back(Load.getValue(1));
- RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
- }
- if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64){
- SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType());
- PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
- SDValue Load =
- DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo());
- MemOpChains.push_back(Load.getValue(1));
- RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
- }
- } else {
- // If we have any FPRs remaining, we may also have GPRs remaining.
- // Args passed in FPRs consume either 1 (f32) or 2 (f64) available
- // GPRs.
- if (GPR_idx != NumGPRs)
- ++GPR_idx;
- if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 &&
- !isPPC64) // PPC64 has 64-bit GPR's obviously :)
- ++GPR_idx;
- }
- } else
- LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
- isPPC64, CFlags.IsTailCall, false, MemOpChains,
- TailCallArguments, dl);
- if (isPPC64)
- ArgOffset += 8;
- else
- ArgOffset += Arg.getValueType() == MVT::f32 ? 4 : 8;
- break;
- case MVT::v4f32:
- case MVT::v4i32:
- case MVT::v8i16:
- case MVT::v16i8:
- if (CFlags.IsVarArg) {
- // These go aligned on the stack, or in the corresponding R registers
- // when within range. The Darwin PPC ABI doc claims they also go in
- // V registers; in fact gcc does this only for arguments that are
- // prototyped, not for those that match the ... We do it for all
- // arguments, seems to work.
- while (ArgOffset % 16 !=0) {
- ArgOffset += PtrByteSize;
- if (GPR_idx != NumGPRs)
- GPR_idx++;
- }
- // We could elide this store in the case where the object fits
- // entirely in R registers. Maybe later.
- PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
- DAG.getConstant(ArgOffset, dl, PtrVT));
- SDValue Store =
- DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
- MemOpChains.push_back(Store);
- if (VR_idx != NumVRs) {
- SDValue Load =
- DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo());
- MemOpChains.push_back(Load.getValue(1));
- RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
- }
- ArgOffset += 16;
- for (unsigned i=0; i<16; i+=PtrByteSize) {
- if (GPR_idx == NumGPRs)
- break;
- SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
- DAG.getConstant(i, dl, PtrVT));
- SDValue Load =
- DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
- MemOpChains.push_back(Load.getValue(1));
- RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
- }
- break;
- }
-
- // Non-varargs Altivec params generally go in registers, but have
- // stack space allocated at the end.
- if (VR_idx != NumVRs) {
- // Doesn't have GPR space allocated.
- RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
- } else if (nAltivecParamsAtEnd==0) {
- // We are emitting Altivec params in order.
- LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
- isPPC64, CFlags.IsTailCall, true, MemOpChains,
- TailCallArguments, dl);
- ArgOffset += 16;
- }
- break;
- }
- }
- // If all Altivec parameters fit in registers, as they usually do,
- // they get stack space following the non-Altivec parameters. We
- // don't track this here because nobody below needs it.
- // If there are more Altivec parameters than fit in registers emit
- // the stores here.
- if (!CFlags.IsVarArg && nAltivecParamsAtEnd > NumVRs) {
- unsigned j = 0;
- // Offset is aligned; skip 1st 12 params which go in V registers.
- ArgOffset = ((ArgOffset+15)/16)*16;
- ArgOffset += 12*16;
- for (unsigned i = 0; i != NumOps; ++i) {
- SDValue Arg = OutVals[i];
- EVT ArgType = Outs[i].VT;
- if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 ||
- ArgType==MVT::v8i16 || ArgType==MVT::v16i8) {
- if (++j > NumVRs) {
- SDValue PtrOff;
- // We are emitting Altivec params in order.
- LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
- isPPC64, CFlags.IsTailCall, true, MemOpChains,
- TailCallArguments, dl);
- ArgOffset += 16;
- }
- }
- }
- }
-
- if (!MemOpChains.empty())
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
-
- // On Darwin, R12 must contain the address of an indirect callee. This does
- // not mean the MTCTR instruction must use R12; it's easier to model this as
- // an extra parameter, so do that.
- if (CFlags.IsIndirect) {
- assert(!CFlags.IsTailCall && "Indirect tail-calls not supported.");
- RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 :
- PPC::R12), Callee));
- }
-
- // Build a sequence of copy-to-reg nodes chained together with token chain
- // and flag operands which copy the outgoing args into the appropriate regs.
- SDValue InFlag;
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
- Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
- RegsToPass[i].second, InFlag);
- InFlag = Chain.getValue(1);
- }
-
- if (CFlags.IsTailCall)
- PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
- TailCallArguments);
-
- return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart,
- Callee, SPDiff, NumBytes, Ins, InVals, CB);
-}
-
static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
CCState &State) {
@@ -7052,9 +6267,10 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
const Align PtrAlign = IsPPC64 ? Align(8) : Align(4);
const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
- assert((!ValVT.isInteger() ||
- (ValVT.getSizeInBits() <= RegVT.getSizeInBits())) &&
- "Integer argument exceeds register size: should have been legalized");
+ if (ValVT.isVector() && !State.getMachineFunction()
+ .getTarget()
+ .Options.EnableAIXExtendedAltivecABI)
+ report_fatal_error("the default Altivec AIX ABI is not yet supported");
if (ValVT == MVT::f128)
report_fatal_error("f128 is unimplemented on AIX.");
@@ -7062,9 +6278,6 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
if (ArgFlags.isNest())
report_fatal_error("Nest arguments are unimplemented.");
- if (ValVT.isVector() || LocVT.isVector())
- report_fatal_error("Vector arguments are unimplemented on AIX.");
-
static const MCPhysReg GPR_32[] = {// 32-bit registers.
PPC::R3, PPC::R4, PPC::R5, PPC::R6,
PPC::R7, PPC::R8, PPC::R9, PPC::R10};
@@ -7072,6 +6285,11 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10};
+ static const MCPhysReg VR[] = {// Vector registers.
+ PPC::V2, PPC::V3, PPC::V4, PPC::V5,
+ PPC::V6, PPC::V7, PPC::V8, PPC::V9,
+ PPC::V10, PPC::V11, PPC::V12, PPC::V13};
+
if (ArgFlags.isByVal()) {
if (ArgFlags.getNonZeroByValAlign() > PtrAlign)
report_fatal_error("Pass-by-value arguments with alignment greater than "
@@ -7116,7 +6334,7 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
case MVT::i32: {
const unsigned Offset = State.AllocateStack(PtrAlign.value(), PtrAlign);
// AIX integer arguments are always passed in register width.
- if (ValVT.getSizeInBits() < RegVT.getSizeInBits())
+ if (ValVT.getFixedSizeInBits() < RegVT.getFixedSizeInBits())
LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt
: CCValAssign::LocInfo::ZExt;
if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32))
@@ -7167,6 +6385,25 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
return false;
}
+ case MVT::v4f32:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ case MVT::v2i64:
+ case MVT::v2f64:
+ case MVT::v1i128: {
+ if (State.isVarArg())
+ report_fatal_error(
+ "variadic arguments for vector types are unimplemented for AIX");
+
+ if (unsigned VReg = State.AllocateReg(VR))
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, VReg, LocVT, LocInfo));
+ else {
+ report_fatal_error(
+ "passing vector parameters to the stack is unimplemented for AIX");
+ }
+ return false;
+ }
}
return true;
}
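
A minimal sketch of the register-assignment behavior the new vector cases above add to CC_AIX: vector arguments take V2..V13 in order, and a vector argument with no register left (or a variadic vector argument) is still rejected. This is illustrative C++ only, not the CCState API; the register numbering is the only detail carried over from the table above.

// Illustrative sketch only: models the order in which the CC_AIX vector cases
// above hand out Altivec argument registers (V2..V13) and the "no register
// left" condition that the real lowering reports as a fatal error.
#include <cassert>
#include <optional>
#include <vector>

static const unsigned VR[] = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13};

struct SimpleCCState {
  unsigned NextVR = 0; // index into VR
  std::optional<unsigned> allocateVReg() {
    if (NextVR == sizeof(VR) / sizeof(VR[0]))
      return std::nullopt; // real code: report_fatal_error(...)
    return VR[NextVR++];
  }
};

int main() {
  SimpleCCState State;
  std::vector<unsigned> Assigned;
  for (int i = 0; i < 12; ++i)
    Assigned.push_back(*State.allocateVReg()); // V2..V13 in order
  assert(Assigned.front() == 2 && Assigned.back() == 13);
  assert(!State.allocateVReg().has_value()); // 13th vector argument: unsupported
  return 0;
}
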
@@ -7187,6 +6424,14 @@ static const TargetRegisterClass *getRegClassForSVT(MVT::SimpleValueType SVT,
return &PPC::F4RCRegClass;
case MVT::f64:
return &PPC::F8RCRegClass;
+ case MVT::v4f32:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ case MVT::v2i64:
+ case MVT::v2f64:
+ case MVT::v1i128:
+ return &PPC::VRRCRegClass;
}
}
@@ -7194,7 +6439,7 @@ static SDValue truncateScalarIntegerArg(ISD::ArgFlagsTy Flags, EVT ValVT,
SelectionDAG &DAG, SDValue ArgValue,
MVT LocVT, const SDLoc &dl) {
assert(ValVT.isScalarInteger() && LocVT.isScalarInteger());
- assert(ValVT.getSizeInBits() < LocVT.getSizeInBits());
+ assert(ValVT.getFixedSizeInBits() < LocVT.getFixedSizeInBits());
if (Flags.isSExt())
ArgValue = DAG.getNode(ISD::AssertSext, dl, LocVT, ArgValue,
@@ -7281,8 +6526,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
const PPCSubtarget &Subtarget =
static_cast<const PPCSubtarget &>(DAG.getSubtarget());
- if (Subtarget.hasQPX())
- report_fatal_error("QPX support is not supported on AIX.");
const bool IsPPC64 = Subtarget.isPPC64();
const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
@@ -7291,6 +6534,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
SmallVector<CCValAssign, 16> ArgLocs;
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
+ PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
const EVT PtrVT = getPointerTy(MF.getDataLayout());
@@ -7305,6 +6549,9 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
CCValAssign &VA = ArgLocs[I++];
MVT LocVT = VA.getLocVT();
ISD::ArgFlagsTy Flags = Ins[VA.getValNo()].Flags;
+ if (VA.isMemLoc() && VA.getValVT().isVector())
+ report_fatal_error(
+ "passing vector parameters to the stack is unimplemented for AIX");
// For compatibility with the AIX XL compiler, the float args in the
// parameter save area are initialized even if the argument is available
@@ -7315,6 +6562,15 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
if (VA.isMemLoc() && VA.needsCustom())
continue;
+ if (VA.isRegLoc()) {
+ if (VA.getValVT().isScalarInteger())
+ FuncInfo->appendParameterType(PPCFunctionInfo::FixedType);
+ else if (VA.getValVT().isFloatingPoint() && !VA.getValVT().isVector())
+ FuncInfo->appendParameterType(VA.getValVT().SimpleTy == MVT::f32
+ ? PPCFunctionInfo::ShortFloatPoint
+ : PPCFunctionInfo::LongFloatPoint);
+ }
+
if (Flags.isByVal() && VA.isMemLoc()) {
const unsigned Size =
alignTo(Flags.getByValSize() ? Flags.getByValSize() : PtrByteSize,
@@ -7360,10 +6616,10 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
// to extracting the value from the register directly, and elide the
// stores when the arguments address is not taken, but that will need to
// be future work.
- SDValue Store =
- DAG.getStore(CopyFrom.getValue(1), dl, CopyFrom,
- DAG.getObjectPtrOffset(dl, FIN, Offset),
- MachinePointerInfo::getFixedStack(MF, FI, Offset));
+ SDValue Store = DAG.getStore(
+ CopyFrom.getValue(1), dl, CopyFrom,
+ DAG.getObjectPtrOffset(dl, FIN, TypeSize::Fixed(Offset)),
+ MachinePointerInfo::getFixedStack(MF, FI, Offset));
MemOps.push_back(Store);
};
@@ -7378,6 +6634,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
const CCValAssign RL = ArgLocs[I++];
HandleRegLoc(RL.getLocReg(), Offset);
+ FuncInfo->appendParameterType(PPCFunctionInfo::FixedType);
}
if (Offset != StackSize) {
@@ -7399,7 +6656,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
MF.addLiveIn(VA.getLocReg(), getRegClassForSVT(SVT, IsPPC64));
SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
if (ValVT.isScalarInteger() &&
- (ValVT.getSizeInBits() < LocVT.getSizeInBits())) {
+ (ValVT.getFixedSizeInBits() < LocVT.getFixedSizeInBits())) {
ArgValue =
truncateScalarIntegerArg(Flags, ValVT, DAG, ArgValue, LocVT, dl);
}
@@ -7440,7 +6697,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
// aligned stack.
CallerReservedArea =
EnsureStackAlignment(Subtarget.getFrameLowering(), CallerReservedArea);
- PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
FuncInfo->setMinReservedArea(CallerReservedArea);
if (isVarArg) {
@@ -7502,10 +6758,6 @@ SDValue PPCTargetLowering::LowerCall_AIX(
const PPCSubtarget& Subtarget =
static_cast<const PPCSubtarget&>(DAG.getSubtarget());
- if (Subtarget.hasQPX())
- report_fatal_error("QPX is not supported on AIX.");
- if (Subtarget.hasAltivec())
- report_fatal_error("Altivec support is unimplemented on AIX.");
MachineFunction &MF = DAG.getMachineFunction();
SmallVector<CCValAssign, 16> ArgLocs;
@@ -7562,11 +6814,12 @@ SDValue PPCTargetLowering::LowerCall_AIX(
}
auto GetLoad = [&](EVT VT, unsigned LoadOffset) {
- return DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain,
- (LoadOffset != 0)
- ? DAG.getObjectPtrOffset(dl, Arg, LoadOffset)
- : Arg,
- MachinePointerInfo(), VT);
+ return DAG.getExtLoad(
+ ISD::ZEXTLOAD, dl, PtrVT, Chain,
+ (LoadOffset != 0)
+ ? DAG.getObjectPtrOffset(dl, Arg, TypeSize::Fixed(LoadOffset))
+ : Arg,
+ MachinePointerInfo(), VT);
};
unsigned LoadOffset = 0;
@@ -7596,9 +6849,11 @@ SDValue PPCTargetLowering::LowerCall_AIX(
// Only memcpy the bytes that don't pass in register.
MemcpyFlags.setByValSize(ByValSize - LoadOffset);
Chain = CallSeqStart = createMemcpyOutsideCallSeq(
- (LoadOffset != 0) ? DAG.getObjectPtrOffset(dl, Arg, LoadOffset)
- : Arg,
- DAG.getObjectPtrOffset(dl, StackPtr, ByValVA.getLocMemOffset()),
+ (LoadOffset != 0)
+ ? DAG.getObjectPtrOffset(dl, Arg, TypeSize::Fixed(LoadOffset))
+ : Arg,
+ DAG.getObjectPtrOffset(dl, StackPtr,
+ TypeSize::Fixed(ByValVA.getLocMemOffset())),
CallSeqStart, MemcpyFlags, DAG, dl);
continue;
}
@@ -7648,6 +6903,10 @@ SDValue PPCTargetLowering::LowerCall_AIX(
const MVT LocVT = VA.getLocVT();
const MVT ValVT = VA.getValVT();
+ if (VA.isMemLoc() && VA.getValVT().isVector())
+ report_fatal_error(
+ "passing vector parameters to the stack is unimplemented for AIX");
+
switch (VA.getLocInfo()) {
default:
report_fatal_error("Unexpected argument extension type.");
@@ -7689,7 +6948,8 @@ SDValue PPCTargetLowering::LowerCall_AIX(
// f32 in 32-bit GPR
// f64 in 64-bit GPR
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgAsInt));
- else if (Arg.getValueType().getSizeInBits() < LocVT.getSizeInBits())
+ else if (Arg.getValueType().getFixedSizeInBits() <
+ LocVT.getFixedSizeInBits())
// f32 in 64-bit GPR.
RegsToPass.push_back(std::make_pair(
VA.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, LocVT)));
@@ -8048,20 +7308,45 @@ SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op,
// <uu, uu, uu, uu, uu, uu, LSB2|MSB2, LSB1|MSB1> to
// <u, u, u, u, u, u, u, u, u, u, u, u, u, u, LSB2, LSB1>
- assert(Op.getValueType().isVector() && "Vector type expected.");
-
- SDLoc DL(Op);
- SDValue N1 = Op.getOperand(0);
- unsigned SrcSize = N1.getValueType().getSizeInBits();
- assert(SrcSize <= 128 && "Source must fit in an Altivec/VSX vector");
- SDValue WideSrc = SrcSize == 128 ? N1 : widenVec(DAG, N1, DL);
-
EVT TrgVT = Op.getValueType();
+ assert(TrgVT.isVector() && "Vector type expected.");
unsigned TrgNumElts = TrgVT.getVectorNumElements();
EVT EltVT = TrgVT.getVectorElementType();
+ if (!isOperationCustom(Op.getOpcode(), TrgVT) ||
+ TrgVT.getSizeInBits() > 128 || !isPowerOf2_32(TrgNumElts) ||
+ !isPowerOf2_32(EltVT.getSizeInBits()))
+ return SDValue();
+
+ SDValue N1 = Op.getOperand(0);
+ EVT SrcVT = N1.getValueType();
+ unsigned SrcSize = SrcVT.getSizeInBits();
+ if (SrcSize > 256 ||
+ !isPowerOf2_32(SrcVT.getVectorNumElements()) ||
+ !isPowerOf2_32(SrcVT.getVectorElementType().getSizeInBits()))
+ return SDValue();
+ if (SrcSize == 256 && SrcVT.getVectorNumElements() < 2)
+ return SDValue();
+
unsigned WideNumElts = 128 / EltVT.getSizeInBits();
EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts);
+ SDLoc DL(Op);
+ SDValue Op1, Op2;
+ if (SrcSize == 256) {
+ EVT VecIdxTy = getVectorIdxTy(DAG.getDataLayout());
+ EVT SplitVT =
+ N1.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
+ unsigned SplitNumElts = SplitVT.getVectorNumElements();
+ Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, N1,
+ DAG.getConstant(0, DL, VecIdxTy));
+ Op2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, N1,
+ DAG.getConstant(SplitNumElts, DL, VecIdxTy));
+ }
+ else {
+ Op1 = SrcSize == 128 ? N1 : widenVec(DAG, N1, DL);
+ Op2 = DAG.getUNDEF(WideVT);
+ }
+
// First list the elements we want to keep.
unsigned SizeMult = SrcSize / TrgVT.getSizeInBits();
SmallVector<int, 16> ShuffV;
@@ -8077,16 +7362,17 @@ SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op,
// ShuffV.push_back(i + WideNumElts);
ShuffV.push_back(WideNumElts + 1);
- SDValue Conv = DAG.getNode(ISD::BITCAST, DL, WideVT, WideSrc);
- return DAG.getVectorShuffle(WideVT, DL, Conv, DAG.getUNDEF(WideVT), ShuffV);
+ Op1 = DAG.getNode(ISD::BITCAST, DL, WideVT, Op1);
+ Op2 = DAG.getNode(ISD::BITCAST, DL, WideVT, Op2);
+ return DAG.getVectorShuffle(WideVT, DL, Op1, Op2, ShuffV);
}
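
The comment above describes this truncation as a shuffle that keeps only the low (LSB) parts of each wide element, now applied to two bitcast operands so that 256-bit sources can be split and handled as well. The standalone sketch below is illustrative only and assumes a little-endian host; it does not use the LLVM shuffle machinery and is not the mask computation elided from this hunk, just the underlying idea of truncating by gathering low bytes.

// Illustrative sketch only: vector truncation expressed as a byte shuffle.
// Assumes a little-endian host, where byte 0 of each 4-byte element is its
// least significant byte, so a v4i32 -> v4i8 truncate is the mask {0,4,8,12}.
#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  std::array<uint32_t, 4> Src = {0x11223344u, 0x55667788u, 0x99aabbccu,
                                 0xddeeff00u};
  std::array<uint8_t, 16> Bytes{};
  std::memcpy(Bytes.data(), Src.data(), Bytes.size());

  std::array<int, 4> ShuffleMask = {0, 4, 8, 12}; // keep the low byte of each element
  std::array<uint8_t, 4> Trunc{};
  for (int i = 0; i < 4; ++i)
    Trunc[i] = Bytes[ShuffleMask[i]];

  // The shuffle result matches an ordinary integer truncation of each element.
  for (int i = 0; i < 4; ++i)
    assert(Trunc[i] == static_cast<uint8_t>(Src[i]));
  return 0;
}
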
/// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when
/// possible.
SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
- // Not FP? Not a fsel.
+ // Not FP, or using SPE? Not a fsel.
if (!Op.getOperand(0).getValueType().isFloatingPoint() ||
- !Op.getOperand(2).getValueType().isFloatingPoint())
+ !Op.getOperand(2).getValueType().isFloatingPoint() || Subtarget.hasSPE())
return Op;
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
@@ -8202,54 +7488,105 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
return Op;
}
-void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
- SelectionDAG &DAG,
- const SDLoc &dl) const {
- assert(Op.getOperand(0).getValueType().isFloatingPoint());
- SDValue Src = Op.getOperand(0);
- if (Src.getValueType() == MVT::f32)
- Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
-
- SDValue Tmp;
+static unsigned getPPCStrictOpcode(unsigned Opc) {
+ switch (Opc) {
+ default:
+ llvm_unreachable("No strict version of this opcode!");
+ case PPCISD::FCTIDZ:
+ return PPCISD::STRICT_FCTIDZ;
+ case PPCISD::FCTIWZ:
+ return PPCISD::STRICT_FCTIWZ;
+ case PPCISD::FCTIDUZ:
+ return PPCISD::STRICT_FCTIDUZ;
+ case PPCISD::FCTIWUZ:
+ return PPCISD::STRICT_FCTIWUZ;
+ case PPCISD::FCFID:
+ return PPCISD::STRICT_FCFID;
+ case PPCISD::FCFIDU:
+ return PPCISD::STRICT_FCFIDU;
+ case PPCISD::FCFIDS:
+ return PPCISD::STRICT_FCFIDS;
+ case PPCISD::FCFIDUS:
+ return PPCISD::STRICT_FCFIDUS;
+ }
+}
+
+static SDValue convertFPToInt(SDValue Op, SelectionDAG &DAG,
+ const PPCSubtarget &Subtarget) {
+ SDLoc dl(Op);
+ bool IsStrict = Op->isStrictFPOpcode();
+ bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
+ Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
+
+ // TODO: Any other flags to propagate?
+ SDNodeFlags Flags;
+ Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());
+
+ // For strict nodes, source is the second operand.
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ assert(Src.getValueType().isFloatingPoint());
+ if (Src.getValueType() == MVT::f32) {
+ if (IsStrict) {
+ Src =
+ DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
+ DAG.getVTList(MVT::f64, MVT::Other), {Chain, Src}, Flags);
+ Chain = Src.getValue(1);
+ } else
+ Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
+ }
+ SDValue Conv;
+ unsigned Opc = ISD::DELETED_NODE;
switch (Op.getSimpleValueType().SimpleTy) {
default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
case MVT::i32:
- Tmp = DAG.getNode(
- Op.getOpcode() == ISD::FP_TO_SINT
- ? PPCISD::FCTIWZ
- : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ),
- dl, MVT::f64, Src);
+ Opc = IsSigned ? PPCISD::FCTIWZ
+ : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ);
break;
case MVT::i64:
- assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) &&
+ assert((IsSigned || Subtarget.hasFPCVT()) &&
"i64 FP_TO_UINT is supported only with FPCVT");
- Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
- PPCISD::FCTIDUZ,
- dl, MVT::f64, Src);
- break;
+ Opc = IsSigned ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ;
+ }
+ if (IsStrict) {
+ Opc = getPPCStrictOpcode(Opc);
+ Conv = DAG.getNode(Opc, dl, DAG.getVTList(MVT::f64, MVT::Other),
+ {Chain, Src}, Flags);
+ } else {
+ Conv = DAG.getNode(Opc, dl, MVT::f64, Src);
}
+ return Conv;
+}
+
+void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
+ SelectionDAG &DAG,
+ const SDLoc &dl) const {
+ SDValue Tmp = convertFPToInt(Op, DAG, Subtarget);
+ bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
+ Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
+ bool IsStrict = Op->isStrictFPOpcode();
// Convert the FP value to an int value through memory.
bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() &&
- (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT());
+ (IsSigned || Subtarget.hasFPCVT());
SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64);
int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
// Emit a store to the stack slot.
- SDValue Chain;
+ SDValue Chain = IsStrict ? Tmp.getValue(1) : DAG.getEntryNode();
Align Alignment(DAG.getEVTAlign(Tmp.getValueType()));
if (i32Stack) {
MachineFunction &MF = DAG.getMachineFunction();
Alignment = Align(4);
MachineMemOperand *MMO =
MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, Alignment);
- SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr };
+ SDValue Ops[] = { Chain, Tmp, FIPtr };
Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO);
} else
- Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, MPI, Alignment);
+ Chain = DAG.getStore(Chain, dl, Tmp, FIPtr, MPI, Alignment);
// Result is a load from the stack slot. If loading 4 bytes, make sure to
// add in a bias on big endian.
@@ -8271,76 +7608,100 @@ void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op,
SelectionDAG &DAG,
const SDLoc &dl) const {
- assert(Op.getOperand(0).getValueType().isFloatingPoint());
- SDValue Src = Op.getOperand(0);
-
- if (Src.getValueType() == MVT::f32)
- Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
-
- SDValue Tmp;
- switch (Op.getSimpleValueType().SimpleTy) {
- default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
- case MVT::i32:
- Tmp = DAG.getNode(
- Op.getOpcode() == ISD::FP_TO_SINT
- ? PPCISD::FCTIWZ
- : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ),
- dl, MVT::f64, Src);
- Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i32, Tmp);
- break;
- case MVT::i64:
- assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) &&
- "i64 FP_TO_UINT is supported only with FPCVT");
- Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
- PPCISD::FCTIDUZ,
- dl, MVT::f64, Src);
- Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i64, Tmp);
- break;
- }
- return Tmp;
+ SDValue Conv = convertFPToInt(Op, DAG, Subtarget);
+ SDValue Mov = DAG.getNode(PPCISD::MFVSR, dl, Op.getValueType(), Conv);
+ if (Op->isStrictFPOpcode())
+ return DAG.getMergeValues({Mov, Conv.getValue(1)}, dl);
+ else
+ return Mov;
}
SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
const SDLoc &dl) const {
+ bool IsStrict = Op->isStrictFPOpcode();
+ bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
+ Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
+ EVT SrcVT = Src.getValueType();
+ EVT DstVT = Op.getValueType();
// FP to INT conversions are legal for f128.
- if (Op->getOperand(0).getValueType() == MVT::f128)
- return Op;
+ if (SrcVT == MVT::f128)
+ return Subtarget.hasP9Vector() ? Op : SDValue();
// Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
// PPC (the libcall is not available).
- if (Op.getOperand(0).getValueType() == MVT::ppcf128) {
- if (Op.getValueType() == MVT::i32) {
- if (Op.getOpcode() == ISD::FP_TO_SINT) {
- SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
- MVT::f64, Op.getOperand(0),
+ if (SrcVT == MVT::ppcf128) {
+ if (DstVT == MVT::i32) {
+ // TODO: Conservatively pass only nofpexcept flag here. Need to check and
+ // set other fast-math flags on FP operations in both strict and
+ // non-strict cases. (FP_TO_SINT, FSUB)
+ SDNodeFlags Flags;
+ Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());
+
+ if (IsSigned) {
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::f64, Src,
DAG.getIntPtrConstant(0, dl));
- SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
- MVT::f64, Op.getOperand(0),
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::f64, Src,
DAG.getIntPtrConstant(1, dl));
- // Add the two halves of the long double in round-to-zero mode.
- SDValue Res = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi);
-
- // Now use a smaller FP_TO_SINT.
- return DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Res);
- }
- if (Op.getOpcode() == ISD::FP_TO_UINT) {
+ // Add the two halves of the long double in round-to-zero mode, and use
+ // a smaller FP_TO_SINT.
+ if (IsStrict) {
+ SDValue Res = DAG.getNode(PPCISD::STRICT_FADDRTZ, dl,
+ DAG.getVTList(MVT::f64, MVT::Other),
+ {Op.getOperand(0), Lo, Hi}, Flags);
+ return DAG.getNode(ISD::STRICT_FP_TO_SINT, dl,
+ DAG.getVTList(MVT::i32, MVT::Other),
+ {Res.getValue(1), Res}, Flags);
+ } else {
+ SDValue Res = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi);
+ return DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Res);
+ }
+ } else {
const uint64_t TwoE31[] = {0x41e0000000000000LL, 0};
APFloat APF = APFloat(APFloat::PPCDoubleDouble(), APInt(128, TwoE31));
- SDValue Tmp = DAG.getConstantFP(APF, dl, MVT::ppcf128);
- // X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X
- // FIXME: generated code sucks.
- // TODO: Are there fast-math-flags to propagate to this FSUB?
- SDValue True = DAG.getNode(ISD::FSUB, dl, MVT::ppcf128,
- Op.getOperand(0), Tmp);
- True = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, True);
- True = DAG.getNode(ISD::ADD, dl, MVT::i32, True,
- DAG.getConstant(0x80000000, dl, MVT::i32));
- SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32,
- Op.getOperand(0));
- return DAG.getSelectCC(dl, Op.getOperand(0), Tmp, True, False,
- ISD::SETGE);
+ SDValue Cst = DAG.getConstantFP(APF, dl, SrcVT);
+ SDValue SignMask = DAG.getConstant(0x80000000, dl, DstVT);
+ if (IsStrict) {
+ // Sel = Src < 0x80000000
+ // FltOfs = select Sel, 0.0, 0x80000000
+ // IntOfs = select Sel, 0, 0x80000000
+ // Result = fp_to_sint(Src - FltOfs) ^ IntOfs
+ SDValue Chain = Op.getOperand(0);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT);
+ EVT DstSetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), DstVT);
+ SDValue Sel = DAG.getSetCC(dl, SetCCVT, Src, Cst, ISD::SETLT,
+ Chain, true);
+ Chain = Sel.getValue(1);
+
+ SDValue FltOfs = DAG.getSelect(
+ dl, SrcVT, Sel, DAG.getConstantFP(0.0, dl, SrcVT), Cst);
+ Sel = DAG.getBoolExtOrTrunc(Sel, dl, DstSetCCVT, DstVT);
+
+ SDValue Val = DAG.getNode(ISD::STRICT_FSUB, dl,
+ DAG.getVTList(SrcVT, MVT::Other),
+ {Chain, Src, FltOfs}, Flags);
+ Chain = Val.getValue(1);
+ SDValue SInt = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl,
+ DAG.getVTList(DstVT, MVT::Other),
+ {Chain, Val}, Flags);
+ Chain = SInt.getValue(1);
+ SDValue IntOfs = DAG.getSelect(
+ dl, DstVT, Sel, DAG.getConstant(0, dl, DstVT), SignMask);
+ SDValue Result = DAG.getNode(ISD::XOR, dl, DstVT, SInt, IntOfs);
+ return DAG.getMergeValues({Result, Chain}, dl);
+ } else {
+ // X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X
+ // FIXME: generated code sucks.
+ SDValue True = DAG.getNode(ISD::FSUB, dl, MVT::ppcf128, Src, Cst);
+ True = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, True);
+ True = DAG.getNode(ISD::ADD, dl, MVT::i32, True, SignMask);
+ SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
+ return DAG.getSelectCC(dl, Src, Cst, True, False, ISD::SETGE);
+ }
}
}
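
For reference, the branch-free trick used by the strict path above (select FltOfs and IntOfs, subtract, convert signed, then XOR) can be checked on the host. The sketch below is illustrative only: it substitutes double for ppcf128 and plain C++ casts for the STRICT_FSUB/STRICT_FP_TO_SINT nodes, so it demonstrates the arithmetic identity rather than the DAG lowering itself.

// Illustrative sketch only: unsigned fp-to-i32 via a signed conversion.
//   Sel    = Src < 2^31
//   FltOfs = Sel ? 0.0 : 2^31
//   IntOfs = Sel ? 0   : 0x80000000
//   Result = (int32)(Src - FltOfs) ^ IntOfs
#include <cassert>
#include <cstdint>

static uint32_t fpToUint32ViaSigned(double Src) {
  const double Cst = 2147483648.0;          // 2^31
  bool Sel = Src < Cst;                     // already fits in the signed range?
  double FltOfs = Sel ? 0.0 : Cst;          // pull large values into signed range
  uint32_t IntOfs = Sel ? 0u : 0x80000000u; // and add 2^31 back in the integer domain
  int32_t SInt = static_cast<int32_t>(Src - FltOfs);
  return static_cast<uint32_t>(SInt) ^ IntOfs;
}

int main() {
  assert(fpToUint32ViaSigned(7.0) == 7u);
  assert(fpToUint32ViaSigned(2147483648.0) == 2147483648u);
  assert(fpToUint32ViaSigned(4294967040.0) == 4294967040u);
  return 0;
}
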
@@ -8369,6 +7730,10 @@ bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
ReuseLoadInfo &RLI,
SelectionDAG &DAG,
ISD::LoadExtType ET) const {
+ // Conservatively skip reusing for constrained FP nodes.
+ if (Op->isStrictFPOpcode())
+ return false;
+
SDLoc dl(Op);
bool ValidFPToUint = Op.getOpcode() == ISD::FP_TO_UINT &&
(Subtarget.hasFPCVT() || Op.getValueType() == MVT::i32);
@@ -8388,6 +7753,13 @@ bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
if (LD->getMemoryVT() != MemVT)
return false;
+ // If the result of the load is an illegal type, then we can't build a
+ // valid chain for reuse since the legalised loads and token factor node that
+ // ties the legalised loads together use a different output chain than the
+ // illegal load.
+ if (!isTypeLegal(LD->getValueType(0)))
+ return false;
+
RLI.Ptr = LD->getBasePtr();
if (LD->isIndexed() && !LD->getOffset().isUndef()) {
assert(LD->getAddressingMode() == ISD::PRE_INC &&
@@ -8452,13 +7824,41 @@ bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const {
continue;
if (UI->getOpcode() != ISD::SINT_TO_FP &&
- UI->getOpcode() != ISD::UINT_TO_FP)
+ UI->getOpcode() != ISD::UINT_TO_FP &&
+ UI->getOpcode() != ISD::STRICT_SINT_TO_FP &&
+ UI->getOpcode() != ISD::STRICT_UINT_TO_FP)
return true;
}
return false;
}
+static SDValue convertIntToFP(SDValue Op, SDValue Src, SelectionDAG &DAG,
+ const PPCSubtarget &Subtarget,
+ SDValue Chain = SDValue()) {
+ bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP ||
+ Op.getOpcode() == ISD::STRICT_SINT_TO_FP;
+ SDLoc dl(Op);
+
+ // TODO: Any other flags to propagate?
+ SDNodeFlags Flags;
+ Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());
+
+ // If we have FCFIDS, then use it when converting to single-precision.
+ // Otherwise, convert to double-precision and then round.
+ bool IsSingle = Op.getValueType() == MVT::f32 && Subtarget.hasFPCVT();
+ unsigned ConvOpc = IsSingle ? (IsSigned ? PPCISD::FCFIDS : PPCISD::FCFIDUS)
+ : (IsSigned ? PPCISD::FCFID : PPCISD::FCFIDU);
+ EVT ConvTy = IsSingle ? MVT::f32 : MVT::f64;
+ if (Op->isStrictFPOpcode()) {
+ if (!Chain)
+ Chain = Op.getOperand(0);
+ return DAG.getNode(getPPCStrictOpcode(ConvOpc), dl,
+ DAG.getVTList(ConvTy, MVT::Other), {Chain, Src}, Flags);
+ } else
+ return DAG.getNode(ConvOpc, dl, ConvTy, Src);
+}
+
/// Custom lowers integer to floating point conversions to use
/// the direct move instructions available in ISA 2.07 to avoid the
/// need for load/store combinations.
@@ -8470,25 +7870,13 @@ SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op,
"Invalid floating point type as target of conversion");
assert(Subtarget.hasFPCVT() &&
"Int to FP conversions with direct moves require FPCVT");
- SDValue FP;
- SDValue Src = Op.getOperand(0);
- bool SinglePrec = Op.getValueType() == MVT::f32;
+ SDValue Src = Op.getOperand(Op->isStrictFPOpcode() ? 1 : 0);
bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32;
- bool Signed = Op.getOpcode() == ISD::SINT_TO_FP;
- unsigned ConvOp = Signed ? (SinglePrec ? PPCISD::FCFIDS : PPCISD::FCFID) :
- (SinglePrec ? PPCISD::FCFIDUS : PPCISD::FCFIDU);
-
- if (WordInt) {
- FP = DAG.getNode(Signed ? PPCISD::MTVSRA : PPCISD::MTVSRZ,
- dl, MVT::f64, Src);
- FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP);
- }
- else {
- FP = DAG.getNode(PPCISD::MTVSRA, dl, MVT::f64, Src);
- FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP);
- }
-
- return FP;
+ bool Signed = Op.getOpcode() == ISD::SINT_TO_FP ||
+ Op.getOpcode() == ISD::STRICT_SINT_TO_FP;
+ unsigned MovOpc = (WordInt && !Signed) ? PPCISD::MTVSRZ : PPCISD::MTVSRA;
+ SDValue Mov = DAG.getNode(MovOpc, dl, MVT::f64, Src);
+ return convertIntToFP(Op, Mov, DAG, Subtarget);
}
static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl) {
@@ -8513,17 +7901,23 @@ static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl) {
SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG,
const SDLoc &dl) const {
-
+ bool IsStrict = Op->isStrictFPOpcode();
unsigned Opc = Op.getOpcode();
- assert((Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP) &&
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
+ assert((Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP ||
+ Opc == ISD::STRICT_UINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP) &&
"Unexpected conversion type");
assert((Op.getValueType() == MVT::v2f64 || Op.getValueType() == MVT::v4f32) &&
"Supports conversions to v2f64/v4f32 only.");
- bool SignedConv = Opc == ISD::SINT_TO_FP;
+ // TODO: Any other flags to propagate?
+ SDNodeFlags Flags;
+ Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());
+
+ bool SignedConv = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
bool FourEltRes = Op.getValueType() == MVT::v4f32;
- SDValue Wide = widenVec(DAG, Op.getOperand(0), dl);
+ SDValue Wide = widenVec(DAG, Src, dl);
EVT WideVT = Wide.getValueType();
unsigned WideNumElts = WideVT.getVectorNumElements();
MVT IntermediateVT = FourEltRes ? MVT::v4i32 : MVT::v2i64;
@@ -8548,7 +7942,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG,
SDValue Extend;
if (SignedConv) {
Arrange = DAG.getBitcast(IntermediateVT, Arrange);
- EVT ExtVT = Op.getOperand(0).getValueType();
+ EVT ExtVT = Src.getValueType();
if (Subtarget.hasP9Altivec())
ExtVT = EVT::getVectorVT(*DAG.getContext(), WideVT.getVectorElementType(),
IntermediateVT.getVectorNumElements());
@@ -8558,14 +7952,27 @@ SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG,
} else
Extend = DAG.getNode(ISD::BITCAST, dl, IntermediateVT, Arrange);
+ if (IsStrict)
+ return DAG.getNode(Opc, dl, DAG.getVTList(Op.getValueType(), MVT::Other),
+ {Op.getOperand(0), Extend}, Flags);
+
return DAG.getNode(Opc, dl, Op.getValueType(), Extend);
}
SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
+ bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP ||
+ Op.getOpcode() == ISD::STRICT_SINT_TO_FP;
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
- EVT InVT = Op.getOperand(0).getValueType();
+ // TODO: Any other flags to propagate?
+ SDNodeFlags Flags;
+ Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());
+
+ EVT InVT = Src.getValueType();
EVT OutVT = Op.getValueType();
if (OutVT.isVector() && OutVT.isFloatingPoint() &&
isOperationCustom(Op.getOpcode(), InVT))
@@ -8573,37 +7980,21 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
// Conversions to f128 are legal.
if (Op.getValueType() == MVT::f128)
- return Op;
-
- if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) {
- if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64)
- return SDValue();
-
- SDValue Value = Op.getOperand(0);
- // The values are now known to be -1 (false) or 1 (true). To convert this
- // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
- // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
- Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
-
- SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
-
- Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
-
- if (Op.getValueType() != MVT::v4f64)
- Value = DAG.getNode(ISD::FP_ROUND, dl,
- Op.getValueType(), Value,
- DAG.getIntPtrConstant(1, dl));
- return Value;
- }
+ return Subtarget.hasP9Vector() ? Op : SDValue();
// Don't handle ppc_fp128 here; let it be lowered to a libcall.
if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
return SDValue();
- if (Op.getOperand(0).getValueType() == MVT::i1)
- return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0),
- DAG.getConstantFP(1.0, dl, Op.getValueType()),
- DAG.getConstantFP(0.0, dl, Op.getValueType()));
+ if (Src.getValueType() == MVT::i1) {
+ SDValue Sel = DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Src,
+ DAG.getConstantFP(1.0, dl, Op.getValueType()),
+ DAG.getConstantFP(0.0, dl, Op.getValueType()));
+ if (IsStrict)
+ return DAG.getMergeValues({Sel, Chain}, dl);
+ else
+ return Sel;
+ }
// If we have direct moves, we can do all the conversion, skip the store/load
// however, without FPCVT we can't do most conversions.
@@ -8611,22 +8002,11 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
Subtarget.isPPC64() && Subtarget.hasFPCVT())
return LowerINT_TO_FPDirectMove(Op, DAG, dl);
- assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
+ assert((IsSigned || Subtarget.hasFPCVT()) &&
"UINT_TO_FP is supported only with FPCVT");
- // If we have FCFIDS, then use it when converting to single-precision.
- // Otherwise, convert to double-precision and then round.
- unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
- ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS
- : PPCISD::FCFIDS)
- : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU
- : PPCISD::FCFID);
- MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
- ? MVT::f32
- : MVT::f64;
-
- if (Op.getOperand(0).getValueType() == MVT::i64) {
- SDValue SINT = Op.getOperand(0);
+ if (Src.getValueType() == MVT::i64) {
+ SDValue SINT = Src;
// When converting to single-precision, we actually need to convert
// to double-precision first and then round to single-precision.
// To avoid double-rounding effects during that operation, we have
@@ -8714,16 +8094,16 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
int FrameIdx = MFI.CreateStackObject(4, Align(4), false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
- SDValue Store =
- DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx,
- MachinePointerInfo::getFixedStack(
- DAG.getMachineFunction(), FrameIdx));
+ SDValue Store = DAG.getStore(Chain, dl, SINT.getOperand(0), FIdx,
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), FrameIdx));
+ Chain = Store;
assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
"Expected an i32 store");
RLI.Ptr = FIdx;
- RLI.Chain = Store;
+ RLI.Chain = Chain;
RLI.MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
RLI.Alignment = Align(4);
@@ -8736,18 +8116,27 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
PPCISD::LFIWZX : PPCISD::LFIWAX,
dl, DAG.getVTList(MVT::f64, MVT::Other),
Ops, MVT::i32, MMO);
+ Chain = Bits.getValue(1);
} else
Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);
- SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits);
+ SDValue FP = convertIntToFP(Op, Bits, DAG, Subtarget, Chain);
+ if (IsStrict)
+ Chain = FP.getValue(1);
- if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
- FP = DAG.getNode(ISD::FP_ROUND, dl,
- MVT::f32, FP, DAG.getIntPtrConstant(0, dl));
+ if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
+ if (IsStrict)
+ FP = DAG.getNode(ISD::STRICT_FP_ROUND, dl,
+ DAG.getVTList(MVT::f32, MVT::Other),
+ {Chain, FP, DAG.getIntPtrConstant(0, dl)}, Flags);
+ else
+ FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP,
+ DAG.getIntPtrConstant(0, dl));
+ }
return FP;
}
- assert(Op.getOperand(0).getValueType() == MVT::i32 &&
+ assert(Src.getValueType() == MVT::i32 &&
"Unhandled INT_TO_FP type in custom expander!");
// Since we only generate this in 64-bit mode, we can take advantage of
// 64-bit registers. In particular, sign extend the input value into the
@@ -8761,21 +8150,20 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) {
ReuseLoadInfo RLI;
bool ReusingLoad;
- if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI,
- DAG))) {
+ if (!(ReusingLoad = canReuseLoadAddress(Src, MVT::i32, RLI, DAG))) {
int FrameIdx = MFI.CreateStackObject(4, Align(4), false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
- SDValue Store =
- DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
- MachinePointerInfo::getFixedStack(
- DAG.getMachineFunction(), FrameIdx));
+ SDValue Store = DAG.getStore(Chain, dl, Src, FIdx,
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), FrameIdx));
+ Chain = Store;
assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
"Expected an i32 store");
RLI.Ptr = FIdx;
- RLI.Chain = Store;
+ RLI.Chain = Chain;
RLI.MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
RLI.Alignment = Align(4);
@@ -8785,10 +8173,10 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
RLI.Alignment, RLI.AAInfo, RLI.Ranges);
SDValue Ops[] = { RLI.Chain, RLI.Ptr };
- Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ?
- PPCISD::LFIWZX : PPCISD::LFIWAX,
- dl, DAG.getVTList(MVT::f64, MVT::Other),
- Ops, MVT::i32, MMO);
+ Ld = DAG.getMemIntrinsicNode(IsSigned ? PPCISD::LFIWAX : PPCISD::LFIWZX, dl,
+ DAG.getVTList(MVT::f64, MVT::Other), Ops,
+ MVT::i32, MMO);
+ Chain = Ld.getValue(1);
if (ReusingLoad)
spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG);
} else {
@@ -8798,25 +8186,34 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
int FrameIdx = MFI.CreateStackObject(8, Align(8), false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
- SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64,
- Op.getOperand(0));
+ SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64, Src);
// STD the extended value into the stack slot.
SDValue Store = DAG.getStore(
- DAG.getEntryNode(), dl, Ext64, FIdx,
+ Chain, dl, Ext64, FIdx,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
+ Chain = Store;
// Load the value as a double.
Ld = DAG.getLoad(
- MVT::f64, dl, Store, FIdx,
+ MVT::f64, dl, Chain, FIdx,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
+ Chain = Ld.getValue(1);
}
// FCFID it and return it.
- SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld);
- if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
- FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP,
- DAG.getIntPtrConstant(0, dl));
+ SDValue FP = convertIntToFP(Op, Ld, DAG, Subtarget, Chain);
+ if (IsStrict)
+ Chain = FP.getValue(1);
+ if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
+ if (IsStrict)
+ FP = DAG.getNode(ISD::STRICT_FP_ROUND, dl,
+ DAG.getVTList(MVT::f32, MVT::Other),
+ {Chain, FP, DAG.getIntPtrConstant(0, dl)}, Flags);
+ else
+ FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP,
+ DAG.getIntPtrConstant(0, dl));
+ }
return FP;
}
@@ -8851,16 +8248,24 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
SDValue MFFS = DAG.getNode(PPCISD::MFFS, dl, {MVT::f64, MVT::Other}, Chain);
Chain = MFFS.getValue(1);
- // Save FP register to stack slot
- int SSFI = MF.getFrameInfo().CreateStackObject(8, Align(8), false);
- SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
- Chain = DAG.getStore(Chain, dl, MFFS, StackSlot, MachinePointerInfo());
+ SDValue CWD;
+ if (isTypeLegal(MVT::i64)) {
+ CWD = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+ DAG.getNode(ISD::BITCAST, dl, MVT::i64, MFFS));
+ } else {
+ // Save FP register to stack slot
+ int SSFI = MF.getFrameInfo().CreateStackObject(8, Align(8), false);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
+ Chain = DAG.getStore(Chain, dl, MFFS, StackSlot, MachinePointerInfo());
- // Load FP Control Word from low 32 bits of stack slot.
- SDValue Four = DAG.getConstant(4, dl, PtrVT);
- SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four);
- SDValue CWD = DAG.getLoad(MVT::i32, dl, Chain, Addr, MachinePointerInfo());
- Chain = CWD.getValue(1);
+ // Load FP Control Word from low 32 bits of stack slot.
+ assert(hasBigEndianPartOrdering(MVT::i64, MF.getDataLayout()) &&
+ "Stack slot adjustment is valid only on big endian subtargets!");
+ SDValue Four = DAG.getConstant(4, dl, PtrVT);
+ SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four);
+ CWD = DAG.getLoad(MVT::i32, dl, Chain, Addr, MachinePointerInfo());
+ Chain = CWD.getValue(1);
+ }
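
As an aside, a minimal host-side sketch of why the fallback path above loads from byte offset 4: with big-endian byte ordering, the low 32 bits of the stored MFFS doubleword (the FP control word) sit in the second word of the stack slot. The FPSCR image below is a hypothetical value and the byte-swizzling helpers are illustrative only, not part of this change.

#include <cassert>
#include <cstdint>

int main() {
  // Model of the non-i64 fallback in LowerFLT_ROUNDS_: the 64-bit MFFS result
  // is stored big-endian and the control word is re-loaded from byte offset 4,
  // which is where the low 32 bits land in big-endian byte order.
  uint64_t mffs = 0xFEEDFACE00000003ULL; // hypothetical FPSCR image, RN bits = 3
  unsigned char slot[8];
  for (int i = 0; i < 8; ++i)            // big-endian store of the doubleword
    slot[i] = (unsigned char)(mffs >> (8 * (7 - i)));
  uint32_t cwd = 0;
  for (int i = 0; i < 4; ++i)            // 32-bit big-endian load at offset 4
    cwd = (cwd << 8) | slot[4 + i];
  assert(cwd == (uint32_t)mffs);         // same result as the i64 truncate fast path
  return 0;
}
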
// Transform as necessary
SDValue CWD1 =
@@ -8971,6 +8376,31 @@ SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
return DAG.getMergeValues(OutOps, dl);
}
+SDValue PPCTargetLowering::LowerFunnelShift(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ unsigned BitWidth = VT.getSizeInBits();
+
+ bool IsFSHL = Op.getOpcode() == ISD::FSHL;
+ SDValue X = Op.getOperand(0);
+ SDValue Y = Op.getOperand(1);
+ SDValue Z = Op.getOperand(2);
+ EVT AmtVT = Z.getValueType();
+
+ // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
+ // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+ // This is simpler than TargetLowering::expandFunnelShift because we can rely
+ // on PowerPC shift by BW being well defined.
+ Z = DAG.getNode(ISD::AND, dl, AmtVT, Z,
+ DAG.getConstant(BitWidth - 1, dl, AmtVT));
+ SDValue SubZ =
+ DAG.getNode(ISD::SUB, dl, AmtVT, DAG.getConstant(BitWidth, dl, AmtVT), Z);
+ X = DAG.getNode(PPCISD::SHL, dl, VT, X, IsFSHL ? Z : SubZ);
+ Y = DAG.getNode(PPCISD::SRL, dl, VT, Y, IsFSHL ? SubZ : Z);
+ return DAG.getNode(ISD::OR, dl, VT, X, Y);
+}
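
An illustrative aside (not part of this change): the fshl/fshr algebra used above, restated on plain 64-bit integers. The helpers model the PowerPC property the comment relies on, namely that a shift by the full bit width is well defined and produces zero, so the two halves can simply be OR'ed even when the masked amount is zero.

#include <cassert>
#include <cstdint>

// Shifts whose amount equals the bit width yield 0, matching the
// PPCISD::SHL/SRL semantics the lowering relies on.
static uint64_t shl64(uint64_t v, unsigned amt) { return amt >= 64 ? 0 : v << amt; }
static uint64_t srl64(uint64_t v, unsigned amt) { return amt >= 64 ? 0 : v >> amt; }

// fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
static uint64_t fshl64(uint64_t x, uint64_t y, unsigned z) {
  unsigned amt = z & 63;                 // Z & (BW - 1)
  return shl64(x, amt) | srl64(y, 64 - amt);
}

// fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
static uint64_t fshr64(uint64_t x, uint64_t y, unsigned z) {
  unsigned amt = z & 63;
  return shl64(x, 64 - amt) | srl64(y, amt);
}

int main() {
  assert(fshl64(0x0123456789abcdefULL, 0xfedcba9876543210ULL, 8) ==
         0x23456789abcdeffeULL);
  assert(fshr64(0x0123456789abcdefULL, 0xfedcba9876543210ULL, 8) ==
         0xeffedcba98765432ULL);
  assert(fshl64(1, 2, 0) == 1 && fshr64(1, 2, 0) == 2); // amt == 0 degenerates to X / Y
  return 0;
}
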
+
//===----------------------------------------------------------------------===//
// Vector related lowering.
//
@@ -8986,7 +8416,7 @@ static SDValue getCanonicalConstSplat(uint64_t Val, unsigned SplatSize, EVT VT,
EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];
// For a splat with all ones, turn it to vspltisb 0xFF to canonicalize.
- if (Val == ((1LU << (SplatSize * 8)) - 1)) {
+ if (Val == ((1LLU << (SplatSize * 8)) - 1)) {
SplatSize = 1;
Val = 0xFF;
}
@@ -9164,110 +8594,6 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
- if (Subtarget.hasQPX() && Op.getValueType() == MVT::v4i1) {
- // We first build an i32 vector, load it into a QPX register,
- // then convert it to a floating-point vector and compare it
- // to a zero vector to get the boolean result.
- MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
- int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
- MachinePointerInfo PtrInfo =
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
- EVT PtrVT = getPointerTy(DAG.getDataLayout());
- SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
-
- assert(BVN->getNumOperands() == 4 &&
- "BUILD_VECTOR for v4i1 does not have 4 operands");
-
- bool IsConst = true;
- for (unsigned i = 0; i < 4; ++i) {
- if (BVN->getOperand(i).isUndef()) continue;
- if (!isa<ConstantSDNode>(BVN->getOperand(i))) {
- IsConst = false;
- break;
- }
- }
-
- if (IsConst) {
- Constant *One =
- ConstantFP::get(Type::getFloatTy(*DAG.getContext()), 1.0);
- Constant *NegOne =
- ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0);
-
- Constant *CV[4];
- for (unsigned i = 0; i < 4; ++i) {
- if (BVN->getOperand(i).isUndef())
- CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext()));
- else if (isNullConstant(BVN->getOperand(i)))
- CV[i] = NegOne;
- else
- CV[i] = One;
- }
-
- Constant *CP = ConstantVector::get(CV);
- SDValue CPIdx =
- DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()), Align(16));
-
- SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
- SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other});
- return DAG.getMemIntrinsicNode(
- PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
- }
-
- SmallVector<SDValue, 4> Stores;
- for (unsigned i = 0; i < 4; ++i) {
- if (BVN->getOperand(i).isUndef()) continue;
-
- unsigned Offset = 4*i;
- SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
- Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
-
- unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize();
- if (StoreSize > 4) {
- Stores.push_back(
- DAG.getTruncStore(DAG.getEntryNode(), dl, BVN->getOperand(i), Idx,
- PtrInfo.getWithOffset(Offset), MVT::i32));
- } else {
- SDValue StoreValue = BVN->getOperand(i);
- if (StoreSize < 4)
- StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue);
-
- Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, StoreValue, Idx,
- PtrInfo.getWithOffset(Offset)));
- }
- }
-
- SDValue StoreChain;
- if (!Stores.empty())
- StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
- else
- StoreChain = DAG.getEntryNode();
-
- // Now load from v4i32 into the QPX register; this will extend it to
- // v4i64 but not yet convert it to a floating point. Nevertheless, this
- // is typed as v4f64 because the QPX register integer states are not
- // explicitly represented.
-
- SDValue Ops[] = {StoreChain,
- DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32),
- FIdx};
- SDVTList VTs = DAG.getVTList({MVT::v4f64, /*chain*/ MVT::Other});
-
- SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN,
- dl, VTs, Ops, MVT::v4i32, PtrInfo);
- LoadedVect = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
- DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, dl, MVT::i32),
- LoadedVect);
-
- SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::v4f64);
-
- return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ);
- }
-
- // All other QPX vectors are handled by generic code.
- if (Subtarget.hasQPX())
- return SDValue();
-
// Check if this is a splat of a constant value.
APInt APSplatBits, APSplatUndef;
unsigned SplatBitSize;
@@ -9278,14 +8604,41 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// If it is a splat of a double, check if we can shrink it to a 32 bit
// non-denormal float which when converted back to double gives us the same
- // double. This is to exploit the XXSPLTIDP instruction.
- if (BVNIsConstantSplat && Subtarget.hasPrefixInstrs() &&
- (SplatBitSize == 64) && (Op->getValueType(0) == MVT::v2f64) &&
- convertToNonDenormSingle(APSplatBits)) {
- SDValue SplatNode = DAG.getNode(
- PPCISD::XXSPLTI_SP_TO_DP, dl, MVT::v2f64,
- DAG.getTargetConstant(APSplatBits.getZExtValue(), dl, MVT::i32));
- return DAG.getBitcast(Op.getValueType(), SplatNode);
+ // double. This is to exploit the XXSPLTIDP instruction.
+ // If we lose precision, we use XXSPLTI32DX.
+ if (BVNIsConstantSplat && (SplatBitSize == 64) &&
+ Subtarget.hasPrefixInstrs()) {
+ if (convertToNonDenormSingle(APSplatBits) &&
+ (Op->getValueType(0) == MVT::v2f64)) {
+ SDValue SplatNode = DAG.getNode(
+ PPCISD::XXSPLTI_SP_TO_DP, dl, MVT::v2f64,
+ DAG.getTargetConstant(APSplatBits.getZExtValue(), dl, MVT::i32));
+ return DAG.getBitcast(Op.getValueType(), SplatNode);
+ } else { // We may lose precision, so we have to use XXSPLTI32DX.
+
+ uint32_t Hi =
+ (uint32_t)((APSplatBits.getZExtValue() & 0xFFFFFFFF00000000LL) >> 32);
+ uint32_t Lo =
+ (uint32_t)(APSplatBits.getZExtValue() & 0xFFFFFFFF);
+ SDValue SplatNode = DAG.getUNDEF(MVT::v2i64);
+
+ if (!Hi || !Lo)
+ // If either half is 0, generate XXLXOR to set the register to 0.
+ SplatNode = DAG.getTargetConstant(0, dl, MVT::v2i64);
+
+ if (Hi)
+ SplatNode = DAG.getNode(
+ PPCISD::XXSPLTI32DX, dl, MVT::v2i64, SplatNode,
+ DAG.getTargetConstant(0, dl, MVT::i32),
+ DAG.getTargetConstant(Hi, dl, MVT::i32));
+
+ if (Lo)
+ SplatNode =
+ DAG.getNode(PPCISD::XXSPLTI32DX, dl, MVT::v2i64, SplatNode,
+ DAG.getTargetConstant(1, dl, MVT::i32),
+ DAG.getTargetConstant(Lo, dl, MVT::i32));
+
+ return DAG.getBitcast(Op.getValueType(), SplatNode);
+ }
}
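
A small host-side check (illustrative only) of the Hi/Lo split performed for XXSPLTI32DX above: the 64-bit splat immediate is cut into its high and low 32-bit words, each written independently, and reassembling them recovers the original element. The sample bit pattern is just an arbitrary double constant.

#include <cassert>
#include <cstdint>

int main() {
  // Same split as in the lowering: high and low 32-bit words of the splat value.
  uint64_t SplatBits = 0x400921fb54442d18ULL; // bit pattern of a double near pi
  uint32_t Hi = (uint32_t)((SplatBits & 0xFFFFFFFF00000000ULL) >> 32);
  uint32_t Lo = (uint32_t)(SplatBits & 0xFFFFFFFFULL);
  assert(Hi == 0x400921fbu && Lo == 0x54442d18u);
  // Reassembling the two words recovers the 64-bit element, which is why two
  // XXSPLTI32DX writes (word index 0, then 1) can materialize the splat.
  assert(((uint64_t)Hi << 32 | Lo) == SplatBits);
  return 0;
}
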
if (!BVNIsConstantSplat || SplatBitSize > 32) {
@@ -9304,7 +8657,12 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// Checking for a single use of this load, we have to check for vector
// width (128 bits) / ElementSize uses (since each operand of the
// BUILD_VECTOR is a separate use of the value).
- if (InputLoad->getNode()->hasNUsesOfValue(128 / ElementSize, 0) &&
+ unsigned NumUsesOfInputLD = 128 / ElementSize;
+ for (SDValue BVInOp : Op->ops())
+ if (BVInOp.isUndef())
+ NumUsesOfInputLD--;
+ assert(NumUsesOfInputLD > 0 && "No uses of input LD of a build_vector?");
+ if (InputLoad->getNode()->hasNUsesOfValue(NumUsesOfInputLD, 0) &&
((Subtarget.hasVSX() && ElementSize == 64) ||
(Subtarget.hasP9Vector() && ElementSize == 32))) {
SDValue Ops[] = {
@@ -9312,17 +8670,21 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
LD->getBasePtr(), // Ptr
DAG.getValueType(Op.getValueType()) // VT
};
- return
- DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl,
- DAG.getVTList(Op.getValueType(), MVT::Other),
- Ops, LD->getMemoryVT(), LD->getMemOperand());
+ SDValue LdSplt = DAG.getMemIntrinsicNode(
+ PPCISD::LD_SPLAT, dl, DAG.getVTList(Op.getValueType(), MVT::Other),
+ Ops, LD->getMemoryVT(), LD->getMemOperand());
+ // Replace all uses of the output chain of the original load with the
+ // output chain of the new load.
+ DAG.ReplaceAllUsesOfValueWith(InputLoad->getValue(1),
+ LdSplt.getValue(1));
+ return LdSplt;
}
}
- // BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be
- // lowered to VSX instructions under certain conditions.
+ // In 64-bit mode, BUILD_VECTOR nodes that are not constant splats of up to
+ // 32-bits can be lowered to VSX instructions under certain conditions.
// Without VSX, there is no pattern more efficient than expanding the node.
- if (Subtarget.hasVSX() &&
+ if (Subtarget.hasVSX() && Subtarget.isPPC64() &&
haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove(),
Subtarget.hasP8Vector()))
return Op;
@@ -9351,7 +8713,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// make a 4-byte splat element. For example: 2-byte splat of 0xABAB can be
// turned into a 4-byte splat of 0xABABABAB.
if (Subtarget.hasPrefixInstrs() && SplatSize == 2)
- return getCanonicalConstSplat((SplatBits |= SplatBits << 16), SplatSize * 2,
+ return getCanonicalConstSplat(SplatBits | (SplatBits << 16), SplatSize * 2,
Op.getValueType(), DAG, dl);
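
As a brief aside, the change from `SplatBits |= SplatBits << 16` to `SplatBits | (SplatBits << 16)` keeps the widening but drops the side effect on SplatBits. A tiny standalone illustration:

#include <cassert>
#include <cstdint>

int main() {
  // A 2-byte splat pattern is doubled up into a 4-byte splat pattern.
  uint32_t SplatBits = 0xABAB;
  uint32_t Widened = SplatBits | (SplatBits << 16);
  assert(Widened == 0xABABABABu);
  assert(SplatBits == 0xABABu); // unlike |=, the original value is left untouched
  return 0;
}
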
if (Subtarget.hasPrefixInstrs() && SplatSize == 4)
@@ -9447,17 +8809,6 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
}
- // vsplti + sra self.
- if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
- SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
- static const unsigned IIDs[] = { // Intrinsic to use for each size.
- Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0,
- Intrinsic::ppc_altivec_vsraw
- };
- Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
- return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
- }
-
// vsplti + rol self.
if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
@@ -9957,6 +9308,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SDValue LdSplt =
DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl, VTL,
Ops, LD->getMemoryVT(), LD->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(InputLoad->getValue(1), LdSplt.getValue(1));
if (LdSplt.getValueType() != SVOp->getValueType(0))
LdSplt = DAG.getBitcast(SVOp->getValueType(0), LdSplt);
return LdSplt;
@@ -10060,42 +9412,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
}
}
- if (Subtarget.hasQPX()) {
- if (VT.getVectorNumElements() != 4)
- return SDValue();
-
- if (V2.isUndef()) V2 = V1;
-
- int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp);
- if (AlignIdx != -1) {
- return DAG.getNode(PPCISD::QVALIGNI, dl, VT, V1, V2,
- DAG.getConstant(AlignIdx, dl, MVT::i32));
- } else if (SVOp->isSplat()) {
- int SplatIdx = SVOp->getSplatIndex();
- if (SplatIdx >= 4) {
- std::swap(V1, V2);
- SplatIdx -= 4;
- }
-
- return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1,
- DAG.getConstant(SplatIdx, dl, MVT::i32));
- }
-
- // Lower this into a qvgpci/qvfperm pair.
-
- // Compute the qvgpci literal
- unsigned idx = 0;
- for (unsigned i = 0; i < 4; ++i) {
- int m = SVOp->getMaskElt(i);
- unsigned mm = m >= 0 ? (unsigned) m : i;
- idx |= mm << (3-i)*3;
- }
-
- SDValue V3 = DAG.getNode(PPCISD::QVGPCI, dl, MVT::v4f64,
- DAG.getConstant(idx, dl, MVT::i32));
- return DAG.getNode(PPCISD::QVFPERM, dl, VT, V1, V2, V3);
- }
-
// Cases that are handled by instructions that take permute immediates
// (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
// selected by the instruction selector.
@@ -10357,6 +9673,26 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
return false;
break;
+ case Intrinsic::ppc_altivec_vcmpequq:
+ case Intrinsic::ppc_altivec_vcmpgtsq:
+ case Intrinsic::ppc_altivec_vcmpgtuq:
+ if (!Subtarget.isISA3_1())
+ return false;
+ switch (IntrinsicID) {
+ default:
+ llvm_unreachable("Unknown comparison intrinsic.");
+ case Intrinsic::ppc_altivec_vcmpequq:
+ CompareOpc = 455;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtsq:
+ CompareOpc = 903;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtuq:
+ CompareOpc = 647;
+ break;
+ }
+ break;
+
// VSX predicate comparisons use the same infrastructure
case Intrinsic::ppc_vsx_xvcmpeqdp_p:
case Intrinsic::ppc_vsx_xvcmpgedp_p:
@@ -10480,6 +9816,26 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
else
return false;
break;
+ case Intrinsic::ppc_altivec_vcmpequq_p:
+ case Intrinsic::ppc_altivec_vcmpgtsq_p:
+ case Intrinsic::ppc_altivec_vcmpgtuq_p:
+ if (!Subtarget.isISA3_1())
+ return false;
+ switch (IntrinsicID) {
+ default:
+ llvm_unreachable("Unknown comparison intrinsic.");
+ case Intrinsic::ppc_altivec_vcmpequq_p:
+ CompareOpc = 455;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtsq_p:
+ CompareOpc = 903;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtuq_p:
+ CompareOpc = 647;
+ break;
+ }
+ isDot = true;
+ break;
}
return true;
}
@@ -10493,11 +9849,32 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDLoc dl(Op);
- if (IntrinsicID == Intrinsic::thread_pointer) {
+ switch (IntrinsicID) {
+ case Intrinsic::thread_pointer:
// Reads the thread pointer register, used for __builtin_thread_pointer.
if (Subtarget.isPPC64())
return DAG.getRegister(PPC::X13, MVT::i64);
return DAG.getRegister(PPC::R2, MVT::i32);
+
+ case Intrinsic::ppc_mma_disassemble_acc:
+ case Intrinsic::ppc_vsx_disassemble_pair: {
+ int NumVecs = 2;
+ SDValue WideVec = Op.getOperand(1);
+ if (IntrinsicID == Intrinsic::ppc_mma_disassemble_acc) {
+ NumVecs = 4;
+ WideVec = DAG.getNode(PPCISD::XXMFACC, dl, MVT::v512i1, WideVec);
+ }
+ SmallVector<SDValue, 4> RetOps;
+ for (int VecNo = 0; VecNo < NumVecs; VecNo++) {
+ SDValue Extract = DAG.getNode(
+ PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, WideVec,
+ DAG.getConstant(Subtarget.isLittleEndian() ? NumVecs - 1 - VecNo
+ : VecNo,
+ dl, MVT::i64));
+ RetOps.push_back(Extract);
+ }
+ return DAG.getMergeValues(RetOps, dl);
+ }
}
// If this is a lowered altivec predicate compare, CompareOpc is set to the
@@ -10522,7 +9899,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
DAG.getConstant(CompareOpc, dl, MVT::i32)
};
EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue };
- SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);
+ SDValue CompNode = DAG.getNode(PPCISD::VCMP_rec, dl, VTs, Ops);
// Now that we have the comparison, emit a copy from the CR to a GPR.
// This is flagged to the above dot comparison.
@@ -10683,154 +10060,51 @@ SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
return Op;
}
-SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc dl(Op);
- SDNode *N = Op.getNode();
-
- assert(N->getOperand(0).getValueType() == MVT::v4i1 &&
- "Unknown extract_vector_elt type");
-
- SDValue Value = N->getOperand(0);
-
- // The first part of this is like the store lowering except that we don't
- // need to track the chain.
-
- // The values are now known to be -1 (false) or 1 (true). To convert this
- // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
- // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
- Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
-
- // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
- // understand how to form the extending load.
- SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
-
- Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
-
- // Now convert to an integer and store.
- Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
- DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32),
- Value);
-
- MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
- int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
- MachinePointerInfo PtrInfo =
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
- EVT PtrVT = getPointerTy(DAG.getDataLayout());
- SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
-
- SDValue StoreChain = DAG.getEntryNode();
- SDValue Ops[] = {StoreChain,
- DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32),
- Value, FIdx};
- SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other);
-
- StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID,
- dl, VTs, Ops, MVT::v4i32, PtrInfo);
-
- // Extract the value requested.
- unsigned Offset = 4*cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
- SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
- Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
-
- SDValue IntVal =
- DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset));
-
- if (!Subtarget.useCRBits())
- return IntVal;
-
- return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, IntVal);
-}
-
-/// Lowering for QPX v4i1 loads
SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
SDValue LoadChain = LN->getChain();
SDValue BasePtr = LN->getBasePtr();
+ EVT VT = Op.getValueType();
- if (Op.getValueType() == MVT::v4f64 ||
- Op.getValueType() == MVT::v4f32) {
- EVT MemVT = LN->getMemoryVT();
- unsigned Alignment = LN->getAlignment();
-
- // If this load is properly aligned, then it is legal.
- if (Alignment >= MemVT.getStoreSize())
- return Op;
-
- EVT ScalarVT = Op.getValueType().getScalarType(),
- ScalarMemVT = MemVT.getScalarType();
- unsigned Stride = ScalarMemVT.getStoreSize();
-
- SDValue Vals[4], LoadChains[4];
- for (unsigned Idx = 0; Idx < 4; ++Idx) {
- SDValue Load;
- if (ScalarVT != ScalarMemVT)
- Load = DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain,
- BasePtr,
- LN->getPointerInfo().getWithOffset(Idx * Stride),
- ScalarMemVT, MinAlign(Alignment, Idx * Stride),
- LN->getMemOperand()->getFlags(), LN->getAAInfo());
- else
- Load = DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr,
- LN->getPointerInfo().getWithOffset(Idx * Stride),
- MinAlign(Alignment, Idx * Stride),
- LN->getMemOperand()->getFlags(), LN->getAAInfo());
-
- if (Idx == 0 && LN->isIndexed()) {
- assert(LN->getAddressingMode() == ISD::PRE_INC &&
- "Unknown addressing mode on vector load");
- Load = DAG.getIndexedLoad(Load, dl, BasePtr, LN->getOffset(),
- LN->getAddressingMode());
- }
-
- Vals[Idx] = Load;
- LoadChains[Idx] = Load.getValue(1);
-
- BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
- DAG.getConstant(Stride, dl,
- BasePtr.getValueType()));
- }
-
- SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
- SDValue Value = DAG.getBuildVector(Op.getValueType(), dl, Vals);
-
- if (LN->isIndexed()) {
- SDValue RetOps[] = { Value, Vals[0].getValue(1), TF };
- return DAG.getMergeValues(RetOps, dl);
- }
-
- SDValue RetOps[] = { Value, TF };
- return DAG.getMergeValues(RetOps, dl);
- }
-
- assert(Op.getValueType() == MVT::v4i1 && "Unknown load to lower");
- assert(LN->isUnindexed() && "Indexed v4i1 loads are not supported");
-
- // To lower v4i1 from a byte array, we load the byte elements of the
- // vector and then reuse the BUILD_VECTOR logic.
-
- SDValue VectElmts[4], VectElmtChains[4];
- for (unsigned i = 0; i < 4; ++i) {
- SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
- Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);
-
- VectElmts[i] = DAG.getExtLoad(
- ISD::EXTLOAD, dl, MVT::i32, LoadChain, Idx,
- LN->getPointerInfo().getWithOffset(i), MVT::i8,
- /* Alignment = */ 1, LN->getMemOperand()->getFlags(), LN->getAAInfo());
- VectElmtChains[i] = VectElmts[i].getValue(1);
- }
-
- LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains);
- SDValue Value = DAG.getBuildVector(MVT::v4i1, dl, VectElmts);
+ if (VT != MVT::v256i1 && VT != MVT::v512i1)
+ return Op;
- SDValue RVals[] = { Value, LoadChain };
- return DAG.getMergeValues(RVals, dl);
+ // Type v256i1 is used for pairs and v512i1 is used for accumulators.
+ // Here we create 2 or 4 v16i8 loads to load the pair or accumulator value in
+ // 2 or 4 vsx registers.
+ assert((VT != MVT::v512i1 || Subtarget.hasMMA()) &&
+ "Type unsupported without MMA");
+ assert((VT != MVT::v256i1 || Subtarget.pairedVectorMemops()) &&
+ "Type unsupported without paired vector support");
+ Align Alignment = LN->getAlign();
+ SmallVector<SDValue, 4> Loads;
+ SmallVector<SDValue, 4> LoadChains;
+ unsigned NumVecs = VT.getSizeInBits() / 128;
+ for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
+ SDValue Load =
+ DAG.getLoad(MVT::v16i8, dl, LoadChain, BasePtr,
+ LN->getPointerInfo().getWithOffset(Idx * 16),
+ commonAlignment(Alignment, Idx * 16),
+ LN->getMemOperand()->getFlags(), LN->getAAInfo());
+ BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+ DAG.getConstant(16, dl, BasePtr.getValueType()));
+ Loads.push_back(Load);
+ LoadChains.push_back(Load.getValue(1));
+ }
+ if (Subtarget.isLittleEndian()) {
+ std::reverse(Loads.begin(), Loads.end());
+ std::reverse(LoadChains.begin(), LoadChains.end());
+ }
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
+ SDValue Value =
+ DAG.getNode(VT == MVT::v512i1 ? PPCISD::ACC_BUILD : PPCISD::PAIR_BUILD,
+ dl, VT, Loads);
+ SDValue RetOps[] = {Value, TF};
+ return DAG.getMergeValues(RetOps, dl);
}
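
An illustrative sketch (not LLVM API) of the slicing the load lowering above performs: the wide type is read in 16-byte pieces at increasing offsets, and on little-endian subtargets the piece order is reversed before the ACC_BUILD/PAIR_BUILD node is formed. The helper and its output format are invented for the example.

#include <cstdio>

// Print the memory offset and the logical register slot used for each 16-byte
// piece of a v256i1 pair or v512i1 accumulator load.
static void planVectorLoad(unsigned SizeInBits, bool IsLittleEndian) {
  unsigned NumVecs = SizeInBits / 128;          // 2 for a pair, 4 for an accumulator
  for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
    // Loads are issued at offsets 0, 16, 32, ... and the whole list is then
    // reversed on little-endian targets before building the wide value.
    unsigned Slot = IsLittleEndian ? NumVecs - 1 - Idx : Idx;
    std::printf("piece %u: byte offset %u -> register slot %u\n",
                Idx, Idx * 16, Slot);
  }
}

int main() {
  planVectorLoad(512, /*IsLittleEndian=*/true);   // accumulator, LE: slots 3,2,1,0
  planVectorLoad(256, /*IsLittleEndian=*/false);  // pair, BE: slots 0,1
  return 0;
}
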
-/// Lowering for QPX v4i1 stores
SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -10838,122 +10112,40 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
SDValue StoreChain = SN->getChain();
SDValue BasePtr = SN->getBasePtr();
SDValue Value = SN->getValue();
+ EVT StoreVT = Value.getValueType();
- if (Value.getValueType() == MVT::v4f64 ||
- Value.getValueType() == MVT::v4f32) {
- EVT MemVT = SN->getMemoryVT();
- unsigned Alignment = SN->getAlignment();
-
- // If this store is properly aligned, then it is legal.
- if (Alignment >= MemVT.getStoreSize())
- return Op;
-
- EVT ScalarVT = Value.getValueType().getScalarType(),
- ScalarMemVT = MemVT.getScalarType();
- unsigned Stride = ScalarMemVT.getStoreSize();
-
- SDValue Stores[4];
- for (unsigned Idx = 0; Idx < 4; ++Idx) {
- SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value,
- DAG.getVectorIdxConstant(Idx, dl));
- SDValue Store;
- if (ScalarVT != ScalarMemVT)
- Store =
- DAG.getTruncStore(StoreChain, dl, Ex, BasePtr,
- SN->getPointerInfo().getWithOffset(Idx * Stride),
- ScalarMemVT, MinAlign(Alignment, Idx * Stride),
- SN->getMemOperand()->getFlags(), SN->getAAInfo());
- else
- Store = DAG.getStore(StoreChain, dl, Ex, BasePtr,
- SN->getPointerInfo().getWithOffset(Idx * Stride),
- MinAlign(Alignment, Idx * Stride),
- SN->getMemOperand()->getFlags(), SN->getAAInfo());
-
- if (Idx == 0 && SN->isIndexed()) {
- assert(SN->getAddressingMode() == ISD::PRE_INC &&
- "Unknown addressing mode on vector store");
- Store = DAG.getIndexedStore(Store, dl, BasePtr, SN->getOffset(),
- SN->getAddressingMode());
- }
-
- BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
- DAG.getConstant(Stride, dl,
- BasePtr.getValueType()));
- Stores[Idx] = Store;
- }
-
- SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
-
- if (SN->isIndexed()) {
- SDValue RetOps[] = { TF, Stores[0].getValue(1) };
- return DAG.getMergeValues(RetOps, dl);
- }
-
- return TF;
- }
-
- assert(SN->isUnindexed() && "Indexed v4i1 stores are not supported");
- assert(Value.getValueType() == MVT::v4i1 && "Unknown store to lower");
-
- // The values are now known to be -1 (false) or 1 (true). To convert this
- // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
- // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
- Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
-
- // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
- // understand how to form the extending load.
- SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
-
- Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
-
- // Now convert to an integer and store.
- Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
- DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32),
- Value);
-
- MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
- int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
- MachinePointerInfo PtrInfo =
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
- EVT PtrVT = getPointerTy(DAG.getDataLayout());
- SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
-
- SDValue Ops[] = {StoreChain,
- DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32),
- Value, FIdx};
- SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other);
-
- StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID,
- dl, VTs, Ops, MVT::v4i32, PtrInfo);
-
- // Move data into the byte array.
- SDValue Loads[4], LoadChains[4];
- for (unsigned i = 0; i < 4; ++i) {
- unsigned Offset = 4*i;
- SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
- Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
-
- Loads[i] = DAG.getLoad(MVT::i32, dl, StoreChain, Idx,
- PtrInfo.getWithOffset(Offset));
- LoadChains[i] = Loads[i].getValue(1);
- }
-
- StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
-
- SDValue Stores[4];
- for (unsigned i = 0; i < 4; ++i) {
- SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
- Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);
-
- Stores[i] = DAG.getTruncStore(
- StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i),
- MVT::i8, /* Alignment = */ 1, SN->getMemOperand()->getFlags(),
- SN->getAAInfo());
- }
-
- StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+ if (StoreVT != MVT::v256i1 && StoreVT != MVT::v512i1)
+ return Op;
- return StoreChain;
+ // Type v256i1 is used for pairs and v512i1 is used for accumulators.
+ // Here we create 2 or 4 v16i8 stores to store the pair or accumulator
+ // underlying registers individually.
+ assert((StoreVT != MVT::v512i1 || Subtarget.hasMMA()) &&
+ "Type unsupported without MMA");
+ assert((StoreVT != MVT::v256i1 || Subtarget.pairedVectorMemops()) &&
+ "Type unsupported without paired vector support");
+ Align Alignment = SN->getAlign();
+ SmallVector<SDValue, 4> Stores;
+ unsigned NumVecs = 2;
+ if (StoreVT == MVT::v512i1) {
+ Value = DAG.getNode(PPCISD::XXMFACC, dl, MVT::v512i1, Value);
+ NumVecs = 4;
+ }
+ for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
+ unsigned VecNum = Subtarget.isLittleEndian() ? NumVecs - 1 - Idx : Idx;
+ SDValue Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, Value,
+ DAG.getConstant(VecNum, dl, MVT::i64));
+ SDValue Store =
+ DAG.getStore(StoreChain, dl, Elt, BasePtr,
+ SN->getPointerInfo().getWithOffset(Idx * 16),
+ commonAlignment(Alignment, Idx * 16),
+ SN->getMemOperand()->getFlags(), SN->getAAInfo());
+ BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+ DAG.getConstant(16, dl, BasePtr.getValueType()));
+ Stores.push_back(Store);
+ }
+ SDValue TF = DAG.getTokenFactor(dl, Stores);
+ return TF;
}
SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
@@ -11020,42 +10212,13 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
}
}
-SDValue PPCTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
-
- assert(Op.getOpcode() == ISD::ABS && "Should only be called for ISD::ABS");
-
- EVT VT = Op.getValueType();
- assert(VT.isVector() &&
- "Only set vector abs as custom, scalar abs shouldn't reach here!");
- assert((VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
- VT == MVT::v16i8) &&
- "Unexpected vector element type!");
- assert((VT != MVT::v2i64 || Subtarget.hasP8Altivec()) &&
- "Current subtarget doesn't support smax v2i64!");
-
- // For vector abs, it can be lowered to:
- // abs x
- // ==>
- // y = -x
- // smax(x, y)
-
- SDLoc dl(Op);
- SDValue X = Op.getOperand(0);
- SDValue Zero = DAG.getConstant(0, dl, VT);
- SDValue Y = DAG.getNode(ISD::SUB, dl, VT, Zero, X);
-
- // SMAX patch https://reviews.llvm.org/D47332
- // hasn't landed yet, so use intrinsic first here.
- // TODO: Should use SMAX directly once SMAX patch landed
- Intrinsic::ID BifID = Intrinsic::ppc_altivec_vmaxsw;
- if (VT == MVT::v2i64)
- BifID = Intrinsic::ppc_altivec_vmaxsd;
- else if (VT == MVT::v8i16)
- BifID = Intrinsic::ppc_altivec_vmaxsh;
- else if (VT == MVT::v16i8)
- BifID = Intrinsic::ppc_altivec_vmaxsb;
+SDValue PPCTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
+ bool IsStrict = Op->isStrictFPOpcode();
+ if (Op.getOperand(IsStrict ? 1 : 0).getValueType() == MVT::f128 &&
+ !Subtarget.hasP9Vector())
+ return SDValue();
- return BuildIntrinsicOp(BifID, X, Y, DAG, dl, VT);
+ return Op;
}
// Custom lowering for fpext vf32 to v2f64
@@ -11168,8 +10331,12 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+ case ISD::STRICT_FP_TO_UINT:
+ case ISD::STRICT_FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, SDLoc(Op));
+ case ISD::STRICT_UINT_TO_FP:
+ case ISD::STRICT_SINT_TO_FP:
case ISD::UINT_TO_FP:
case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
@@ -11179,16 +10346,20 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG);
case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG);
+ case ISD::FSHL: return LowerFunnelShift(Op, DAG);
+ case ISD::FSHR: return LowerFunnelShift(Op, DAG);
+
// Vector-related lowering.
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
- case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::MUL: return LowerMUL(Op, DAG);
- case ISD::ABS: return LowerABS(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
+ case ISD::STRICT_FP_ROUND:
+ case ISD::FP_ROUND:
+ return LowerFP_ROUND(Op, DAG);
case ISD::ROTL: return LowerROTL(Op, DAG);
// For counter-based loop handling.
@@ -11256,23 +10427,28 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
}
return;
}
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
// LowerFP_TO_INT() can only handle f32 and f64.
- if (N->getOperand(0).getValueType() == MVT::ppcf128)
+ if (N->getOperand(N->isStrictFPOpcode() ? 1 : 0).getValueType() ==
+ MVT::ppcf128)
return;
Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
return;
case ISD::TRUNCATE: {
- EVT TrgVT = N->getValueType(0);
- EVT OpVT = N->getOperand(0).getValueType();
- if (TrgVT.isVector() &&
- isOperationCustom(N->getOpcode(), TrgVT) &&
- OpVT.getSizeInBits() <= 128 &&
- isPowerOf2_32(OpVT.getVectorElementType().getSizeInBits()))
- Results.push_back(LowerTRUNCATEVector(SDValue(N, 0), DAG));
+ if (!N->getValueType(0).isVector())
+ return;
+ SDValue Lowered = LowerTRUNCATEVector(SDValue(N, 0), DAG);
+ if (Lowered)
+ Results.push_back(Lowered);
return;
}
+ case ISD::FSHL:
+ case ISD::FSHR:
+ // Don't handle funnel shifts here.
+ return;
case ISD::BITCAST:
// Don't handle bitcast here.
return;
@@ -11444,17 +10620,88 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
return BB;
}
+static bool isSignExtended(MachineInstr &MI, const PPCInstrInfo *TII) {
+ switch(MI.getOpcode()) {
+ default:
+ return false;
+ case PPC::COPY:
+ return TII->isSignExtended(MI);
+ case PPC::LHA:
+ case PPC::LHA8:
+ case PPC::LHAU:
+ case PPC::LHAU8:
+ case PPC::LHAUX:
+ case PPC::LHAUX8:
+ case PPC::LHAX:
+ case PPC::LHAX8:
+ case PPC::LWA:
+ case PPC::LWAUX:
+ case PPC::LWAX:
+ case PPC::LWAX_32:
+ case PPC::LWA_32:
+ case PPC::PLHA:
+ case PPC::PLHA8:
+ case PPC::PLHA8pc:
+ case PPC::PLHApc:
+ case PPC::PLWA:
+ case PPC::PLWA8:
+ case PPC::PLWA8pc:
+ case PPC::PLWApc:
+ case PPC::EXTSB:
+ case PPC::EXTSB8:
+ case PPC::EXTSB8_32_64:
+ case PPC::EXTSB8_rec:
+ case PPC::EXTSB_rec:
+ case PPC::EXTSH:
+ case PPC::EXTSH8:
+ case PPC::EXTSH8_32_64:
+ case PPC::EXTSH8_rec:
+ case PPC::EXTSH_rec:
+ case PPC::EXTSW:
+ case PPC::EXTSWSLI:
+ case PPC::EXTSWSLI_32_64:
+ case PPC::EXTSWSLI_32_64_rec:
+ case PPC::EXTSWSLI_rec:
+ case PPC::EXTSW_32:
+ case PPC::EXTSW_32_64:
+ case PPC::EXTSW_32_64_rec:
+ case PPC::EXTSW_rec:
+ case PPC::SRAW:
+ case PPC::SRAWI:
+ case PPC::SRAWI_rec:
+ case PPC::SRAW_rec:
+ return true;
+ }
+ return false;
+}
+
MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary(
MachineInstr &MI, MachineBasicBlock *BB,
bool is8bit, // operation
unsigned BinOpcode, unsigned CmpOpcode, unsigned CmpPred) const {
+ // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
+ const PPCInstrInfo *TII = Subtarget.getInstrInfo();
+
+ // If this is a signed comparison and the value being compared is not known
+ // to be sign extended, sign extend it here.
+ DebugLoc dl = MI.getDebugLoc();
+ MachineFunction *F = BB->getParent();
+ MachineRegisterInfo &RegInfo = F->getRegInfo();
+ Register incr = MI.getOperand(3).getReg();
+ bool IsSignExtended = Register::isVirtualRegister(incr) &&
+ isSignExtended(*RegInfo.getVRegDef(incr), TII);
+
+ if (CmpOpcode == PPC::CMPW && !IsSignExtended) {
+ Register ValueReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
+ BuildMI(*BB, MI, dl, TII->get(is8bit ? PPC::EXTSB : PPC::EXTSH), ValueReg)
+ .addReg(MI.getOperand(3).getReg());
+ MI.getOperand(3).setReg(ValueReg);
+ }
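
A host-side illustration (not part of the patch) of why the EXTSB/EXTSH above is needed before a signed CMPW: a sub-word value that is merely zero-extended in a 32-bit register compares as a large positive number, while sign-extending it first restores the intended signed ordering.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t reg = 0x80; // an i8 value of -128 sitting zero-extended in a 32-bit register
  // Comparing the raw register word treats it as +128, so a signed compare
  // against 0 gives the wrong answer for the original i8 value.
  assert((int32_t)reg > 0);
  // Sign-extending first (what EXTSB does) recovers the intended value.
  int32_t ext = (int8_t)(reg & 0xFF);
  assert(ext == -128 && ext < 0);
  return 0;
}
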
// If we support part-word atomic mnemonics, just use them
if (Subtarget.hasPartwordAtomics())
return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode, CmpOpcode,
CmpPred);
- // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
// In 64 bit mode we have to use 64 bits for addresses, even though the
// lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address
// registers without caring whether they're 32 or 64, but here we're
@@ -11464,14 +10711,11 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary(
unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction *F = BB->getParent();
MachineFunction::iterator It = ++BB->getIterator();
Register dest = MI.getOperand(0).getReg();
Register ptrA = MI.getOperand(1).getReg();
Register ptrB = MI.getOperand(2).getReg();
- Register incr = MI.getOperand(3).getReg();
- DebugLoc dl = MI.getDebugLoc();
MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *loop2MBB =
@@ -11485,7 +10729,6 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary(
std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
- MachineRegisterInfo &RegInfo = F->getRegInfo();
const TargetRegisterClass *RC =
is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
@@ -12128,9 +11371,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
} else if (MI.getOpcode() == PPC::SELECT_CC_F4 ||
MI.getOpcode() == PPC::SELECT_CC_F8 ||
MI.getOpcode() == PPC::SELECT_CC_F16 ||
- MI.getOpcode() == PPC::SELECT_CC_QFRC ||
- MI.getOpcode() == PPC::SELECT_CC_QSRC ||
- MI.getOpcode() == PPC::SELECT_CC_QBRC ||
MI.getOpcode() == PPC::SELECT_CC_VRRC ||
MI.getOpcode() == PPC::SELECT_CC_VSFRC ||
MI.getOpcode() == PPC::SELECT_CC_VSSRC ||
@@ -12140,9 +11380,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.getOpcode() == PPC::SELECT_F4 ||
MI.getOpcode() == PPC::SELECT_F8 ||
MI.getOpcode() == PPC::SELECT_F16 ||
- MI.getOpcode() == PPC::SELECT_QFRC ||
- MI.getOpcode() == PPC::SELECT_QSRC ||
- MI.getOpcode() == PPC::SELECT_QBRC ||
MI.getOpcode() == PPC::SELECT_SPE ||
MI.getOpcode() == PPC::SELECT_SPE4 ||
MI.getOpcode() == PPC::SELECT_VRRC ||
@@ -12180,9 +11417,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.getOpcode() == PPC::SELECT_F16 ||
MI.getOpcode() == PPC::SELECT_SPE4 ||
MI.getOpcode() == PPC::SELECT_SPE ||
- MI.getOpcode() == PPC::SELECT_QFRC ||
- MI.getOpcode() == PPC::SELECT_QSRC ||
- MI.getOpcode() == PPC::SELECT_QBRC ||
MI.getOpcode() == PPC::SELECT_VRRC ||
MI.getOpcode() == PPC::SELECT_VSFRC ||
MI.getOpcode() == PPC::SELECT_VSSRC ||
@@ -12665,11 +11899,20 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg);
// Set rounding mode to round-to-zero.
- BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31);
- BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30);
+ BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1))
+ .addImm(31)
+ .addReg(PPC::RM, RegState::ImplicitDefine);
+
+ BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0))
+ .addImm(30)
+ .addReg(PPC::RM, RegState::ImplicitDefine);
// Perform addition.
- BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2);
+ auto MIB = BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest)
+ .addReg(Src1)
+ .addReg(Src2);
+ if (MI.getFlag(MachineInstr::NoFPExcept))
+ MIB.setMIFlag(MachineInstr::NoFPExcept);
// Restore FPSCR value.
BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg);
@@ -12728,10 +11971,12 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// the immediate to set the bits 62:63 of FPSCR.
unsigned Mode = MI.getOperand(1).getImm();
BuildMI(*BB, MI, dl, TII->get((Mode & 1) ? PPC::MTFSB1 : PPC::MTFSB0))
- .addImm(31);
+ .addImm(31)
+ .addReg(PPC::RM, RegState::ImplicitDefine);
BuildMI(*BB, MI, dl, TII->get((Mode & 2) ? PPC::MTFSB1 : PPC::MTFSB0))
- .addImm(30);
+ .addImm(30)
+ .addReg(PPC::RM, RegState::ImplicitDefine);
} else if (MI.getOpcode() == PPC::SETRND) {
DebugLoc dl = MI.getDebugLoc();
@@ -12841,6 +12086,20 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.addReg(NewFPSCRReg)
.addImm(0)
.addImm(0);
+ } else if (MI.getOpcode() == PPC::SETFLM) {
+ DebugLoc Dl = MI.getDebugLoc();
+
+ // Result of setflm is previous FPSCR content, so we need to save it first.
+ Register OldFPSCRReg = MI.getOperand(0).getReg();
+ BuildMI(*BB, MI, Dl, TII->get(PPC::MFFS), OldFPSCRReg);
+
+ // Put bits in 32:63 to FPSCR.
+ Register NewFPSCRReg = MI.getOperand(1).getReg();
+ BuildMI(*BB, MI, Dl, TII->get(PPC::MTFSF))
+ .addImm(255)
+ .addReg(NewFPSCRReg)
+ .addImm(0)
+ .addImm(0);
} else if (MI.getOpcode() == PPC::PROBED_ALLOCA_32 ||
MI.getOpcode() == PPC::PROBED_ALLOCA_64) {
return emitProbedAlloca(MI, BB);
@@ -12867,6 +12126,47 @@ static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) {
return RefinementSteps;
}
+SDValue PPCTargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
+ const DenormalMode &Mode) const {
+ // We only have VSX Vector Test for software Square Root.
+ EVT VT = Op.getValueType();
+ if (!isTypeLegal(MVT::i1) ||
+ (VT != MVT::f64 &&
+ ((VT != MVT::v2f64 && VT != MVT::v4f32) || !Subtarget.hasVSX())))
+ return TargetLowering::getSqrtInputTest(Op, DAG, Mode);
+
+ SDLoc DL(Op);
+ // The output register of FTSQRT is a CR field.
+ SDValue FTSQRT = DAG.getNode(PPCISD::FTSQRT, DL, MVT::i32, Op);
+ // ftsqrt BF,FRB
+ // Let e_b be the unbiased exponent of the double-precision
+ // floating-point operand in register FRB.
+ // fe_flag is set to 1 if either of the following conditions occurs.
+ // - The double-precision floating-point operand in register FRB is a zero,
+ // a NaN, or an infinity, or a negative value.
+ // - e_b is less than or equal to -970.
+ // Otherwise fe_flag is set to 0.
+ // Both VSX and non-VSX versions would set EQ bit in the CR if the number is
+ // not eligible for iteration. (zero/negative/infinity/nan or unbiased
+ // exponent is less than or equal to -970)
+ SDValue SRIdxVal = DAG.getTargetConstant(PPC::sub_eq, DL, MVT::i32);
+ return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i1,
+ FTSQRT, SRIdxVal),
+ 0);
+}
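
A scalar approximation (illustrative only; the hardware inspects the raw double-precision encoding) of the fe_flag condition described in the comment above, i.e. the inputs for which the estimate-plus-refinement sequence is skipped:

#include <cassert>
#include <cmath>

// Roughly the condition under which ftsqrt reports the input as not eligible
// for iterative refinement: zero, negative, infinity, NaN, or an unbiased
// exponent less than or equal to -970.
static bool sqrtInputNeedsFixup(double x) {
  return x == 0.0 || std::signbit(x) || std::isinf(x) || std::isnan(x) ||
         std::ilogb(x) <= -970;
}

int main() {
  assert(sqrtInputNeedsFixup(0.0));
  assert(sqrtInputNeedsFixup(-1.0));
  assert(sqrtInputNeedsFixup(std::ldexp(1.0, -1000))); // tiny exponent
  assert(!sqrtInputNeedsFixup(2.0));
  return 0;
}
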
+
+SDValue
+PPCTargetLowering::getSqrtResultForDenormInput(SDValue Op,
+ SelectionDAG &DAG) const {
+ // We only have VSX Vector Square Root.
+ EVT VT = Op.getValueType();
+ if (VT != MVT::f64 &&
+ ((VT != MVT::v2f64 && VT != MVT::v4f32) || !Subtarget.hasVSX()))
+ return TargetLowering::getSqrtResultForDenormInput(Op, DAG);
+
+ return DAG.getNode(PPCISD::FSQRT, SDLoc(Op), VT, Op);
+}
+
SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
int Enabled, int &RefinementSteps,
bool &UseOneConstNR,
@@ -12875,9 +12175,7 @@ SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
(VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
(VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
- (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
- (VT == MVT::v4f32 && Subtarget.hasQPX()) ||
- (VT == MVT::v4f64 && Subtarget.hasQPX())) {
+ (VT == MVT::v2f64 && Subtarget.hasVSX())) {
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
@@ -12896,9 +12194,7 @@ SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
(VT == MVT::f64 && Subtarget.hasFRE()) ||
(VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
- (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
- (VT == MVT::v4f32 && Subtarget.hasQPX()) ||
- (VT == MVT::v4f64 && Subtarget.hasQPX())) {
+ (VT == MVT::v2f64 && Subtarget.hasVSX())) {
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
return DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand);
@@ -12996,24 +12292,6 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
EVT VT;
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
default: return false;
- case Intrinsic::ppc_qpx_qvlfd:
- case Intrinsic::ppc_qpx_qvlfda:
- VT = MVT::v4f64;
- break;
- case Intrinsic::ppc_qpx_qvlfs:
- case Intrinsic::ppc_qpx_qvlfsa:
- VT = MVT::v4f32;
- break;
- case Intrinsic::ppc_qpx_qvlfcd:
- case Intrinsic::ppc_qpx_qvlfcda:
- VT = MVT::v2f64;
- break;
- case Intrinsic::ppc_qpx_qvlfcs:
- case Intrinsic::ppc_qpx_qvlfcsa:
- VT = MVT::v2f32;
- break;
- case Intrinsic::ppc_qpx_qvlfiwa:
- case Intrinsic::ppc_qpx_qvlfiwz:
case Intrinsic::ppc_altivec_lvx:
case Intrinsic::ppc_altivec_lvxl:
case Intrinsic::ppc_vsx_lxvw4x:
@@ -13042,24 +12320,6 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
EVT VT;
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
default: return false;
- case Intrinsic::ppc_qpx_qvstfd:
- case Intrinsic::ppc_qpx_qvstfda:
- VT = MVT::v4f64;
- break;
- case Intrinsic::ppc_qpx_qvstfs:
- case Intrinsic::ppc_qpx_qvstfsa:
- VT = MVT::v4f32;
- break;
- case Intrinsic::ppc_qpx_qvstfcd:
- case Intrinsic::ppc_qpx_qvstfcda:
- VT = MVT::v2f64;
- break;
- case Intrinsic::ppc_qpx_qvstfcs:
- case Intrinsic::ppc_qpx_qvstfcsa:
- VT = MVT::v2f32;
- break;
- case Intrinsic::ppc_qpx_qvstfiw:
- case Intrinsic::ppc_qpx_qvstfiwa:
case Intrinsic::ppc_altivec_stvx:
case Intrinsic::ppc_altivec_stvxl:
case Intrinsic::ppc_vsx_stxvw4x:
@@ -13287,11 +12547,13 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
KnownBits Op2Known = DAG.computeKnownBits(N->getOperand(1));
// We don't really care about what is known about the first bit (if
- // anything), so clear it in all masks prior to comparing them.
- Op1Known.Zero.clearBit(0); Op1Known.One.clearBit(0);
- Op2Known.Zero.clearBit(0); Op2Known.One.clearBit(0);
+ // anything), so pretend that it is known zero for both to ensure they can
+ // be compared as constants.
+ Op1Known.Zero.setBit(0); Op1Known.One.clearBit(0);
+ Op2Known.Zero.setBit(0); Op2Known.One.clearBit(0);
- if (Op1Known.Zero != Op2Known.Zero || Op1Known.One != Op2Known.One)
+ if (!Op1Known.isConstant() || !Op2Known.isConstant() ||
+ Op1Known.getConstant() != Op2Known.getConstant())
return SDValue();
}
}
@@ -13343,8 +12605,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
// Visit all inputs, collect all binary operations (and, or, xor and
// select) that are all fed by extensions.
while (!BinOps.empty()) {
- SDValue BinOp = BinOps.back();
- BinOps.pop_back();
+ SDValue BinOp = BinOps.pop_back_val();
if (!Visited.insert(BinOp.getNode()).second)
continue;
@@ -13559,8 +12820,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
// Visit all inputs, collect all binary operations (and, or, xor and
// select) that are all fed by truncations.
while (!BinOps.empty()) {
- SDValue BinOp = BinOps.back();
- BinOps.pop_back();
+ SDValue BinOp = BinOps.pop_back_val();
if (!Visited.insert(BinOp.getNode()).second)
continue;
@@ -14157,6 +13417,46 @@ static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+// Look for the pattern of a load from a narrow width to i128, feeding
+// into a BUILD_VECTOR of v1i128. Replace this sequence with a PPCISD node
+// (LXVRZX). This node represents a zero extending load that will be matched
+// to the Load VSX Vector Rightmost instructions.
+static SDValue combineBVZEXTLOAD(SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+
+ // This combine is only eligible for a BUILD_VECTOR of v1i128.
+ if (N->getValueType(0) != MVT::v1i128)
+ return SDValue();
+
+ SDValue Operand = N->getOperand(0);
+ // Proceed with the transformation if the operand to the BUILD_VECTOR
+ // is a load instruction.
+ if (Operand.getOpcode() != ISD::LOAD)
+ return SDValue();
+
+ LoadSDNode *LD = dyn_cast<LoadSDNode>(Operand);
+ EVT MemoryType = LD->getMemoryVT();
+
+ // This transformation is only valid if we are loading either a byte,
+ // halfword, word, or doubleword.
+ bool ValidLDType = MemoryType == MVT::i8 || MemoryType == MVT::i16 ||
+ MemoryType == MVT::i32 || MemoryType == MVT::i64;
+
+ // Ensure that the load from the narrow width is being zero extended to i128.
+ if (!ValidLDType ||
+ (LD->getExtensionType() != ISD::ZEXTLOAD &&
+ LD->getExtensionType() != ISD::EXTLOAD))
+ return SDValue();
+
+ SDValue LoadOps[] = {
+ LD->getChain(), LD->getBasePtr(),
+ DAG.getIntPtrConstant(MemoryType.getScalarSizeInBits(), DL)};
+
+ return DAG.getMemIntrinsicNode(PPCISD::LXVRZX, DL,
+ DAG.getVTList(MVT::v1i128, MVT::Other),
+ LoadOps, MemoryType, LD->getMemOperand());
+}
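
An aside on the invariant this combine depends on: a zero- or any-extending narrow load leaves every bit above the loaded width clear, which is what LXVRZX provides; a sign-extending load would not. The host-side model below uses the __int128 extension available in Clang/GCC and is illustrative only.

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // A zero-extending load of a narrow value into a 128-bit element leaves all
  // bits above the loaded width clear -- the reason a sign-extending input is
  // rejected by the combine above.
  const unsigned char mem[2] = {0x34, 0x92};     // an i16 in little-endian memory
  uint16_t narrow;
  std::memcpy(&narrow, mem, sizeof(narrow));     // the narrow-width load (0x9234)
  unsigned __int128 wide = narrow;               // zero extension to the v1i128 element
  assert((uint64_t)wide == 0x9234 && (uint64_t)(wide >> 64) == 0);
  // Had the load been sign-extending, bit 15 of 0x9234 would have smeared into
  // the upper bits and an LXVRZX replacement would change the value.
  __int128 swide = (int16_t)narrow;
  assert((uint64_t)swide != 0x9234);
  return 0;
}
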
+
SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
DAGCombinerInfo &DCI) const {
assert(N->getOpcode() == ISD::BUILD_VECTOR &&
@@ -14194,6 +13494,14 @@ SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
return Reduced;
}
+ // On Power10, the Load VSX Vector Rightmost instructions can be utilized
+ // if this is a BUILD_VECTOR of v1i128, and if the operand to the BUILD_VECTOR
+ // is a load from <valid narrow width> to i128.
+ if (Subtarget.isISA3_1()) {
+ SDValue BVOfZLoad = combineBVZEXTLOAD(N, DAG);
+ if (BVOfZLoad)
+ return BVOfZLoad;
+ }
if (N->getValueType(0) != MVT::v2f64)
return SDValue();
@@ -14495,8 +13803,7 @@ SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N,
EVT Op1VT = N->getOperand(1).getValueType();
EVT ResVT = Val.getValueType();
- // Floating point types smaller than 32 bits are not legal on Power.
- if (ResVT.getScalarSizeInBits() < 32)
+ if (!isTypeLegal(ResVT))
return SDValue();
// Only perform combine for conversion to i64/i32 or power9 i16/i8.
@@ -14590,7 +13897,6 @@ static void fixupShuffleMaskForPermutedSToV(SmallVectorImpl<int> &ShuffV,
if ((Idx >= 0 && Idx < LHSMaxIdx) || (Idx >= RHSMinIdx && Idx < RHSMaxIdx))
ShuffV[i] += HalfVec;
}
- return;
}
// Replace a SCALAR_TO_VECTOR with a SCALAR_TO_VECTOR_PERMUTED except if
@@ -15059,18 +14365,14 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
EVT MemVT = LD->getMemoryVT();
Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
Align ABIAlignment = DAG.getDataLayout().getABITypeAlign(Ty);
- Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext());
- Align ScalarABIAlignment = DAG.getDataLayout().getABITypeAlign(STy);
if (LD->isUnindexed() && VT.isVector() &&
((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) &&
// P8 and later hardware should just use LOAD.
!Subtarget.hasP8Vector() &&
(VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
- VT == MVT::v4f32)) ||
- (Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) &&
- LD->getAlign() >= ScalarABIAlignment)) &&
+ VT == MVT::v4f32))) &&
LD->getAlign() < ABIAlignment) {
- // This is a type-legal unaligned Altivec or QPX load.
+ // This is a type-legal unaligned Altivec load.
SDValue Chain = LD->getChain();
SDValue Ptr = LD->getBasePtr();
bool isLittleEndian = Subtarget.isLittleEndian();
@@ -15101,24 +14403,13 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
// optimization later.
Intrinsic::ID Intr, IntrLD, IntrPerm;
MVT PermCntlTy, PermTy, LDTy;
- if (Subtarget.hasAltivec()) {
- Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr :
- Intrinsic::ppc_altivec_lvsl;
- IntrLD = Intrinsic::ppc_altivec_lvx;
- IntrPerm = Intrinsic::ppc_altivec_vperm;
- PermCntlTy = MVT::v16i8;
- PermTy = MVT::v4i32;
- LDTy = MVT::v4i32;
- } else {
- Intr = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlpcld :
- Intrinsic::ppc_qpx_qvlpcls;
- IntrLD = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlfd :
- Intrinsic::ppc_qpx_qvlfs;
- IntrPerm = Intrinsic::ppc_qpx_qvfperm;
- PermCntlTy = MVT::v4f64;
- PermTy = MVT::v4f64;
- LDTy = MemVT.getSimpleVT();
- }
+ Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr
+ : Intrinsic::ppc_altivec_lvsl;
+ IntrLD = Intrinsic::ppc_altivec_lvx;
+ IntrPerm = Intrinsic::ppc_altivec_vperm;
+ PermCntlTy = MVT::v16i8;
+ PermTy = MVT::v4i32;
+ LDTy = MVT::v4i32;
SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy);
@@ -15189,10 +14480,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
BaseLoad, ExtraLoad, PermCntl, DAG, dl);
if (VT != PermTy)
- Perm = Subtarget.hasAltivec() ?
- DAG.getNode(ISD::BITCAST, dl, VT, Perm) :
- DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, // QPX
- DAG.getTargetConstant(1, dl, MVT::i64));
+ Perm = Subtarget.hasAltivec()
+ ? DAG.getNode(ISD::BITCAST, dl, VT, Perm)
+ : DAG.getNode(ISD::FP_ROUND, dl, VT, Perm,
+ DAG.getTargetConstant(1, dl, MVT::i64));
// second argument is 1 because this rounding
// is always exact.
@@ -15208,14 +14499,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr
: Intrinsic::ppc_altivec_lvsl);
- if ((IID == Intr ||
- IID == Intrinsic::ppc_qpx_qvlpcld ||
- IID == Intrinsic::ppc_qpx_qvlpcls) &&
- N->getOperand(1)->getOpcode() == ISD::ADD) {
+ if (IID == Intr && N->getOperand(1)->getOpcode() == ISD::ADD) {
SDValue Add = N->getOperand(1);
- int Bits = IID == Intrinsic::ppc_qpx_qvlpcld ?
- 5 /* 32 byte alignment */ : 4 /* 16 byte alignment */;
+ int Bits = 4 /* 16 byte alignment */;
if (DAG.MaskedValueIsZero(Add->getOperand(1),
APInt::getAllOnesValue(Bits /* alignment */)
@@ -15225,7 +14512,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
UE = BasePtr->use_end();
UI != UE; ++UI) {
if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
- cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() == IID) {
+ cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() ==
+ IID) {
// We've found another LVSL/LVSR, and this address is an aligned
// multiple of that one. The results will be the same, so use the
// one we've just found instead.
@@ -15357,43 +14645,43 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
break;
case PPCISD::VCMP:
- // If a VCMPo node already exists with exactly the same operands as this
- // node, use its result instead of this node (VCMPo computes both a CR6 and
- // a normal output).
+ // If a VCMP_rec node already exists with exactly the same operands as this
+ // node, use its result instead of this node (VCMP_rec computes both a CR6
+ // and a normal output).
//
if (!N->getOperand(0).hasOneUse() &&
!N->getOperand(1).hasOneUse() &&
!N->getOperand(2).hasOneUse()) {
- // Scan all of the users of the LHS, looking for VCMPo's that match.
- SDNode *VCMPoNode = nullptr;
+ // Scan all of the users of the LHS, looking for VCMP_rec's that match.
+ SDNode *VCMPrecNode = nullptr;
SDNode *LHSN = N->getOperand(0).getNode();
for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end();
UI != E; ++UI)
- if (UI->getOpcode() == PPCISD::VCMPo &&
+ if (UI->getOpcode() == PPCISD::VCMP_rec &&
UI->getOperand(1) == N->getOperand(1) &&
UI->getOperand(2) == N->getOperand(2) &&
UI->getOperand(0) == N->getOperand(0)) {
- VCMPoNode = *UI;
+ VCMPrecNode = *UI;
break;
}
- // If there is no VCMPo node, or if the flag value has a single use, don't
- // transform this.
- if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1))
+ // If there is no VCMP_rec node, or if the flag value has a single use,
+ // don't transform this.
+ if (!VCMPrecNode || VCMPrecNode->hasNUsesOfValue(0, 1))
break;
// Look at the (necessarily single) use of the flag value. If it has a
// chain, this transformation is more complex. Note that multiple things
// could use the value result, which we should ignore.
SDNode *FlagUser = nullptr;
- for (SDNode::use_iterator UI = VCMPoNode->use_begin();
+ for (SDNode::use_iterator UI = VCMPrecNode->use_begin();
FlagUser == nullptr; ++UI) {
- assert(UI != VCMPoNode->use_end() && "Didn't find user!");
+ assert(UI != VCMPrecNode->use_end() && "Didn't find user!");
SDNode *User = *UI;
for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
- if (User->getOperand(i) == SDValue(VCMPoNode, 1)) {
+ if (User->getOperand(i) == SDValue(VCMPrecNode, 1)) {
FlagUser = User;
break;
}
@@ -15403,7 +14691,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
// If the user is a MFOCRF instruction, we know this is safe.
// Otherwise we give up for right now.
if (FlagUser->getOpcode() == PPCISD::MFOCRF)
- return SDValue(VCMPoNode, 0);
+ return SDValue(VCMPrecNode, 0);
}
break;
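A hedged user-level illustration of the reuse described above, assuming the usual altivec.h compare/predicate builtins: when the same operands feed both a mask compare and a predicate compare, the record-form compare can serve both uses.

  #include <altivec.h>

  // vec_any_eq lowers to the record form (vcmpequw., i.e. VCMP_rec, which
  // sets CR6), and the plain vec_cmpeq mask can reuse its vector result.
  vector bool int cmp_and_test(vector int a, vector int b, int *any) {
    *any = vec_any_eq(a, b);
    return vec_cmpeq(a, b);
  }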
case ISD::BRCOND: {
@@ -15492,7 +14780,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
DAG.getConstant(CompareOpc, dl, MVT::i32)
};
EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue };
- SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);
+ SDValue CompNode = DAG.getNode(PPCISD::VCMP_rec, dl, VTs, Ops);
// Unpack the result based on how the target uses it.
PPC::Predicate CompOpc;
@@ -15587,16 +14875,19 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
case Intrinsic::ppc_altivec_vcmpequh_p:
case Intrinsic::ppc_altivec_vcmpequw_p:
case Intrinsic::ppc_altivec_vcmpequd_p:
+ case Intrinsic::ppc_altivec_vcmpequq_p:
case Intrinsic::ppc_altivec_vcmpgefp_p:
case Intrinsic::ppc_altivec_vcmpgtfp_p:
case Intrinsic::ppc_altivec_vcmpgtsb_p:
case Intrinsic::ppc_altivec_vcmpgtsh_p:
case Intrinsic::ppc_altivec_vcmpgtsw_p:
case Intrinsic::ppc_altivec_vcmpgtsd_p:
+ case Intrinsic::ppc_altivec_vcmpgtsq_p:
case Intrinsic::ppc_altivec_vcmpgtub_p:
case Intrinsic::ppc_altivec_vcmpgtuh_p:
case Intrinsic::ppc_altivec_vcmpgtuw_p:
case Intrinsic::ppc_altivec_vcmpgtud_p:
+ case Intrinsic::ppc_altivec_vcmpgtuq_p:
Known.Zero = ~1U; // All bits but the low one are known to be zero.
break;
}
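A small example of why only the low bit can be set, assuming the altivec.h vec_all_* / vec_any_* wrappers lower to these *_p predicate intrinsics:

  #include <altivec.h>

  // The *_p predicate intrinsics only ever produce 0 or 1, which is exactly
  // what the known-bits entry above encodes.
  int all_equal(vector unsigned int a, vector unsigned int b) {
    return vec_all_eq(a, b);
  }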
@@ -15774,17 +15065,9 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &PPC::F4RCRegClass);
if (VT == MVT::f64 || VT == MVT::i64)
return std::make_pair(0U, &PPC::F8RCRegClass);
- if (VT == MVT::v4f64 && Subtarget.hasQPX())
- return std::make_pair(0U, &PPC::QFRCRegClass);
- if (VT == MVT::v4f32 && Subtarget.hasQPX())
- return std::make_pair(0U, &PPC::QSRCRegClass);
}
break;
case 'v':
- if (VT == MVT::v4f64 && Subtarget.hasQPX())
- return std::make_pair(0U, &PPC::QFRCRegClass);
- if (VT == MVT::v4f32 && Subtarget.hasQPX())
- return std::make_pair(0U, &PPC::QSRCRegClass);
if (Subtarget.hasAltivec())
return std::make_pair(0U, &PPC::VRRCRegClass);
break;
@@ -15920,9 +15203,15 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
// by AM is legal for this target, for a load/store of the specified type.
bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
- unsigned AS, Instruction *I) const {
- // PPC does not allow r+i addressing modes for vectors!
- if (Ty->isVectorTy() && AM.BaseOffs != 0)
+ unsigned AS,
+ Instruction *I) const {
+ // The vector r+i form has been supported since Power9 as the DQ form. We don't
+ // check the DQ-form offset requirement (off % 16 == 0) here because, on
+ // PowerPC, the immediate form is preferred and the offset can be adjusted to
+ // fit it later in the PPCLoopInstrFormPrep pass. Also, since LSR uses only the
+ // min and max offsets of an LSRUse to check addressing-mode legality, we should
+ // be a little aggressive here so the other offsets of that LSRUse are covered.
+ if (Ty->isVectorTy() && AM.BaseOffs != 0 && !Subtarget.hasP9Vector())
return false;
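A rough standalone sketch of the DQ-form displacement requirement mentioned above, namely a signed 16-bit offset that is also a multiple of 16; this states an assumption about the eventual encoding constraint, not the LLVM hook itself, which deliberately stays looser and defers fixes to PPCLoopInstrFormPrep:

  #include <cstdint>

  // DQ-form displacement: signed 16-bit and a multiple of 16.
  bool isDQFormDisplacement(int64_t Off) {
    return Off >= INT16_MIN && Off <= INT16_MAX && (Off % 16) == 0;
  }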
// PPC allows a sign-extended 16-bit immediate field.
@@ -16076,19 +15365,17 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineFunction &MF,
unsigned Intrinsic) const {
switch (Intrinsic) {
- case Intrinsic::ppc_qpx_qvlfd:
- case Intrinsic::ppc_qpx_qvlfs:
- case Intrinsic::ppc_qpx_qvlfcd:
- case Intrinsic::ppc_qpx_qvlfcs:
- case Intrinsic::ppc_qpx_qvlfiwa:
- case Intrinsic::ppc_qpx_qvlfiwz:
case Intrinsic::ppc_altivec_lvx:
case Intrinsic::ppc_altivec_lvxl:
case Intrinsic::ppc_altivec_lvebx:
case Intrinsic::ppc_altivec_lvehx:
case Intrinsic::ppc_altivec_lvewx:
case Intrinsic::ppc_vsx_lxvd2x:
- case Intrinsic::ppc_vsx_lxvw4x: {
+ case Intrinsic::ppc_vsx_lxvw4x:
+ case Intrinsic::ppc_vsx_lxvd2x_be:
+ case Intrinsic::ppc_vsx_lxvw4x_be:
+ case Intrinsic::ppc_vsx_lxvl:
+ case Intrinsic::ppc_vsx_lxvll: {
EVT VT;
switch (Intrinsic) {
case Intrinsic::ppc_altivec_lvebx:
@@ -16101,20 +15388,9 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
VT = MVT::i32;
break;
case Intrinsic::ppc_vsx_lxvd2x:
+ case Intrinsic::ppc_vsx_lxvd2x_be:
VT = MVT::v2f64;
break;
- case Intrinsic::ppc_qpx_qvlfd:
- VT = MVT::v4f64;
- break;
- case Intrinsic::ppc_qpx_qvlfs:
- VT = MVT::v4f32;
- break;
- case Intrinsic::ppc_qpx_qvlfcd:
- VT = MVT::v2f64;
- break;
- case Intrinsic::ppc_qpx_qvlfcs:
- VT = MVT::v2f32;
- break;
default:
VT = MVT::v4i32;
break;
@@ -16129,52 +15405,17 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOLoad;
return true;
}
- case Intrinsic::ppc_qpx_qvlfda:
- case Intrinsic::ppc_qpx_qvlfsa:
- case Intrinsic::ppc_qpx_qvlfcda:
- case Intrinsic::ppc_qpx_qvlfcsa:
- case Intrinsic::ppc_qpx_qvlfiwaa:
- case Intrinsic::ppc_qpx_qvlfiwza: {
- EVT VT;
- switch (Intrinsic) {
- case Intrinsic::ppc_qpx_qvlfda:
- VT = MVT::v4f64;
- break;
- case Intrinsic::ppc_qpx_qvlfsa:
- VT = MVT::v4f32;
- break;
- case Intrinsic::ppc_qpx_qvlfcda:
- VT = MVT::v2f64;
- break;
- case Intrinsic::ppc_qpx_qvlfcsa:
- VT = MVT::v2f32;
- break;
- default:
- VT = MVT::v4i32;
- break;
- }
-
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = VT;
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.size = VT.getStoreSize();
- Info.align = Align(1);
- Info.flags = MachineMemOperand::MOLoad;
- return true;
- }
- case Intrinsic::ppc_qpx_qvstfd:
- case Intrinsic::ppc_qpx_qvstfs:
- case Intrinsic::ppc_qpx_qvstfcd:
- case Intrinsic::ppc_qpx_qvstfcs:
- case Intrinsic::ppc_qpx_qvstfiw:
case Intrinsic::ppc_altivec_stvx:
case Intrinsic::ppc_altivec_stvxl:
case Intrinsic::ppc_altivec_stvebx:
case Intrinsic::ppc_altivec_stvehx:
case Intrinsic::ppc_altivec_stvewx:
case Intrinsic::ppc_vsx_stxvd2x:
- case Intrinsic::ppc_vsx_stxvw4x: {
+ case Intrinsic::ppc_vsx_stxvw4x:
+ case Intrinsic::ppc_vsx_stxvd2x_be:
+ case Intrinsic::ppc_vsx_stxvw4x_be:
+ case Intrinsic::ppc_vsx_stxvl:
+ case Intrinsic::ppc_vsx_stxvll: {
EVT VT;
switch (Intrinsic) {
case Intrinsic::ppc_altivec_stvebx:
@@ -16187,20 +15428,9 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
VT = MVT::i32;
break;
case Intrinsic::ppc_vsx_stxvd2x:
+ case Intrinsic::ppc_vsx_stxvd2x_be:
VT = MVT::v2f64;
break;
- case Intrinsic::ppc_qpx_qvstfd:
- VT = MVT::v4f64;
- break;
- case Intrinsic::ppc_qpx_qvstfs:
- VT = MVT::v4f32;
- break;
- case Intrinsic::ppc_qpx_qvstfcd:
- VT = MVT::v2f64;
- break;
- case Intrinsic::ppc_qpx_qvstfcs:
- VT = MVT::v2f32;
- break;
default:
VT = MVT::v4i32;
break;
@@ -16215,39 +15445,6 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOStore;
return true;
}
- case Intrinsic::ppc_qpx_qvstfda:
- case Intrinsic::ppc_qpx_qvstfsa:
- case Intrinsic::ppc_qpx_qvstfcda:
- case Intrinsic::ppc_qpx_qvstfcsa:
- case Intrinsic::ppc_qpx_qvstfiwa: {
- EVT VT;
- switch (Intrinsic) {
- case Intrinsic::ppc_qpx_qvstfda:
- VT = MVT::v4f64;
- break;
- case Intrinsic::ppc_qpx_qvstfsa:
- VT = MVT::v4f32;
- break;
- case Intrinsic::ppc_qpx_qvstfcda:
- VT = MVT::v2f64;
- break;
- case Intrinsic::ppc_qpx_qvstfcsa:
- VT = MVT::v2f32;
- break;
- default:
- VT = MVT::v4i32;
- break;
- }
-
- Info.opc = ISD::INTRINSIC_VOID;
- Info.memVT = VT;
- Info.ptrVal = I.getArgOperand(1);
- Info.offset = 0;
- Info.size = VT.getStoreSize();
- Info.align = Align(1);
- Info.flags = MachineMemOperand::MOStore;
- return true;
- }
default:
break;
}
@@ -16260,14 +15457,6 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
EVT PPCTargetLowering::getOptimalMemOpType(
const MemOp &Op, const AttributeList &FuncAttributes) const {
if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
- // When expanding a memset, require at least two QPX instructions to cover
- // the cost of loading the value to be stored from the constant pool.
- if (Subtarget.hasQPX() && Op.size() >= 32 &&
- (Op.isMemcpy() || Op.size() >= 64) && Op.isAligned(Align(32)) &&
- !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
- return MVT::v4f64;
- }
-
// We should use Altivec/VSX loads and stores when available. For unaligned
// addresses, unaligned VSX loads are only fast starting with the P8.
if (Subtarget.hasAltivec() && Op.size() >= 16 &&
@@ -16386,6 +15575,33 @@ bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
return true;
}
+bool PPCTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
+ SDValue C) const {
+ // Check integral scalar types.
+ if (!VT.isScalarInteger())
+ return false;
+ if (auto *ConstNode = dyn_cast<ConstantSDNode>(C.getNode())) {
+ if (!ConstNode->getAPIntValue().isSignedIntN(64))
+ return false;
+ // This transformation will generate >= 2 operations. But the following
+ // cases will generate <= 2 instructions during ISEL, so exclude them:
+ // 1. If the constant multiplier fits in 16 bits, it can be handled by a
+ // single HW instruction, i.e. MULLI.
+ // 2. If the multiplier fits in 16 bits after shifting out its trailing
+ // zeros, only one extra shift is needed compared to case 1, i.e. MULLI
+ // and RLDICR.
+ int64_t Imm = ConstNode->getSExtValue();
+ unsigned Shift = countTrailingZeros<uint64_t>(Imm);
+ Imm >>= Shift;
+ if (isInt<16>(Imm))
+ return false;
+ uint64_t UImm = static_cast<uint64_t>(Imm);
+ if (isPowerOf2_64(UImm + 1) || isPowerOf2_64(UImm - 1) ||
+ isPowerOf2_64(1 - UImm) || isPowerOf2_64(-1 - UImm))
+ return true;
+ }
+ return false;
+}
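A standalone sketch of the same constant screen in plain C++ (the helper names are invented for illustration; only the shape of the check mirrors the hook above):

  #include <cstdint>

  static bool isPow2(uint64_t V) { return V && (V & (V - 1)) == 0; }

  // True when a multiply by Imm is worth decomposing into shift/add/sub.
  bool worthDecomposing(int64_t Imm) {
    if (Imm == 0)
      return false;
    unsigned Shift = __builtin_ctzll((uint64_t)Imm); // strip trailing zeros
    Imm >>= Shift;
    if (Imm >= INT16_MIN && Imm <= INT16_MAX)        // MULLI (+ RLDICR for Shift)
      return false;
    uint64_t U = (uint64_t)Imm;
    return isPow2(U + 1) || isPow2(U - 1) || isPow2(1 - U) || isPow2(-1 - U);
  }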
+
bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const {
return isFMAFasterThanFMulAndFAdd(
@@ -16405,31 +15621,56 @@ bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
}
}
-// Currently this is a copy from AArch64TargetLowering::isProfitableToHoist.
-// FIXME: add more patterns which are profitable to hoist.
+// FIXME: add more patterns which are not profitable to hoist.
bool PPCTargetLowering::isProfitableToHoist(Instruction *I) const {
- if (I->getOpcode() != Instruction::FMul)
- return true;
-
if (!I->hasOneUse())
return true;
Instruction *User = I->user_back();
assert(User && "A single use instruction with no uses.");
- if (User->getOpcode() != Instruction::FSub &&
- User->getOpcode() != Instruction::FAdd)
- return true;
+ switch (I->getOpcode()) {
+ case Instruction::FMul: {
+ // Don't break FMA, PowerPC prefers FMA.
+ if (User->getOpcode() != Instruction::FSub &&
+ User->getOpcode() != Instruction::FAdd)
+ return true;
- const TargetOptions &Options = getTargetMachine().Options;
- const Function *F = I->getFunction();
- const DataLayout &DL = F->getParent()->getDataLayout();
- Type *Ty = User->getOperand(0)->getType();
+ const TargetOptions &Options = getTargetMachine().Options;
+ const Function *F = I->getFunction();
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ Type *Ty = User->getOperand(0)->getType();
+
+ return !(
+ isFMAFasterThanFMulAndFAdd(*F, Ty) &&
+ isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
+ (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath));
+ }
+ case Instruction::Load: {
+ // Don't break the "store (load float*)" pattern; it will be combined into
+ // "store (load int32)" by a later InstCombine pass (see function
+ // combineLoadToOperationType). On PowerPC, loading a floating-point value
+ // takes more cycles than loading a 32-bit integer.
+ LoadInst *LI = cast<LoadInst>(I);
+ // For loads that combineLoadToOperationType leaves alone, such as ordered
+ // loads, it should still be profitable to hoist them.
+ // A swifterror load can only have pointer-to-pointer type, so the type
+ // check below already gets rid of that case.
+ if (!LI->isUnordered())
+ return true;
+
+ if (User->getOpcode() != Instruction::Store)
+ return true;
+
+ if (I->getType()->getTypeID() != Type::FloatTyID)
+ return true;
- return !(
- isFMAFasterThanFMulAndFAdd(*F, Ty) &&
- isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
- (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath));
+ return false;
+ }
+ default:
+ return true;
+ }
+ return true;
}
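A hedged example of the float load/store pattern the new Load case protects; the later InstCombine rewrite is the one the comment above refers to:

  // Illustration only: a float-to-float copy like this is expected to become
  // an i32 load/store after the InstCombine rewrite mentioned above, so the
  // load should stay next to its single store user rather than being hoisted.
  void copy_float(float *dst, const float *src) {
    *dst = *src;
  }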
const MCPhysReg *
@@ -16461,7 +15702,7 @@ PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
if (VT == MVT::v2i64)
return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves
- if (Subtarget.hasVSX() || Subtarget.hasQPX())
+ if (Subtarget.hasVSX())
return true;
return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
@@ -16507,8 +15748,7 @@ SDValue PPCTargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
switch (Opc) {
case PPCISD::FNMSUB:
- // TODO: QPX subtarget is deprecated. No transformation here.
- if (!Op.hasOneUse() || !isTypeLegal(VT) || Subtarget.hasQPX())
+ if (!Op.hasOneUse() || !isTypeLegal(VT))
break;
const TargetOptions &Options = getTargetMachine().Options;
@@ -16637,10 +15877,10 @@ SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
SDValue N0 = N->getOperand(0);
ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (!Subtarget.isISA3_0() ||
+ if (!Subtarget.isISA3_0() || !Subtarget.isPPC64() ||
N0.getOpcode() != ISD::SIGN_EXTEND ||
- N0.getOperand(0).getValueType() != MVT::i32 ||
- CN1 == nullptr || N->getValueType(0) != MVT::i64)
+ N0.getOperand(0).getValueType() != MVT::i32 || CN1 == nullptr ||
+ N->getValueType(0) != MVT::i64)
return SDValue();
// We can't save an operation here if the value is already extended, and
@@ -16989,8 +16229,7 @@ SDValue PPCTargetLowering::combineFMALike(SDNode *N,
bool LegalOps = !DCI.isBeforeLegalizeOps();
SDLoc Loc(N);
- // TODO: QPX subtarget is deprecated. No transformation here.
- if (Subtarget.hasQPX() || !isOperationLegal(ISD::FMA, VT))
+ if (!isOperationLegal(ISD::FMA, VT))
return SDValue();
// Allowing transformation to FNMSUB may change sign of zeroes when ab-c=0
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 768eaa43e013..477105bd03ac 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -89,6 +89,12 @@ namespace llvm {
FRE,
FRSQRTE,
+ /// Test instruction for software square root.
+ FTSQRT,
+
+ /// Square root instruction.
+ FSQRT,
+
/// VPERM - The PPC VPERM Instruction.
///
VPERM,
@@ -146,8 +152,7 @@ namespace llvm {
/// probed.
PROBED_ALLOCA,
- /// GlobalBaseReg - On Darwin, this node represents the result of the mflr
- /// at function entry, used for PIC code.
+ /// The result of the mflr at function entry, used for PIC code.
GlobalBaseReg,
/// These nodes represent PPC shifts.
@@ -265,11 +270,11 @@ namespace llvm {
/// is VCMPGTSH.
VCMP,
- /// RESVEC, OUTFLAG = VCMPo(LHS, RHS, OPC) - Represents one of the
- /// altivec VCMP*o instructions. For lack of better number, we use the
+ /// RESVEC, OUTFLAG = VCMP_rec(LHS, RHS, OPC) - Represents one of the
+ /// altivec VCMP*_rec instructions. For lack of better number, we use the
/// opcode number encoding for the OPC field to identify the compare. For
/// example, 838 is VCMPGTSH.
- VCMPo,
+ VCMP_rec,
/// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This
/// corresponds to the COND_BRANCH pseudo instruction. CRRC is the
@@ -381,6 +386,10 @@ namespace llvm {
/// sym\@got\@dtprel\@l.
ADDI_DTPREL_L,
+ /// G8RC = PADDI_DTPREL %x3, Symbol - For the pc-rel based local-dynamic TLS
+ /// model, produces a PADDI8 instruction that adds X3 to sym\@dtprel.
+ PADDI_DTPREL,
+
/// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded
/// during instruction selection to optimize a BUILD_VECTOR into
/// operations on splats. This is necessary to avoid losing these
@@ -427,22 +436,6 @@ namespace llvm {
/// => VABSDUW((XVNEGSP a), (XVNEGSP b))
VABSD,
- /// QVFPERM = This corresponds to the QPX qvfperm instruction.
- QVFPERM,
-
- /// QVGPCI = This corresponds to the QPX qvgpci instruction.
- QVGPCI,
-
- /// QVALIGNI = This corresponds to the QPX qvaligni instruction.
- QVALIGNI,
-
- /// QVESPLATI = This corresponds to the QPX qvesplati instruction.
- QVESPLATI,
-
- /// QBFLT = Access the underlying QPX floating-point boolean
- /// representation.
- QBFLT,
-
/// FP_EXTEND_HALF(VECTOR, IDX) - Custom extend upper (IDX=0) half or
/// lower (IDX=1) half of v4f32 to v2f64.
FP_EXTEND_HALF,
@@ -452,6 +445,46 @@ namespace llvm {
/// PLD.
MAT_PCREL_ADDR,
+ /// TLS_DYNAMIC_MAT_PCREL_ADDR = Materialize a PC Relative address for
+ /// TLS global address when using dynamic access models. This can be done
+ /// through an add like PADDI.
+ TLS_DYNAMIC_MAT_PCREL_ADDR,
+
+ /// TLS_LOCAL_EXEC_MAT_ADDR = Materialize an address for TLS global address
+ /// when using local exec access models, and when prefixed instructions are
+ /// available. This is used with ADD_TLS to produce an add like PADDI.
+ TLS_LOCAL_EXEC_MAT_ADDR,
+
+ /// ACC_BUILD = Build an accumulator register from 4 VSX registers.
+ ACC_BUILD,
+
+ /// PAIR_BUILD = Build a vector pair register from 2 VSX registers.
+ PAIR_BUILD,
+
+ /// EXTRACT_VSX_REG = Extract one of the underlying vsx registers of
+ /// an accumulator or pair register. This node is needed because
+ /// EXTRACT_SUBVECTOR expects the input and output vectors to have the same
+ /// element type.
+ EXTRACT_VSX_REG,
+
+ /// XXMFACC = This corresponds to the xxmfacc instruction.
+ XXMFACC,
+
+ // Constrained conversion from floating point to int
+ STRICT_FCTIDZ = ISD::FIRST_TARGET_STRICTFP_OPCODE,
+ STRICT_FCTIWZ,
+ STRICT_FCTIDUZ,
+ STRICT_FCTIWUZ,
+
+ /// Constrained integer-to-floating-point conversion instructions.
+ STRICT_FCFID,
+ STRICT_FCFIDU,
+ STRICT_FCFIDS,
+ STRICT_FCFIDUS,
+
+ /// Constrained floating point add in round-to-zero mode.
+ STRICT_FADDRTZ,
+
/// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
/// byte-swapping store instruction. It byte-swaps the low "Type" bits of
/// the GPRC input, then stores it through Ptr. Type can be either i16 or
@@ -493,6 +526,12 @@ namespace llvm {
/// an xxswapd.
LXVD2X,
+ /// LXVRZX - Load VSX Vector Rightmost and Zero Extend
+ /// This node represents a v1i128 BUILD_VECTOR of a zero-extending load
+ /// instruction from <byte, halfword, word, or doubleword> to i128.
+ /// Allows utilization of the Load VSX Vector Rightmost Instructions.
+ LXVRZX,
+
/// VSRC, CHAIN = LOAD_VEC_BE CHAIN, Ptr - Occurs only for little endian.
/// Maps directly to one of lxvd2x/lxvw4x/lxvh8x/lxvb16x depending on
/// the vector type to load vector in big-endian element order.
@@ -519,10 +558,6 @@ namespace llvm {
/// Store scalar integers from VSR.
ST_VSR_SCAL_INT,
- /// QBRC, CHAIN = QVLFSb CHAIN, Ptr
- /// The 4xf32 load used for v4i1 constants.
- QVLFSb,
-
/// ATOMIC_CMP_SWAP - the exact same as the target-independent nodes
/// except they ensure that the compare input is zero-extended for
/// sub-word versions because the atomic loads zero-extend.
@@ -627,10 +662,6 @@ namespace llvm {
/// the number of bytes of each element [124] -> [bhw].
SDValue get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
- /// If this is a qvaligni shuffle mask, return the shift
- /// amount, otherwise return -1.
- int isQVALIGNIShuffleMask(SDNode *N);
-
} // end namespace PPC
class PPCTargetLowering : public TargetLowering {
@@ -740,6 +771,8 @@ namespace llvm {
bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base,
SelectionDAG &DAG,
MaybeAlign EncodingAlignment) const;
+ bool SelectAddressRegImm34(SDValue N, SDValue &Disp, SDValue &Base,
+ SelectionDAG &DAG) const;
/// SelectAddressRegRegOnly - Given the specified addressed, force it to be
/// represented as an indexed [r+r] operation.
@@ -895,6 +928,9 @@ namespace llvm {
return true;
}
+ bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
+ SDValue C) const override;
+
bool isDesirableToTransformToIntegerOp(unsigned Opc,
EVT VT) const override {
// Only handle float load/store pair because float(fpr) load/store
@@ -980,11 +1016,6 @@ namespace llvm {
Register
getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
- /// isMulhCheaperThanMulShift - Return true if a mulh[s|u] node for a
- /// specific type is cheaper than a multiply followed by a shift.
- /// This is true for words and doublewords on 64-bit PowerPC.
- bool isMulhCheaperThanMulShift(EVT Type) const override;
-
/// Override to support customized stack guard loading.
bool useLoadStackGuardNode() const override;
void insertSSPDeclarations(Module &M) const override;
@@ -1042,11 +1073,6 @@ namespace llvm {
}
};
- bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
- // Addrspacecasts are always noops.
- return true;
- }
-
bool canReuseLoadAddress(SDValue Op, EVT MemVT, ReuseLoadInfo &RLI,
SelectionDAG &DAG,
ISD::LoadExtType ET = ISD::NON_EXTLOAD) const;
@@ -1117,19 +1143,18 @@ namespace llvm {
SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
@@ -1176,10 +1201,6 @@ namespace llvm {
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const;
- SDValue LowerFormalArguments_Darwin(
- SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
- SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const;
SDValue LowerFormalArguments_64SVR4(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
@@ -1194,13 +1215,6 @@ namespace llvm {
ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
const SDLoc &dl) const;
- SDValue LowerCall_Darwin(SDValue Chain, SDValue Callee, CallFlags CFlags,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- const SDLoc &dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals,
- const CallBase *CB) const;
SDValue LowerCall_64SVR4(SDValue Chain, SDValue Callee, CallFlags CFlags,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
@@ -1257,6 +1271,10 @@ namespace llvm {
bool Reciprocal) const override;
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &RefinementSteps) const override;
+ SDValue getSqrtInputTest(SDValue Operand, SelectionDAG &DAG,
+ const DenormalMode &Mode) const override;
+ SDValue getSqrtResultForDenormInput(SDValue Operand,
+ SelectionDAG &DAG) const override;
unsigned combineRepeatedFPDivisors() const override;
SDValue
@@ -1295,6 +1313,8 @@ namespace llvm {
bool isIntS16Immediate(SDNode *N, int16_t &Imm);
bool isIntS16Immediate(SDValue Op, int16_t &Imm);
+ bool isIntS34Immediate(SDNode *N, int64_t &Imm);
+ bool isIntS34Immediate(SDValue Op, int64_t &Imm);
bool convertToNonDenormSingle(APInt &ArgAPInt);
bool convertToNonDenormSingle(APFloat &ArgAPFloat);
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
index de42d354a048..03e9d6970a30 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -19,12 +19,14 @@ def s16imm64 : Operand<i64> {
let EncoderMethod = "getImm16Encoding";
let ParserMatchClass = PPCS16ImmAsmOperand;
let DecoderMethod = "decodeSImmOperand<16>";
+ let OperandType = "OPERAND_IMMEDIATE";
}
def u16imm64 : Operand<i64> {
let PrintMethod = "printU16ImmOperand";
let EncoderMethod = "getImm16Encoding";
let ParserMatchClass = PPCU16ImmAsmOperand;
let DecoderMethod = "decodeUImmOperand<16>";
+ let OperandType = "OPERAND_IMMEDIATE";
}
def s17imm64 : Operand<i64> {
// This operand type is used for addis/lis to allow the assembler parser
@@ -34,6 +36,7 @@ def s17imm64 : Operand<i64> {
let EncoderMethod = "getImm16Encoding";
let ParserMatchClass = PPCS17ImmAsmOperand;
let DecoderMethod = "decodeSImmOperand<16>";
+ let OperandType = "OPERAND_IMMEDIATE";
}
def tocentry : Operand<iPTR> {
let MIOperandInfo = (ops i64imm:$imm);
@@ -148,6 +151,9 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in {
def BL8_NOTOC : IForm<18, 0, 1, (outs),
(ins calltarget:$func),
"bl $func", IIC_BrB, []>;
+ def BL8_NOTOC_TLS : IForm<18, 0, 1, (outs),
+ (ins tlscall:$func),
+ "bl $func", IIC_BrB, []>;
}
}
let Uses = [CTR8, RM] in {
@@ -840,7 +846,7 @@ let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
def SETB8 : XForm_44<31, 128, (outs g8rc:$RT), (ins crrc:$BFA),
"setb $RT, $BFA", IIC_IntGeneral>, isPPC64;
}
-def DARN : XForm_45<31, 755, (outs g8rc:$RT), (ins i32imm:$L),
+def DARN : XForm_45<31, 755, (outs g8rc:$RT), (ins u2imm:$L),
"darn $RT, $L", IIC_LdStLD>, isPPC64;
def ADDPCIS : DXForm<19, 2, (outs g8rc:$RT), (ins i32imm:$D),
"addpcis $RT, $D", IIC_BrB, []>, isPPC64;
@@ -981,8 +987,11 @@ def : InstAlias<"cntlzw. $rA, $rS", (CNTLZW8_rec g8rc:$rA, g8rc:$rS)>;
def : InstAlias<"mtxer $Rx", (MTSPR8 1, g8rc:$Rx)>;
def : InstAlias<"mfxer $Rx", (MFSPR8 g8rc:$Rx, 1)>;
-def : InstAlias<"mtudscr $Rx", (MTSPR8 3, g8rc:$Rx)>;
-def : InstAlias<"mfudscr $Rx", (MFSPR8 g8rc:$Rx, 3)>;
+// Disable these aliases on AIX for now because the system assembler (as)
+// does not support them.
+let Predicates = [ModernAs] in {
+ def : InstAlias<"mtudscr $Rx", (MTSPR8 3, g8rc:$Rx)>;
+ def : InstAlias<"mfudscr $Rx", (MFSPR8 g8rc:$Rx, 3)>;
+}
def : InstAlias<"mfrtcu $Rx", (MFSPR8 g8rc:$Rx, 4)>;
def : InstAlias<"mfrtcl $Rx", (MFSPR8 g8rc:$Rx, 5)>;
@@ -1056,7 +1065,7 @@ def LHA8: DForm_1<42, (outs g8rc:$rD), (ins memri:$src),
def LWA : DSForm_1<58, 2, (outs g8rc:$rD), (ins memrix:$src),
"lwa $rD, $src", IIC_LdStLWA,
[(set i64:$rD,
- (aligned4sextloadi32 iaddrX4:$src))]>, isPPC64,
+ (DSFormSextLoadi32 iaddrX4:$src))]>, isPPC64,
PPC970_DGroup_Cracked;
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
def LHAX8: XForm_1_memOp<31, 343, (outs g8rc:$rD), (ins memrr:$src),
@@ -1167,7 +1176,7 @@ def LWZUX8 : XForm_1_memOp<31, 55, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
let PPC970_Unit = 2 in {
def LD : DSForm_1<58, 0, (outs g8rc:$rD), (ins memrix:$src),
"ld $rD, $src", IIC_LdStLD,
- [(set i64:$rD, (aligned4load iaddrX4:$src))]>, isPPC64;
+ [(set i64:$rD, (DSFormLoad iaddrX4:$src))]>, isPPC64;
// The following four definitions are selected for small code model only.
// Otherwise, we need to create two instructions to form a 32-bit offset,
// so we have a custom matcher for TOC_ENTRY in PPCDAGToDAGIsel::Select().
@@ -1262,17 +1271,36 @@ def ADDItlsgdL : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm6
[(set i64:$rD,
(PPCaddiTlsgdL i64:$reg, tglobaltlsaddr:$disp))]>,
isPPC64;
-// LR8 is a true define, while the rest of the Defs are clobbers. X3 is
+
+class GETtlsADDRPseudo <string asmstr> : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
+ asmstr,
+ [(set i64:$rD,
+ (PPCgetTlsAddr i64:$reg, tglobaltlsaddr:$sym))]>,
+ isPPC64;
+class GETtlsldADDRPseudo <string asmstr> : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
+ asmstr,
+ [(set i64:$rD,
+ (PPCgetTlsldAddr i64:$reg, tglobaltlsaddr:$sym))]>,
+ isPPC64;
+
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1 in {
+// LR8 is a true define, while the rest of the Defs are clobbers. X3 is
// explicitly defined when this op is created, so not mentioned here.
// This is lowered to BL8_NOP_TLS by the assembly printer, so the size must be
// correct because the branch select pass is relying on it.
-let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, Size = 8,
- Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in
-def GETtlsADDR : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
- "#GETtlsADDR",
- [(set i64:$rD,
- (PPCgetTlsAddr i64:$reg, tglobaltlsaddr:$sym))]>,
- isPPC64;
+let Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7], Size = 8 in
+def GETtlsADDR : GETtlsADDRPseudo <"#GETtlsADDR">;
+let Defs = [X0,X2,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7], Size = 8 in
+def GETtlsADDRPCREL : GETtlsADDRPseudo <"#GETtlsADDRPCREL">;
+
+// LR8 is a true define, while the rest of the Defs are clobbers. X3 is
+// explicitly defined when this op is created, so not mentioned here.
+let Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in
+def GETtlsldADDR : GETtlsldADDRPseudo <"#GETtlsldADDR">;
+let Defs = [X0,X2,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in
+def GETtlsldADDRPCREL : GETtlsldADDRPseudo <"#GETtlsldADDRPCREL">;
+}
+
// Combined op for ADDItlsgdL and GETtlsADDR, late expanded. X3 and LR8
// are true defines while the rest of the Defs are clobbers.
let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
@@ -1296,15 +1324,6 @@ def ADDItlsldL : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm6
[(set i64:$rD,
(PPCaddiTlsldL i64:$reg, tglobaltlsaddr:$disp))]>,
isPPC64;
-// LR8 is a true define, while the rest of the Defs are clobbers. X3 is
-// explicitly defined when this op is created, so not mentioned here.
-let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
- Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in
-def GETtlsldADDR : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
- "#GETtlsldADDR",
- [(set i64:$rD,
- (PPCgetTlsldAddr i64:$reg, tglobaltlsaddr:$sym))]>,
- isPPC64;
// Combined op for ADDItlsldL and GETtlsADDR, late expanded. X3 and LR8
// are true defines, while the rest of the Defs are clobbers.
let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
@@ -1329,6 +1348,11 @@ def ADDIdtprelL : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm
[(set i64:$rD,
(PPCaddiDtprelL i64:$reg, tglobaltlsaddr:$disp))]>,
isPPC64;
+def PADDIdtprel : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+ "#PADDIdtprel",
+ [(set i64:$rD,
+ (PPCpaddiDtprel i64:$reg, tglobaltlsaddr:$disp))]>,
+ isPPC64;
let PPC970_Unit = 2 in {
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
@@ -1359,7 +1383,7 @@ def STWX8 : XForm_8_memOp<31, 151, (outs), (ins g8rc:$rS, memrr:$dst),
// Normal 8-byte stores.
def STD : DSForm_1<62, 0, (outs), (ins g8rc:$rS, memrix:$dst),
"std $rS, $dst", IIC_LdStSTD,
- [(aligned4store i64:$rS, iaddrX4:$dst)]>, isPPC64;
+ [(DSFormStore i64:$rS, iaddrX4:$dst)]>, isPPC64;
def STDX : XForm_8_memOp<31, 149, (outs), (ins g8rc:$rS, memrr:$dst),
"stdx $rS, $dst", IIC_LdStSTD,
[(store i64:$rS, xaddrX4:$dst)]>, isPPC64,
@@ -1426,7 +1450,7 @@ def : Pat<(pre_truncsti16 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
(STHU8 $rS, iaddroff:$ptroff, $ptrreg)>;
def : Pat<(pre_truncsti32 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
(STWU8 $rS, iaddroff:$ptroff, $ptrreg)>;
-def : Pat<(aligned4pre_store i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+def : Pat<(DSFormPreStore i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
(STDU $rS, iaddroff:$ptroff, $ptrreg)>;
def : Pat<(pre_truncsti8 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
@@ -1444,11 +1468,11 @@ def : Pat<(pre_store i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
//
-let PPC970_Unit = 3, hasSideEffects = 0,
+let PPC970_Unit = 3, hasSideEffects = 0, mayRaiseFPException = 1,
Uses = [RM] in { // FPU Operations.
defm FCFID : XForm_26r<63, 846, (outs f8rc:$frD), (ins f8rc:$frB),
"fcfid", "$frD, $frB", IIC_FPGeneral,
- [(set f64:$frD, (PPCfcfid f64:$frB))]>, isPPC64;
+ [(set f64:$frD, (PPCany_fcfid f64:$frB))]>, isPPC64;
defm FCTID : XForm_26r<63, 814, (outs f8rc:$frD), (ins f8rc:$frB),
"fctid", "$frD, $frB", IIC_FPGeneral,
[]>, isPPC64;
@@ -1457,23 +1481,23 @@ defm FCTIDU : XForm_26r<63, 942, (outs f8rc:$frD), (ins f8rc:$frB),
[]>, isPPC64;
defm FCTIDZ : XForm_26r<63, 815, (outs f8rc:$frD), (ins f8rc:$frB),
"fctidz", "$frD, $frB", IIC_FPGeneral,
- [(set f64:$frD, (PPCfctidz f64:$frB))]>, isPPC64;
+ [(set f64:$frD, (PPCany_fctidz f64:$frB))]>, isPPC64;
defm FCFIDU : XForm_26r<63, 974, (outs f8rc:$frD), (ins f8rc:$frB),
"fcfidu", "$frD, $frB", IIC_FPGeneral,
- [(set f64:$frD, (PPCfcfidu f64:$frB))]>, isPPC64;
+ [(set f64:$frD, (PPCany_fcfidu f64:$frB))]>, isPPC64;
defm FCFIDS : XForm_26r<59, 846, (outs f4rc:$frD), (ins f8rc:$frB),
"fcfids", "$frD, $frB", IIC_FPGeneral,
- [(set f32:$frD, (PPCfcfids f64:$frB))]>, isPPC64;
+ [(set f32:$frD, (PPCany_fcfids f64:$frB))]>, isPPC64;
defm FCFIDUS : XForm_26r<59, 974, (outs f4rc:$frD), (ins f8rc:$frB),
"fcfidus", "$frD, $frB", IIC_FPGeneral,
- [(set f32:$frD, (PPCfcfidus f64:$frB))]>, isPPC64;
+ [(set f32:$frD, (PPCany_fcfidus f64:$frB))]>, isPPC64;
defm FCTIDUZ : XForm_26r<63, 943, (outs f8rc:$frD), (ins f8rc:$frB),
"fctiduz", "$frD, $frB", IIC_FPGeneral,
- [(set f64:$frD, (PPCfctiduz f64:$frB))]>, isPPC64;
+ [(set f64:$frD, (PPCany_fctiduz f64:$frB))]>, isPPC64;
defm FCTIWUZ : XForm_26r<63, 143, (outs f8rc:$frD), (ins f8rc:$frB),
"fctiwuz", "$frD, $frB", IIC_FPGeneral,
- [(set f64:$frD, (PPCfctiwuz f64:$frB))]>, isPPC64;
+ [(set f64:$frD, (PPCany_fctiwuz f64:$frB))]>, isPPC64;
}
@@ -1570,11 +1594,11 @@ def : Pat<(add i64:$in, (PPChi tblockaddress:$g, 0)),
// Patterns to match r+r indexed loads and stores for
// addresses without at least 4-byte alignment.
-def : Pat<(i64 (unaligned4sextloadi32 xoaddr:$src)),
+def : Pat<(i64 (NonDSFormSextLoadi32 xoaddr:$src)),
(LWAX xoaddr:$src)>;
-def : Pat<(i64 (unaligned4load xoaddr:$src)),
+def : Pat<(i64 (NonDSFormLoad xoaddr:$src)),
(LDX xoaddr:$src)>;
-def : Pat<(unaligned4store i64:$rS, xoaddr:$dst),
+def : Pat<(NonDSFormStore i64:$rS, xoaddr:$dst),
(STDX $rS, xoaddr:$dst)>;
// 64-bits atomic loads and stores
@@ -1585,6 +1609,11 @@ def : Pat<(atomic_store_64 iaddrX4:$ptr, i64:$val), (STD g8rc:$val, memrix:$ptr
def : Pat<(atomic_store_64 xaddrX4:$ptr, i64:$val), (STDX g8rc:$val, memrr:$ptr)>;
let Predicates = [IsISA3_0] in {
+// DARN (deliver random number)
+// L=0 for 32-bit, L=1 for conditioned random, L=2 for raw random
+def : Pat<(int_ppc_darn32), (EXTRACT_SUBREG (DARN 0), sub_32)>;
+def : Pat<(int_ppc_darn), (DARN 1)>;
+def : Pat<(int_ppc_darnraw), (DARN 2)>;
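A hedged usage sketch, assuming clang's ISA 3.0 __builtin_darn* builtins are what map onto these patterns (the builtin names are not defined by this hunk):

  #include <stdint.h>

  uint32_t rand_word()        { return __builtin_darn_32();  } // DARN L=0
  uint64_t rand_conditioned() { return __builtin_darn();     } // DARN L=1
  uint64_t rand_raw()         { return __builtin_darn_raw(); } // DARN L=2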
class X_L1_RA5_RB5<bits<6> opcode, bits<10> xo, string opc, RegisterOperand ty,
InstrItinClass itin, list<dag> pattern>
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
index 920eeed9d41f..1a34aa09315b 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -404,12 +404,14 @@ let isCodeGenOnly = 1 in {
Deprecated<DeprecatedDST>;
}
-def MFVSCR : VXForm_4<1540, (outs vrrc:$vD), (ins),
- "mfvscr $vD", IIC_LdStStore,
- [(set v8i16:$vD, (int_ppc_altivec_mfvscr))]>;
-def MTVSCR : VXForm_5<1604, (outs), (ins vrrc:$vB),
- "mtvscr $vB", IIC_LdStLoad,
- [(int_ppc_altivec_mtvscr v4i32:$vB)]>;
+let hasSideEffects = 1 in {
+ def MFVSCR : VXForm_4<1540, (outs vrrc:$vD), (ins),
+ "mfvscr $vD", IIC_LdStStore,
+ [(set v8i16:$vD, (int_ppc_altivec_mfvscr))]>;
+ def MTVSCR : VXForm_5<1604, (outs), (ins vrrc:$vB),
+ "mtvscr $vB", IIC_LdStLoad,
+ [(int_ppc_altivec_mtvscr v4i32:$vB)]>;
+}
let PPC970_Unit = 2, mayLoad = 1, mayStore = 0 in { // Loads.
def LVEBX: XForm_1_memOp<31, 7, (outs vrrc:$vD), (ins memrr:$src),
@@ -469,10 +471,11 @@ def VNMSUBFP: VAForm_1<47, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vC, vrrc:$vB),
"vnmsubfp $vD, $vA, $vC, $vB", IIC_VecFP,
[(set v4f32:$vD, (fneg (fma v4f32:$vA, v4f32:$vC,
(fneg v4f32:$vB))))]>;
-
-def VMHADDSHS : VA1a_Int_Ty<32, "vmhaddshs", int_ppc_altivec_vmhaddshs, v8i16>;
-def VMHRADDSHS : VA1a_Int_Ty<33, "vmhraddshs", int_ppc_altivec_vmhraddshs,
- v8i16>;
+let hasSideEffects = 1 in {
+ def VMHADDSHS : VA1a_Int_Ty<32, "vmhaddshs", int_ppc_altivec_vmhaddshs, v8i16>;
+ def VMHRADDSHS : VA1a_Int_Ty<33, "vmhraddshs", int_ppc_altivec_vmhraddshs,
+ v8i16>;
+}
def VMLADDUHM : VA1a_Int_Ty<34, "vmladduhm", int_ppc_altivec_vmladduhm, v8i16>;
} // isCommutable
@@ -608,14 +611,16 @@ def VMSUMMBM : VA1a_Int_Ty3<37, "vmsummbm", int_ppc_altivec_vmsummbm,
v4i32, v16i8, v4i32>;
def VMSUMSHM : VA1a_Int_Ty3<40, "vmsumshm", int_ppc_altivec_vmsumshm,
v4i32, v8i16, v4i32>;
-def VMSUMSHS : VA1a_Int_Ty3<41, "vmsumshs", int_ppc_altivec_vmsumshs,
- v4i32, v8i16, v4i32>;
def VMSUMUBM : VA1a_Int_Ty3<36, "vmsumubm", int_ppc_altivec_vmsumubm,
v4i32, v16i8, v4i32>;
def VMSUMUHM : VA1a_Int_Ty3<38, "vmsumuhm", int_ppc_altivec_vmsumuhm,
v4i32, v8i16, v4i32>;
-def VMSUMUHS : VA1a_Int_Ty3<39, "vmsumuhs", int_ppc_altivec_vmsumuhs,
- v4i32, v8i16, v4i32>;
+let hasSideEffects = 1 in {
+ def VMSUMSHS : VA1a_Int_Ty3<41, "vmsumshs", int_ppc_altivec_vmsumshs,
+ v4i32, v8i16, v4i32>;
+ def VMSUMUHS : VA1a_Int_Ty3<39, "vmsumuhs", int_ppc_altivec_vmsumuhs,
+ v4i32, v8i16, v4i32>;
+}
let isCommutable = 1 in {
def VMULESB : VX1_Int_Ty2<776, "vmulesb", int_ppc_altivec_vmulesb,
@@ -665,15 +670,17 @@ def VSUBUBS : VX1_Int_Ty<1536, "vsububs" , int_ppc_altivec_vsububs, v16i8>;
def VSUBUHS : VX1_Int_Ty<1600, "vsubuhs" , int_ppc_altivec_vsubuhs, v8i16>;
def VSUBUWS : VX1_Int_Ty<1664, "vsubuws" , int_ppc_altivec_vsubuws, v4i32>;
-def VSUMSWS : VX1_Int_Ty<1928, "vsumsws" , int_ppc_altivec_vsumsws, v4i32>;
-def VSUM2SWS: VX1_Int_Ty<1672, "vsum2sws", int_ppc_altivec_vsum2sws, v4i32>;
+let hasSideEffects = 1 in {
+ def VSUMSWS : VX1_Int_Ty<1928, "vsumsws" , int_ppc_altivec_vsumsws, v4i32>;
+ def VSUM2SWS: VX1_Int_Ty<1672, "vsum2sws", int_ppc_altivec_vsum2sws, v4i32>;
-def VSUM4SBS: VX1_Int_Ty3<1800, "vsum4sbs", int_ppc_altivec_vsum4sbs,
- v4i32, v16i8, v4i32>;
-def VSUM4SHS: VX1_Int_Ty3<1608, "vsum4shs", int_ppc_altivec_vsum4shs,
- v4i32, v8i16, v4i32>;
-def VSUM4UBS: VX1_Int_Ty3<1544, "vsum4ubs", int_ppc_altivec_vsum4ubs,
- v4i32, v16i8, v4i32>;
+ def VSUM4SBS: VX1_Int_Ty3<1800, "vsum4sbs", int_ppc_altivec_vsum4sbs,
+ v4i32, v16i8, v4i32>;
+ def VSUM4SHS: VX1_Int_Ty3<1608, "vsum4shs", int_ppc_altivec_vsum4shs,
+ v4i32, v8i16, v4i32>;
+ def VSUM4UBS: VX1_Int_Ty3<1544, "vsum4ubs", int_ppc_altivec_vsum4ubs,
+ v4i32, v16i8, v4i32>;
+}
def VNOR : VXForm_1<1284, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vnor $vD, $vA, $vB", IIC_VecFP,
@@ -742,26 +749,28 @@ def VSPLTISW : VXForm_3<908, (outs vrrc:$vD), (ins s5imm:$SIMM),
// Vector Pack.
def VPKPX : VX1_Int_Ty2<782, "vpkpx", int_ppc_altivec_vpkpx,
v8i16, v4i32>;
-def VPKSHSS : VX1_Int_Ty2<398, "vpkshss", int_ppc_altivec_vpkshss,
- v16i8, v8i16>;
-def VPKSHUS : VX1_Int_Ty2<270, "vpkshus", int_ppc_altivec_vpkshus,
- v16i8, v8i16>;
-def VPKSWSS : VX1_Int_Ty2<462, "vpkswss", int_ppc_altivec_vpkswss,
- v8i16, v4i32>;
-def VPKSWUS : VX1_Int_Ty2<334, "vpkswus", int_ppc_altivec_vpkswus,
- v8i16, v4i32>;
+let hasSideEffects = 1 in {
+ def VPKSHSS : VX1_Int_Ty2<398, "vpkshss", int_ppc_altivec_vpkshss,
+ v16i8, v8i16>;
+ def VPKSHUS : VX1_Int_Ty2<270, "vpkshus", int_ppc_altivec_vpkshus,
+ v16i8, v8i16>;
+ def VPKSWSS : VX1_Int_Ty2<462, "vpkswss", int_ppc_altivec_vpkswss,
+ v8i16, v4i32>;
+ def VPKSWUS : VX1_Int_Ty2<334, "vpkswus", int_ppc_altivec_vpkswus,
+ v8i16, v4i32>;
+ def VPKUHUS : VX1_Int_Ty2<142, "vpkuhus", int_ppc_altivec_vpkuhus,
+ v16i8, v8i16>;
+ def VPKUWUS : VX1_Int_Ty2<206, "vpkuwus", int_ppc_altivec_vpkuwus,
+ v8i16, v4i32>;
+}
def VPKUHUM : VXForm_1<14, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vpkuhum $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD,
(vpkuhum_shuffle v16i8:$vA, v16i8:$vB))]>;
-def VPKUHUS : VX1_Int_Ty2<142, "vpkuhus", int_ppc_altivec_vpkuhus,
- v16i8, v8i16>;
def VPKUWUM : VXForm_1<78, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vpkuwum $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD,
(vpkuwum_shuffle v16i8:$vA, v16i8:$vB))]>;
-def VPKUWUS : VX1_Int_Ty2<206, "vpkuwus", int_ppc_altivec_vpkuwus,
- v8i16, v4i32>;
// Vector Unpack.
def VUPKHPX : VX2_Int_Ty2<846, "vupkhpx", int_ppc_altivec_vupkhpx,
@@ -784,47 +793,47 @@ class VCMP<bits<10> xo, string asmstr, ValueType Ty>
: VXRForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), asmstr,
IIC_VecFPCompare,
[(set Ty:$vD, (Ty (PPCvcmp Ty:$vA, Ty:$vB, xo)))]>;
-class VCMPo<bits<10> xo, string asmstr, ValueType Ty>
+class VCMP_rec<bits<10> xo, string asmstr, ValueType Ty>
: VXRForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), asmstr,
IIC_VecFPCompare,
- [(set Ty:$vD, (Ty (PPCvcmp_o Ty:$vA, Ty:$vB, xo)))]> {
+ [(set Ty:$vD, (Ty (PPCvcmp_rec Ty:$vA, Ty:$vB, xo)))]> {
let Defs = [CR6];
let RC = 1;
}
// f32 element comparisons.
def VCMPBFP : VCMP <966, "vcmpbfp $vD, $vA, $vB" , v4f32>;
-def VCMPBFP_rec : VCMPo<966, "vcmpbfp. $vD, $vA, $vB" , v4f32>;
+def VCMPBFP_rec : VCMP_rec<966, "vcmpbfp. $vD, $vA, $vB" , v4f32>;
def VCMPEQFP : VCMP <198, "vcmpeqfp $vD, $vA, $vB" , v4f32>;
-def VCMPEQFP_rec : VCMPo<198, "vcmpeqfp. $vD, $vA, $vB", v4f32>;
+def VCMPEQFP_rec : VCMP_rec<198, "vcmpeqfp. $vD, $vA, $vB", v4f32>;
def VCMPGEFP : VCMP <454, "vcmpgefp $vD, $vA, $vB" , v4f32>;
-def VCMPGEFP_rec : VCMPo<454, "vcmpgefp. $vD, $vA, $vB", v4f32>;
+def VCMPGEFP_rec : VCMP_rec<454, "vcmpgefp. $vD, $vA, $vB", v4f32>;
def VCMPGTFP : VCMP <710, "vcmpgtfp $vD, $vA, $vB" , v4f32>;
-def VCMPGTFP_rec : VCMPo<710, "vcmpgtfp. $vD, $vA, $vB", v4f32>;
+def VCMPGTFP_rec : VCMP_rec<710, "vcmpgtfp. $vD, $vA, $vB", v4f32>;
// i8 element comparisons.
def VCMPEQUB : VCMP < 6, "vcmpequb $vD, $vA, $vB" , v16i8>;
-def VCMPEQUB_rec : VCMPo< 6, "vcmpequb. $vD, $vA, $vB", v16i8>;
+def VCMPEQUB_rec : VCMP_rec< 6, "vcmpequb. $vD, $vA, $vB", v16i8>;
def VCMPGTSB : VCMP <774, "vcmpgtsb $vD, $vA, $vB" , v16i8>;
-def VCMPGTSB_rec : VCMPo<774, "vcmpgtsb. $vD, $vA, $vB", v16i8>;
+def VCMPGTSB_rec : VCMP_rec<774, "vcmpgtsb. $vD, $vA, $vB", v16i8>;
def VCMPGTUB : VCMP <518, "vcmpgtub $vD, $vA, $vB" , v16i8>;
-def VCMPGTUB_rec : VCMPo<518, "vcmpgtub. $vD, $vA, $vB", v16i8>;
+def VCMPGTUB_rec : VCMP_rec<518, "vcmpgtub. $vD, $vA, $vB", v16i8>;
// i16 element comparisons.
def VCMPEQUH : VCMP < 70, "vcmpequh $vD, $vA, $vB" , v8i16>;
-def VCMPEQUH_rec : VCMPo< 70, "vcmpequh. $vD, $vA, $vB", v8i16>;
+def VCMPEQUH_rec : VCMP_rec< 70, "vcmpequh. $vD, $vA, $vB", v8i16>;
def VCMPGTSH : VCMP <838, "vcmpgtsh $vD, $vA, $vB" , v8i16>;
-def VCMPGTSH_rec : VCMPo<838, "vcmpgtsh. $vD, $vA, $vB", v8i16>;
+def VCMPGTSH_rec : VCMP_rec<838, "vcmpgtsh. $vD, $vA, $vB", v8i16>;
def VCMPGTUH : VCMP <582, "vcmpgtuh $vD, $vA, $vB" , v8i16>;
-def VCMPGTUH_rec : VCMPo<582, "vcmpgtuh. $vD, $vA, $vB", v8i16>;
+def VCMPGTUH_rec : VCMP_rec<582, "vcmpgtuh. $vD, $vA, $vB", v8i16>;
// i32 element comparisons.
def VCMPEQUW : VCMP <134, "vcmpequw $vD, $vA, $vB" , v4i32>;
-def VCMPEQUW_rec : VCMPo<134, "vcmpequw. $vD, $vA, $vB", v4i32>;
+def VCMPEQUW_rec : VCMP_rec<134, "vcmpequw. $vD, $vA, $vB", v4i32>;
def VCMPGTSW : VCMP <902, "vcmpgtsw $vD, $vA, $vB" , v4i32>;
-def VCMPGTSW_rec : VCMPo<902, "vcmpgtsw. $vD, $vA, $vB", v4i32>;
+def VCMPGTSW_rec : VCMP_rec<902, "vcmpgtsw. $vD, $vA, $vB", v4i32>;
def VCMPGTUW : VCMP <646, "vcmpgtuw $vD, $vA, $vB" , v4i32>;
-def VCMPGTUW_rec : VCMPo<646, "vcmpgtuw. $vD, $vA, $vB", v4i32>;
+def VCMPGTUW_rec : VCMP_rec<646, "vcmpgtuw. $vD, $vA, $vB", v4i32>;
let isCodeGenOnly = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
isReMaterializable = 1 in {
@@ -933,6 +942,18 @@ def : Pat<(v1i128 (bitconvert (v4i32 VRRC:$src))), (v1i128 VRRC:$src)>;
def : Pat<(v1i128 (bitconvert (v4f32 VRRC:$src))), (v1i128 VRRC:$src)>;
def : Pat<(v1i128 (bitconvert (v2i64 VRRC:$src))), (v1i128 VRRC:$src)>;
+def : Pat<(f128 (bitconvert (v16i8 VRRC:$src))), (f128 VRRC:$src)>;
+def : Pat<(f128 (bitconvert (v8i16 VRRC:$src))), (f128 VRRC:$src)>;
+def : Pat<(f128 (bitconvert (v4i32 VRRC:$src))), (f128 VRRC:$src)>;
+def : Pat<(f128 (bitconvert (v4f32 VRRC:$src))), (f128 VRRC:$src)>;
+def : Pat<(f128 (bitconvert (v2f64 VRRC:$src))), (f128 VRRC:$src)>;
+
+def : Pat<(v16i8 (bitconvert (f128 VRRC:$src))), (v16i8 VRRC:$src)>;
+def : Pat<(v8i16 (bitconvert (f128 VRRC:$src))), (v8i16 VRRC:$src)>;
+def : Pat<(v4i32 (bitconvert (f128 VRRC:$src))), (v4i32 VRRC:$src)>;
+def : Pat<(v4f32 (bitconvert (f128 VRRC:$src))), (v4f32 VRRC:$src)>;
+def : Pat<(v2f64 (bitconvert (f128 VRRC:$src))), (v2f64 VRRC:$src)>;
+
// Max/Min
def : Pat<(v16i8 (umax v16i8:$src1, v16i8:$src2)),
(v16i8 (VMAXUB $src1, $src2))>;
@@ -1291,11 +1312,11 @@ def VORC : VXForm_1<1348, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
// i64 element comparisons.
def VCMPEQUD : VCMP <199, "vcmpequd $vD, $vA, $vB" , v2i64>;
-def VCMPEQUD_rec : VCMPo<199, "vcmpequd. $vD, $vA, $vB", v2i64>;
+def VCMPEQUD_rec : VCMP_rec<199, "vcmpequd. $vD, $vA, $vB", v2i64>;
def VCMPGTSD : VCMP <967, "vcmpgtsd $vD, $vA, $vB" , v2i64>;
-def VCMPGTSD_rec : VCMPo<967, "vcmpgtsd. $vD, $vA, $vB", v2i64>;
+def VCMPGTSD_rec : VCMP_rec<967, "vcmpgtsd. $vD, $vA, $vB", v2i64>;
def VCMPGTUD : VCMP <711, "vcmpgtud $vD, $vA, $vB" , v2i64>;
-def VCMPGTUD_rec : VCMPo<711, "vcmpgtud. $vD, $vA, $vB", v2i64>;
+def VCMPGTUD_rec : VCMP_rec<711, "vcmpgtud. $vD, $vA, $vB", v2i64>;
// The cryptography instructions that do not require Category:Vector.Crypto
def VPMSUMB : VX1_Int_Ty<1032, "vpmsumb",
@@ -1306,20 +1327,22 @@ def VPMSUMW : VX1_Int_Ty<1160, "vpmsumw",
int_ppc_altivec_crypto_vpmsumw, v4i32>;
def VPMSUMD : VX1_Int_Ty<1224, "vpmsumd",
int_ppc_altivec_crypto_vpmsumd, v2i64>;
-def VPERMXOR : VA1a_Int_Ty<45, "vpermxor",
- int_ppc_altivec_crypto_vpermxor, v16i8>;
+def VPERMXOR : VAForm_1<45, (outs vrrc:$VD), (ins vrrc:$VA, vrrc:$VB, vrrc:$VC),
+ "vpermxor $VD, $VA, $VB, $VC", IIC_VecFP, []>;
// Vector doubleword integer pack and unpack.
-def VPKSDSS : VX1_Int_Ty2<1486, "vpksdss", int_ppc_altivec_vpksdss,
- v4i32, v2i64>;
-def VPKSDUS : VX1_Int_Ty2<1358, "vpksdus", int_ppc_altivec_vpksdus,
- v4i32, v2i64>;
+let hasSideEffects = 1 in {
+ def VPKSDSS : VX1_Int_Ty2<1486, "vpksdss", int_ppc_altivec_vpksdss,
+ v4i32, v2i64>;
+ def VPKSDUS : VX1_Int_Ty2<1358, "vpksdus", int_ppc_altivec_vpksdus,
+ v4i32, v2i64>;
+ def VPKUDUS : VX1_Int_Ty2<1230, "vpkudus", int_ppc_altivec_vpkudus,
+ v4i32, v2i64>;
+}
def VPKUDUM : VXForm_1<1102, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vpkudum $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD,
(vpkudum_shuffle v16i8:$vA, v16i8:$vB))]>;
-def VPKUDUS : VX1_Int_Ty2<1230, "vpkudus", int_ppc_altivec_vpkudus,
- v4i32, v2i64>;
def VUPKHSW : VX2_Int_Ty2<1614, "vupkhsw", int_ppc_altivec_vupkhsw,
v2i64, v4i32>;
def VUPKLSW : VX2_Int_Ty2<1742, "vupklsw", int_ppc_altivec_vupklsw,
@@ -1363,21 +1386,21 @@ def VMSUMUDM : VA1a_Int_Ty3<35, "vmsumudm", int_ppc_altivec_vmsumudm,
// i8 element comparisons.
def VCMPNEB : VCMP < 7, "vcmpneb $vD, $vA, $vB" , v16i8>;
-def VCMPNEB_rec : VCMPo < 7, "vcmpneb. $vD, $vA, $vB" , v16i8>;
+def VCMPNEB_rec : VCMP_rec < 7, "vcmpneb. $vD, $vA, $vB" , v16i8>;
def VCMPNEZB : VCMP <263, "vcmpnezb $vD, $vA, $vB" , v16i8>;
-def VCMPNEZB_rec : VCMPo<263, "vcmpnezb. $vD, $vA, $vB", v16i8>;
+def VCMPNEZB_rec : VCMP_rec<263, "vcmpnezb. $vD, $vA, $vB", v16i8>;
// i16 element comparisons.
def VCMPNEH : VCMP < 71, "vcmpneh $vD, $vA, $vB" , v8i16>;
-def VCMPNEH_rec : VCMPo< 71, "vcmpneh. $vD, $vA, $vB" , v8i16>;
+def VCMPNEH_rec : VCMP_rec< 71, "vcmpneh. $vD, $vA, $vB" , v8i16>;
def VCMPNEZH : VCMP <327, "vcmpnezh $vD, $vA, $vB" , v8i16>;
-def VCMPNEZH_rec : VCMPo<327, "vcmpnezh. $vD, $vA, $vB", v8i16>;
+def VCMPNEZH_rec : VCMP_rec<327, "vcmpnezh. $vD, $vA, $vB", v8i16>;
// i32 element comparisons.
def VCMPNEW : VCMP <135, "vcmpnew $vD, $vA, $vB" , v4i32>;
-def VCMPNEW_rec : VCMPo<135, "vcmpnew. $vD, $vA, $vB" , v4i32>;
+def VCMPNEW_rec : VCMP_rec<135, "vcmpnew. $vD, $vA, $vB" , v4i32>;
def VCMPNEZW : VCMP <391, "vcmpnezw $vD, $vA, $vB" , v4i32>;
-def VCMPNEZW_rec : VCMPo<391, "vcmpnezw. $vD, $vA, $vB", v4i32>;
+def VCMPNEZW_rec : VCMP_rec<391, "vcmpnezw. $vD, $vA, $vB", v4i32>;
// VX-Form: [PO VRT / UIM VRB XO].
// We use VXForm_1 to implement it, that is, we use "VRA" (5 bit) to represent
@@ -1449,11 +1472,16 @@ def VCTZD : VX_VT5_EO5_VB5<1538, 31, "vctzd",
[(set v2i64:$vD, (cttz v2i64:$vB))]>;
// Vector Extend Sign
-def VEXTSB2W : VX_VT5_EO5_VB5<1538, 16, "vextsb2w", []>;
-def VEXTSH2W : VX_VT5_EO5_VB5<1538, 17, "vextsh2w", []>;
-def VEXTSB2D : VX_VT5_EO5_VB5<1538, 24, "vextsb2d", []>;
-def VEXTSH2D : VX_VT5_EO5_VB5<1538, 25, "vextsh2d", []>;
-def VEXTSW2D : VX_VT5_EO5_VB5<1538, 26, "vextsw2d", []>;
+def VEXTSB2W : VX_VT5_EO5_VB5<1538, 16, "vextsb2w",
+ [(set v4i32:$vD, (int_ppc_altivec_vextsb2w v16i8:$vB))]>;
+def VEXTSH2W : VX_VT5_EO5_VB5<1538, 17, "vextsh2w",
+ [(set v4i32:$vD, (int_ppc_altivec_vextsh2w v8i16:$vB))]>;
+def VEXTSB2D : VX_VT5_EO5_VB5<1538, 24, "vextsb2d",
+ [(set v2i64:$vD, (int_ppc_altivec_vextsb2d v16i8:$vB))]>;
+def VEXTSH2D : VX_VT5_EO5_VB5<1538, 25, "vextsh2d",
+ [(set v2i64:$vD, (int_ppc_altivec_vextsh2d v8i16:$vB))]>;
+def VEXTSW2D : VX_VT5_EO5_VB5<1538, 26, "vextsw2d",
+ [(set v2i64:$vD, (int_ppc_altivec_vextsw2d v4i32:$vB))]>;
let isCodeGenOnly = 1 in {
def VEXTSB2Ws : VX_VT5_EO5_VB5s<1538, 16, "vextsb2w", []>;
def VEXTSH2Ws : VX_VT5_EO5_VB5s<1538, 17, "vextsh2w", []>;
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrFormats.td
index 632d4d9deb8a..646efe64a22c 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrFormats.td
@@ -637,12 +637,12 @@ class XForm_17<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
}
class XForm_17a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin>
+ InstrItinClass itin, list<dag> pattern>
: XForm_17<opcode, xo, OOL, IOL, asmstr, itin > {
let FRA = 0;
+ let Pattern = pattern;
}
-// Used for QPX
class XForm_18<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: I<opcode, OOL, IOL, asmstr, itin> {
@@ -1781,14 +1781,6 @@ class AForm_4<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
let Inst{31} = 0;
}
-// Used for QPX
-class AForm_4a<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin, list<dag> pattern>
- : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
- let FRA = 0;
- let FRC = 0;
-}
-
// 1.7.13 M-Form
class MForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
@@ -2099,49 +2091,6 @@ class VX_RD5_RSp5_PS1_XO9<bits<9> xo, dag OOL, dag IOL, string asmstr,
let Inst{23-31} = xo;
}
-// Z23-Form (used by QPX)
-class Z23Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin, list<dag> pattern>
- : I<opcode, OOL, IOL, asmstr, itin> {
- bits<5> FRT;
- bits<5> FRA;
- bits<5> FRB;
- bits<2> idx;
-
- let Pattern = pattern;
-
- bit RC = 0; // set by isRecordForm
-
- let Inst{6-10} = FRT;
- let Inst{11-15} = FRA;
- let Inst{16-20} = FRB;
- let Inst{21-22} = idx;
- let Inst{23-30} = xo;
- let Inst{31} = RC;
-}
-
-class Z23Form_2<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin, list<dag> pattern>
- : Z23Form_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
- let FRB = 0;
-}
-
-class Z23Form_3<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin, list<dag> pattern>
- : I<opcode, OOL, IOL, asmstr, itin> {
- bits<5> FRT;
- bits<12> idx;
-
- let Pattern = pattern;
-
- bit RC = 0; // set by isRecordForm
-
- let Inst{6-10} = FRT;
- let Inst{11-22} = idx;
- let Inst{23-30} = xo;
- let Inst{31} = RC;
-}
-
class Z23Form_8<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: I<opcode, OOL, IOL, asmstr, itin> {
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrHTM.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrHTM.td
index 992ad8216f3b..e59a08774dc5 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrHTM.td
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrHTM.td
@@ -164,9 +164,8 @@ def : Pat<(int_ppc_tsuspend),
(TSR 0)>;
def : Pat<(i64 (int_ppc_ttest)),
- (RLDICL (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
- (TABORTWCI 0, (LI 0), 0), sub_32)),
- 36, 28)>;
+ (i64 (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), (TABORTWCI 0, (LI 0), 0), sub_32))>;
} // [HasHTM]
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index e428e7155e5e..9e3c6c569bd7 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -21,12 +21,15 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/StackMaps.h"
@@ -73,6 +76,14 @@ static cl::opt<bool>
UseOldLatencyCalc("ppc-old-latency-calc", cl::Hidden,
cl::desc("Use the old (incorrect) instruction latency calculation"));
+static cl::opt<float>
+ FMARPFactor("ppc-fma-rp-factor", cl::Hidden, cl::init(1.5),
+ cl::desc("register pressure factor for the transformations."));
+
+static cl::opt<bool> EnableFMARegPressureReduction(
+ "ppc-fma-rp-reduction", cl::Hidden, cl::init(true),
+ cl::desc("enable register pressure reduce in machine combiner pass."));
+
// Pin the vtable to this file.
void PPCInstrInfo::anchor() {}
@@ -259,14 +270,6 @@ bool PPCInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
case PPC::XVMULDP:
case PPC::XVMULSP:
case PPC::XSMULSP:
- // QPX Add:
- case PPC::QVFADD:
- case PPC::QVFADDS:
- case PPC::QVFADDSs:
- // QPX Multiply:
- case PPC::QVFMUL:
- case PPC::QVFMULS:
- case PPC::QVFMULSs:
return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
Inst.getFlag(MachineInstr::MIFlag::FmNsz);
// Fixed point:
@@ -286,23 +289,23 @@ bool PPCInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
#define InfoArrayIdxFMULInst 2
#define InfoArrayIdxAddOpIdx 3
#define InfoArrayIdxMULOpIdx 4
+#define InfoArrayIdxFSubInst 5
// Array keeps info for FMA instructions:
// Index 0(InfoArrayIdxFMAInst): FMA instruction;
-// Index 1(InfoArrayIdxFAddInst): ADD instruction assoaicted with FMA;
-// Index 2(InfoArrayIdxFMULInst): MUL instruction assoaicted with FMA;
+// Index 1(InfoArrayIdxFAddInst): ADD instruction associated with FMA;
+// Index 2(InfoArrayIdxFMULInst): MUL instruction associated with FMA;
// Index 3(InfoArrayIdxAddOpIdx): ADD operand index in FMA operands;
// Index 4(InfoArrayIdxMULOpIdx): first MUL operand index in FMA operands;
-// second MUL operand index is plus 1.
-static const uint16_t FMAOpIdxInfo[][5] = {
+// second MUL operand index is plus 1;
+// Index 5(InfoArrayIdxFSubInst): SUB instruction associated with FMA.
+static const uint16_t FMAOpIdxInfo[][6] = {
// FIXME: Add more FMA instructions like XSNMADDADP and so on.
- {PPC::XSMADDADP, PPC::XSADDDP, PPC::XSMULDP, 1, 2},
- {PPC::XSMADDASP, PPC::XSADDSP, PPC::XSMULSP, 1, 2},
- {PPC::XVMADDADP, PPC::XVADDDP, PPC::XVMULDP, 1, 2},
- {PPC::XVMADDASP, PPC::XVADDSP, PPC::XVMULSP, 1, 2},
- {PPC::FMADD, PPC::FADD, PPC::FMUL, 3, 1},
- {PPC::FMADDS, PPC::FADDS, PPC::FMULS, 3, 1},
- {PPC::QVFMADDSs, PPC::QVFADDSs, PPC::QVFMULSs, 3, 1},
- {PPC::QVFMADD, PPC::QVFADD, PPC::QVFMUL, 3, 1}};
+ {PPC::XSMADDADP, PPC::XSADDDP, PPC::XSMULDP, 1, 2, PPC::XSSUBDP},
+ {PPC::XSMADDASP, PPC::XSADDSP, PPC::XSMULSP, 1, 2, PPC::XSSUBSP},
+ {PPC::XVMADDADP, PPC::XVADDDP, PPC::XVMULDP, 1, 2, PPC::XVSUBDP},
+ {PPC::XVMADDASP, PPC::XVADDSP, PPC::XVMULSP, 1, 2, PPC::XVSUBSP},
+ {PPC::FMADD, PPC::FADD, PPC::FMUL, 3, 1, PPC::FSUB},
+ {PPC::FMADDS, PPC::FADDS, PPC::FMULS, 3, 1, PPC::FSUBS}};
// Check if an opcode is a FMA instruction. If it is, return the index in array
// FMAOpIdxInfo. Otherwise, return -1.
@@ -313,6 +316,8 @@ int16_t PPCInstrInfo::getFMAOpIdxInfo(unsigned Opcode) const {
return -1;
}
+// On the PowerPC target, we have two kinds of FMA-related patterns:
+// 1: Improve ILP.
// Try to reassociate FMA chains like below:
//
// Pattern 1:
@@ -336,11 +341,35 @@ int16_t PPCInstrInfo::getFMAOpIdxInfo(unsigned Opcode) const {
//
// breaking the dependency between A and B, allowing FMA to be executed in
// parallel (or back-to-back in a pipeline) instead of depending on each other.
+//
+// 2: Reduce register pressure.
+// Try to reassociate an FMA with an FSUB and a constant like below,
+// where C is a floating-point constant:
+//
+// Pattern 1:
+// A = FSUB X, Y (Leaf)
+// D = FMA B, C, A (Root)
+// -->
+// A = FMA B, Y, -C
+// D = FMA A, X, C
+//
+// Pattern 2:
+// A = FSUB X, Y (Leaf)
+// D = FMA B, A, C (Root)
+// -->
+// A = FMA B, Y, -C
+// D = FMA A, X, C
+//
+// Before the transformation, A must be assigned a hardware register different
+// from D's. After the transformation, A and D must be assigned the same
+// hardware register because of the tied-operand constraint of FMA
+// instructions.
+//
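As a source-level illustration of register pressure pattern 1 above, here is a minimal C++ sketch (not part of the patch; the constant 3.0 and the function names are invented). The rewrite removes the FSUB so its result no longer has to stay live into the FMA, at the cost of materializing -C from a new constant-pool entry; it is only legal under the reassociation fast-math flags checked below.

  #include <cmath>

  // Before: the difference (x - y) must be kept live into the FMA.
  double before(double b, double x, double y) {
    const double c = 3.0;          // loaded from the constant pool
    double a = x - y;              // FSUB (Leaf)
    return std::fma(c, a, b);      // FMA  (Root): C*(X - Y) + B
  }

  // After: A = FMA B, Y, -C and D = FMA A, X, C give the same value.
  double after(double b, double x, double y) {
    const double c = 3.0;
    double a = std::fma(-c, y, b); // B - C*Y, with -C from a new pool entry
    return std::fma(c, x, a);      // C*X + A = C*X + B - C*Y
  }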
bool PPCInstrInfo::getFMAPatterns(
- MachineInstr &Root,
- SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+ MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
+ bool DoRegPressureReduce) const {
MachineBasicBlock *MBB = Root.getParent();
- const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ const MachineRegisterInfo *MRI = &MBB->getParent()->getRegInfo();
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
auto IsAllOpsVirtualReg = [](const MachineInstr &Instr) {
for (const auto &MO : Instr.explicit_operands())
@@ -349,16 +378,35 @@ bool PPCInstrInfo::getFMAPatterns(
return true;
};
- auto IsReassociable = [&](const MachineInstr &Instr, int16_t &AddOpIdx,
- bool IsLeaf, bool IsAdd) {
- int16_t Idx = -1;
- if (!IsAdd) {
- Idx = getFMAOpIdxInfo(Instr.getOpcode());
- if (Idx < 0)
- return false;
- } else if (Instr.getOpcode() !=
- FMAOpIdxInfo[getFMAOpIdxInfo(Root.getOpcode())]
- [InfoArrayIdxFAddInst])
+ auto IsReassociableAddOrSub = [&](const MachineInstr &Instr,
+ unsigned OpType) {
+ if (Instr.getOpcode() !=
+ FMAOpIdxInfo[getFMAOpIdxInfo(Root.getOpcode())][OpType])
+ return false;
+
+ // The instruction can be reassociated only if its fast-math flags allow it.
+ if (!(Instr.getFlag(MachineInstr::MIFlag::FmReassoc) &&
+ Instr.getFlag(MachineInstr::MIFlag::FmNsz)))
+ return false;
+
+ // Instruction operands are virtual registers for reassociation.
+ if (!IsAllOpsVirtualReg(Instr))
+ return false;
+
+ // For register pressure reassociation, the FSUB must have only one use, as
+ // we want to delete it and free the register holding its result.
+ if (OpType == InfoArrayIdxFSubInst &&
+ !MRI->hasOneNonDBGUse(Instr.getOperand(0).getReg()))
+ return false;
+
+ return true;
+ };
+
+ auto IsReassociableFMA = [&](const MachineInstr &Instr, int16_t &AddOpIdx,
+ int16_t &MulOpIdx, bool IsLeaf) {
+ int16_t Idx = getFMAOpIdxInfo(Instr.getOpcode());
+ if (Idx < 0)
return false;
// Instruction can be reassociated.
@@ -371,65 +419,356 @@ bool PPCInstrInfo::getFMAPatterns(
if (!IsAllOpsVirtualReg(Instr))
return false;
- if (IsAdd && IsLeaf)
+ MulOpIdx = FMAOpIdxInfo[Idx][InfoArrayIdxMULOpIdx];
+ if (IsLeaf)
return true;
AddOpIdx = FMAOpIdxInfo[Idx][InfoArrayIdxAddOpIdx];
const MachineOperand &OpAdd = Instr.getOperand(AddOpIdx);
- MachineInstr *MIAdd = MRI.getUniqueVRegDef(OpAdd.getReg());
+ MachineInstr *MIAdd = MRI->getUniqueVRegDef(OpAdd.getReg());
// If 'add' operand's def is not in current block, don't do ILP related opt.
if (!MIAdd || MIAdd->getParent() != MBB)
return false;
// If this is not Leaf FMA Instr, its 'add' operand should only have one use
// as this fma will be changed later.
- return IsLeaf ? true : MRI.hasOneNonDBGUse(OpAdd.getReg());
+ return IsLeaf ? true : MRI->hasOneNonDBGUse(OpAdd.getReg());
};
int16_t AddOpIdx = -1;
+ int16_t MulOpIdx = -1;
+
+ bool IsUsedOnceL = false;
+ bool IsUsedOnceR = false;
+ MachineInstr *MULInstrL = nullptr;
+ MachineInstr *MULInstrR = nullptr;
+
+ auto IsRPReductionCandidate = [&]() {
+ // Currently, we only support float and double.
+ // FIXME: add support for other types.
+ unsigned Opcode = Root.getOpcode();
+ if (Opcode != PPC::XSMADDASP && Opcode != PPC::XSMADDADP)
+ return false;
+
+ // Root must be a valid FMA-like instruction.
+ // Treat it as a leaf since we don't care about its add operand.
+ if (IsReassociableFMA(Root, AddOpIdx, MulOpIdx, true)) {
+ assert((MulOpIdx >= 0) && "mul operand index not right!");
+ Register MULRegL = TRI->lookThruSingleUseCopyChain(
+ Root.getOperand(MulOpIdx).getReg(), MRI);
+ Register MULRegR = TRI->lookThruSingleUseCopyChain(
+ Root.getOperand(MulOpIdx + 1).getReg(), MRI);
+ if (!MULRegL && !MULRegR)
+ return false;
+
+ if (MULRegL && !MULRegR) {
+ MULRegR =
+ TRI->lookThruCopyLike(Root.getOperand(MulOpIdx + 1).getReg(), MRI);
+ IsUsedOnceL = true;
+ } else if (!MULRegL && MULRegR) {
+ MULRegL =
+ TRI->lookThruCopyLike(Root.getOperand(MulOpIdx).getReg(), MRI);
+ IsUsedOnceR = true;
+ } else {
+ IsUsedOnceL = true;
+ IsUsedOnceR = true;
+ }
+
+ if (!Register::isVirtualRegister(MULRegL) ||
+ !Register::isVirtualRegister(MULRegR))
+ return false;
+
+ MULInstrL = MRI->getVRegDef(MULRegL);
+ MULInstrR = MRI->getVRegDef(MULRegR);
+ return true;
+ }
+ return false;
+ };
+
+ // Register pressure fma reassociation patterns.
+ if (DoRegPressureReduce && IsRPReductionCandidate()) {
+ assert((MULInstrL && MULInstrR) && "wrong register preduction candidate!");
+ // Register pressure pattern 1
+ if (isLoadFromConstantPool(MULInstrL) && IsUsedOnceR &&
+ IsReassociableAddOrSub(*MULInstrR, InfoArrayIdxFSubInst)) {
+ LLVM_DEBUG(dbgs() << "add pattern REASSOC_XY_BCA\n");
+ Patterns.push_back(MachineCombinerPattern::REASSOC_XY_BCA);
+ return true;
+ }
+
+ // Register pressure pattern 2
+ if ((isLoadFromConstantPool(MULInstrR) && IsUsedOnceL &&
+ IsReassociableAddOrSub(*MULInstrL, InfoArrayIdxFSubInst))) {
+ LLVM_DEBUG(dbgs() << "add pattern REASSOC_XY_BAC\n");
+ Patterns.push_back(MachineCombinerPattern::REASSOC_XY_BAC);
+ return true;
+ }
+ }
+
+ // ILP fma reassociation patterns.
// Root must be a valid FMA like instruction.
- if (!IsReassociable(Root, AddOpIdx, false, false))
+ AddOpIdx = -1;
+ if (!IsReassociableFMA(Root, AddOpIdx, MulOpIdx, false))
return false;
assert((AddOpIdx >= 0) && "add operand index not right!");
Register RegB = Root.getOperand(AddOpIdx).getReg();
- MachineInstr *Prev = MRI.getUniqueVRegDef(RegB);
+ MachineInstr *Prev = MRI->getUniqueVRegDef(RegB);
// Prev must be a valid FMA like instruction.
AddOpIdx = -1;
- if (!IsReassociable(*Prev, AddOpIdx, false, false))
+ if (!IsReassociableFMA(*Prev, AddOpIdx, MulOpIdx, false))
return false;
assert((AddOpIdx >= 0) && "add operand index not right!");
Register RegA = Prev->getOperand(AddOpIdx).getReg();
- MachineInstr *Leaf = MRI.getUniqueVRegDef(RegA);
+ MachineInstr *Leaf = MRI->getUniqueVRegDef(RegA);
AddOpIdx = -1;
- if (IsReassociable(*Leaf, AddOpIdx, true, false)) {
+ if (IsReassociableFMA(*Leaf, AddOpIdx, MulOpIdx, true)) {
Patterns.push_back(MachineCombinerPattern::REASSOC_XMM_AMM_BMM);
+ LLVM_DEBUG(dbgs() << "add pattern REASSOC_XMM_AMM_BMM\n");
return true;
}
- if (IsReassociable(*Leaf, AddOpIdx, true, true)) {
+ if (IsReassociableAddOrSub(*Leaf, InfoArrayIdxFAddInst)) {
Patterns.push_back(MachineCombinerPattern::REASSOC_XY_AMM_BMM);
+ LLVM_DEBUG(dbgs() << "add pattern REASSOC_XY_AMM_BMM\n");
return true;
}
return false;
}
+void PPCInstrInfo::finalizeInsInstrs(
+ MachineInstr &Root, MachineCombinerPattern &P,
+ SmallVectorImpl<MachineInstr *> &InsInstrs) const {
+ assert(!InsInstrs.empty() && "Instructions set to be inserted is empty!");
+
+ MachineFunction *MF = Root.getMF();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MachineConstantPool *MCP = MF->getConstantPool();
+
+ int16_t Idx = getFMAOpIdxInfo(Root.getOpcode());
+ if (Idx < 0)
+ return;
+
+ uint16_t FirstMulOpIdx = FMAOpIdxInfo[Idx][InfoArrayIdxMULOpIdx];
+
+ // For now we only need to fix up the placeholder for the register pressure
+ // reduction patterns.
+ Register ConstReg = 0;
+ switch (P) {
+ case MachineCombinerPattern::REASSOC_XY_BCA:
+ ConstReg =
+ TRI->lookThruCopyLike(Root.getOperand(FirstMulOpIdx).getReg(), MRI);
+ break;
+ case MachineCombinerPattern::REASSOC_XY_BAC:
+ ConstReg =
+ TRI->lookThruCopyLike(Root.getOperand(FirstMulOpIdx + 1).getReg(), MRI);
+ break;
+ default:
+ // Not a register pressure reduction pattern.
+ return;
+ }
+
+ MachineInstr *ConstDefInstr = MRI->getVRegDef(ConstReg);
+ // Get const value from const pool.
+ const Constant *C = getConstantFromConstantPool(ConstDefInstr);
+ assert(isa<llvm::ConstantFP>(C) && "not a valid constant!");
+
+ // Get negative fp const.
+ APFloat F1((dyn_cast<ConstantFP>(C))->getValueAPF());
+ F1.changeSign();
+ Constant *NegC = ConstantFP::get(dyn_cast<ConstantFP>(C)->getContext(), F1);
+ Align Alignment = MF->getDataLayout().getPrefTypeAlign(C->getType());
+
+ // Put negative fp const into constant pool.
+ unsigned ConstPoolIdx = MCP->getConstantPoolIndex(NegC, Alignment);
+
+ MachineOperand *Placeholder = nullptr;
+ // Record the placeholder PPC::ZERO8 we add in reassociateFMA.
+ for (auto *Inst : InsInstrs) {
+ for (MachineOperand &Operand : Inst->explicit_operands()) {
+ assert(Operand.isReg() && "Invalid instruction in InsInstrs!");
+ if (Operand.getReg() == PPC::ZERO8) {
+ Placeholder = &Operand;
+ break;
+ }
+ }
+ }
+
+ assert(Placeholder && "Placeholder does not exist!");
+
+ // Generate instructions to load the const fp from constant pool.
+ // We only support PPC64 and medium code model.
+ Register LoadNewConst =
+ generateLoadForNewConst(ConstPoolIdx, &Root, C->getType(), InsInstrs);
+
+ // Fill the placeholder with the new load from constant pool.
+ Placeholder->setReg(LoadNewConst);
+}
+
+bool PPCInstrInfo::shouldReduceRegisterPressure(
+ MachineBasicBlock *MBB, RegisterClassInfo *RegClassInfo) const {
+
+ if (!EnableFMARegPressureReduction)
+ return false;
+
+ // Currently, we only enable register pressure reduction in the machine
+ // combiner for: 1) PPC64; 2) the medium code model; 3) Power9 with vector
+ // support.
+ //
+ // In that configuration, the following instructions are needed to access a
+ // TOC entry:
+ //
+ // %6:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, %const.0
+ // %7:vssrc = DFLOADf32 target-flags(ppc-toc-lo) %const.0,
+ // killed %6:g8rc_and_g8rc_nox0, implicit $x2 :: (load 4 from constant-pool)
+ //
+ // FIXME: add more supported targets, like Small and Large code model, PPC32,
+ // AIX.
+ if (!(Subtarget.isPPC64() && Subtarget.hasP9Vector() &&
+ Subtarget.getTargetMachine().getCodeModel() == CodeModel::Medium))
+ return false;
+
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+
+ auto GetMBBPressure = [&](MachineBasicBlock *MBB) -> std::vector<unsigned> {
+ RegionPressure Pressure;
+ RegPressureTracker RPTracker(Pressure);
+
+ // Initialize the register pressure tracker.
+ RPTracker.init(MBB->getParent(), RegClassInfo, nullptr, MBB, MBB->end(),
+ /*TrackLaneMasks*/ false, /*TrackUntiedDefs=*/true);
+
+ for (MachineBasicBlock::iterator MII = MBB->instr_end(),
+ MIE = MBB->instr_begin();
+ MII != MIE; --MII) {
+ MachineInstr &MI = *std::prev(MII);
+ if (MI.isDebugValue() || MI.isDebugLabel())
+ continue;
+ RegisterOperands RegOpers;
+ RegOpers.collect(MI, *TRI, *MRI, false, false);
+ RPTracker.recedeSkipDebugValues();
+ assert(&*RPTracker.getPos() == &MI && "RPTracker sync error!");
+ RPTracker.recede(RegOpers);
+ }
+
+ // Close the RPTracker to finalize live ins.
+ RPTracker.closeRegion();
+
+ return RPTracker.getPressure().MaxSetPressure;
+ };
+
+ // For now we only care about float and double type fma.
+ unsigned VSSRCLimit = TRI->getRegPressureSetLimit(
+ *MBB->getParent(), PPC::RegisterPressureSets::VSSRC);
+
+ // Only reduce register pressure when pressure is high.
+ return GetMBBPressure(MBB)[PPC::RegisterPressureSets::VSSRC] >
+ (float)VSSRCLimit * FMARPFactor;
+}
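The gate above reduces to a single comparison between the block's maximum VSSRC set pressure and a scaled register-file limit. A standalone sketch with assumed numbers (in the real code the limit comes from TRI->getRegPressureSetLimit and the pressure from the RegPressureTracker):

  #include <iostream>

  int main() {
    unsigned VSSRCLimit = 32;     // assumed pressure-set limit for VSSRC
    float FMARPFactor = 1.5f;     // default of -ppc-fma-rp-factor
    unsigned MaxSetPressure = 50; // assumed block maximum from the tracker
    bool Reduce = MaxSetPressure > (float)VSSRCLimit * FMARPFactor;
    std::cout << (Reduce ? "reduce" : "skip") << '\n'; // 50 > 48 -> "reduce"
    return 0;
  }

Raising -ppc-fma-rp-factor makes the transformation fire less often, and -ppc-fma-rp-reduction=false (both options are defined near the top of this file) disables it entirely.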
+
+bool PPCInstrInfo::isLoadFromConstantPool(MachineInstr *I) const {
+ // Check that I has exactly one memory operand and that it is a load from the
+ // constant pool.
+ if (!I->hasOneMemOperand())
+ return false;
+
+ MachineMemOperand *Op = I->memoperands()[0];
+ return Op->isLoad() && Op->getPseudoValue() &&
+ Op->getPseudoValue()->kind() == PseudoSourceValue::ConstantPool;
+}
+
+Register PPCInstrInfo::generateLoadForNewConst(
+ unsigned Idx, MachineInstr *MI, Type *Ty,
+ SmallVectorImpl<MachineInstr *> &InsInstrs) const {
+ // For now we only support PPC64, the medium code model, and P9 with vector
+ // support, so the instruction pattern used to access the constant pool is
+ // fixed. See shouldReduceRegisterPressure.
+ assert((Subtarget.isPPC64() && Subtarget.hasP9Vector() &&
+ Subtarget.getTargetMachine().getCodeModel() == CodeModel::Medium) &&
+ "Target not supported!\n");
+
+ MachineFunction *MF = MI->getMF();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+
+ // Generate ADDIStocHA8
+ Register VReg1 = MRI->createVirtualRegister(&PPC::G8RC_and_G8RC_NOX0RegClass);
+ MachineInstrBuilder TOCOffset =
+ BuildMI(*MF, MI->getDebugLoc(), get(PPC::ADDIStocHA8), VReg1)
+ .addReg(PPC::X2)
+ .addConstantPoolIndex(Idx);
+
+ assert((Ty->isFloatTy() || Ty->isDoubleTy()) &&
+ "Only float and double are supported!");
+
+ unsigned LoadOpcode;
+ // Should be float type or double type.
+ if (Ty->isFloatTy())
+ LoadOpcode = PPC::DFLOADf32;
+ else
+ LoadOpcode = PPC::DFLOADf64;
+
+ const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg());
+ Register VReg2 = MRI->createVirtualRegister(RC);
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getConstantPool(*MF), MachineMemOperand::MOLoad,
+ Ty->getScalarSizeInBits() / 8, MF->getDataLayout().getPrefTypeAlign(Ty));
+
+ // Generate Load from constant pool.
+ MachineInstrBuilder Load =
+ BuildMI(*MF, MI->getDebugLoc(), get(LoadOpcode), VReg2)
+ .addConstantPoolIndex(Idx)
+ .addReg(VReg1, getKillRegState(true))
+ .addMemOperand(MMO);
+
+ Load->getOperand(1).setTargetFlags(PPCII::MO_TOC_LO);
+
+ // Insert the toc load instructions into InsInstrs.
+ InsInstrs.insert(InsInstrs.begin(), Load);
+ InsInstrs.insert(InsInstrs.begin(), TOCOffset);
+ return VReg2;
+}
+
+// This function returns the constant-pool value if \p I is a load from the
+// constant pool.
+const Constant *
+PPCInstrInfo::getConstantFromConstantPool(MachineInstr *I) const {
+ MachineFunction *MF = I->getMF();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ MachineConstantPool *MCP = MF->getConstantPool();
+ assert(I->mayLoad() && "Should be a load instruction.\n");
+ for (auto MO : I->uses()) {
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (Reg == 0 || !Register::isVirtualRegister(Reg))
+ continue;
+ // Find the toc address.
+ MachineInstr *DefMI = MRI->getVRegDef(Reg);
+ for (auto MO2 : DefMI->uses())
+ if (MO2.isCPI())
+ return (MCP->getConstants())[MO2.getIndex()].Val.ConstVal;
+ }
+ return nullptr;
+}
+
bool PPCInstrInfo::getMachineCombinerPatterns(
- MachineInstr &Root,
- SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+ MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
+ bool DoRegPressureReduce) const {
// Using the machine combiner in this way is potentially expensive, so
// restrict to when aggressive optimizations are desired.
if (Subtarget.getTargetMachine().getOptLevel() != CodeGenOpt::Aggressive)
return false;
- if (getFMAPatterns(Root, Patterns))
+ if (getFMAPatterns(Root, Patterns, DoRegPressureReduce))
return true;
- return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
+ return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
+ DoRegPressureReduce);
}
void PPCInstrInfo::genAlternativeCodeSequence(
@@ -440,6 +779,8 @@ void PPCInstrInfo::genAlternativeCodeSequence(
switch (Pattern) {
case MachineCombinerPattern::REASSOC_XY_AMM_BMM:
case MachineCombinerPattern::REASSOC_XMM_AMM_BMM:
+ case MachineCombinerPattern::REASSOC_XY_BCA:
+ case MachineCombinerPattern::REASSOC_XY_BAC:
reassociateFMA(Root, Pattern, InsInstrs, DelInstrs, InstrIdxForVirtReg);
break;
default:
@@ -450,8 +791,6 @@ void PPCInstrInfo::genAlternativeCodeSequence(
}
}
-// Currently, only handle two patterns REASSOC_XY_AMM_BMM and
-// REASSOC_XMM_AMM_BMM. See comments for getFMAPatterns.
void PPCInstrInfo::reassociateFMA(
MachineInstr &Root, MachineCombinerPattern Pattern,
SmallVectorImpl<MachineInstr *> &InsInstrs,
@@ -459,6 +798,7 @@ void PPCInstrInfo::reassociateFMA(
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
MachineFunction *MF = Root.getMF();
MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
MachineOperand &OpC = Root.getOperand(0);
Register RegC = OpC.getReg();
const TargetRegisterClass *RC = MRI.getRegClass(RegC);
@@ -468,13 +808,42 @@ void PPCInstrInfo::reassociateFMA(
int16_t Idx = getFMAOpIdxInfo(FmaOp);
assert(Idx >= 0 && "Root must be a FMA instruction");
+ bool IsILPReassociate =
+ (Pattern == MachineCombinerPattern::REASSOC_XY_AMM_BMM) ||
+ (Pattern == MachineCombinerPattern::REASSOC_XMM_AMM_BMM);
+
uint16_t AddOpIdx = FMAOpIdxInfo[Idx][InfoArrayIdxAddOpIdx];
uint16_t FirstMulOpIdx = FMAOpIdxInfo[Idx][InfoArrayIdxMULOpIdx];
- MachineInstr *Prev = MRI.getUniqueVRegDef(Root.getOperand(AddOpIdx).getReg());
- MachineInstr *Leaf =
- MRI.getUniqueVRegDef(Prev->getOperand(AddOpIdx).getReg());
- uint16_t IntersectedFlags =
- Root.getFlags() & Prev->getFlags() & Leaf->getFlags();
+
+ MachineInstr *Prev = nullptr;
+ MachineInstr *Leaf = nullptr;
+ switch (Pattern) {
+ default:
+ llvm_unreachable("not recognized pattern!");
+ case MachineCombinerPattern::REASSOC_XY_AMM_BMM:
+ case MachineCombinerPattern::REASSOC_XMM_AMM_BMM:
+ Prev = MRI.getUniqueVRegDef(Root.getOperand(AddOpIdx).getReg());
+ Leaf = MRI.getUniqueVRegDef(Prev->getOperand(AddOpIdx).getReg());
+ break;
+ case MachineCombinerPattern::REASSOC_XY_BAC: {
+ Register MULReg =
+ TRI->lookThruCopyLike(Root.getOperand(FirstMulOpIdx).getReg(), &MRI);
+ Leaf = MRI.getVRegDef(MULReg);
+ break;
+ }
+ case MachineCombinerPattern::REASSOC_XY_BCA: {
+ Register MULReg = TRI->lookThruCopyLike(
+ Root.getOperand(FirstMulOpIdx + 1).getReg(), &MRI);
+ Leaf = MRI.getVRegDef(MULReg);
+ break;
+ }
+ }
+
+ uint16_t IntersectedFlags = 0;
+ if (IsILPReassociate)
+ IntersectedFlags = Root.getFlags() & Prev->getFlags() & Leaf->getFlags();
+ else
+ IntersectedFlags = Root.getFlags() & Leaf->getFlags();
auto GetOperandInfo = [&](const MachineOperand &Operand, Register &Reg,
bool &KillFlag) {
@@ -484,36 +853,51 @@ void PPCInstrInfo::reassociateFMA(
};
auto GetFMAInstrInfo = [&](const MachineInstr &Instr, Register &MulOp1,
- Register &MulOp2, bool &MulOp1KillFlag,
- bool &MulOp2KillFlag) {
+ Register &MulOp2, Register &AddOp,
+ bool &MulOp1KillFlag, bool &MulOp2KillFlag,
+ bool &AddOpKillFlag) {
GetOperandInfo(Instr.getOperand(FirstMulOpIdx), MulOp1, MulOp1KillFlag);
GetOperandInfo(Instr.getOperand(FirstMulOpIdx + 1), MulOp2, MulOp2KillFlag);
+ GetOperandInfo(Instr.getOperand(AddOpIdx), AddOp, AddOpKillFlag);
};
- Register RegM11, RegM12, RegX, RegY, RegM21, RegM22, RegM31, RegM32;
+ Register RegM11, RegM12, RegX, RegY, RegM21, RegM22, RegM31, RegM32, RegA11,
+ RegA21, RegB;
bool KillX = false, KillY = false, KillM11 = false, KillM12 = false,
- KillM21 = false, KillM22 = false, KillM31 = false, KillM32 = false;
+ KillM21 = false, KillM22 = false, KillM31 = false, KillM32 = false,
+ KillA11 = false, KillA21 = false, KillB = false;
- GetFMAInstrInfo(Root, RegM31, RegM32, KillM31, KillM32);
- GetFMAInstrInfo(*Prev, RegM21, RegM22, KillM21, KillM22);
+ GetFMAInstrInfo(Root, RegM31, RegM32, RegB, KillM31, KillM32, KillB);
+
+ if (IsILPReassociate)
+ GetFMAInstrInfo(*Prev, RegM21, RegM22, RegA21, KillM21, KillM22, KillA21);
if (Pattern == MachineCombinerPattern::REASSOC_XMM_AMM_BMM) {
- GetFMAInstrInfo(*Leaf, RegM11, RegM12, KillM11, KillM12);
+ GetFMAInstrInfo(*Leaf, RegM11, RegM12, RegA11, KillM11, KillM12, KillA11);
GetOperandInfo(Leaf->getOperand(AddOpIdx), RegX, KillX);
} else if (Pattern == MachineCombinerPattern::REASSOC_XY_AMM_BMM) {
GetOperandInfo(Leaf->getOperand(1), RegX, KillX);
GetOperandInfo(Leaf->getOperand(2), RegY, KillY);
+ } else {
+ // Get FSUB instruction info.
+ GetOperandInfo(Leaf->getOperand(1), RegX, KillX);
+ GetOperandInfo(Leaf->getOperand(2), RegY, KillY);
}
// Create new virtual registers for the new results instead of
// recycling legacy ones because the MachineCombiner's computation of the
// critical path requires a new register definition rather than an existing
// one.
+ // For register pressure reassociation, we only need to create one virtual
+ // register for the new FMA.
Register NewVRA = MRI.createVirtualRegister(RC);
InstrIdxForVirtReg.insert(std::make_pair(NewVRA, 0));
- Register NewVRB = MRI.createVirtualRegister(RC);
- InstrIdxForVirtReg.insert(std::make_pair(NewVRB, 1));
+ Register NewVRB = 0;
+ if (IsILPReassociate) {
+ NewVRB = MRI.createVirtualRegister(RC);
+ InstrIdxForVirtReg.insert(std::make_pair(NewVRB, 1));
+ }
Register NewVRD = 0;
if (Pattern == MachineCombinerPattern::REASSOC_XMM_AMM_BMM) {
@@ -532,7 +916,11 @@ void PPCInstrInfo::reassociateFMA(
MI->getOperand(FirstMulOpIdx + 1).setIsKill(KillRegMul2);
};
- if (Pattern == MachineCombinerPattern::REASSOC_XY_AMM_BMM) {
+ MachineInstrBuilder NewARegPressure, NewCRegPressure;
+ switch (Pattern) {
+ default:
+ llvm_unreachable("not recognized pattern!");
+ case MachineCombinerPattern::REASSOC_XY_AMM_BMM: {
// Create new instructions for insertion.
MachineInstrBuilder MINewB =
BuildMI(*MF, Prev->getDebugLoc(), get(FmaOp), NewVRB)
@@ -565,7 +953,9 @@ void PPCInstrInfo::reassociateFMA(
InsInstrs.push_back(MINewA);
InsInstrs.push_back(MINewB);
InsInstrs.push_back(MINewC);
- } else if (Pattern == MachineCombinerPattern::REASSOC_XMM_AMM_BMM) {
+ break;
+ }
+ case MachineCombinerPattern::REASSOC_XMM_AMM_BMM: {
assert(NewVRD && "new FMA register not created!");
// Create new instructions for insertion.
MachineInstrBuilder MINewA =
@@ -607,6 +997,47 @@ void PPCInstrInfo::reassociateFMA(
InsInstrs.push_back(MINewB);
InsInstrs.push_back(MINewD);
InsInstrs.push_back(MINewC);
+ break;
+ }
+ case MachineCombinerPattern::REASSOC_XY_BAC:
+ case MachineCombinerPattern::REASSOC_XY_BCA: {
+ Register VarReg;
+ bool KillVarReg = false;
+ if (Pattern == MachineCombinerPattern::REASSOC_XY_BCA) {
+ VarReg = RegM31;
+ KillVarReg = KillM31;
+ } else {
+ VarReg = RegM32;
+ KillVarReg = KillM32;
+ }
+ // We don't want to create the negated constant-pool entry too early, as the
+ // created entry will not be deleted even if it has no users. Since all
+ // operands of Leaf and Root are virtual registers, we use the zero register
+ // here as a placeholder. When the InsInstrs sequence is selected by the
+ // MachineCombiner, we call finalizeInsInstrs to replace the zero register
+ // with a virtual register holding the load from the constant pool.
+ NewARegPressure = BuildMI(*MF, Root.getDebugLoc(), get(FmaOp), NewVRA)
+ .addReg(RegB, getKillRegState(RegB))
+ .addReg(RegY, getKillRegState(KillY))
+ .addReg(PPC::ZERO8);
+ NewCRegPressure = BuildMI(*MF, Root.getDebugLoc(), get(FmaOp), RegC)
+ .addReg(NewVRA, getKillRegState(true))
+ .addReg(RegX, getKillRegState(KillX))
+ .addReg(VarReg, getKillRegState(KillVarReg));
+ // For now, we only support xsmaddadp/xsmaddasp, whose add operands are both
+ // at index 1, so there is no need to adjust the index.
+ // FIXME: when adding support for more FMA instructions, like fma/fmas,
+ // adjust the operand index here.
+ break;
+ }
+ }
+
+ if (!IsILPReassociate) {
+ setSpecialOperandAttr(*NewARegPressure, IntersectedFlags);
+ setSpecialOperandAttr(*NewCRegPressure, IntersectedFlags);
+
+ InsInstrs.push_back(NewARegPressure);
+ InsInstrs.push_back(NewCRegPressure);
}
assert(!InsInstrs.empty() &&
@@ -614,7 +1045,8 @@ void PPCInstrInfo::reassociateFMA(
// Record old instructions for deletion.
DelInstrs.push_back(Leaf);
- DelInstrs.push_back(Prev);
+ if (IsILPReassociate)
+ DelInstrs.push_back(Prev);
DelInstrs.push_back(&Root);
}
@@ -666,7 +1098,6 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case PPC::LI8:
case PPC::LIS:
case PPC::LIS8:
- case PPC::QVGPCI:
case PPC::ADDIStocHA:
case PPC::ADDIStocHA8:
case PPC::ADDItocL:
@@ -683,6 +1114,7 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case PPC::V_SETALLONES:
case PPC::CRSET:
case PPC::CRUNSET:
+ case PPC::XXSETACCZ:
return true;
}
return false;
@@ -1283,14 +1715,22 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
.addImm(31);
return;
} else if (PPC::CRRCRegClass.contains(SrcReg) &&
- PPC::G8RCRegClass.contains(DestReg)) {
- BuildMI(MBB, I, DL, get(PPC::MFOCRF8), DestReg).addReg(SrcReg);
- getKillRegState(KillSrc);
- return;
- } else if (PPC::CRRCRegClass.contains(SrcReg) &&
- PPC::GPRCRegClass.contains(DestReg)) {
- BuildMI(MBB, I, DL, get(PPC::MFOCRF), DestReg).addReg(SrcReg);
+ (PPC::G8RCRegClass.contains(DestReg) ||
+ PPC::GPRCRegClass.contains(DestReg))) {
+ bool Is64Bit = PPC::G8RCRegClass.contains(DestReg);
+ unsigned MvCode = Is64Bit ? PPC::MFOCRF8 : PPC::MFOCRF;
+ unsigned ShCode = Is64Bit ? PPC::RLWINM8 : PPC::RLWINM;
+ unsigned CRNum = TRI->getEncodingValue(SrcReg);
+ BuildMI(MBB, I, DL, get(MvCode), DestReg).addReg(SrcReg);
getKillRegState(KillSrc);
+ if (CRNum == 7)
+ return;
+ // Shift the CR bits to make the CR field in the lowest 4 bits of GRC.
+ BuildMI(MBB, I, DL, get(ShCode), DestReg)
+ .addReg(DestReg, RegState::Kill)
+ .addImm(CRNum * 4 + 4)
+ .addImm(28)
+ .addImm(31);
return;
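As a concrete (illustrative) instance of the sequence just added: copying CR2 into a GPR emits an mfocrf that leaves the 4-bit field in big-endian bits 8-11 of the low word, and then rlwinm dst, dst, 12, 28, 31 rotates it into bits 28-31; CR7 skips the rotate because its field is already there. The rotate amount can be checked in isolation:

  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t CR = 0x00F00000u;              // pretend the CR2 field is 0b1111
    unsigned CRNum = 2, SH = CRNum * 4 + 4; // 12, as emitted above
    uint32_t Rot = (CR << SH) | (CR >> (32 - SH));
    assert((Rot & 0xFu) == 0xFu);           // the field now occupies bits 28-31
    return 0;
  }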
} else if (PPC::G8RCRegClass.contains(SrcReg) &&
PPC::VSFRCRegClass.contains(DestReg)) {
@@ -1343,17 +1783,53 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
else if (PPC::VSFRCRegClass.contains(DestReg, SrcReg) ||
PPC::VSSRCRegClass.contains(DestReg, SrcReg))
Opc = (Subtarget.hasP9Vector()) ? PPC::XSCPSGNDP : PPC::XXLORf;
- else if (PPC::QFRCRegClass.contains(DestReg, SrcReg))
- Opc = PPC::QVFMR;
- else if (PPC::QSRCRegClass.contains(DestReg, SrcReg))
- Opc = PPC::QVFMRs;
- else if (PPC::QBRCRegClass.contains(DestReg, SrcReg))
- Opc = PPC::QVFMRb;
+ else if (Subtarget.pairedVectorMemops() &&
+ PPC::VSRpRCRegClass.contains(DestReg, SrcReg)) {
+ if (SrcReg > PPC::VSRp15)
+ SrcReg = PPC::V0 + (SrcReg - PPC::VSRp16) * 2;
+ else
+ SrcReg = PPC::VSL0 + (SrcReg - PPC::VSRp0) * 2;
+ if (DestReg > PPC::VSRp15)
+ DestReg = PPC::V0 + (DestReg - PPC::VSRp16) * 2;
+ else
+ DestReg = PPC::VSL0 + (DestReg - PPC::VSRp0) * 2;
+ BuildMI(MBB, I, DL, get(PPC::XXLOR), DestReg).
+ addReg(SrcReg).addReg(SrcReg, getKillRegState(KillSrc));
+ BuildMI(MBB, I, DL, get(PPC::XXLOR), DestReg + 1).
+ addReg(SrcReg + 1).addReg(SrcReg + 1, getKillRegState(KillSrc));
+ return;
+ }
else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg))
Opc = PPC::CROR;
else if (PPC::SPERCRegClass.contains(DestReg, SrcReg))
Opc = PPC::EVOR;
- else
+ else if ((PPC::ACCRCRegClass.contains(DestReg) ||
+ PPC::UACCRCRegClass.contains(DestReg)) &&
+ (PPC::ACCRCRegClass.contains(SrcReg) ||
+ PPC::UACCRCRegClass.contains(SrcReg))) {
+ // If primed, de-prime the source register, copy the individual registers
+ // and prime the destination if needed. The vector subregisters are
+ // vs[(u)acc * 4] - vs[(u)acc * 4 + 3]. If the copy is not a kill and the
+ // source is primed, we need to re-prime it after the copy as well.
+ PPCRegisterInfo::emitAccCopyInfo(MBB, DestReg, SrcReg);
+ bool DestPrimed = PPC::ACCRCRegClass.contains(DestReg);
+ bool SrcPrimed = PPC::ACCRCRegClass.contains(SrcReg);
+ MCRegister VSLSrcReg =
+ PPC::VSL0 + (SrcReg - (SrcPrimed ? PPC::ACC0 : PPC::UACC0)) * 4;
+ MCRegister VSLDestReg =
+ PPC::VSL0 + (DestReg - (DestPrimed ? PPC::ACC0 : PPC::UACC0)) * 4;
+ if (SrcPrimed)
+ BuildMI(MBB, I, DL, get(PPC::XXMFACC), SrcReg).addReg(SrcReg);
+ for (unsigned Idx = 0; Idx < 4; Idx++)
+ BuildMI(MBB, I, DL, get(PPC::XXLOR), VSLDestReg + Idx)
+ .addReg(VSLSrcReg + Idx)
+ .addReg(VSLSrcReg + Idx, getKillRegState(KillSrc));
+ if (DestPrimed)
+ BuildMI(MBB, I, DL, get(PPC::XXMTACC), DestReg).addReg(DestReg);
+ if (SrcPrimed && !KillSrc)
+ BuildMI(MBB, I, DL, get(PPC::XXMTACC), SrcReg).addReg(SrcReg);
+ return;
+ } else
llvm_unreachable("Impossible reg-to-reg copy");
const MCInstrDesc &MCID = get(Opc);
@@ -1364,7 +1840,7 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, MCID, DestReg).addReg(SrcReg, getKillRegState(KillSrc));
}
-static unsigned getSpillIndex(const TargetRegisterClass *RC) {
+unsigned PPCInstrInfo::getSpillIndex(const TargetRegisterClass *RC) const {
int OpcodeIndex = 0;
if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
@@ -1391,16 +1867,20 @@ static unsigned getSpillIndex(const TargetRegisterClass *RC) {
OpcodeIndex = SOK_VectorFloat8Spill;
} else if (PPC::VSSRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_VectorFloat4Spill;
- } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_VRSaveSpill;
- } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_QuadFloat8Spill;
- } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_QuadFloat4Spill;
- } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_QuadBitSpill;
} else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_SpillToVSR;
+ } else if (PPC::ACCRCRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.pairedVectorMemops() &&
+ "Register unexpected when paired memops are disabled.");
+ OpcodeIndex = SOK_AccumulatorSpill;
+ } else if (PPC::UACCRCRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.pairedVectorMemops() &&
+ "Register unexpected when paired memops are disabled.");
+ OpcodeIndex = SOK_UAccumulatorSpill;
+ } else if (PPC::VSRpRCRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.pairedVectorMemops() &&
+ "Register unexpected when paired memops are disabled.");
+ OpcodeIndex = SOK_PairedVecSpill;
} else {
llvm_unreachable("Unknown regclass!");
}
@@ -1437,9 +1917,6 @@ void PPCInstrInfo::StoreRegToStackSlot(
PPC::CRBITRCRegClass.hasSubClassEq(RC))
FuncInfo->setSpillsCR();
- if (PPC::VRSAVERCRegClass.hasSubClassEq(RC))
- FuncInfo->setSpillsVRSAVE();
-
if (isXFormMemOp(Opcode))
FuncInfo->setHasNonRISpills();
}
@@ -1495,9 +1972,6 @@ void PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
PPC::CRBITRCRegClass.hasSubClassEq(RC))
FuncInfo->setSpillsCR();
- if (PPC::VRSAVERCRegClass.hasSubClassEq(RC))
- FuncInfo->setSpillsVRSAVE();
-
if (isXFormMemOp(Opcode))
FuncInfo->setHasNonRISpills();
}
@@ -1667,6 +2141,17 @@ bool PPCInstrInfo::isPredicated(const MachineInstr &MI) const {
return false;
}
+bool PPCInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const {
+ // Set MFFS and MTFSF as scheduling boundaries to avoid unexpected code
+ // motion across them, since some FP operations may change the contents of
+ // the FPSCR.
+ // TODO: Model FPSCR in PPC instruction definitions and remove the workaround
+ if (MI.getOpcode() == PPC::MFFS || MI.getOpcode() == PPC::MTFSF)
+ return true;
+ return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
+}
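A sketch of the kind of code this boundary protects, under the assumption that fesetround lowers to an mtfsf-class instruction on PowerPC (as common libm implementations do); the function name is invented:

  #include <cfenv>

  double roundedAdd(double a, double b) {
    std::fesetround(FE_TOWARDZERO); // typically an MTFSF/MTFSB on PowerPC
    return a + b;                   // must not be scheduled above the MTFSF
  }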
+
bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI,
ArrayRef<MachineOperand> Pred) const {
unsigned OpC = MI.getOpcode();
@@ -1675,6 +2160,10 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI,
bool isPPC64 = Subtarget.isPPC64();
MI.setDesc(get(Pred[0].getImm() ? (isPPC64 ? PPC::BDNZLR8 : PPC::BDNZLR)
: (isPPC64 ? PPC::BDZLR8 : PPC::BDZLR)));
+ // Need to add implicit def and use operands for CTR.
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addReg(Pred[1].getReg(), RegState::Implicit)
+ .addReg(Pred[1].getReg(), RegState::ImplicitDefine);
} else if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
MI.setDesc(get(PPC::BCLR));
MachineInstrBuilder(*MI.getParent()->getParent(), MI).add(Pred[1]);
@@ -1694,6 +2183,10 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI,
bool isPPC64 = Subtarget.isPPC64();
MI.setDesc(get(Pred[0].getImm() ? (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ)
: (isPPC64 ? PPC::BDZ8 : PPC::BDZ)));
+ // Need to add implicit def and use operands for CTR.
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addReg(Pred[1].getReg(), RegState::Implicit)
+ .addReg(Pred[1].getReg(), RegState::ImplicitDefine);
} else if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
MI.RemoveOperand(0);
@@ -1734,19 +2227,24 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI,
MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8 : PPC::BCCTR8)
: (setLR ? PPC::BCCTRL : PPC::BCCTR)));
MachineInstrBuilder(*MI.getParent()->getParent(), MI).add(Pred[1]);
- return true;
} else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) {
MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8n : PPC::BCCTR8n)
: (setLR ? PPC::BCCTRLn : PPC::BCCTRn)));
MachineInstrBuilder(*MI.getParent()->getParent(), MI).add(Pred[1]);
- return true;
+ } else {
+ MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCCTRL8 : PPC::BCCCTR8)
+ : (setLR ? PPC::BCCCTRL : PPC::BCCCTR)));
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addImm(Pred[0].getImm())
+ .add(Pred[1]);
}
- MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCCTRL8 : PPC::BCCCTR8)
- : (setLR ? PPC::BCCCTRL : PPC::BCCCTR)));
- MachineInstrBuilder(*MI.getParent()->getParent(), MI)
- .addImm(Pred[0].getImm())
- .add(Pred[1]);
+ // Need to add implicit def and use operands for LR.
+ if (setLR)
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addReg(isPPC64 ? PPC::LR8 : PPC::LR, RegState::Implicit)
+ .addReg(isPPC64 ? PPC::LR8 : PPC::LR, RegState::ImplicitDefine);
+
return true;
}
@@ -1784,8 +2282,9 @@ bool PPCInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
return false;
}
-bool PPCInstrInfo::DefinesPredicate(MachineInstr &MI,
- std::vector<MachineOperand> &Pred) const {
+bool PPCInstrInfo::ClobbersPredicate(MachineInstr &MI,
+ std::vector<MachineOperand> &Pred,
+ bool SkipDead) const {
// Note: At the present time, the contents of Pred from this function is
// unused by IfConversion. This implementation follows ARM by pushing the
// CR-defining operand. Because the 'DZ' and 'DNZ' count as types of
@@ -2071,6 +2570,14 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (NewOpC == -1)
return false;
+ // This transformation should not be performed if `nsw` is missing and this
+ // is not an `equalityOnly` comparison, because on overflow the sub_lt and
+ // sub_gt bits in CRReg do not reflect the correct ordering. If `equalityOnly`
+ // is true, sub_eq in CRReg still reflects whether the compared values are
+ // equal, so the optimization remains valid.
+ if (!equalityOnly && (NewOpC == PPC::SUBF_rec || NewOpC == PPC::SUBF8_rec) &&
+ Sub && !Sub->getFlag(MachineInstr::NoSWrap))
+ return false;
+
// If we have SUB(r1, r2) and CMP(r2, r1), the condition code based on CMP
// needs to be updated to be based on SUB. Push the condition code
// operands to OperandsToUpdate. If it is safe to remove CmpInstr, the
@@ -2221,6 +2728,112 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
return true;
}
+bool PPCInstrInfo::getMemOperandsWithOffsetWidth(
+ const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
+ int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+ const TargetRegisterInfo *TRI) const {
+ const MachineOperand *BaseOp;
+ OffsetIsScalable = false;
+ if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI))
+ return false;
+ BaseOps.push_back(BaseOp);
+ return true;
+}
+
+static bool isLdStSafeToCluster(const MachineInstr &LdSt,
+ const TargetRegisterInfo *TRI) {
+ // If this is a volatile load/store, don't mess with it.
+ if (LdSt.hasOrderedMemoryRef() || LdSt.getNumExplicitOperands() != 3)
+ return false;
+
+ if (LdSt.getOperand(2).isFI())
+ return true;
+
+ assert(LdSt.getOperand(2).isReg() && "Expected a reg operand.");
+ // Can't cluster if the instruction modifies the base register or is an
+ // update-form load/store, e.g. ld r2, 3(r2).
+ if (LdSt.modifiesRegister(LdSt.getOperand(2).getReg(), TRI))
+ return false;
+
+ return true;
+}
+
+// Only cluster instruction pairs that have the same opcode and are
+// clusterable according to the PowerPC specification.
+static bool isClusterableLdStOpcPair(unsigned FirstOpc, unsigned SecondOpc,
+ const PPCSubtarget &Subtarget) {
+ switch (FirstOpc) {
+ default:
+ return false;
+ case PPC::STD:
+ case PPC::STFD:
+ case PPC::STXSD:
+ case PPC::DFSTOREf64:
+ return FirstOpc == SecondOpc;
+ // The PowerPC backend has the opcodes STW/STW8 for the "stw" instruction to
+ // handle 32-bit and 64-bit instruction selection. They are a clusterable
+ // pair even though they are different opcodes.
+ case PPC::STW:
+ case PPC::STW8:
+ return SecondOpc == PPC::STW || SecondOpc == PPC::STW8;
+ }
+}
+
+bool PPCInstrInfo::shouldClusterMemOps(
+ ArrayRef<const MachineOperand *> BaseOps1,
+ ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
+ unsigned NumBytes) const {
+
+ assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
+ const MachineOperand &BaseOp1 = *BaseOps1.front();
+ const MachineOperand &BaseOp2 = *BaseOps2.front();
+ assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
+ "Only base registers and frame indices are supported.");
+
+ // NumLoads is the number of loads that have been clustered so far. Don't
+ // cluster this memory op if at least two ops are already clustered.
+ if (NumLoads > 2)
+ return false;
+
+ // Cluster the loads/stores only when they have the same base register or
+ // frame index.
+ if ((BaseOp1.isReg() != BaseOp2.isReg()) ||
+ (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) ||
+ (BaseOp1.isFI() && BaseOp1.getIndex() != BaseOp2.getIndex()))
+ return false;
+
+ // Check if the load/store are clusterable according to the PowerPC
+ // specification.
+ const MachineInstr &FirstLdSt = *BaseOp1.getParent();
+ const MachineInstr &SecondLdSt = *BaseOp2.getParent();
+ unsigned FirstOpc = FirstLdSt.getOpcode();
+ unsigned SecondOpc = SecondLdSt.getOpcode();
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ // Cluster the loads/stores only when they have the same opcode and that
+ // opcode is clusterable according to the PowerPC specification.
+ if (!isClusterableLdStOpcPair(FirstOpc, SecondOpc, Subtarget))
+ return false;
+
+ // Can't cluster loads/stores that have an ordered or volatile memory
+ // reference.
+ if (!isLdStSafeToCluster(FirstLdSt, TRI) ||
+ !isLdStSafeToCluster(SecondLdSt, TRI))
+ return false;
+
+ int64_t Offset1 = 0, Offset2 = 0;
+ unsigned Width1 = 0, Width2 = 0;
+ const MachineOperand *Base1 = nullptr, *Base2 = nullptr;
+ if (!getMemOperandWithOffsetWidth(FirstLdSt, Base1, Offset1, Width1, TRI) ||
+ !getMemOperandWithOffsetWidth(SecondLdSt, Base2, Offset2, Width2, TRI) ||
+ Width1 != Width2)
+ return false;
+
+ assert(Base1 == &BaseOp1 && Base2 == &BaseOp2 &&
+ "getMemOperandWithOffsetWidth return incorrect base op");
+ // The caller should already have ordered FirstMemOp/SecondMemOp by offset.
+ assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
+ return Offset1 + Width1 == Offset2;
+}
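The net effect of the checks above, with illustrative values for a pair such as std r4, 0(r3) / std r5, 8(r3): same opcode, same base register, equal widths, and the second offset starting exactly where the first access ends.

  #include <cassert>

  int main() {
    long Offset1 = 0, Offset2 = 8;   // already ordered by the caller
    unsigned Width1 = 8, Width2 = 8; // both 8-byte stores (STD)
    assert(Width1 == Width2 && Offset1 <= Offset2);
    bool Cluster = (Offset1 + Width1 == Offset2); // true -> schedule adjacently
    return Cluster ? 0 : 1;
  }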
+
/// GetInstSize - Return the number of bytes of code the specified
/// instruction may be. This returns the maximum number of bytes.
///
@@ -2270,7 +2883,14 @@ PPCInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
{MO_PLT, "ppc-plt"},
{MO_PIC_FLAG, "ppc-pic"},
{MO_PCREL_FLAG, "ppc-pcrel"},
- {MO_GOT_FLAG, "ppc-got"}};
+ {MO_GOT_FLAG, "ppc-got"},
+ {MO_PCREL_OPT_FLAG, "ppc-opt-pcrel"},
+ {MO_TLSGD_FLAG, "ppc-tlsgd"},
+ {MO_TLSLD_FLAG, "ppc-tlsld"},
+ {MO_TPREL_FLAG, "ppc-tprel"},
+ {MO_GOT_TLSGD_PCREL_FLAG, "ppc-got-tlsgd-pcrel"},
+ {MO_GOT_TLSLD_PCREL_FLAG, "ppc-got-tlsld-pcrel"},
+ {MO_GOT_TPREL_PCREL_FLAG, "ppc-got-tprel-pcrel"}};
return makeArrayRef(TargetFlags);
}
@@ -2351,6 +2971,31 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
auto DL = MI.getDebugLoc();
switch (MI.getOpcode()) {
+ case PPC::BUILD_UACC: {
+ MCRegister ACC = MI.getOperand(0).getReg();
+ MCRegister UACC = MI.getOperand(1).getReg();
+ if (ACC - PPC::ACC0 != UACC - PPC::UACC0) {
+ MCRegister SrcVSR = PPC::VSL0 + (UACC - PPC::UACC0) * 4;
+ MCRegister DstVSR = PPC::VSL0 + (ACC - PPC::ACC0) * 4;
+ // FIXME: This can easily be improved to look up to the top of the MBB
+ // to see if the inputs are XXLOR's. If they are and SrcReg is killed,
+ // we can just re-target any such XXLOR's to DstVSR + offset.
+ for (int VecNo = 0; VecNo < 4; VecNo++)
+ BuildMI(MBB, MI, DL, get(PPC::XXLOR), DstVSR + VecNo)
+ .addReg(SrcVSR + VecNo)
+ .addReg(SrcVSR + VecNo);
+ }
+ // BUILD_UACC is expanded to 4 copies of the underlying VSX registers.
+ // So after building the 4 copies, we can replace the BUILD_UACC instruction
+ // with a NOP.
+ LLVM_FALLTHROUGH;
+ }
+ case PPC::KILL_PAIR: {
+ MI.setDesc(get(PPC::UNENCODED_NOP));
+ MI.RemoveOperand(1);
+ MI.RemoveOperand(0);
+ return true;
+ }
case TargetOpcode::LOAD_STACK_GUARD: {
assert(Subtarget.isTargetLinux() &&
"Only Linux target is expected to contain LOAD_STACK_GUARD");
@@ -2642,7 +3287,10 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI(
}
unsigned PPCInstrInfo::getSpillTarget() const {
- return Subtarget.hasP9Vector() ? 1 : 0;
+ // With P10, we may need to spill paired vector registers or accumulator
+ // registers. MMA implies paired vectors, so we can just check that.
+ bool IsP10Variant = Subtarget.isISA3_1() || Subtarget.pairedVectorMemops();
+ return IsP10Variant ? 2 : Subtarget.hasP9Vector() ? 1 : 0;
}
const unsigned *PPCInstrInfo::getStoreOpcodesForSpillArray() const {
@@ -3033,6 +3681,143 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
return false;
}
+bool PPCInstrInfo::combineRLWINM(MachineInstr &MI,
+ MachineInstr **ToErase) const {
+ MachineRegisterInfo *MRI = &MI.getParent()->getParent()->getRegInfo();
+ unsigned FoldingReg = MI.getOperand(1).getReg();
+ if (!Register::isVirtualRegister(FoldingReg))
+ return false;
+ MachineInstr *SrcMI = MRI->getVRegDef(FoldingReg);
+ if (SrcMI->getOpcode() != PPC::RLWINM &&
+ SrcMI->getOpcode() != PPC::RLWINM_rec &&
+ SrcMI->getOpcode() != PPC::RLWINM8 &&
+ SrcMI->getOpcode() != PPC::RLWINM8_rec)
+ return false;
+ assert((MI.getOperand(2).isImm() && MI.getOperand(3).isImm() &&
+ MI.getOperand(4).isImm() && SrcMI->getOperand(2).isImm() &&
+ SrcMI->getOperand(3).isImm() && SrcMI->getOperand(4).isImm()) &&
+ "Invalid PPC::RLWINM Instruction!");
+ uint64_t SHSrc = SrcMI->getOperand(2).getImm();
+ uint64_t SHMI = MI.getOperand(2).getImm();
+ uint64_t MBSrc = SrcMI->getOperand(3).getImm();
+ uint64_t MBMI = MI.getOperand(3).getImm();
+ uint64_t MESrc = SrcMI->getOperand(4).getImm();
+ uint64_t MEMI = MI.getOperand(4).getImm();
+
+ assert((MEMI < 32 && MESrc < 32 && MBMI < 32 && MBSrc < 32) &&
+ "Invalid PPC::RLWINM Instruction!");
+ // If MBMI is bigger than MEMI, we generally cannot get a run of ones.
+ // RotatedSrcMask non-wrap:
+ // 0........31|32........63
+ // RotatedSrcMask: B---E B---E
+ // MaskMI: -----------|--E B------
+ // Result: ----- --- (Bad candidate)
+ //
+ // RotatedSrcMask wrap:
+ // 0........31|32........63
+ // RotatedSrcMask: --E B----|--E B----
+ // MaskMI: -----------|--E B------
+ // Result: --- -----|--- ----- (Bad candidate)
+ //
+ // One special case is when RotatedSrcMask is a full set mask.
+ // RotatedSrcMask full:
+ // 0........31|32........63
+ // RotatedSrcMask: ------EB---|-------EB---
+ // MaskMI: -----------|--E B------
+ // Result: -----------|--- ------- (Good candidate)
+
+ // Mark special case.
+ bool SrcMaskFull = (MBSrc - MESrc == 1) || (MBSrc == 0 && MESrc == 31);
+
+ // For other MBMI > MEMI cases, just return.
+ if ((MBMI > MEMI) && !SrcMaskFull)
+ return false;
+
+ // Handle MBMI <= MEMI cases.
+ APInt MaskMI = APInt::getBitsSetWithWrap(32, 32 - MEMI - 1, 32 - MBMI);
+ // In MI, we only need the low 32 bits of SrcMI, so only consider the low 32
+ // bits of SrcMI's mask. Note that in APInt the lowest bit is at index 0,
+ // while in the PowerPC ISA the lowest bit is at index 63.
+ APInt MaskSrc = APInt::getBitsSetWithWrap(32, 32 - MESrc - 1, 32 - MBSrc);
+
+ APInt RotatedSrcMask = MaskSrc.rotl(SHMI);
+ APInt FinalMask = RotatedSrcMask & MaskMI;
+ uint32_t NewMB, NewME;
+ bool Simplified = false;
+
+ // If final mask is 0, MI result should be 0 too.
+ if (FinalMask.isNullValue()) {
+ bool Is64Bit =
+ (MI.getOpcode() == PPC::RLWINM8 || MI.getOpcode() == PPC::RLWINM8_rec);
+ Simplified = true;
+ LLVM_DEBUG(dbgs() << "Replace Instr: ");
+ LLVM_DEBUG(MI.dump());
+
+ if (MI.getOpcode() == PPC::RLWINM || MI.getOpcode() == PPC::RLWINM8) {
+ // Replace MI with "LI 0"
+ MI.RemoveOperand(4);
+ MI.RemoveOperand(3);
+ MI.RemoveOperand(2);
+ MI.getOperand(1).ChangeToImmediate(0);
+ MI.setDesc(get(Is64Bit ? PPC::LI8 : PPC::LI));
+ } else {
+ // Replace MI with "ANDI_rec reg, 0"
+ MI.RemoveOperand(4);
+ MI.RemoveOperand(3);
+ MI.getOperand(2).setImm(0);
+ MI.setDesc(get(Is64Bit ? PPC::ANDI8_rec : PPC::ANDI_rec));
+ MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg());
+ if (SrcMI->getOperand(1).isKill()) {
+ MI.getOperand(1).setIsKill(true);
+ SrcMI->getOperand(1).setIsKill(false);
+ } else
+ // About to replace MI.getOperand(1), clear its kill flag.
+ MI.getOperand(1).setIsKill(false);
+ }
+
+ LLVM_DEBUG(dbgs() << "With: ");
+ LLVM_DEBUG(MI.dump());
+
+ } else if ((isRunOfOnes((unsigned)(FinalMask.getZExtValue()), NewMB, NewME) &&
+ NewMB <= NewME) ||
+ SrcMaskFull) {
+ // Here we only handle the MBMI <= MEMI case, so NewMB must be no bigger
+ // than NewME. Otherwise we would get a 64-bit value after folding, while
+ // MI returns a 32-bit value.
+ Simplified = true;
+ LLVM_DEBUG(dbgs() << "Converting Instr: ");
+ LLVM_DEBUG(MI.dump());
+
+ uint16_t NewSH = (SHSrc + SHMI) % 32;
+ MI.getOperand(2).setImm(NewSH);
+ // If SrcMI mask is full, no need to update MBMI and MEMI.
+ if (!SrcMaskFull) {
+ MI.getOperand(3).setImm(NewMB);
+ MI.getOperand(4).setImm(NewME);
+ }
+ MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg());
+ if (SrcMI->getOperand(1).isKill()) {
+ MI.getOperand(1).setIsKill(true);
+ SrcMI->getOperand(1).setIsKill(false);
+ } else
+ // About to replace MI.getOperand(1), clear its kill flag.
+ MI.getOperand(1).setIsKill(false);
+
+ LLVM_DEBUG(dbgs() << "To: ");
+ LLVM_DEBUG(MI.dump());
+ }
+ if (Simplified & MRI->use_nodbg_empty(FoldingReg) &&
+ !SrcMI->hasImplicitDef()) {
+ // If FoldingReg has no non-debug use and SrcMI has no implicit def (i.e. it
+ // is not a record form such as RLWINM_rec or RLWINM8_rec), it's safe to
+ // delete its def SrcMI. Otherwise keep it.
+ *ToErase = SrcMI;
+ LLVM_DEBUG(dbgs() << "Delete dead instruction: ");
+ LLVM_DEBUG(SrcMI->dump());
+ }
+ return Simplified;
+}
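The mask folding above can be recomputed standalone; a sketch that mirrors the APInt arithmetic with plain uint32_t (the example RLWINM pair is made up):

  #include <cstdint>
  #include <cstdio>

  // Ones in PowerPC bits MB..ME (bit 0 is the MSB), wrapping when MB > ME.
  static uint32_t maskMBME(unsigned MB, unsigned ME) {
    uint32_t Lo = 0xFFFFFFFFu >> MB;        // PPC bits MB..31
    uint32_t Hi = 0xFFFFFFFFu << (31 - ME); // PPC bits 0..ME
    return MB <= ME ? (Lo & Hi) : (Lo | Hi);
  }

  static uint32_t rotl32(uint32_t V, unsigned N) {
    N &= 31;
    return N ? (V << N) | (V >> (32 - N)) : V;
  }

  int main() {
    // SrcMI: RLWINM rS, rA, 4, 0, 27    MI: RLWINM rD, rS, 4, 0, 23
    unsigned SHSrc = 4, MBSrc = 0, MESrc = 27, SHMI = 4, MBMI = 0, MEMI = 23;
    uint32_t Final = rotl32(maskMBME(MBSrc, MESrc), SHMI) & maskMBME(MBMI, MEMI);
    // Final == 0xFFFFFF00 is a run of ones (PPC bits 0..23), so the pair folds
    // to a single RLWINM rD, rA, (SHSrc + SHMI) % 32 = 8, 0, 23.
    printf("final mask = 0x%08X, new SH = %u\n", Final, (SHSrc + SHMI) % 32);
    return 0;
  }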
+
bool PPCInstrInfo::instrHasImmForm(unsigned Opc, bool IsVFReg,
ImmInstrInfo &III, bool PostRA) const {
// The vast majority of the instructions would need their operand 2 replaced
@@ -3754,6 +4539,20 @@ bool PPCInstrInfo::simplifyToLI(MachineInstr &MI, MachineInstr &DefMI,
}
return false;
}
+ case PPC::SUBFIC:
+ case PPC::SUBFIC8: {
+ // Only transform this if the CARRY implicit operand is dead.
+ if (MI.getNumOperands() > 3 && !MI.getOperand(3).isDead())
+ return false;
+ int64_t Minuend = MI.getOperand(2).getImm();
+ if (isInt<16>(Minuend - SExtImm)) {
+ ReplaceWithLI = true;
+ Is64BitLI = Opc == PPC::SUBFIC8;
+ NewImm = Minuend - SExtImm;
+ break;
+ }
+ return false;
+ }
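A concrete instance of the new SUBFIC case, with made-up registers and values: if r4 was defined by LI 40, then subfic r3, r4, 100 computes 100 - 40 and can be rewritten as li r3, 60, provided the implicit carry output is dead and the difference fits a signed 16-bit immediate:

  #include <cassert>
  #include <cstdint>

  int main() {
    int64_t Minuend = 100, SExtImm = 40; // SUBFIC immediate, known value of r4
    int64_t NewImm = Minuend - SExtImm;  // 60, the immediate for the new LI
    assert(NewImm >= INT16_MIN && NewImm <= INT16_MAX); // isInt<16>(NewImm)
    return static_cast<int>(NewImm);
  }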
case PPC::RLDICL:
case PPC::RLDICL_rec:
case PPC::RLDICL_32:
@@ -4640,13 +5439,15 @@ MachineInstr *PPCInstrInfo::findLoopInstr(
bool PPCInstrInfo::getMemOperandWithOffsetWidth(
const MachineInstr &LdSt, const MachineOperand *&BaseReg, int64_t &Offset,
unsigned &Width, const TargetRegisterInfo *TRI) const {
- if (!LdSt.mayLoadOrStore())
+ if (!LdSt.mayLoadOrStore() || LdSt.getNumExplicitOperands() != 3)
return false;
// Handle only loads/stores with base register followed by immediate offset.
- if (LdSt.getNumExplicitOperands() != 3)
+ if (!LdSt.getOperand(1).isImm() ||
+ (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()))
return false;
- if (!LdSt.getOperand(1).isImm() || !LdSt.getOperand(2).isReg())
+ if (!LdSt.getOperand(1).isImm() ||
+ (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()))
return false;
if (!LdSt.hasOneMemOperand())
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.h
index 556c95fef3bd..c6ef1742b722 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -122,61 +122,73 @@ enum SpillOpcodeKey {
SOK_VSXVectorSpill,
SOK_VectorFloat8Spill,
SOK_VectorFloat4Spill,
- SOK_VRSaveSpill,
- SOK_QuadFloat8Spill,
- SOK_QuadFloat4Spill,
- SOK_QuadBitSpill,
SOK_SpillToVSR,
+ SOK_PairedVecSpill,
+ SOK_AccumulatorSpill,
+ SOK_UAccumulatorSpill,
SOK_SPESpill,
SOK_LastOpcodeSpill // This must be last on the enum.
};
// Define list of load and store spill opcodes.
+#define NoInstr PPC::INSTRUCTION_LIST_END
#define Pwr8LoadOpcodes \
{ \
PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \
PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXVD2X, PPC::LXSDX, PPC::LXSSPX, \
- PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, PPC::QVLFDXb, \
- PPC::SPILLTOVSR_LD, PPC::EVLDD \
+ PPC::SPILLTOVSR_LD, NoInstr, NoInstr, NoInstr, PPC::EVLDD \
}
#define Pwr9LoadOpcodes \
{ \
PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \
PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, \
- PPC::DFLOADf32, PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, \
- PPC::QVLFDXb, PPC::SPILLTOVSR_LD \
+ PPC::DFLOADf32, PPC::SPILLTOVSR_LD, NoInstr, NoInstr, NoInstr, NoInstr \
+ }
+
+#define Pwr10LoadOpcodes \
+ { \
+ PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \
+ PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, \
+ PPC::DFLOADf32, PPC::SPILLTOVSR_LD, PPC::LXVP, PPC::RESTORE_ACC, \
+ PPC::RESTORE_UACC, NoInstr \
}
#define Pwr8StoreOpcodes \
{ \
PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \
- PPC::STVX, PPC::STXVD2X, PPC::STXSDX, PPC::STXSSPX, PPC::SPILL_VRSAVE, \
- PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb, PPC::SPILLTOVSR_ST, \
- PPC::EVSTDD \
+ PPC::STVX, PPC::STXVD2X, PPC::STXSDX, PPC::STXSSPX, \
+ PPC::SPILLTOVSR_ST, NoInstr, NoInstr, NoInstr, PPC::EVSTDD \
}
#define Pwr9StoreOpcodes \
{ \
PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \
PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32, \
- PPC::SPILL_VRSAVE, PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb, \
- PPC::SPILLTOVSR_ST \
+ PPC::SPILLTOVSR_ST, NoInstr, NoInstr, NoInstr, NoInstr \
+ }
+
+#define Pwr10StoreOpcodes \
+ { \
+ PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \
+ PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32, \
+ PPC::SPILLTOVSR_ST, PPC::STXVP, PPC::SPILL_ACC, PPC::SPILL_UACC, \
+ NoInstr \
}
// Initialize arrays for load and store spill opcodes on supported subtargets.
#define StoreOpcodesForSpill \
- { Pwr8StoreOpcodes, Pwr9StoreOpcodes }
+ { Pwr8StoreOpcodes, Pwr9StoreOpcodes, Pwr10StoreOpcodes }
#define LoadOpcodesForSpill \
- { Pwr8LoadOpcodes, Pwr9LoadOpcodes }
+ { Pwr8LoadOpcodes, Pwr9LoadOpcodes, Pwr10LoadOpcodes }
class PPCSubtarget;
class PPCInstrInfo : public PPCGenInstrInfo {
PPCSubtarget &Subtarget;
const PPCRegisterInfo RI;
- const unsigned StoreSpillOpcodesArray[2][SOK_LastOpcodeSpill] =
+ const unsigned StoreSpillOpcodesArray[3][SOK_LastOpcodeSpill] =
StoreOpcodesForSpill;
- const unsigned LoadSpillOpcodesArray[2][SOK_LastOpcodeSpill] =
+ const unsigned LoadSpillOpcodesArray[3][SOK_LastOpcodeSpill] =
LoadOpcodesForSpill;
void StoreRegToStackSlot(MachineFunction &MF, unsigned SrcReg, bool isKill,
@@ -234,11 +246,17 @@ class PPCInstrInfo : public PPCGenInstrInfo {
unsigned getSpillTarget() const;
const unsigned *getStoreOpcodesForSpillArray() const;
const unsigned *getLoadOpcodesForSpillArray() const;
+ unsigned getSpillIndex(const TargetRegisterClass *RC) const;
int16_t getFMAOpIdxInfo(unsigned Opcode) const;
void reassociateFMA(MachineInstr &Root, MachineCombinerPattern Pattern,
SmallVectorImpl<MachineInstr *> &InsInstrs,
SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const;
+ bool isLoadFromConstantPool(MachineInstr *I) const;
+ Register
+ generateLoadForNewConst(unsigned Idx, MachineInstr *MI, Type *Ty,
+ SmallVectorImpl<MachineInstr *> &InsInstrs) const;
+ const Constant *getConstantFromConstantPool(MachineInstr *I) const;
virtual void anchor();
protected:
@@ -273,10 +291,10 @@ public:
}
static bool isSameClassPhysRegCopy(unsigned Opcode) {
- unsigned CopyOpcodes[] =
- { PPC::OR, PPC::OR8, PPC::FMR, PPC::VOR, PPC::XXLOR, PPC::XXLORf,
- PPC::XSCPSGNDP, PPC::MCRF, PPC::QVFMR, PPC::QVFMRs, PPC::QVFMRb,
- PPC::CROR, PPC::EVOR, -1U };
+ unsigned CopyOpcodes[] = {PPC::OR, PPC::OR8, PPC::FMR,
+ PPC::VOR, PPC::XXLOR, PPC::XXLORf,
+ PPC::XSCPSGNDP, PPC::MCRF, PPC::CROR,
+ PPC::EVOR, -1U};
for (int i = 0; CopyOpcodes[i] != -1U; i++)
if (Opcode == CopyOpcodes[i])
return true;
@@ -330,14 +348,29 @@ public:
/// chain ending in \p Root. All potential patterns are output in the \p
/// P array.
bool getFMAPatterns(MachineInstr &Root,
- SmallVectorImpl<MachineCombinerPattern> &P) const;
+ SmallVectorImpl<MachineCombinerPattern> &P,
+ bool DoRegPressureReduce) const;
/// Return true when there is potentially a faster code sequence
/// for an instruction chain ending in <Root>. All potential patterns are
/// output in the <Pattern> array.
- bool getMachineCombinerPatterns(
- MachineInstr &Root,
- SmallVectorImpl<MachineCombinerPattern> &P) const override;
+ bool getMachineCombinerPatterns(MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern> &P,
+ bool DoRegPressureReduce) const override;
+
+ /// On PowerPC, we leverage machine combiner pass to reduce register pressure
+ /// when the register pressure is high for one BB.
+ /// Return true if register pressure for \p MBB is high and ABI is supported
+ /// to reduce register pressure. Otherwise return false.
+ bool
+ shouldReduceRegisterPressure(MachineBasicBlock *MBB,
+ RegisterClassInfo *RegClassInfo) const override;
+
+ /// Fixup the placeholders we put in genAlternativeCodeSequence() for
+ /// MachineCombiner.
+ void
+ finalizeInsInstrs(MachineInstr &Root, MachineCombinerPattern &P,
+ SmallVectorImpl<MachineInstr *> &InsInstrs) const override;
bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
@@ -470,14 +503,18 @@ public:
// Predication support.
bool isPredicated(const MachineInstr &MI) const override;
+ bool isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const override;
+
bool PredicateInstruction(MachineInstr &MI,
ArrayRef<MachineOperand> Pred) const override;
bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
ArrayRef<MachineOperand> Pred2) const override;
- bool DefinesPredicate(MachineInstr &MI,
- std::vector<MachineOperand> &Pred) const override;
+ bool ClobbersPredicate(MachineInstr &MI, std::vector<MachineOperand> &Pred,
+ bool SkipDead) const override;
// Comparison optimization.
@@ -497,6 +534,20 @@ public:
int64_t &Offset, unsigned &Width,
const TargetRegisterInfo *TRI) const;
+ /// Get the base operand and byte offset of an instruction that reads/writes
+ /// memory.
+ bool getMemOperandsWithOffsetWidth(
+ const MachineInstr &LdSt,
+ SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset,
+ bool &OffsetIsScalable, unsigned &Width,
+ const TargetRegisterInfo *TRI) const override;
+
+ /// Returns true if the two given memory operations should be scheduled
+ /// adjacent.
+ bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+ ArrayRef<const MachineOperand *> BaseOps2,
+ unsigned NumLoads, unsigned NumBytes) const override;
+
/// Return true if two MIs access different memory addresses and false
/// otherwise
bool
@@ -554,6 +605,7 @@ public:
bool convertToImmediateForm(MachineInstr &MI,
MachineInstr **KilledDef = nullptr) const;
bool foldFrameOffset(MachineInstr &MI) const;
+ bool combineRLWINM(MachineInstr &MI, MachineInstr **ToErase = nullptr) const;
bool isADDIInstrEligibleForFolding(MachineInstr &ADDIMI, int64_t &Imm) const;
bool isADDInstrEligibleForFolding(MachineInstr &ADDMI) const;
bool isImmInstrEligibleForFolding(MachineInstr &MI, unsigned &BaseReg,
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index fedbf592af39..724af23542d7 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -74,6 +74,9 @@ def SDT_PPCcondbr : SDTypeProfile<0, 3, [
SDTCisVT<0, i32>, SDTCisVT<2, OtherVT>
]>;
+def SDT_PPCFtsqrt : SDTypeProfile<1, 1, [
+ SDTCisVT<0, i32>]>;
+
def SDT_PPClbrx : SDTypeProfile<1, 2, [
SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>
]>;
@@ -124,6 +127,8 @@ def SDT_PPCFPMinMax : SDTypeProfile<1, 2, [
def PPCfre : SDNode<"PPCISD::FRE", SDTFPUnaryOp, []>;
def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>;
+def PPCfsqrt : SDNode<"PPCISD::FSQRT", SDTFPUnaryOp, []>;
+def PPCftsqrt : SDNode<"PPCISD::FTSQRT", SDT_PPCFtsqrt,[]>;
def PPCfcfid : SDNode<"PPCISD::FCFID", SDTFPUnaryOp, []>;
def PPCfcfidu : SDNode<"PPCISD::FCFIDU", SDTFPUnaryOp, []>;
@@ -134,6 +139,28 @@ def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>;
def PPCfctiduz: SDNode<"PPCISD::FCTIDUZ",SDTFPUnaryOp, []>;
def PPCfctiwuz: SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>;
+def PPCstrict_fcfid : SDNode<"PPCISD::STRICT_FCFID",
+ SDTFPUnaryOp, [SDNPHasChain]>;
+def PPCstrict_fcfidu : SDNode<"PPCISD::STRICT_FCFIDU",
+ SDTFPUnaryOp, [SDNPHasChain]>;
+def PPCstrict_fcfids : SDNode<"PPCISD::STRICT_FCFIDS",
+ SDTFPRoundOp, [SDNPHasChain]>;
+def PPCstrict_fcfidus : SDNode<"PPCISD::STRICT_FCFIDUS",
+ SDTFPRoundOp, [SDNPHasChain]>;
+
+def PPCany_fcfid : PatFrags<(ops node:$op),
+ [(PPCfcfid node:$op),
+ (PPCstrict_fcfid node:$op)]>;
+def PPCany_fcfidu : PatFrags<(ops node:$op),
+ [(PPCfcfidu node:$op),
+ (PPCstrict_fcfidu node:$op)]>;
+def PPCany_fcfids : PatFrags<(ops node:$op),
+ [(PPCfcfids node:$op),
+ (PPCstrict_fcfids node:$op)]>;
+def PPCany_fcfidus : PatFrags<(ops node:$op),
+ [(PPCfcfidus node:$op),
+ (PPCstrict_fcfidus node:$op)]>;
+
def PPCcv_fp_to_uint_in_vsr:
SDNode<"PPCISD::FP_TO_UINT_IN_VSR", SDT_PPCcv_fp_to_int, []>;
def PPCcv_fp_to_sint_in_vsr:
@@ -160,7 +187,12 @@ def PPCmffs : SDNode<"PPCISD::MFFS",
// Perform FADD in round-to-zero mode.
def PPCfaddrtz: SDNode<"PPCISD::FADDRTZ", SDTFPBinOp, []>;
+def PPCstrict_faddrtz: SDNode<"PPCISD::STRICT_FADDRTZ", SDTFPBinOp,
+ [SDNPHasChain]>;
+def PPCany_faddrtz: PatFrags<(ops node:$lhs, node:$rhs),
+ [(PPCfaddrtz node:$lhs, node:$rhs),
+ (PPCstrict_faddrtz node:$lhs, node:$rhs)]>;
def PPCfsel : SDNode<"PPCISD::FSEL",
// Type constraint for fsel.
@@ -195,6 +227,7 @@ def PPCaddiTlsldLAddr : SDNode<"PPCISD::ADDI_TLSLD_L_ADDR",
SDTCisSameAs<0, 3>, SDTCisInt<0> ]>>;
def PPCaddisDtprelHA : SDNode<"PPCISD::ADDIS_DTPREL_HA", SDTIntBinOp>;
def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>;
+def PPCpaddiDtprel : SDNode<"PPCISD::PADDI_DTPREL", SDTIntBinOp>;
def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>;
def PPCxxsplt : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>;
@@ -203,16 +236,6 @@ def PPCvecinsert : SDNode<"PPCISD::VECINSERT", SDT_PPCVecInsert, []>;
def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>;
def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>;
-def PPCqvfperm : SDNode<"PPCISD::QVFPERM", SDT_PPCqvfperm, []>;
-def PPCqvgpci : SDNode<"PPCISD::QVGPCI", SDT_PPCqvgpci, []>;
-def PPCqvaligni : SDNode<"PPCISD::QVALIGNI", SDT_PPCqvaligni, []>;
-def PPCqvesplati : SDNode<"PPCISD::QVESPLATI", SDT_PPCqvesplati, []>;
-
-def PPCqbflt : SDNode<"PPCISD::QBFLT", SDT_PPCqbflt, []>;
-
-def PPCqvlfsb : SDNode<"PPCISD::QVLFSb", SDT_PPCqvlfsb,
- [SDNPHasChain, SDNPMayLoad]>;
-
def PPCcmpb : SDNode<"PPCISD::CMPB", SDTIntBinOp, []>;
// These nodes represent the 32-bit PPC shifts that operate on 6-bit shift
@@ -225,6 +248,28 @@ def PPCfnmsub : SDNode<"PPCISD::FNMSUB" , SDTFPTernaryOp>;
def PPCextswsli : SDNode<"PPCISD::EXTSWSLI" , SDT_PPCextswsli>;
+def PPCstrict_fctidz : SDNode<"PPCISD::STRICT_FCTIDZ",
+ SDTFPUnaryOp, [SDNPHasChain]>;
+def PPCstrict_fctiwz : SDNode<"PPCISD::STRICT_FCTIWZ",
+ SDTFPUnaryOp, [SDNPHasChain]>;
+def PPCstrict_fctiduz : SDNode<"PPCISD::STRICT_FCTIDUZ",
+ SDTFPUnaryOp, [SDNPHasChain]>;
+def PPCstrict_fctiwuz : SDNode<"PPCISD::STRICT_FCTIWUZ",
+ SDTFPUnaryOp, [SDNPHasChain]>;
+
+def PPCany_fctidz : PatFrags<(ops node:$op),
+ [(PPCstrict_fctidz node:$op),
+ (PPCfctidz node:$op)]>;
+def PPCany_fctiwz : PatFrags<(ops node:$op),
+ [(PPCstrict_fctiwz node:$op),
+ (PPCfctiwz node:$op)]>;
+def PPCany_fctiduz : PatFrags<(ops node:$op),
+ [(PPCstrict_fctiduz node:$op),
+ (PPCfctiduz node:$op)]>;
+def PPCany_fctiwuz : PatFrags<(ops node:$op),
+ [(PPCstrict_fctiwuz node:$op),
+ (PPCfctiwuz node:$op)]>;
+
// Move 2 i64 values into a VSX register
def PPCbuild_fp128: SDNode<"PPCISD::BUILD_FP128",
SDTypeProfile<1, 2,
@@ -295,7 +340,7 @@ def PPCrfebb : SDNode<"PPCISD::RFEBB", SDT_PPCsc,
[SDNPHasChain, SDNPSideEffect]>;
def PPCvcmp : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>;
-def PPCvcmp_o : SDNode<"PPCISD::VCMPo", SDT_PPCvcmp, [SDNPOutGlue]>;
+def PPCvcmp_rec : SDNode<"PPCISD::VCMP_rec", SDT_PPCvcmp, [SDNPOutGlue]>;
def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr,
[SDNPHasChain, SDNPOptInGlue]>;
@@ -327,6 +372,10 @@ def PPCprobedalloca : SDNode<"PPCISD::PROBED_ALLOCA", SDTDynOp, [SDNPHasChain]>;
// PC Relative Specific Nodes
def PPCmatpcreladdr : SDNode<"PPCISD::MAT_PCREL_ADDR", SDTIntUnaryOp, []>;
+def PPCtlsdynamatpcreladdr : SDNode<"PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR",
+ SDTIntUnaryOp, []>;
+def PPCtlslocalexecmataddr : SDNode<"PPCISD::TLS_LOCAL_EXEC_MAT_ADDR",
+ SDTIntUnaryOp, []>;
//===----------------------------------------------------------------------===//
// PowerPC specific transformation functions and pattern fragments.
@@ -446,37 +495,41 @@ def imm64ZExt32 : Operand<i64>, ImmLeaf<i64, [{
return isUInt<32>(Imm);
}]>;
-// Some r+i load/store instructions (such as LD, STD, LDU, etc.) that require
+// This is a somewhat weaker condition than actually checking for 4-byte
+// alignment. It is simply checking that the displacement can be represented
+// as an immediate that is a multiple of 4 (i.e. the requirements for DS-Form
+// instructions).
+// But some r+i load/store instructions (such as LD, STD, LDU, etc.) that require
// restricted memrix (4-aligned) constants are alignment sensitive. If these
// offsets are hidden behind TOC entries then the values of the lower-order
// bits cannot be checked directly. As a result, we need to also incorporate
// an alignment check into the relevant patterns.
-def aligned4load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return cast<LoadSDNode>(N)->getAlignment() >= 4;
+def DSFormLoad : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return isOffsetMultipleOf(N, 4) || cast<LoadSDNode>(N)->getAlignment() >= 4;
}]>;
-def aligned4store : PatFrag<(ops node:$val, node:$ptr),
+def DSFormStore : PatFrag<(ops node:$val, node:$ptr),
(store node:$val, node:$ptr), [{
- return cast<StoreSDNode>(N)->getAlignment() >= 4;
+ return isOffsetMultipleOf(N, 4) || cast<StoreSDNode>(N)->getAlignment() >= 4;
}]>;
-def aligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{
- return cast<LoadSDNode>(N)->getAlignment() >= 4;
+def DSFormSextLoadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{
+ return isOffsetMultipleOf(N, 4) || cast<LoadSDNode>(N)->getAlignment() >= 4;
}]>;
-def aligned4pre_store : PatFrag<
+def DSFormPreStore : PatFrag<
(ops node:$val, node:$base, node:$offset),
(pre_store node:$val, node:$base, node:$offset), [{
- return cast<StoreSDNode>(N)->getAlignment() >= 4;
+ return isOffsetMultipleOf(N, 4) || cast<StoreSDNode>(N)->getAlignment() >= 4;
}]>;
-def unaligned4load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return cast<LoadSDNode>(N)->getAlignment() < 4;
+def NonDSFormLoad : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() < 4 && !isOffsetMultipleOf(N, 4);
}]>;
-def unaligned4store : PatFrag<(ops node:$val, node:$ptr),
+def NonDSFormStore : PatFrag<(ops node:$val, node:$ptr),
(store node:$val, node:$ptr), [{
- return cast<StoreSDNode>(N)->getAlignment() < 4;
+ return cast<StoreSDNode>(N)->getAlignment() < 4 && !isOffsetMultipleOf(N, 4);
}]>;
-def unaligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{
- return cast<LoadSDNode>(N)->getAlignment() < 4;
+def NonDSFormSextLoadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() < 4 && !isOffsetMultipleOf(N, 4);
}]>;
// This is a somewhat weaker condition than actually checking for 16-byte
@@ -617,6 +670,7 @@ def PPCU1ImmAsmOperand : AsmOperandClass {
def u1imm : Operand<i32> {
let PrintMethod = "printU1ImmOperand";
let ParserMatchClass = PPCU1ImmAsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
}
def PPCU2ImmAsmOperand : AsmOperandClass {
@@ -626,6 +680,7 @@ def PPCU2ImmAsmOperand : AsmOperandClass {
def u2imm : Operand<i32> {
let PrintMethod = "printU2ImmOperand";
let ParserMatchClass = PPCU2ImmAsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
}
def PPCATBitsAsHintAsmOperand : AsmOperandClass {
@@ -635,6 +690,7 @@ def PPCATBitsAsHintAsmOperand : AsmOperandClass {
def atimm : Operand<i32> {
let PrintMethod = "printATBitsAsHint";
let ParserMatchClass = PPCATBitsAsHintAsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
}
def PPCU3ImmAsmOperand : AsmOperandClass {
@@ -644,6 +700,7 @@ def PPCU3ImmAsmOperand : AsmOperandClass {
def u3imm : Operand<i32> {
let PrintMethod = "printU3ImmOperand";
let ParserMatchClass = PPCU3ImmAsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
}
def PPCU4ImmAsmOperand : AsmOperandClass {
@@ -653,6 +710,7 @@ def PPCU4ImmAsmOperand : AsmOperandClass {
def u4imm : Operand<i32> {
let PrintMethod = "printU4ImmOperand";
let ParserMatchClass = PPCU4ImmAsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
}
def PPCS5ImmAsmOperand : AsmOperandClass {
let Name = "S5Imm"; let PredicateMethod = "isS5Imm";
@@ -662,6 +720,7 @@ def s5imm : Operand<i32> {
let PrintMethod = "printS5ImmOperand";
let ParserMatchClass = PPCS5ImmAsmOperand;
let DecoderMethod = "decodeSImmOperand<5>";
+ let OperandType = "OPERAND_IMMEDIATE";
}
def PPCU5ImmAsmOperand : AsmOperandClass {
let Name = "U5Imm"; let PredicateMethod = "isU5Imm";
@@ -671,6 +730,7 @@ def u5imm : Operand<i32> {
let PrintMethod = "printU5ImmOperand";
let ParserMatchClass = PPCU5ImmAsmOperand;
let DecoderMethod = "decodeUImmOperand<5>";
+ let OperandType = "OPERAND_IMMEDIATE";
}
def PPCU6ImmAsmOperand : AsmOperandClass {
let Name = "U6Imm"; let PredicateMethod = "isU6Imm";
@@ -680,6 +740,7 @@ def u6imm : Operand<i32> {
let PrintMethod = "printU6ImmOperand";
let ParserMatchClass = PPCU6ImmAsmOperand;
let DecoderMethod = "decodeUImmOperand<6>";
+ let OperandType = "OPERAND_IMMEDIATE";
}
def PPCU7ImmAsmOperand : AsmOperandClass {
let Name = "U7Imm"; let PredicateMethod = "isU7Imm";
@@ -689,6 +750,7 @@ def u7imm : Operand<i32> {
let PrintMethod = "printU7ImmOperand";
let ParserMatchClass = PPCU7ImmAsmOperand;
let DecoderMethod = "decodeUImmOperand<7>";
+ let OperandType = "OPERAND_IMMEDIATE";
}
def PPCU8ImmAsmOperand : AsmOperandClass {
let Name = "U8Imm"; let PredicateMethod = "isU8Imm";
@@ -698,6 +760,7 @@ def u8imm : Operand<i32> {
let PrintMethod = "printU8ImmOperand";
let ParserMatchClass = PPCU8ImmAsmOperand;
let DecoderMethod = "decodeUImmOperand<8>";
+ let OperandType = "OPERAND_IMMEDIATE";
}
def PPCU10ImmAsmOperand : AsmOperandClass {
let Name = "U10Imm"; let PredicateMethod = "isU10Imm";
@@ -707,6 +770,7 @@ def u10imm : Operand<i32> {
let PrintMethod = "printU10ImmOperand";
let ParserMatchClass = PPCU10ImmAsmOperand;
let DecoderMethod = "decodeUImmOperand<10>";
+ let OperandType = "OPERAND_IMMEDIATE";
}
def PPCU12ImmAsmOperand : AsmOperandClass {
let Name = "U12Imm"; let PredicateMethod = "isU12Imm";
@@ -716,6 +780,7 @@ def u12imm : Operand<i32> {
let PrintMethod = "printU12ImmOperand";
let ParserMatchClass = PPCU12ImmAsmOperand;
let DecoderMethod = "decodeUImmOperand<12>";
+ let OperandType = "OPERAND_IMMEDIATE";
}
def PPCS16ImmAsmOperand : AsmOperandClass {
let Name = "S16Imm"; let PredicateMethod = "isS16Imm";
@@ -726,6 +791,7 @@ def s16imm : Operand<i32> {
let EncoderMethod = "getImm16Encoding";
let ParserMatchClass = PPCS16ImmAsmOperand;
let DecoderMethod = "decodeSImmOperand<16>";
+ let OperandType = "OPERAND_IMMEDIATE";
}
def PPCU16ImmAsmOperand : AsmOperandClass {
let Name = "U16Imm"; let PredicateMethod = "isU16Imm";
@@ -736,6 +802,7 @@ def u16imm : Operand<i32> {
let EncoderMethod = "getImm16Encoding";
let ParserMatchClass = PPCU16ImmAsmOperand;
let DecoderMethod = "decodeUImmOperand<16>";
+ let OperandType = "OPERAND_IMMEDIATE";
}
def PPCS17ImmAsmOperand : AsmOperandClass {
let Name = "S17Imm"; let PredicateMethod = "isS17Imm";
@@ -749,6 +816,7 @@ def s17imm : Operand<i32> {
let EncoderMethod = "getImm16Encoding";
let ParserMatchClass = PPCS17ImmAsmOperand;
let DecoderMethod = "decodeSImmOperand<16>";
+ let OperandType = "OPERAND_IMMEDIATE";
}
def PPCS34ImmAsmOperand : AsmOperandClass {
let Name = "S34Imm";
@@ -757,9 +825,17 @@ def PPCS34ImmAsmOperand : AsmOperandClass {
}
def s34imm : Operand<i64> {
let PrintMethod = "printS34ImmOperand";
- let EncoderMethod = "getImm34Encoding";
+ let EncoderMethod = "getImm34EncodingNoPCRel";
let ParserMatchClass = PPCS34ImmAsmOperand;
let DecoderMethod = "decodeSImmOperand<34>";
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+def s34imm_pcrel : Operand<i64> {
+ let PrintMethod = "printS34ImmOperand";
+ let EncoderMethod = "getImm34EncodingPCRel";
+ let ParserMatchClass = PPCS34ImmAsmOperand;
+ let DecoderMethod = "decodeSImmOperand<34>";
+ let OperandType = "OPERAND_IMMEDIATE";
}
def PPCImmZeroAsmOperand : AsmOperandClass {
let Name = "ImmZero";
@@ -770,6 +846,7 @@ def immZero : Operand<i32> {
let PrintMethod = "printImmZeroOperand";
let ParserMatchClass = PPCImmZeroAsmOperand;
let DecoderMethod = "decodeImmZeroOperand";
+ let OperandType = "OPERAND_IMMEDIATE";
}
def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); }]>;
@@ -915,40 +992,47 @@ def memri : Operand<iPTR> {
let MIOperandInfo = (ops dispRI:$imm, ptr_rc_nor0:$reg);
let EncoderMethod = "getMemRIEncoding";
let DecoderMethod = "decodeMemRIOperands";
+ let OperandType = "OPERAND_MEMORY";
}
def memrr : Operand<iPTR> {
let PrintMethod = "printMemRegReg";
let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg, ptr_rc_idx:$offreg);
+ let OperandType = "OPERAND_MEMORY";
}
def memrix : Operand<iPTR> { // memri where the imm is 4-aligned.
let PrintMethod = "printMemRegImm";
let MIOperandInfo = (ops dispRIX:$imm, ptr_rc_nor0:$reg);
let EncoderMethod = "getMemRIXEncoding";
let DecoderMethod = "decodeMemRIXOperands";
+ let OperandType = "OPERAND_MEMORY";
}
def memrix16 : Operand<iPTR> { // memri, imm is 16-aligned, 12-bit, Inst{16:27}
let PrintMethod = "printMemRegImm";
let MIOperandInfo = (ops dispRIX16:$imm, ptr_rc_nor0:$reg);
let EncoderMethod = "getMemRIX16Encoding";
let DecoderMethod = "decodeMemRIX16Operands";
+ let OperandType = "OPERAND_MEMORY";
}
def spe8dis : Operand<iPTR> { // SPE displacement where the imm is 8-aligned.
let PrintMethod = "printMemRegImm";
let MIOperandInfo = (ops dispSPE8:$imm, ptr_rc_nor0:$reg);
let EncoderMethod = "getSPE8DisEncoding";
let DecoderMethod = "decodeSPE8Operands";
+ let OperandType = "OPERAND_MEMORY";
}
def spe4dis : Operand<iPTR> { // SPE displacement where the imm is 4-aligned.
let PrintMethod = "printMemRegImm";
let MIOperandInfo = (ops dispSPE4:$imm, ptr_rc_nor0:$reg);
let EncoderMethod = "getSPE4DisEncoding";
let DecoderMethod = "decodeSPE4Operands";
+ let OperandType = "OPERAND_MEMORY";
}
def spe2dis : Operand<iPTR> { // SPE displacement where the imm is 2-aligned.
let PrintMethod = "printMemRegImm";
let MIOperandInfo = (ops dispSPE2:$imm, ptr_rc_nor0:$reg);
let EncoderMethod = "getSPE2DisEncoding";
let DecoderMethod = "decodeSPE2Operands";
+ let OperandType = "OPERAND_MEMORY";
}
// A single-register address. This is used with the SjLj
@@ -956,6 +1040,7 @@ def spe2dis : Operand<iPTR> { // SPE displacement where the imm is 2-aligned.
// G8RC_NOX0 registers.
def memr : Operand<iPTR> {
let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg);
+ let OperandType = "OPERAND_MEMORY";
}
def PPCTLSRegOperand : AsmOperandClass {
let Name = "TLSReg"; let PredicateMethod = "isTLSReg";
@@ -981,11 +1066,13 @@ def pred : Operand<OtherVT> {
// Define PowerPC specific addressing mode.
// d-form
-def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>; // "stb"
+def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>; // "stb"
// ds-form
-def iaddrX4 : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std"
+def iaddrX4 : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std"
// dq-form
-def iaddrX16 : ComplexPattern<iPTR, 2, "SelectAddrImmX16", [], []>; // "stxv"
+def iaddrX16 : ComplexPattern<iPTR, 2, "SelectAddrImmX16", [], []>; // "stxv"
+// 8LS:d-form
+def iaddrX34 : ComplexPattern<iPTR, 2, "SelectAddrImmX34", [], []>; // "pstxvp"
// Below forms are all x-form addressing mode, use three different ones so we
// can make an accurate check for x-form instructions in ISEL.
@@ -1031,6 +1118,11 @@ def HasExtDiv : Predicate<"Subtarget->hasExtDiv()">;
def IsISA3_0 : Predicate<"Subtarget->isISA3_0()">;
def HasFPU : Predicate<"Subtarget->hasFPU()">;
def PCRelativeMemops : Predicate<"Subtarget->hasPCRelativeMemops()">;
+def IsNotISA3_1 : Predicate<"!Subtarget->isISA3_1()">;
+
+// The AIX assembler may not be modern enough to support some extended mnemonics.
+def ModernAs: Predicate<"!Subtarget->isAIXABI() || Subtarget->HasModernAIXAs">,
+ AssemblerPredicate<(any_of (not AIXOS), FeatureModernAIXAs)>;
//===----------------------------------------------------------------------===//
// PowerPC Multiclass Definitions.
@@ -1389,10 +1481,7 @@ def ADJCALLSTACKUP : PPCEmitTimePseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2
"#ADJCALLSTACKUP $amt1 $amt2",
[(callseq_end timm:$amt1, timm:$amt2)]>;
}
-
-def UPDATE_VRSAVE : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc:$rS),
- "UPDATE_VRSAVE $rD, $rS", []>;
-}
+} // hasCtrlDep
let Defs = [R1], Uses = [R1] in
def DYNALLOC : PPCEmitTimePseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#DYNALLOC",
@@ -1518,6 +1607,9 @@ def SETRNDi : PPCCustomInserterPseudo<(outs f8rc:$FRT), (ins u2imm:$RND),
def SETRND : PPCCustomInserterPseudo<(outs f8rc:$FRT), (ins gprc:$in),
"#SETRND", [(set f64:$FRT, (int_ppc_setrnd gprc :$in))]>;
+
+def SETFLM : PPCCustomInserterPseudo<(outs f8rc:$FRT), (ins f8rc:$FLM),
+ "#SETFLM", [(set f64:$FRT, (int_ppc_setflm f8rc:$FLM))]>;
}
let Defs = [LR] in
@@ -1567,11 +1659,12 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
def BCn : BForm_4<16, 4, 0, 0, (outs), (ins crbitrc:$bi, condbrtarget:$dst),
"bc 4, $bi, $dst">;
- let isReturn = 1, Uses = [LR, RM] in
+ let isReturn = 1, Uses = [LR, RM] in {
def BCLR : XLForm_2_br2<19, 16, 12, 0, (outs), (ins crbitrc:$bi),
"bclr 12, $bi, 0", IIC_BrB, []>;
def BCLRn : XLForm_2_br2<19, 16, 4, 0, (outs), (ins crbitrc:$bi),
"bclr 4, $bi, 0", IIC_BrB, []>;
+ }
}
let isReturn = 1, Defs = [CTR], Uses = [CTR, LR, RM] in {
@@ -1843,7 +1936,7 @@ def DCBZL : DCB_Form<1014, 1, (outs), (ins memrr:$dst), "dcbzl $dst",
IIC_LdStDCBF, [(int_ppc_dcbzl xoaddr:$dst)]>,
PPC970_DGroup_Single;
-def DCBF : DCB_Form_hint<86, (outs), (ins u5imm:$TH, memrr:$dst),
+def DCBF : DCB_Form_hint<86, (outs), (ins u3imm:$TH, memrr:$dst),
"dcbf $dst, $TH", IIC_LdStDCBF, []>,
PPC970_DGroup_Single;
@@ -2378,7 +2471,7 @@ let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
def STMW : DForm_1<47, (outs), (ins gprc:$rS, memri:$dst),
"stmw $rS, $dst", IIC_LdStLMW, []>;
-def SYNC : XForm_24_sync<31, 598, (outs), (ins i32imm:$L),
+def SYNC : XForm_24_sync<31, 598, (outs), (ins u2imm:$L),
"sync $L", IIC_LdStSync, []>;
let isCodeGenOnly = 1 in {
@@ -2573,37 +2666,26 @@ let isCompare = 1, hasSideEffects = 0 in {
}
}
let PPC970_Unit = 3, Predicates = [HasFPU] in { // FPU Operations.
-//def FCMPO : XForm_17<63, 32, (outs CRRC:$crD), (ins FPRC:$fA, FPRC:$fB),
-// "fcmpo $crD, $fA, $fB", IIC_FPCompare>;
-let isCompare = 1, hasSideEffects = 0 in {
+let isCompare = 1, mayRaiseFPException = 1, hasSideEffects = 0 in {
def FCMPUS : XForm_17<63, 0, (outs crrc:$crD), (ins f4rc:$fA, f4rc:$fB),
"fcmpu $crD, $fA, $fB", IIC_FPCompare>;
- let Interpretation64Bit = 1, isCodeGenOnly = 1 in
- def FCMPUD : XForm_17<63, 0, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB),
- "fcmpu $crD, $fA, $fB", IIC_FPCompare>;
+ def FCMPOS : XForm_17<63, 32, (outs crrc:$crD), (ins f4rc:$fA, f4rc:$fB),
+ "fcmpo $crD, $fA, $fB", IIC_FPCompare>;
+ let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+ def FCMPUD : XForm_17<63, 0, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB),
+ "fcmpu $crD, $fA, $fB", IIC_FPCompare>;
+ def FCMPOD : XForm_17<63, 32, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB),
+ "fcmpo $crD, $fA, $fB", IIC_FPCompare>;
+ }
}
def FTDIV: XForm_17<63, 128, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB),
"ftdiv $crD, $fA, $fB", IIC_FPCompare>;
def FTSQRT: XForm_17a<63, 160, (outs crrc:$crD), (ins f8rc:$fB),
- "ftsqrt $crD, $fB", IIC_FPCompare>;
-
-let Uses = [RM], mayRaiseFPException = 1 in {
- let hasSideEffects = 0 in {
- defm FCTIW : XForm_26r<63, 14, (outs f8rc:$frD), (ins f8rc:$frB),
- "fctiw", "$frD, $frB", IIC_FPGeneral,
- []>;
- defm FCTIWU : XForm_26r<63, 142, (outs f8rc:$frD), (ins f8rc:$frB),
- "fctiwu", "$frD, $frB", IIC_FPGeneral,
- []>;
- defm FCTIWZ : XForm_26r<63, 15, (outs f8rc:$frD), (ins f8rc:$frB),
- "fctiwz", "$frD, $frB", IIC_FPGeneral,
- [(set f64:$frD, (PPCfctiwz f64:$frB))]>;
-
- defm FRSP : XForm_26r<63, 12, (outs f4rc:$frD), (ins f8rc:$frB),
- "frsp", "$frD, $frB", IIC_FPGeneral,
- [(set f32:$frD, (any_fpround f64:$frB))]>;
+ "ftsqrt $crD, $fB", IIC_FPCompare,
+ [(set i32:$crD, (PPCftsqrt f64:$fB))]>;
+let mayRaiseFPException = 1, hasSideEffects = 0 in {
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
defm FRIND : XForm_26r<63, 392, (outs f8rc:$frD), (ins f8rc:$frB),
"frin", "$frD, $frB", IIC_FPGeneral,
@@ -2611,9 +2693,7 @@ let Uses = [RM], mayRaiseFPException = 1 in {
defm FRINS : XForm_26r<63, 392, (outs f4rc:$frD), (ins f4rc:$frB),
"frin", "$frD, $frB", IIC_FPGeneral,
[(set f32:$frD, (any_fround f32:$frB))]>;
- }
- let hasSideEffects = 0 in {
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
defm FRIPD : XForm_26r<63, 456, (outs f8rc:$frD), (ins f8rc:$frB),
"frip", "$frD, $frB", IIC_FPGeneral,
@@ -2635,6 +2715,22 @@ let Uses = [RM], mayRaiseFPException = 1 in {
defm FRIMS : XForm_26r<63, 488, (outs f4rc:$frD), (ins f4rc:$frB),
"frim", "$frD, $frB", IIC_FPGeneral,
[(set f32:$frD, (any_ffloor f32:$frB))]>;
+}
+
+let Uses = [RM], mayRaiseFPException = 1, hasSideEffects = 0 in {
+ defm FCTIW : XForm_26r<63, 14, (outs f8rc:$frD), (ins f8rc:$frB),
+ "fctiw", "$frD, $frB", IIC_FPGeneral,
+ []>;
+ defm FCTIWU : XForm_26r<63, 142, (outs f8rc:$frD), (ins f8rc:$frB),
+ "fctiwu", "$frD, $frB", IIC_FPGeneral,
+ []>;
+ defm FCTIWZ : XForm_26r<63, 15, (outs f8rc:$frD), (ins f8rc:$frB),
+ "fctiwz", "$frD, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (PPCany_fctiwz f64:$frB))]>;
+
+ defm FRSP : XForm_26r<63, 12, (outs f4rc:$frD), (ins f8rc:$frB),
+ "frsp", "$frD, $frB", IIC_FPGeneral,
+ [(set f32:$frD, (any_fpround f64:$frB))]>;
defm FSQRT : XForm_26r<63, 22, (outs f8rc:$frD), (ins f8rc:$frB),
"fsqrt", "$frD, $frB", IIC_FPSqrtD,
@@ -2642,9 +2738,10 @@ let Uses = [RM], mayRaiseFPException = 1 in {
defm FSQRTS : XForm_26r<59, 22, (outs f4rc:$frD), (ins f4rc:$frB),
"fsqrts", "$frD, $frB", IIC_FPSqrtS,
[(set f32:$frD, (any_fsqrt f32:$frB))]>;
- }
- }
}
+}
+
+def : Pat<(PPCfsqrt f64:$frA), (FSQRT $frA)>;
/// Note that FMR is defined as pseudo-ops on the PPC970 because they are
/// often coalesced away and we don't want the dispatch group builder to think
@@ -2689,6 +2786,7 @@ defm FCPSGND : XForm_28r<63, 8, (outs f8rc:$frD), (ins f8rc:$frA, f8rc:$frB),
[(set f64:$frD, (fcopysign f64:$frB, f64:$frA))]>;
// Reciprocal estimates.
+let mayRaiseFPException = 1 in {
defm FRE : XForm_26r<63, 24, (outs f8rc:$frD), (ins f8rc:$frB),
"fre", "$frD, $frB", IIC_FPGeneral,
[(set f64:$frD, (PPCfre f64:$frB))]>;
@@ -2702,6 +2800,7 @@ defm FRSQRTES : XForm_26r<59, 26, (outs f4rc:$frD), (ins f4rc:$frB),
"frsqrtes", "$frD, $frB", IIC_FPGeneral,
[(set f32:$frD, (PPCfrsqrte f32:$frB))]>;
}
+}
// XL-Form instructions. condition register logical ops.
//
@@ -2862,18 +2961,6 @@ let isCodeGenOnly = 1 in {
def : InstAlias<"mtvrsave $rS", (MTVRSAVE gprc:$rS)>;
def : InstAlias<"mfvrsave $rS", (MFVRSAVE gprc:$rS)>;
-// SPILL_VRSAVE - Indicate that we're dumping the VRSAVE register,
-// so we'll need to scavenge a register for it.
-let mayStore = 1 in
-def SPILL_VRSAVE : PPCEmitTimePseudo<(outs), (ins VRSAVERC:$vrsave, memri:$F),
- "#SPILL_VRSAVE", []>;
-
-// RESTORE_VRSAVE - Indicate that we're restoring the VRSAVE register (previously
-// spilled), so we'll need to scavenge a register for it.
-let mayLoad = 1 in
-def RESTORE_VRSAVE : PPCEmitTimePseudo<(outs VRSAVERC:$vrsave), (ins memri:$F),
- "#RESTORE_VRSAVE", []>;
-
let hasSideEffects = 0 in {
// mtocrf's input needs to be prepared by shifting by an amount dependent
// on the cr register selected. Thus, post-ra anti-dep breaking must not
@@ -2913,20 +3000,24 @@ def : InstAlias<"mtcr $rA", (MTCRF 255, gprc:$rA)>;
let Predicates = [HasFPU] in {
// Custom inserter instruction to perform FADD in round-to-zero mode.
-let Uses = [RM] in {
+let Uses = [RM], mayRaiseFPException = 1 in {
def FADDrtz: PPCCustomInserterPseudo<(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB), "",
- [(set f64:$FRT, (PPCfaddrtz f64:$FRA, f64:$FRB))]>;
+ [(set f64:$FRT, (PPCany_faddrtz f64:$FRA, f64:$FRB))]>;
}
// The above pseudo gets expanded to make use of the following instructions
// to manipulate FPSCR. Note that FPSCR is not modeled at the DAG level.
-let Uses = [RM], Defs = [RM] in {
- def MTFSB0 : XForm_43<63, 70, (outs), (ins u5imm:$FM),
- "mtfsb0 $FM", IIC_IntMTFSB0, []>,
- PPC970_DGroup_Single, PPC970_Unit_FPU;
- def MTFSB1 : XForm_43<63, 38, (outs), (ins u5imm:$FM),
- "mtfsb1 $FM", IIC_IntMTFSB0, []>,
- PPC970_DGroup_Single, PPC970_Unit_FPU;
+
+// When FM is 30/31, we are setting the 62/63 bit of FPSCR, so the implicit-def
+// RM should be set.
+def MTFSB0 : XForm_43<63, 70, (outs), (ins u5imm:$FM),
+ "mtfsb0 $FM", IIC_IntMTFSB0, []>,
+ PPC970_DGroup_Single, PPC970_Unit_FPU;
+def MTFSB1 : XForm_43<63, 38, (outs), (ins u5imm:$FM),
+ "mtfsb1 $FM", IIC_IntMTFSB0, []>,
+ PPC970_DGroup_Single, PPC970_Unit_FPU;
+
+let Defs = [RM] in {
let isCodeGenOnly = 1 in
def MTFSFb : XFLForm<63, 711, (outs), (ins i32imm:$FM, f8rc:$rT),
"mtfsf $FM, $rT", IIC_IntMTFSB0, []>,
@@ -3065,7 +3156,7 @@ def : InstAlias<"subc. $rA, $rB, $rC", (SUBFC_rec gprc:$rA, gprc:$rC, gprc:$rB)>
// this type.
//
let PPC970_Unit = 3, hasSideEffects = 0, Predicates = [HasFPU] in { // FPU Operations.
-let Uses = [RM] in {
+let mayRaiseFPException = 1, Uses = [RM] in {
let isCommutable = 1 in {
defm FMADD : AForm_1r<63, 29,
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
@@ -3251,9 +3342,13 @@ def : Pat<(PPCcall (i32 texternalsym:$dst)),
// Calls for AIX only
def : Pat<(PPCcall (i32 mcsym:$dst)),
(BL mcsym:$dst)>;
+
def : Pat<(PPCcall_nop (i32 mcsym:$dst)),
(BL_NOP mcsym:$dst)>;
+def : Pat<(PPCcall_nop (i32 texternalsym:$dst)),
+ (BL_NOP texternalsym:$dst)>;
+
def : Pat<(PPCtc_return (i32 tglobaladdr:$dst), imm:$imm),
(TCRETURNdi tglobaladdr:$dst, imm:$imm)>;
@@ -3263,7 +3358,7 @@ def : Pat<(PPCtc_return (i32 texternalsym:$dst), imm:$imm),
def : Pat<(PPCtc_return CTRRC:$dst, imm:$imm),
(TCRETURNri CTRRC:$dst, imm:$imm)>;
-
+def : Pat<(int_ppc_readflm), (MFFS)>;
// Hi and Lo for Darwin Global Addresses.
def : Pat<(PPChi tglobaladdr:$in, 0), (LIS tglobaladdr:$in)>;
@@ -3417,7 +3512,7 @@ def : Pat<(f64 (extloadf32 iaddr:$src)),
def : Pat<(f64 (extloadf32 xaddr:$src)),
(COPY_TO_REGCLASS (LFSX xaddr:$src), F8RC)>;
-def : Pat<(f64 (fpextend f32:$src)),
+def : Pat<(f64 (any_fpextend f32:$src)),
(COPY_TO_REGCLASS $src, F8RC)>;
}
@@ -3457,7 +3552,6 @@ include "PPCInstrAltivec.td"
include "PPCInstrSPE.td"
include "PPCInstr64Bit.td"
include "PPCInstrVSX.td"
-include "PPCInstrQPX.td"
include "PPCInstrHTM.td"
def crnot : OutPatFrag<(ops node:$in),
@@ -3841,6 +3935,7 @@ def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETGT)),
def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETEQ)),
(EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;
+let Predicates = [IsNotISA3_1] in {
// Instantiations of CRNotPat for i32.
defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGE)),
(EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>;
@@ -3898,106 +3993,62 @@ defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETLE)),
(EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>;
defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETNE)),
(EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;
+}
-let Predicates = [HasFPU] in {
-// Instantiations of CRNotPat for f32.
-defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUGE)),
- (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
-defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETGE)),
- (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
-defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETULE)),
- (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETLE)),
- (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUNE)),
- (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
-defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETNE)),
- (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
-defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETO)),
- (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>;
-
-// Instantiations of CRNotPat for f64.
-defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUGE)),
- (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
-defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETGE)),
- (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
-defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETULE)),
- (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETLE)),
- (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUNE)),
- (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
-defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)),
- (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
-defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETO)),
- (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>;
-
-// Instantiations of CRNotPat for f128.
-defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUGE)),
- (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>;
-defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETGE)),
- (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>;
-defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETULE)),
- (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETLE)),
- (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUNE)),
- (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>;
-defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETNE)),
- (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>;
-defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETO)),
- (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_un)>;
+multiclass FSetCCPat<SDNode SetCC, ValueType Ty, PatLeaf FCmp> {
+ defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETUGE)),
+ (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>;
+ defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETGE)),
+ (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>;
+ defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETULE)),
+ (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>;
+ defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETLE)),
+ (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>;
+ defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETUNE)),
+ (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>;
+ defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETNE)),
+ (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>;
+ defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETO)),
+ (EXTRACT_SUBREG (FCmp $s1, $s2), sub_un)>;
+
+ def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETOLT)),
+ (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>;
+ def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETLT)),
+ (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>;
+ def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETOGT)),
+ (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>;
+ def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETGT)),
+ (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>;
+ def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETOEQ)),
+ (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>;
+ def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETEQ)),
+ (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>;
+ def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETUO)),
+ (EXTRACT_SUBREG (FCmp $s1, $s2), sub_un)>;
}
-// SETCC for f32.
let Predicates = [HasFPU] in {
-def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOLT)),
- (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
-def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETLT)),
- (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
-def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOGT)),
- (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
-def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETGT)),
- (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
-def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOEQ)),
- (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
-def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETEQ)),
- (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
-def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETUO)),
- (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>;
+// FCMPU: If either of the operands is a Signaling NaN, then VXSNAN is set.
+// SETCC for f32.
+defm : FSetCCPat<any_fsetcc, f32, FCMPUS>;
// SETCC for f64.
-def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOLT)),
- (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
-def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETLT)),
- (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
-def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOGT)),
- (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
-def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETGT)),
- (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
-def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOEQ)),
- (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
-def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETEQ)),
- (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
-def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETUO)),
- (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>;
+defm : FSetCCPat<any_fsetcc, f64, FCMPUD>;
// SETCC for f128.
-def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETOLT)),
- (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>;
-def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETLT)),
- (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>;
-def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETOGT)),
- (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>;
-def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETGT)),
- (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>;
-def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETOEQ)),
- (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>;
-def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETEQ)),
- (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>;
-def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETUO)),
- (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_un)>;
+defm : FSetCCPat<any_fsetcc, f128, XSCMPUQP>;
+
+// FCMPO: If either of the operands is a Signaling NaN, then VXSNAN is set and,
+// if neither operand is a Signaling NaN but at least one operand is a Quiet NaN,
+// then VXVC is set.
+// SETCCS for f32.
+defm : FSetCCPat<strict_fsetccs, f32, FCMPOS>;
+
+// SETCCS for f64.
+defm : FSetCCPat<strict_fsetccs, f64, FCMPOD>;
+// SETCCS for f128.
+defm : FSetCCPat<strict_fsetccs, f128, XSCMPOQP>;
}
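
FSetCCPat maps each floating-point setcc predicate to one bit of the CR field written by the compare, and wraps the predicates that are only available as the negation of a bit in CRNotPat. A compact C++ restatement of the mapping encoded by the patterns above (struct and table names are hypothetical):

#include <cstdio>

struct SetCCMapping {
  const char *Pred;  // setcc condition code(s)
  const char *CRBit; // CR field bit that answers it
  bool Negate;       // true when CRNotPat inverts the bit
};

// Direct bits: LT/OLT, GT/OGT, EQ/OEQ, UO.
// Negated bits: GE/UGE (!lt), LE/ULE (!gt), NE/UNE (!eq), O (!un).
static const SetCCMapping FPSetCCMap[] = {
    {"SETOLT/SETLT", "sub_lt", false}, {"SETOGT/SETGT", "sub_gt", false},
    {"SETOEQ/SETEQ", "sub_eq", false}, {"SETUO",        "sub_un", false},
    {"SETUGE/SETGE", "sub_lt", true},  {"SETULE/SETLE", "sub_gt", true},
    {"SETUNE/SETNE", "sub_eq", true},  {"SETO",         "sub_un", true},
};

int main() {
  for (const auto &M : FPSetCCMap)
    std::printf("%-13s -> %s%s\n", M.Pred, M.Negate ? "!" : "", M.CRBit);
}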
// This must be in this file because it relies on patterns defined in this file
@@ -4266,7 +4317,7 @@ def ISYNC : XLForm_2_ext<19, 150, 0, 0, 0, (outs), (ins),
def ICBI : XForm_1a<31, 982, (outs), (ins memrr:$src),
"icbi $src", IIC_LdStICBI, []>;
-def WAIT : XForm_24_sync<31, 30, (outs), (ins i32imm:$L),
+def WAIT : XForm_24_sync<31, 30, (outs), (ins u2imm:$L),
"wait $L", IIC_LdStLoad, []>;
def MBAR : XForm_mbar<31, 854, (outs), (ins u5imm:$MO),
@@ -4284,7 +4335,7 @@ def MTSRIN: XForm_srin<31, 242, (outs), (ins gprc:$RS, gprc:$RB),
def MFSRIN: XForm_srin<31, 659, (outs gprc:$RS), (ins gprc:$RB),
"mfsrin $RS, $RB", IIC_SprMFSR>;
-def MTMSR: XForm_mtmsr<31, 146, (outs), (ins gprc:$RS, i32imm:$L),
+def MTMSR: XForm_mtmsr<31, 146, (outs), (ins gprc:$RS, u1imm:$L),
"mtmsr $RS, $L", IIC_SprMTMSR>;
def WRTEE: XForm_mtmsr<31, 131, (outs), (ins gprc:$RS),
@@ -4313,15 +4364,17 @@ def : InstAlias<"iccci", (ICCCI R0, R0)>, Requires<[IsPPC4xx]>;
def MFMSR : XForm_rs<31, 83, (outs gprc:$RT), (ins),
"mfmsr $RT", IIC_SprMFMSR, []>;
-def MTMSRD : XForm_mtmsr<31, 178, (outs), (ins gprc:$RS, i32imm:$L),
+def MTMSRD : XForm_mtmsr<31, 178, (outs), (ins gprc:$RS, u1imm:$L),
"mtmsrd $RS, $L", IIC_SprMTMSRD>;
def MCRFS : XLForm_3<63, 64, (outs crrc:$BF), (ins crrc:$BFA),
"mcrfs $BF, $BFA", IIC_BrMCR>;
+// If W is 0 and BF is 7, the 60:63 bits will be set, so we should set the
+// implicit-def RM.
def MTFSFI : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W),
"mtfsfi $BF, $U, $W", IIC_IntMFFS>;
-
+let Defs = [CR1] in
def MTFSFI_rec : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W),
"mtfsfi. $BF, $U, $W", IIC_IntMFFS>, isRecordForm;
@@ -4329,12 +4382,15 @@ def : InstAlias<"mtfsfi $BF, $U", (MTFSFI crrc:$BF, i32imm:$U, 0)>;
def : InstAlias<"mtfsfi. $BF, $U", (MTFSFI_rec crrc:$BF, i32imm:$U, 0)>;
let Predicates = [HasFPU] in {
+let Defs = [RM] in {
def MTFSF : XFLForm_1<63, 711, (outs),
- (ins i32imm:$FLM, f8rc:$FRB, i32imm:$L, i32imm:$W),
+ (ins i32imm:$FLM, f8rc:$FRB, u1imm:$L, i32imm:$W),
"mtfsf $FLM, $FRB, $L, $W", IIC_IntMFFS, []>;
+let Defs = [CR1] in
def MTFSF_rec : XFLForm_1<63, 711, (outs),
- (ins i32imm:$FLM, f8rc:$FRB, i32imm:$L, i32imm:$W),
+ (ins i32imm:$FLM, f8rc:$FRB, u1imm:$L, i32imm:$W),
"mtfsf. $FLM, $FRB, $L, $W", IIC_IntMFFS, []>, isRecordForm;
+}
def : InstAlias<"mtfsf $FLM, $FRB", (MTFSF i32imm:$FLM, f8rc:$FRB, 0, 0)>;
def : InstAlias<"mtfsf. $FLM, $FRB", (MTFSF_rec i32imm:$FLM, f8rc:$FRB, 0, 0)>;
@@ -4561,6 +4617,16 @@ def : Pat<(int_ppc_dcbfl xoaddr:$dst),
def : Pat<(int_ppc_dcbflp xoaddr:$dst),
(DCBF 3, xoaddr:$dst)>;
+let Predicates = [IsISA3_1] in {
+ def DCBFPS : PPCAsmPseudo<"dcbfps $dst", (ins memrr:$dst)>;
+ def DCBSTPS : PPCAsmPseudo<"dcbstps $dst", (ins memrr:$dst)>;
+
+ def : Pat<(int_ppc_dcbfps xoaddr:$dst),
+ (DCBF 4, xoaddr:$dst)>;
+ def : Pat<(int_ppc_dcbstps xoaddr:$dst),
+ (DCBF 6, xoaddr:$dst)>;
+}
+
def : InstAlias<"crset $bx", (CREQV crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>;
def : InstAlias<"crclr $bx", (CRXOR crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>;
def : InstAlias<"crmove $bx, $by", (CROR crbitrc:$bx, crbitrc:$by, crbitrc:$by)>;
@@ -4587,8 +4653,11 @@ def : InstAlias<"mtmsr $RS", (MTMSR gprc:$RS, 0)>;
def : InstAlias<"mtxer $Rx", (MTSPR 1, gprc:$Rx)>;
def : InstAlias<"mfxer $Rx", (MFSPR gprc:$Rx, 1)>;
+// Disable these aliases on AIX for now because the system assembler does not support them.
+let Predicates = [ModernAs] in {
def : InstAlias<"mtudscr $Rx", (MTSPR 3, gprc:$Rx)>;
def : InstAlias<"mfudscr $Rx", (MFSPR gprc:$Rx, 3)>;
+}
def : InstAlias<"mfrtcu $Rx", (MFSPR gprc:$Rx, 4)>;
def : InstAlias<"mfrtcl $Rx", (MFSPR gprc:$Rx, 5)>;
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
index 2bab73418e10..b9eb3b3b7d37 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -1,3 +1,8 @@
+// Mask immediates for MMA instructions (2, 4 and 8 bits).
+def Msk2Imm : ImmLeaf<i32, [{ return isUInt<2>(Imm); }]>;
+def Msk4Imm : ImmLeaf<i32, [{ return isUInt<4>(Imm); }]>;
+def Msk8Imm : ImmLeaf<i32, [{ return isUInt<8>(Imm); }]>;
+
//===----------------------------------------------------------------------===//
// PowerPC ISA 3.1 specific type constraints.
//
@@ -5,12 +10,35 @@
def SDT_PPCSplat32 : SDTypeProfile<1, 3, [ SDTCisVT<0, v2i64>,
SDTCisVec<1>, SDTCisInt<2>, SDTCisInt<3>
]>;
+def SDT_PPCAccBuild : SDTypeProfile<1, 4, [
+ SDTCisVT<0, v512i1>, SDTCisVT<1, v4i32>, SDTCisVT<2, v4i32>,
+ SDTCisVT<3, v4i32>, SDTCisVT<4, v4i32>
+]>;
+def SDT_PPCPairBuild : SDTypeProfile<1, 2, [
+ SDTCisVT<0, v256i1>, SDTCisVT<1, v4i32>, SDTCisVT<2, v4i32>
+]>;
+def SDT_PPCAccExtractVsx : SDTypeProfile<1, 2, [
+ SDTCisVT<0, v4i32>, SDTCisVT<1, v512i1>, SDTCisInt<2>
+]>;
+def SDT_PPCPairExtractVsx : SDTypeProfile<1, 2, [
+ SDTCisVT<0, v4i32>, SDTCisVT<1, v256i1>, SDTCisInt<2>
+]>;
+def SDT_PPCxxmfacc : SDTypeProfile<1, 1, [
+ SDTCisVT<0, v512i1>, SDTCisVT<1, v512i1>
+]>;
//===----------------------------------------------------------------------===//
// ISA 3.1 specific PPCISD nodes.
//
def PPCxxsplti32dx : SDNode<"PPCISD::XXSPLTI32DX", SDT_PPCSplat32, []>;
+def PPCAccBuild : SDNode<"PPCISD::ACC_BUILD", SDT_PPCAccBuild, []>;
+def PPCPairBuild : SDNode<"PPCISD::PAIR_BUILD", SDT_PPCPairBuild, []>;
+def PPCAccExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCAccExtractVsx,
+ []>;
+def PPCPairExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCPairExtractVsx,
+ []>;
+def PPCxxmfacc : SDNode<"PPCISD::XXMFACC", SDT_PPCxxmfacc, []>;
//===----------------------------------------------------------------------===//
@@ -18,6 +46,15 @@ def PPCxxsplti32dx : SDNode<"PPCISD::XXSPLTI32DX", SDT_PPCSplat32, []>;
// address computations).
class isPCRel { bit PCRel = 1; }
+// PowerPC specific type constraints.
+def SDT_PPCLXVRZX : SDTypeProfile<1, 2, [
+ SDTCisVT<0, v1i128>, SDTCisPtrTy<1>, SDTCisPtrTy<2>
+]>;
+
+// PPC Specific DAG Nodes.
+def PPClxvrzx : SDNode<"PPCISD::LXVRZX", SDT_PPCLXVRZX,
+ [SDNPHasChain, SDNPMayLoad]>;
+
// Top-level class for prefixed instructions.
class PI<bits<6> pref, bits<6> opcode, dag OOL, dag IOL, string asmstr,
InstrItinClass itin> : Instruction {
@@ -59,6 +96,39 @@ class PI<bits<6> pref, bits<6> opcode, dag OOL, dag IOL, string asmstr,
string BaseName = "";
}
+// VX-Form: [ PO VT R VB RC XO ]
+class VXForm_VTB5_RC<bits<10> xo, bits<5> R, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VT;
+ bits<5> VB;
+ bit RC = 0;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VT;
+ let Inst{11-15} = R;
+ let Inst{16-20} = VB;
+ let Inst{21} = RC;
+ let Inst{22-31} = xo;
+}
+
+// Multiclass definition to account for record and non-record form
+// instructions of VXRForm.
+multiclass VXForm_VTB5_RCr<bits<10> xo, bits<5> R, dag OOL, dag IOL,
+ string asmbase, string asmstr,
+ InstrItinClass itin, list<dag> pattern> {
+ let BaseName = asmbase in {
+ def NAME : VXForm_VTB5_RC<xo, R, OOL, IOL,
+ !strconcat(asmbase, !strconcat(" ", asmstr)),
+ itin, pattern>, RecFormRel;
+ let Defs = [CR6] in
+ def _rec : VXForm_VTB5_RC<xo, R, OOL, IOL,
+ !strconcat(asmbase, !strconcat(". ", asmstr)),
+ itin, []>, isRecordForm, RecFormRel;
+ }
+}
+
class MLS_DForm_R_SI34_RTA5_MEM<bits<6> opcode, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: PI<1, opcode, OOL, IOL, asmstr, itin> {
@@ -242,29 +312,37 @@ class VXForm_RD5_N3_VB5<bits<11> xo, dag OOL, dag IOL, string asmstr,
}
-// VX-Form: [PO VRT / UIM RB XO].
-// We use VXForm_1 to implement it, that is, we use "VRA" (5 bit) to represent
-// "/ UIM" (unused bit followed by a 4-bit immediate)
-// Destructive (insert) forms are suffixed with _ins.
-class VXForm_VRT5_UIM5_RB5_ins<bits<11> xo, string opc, list<dag> pattern>
- : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vDi, u4imm:$UIM, g8rc:$rB),
- !strconcat(opc, " $vD, $rB, $UIM"), IIC_VecGeneral, pattern>,
- RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
-
// VX-Form: [PO VRT RA VRB XO].
// Destructive (insert) forms are suffixed with _ins.
class VXForm_VTB5_RA5_ins<bits<11> xo, string opc, list<dag> pattern>
- : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vDi, g8rc:$rA, vrrc:$vB),
+ : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vDi, gprc:$rA, vrrc:$vB),
!strconcat(opc, " $vD, $rA, $vB"), IIC_VecGeneral, pattern>,
RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
// VX-Form: [PO VRT RA RB XO].
// Destructive (insert) forms are suffixed with _ins.
class VXForm_VRT5_RAB5_ins<bits<11> xo, string opc, list<dag> pattern>
- : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vDi, g8rc:$rA, g8rc:$rB),
+ : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vDi, gprc:$rA, gprc:$rB),
!strconcat(opc, " $vD, $rA, $rB"), IIC_VecGeneral, pattern>,
RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
+// VX-Form: [ PO BF // VRA VRB XO ]
+class VXForm_BF3_VAB5<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<3> BF;
+ bits<5> VA;
+ bits<5> VB;
+
+ let Pattern = pattern;
+
+ let Inst{6-8} = BF;
+ let Inst{9-10} = 0;
+ let Inst{11-15} = VA;
+ let Inst{16-20} = VB;
+ let Inst{21-31} = xo;
+}
+
// VN-Form: [PO VRT VRA VRB PS SD XO]
// SD is "Shift Direction"
class VNForm_VTAB5_SD3<bits<6> xo, bits<2> ps, dag OOL, dag IOL, string asmstr,
@@ -285,6 +363,22 @@ class VNForm_VTAB5_SD3<bits<6> xo, bits<2> ps, dag OOL, dag IOL, string asmstr,
let Inst{26-31} = xo;
}
+class VXForm_RD5_MP_VB5<bits<11> xo, bits<4> eo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> RD;
+ bits<5> VB;
+ bit MP;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = RD;
+ let Inst{11-14} = eo;
+ let Inst{15} = MP;
+ let Inst{16-20} = VB;
+ let Inst{21-31} = xo;
+}
+
// 8RR:D-Form: [ 1 1 0 // // imm0
// PO T XO TX imm1 ].
class 8RR_DForm_IMM32_XT6<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
@@ -415,6 +509,13 @@ class XX2_BF3_XO5_XB6_XO9<bits<6> opcode, bits<5> xo2, bits<9> xo, dag OOL,
let Inst{31} = 0;
}
+// X-Form: [ PO RT BI /// XO / ]
+class XForm_XT5_BI5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let B = 0;
+}
+
multiclass MLS_DForm_R_SI34_RTA5_MEM_p<bits<6> opcode, dag OOL, dag IOL,
dag PCRel_IOL, string asmstr,
InstrItinClass itin> {
@@ -444,14 +545,307 @@ multiclass 8LS_DForm_R_SI34_XT6_RA5_p<bits<5> opcode, dag OOL, dag IOL,
isPCRel;
}
+def PPCRegVSRpRCAsmOperand : AsmOperandClass {
+ let Name = "RegVSRpRC"; let PredicateMethod = "isVSRpEvenRegNumber";
+}
+
+def vsrprc : RegisterOperand<VSRpRC> {
+ let ParserMatchClass = PPCRegVSRpRCAsmOperand;
+}
+
+def PPCRegVSRpEvenRCAsmOperand : AsmOperandClass {
+ let Name = "RegVSRpEvenRC"; let PredicateMethod = "isVSRpEvenRegNumber";
+}
+
+def vsrpevenrc : RegisterOperand<VSRpRC> {
+ let ParserMatchClass = PPCRegVSRpEvenRCAsmOperand;
+ let EncoderMethod = "getVSRpEvenEncoding";
+ let DecoderMethod = "decodeVSRpEvenOperands";
+}
+
+class DQForm_XTp5_RA17_MEM<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> XTp;
+ bits<17> DQ_RA;
+ let Pattern = pattern;
+
+ let Inst{6-9} = XTp{3-0};
+ let Inst{10} = XTp{4};
+ let Inst{11-15} = DQ_RA{16-12}; // Register #
+ let Inst{16-27} = DQ_RA{11-0}; // Displacement.
+ let Inst{28-31} = xo;
+}
+
+class XForm_XTp5_XAB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin>, XFormMemOp {
+ bits<5> XTp;
+ bits<5> A;
+ bits<5> B;
+
+ let Pattern = pattern;
+ let Inst{6-9} = XTp{3-0};
+ let Inst{10} = XTp{4};
+ let Inst{11-15} = A;
+ let Inst{16-20} = B;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+class 8LS_DForm_R_XTp5_SI34_MEM<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : PI<1, opcode, OOL, IOL, asmstr, itin> {
+ bits<5> XTp;
+ bits<39> D_RA;
+
+ let Pattern = pattern;
+
+ // The prefix.
+ let Inst{6-10} = 0;
+ let Inst{11} = PCRel;
+ let Inst{12-13} = 0;
+ let Inst{14-31} = D_RA{33-16}; // Imm18
+
+ // The instruction.
+ let Inst{38-41} = XTp{3-0};
+ let Inst{42} = XTp{4};
+ let Inst{43-47} = D_RA{38-34}; // Register #
+ let Inst{48-63} = D_RA{15-0}; // D
+}
+
+multiclass 8LS_DForm_R_XTp5_SI34_MEM_p<bits<6> pref, bits<6> opcode, dag OOL,
+ dag IOL, dag PCRel_IOL,
+ string asmstr, InstrItinClass itin> {
+ def NAME : 8LS_DForm_R_XTp5_SI34_MEM<opcode, OOL, IOL,
+ !strconcat(asmstr, ", 0"), itin, []>;
+ def pc : 8LS_DForm_R_XTp5_SI34_MEM<opcode, OOL, PCRel_IOL,
+ !strconcat(asmstr, ", 1"), itin, []>,
+ isPCRel;
+}
+
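
8LS_DForm_R_XTp5_SI34_MEM above splits the signed 34-bit displacement across the two words of the prefixed instruction: D_RA{33-16} lands in the prefix as Imm18 and D_RA{15-0} in the suffix as D. A short C++ sketch of that split and its reassembly (function names are hypothetical):

#include <cassert>
#include <cstdint>

// High 18 bits go into the prefix word, low 16 bits into the instruction
// word, mirroring the Inst{14-31} / Inst{48-63} assignments above.
static void splitD34(int64_t D, uint32_t &Imm18, uint32_t &D16) {
  assert(D >= -(1LL << 33) && D < (1LL << 33) && "not a signed 34-bit value");
  Imm18 = (uint64_t(D) >> 16) & 0x3FFFF;
  D16 = uint64_t(D) & 0xFFFF;
}

// Reassemble and sign-extend from bit 33.
static int64_t joinD34(uint32_t Imm18, uint32_t D16) {
  uint64_t V = (uint64_t(Imm18) << 16) | D16;
  return int64_t(V ^ (1ULL << 33)) - (1LL << 33);
}

int main() {
  uint32_t Hi, Lo;
  splitD34(-8, Hi, Lo);
  assert(joinD34(Hi, Lo) == -8);
}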
+def PPCRegACCRCAsmOperand : AsmOperandClass {
+ let Name = "RegACCRC"; let PredicateMethod = "isACCRegNumber";
+}
+
+def acc : RegisterOperand<ACCRC> {
+ let ParserMatchClass = PPCRegACCRCAsmOperand;
+}
+
+def uacc : RegisterOperand<UACCRC> {
+ let ParserMatchClass = PPCRegACCRCAsmOperand;
+}
+
+// [PO AS XO2 XO]
+class XForm_AT3<bits<6> opcode, bits<5> xo2, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<3> AT;
+
+ let Pattern = pattern;
+
+ let Inst{6-8} = AT;
+ let Inst{9-10} = 0;
+ let Inst{11-15} = xo2;
+ let Inst{16-20} = 0;
+ let Inst{21-30} = xo;
+ let Inst{31} = 0;
+}
+
+class XX3Form_AT3_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<3> AT;
+ bits<6> XA;
+ bits<6> XB;
+
+ let Pattern = pattern;
+
+ let Inst{6-8} = AT;
+ let Inst{9-10} = 0;
+ let Inst{11-15} = XA{4-0};
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-28} = xo;
+ let Inst{29} = XA{5};
+ let Inst{30} = XB{5};
+ let Inst{31} = 0;
+}
+
+class MMIRR_XX3Form_XY4P2_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : PI<1, opcode, OOL, IOL, asmstr, itin> {
+ bits<3> AT;
+ bits<6> XA;
+ bits<6> XB;
+ bits<4> XMSK;
+ bits<4> YMSK;
+ bits<2> PMSK;
+
+ let Pattern = pattern;
+
+ // The prefix.
+ let Inst{6-7} = 3;
+ let Inst{8-11} = 9;
+ let Inst{12-15} = 0;
+ let Inst{16-17} = PMSK;
+ let Inst{18-23} = 0;
+ let Inst{24-27} = XMSK;
+ let Inst{28-31} = YMSK;
+
+ // The instruction.
+ let Inst{38-40} = AT;
+ let Inst{41-42} = 0;
+ let Inst{43-47} = XA{4-0};
+ let Inst{48-52} = XB{4-0};
+ let Inst{53-60} = xo;
+ let Inst{61} = XA{5};
+ let Inst{62} = XB{5};
+ let Inst{63} = 0;
+}
+
+class MMIRR_XX3Form_XY4_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : PI<1, opcode, OOL, IOL, asmstr, itin> {
+ bits<3> AT;
+ bits<6> XA;
+ bits<6> XB;
+ bits<4> XMSK;
+ bits<4> YMSK;
+
+ let Pattern = pattern;
+
+ // The prefix.
+ let Inst{6-7} = 3;
+ let Inst{8-11} = 9;
+ let Inst{12-23} = 0;
+ let Inst{24-27} = XMSK;
+ let Inst{28-31} = YMSK;
+
+ // The instruction.
+ let Inst{38-40} = AT;
+ let Inst{41-42} = 0;
+ let Inst{43-47} = XA{4-0};
+ let Inst{48-52} = XB{4-0};
+ let Inst{53-60} = xo;
+ let Inst{61} = XA{5};
+ let Inst{62} = XB{5};
+ let Inst{63} = 0;
+}
+
+class MMIRR_XX3Form_X4Y2_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : PI<1, opcode, OOL, IOL, asmstr, itin> {
+ bits<3> AT;
+ bits<6> XA;
+ bits<6> XB;
+ bits<4> XMSK;
+ bits<2> YMSK;
+
+ let Pattern = pattern;
+
+ // The prefix.
+ let Inst{6-7} = 3;
+ let Inst{8-11} = 9;
+ let Inst{12-23} = 0;
+ let Inst{24-27} = XMSK;
+ let Inst{28-29} = YMSK;
+ let Inst{30-31} = 0;
+
+ // The instruction.
+ let Inst{38-40} = AT;
+ let Inst{41-42} = 0;
+ let Inst{43-47} = XA{4-0};
+ let Inst{48-52} = XB{4-0};
+ let Inst{53-60} = xo;
+ let Inst{61} = XA{5};
+ let Inst{62} = XB{5};
+ let Inst{63} = 0;
+}
+
+class MMIRR_XX3Form_XY4P8_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : PI<1, opcode, OOL, IOL, asmstr, itin> {
+ bits<3> AT;
+ bits<6> XA;
+ bits<6> XB;
+ bits<4> XMSK;
+ bits<4> YMSK;
+ bits<8> PMSK;
+
+ let Pattern = pattern;
+
+ // The prefix.
+ let Inst{6-7} = 3;
+ let Inst{8-11} = 9;
+ let Inst{12-15} = 0;
+ let Inst{16-23} = PMSK;
+ let Inst{24-27} = XMSK;
+ let Inst{28-31} = YMSK;
+
+ // The instruction.
+ let Inst{38-40} = AT;
+ let Inst{41-42} = 0;
+ let Inst{43-47} = XA{4-0};
+ let Inst{48-52} = XB{4-0};
+ let Inst{53-60} = xo;
+ let Inst{61} = XA{5};
+ let Inst{62} = XB{5};
+ let Inst{63} = 0;
+}
+
+class MMIRR_XX3Form_XYP4_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : PI<1, opcode, OOL, IOL, asmstr, itin> {
+ bits<3> AT;
+ bits<6> XA;
+ bits<6> XB;
+ bits<4> XMSK;
+ bits<4> YMSK;
+ bits<4> PMSK;
+
+ let Pattern = pattern;
+
+ // The prefix.
+ let Inst{6-7} = 3;
+ let Inst{8-11} = 9;
+ let Inst{12-15} = 0;
+ let Inst{16-19} = PMSK;
+ let Inst{20-23} = 0;
+ let Inst{24-27} = XMSK;
+ let Inst{28-31} = YMSK;
+
+ // The instruction.
+ let Inst{38-40} = AT;
+ let Inst{41-42} = 0;
+ let Inst{43-47} = XA{4-0};
+ let Inst{48-52} = XB{4-0};
+ let Inst{53-60} = xo;
+ let Inst{61} = XA{5};
+ let Inst{62} = XB{5};
+ let Inst{63} = 0;
+}
+
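
The MMIRR_XX3Form_* classes above lay out the mask fields of the prefix word using PowerPC's MSB-0 bit numbering. A C++ sketch that packs the prefix word of the XY4P2 variant from its fields, following the let Inst{...} assignments shown; placing the prefix primary opcode 1 in bits 0-5 is an assumption, since the PI base class is not shown here:

#include <cstdint>
#include <cstdio>

// MSB-0 helper: place a field whose rightmost bit sits at big-endian bit
// `Lo` of a 32-bit word (bit 0 is the most significant bit).
static uint32_t field(uint32_t Val, unsigned Lo) { return Val << (31 - Lo); }

// Prefix word of MMIRR_XX3Form_XY4P2_XAB6, per the bit assignments above.
static uint32_t buildXY4P2Prefix(unsigned PMSK, unsigned XMSK, unsigned YMSK) {
  uint32_t P = 0;
  P |= field(1, 5);           // Inst{0-5}   = prefix opcode (assumed)
  P |= field(3, 7);           // Inst{6-7}   = 3
  P |= field(9, 11);          // Inst{8-11}  = 9
  P |= field(PMSK & 0x3, 17); // Inst{16-17} = PMSK (2 bits)
  P |= field(XMSK & 0xF, 27); // Inst{24-27} = XMSK (4 bits)
  P |= field(YMSK & 0xF, 31); // Inst{28-31} = YMSK (4 bits)
  return P;
}

int main() {
  std::printf("0x%08x\n", buildXY4P2Prefix(3, 0xF, 0xF));
}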
def PrefixInstrs : Predicate<"Subtarget->hasPrefixInstrs()">;
def IsISA3_1 : Predicate<"Subtarget->isISA3_1()">;
+def PairedVectorMemops : Predicate<"Subtarget->pairedVectorMemops()">;
+def MMA : Predicate<"Subtarget->hasMMA()">;
+
+def RCCp {
+ dag AToVSRC = (COPY_TO_REGCLASS $XA, VSRC);
+ dag BToVSRC = (COPY_TO_REGCLASS $XB, VSRC);
+}
let Predicates = [PrefixInstrs] in {
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
defm PADDI8 :
MLS_DForm_R_SI34_RTA5_p<14, (outs g8rc:$RT), (ins g8rc:$RA, s34imm:$SI),
- (ins immZero:$RA, s34imm:$SI),
+ (ins immZero:$RA, s34imm_pcrel:$SI),
"paddi $RT, $RA, $SI", IIC_LdStLFD>;
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
def PLI8 : MLS_DForm_SI34_RT5<14, (outs g8rc:$RT),
@@ -461,7 +855,7 @@ let Predicates = [PrefixInstrs] in {
}
defm PADDI :
MLS_DForm_R_SI34_RTA5_p<14, (outs gprc:$RT), (ins gprc:$RA, s34imm:$SI),
- (ins immZero:$RA, s34imm:$SI),
+ (ins immZero:$RA, s34imm_pcrel:$SI),
"paddi $RT, $RA, $SI", IIC_LdStLFD>;
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
def PLI : MLS_DForm_SI34_RT5<14, (outs gprc:$RT),
@@ -592,6 +986,695 @@ let Predicates = [PrefixInstrs] in {
}
}
+// Multiclass definitions for MMA accumulator instructions.
+// ----------------------------------------------------------------------------
+
+// Defines 2 unmasked instructions where the XO field for the acc/non-acc
+// version is even/odd.
+multiclass ACC_UM_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
+ string asmstr> {
+ let Predicates = [MMA] in {
+ def NAME :
+ XX3Form_AT3_XAB6<opcode, !or(xo, 0x01), (outs acc:$AT), IOL,
+ !strconcat(asmbase#" ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"@earlyclobber $AT">;
+ def PP :
+ XX3Form_AT3_XAB6<opcode, xo, (outs acc:$AT), !con((ins acc:$ATi), IOL),
+ !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ }
+}
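+// As an illustrative sketch (the defms further down reach this multiclass
+// through the masked wrappers below rather than using it directly), an
+// instantiation such as
+//   defm XVI8GER4 : ACC_UM_XOEO<59, 2, (ins vsrc:$XA, vsrc:$XB),
+//                               "xvi8ger4", "$AT, $XA, $XB">;
+// would yield XVI8GER4 with XO = !or(2, 0x01) = 3 (odd, non-accumulating)
+// and XVI8GER4PP with XO = 2 (even, accumulating).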
+
+// Defines 4 instructions, masked/unmasked, with an 8-bit PMSK and 4-bit XMSK
+// and YMSK masks. The XO field for the acc/non-acc version is even/odd.
+multiclass ACC_UM_M844_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
+ string asmstr> {
+ defm NAME : ACC_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
+ let Predicates = [MMA, PrefixInstrs] in {
+ def PM#NAME :
+ MMIRR_XX3Form_XY4P8_XAB6<
+ opcode, !or(xo, 0x01), (outs acc:$AT),
+ !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u8imm:$PMSK)),
+ !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"@earlyclobber $AT">;
+ def PM#NAME#PP :
+ MMIRR_XX3Form_XY4P8_XAB6<
+ opcode, xo, (outs acc:$AT),
+ !con((ins acc:$ATi),
+ !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u8imm:$PMSK))),
+ !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ }
+}
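+// For example, the xvi4ger8 defm further down expands through this multiclass
+// to XVI4GER8, XVI4GER8PP, PMXVI4GER8 and PMXVI4GER8PP, where the prefixed
+// PM* forms take the extra mask operands:
+//   pmxvi4ger8 AT, XA, XB, XMSK, YMSK, PMSK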
+
+// Defines 4 instructions, masked/unmasked, with 4-bit PMSK, XMSK and YMSK
+// masks. The XO field for the acc/non-acc version is even/odd.
+multiclass ACC_UM_M444_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
+ string asmstr> {
+ defm NAME : ACC_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
+ let Predicates = [MMA, PrefixInstrs] in {
+ def PM#NAME :
+ MMIRR_XX3Form_XYP4_XAB6<
+ opcode, !or(xo, 0x01), (outs acc:$AT),
+ !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK)),
+ !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"@earlyclobber $AT">;
+ def PM#NAME#PP :
+ MMIRR_XX3Form_XYP4_XAB6<
+ opcode, xo, (outs acc:$AT),
+ !con((ins acc:$ATi),
+ !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK))),
+ !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ }
+}
+
+// Defines 4 instructions, masked/unmasked, with a 2-bit PMSK and 4-bit XMSK
+// and YMSK masks. The XO field for the acc/non-acc version is even/odd.
+multiclass ACC_UM_M244_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
+ string asmstr> {
+ defm NAME : ACC_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
+ let Predicates = [MMA, PrefixInstrs] in {
+ def PM#NAME :
+ MMIRR_XX3Form_XY4P2_XAB6<
+ opcode, !or(xo, 0x01), (outs acc:$AT),
+ !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)),
+ !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"@earlyclobber $AT">;
+ def PM#NAME#PP :
+ MMIRR_XX3Form_XY4P2_XAB6<
+ opcode, xo, (outs acc:$AT),
+ !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+ !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ }
+}
+
+// Defines 4 instructions, masked/unmasked, with a 2-bit PMSK and 4-bit XMSK
+// and YMSK masks. The upper nibble of the XO field is 0x4 for the non-acc
+// version and 0x6 for the acc version.
+multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
+ string asmstr> {
+ let Predicates = [MMA] in {
+ def NAME :
+ XX3Form_AT3_XAB6<opcode, xo, (outs acc:$AT), IOL,
+ !strconcat(asmbase#" ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"@earlyclobber $AT">;
+ def PP :
+ XX3Form_AT3_XAB6<
+ opcode, !or(xo, 0x20), (outs acc:$AT), !con((ins acc:$ATi), IOL),
+ !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ }
+ let Predicates = [MMA, PrefixInstrs] in {
+ def PM#NAME :
+ MMIRR_XX3Form_XY4P2_XAB6<
+ opcode, xo, (outs acc:$AT),
+ !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)),
+ !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"@earlyclobber $AT">;
+ def PM#NAME#PP :
+ MMIRR_XX3Form_XY4P2_XAB6<
+ opcode, !or(xo, 0x20), (outs acc:$AT),
+ !con((ins acc:$ATi),
+ !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+ !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ }
+}
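+// For the xvi16ger2 defm further down (xo = 75 = 0x4B), the non-accumulating
+// XVI16GER2 keeps XO = 0x4B while the accumulating XVI16GER2PP gets
+// XO = !or(0x4B, 0x20) = 0x6B, i.e. the upper nibble moves from 0x4 to 0x6.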
+
+// Defines 10 instructions: unmasked and masked forms of the base, pp, pn, np
+// and nn variants, with a 2-bit PMSK and 4-bit XMSK and YMSK masks on the
+// masked forms. The upper nibble of the XO field is ORed with 0x8, 0x4 or
+// 0xC for the operand-negating variants.
+multiclass ACC_NEG_UM_M244_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
+ string asmbase, string asmstr> {
+ defm NAME : ACC_UM_M244_XOEO<opcode, xo, IOL, asmbase, asmstr>;
+ let Predicates = [MMA] in {
+ def PN : XX3Form_AT3_XAB6<
+ opcode, !or(xo, 0x80), (outs acc:$AT), !con((ins acc:$ATi), IOL),
+ !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def NP : XX3Form_AT3_XAB6<
+ opcode, !or(xo, 0x40), (outs acc:$AT), !con((ins acc:$ATi), IOL),
+ !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def NN : XX3Form_AT3_XAB6<
+ opcode, !or(xo, 0xC0), (outs acc:$AT), !con((ins acc:$ATi), IOL),
+ !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ }
+ let Predicates = [MMA, PrefixInstrs] in {
+ def PM#NAME#PN :
+ MMIRR_XX3Form_XY4P2_XAB6<
+ opcode, !or(xo, 0x80), (outs acc:$AT),
+ !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+ !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def PM#NAME#NP :
+ MMIRR_XX3Form_XY4P2_XAB6<
+ opcode, !or(xo, 0x40), (outs acc:$AT),
+ !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+ !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def PM#NAME#NN :
+ MMIRR_XX3Form_XY4P2_XAB6<
+ opcode, !or(xo, 0xC0), (outs acc:$AT),
+ !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+ !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ }
+}
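+// For example, the xvf16ger2 defm further down (xo = 18 = 0x12) produces
+// XVF16GER2 (XO = 0x13), XVF16GER2PP (0x12), XVF16GER2PN (!or(0x12, 0x80) =
+// 0x92), XVF16GER2NP (0x52) and XVF16GER2NN (0xD2), plus the five prefixed
+// pmxvf16ger2* counterparts carrying the XMSK/YMSK/PMSK mask operands.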
+
+// Defines 5 unmasked instructions: the base, pp, pn, np and nn variants.
+// The upper nibble of the XO field is ORed with 0x8, 0x4 or 0xC for the
+// operand-negating variants.
+multiclass ACC_NEG_UM_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
+ string asmbase, string asmstr> {
+ defm NAME : ACC_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
+ let Predicates = [MMA] in {
+ def PN : XX3Form_AT3_XAB6<opcode, !or(xo, 0x80), (outs acc:$AT),
+ !con((ins acc:$ATi), IOL),
+ !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def NP : XX3Form_AT3_XAB6<opcode, !or(xo, 0x40), (outs acc:$AT),
+ !con((ins acc:$ATi), IOL),
+ !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def NN : XX3Form_AT3_XAB6<opcode, !or(xo, 0xC0), (outs acc:$AT),
+ !con((ins acc:$ATi), IOL),
+ !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ }
+}
+
+// Defines 10 instructions: unmasked and masked forms of the base, pp, pn, np
+// and nn variants, with 4-bit XMSK and YMSK masks on the masked forms. The
+// upper nibble of the XO field is ORed with 0x8, 0x4 or 0xC for the
+// operand-negating variants.
+multiclass ACC_NEG_UM_M44_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
+ string asmbase, string asmstr> {
+ defm NAME : ACC_NEG_UM_XOM84C<opcode, xo, IOL, asmbase, asmstr>;
+ let Predicates = [MMA, PrefixInstrs] in {
+ def PM#NAME :
+ MMIRR_XX3Form_XY4_XAB6<
+ opcode, !or(xo, 0x01), (outs acc:$AT),
+ !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK)),
+ !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"@earlyclobber $AT">;
+ def PM#NAME#PP :
+ MMIRR_XX3Form_XY4_XAB6<
+ opcode, xo, (outs acc:$AT),
+ !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
+ !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def PM#NAME#PN :
+ MMIRR_XX3Form_XY4_XAB6<
+ opcode, !or(xo, 0x80), (outs acc:$AT),
+ !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
+ !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def PM#NAME#NP :
+ MMIRR_XX3Form_XY4_XAB6<
+ opcode, !or(xo, 0x40), (outs acc:$AT),
+ !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
+ !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def PM#NAME#NN :
+ MMIRR_XX3Form_XY4_XAB6<
+ opcode, !or(xo, 0xC0), (outs acc:$AT),
+ !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
+ !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ }
+}
+
+// Defines 10 instructions: unmasked and masked forms of the base, pp, pn, np
+// and nn variants, with a 4-bit XMSK and 2-bit YMSK mask on the masked forms.
+// The upper nibble of the XO field is ORed with 0x8, 0x4 or 0xC for the
+// operand-negating variants.
+multiclass ACC_NEG_UM_M42_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
+ string asmbase, string asmstr> {
+ defm NAME : ACC_NEG_UM_XOM84C<opcode, xo, IOL, asmbase, asmstr>;
+ let Predicates = [MMA, PrefixInstrs] in {
+ def PM#NAME :
+ MMIRR_XX3Form_X4Y2_XAB6<
+ opcode, !or(xo, 0x01), (outs acc:$AT),
+ !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK)),
+ !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"@earlyclobber $AT">;
+ def PM#NAME#PP :
+ MMIRR_XX3Form_X4Y2_XAB6<
+ opcode, xo, (outs acc:$AT),
+ !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
+ !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def PM#NAME#PN :
+ MMIRR_XX3Form_X4Y2_XAB6<
+ opcode, !or(xo, 0x80), (outs acc:$AT),
+ !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
+ !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def PM#NAME#NP :
+ MMIRR_XX3Form_X4Y2_XAB6<
+ opcode, !or(xo, 0x40), (outs acc:$AT),
+ !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
+ !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def PM#NAME#NN :
+ MMIRR_XX3Form_X4Y2_XAB6<
+ opcode, !or(xo, 0xC0), (outs acc:$AT),
+ !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
+ !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ }
+}
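+// This variant is used by the xvf64ger family below, whose $XA operand is an
+// even VSX register pair (vsrpevenrc) and whose masked forms take only a
+// 2-bit YMSK.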
+
+// End of class definitions.
+//-----------------------------------------------------------------------------
+
+let Predicates = [MMA] in {
+ def XXMFACC :
+ XForm_AT3<31, 0, 177, (outs acc:$ASo), (ins acc:$AS), "xxmfacc $AS",
+ IIC_VecGeneral,
+ [(set v512i1:$ASo, (int_ppc_mma_xxmfacc v512i1:$AS))]>,
+ RegConstraint<"$ASo = $AS">, NoEncode<"$ASo">;
+ def XXMTACC :
+ XForm_AT3<31, 1, 177, (outs acc:$AT), (ins acc:$ATi), "xxmtacc $AT",
+ IIC_VecGeneral,
+ [(set v512i1:$AT, (int_ppc_mma_xxmtacc v512i1:$ATi))]>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ def KILL_PAIR : PPCPostRAExpPseudo<(outs vsrprc:$XTp), (ins vsrprc:$XSp),
+ "#KILL_PAIR", []>,
+ RegConstraint<"$XTp = $XSp">;
+ def BUILD_UACC : PPCPostRAExpPseudo<(outs acc:$AT), (ins uacc:$AS),
+ "#BUILD_UACC $AT, $AS", []>;
+ // We define XXSETACCZ as rematerializable to undo CSE of that intrinsic in
+ // the backend. We avoid CSE here because it generates a copy of the acc
+ // register and this copy is more expensive than calling the intrinsic again.
+ let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
+ def XXSETACCZ :
+ XForm_AT3<31, 3, 177, (outs acc:$AT), (ins), "xxsetaccz $AT", IIC_VecGeneral,
+ [(set v512i1:$AT, (int_ppc_mma_xxsetaccz))]>;
+ }
+ def XVI8GER4SPP :
+ XX3Form_AT3_XAB6<59, 99, (outs acc:$AT), (ins acc:$ATi, vsrc:$XA, vsrc:$XB),
+ "xvi8ger4spp $AT, $XA, $XB", IIC_VecGeneral, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ let mayStore = 1 in {
+ def SPILL_ACC: PPCEmitTimePseudo<(outs), (ins acc:$AT, memrix16:$dst),
+ "#SPILL_ACC", []>;
+ def SPILL_UACC: PPCEmitTimePseudo<(outs), (ins uacc:$AT, memrix16:$dst),
+ "#SPILL_UACC", []>;
+ }
+ let mayLoad = 1, hasSideEffects = 0 in {
+ def RESTORE_ACC: PPCEmitTimePseudo<(outs acc:$AT), (ins memrix16:$src),
+ "#RESTORE_ACC", []>;
+ def RESTORE_UACC: PPCEmitTimePseudo<(outs uacc:$AT), (ins memrix16:$src),
+ "#RESTORE_UACC", []>;
+ }
+}
+
+let Predicates = [MMA, PrefixInstrs] in {
+ def PMXVI8GER4SPP :
+ MMIRR_XX3Form_XYP4_XAB6<59, 99, (outs acc:$AT),
+ (ins acc:$ATi, vsrc:$XA,vsrc:$XB, u4imm:$XMSK,
+ u4imm:$YMSK, u4imm:$PMSK),
+ "pmxvi8ger4spp $AT, $XA, $XB, $XMSK, $YMSK, $PMSK",
+ IIC_VecGeneral, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+}
+
+// MMA accumulating/non-accumulating instructions.
+//------------------------------------------------------------------------------
+
+// XVBF16GER2, XVBF16GER2PP, XVBF16GER2PN, XVBF16GER2NP, XVBF16GER2NN
+// PMXVBF16GER2, PMXVBF16GER2PP, PMXVBF16GER2PN, PMXVBF16GER2NP, PMXVBF16GER2NN
+defm XVBF16GER2 : ACC_NEG_UM_M244_XOM84C<59, 50, (ins vsrc:$XA, vsrc:$XB),
+ "xvbf16ger2", "$AT, $XA, $XB">;
+
+// XVI4GER8, XVI4GER8PP, PMXVI4GER8, PMXVI4GER8PP
+defm XVI4GER8 : ACC_UM_M844_XOEO<59, 34, (ins vsrc:$XA, vsrc:$XB),
+ "xvi4ger8", "$AT, $XA, $XB">;
+
+// XVI8GER4, XVI8GER4PP, PMXVI8GER4, PMXVI8GER4PP
+defm XVI8GER4 : ACC_UM_M444_XOEO<59, 2, (ins vsrc:$XA, vsrc:$XB),
+ "xvi8ger4", "$AT, $XA, $XB">;
+
+// XVI16GER2, XVI16GER2PP, PMXVI16GER2, PMXVI16GER2PP
+defm XVI16GER2 : ACC_UM_M244_XO46<59, 75, (ins vsrc:$XA, vsrc:$XB),
+ "xvi16ger2", "$AT, $XA, $XB">;
+
+// XVI16GER2S, XVI16GER2SPP, PMXVI16GER2S, PMXVI16GER2SPP
+defm XVI16GER2S : ACC_UM_M244_XOEO<59, 42, (ins vsrc:$XA, vsrc:$XB),
+ "xvi16ger2s", "$AT, $XA, $XB">;
+
+// XVF16GER2, XVF16GER2PP, XVF16GER2PN, XVF16GER2NP, XVF16GER2NN
+// PMXVF16GER2, PMXVF16GER2PP, PMXVF16GER2PN, PMXVF16GER2NP, PMXVF16GER2NN
+defm XVF16GER2 : ACC_NEG_UM_M244_XOM84C<59, 18, (ins vsrc:$XA, vsrc:$XB),
+ "xvf16ger2", "$AT, $XA, $XB">;
+
+// XVF32GER, XVF32GERPP, XVF32GERPN, XVF32GERNP, XVF32GERNN
+// PMXVF32GER, PMXVF32GERPP, PMXVF32GERPN, PMXVF32GERNP, PMXVF32GERNN
+defm XVF32GER : ACC_NEG_UM_M44_XOM84C<59, 26, (ins vsrc:$XA, vsrc:$XB),
+ "xvf32ger", "$AT, $XA, $XB">;
+
+// XVF64GER, XVF64GERPP, XVF64GERPN, XVF64GERNP, XVF64GERNN
+// PMXVF64GER, PMXVF64GERPP, PMXVF64GERPN, PMXVF64GERNP, PMXVF64GERNN
+defm XVF64GER : ACC_NEG_UM_M42_XOM84C<59, 58, (ins vsrpevenrc:$XA, vsrc:$XB),
+ "xvf64ger", "$AT, $XA, $XB">;
+//------------------------------------------------------------------------------
+
+// MMA Intrinsics
+let Predicates = [MMA] in {
+ def : Pat<(v512i1 (int_ppc_mma_xvi4ger8 v16i8:$XA, v16i8:$XB)),
+ (XVI4GER8 RCCp.AToVSRC, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvi4ger8pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+ (XVI4GER8PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+
+ def : Pat<(v512i1 (int_ppc_mma_xvi8ger4 v16i8:$XA, v16i8:$XB)),
+ (XVI8GER4 RCCp.AToVSRC, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvi8ger4pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+ (XVI8GER4PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+
+ def : Pat<(v512i1 (int_ppc_mma_xvi16ger2s v16i8:$XA, v16i8:$XB)),
+ (XVI16GER2S RCCp.AToVSRC, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvi16ger2spp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+ (XVI16GER2SPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+
+ def : Pat<(v512i1 (int_ppc_mma_xvf16ger2 v16i8:$XA, v16i8:$XB)),
+ (XVF16GER2 RCCp.AToVSRC, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+ (XVF16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+ (XVF16GER2PN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+ (XVF16GER2NP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+ (XVF16GER2NN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+
+ def : Pat<(v512i1 (int_ppc_mma_xvf32ger v16i8:$XA, v16i8:$XB)),
+ (XVF32GER RCCp.AToVSRC, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvf32gerpp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+ (XVF32GERPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvf32gerpn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+ (XVF32GERPN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvf32gernp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+ (XVF32GERNP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvf32gernn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+ (XVF32GERNN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvf64ger v256i1:$XA, v16i8:$XB)),
+ (XVF64GER $XA, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvf64gerpp v512i1:$ATi, v256i1:$XA, v16i8:$XB)),
+ (XVF64GERPP $ATi, $XA, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvf64gerpn v512i1:$ATi, v256i1:$XA, v16i8:$XB)),
+ (XVF64GERPN $ATi, $XA, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvf64gernp v512i1:$ATi, v256i1:$XA, v16i8:$XB)),
+ (XVF64GERNP $ATi, $XA, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvf64gernn v512i1:$ATi, v256i1:$XA, v16i8:$XB)),
+ (XVF64GERNN $ATi, $XA, RCCp.BToVSRC)>;
+
+ def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2 v16i8:$XA, v16i8:$XB)),
+ (XVBF16GER2 RCCp.AToVSRC, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+ (XVBF16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+ (XVBF16GER2PN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+ (XVBF16GER2NP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+ (XVBF16GER2NN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvi16ger2 v16i8:$XA, v16i8:$XB)),
+ (XVI16GER2 RCCp.AToVSRC, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvi16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+ (XVI16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+ def : Pat<(v512i1 (int_ppc_mma_xvi8ger4spp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
+ (XVI8GER4SPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
+}
+
+// MMA Intrinsics
+let Predicates = [MMA, PrefixInstrs] in {
+ def : Pat<(v512i1 (int_ppc_mma_pmxvi4ger8 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk8Imm:$PMSK)),
+ (PMXVI4GER8 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk8Imm:$PMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvi4ger8pp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
+ Msk4Imm:$XMSK, Msk4Imm:$YMSK,
+ Msk8Imm:$PMSK)),
+ (PMXVI4GER8PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk8Imm:$PMSK)>;
+
+ def : Pat<(v512i1 (int_ppc_mma_pmxvi8ger4 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk4Imm:$PMSK)),
+ (PMXVI8GER4 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk4Imm:$PMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvi8ger4pp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
+ Msk4Imm:$XMSK, Msk4Imm:$YMSK,
+ Msk4Imm:$PMSK)),
+ (PMXVI8GER4PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk4Imm:$PMSK)>;
+
+ def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2s v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
+ (PMXVI16GER2S RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2spp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
+ Msk4Imm:$XMSK, Msk4Imm:$YMSK,
+ Msk2Imm:$PMSK)),
+ (PMXVI16GER2SPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
+ (PMXVF16GER2 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
+ Msk4Imm:$XMSK, Msk4Imm:$YMSK,
+ Msk2Imm:$PMSK)),
+ (PMXVF16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
+ Msk4Imm:$XMSK, Msk4Imm:$YMSK,
+ Msk2Imm:$PMSK)),
+ (PMXVF16GER2PN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB,
+ Msk4Imm:$XMSK, Msk4Imm:$YMSK,
+ Msk2Imm:$PMSK)),
+ (PMXVF16GER2NP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
+ Msk4Imm:$XMSK, Msk4Imm:$YMSK,
+ Msk2Imm:$PMSK)),
+ (PMXVF16GER2NN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+
+ def : Pat<(v512i1 (int_ppc_mma_pmxvf32ger v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK)),
+ (PMXVF32GER RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvf32gerpp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
+ Msk4Imm:$XMSK, Msk4Imm:$YMSK)),
+ (PMXVF32GERPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvf32gerpn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
+ Msk4Imm:$XMSK, Msk4Imm:$YMSK)),
+ (PMXVF32GERPN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvf32gernp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
+ Msk4Imm:$XMSK, Msk4Imm:$YMSK)),
+ (PMXVF32GERNP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvf32gernn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
+ Msk4Imm:$XMSK, Msk4Imm:$YMSK)),
+ (PMXVF32GERNN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK)>;
+
+ def : Pat<(v512i1 (int_ppc_mma_pmxvf64ger v256i1:$XA, v16i8:$XB, Msk4Imm:$XMSK,
+ Msk2Imm:$YMSK)),
+ (PMXVF64GER $XA, RCCp.BToVSRC, Msk4Imm:$XMSK, Msk2Imm:$YMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvf64gerpp v512i1:$ATi, v256i1:$XA, v16i8:$XB,
+ Msk4Imm:$XMSK, Msk2Imm:$YMSK)),
+ (PMXVF64GERPP $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk2Imm:$YMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvf64gerpn v512i1:$ATi, v256i1:$XA, v16i8:$XB,
+ Msk4Imm:$XMSK, Msk2Imm:$YMSK)),
+ (PMXVF64GERPN $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk2Imm:$YMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvf64gernp v512i1:$ATi, v256i1:$XA, v16i8:$XB,
+ Msk4Imm:$XMSK, Msk2Imm:$YMSK)),
+ (PMXVF64GERNP $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk2Imm:$YMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvf64gernn v512i1:$ATi, v256i1:$XA, v16i8:$XB,
+ Msk4Imm:$XMSK, Msk2Imm:$YMSK)),
+ (PMXVF64GERNN $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk2Imm:$YMSK)>;
+
+ def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
+ (PMXVBF16GER2 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
+ Msk4Imm:$XMSK, Msk4Imm:$YMSK,
+ Msk2Imm:$PMSK)),
+ (PMXVBF16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
+ Msk4Imm:$XMSK, Msk4Imm:$YMSK,
+ Msk2Imm:$PMSK)),
+ (PMXVBF16GER2PN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB,
+ Msk4Imm:$XMSK, Msk4Imm:$YMSK,
+ Msk2Imm:$PMSK)),
+ (PMXVBF16GER2NP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
+ Msk4Imm:$XMSK, Msk4Imm:$YMSK,
+ Msk2Imm:$PMSK)),
+ (PMXVBF16GER2NN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
+ (PMXVI16GER2 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvi8ger4spp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
+ Msk4Imm:$XMSK, Msk4Imm:$YMSK,
+ Msk2Imm:$PMSK)),
+ (PMXVI8GER4SPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+ def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
+ Msk4Imm:$XMSK, Msk4Imm:$YMSK,
+ Msk2Imm:$PMSK)),
+ (PMXVI16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+}
+
+def Concats {
+ dag VecsToVecPair0 =
+ (v256i1 (INSERT_SUBREG
+ (INSERT_SUBREG (IMPLICIT_DEF), $vs0, sub_vsx1),
+ $vs1, sub_vsx0));
+ dag VecsToVecPair1 =
+ (v256i1 (INSERT_SUBREG
+ (INSERT_SUBREG (IMPLICIT_DEF), $vs2, sub_vsx1),
+ $vs3, sub_vsx0));
+ dag VecsToVecQuad =
+ (BUILD_UACC (INSERT_SUBREG
+ (INSERT_SUBREG (v512i1 (IMPLICIT_DEF)),
+ (KILL_PAIR VecsToVecPair0), sub_pair0),
+ (KILL_PAIR VecsToVecPair1), sub_pair1));
+}
+
+def Extracts {
+ dag Pair0 = (v256i1 (EXTRACT_SUBREG $v, sub_pair0));
+ dag Pair1 = (v256i1 (EXTRACT_SUBREG $v, sub_pair1));
+ dag Vec0 = (v4i32 (EXTRACT_SUBREG Pair0, sub_vsx0));
+ dag Vec1 = (v4i32 (EXTRACT_SUBREG Pair0, sub_vsx1));
+ dag Vec2 = (v4i32 (EXTRACT_SUBREG Pair1, sub_vsx0));
+ dag Vec3 = (v4i32 (EXTRACT_SUBREG Pair1, sub_vsx1));
+}
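+// Taken together, Concats and Extracts describe the accumulator layout used
+// by the patterns below: a v512i1 accumulator is two v256i1 pairs
+// (sub_pair0/sub_pair1) and each pair is two 128-bit VSX registers
+// (sub_vsx0/sub_vsx1). For instance, the (i64 2) case of PPCAccExtractVsx
+// below expands to:
+//   (v4i32 (EXTRACT_SUBREG (v256i1 (EXTRACT_SUBREG $v, sub_pair1)),
+//                          sub_vsx0))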
+
+let Predicates = [MMA] in {
+ def : Pat<(v512i1 (PPCAccBuild v4i32:$vs1, v4i32:$vs0, v4i32:$vs3, v4i32:$vs2)),
+ (XXMTACC Concats.VecsToVecQuad)>;
+ def : Pat<(v512i1 (int_ppc_mma_assemble_acc v16i8:$vs1, v16i8:$vs0,
+ v16i8:$vs3, v16i8:$vs2)),
+ (XXMTACC Concats.VecsToVecQuad)>;
+ def : Pat<(v512i1 (PPCxxmfacc v512i1:$AS)), (XXMFACC acc:$AS)>;
+ def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 0))),
+ Extracts.Vec0>;
+ def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 1))),
+ Extracts.Vec1>;
+ def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 2))),
+ Extracts.Vec2>;
+ def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 3))),
+ Extracts.Vec3>;
+}
+
+let Predicates = [PairedVectorMemops] in {
+ def : Pat<(v256i1 (PPCPairBuild v4i32:$vs1, v4i32:$vs0)),
+ Concats.VecsToVecPair0>;
+ def : Pat<(v256i1 (int_ppc_vsx_assemble_pair v16i8:$vs1, v16i8:$vs0)),
+ Concats.VecsToVecPair0>;
+ def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, (i64 0))),
+ (v4i32 (EXTRACT_SUBREG $v, sub_vsx0))>;
+ def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, (i64 1))),
+ (v4i32 (EXTRACT_SUBREG $v, sub_vsx1))>;
+}
+
+let mayLoad = 1, mayStore = 0, Predicates = [PairedVectorMemops] in {
+ def LXVP : DQForm_XTp5_RA17_MEM<6, 0, (outs vsrprc:$XTp),
+ (ins memrix16:$DQ_RA), "lxvp $XTp, $DQ_RA",
+ IIC_LdStLFD, []>;
+ def LXVPX : XForm_XTp5_XAB5<31, 333, (outs vsrprc:$XTp), (ins memrr:$src),
+ "lxvpx $XTp, $src", IIC_LdStLFD,
+ []>;
+}
+
+let mayLoad = 0, mayStore = 1, Predicates = [PairedVectorMemops] in {
+ def STXVP : DQForm_XTp5_RA17_MEM<6, 1, (outs), (ins vsrprc:$XTp,
+ memrix16:$DQ_RA), "stxvp $XTp, $DQ_RA",
+ IIC_LdStLFD, []>;
+ def STXVPX : XForm_XTp5_XAB5<31, 461, (outs), (ins vsrprc:$XTp, memrr:$dst),
+ "stxvpx $XTp, $dst", IIC_LdStLFD,
+ []>;
+}
+
+let mayLoad = 1, mayStore = 0, Predicates = [PairedVectorMemops, PrefixInstrs] in {
+ defm PLXVP :
+ 8LS_DForm_R_XTp5_SI34_MEM_p<1, 58, (outs vsrprc:$XTp), (ins memri34:$D_RA),
+ (ins memri34_pcrel:$D_RA), "plxvp $XTp, $D_RA",
+ IIC_LdStLFD>;
+}
+
+let mayLoad = 0, mayStore = 1, Predicates = [PairedVectorMemops, PrefixInstrs] in {
+ defm PSTXVP :
+ 8LS_DForm_R_XTp5_SI34_MEM_p<1, 62, (outs), (ins vsrprc:$XTp, memri34:$D_RA),
+ (ins vsrprc:$XTp, memri34_pcrel:$D_RA),
+ "pstxvp $XTp, $D_RA", IIC_LdStLFD>;
+}
+
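+// The intrinsic patterns below pick the DQ-form lxvp/stxvp for iaddrX16
+// addresses, the indexed lxvpx/stxvpx for xaddrX16 addresses, and the
+// prefixed plxvp/pstxvp for the wider iaddrX34 displacements.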
+let Predicates = [PairedVectorMemops] in {
+ // Intrinsics for Paired Vector Loads.
+ def : Pat<(v256i1 (int_ppc_vsx_lxvp iaddrX16:$src)), (LXVP memrix16:$src)>;
+ def : Pat<(v256i1 (int_ppc_vsx_lxvp xaddrX16:$src)), (LXVPX xaddrX16:$src)>;
+ let Predicates = [PairedVectorMemops, PrefixInstrs] in {
+ def : Pat<(v256i1 (int_ppc_vsx_lxvp iaddrX34:$src)), (PLXVP memri34:$src)>;
+ }
+ // Intrinsics for Paired Vector Stores.
+ def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, iaddrX16:$dst),
+ (STXVP $XSp, memrix16:$dst)>;
+ def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, xaddrX16:$dst),
+ (STXVPX $XSp, xaddrX16:$dst)>;
+ let Predicates = [PairedVectorMemops, PrefixInstrs] in {
+ def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, iaddrX34:$dst),
+ (PSTXVP $XSp, memri34:$dst)>;
+ }
+}
+
// TODO: We have an added complexity of 500 here. This is only a temporary
// solution to have tablegen consider these patterns first. The way we do
// addressing for PowerPC is complex depending on available D form, X form, or
@@ -753,6 +1836,13 @@ let Predicates = [PCRelativeMemops], AddedComplexity = 500 in {
// If the PPCmatpcreladdr node is not caught by any other pattern it should be
// caught here and turned into a paddi instruction to materialize the address.
def : Pat<(PPCmatpcreladdr pcreladdr:$addr), (PADDI8pc 0, $addr)>;
+ // The PPCtlsdynamatpcreladdr node is used by the TLS dynamic models to
+ // materialize a TLS global address with a paddi instruction.
+ def : Pat<(PPCtlsdynamatpcreladdr pcreladdr:$addr), (PADDI8pc 0, $addr)>;
+ // The PPCtlslocalexecmataddr node is used by the TLS local-exec model to
+ // materialize a TLS global address with a paddi instruction.
+ def : Pat<(PPCaddTls i64:$in, (PPCtlslocalexecmataddr tglobaltlsaddr:$addr)),
+ (PADDI8 $in, $addr)>;
}
let Predicates = [PrefixInstrs] in {
@@ -797,6 +1887,26 @@ let Predicates = [PrefixInstrs] in {
}
let Predicates = [IsISA3_1] in {
+ def SETBC : XForm_XT5_BI5<31, 384, (outs gprc:$RT), (ins crbitrc:$BI),
+ "setbc $RT, $BI", IIC_IntCompare, []>;
+ def SETBCR : XForm_XT5_BI5<31, 416, (outs gprc:$RT), (ins crbitrc:$BI),
+ "setbcr $RT, $BI", IIC_IntCompare, []>;
+ def SETNBC : XForm_XT5_BI5<31, 448, (outs gprc:$RT), (ins crbitrc:$BI),
+ "setnbc $RT, $BI", IIC_IntCompare, []>;
+ def SETNBCR : XForm_XT5_BI5<31, 480, (outs gprc:$RT), (ins crbitrc:$BI),
+ "setnbcr $RT, $BI", IIC_IntCompare, []>;
+
+ let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+ def SETBC8 : XForm_XT5_BI5<31, 384, (outs g8rc:$RT), (ins crbitrc:$BI),
+ "setbc $RT, $BI", IIC_IntCompare, []>;
+ def SETBCR8 : XForm_XT5_BI5<31, 416, (outs g8rc:$RT), (ins crbitrc:$BI),
+ "setbcr $RT, $BI", IIC_IntCompare, []>;
+ def SETNBC8 : XForm_XT5_BI5<31, 448, (outs g8rc:$RT), (ins crbitrc:$BI),
+ "setnbc $RT, $BI", IIC_IntCompare, []>;
+ def SETNBCR8 : XForm_XT5_BI5<31, 480, (outs g8rc:$RT), (ins crbitrc:$BI),
+ "setnbcr $RT, $BI", IIC_IntCompare, []>;
+ }
+
def VSLDBI : VNForm_VTAB5_SD3<22, 0, (outs vrrc:$VRT),
(ins vrrc:$VRA, vrrc:$VRB, u3imm:$SH),
"vsldbi $VRT, $VRA, $VRB, $SH",
@@ -813,87 +1923,254 @@ let Predicates = [IsISA3_1] in {
(int_ppc_altivec_vsrdbi v16i8:$VRA,
v16i8:$VRB,
i32:$SH))]>;
- def VINSW :
- VXForm_VRT5_UIM5_RB5_ins<207, "vinsw",
- [(set v4i32:$vD,
- (int_ppc_altivec_vinsw v4i32:$vDi, i64:$rB,
- timm:$UIM))]>;
+ defm VSTRIBR : VXForm_VTB5_RCr<13, 1, (outs vrrc:$vT), (ins vrrc:$vB),
+ "vstribr", "$vT, $vB", IIC_VecGeneral,
+ [(set v16i8:$vT,
+ (int_ppc_altivec_vstribr v16i8:$vB))]>;
+ defm VSTRIBL : VXForm_VTB5_RCr<13, 0, (outs vrrc:$vT), (ins vrrc:$vB),
+ "vstribl", "$vT, $vB", IIC_VecGeneral,
+ [(set v16i8:$vT,
+ (int_ppc_altivec_vstribl v16i8:$vB))]>;
+ defm VSTRIHR : VXForm_VTB5_RCr<13, 3, (outs vrrc:$vT), (ins vrrc:$vB),
+ "vstrihr", "$vT, $vB", IIC_VecGeneral,
+ [(set v8i16:$vT,
+ (int_ppc_altivec_vstrihr v8i16:$vB))]>;
+ defm VSTRIHL : VXForm_VTB5_RCr<13, 2, (outs vrrc:$vT), (ins vrrc:$vB),
+ "vstrihl", "$vT, $vB", IIC_VecGeneral,
+ [(set v8i16:$vT,
+ (int_ppc_altivec_vstrihl v8i16:$vB))]>;
+ def VINSW :
+ VXForm_1<207, (outs vrrc:$vD), (ins vrrc:$vDi, u4imm:$UIM, gprc:$rB),
+ "vinsw $vD, $rB, $UIM", IIC_VecGeneral,
+ [(set v4i32:$vD,
+ (int_ppc_altivec_vinsw v4i32:$vDi, i32:$rB, timm:$UIM))]>,
+ RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
def VINSD :
- VXForm_VRT5_UIM5_RB5_ins<463, "vinsd",
- [(set v2i64:$vD,
- (int_ppc_altivec_vinsd v2i64:$vDi, i64:$rB,
- timm:$UIM))]>;
+ VXForm_1<463, (outs vrrc:$vD), (ins vrrc:$vDi, u4imm:$UIM, g8rc:$rB),
+ "vinsd $vD, $rB, $UIM", IIC_VecGeneral,
+ [(set v2i64:$vD,
+ (int_ppc_altivec_vinsd v2i64:$vDi, i64:$rB, timm:$UIM))]>,
+ RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
def VINSBVLX :
VXForm_VTB5_RA5_ins<15, "vinsbvlx",
[(set v16i8:$vD,
- (int_ppc_altivec_vinsbvlx v16i8:$vDi, i64:$rA,
+ (int_ppc_altivec_vinsbvlx v16i8:$vDi, i32:$rA,
v16i8:$vB))]>;
def VINSBVRX :
VXForm_VTB5_RA5_ins<271, "vinsbvrx",
[(set v16i8:$vD,
- (int_ppc_altivec_vinsbvrx v16i8:$vDi, i64:$rA,
+ (int_ppc_altivec_vinsbvrx v16i8:$vDi, i32:$rA,
v16i8:$vB))]>;
def VINSHVLX :
VXForm_VTB5_RA5_ins<79, "vinshvlx",
[(set v8i16:$vD,
- (int_ppc_altivec_vinshvlx v8i16:$vDi, i64:$rA,
+ (int_ppc_altivec_vinshvlx v8i16:$vDi, i32:$rA,
v8i16:$vB))]>;
def VINSHVRX :
VXForm_VTB5_RA5_ins<335, "vinshvrx",
[(set v8i16:$vD,
- (int_ppc_altivec_vinshvrx v8i16:$vDi, i64:$rA,
+ (int_ppc_altivec_vinshvrx v8i16:$vDi, i32:$rA,
v8i16:$vB))]>;
def VINSWVLX :
VXForm_VTB5_RA5_ins<143, "vinswvlx",
[(set v4i32:$vD,
- (int_ppc_altivec_vinswvlx v4i32:$vDi, i64:$rA,
+ (int_ppc_altivec_vinswvlx v4i32:$vDi, i32:$rA,
v4i32:$vB))]>;
def VINSWVRX :
VXForm_VTB5_RA5_ins<399, "vinswvrx",
[(set v4i32:$vD,
- (int_ppc_altivec_vinswvrx v4i32:$vDi, i64:$rA,
+ (int_ppc_altivec_vinswvrx v4i32:$vDi, i32:$rA,
v4i32:$vB))]>;
def VINSBLX :
VXForm_VRT5_RAB5_ins<527, "vinsblx",
[(set v16i8:$vD,
- (int_ppc_altivec_vinsblx v16i8:$vDi, i64:$rA,
- i64:$rB))]>;
+ (int_ppc_altivec_vinsblx v16i8:$vDi, i32:$rA,
+ i32:$rB))]>;
def VINSBRX :
VXForm_VRT5_RAB5_ins<783, "vinsbrx",
[(set v16i8:$vD,
- (int_ppc_altivec_vinsbrx v16i8:$vDi, i64:$rA,
- i64:$rB))]>;
+ (int_ppc_altivec_vinsbrx v16i8:$vDi, i32:$rA,
+ i32:$rB))]>;
def VINSHLX :
VXForm_VRT5_RAB5_ins<591, "vinshlx",
[(set v8i16:$vD,
- (int_ppc_altivec_vinshlx v8i16:$vDi, i64:$rA,
- i64:$rB))]>;
+ (int_ppc_altivec_vinshlx v8i16:$vDi, i32:$rA,
+ i32:$rB))]>;
def VINSHRX :
VXForm_VRT5_RAB5_ins<847, "vinshrx",
[(set v8i16:$vD,
- (int_ppc_altivec_vinshrx v8i16:$vDi, i64:$rA,
- i64:$rB))]>;
+ (int_ppc_altivec_vinshrx v8i16:$vDi, i32:$rA,
+ i32:$rB))]>;
def VINSWLX :
VXForm_VRT5_RAB5_ins<655, "vinswlx",
[(set v4i32:$vD,
- (int_ppc_altivec_vinswlx v4i32:$vDi, i64:$rA,
- i64:$rB))]>;
+ (int_ppc_altivec_vinswlx v4i32:$vDi, i32:$rA,
+ i32:$rB))]>;
def VINSWRX :
VXForm_VRT5_RAB5_ins<911, "vinswrx",
[(set v4i32:$vD,
- (int_ppc_altivec_vinswrx v4i32:$vDi, i64:$rA,
- i64:$rB))]>;
+ (int_ppc_altivec_vinswrx v4i32:$vDi, i32:$rA,
+ i32:$rB))]>;
def VINSDLX :
- VXForm_VRT5_RAB5_ins<719, "vinsdlx",
- [(set v2i64:$vD,
- (int_ppc_altivec_vinsdlx v2i64:$vDi, i64:$rA,
- i64:$rB))]>;
+ VXForm_1<719, (outs vrrc:$vD), (ins vrrc:$vDi, g8rc:$rA, g8rc:$rB),
+ "vinsdlx $vD, $rA, $rB", IIC_VecGeneral,
+ [(set v2i64:$vD,
+ (int_ppc_altivec_vinsdlx v2i64:$vDi, i64:$rA, i64:$rB))]>,
+ RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
def VINSDRX :
- VXForm_VRT5_RAB5_ins<975, "vinsdrx",
- [(set v2i64:$vD,
- (int_ppc_altivec_vinsdrx v2i64:$vDi, i64:$rA,
- i64:$rB))]>;
-
+ VXForm_1<975, (outs vrrc:$vD), (ins vrrc:$vDi, g8rc:$rA, g8rc:$rB),
+ "vinsdrx $vD, $rA, $rB", IIC_VecGeneral,
+ [(set v2i64:$vD,
+ (int_ppc_altivec_vinsdrx v2i64:$vDi, i64:$rA, i64:$rB))]>,
+ RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
+ def VEXTRACTBM : VXForm_RD5_XO5_RS5<1602, 8, (outs gprc:$rD), (ins vrrc:$vB),
+ "vextractbm $rD, $vB", IIC_VecGeneral,
+ [(set i32:$rD,
+ (int_ppc_altivec_vextractbm v16i8:$vB))]>;
+ def VEXTRACTHM : VXForm_RD5_XO5_RS5<1602, 9, (outs gprc:$rD), (ins vrrc:$vB),
+ "vextracthm $rD, $vB", IIC_VecGeneral,
+ [(set i32:$rD,
+ (int_ppc_altivec_vextracthm v8i16:$vB))]>;
+ def VEXTRACTWM : VXForm_RD5_XO5_RS5<1602, 10, (outs gprc:$rD), (ins vrrc:$vB),
+ "vextractwm $rD, $vB", IIC_VecGeneral,
+ [(set i32:$rD,
+ (int_ppc_altivec_vextractwm v4i32:$vB))]>;
+ def VEXTRACTDM : VXForm_RD5_XO5_RS5<1602, 11, (outs gprc:$rD), (ins vrrc:$vB),
+ "vextractdm $rD, $vB", IIC_VecGeneral,
+ [(set i32:$rD,
+ (int_ppc_altivec_vextractdm v2i64:$vB))]>;
+ def VEXTRACTQM : VXForm_RD5_XO5_RS5<1602, 12, (outs gprc:$rD), (ins vrrc:$vB),
+ "vextractqm $rD, $vB", IIC_VecGeneral,
+ [(set i32:$rD,
+ (int_ppc_altivec_vextractqm v1i128:$vB))]>;
+ def VEXPANDBM : VXForm_RD5_XO5_RS5<1602, 0, (outs vrrc:$vD), (ins vrrc:$vB),
+ "vexpandbm $vD, $vB", IIC_VecGeneral,
+ [(set v16i8:$vD, (int_ppc_altivec_vexpandbm
+ v16i8:$vB))]>;
+ def VEXPANDHM : VXForm_RD5_XO5_RS5<1602, 1, (outs vrrc:$vD), (ins vrrc:$vB),
+ "vexpandhm $vD, $vB", IIC_VecGeneral,
+ [(set v8i16:$vD, (int_ppc_altivec_vexpandhm
+ v8i16:$vB))]>;
+ def VEXPANDWM : VXForm_RD5_XO5_RS5<1602, 2, (outs vrrc:$vD), (ins vrrc:$vB),
+ "vexpandwm $vD, $vB", IIC_VecGeneral,
+ [(set v4i32:$vD, (int_ppc_altivec_vexpandwm
+ v4i32:$vB))]>;
+ def VEXPANDDM : VXForm_RD5_XO5_RS5<1602, 3, (outs vrrc:$vD), (ins vrrc:$vB),
+ "vexpanddm $vD, $vB", IIC_VecGeneral,
+ [(set v2i64:$vD, (int_ppc_altivec_vexpanddm
+ v2i64:$vB))]>;
+ def VEXPANDQM : VXForm_RD5_XO5_RS5<1602, 4, (outs vrrc:$vD), (ins vrrc:$vB),
+ "vexpandqm $vD, $vB", IIC_VecGeneral,
+ [(set v1i128:$vD, (int_ppc_altivec_vexpandqm
+ v1i128:$vB))]>;
+ def MTVSRBM : VXForm_RD5_XO5_RS5<1602, 16, (outs vrrc:$vD), (ins g8rc:$rB),
+ "mtvsrbm $vD, $rB", IIC_VecGeneral,
+ [(set v16i8:$vD,
+ (int_ppc_altivec_mtvsrbm i64:$rB))]>;
+ def MTVSRHM : VXForm_RD5_XO5_RS5<1602, 17, (outs vrrc:$vD), (ins g8rc:$rB),
+ "mtvsrhm $vD, $rB", IIC_VecGeneral,
+ [(set v8i16:$vD,
+ (int_ppc_altivec_mtvsrhm i64:$rB))]>;
+ def MTVSRWM : VXForm_RD5_XO5_RS5<1602, 18, (outs vrrc:$vD), (ins g8rc:$rB),
+ "mtvsrwm $vD, $rB", IIC_VecGeneral,
+ [(set v4i32:$vD,
+ (int_ppc_altivec_mtvsrwm i64:$rB))]>;
+ def MTVSRDM : VXForm_RD5_XO5_RS5<1602, 19, (outs vrrc:$vD), (ins g8rc:$rB),
+ "mtvsrdm $vD, $rB", IIC_VecGeneral,
+ [(set v2i64:$vD,
+ (int_ppc_altivec_mtvsrdm i64:$rB))]>;
+ def MTVSRQM : VXForm_RD5_XO5_RS5<1602, 20, (outs vrrc:$vD), (ins g8rc:$rB),
+ "mtvsrqm $vD, $rB", IIC_VecGeneral,
+ [(set v1i128:$vD,
+ (int_ppc_altivec_mtvsrqm i64:$rB))]>;
+ def MTVSRBMI : DXForm<4, 10, (outs vrrc:$vD), (ins u16imm64:$D),
+ "mtvsrbmi $vD, $D", IIC_VecGeneral,
+ [(set v16i8:$vD,
+ (int_ppc_altivec_mtvsrbm imm:$D))]>;
+ def VCNTMBB : VXForm_RD5_MP_VB5<1602, 12, (outs g8rc:$rD),
+ (ins vrrc:$vB, u1imm:$MP),
+ "vcntmbb $rD, $vB, $MP", IIC_VecGeneral,
+ [(set i64:$rD, (int_ppc_altivec_vcntmbb
+ v16i8:$vB, timm:$MP))]>;
+ def VCNTMBH : VXForm_RD5_MP_VB5<1602, 13, (outs g8rc:$rD),
+ (ins vrrc:$vB, u1imm:$MP),
+ "vcntmbh $rD, $vB, $MP", IIC_VecGeneral,
+ [(set i64:$rD, (int_ppc_altivec_vcntmbh
+ v8i16:$vB, timm:$MP))]>;
+ def VCNTMBW : VXForm_RD5_MP_VB5<1602, 14, (outs g8rc:$rD),
+ (ins vrrc:$vB, u1imm:$MP),
+ "vcntmbw $rD, $vB, $MP", IIC_VecGeneral,
+ [(set i64:$rD, (int_ppc_altivec_vcntmbw
+ v4i32:$vB, timm:$MP))]>;
+ def VCNTMBD : VXForm_RD5_MP_VB5<1602, 15, (outs g8rc:$rD),
+ (ins vrrc:$vB, u1imm:$MP),
+ "vcntmbd $rD, $vB, $MP", IIC_VecGeneral,
+ [(set i64:$rD, (int_ppc_altivec_vcntmbd
+ v2i64:$vB, timm:$MP))]>;
+ def VEXTDUBVLX : VAForm_1a<24, (outs vrrc:$vD),
+ (ins vrrc:$vA, vrrc:$vB, gprc:$rC),
+ "vextdubvlx $vD, $vA, $vB, $rC",
+ IIC_VecGeneral,
+ [(set v2i64:$vD,
+ (int_ppc_altivec_vextdubvlx v16i8:$vA,
+ v16i8:$vB,
+ i32:$rC))]>;
+ def VEXTDUBVRX : VAForm_1a<25, (outs vrrc:$vD),
+ (ins vrrc:$vA, vrrc:$vB, gprc:$rC),
+ "vextdubvrx $vD, $vA, $vB, $rC",
+ IIC_VecGeneral,
+ [(set v2i64:$vD,
+ (int_ppc_altivec_vextdubvrx v16i8:$vA,
+ v16i8:$vB,
+ i32:$rC))]>;
+ def VEXTDUHVLX : VAForm_1a<26, (outs vrrc:$vD),
+ (ins vrrc:$vA, vrrc:$vB, gprc:$rC),
+ "vextduhvlx $vD, $vA, $vB, $rC",
+ IIC_VecGeneral,
+ [(set v2i64:$vD,
+ (int_ppc_altivec_vextduhvlx v8i16:$vA,
+ v8i16:$vB,
+ i32:$rC))]>;
+ def VEXTDUHVRX : VAForm_1a<27, (outs vrrc:$vD),
+ (ins vrrc:$vA, vrrc:$vB, gprc:$rC),
+ "vextduhvrx $vD, $vA, $vB, $rC",
+ IIC_VecGeneral,
+ [(set v2i64:$vD,
+ (int_ppc_altivec_vextduhvrx v8i16:$vA,
+ v8i16:$vB,
+ i32:$rC))]>;
+ def VEXTDUWVLX : VAForm_1a<28, (outs vrrc:$vD),
+ (ins vrrc:$vA, vrrc:$vB, gprc:$rC),
+ "vextduwvlx $vD, $vA, $vB, $rC",
+ IIC_VecGeneral,
+ [(set v2i64:$vD,
+ (int_ppc_altivec_vextduwvlx v4i32:$vA,
+ v4i32:$vB,
+ i32:$rC))]>;
+ def VEXTDUWVRX : VAForm_1a<29, (outs vrrc:$vD),
+ (ins vrrc:$vA, vrrc:$vB, gprc:$rC),
+ "vextduwvrx $vD, $vA, $vB, $rC",
+ IIC_VecGeneral,
+ [(set v2i64:$vD,
+ (int_ppc_altivec_vextduwvrx v4i32:$vA,
+ v4i32:$vB,
+ i32:$rC))]>;
+ def VEXTDDVLX : VAForm_1a<30, (outs vrrc:$vD),
+ (ins vrrc:$vA, vrrc:$vB, gprc:$rC),
+ "vextddvlx $vD, $vA, $vB, $rC",
+ IIC_VecGeneral,
+ [(set v2i64:$vD,
+ (int_ppc_altivec_vextddvlx v2i64:$vA,
+ v2i64:$vB,
+ i32:$rC))]>;
+ def VEXTDDVRX : VAForm_1a<31, (outs vrrc:$vD),
+ (ins vrrc:$vA, vrrc:$vB, gprc:$rC),
+ "vextddvrx $vD, $vA, $vB, $rC",
+ IIC_VecGeneral,
+ [(set v2i64:$vD,
+ (int_ppc_altivec_vextddvrx v2i64:$vA,
+ v2i64:$vB,
+ i32:$rC))]>;
def VPDEPD : VXForm_1<1485, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vpdepd $vD, $vA, $vB", IIC_VecGeneral,
[(set v2i64:$vD,
@@ -961,7 +2238,61 @@ let Predicates = [IsISA3_1] in {
"vclrrb $vD, $vA, $rB", IIC_VecGeneral,
[(set v16i8:$vD,
(int_ppc_altivec_vclrrb v16i8:$vA, i32:$rB))]>;
-
+ def VMULLD : VXForm_1<457, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmulld $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v2i64:$vD, (mul v2i64:$vA, v2i64:$vB))]>;
+ def VMULHSW : VXForm_1<905, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmulhsw $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v4i32:$vD, (mulhs v4i32:$vA, v4i32:$vB))]>;
+ def VMULHUW : VXForm_1<649, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmulhuw $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v4i32:$vD, (mulhu v4i32:$vA, v4i32:$vB))]>;
+ def VMULHSD : VXForm_1<969, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmulhsd $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v2i64:$vD, (mulhs v2i64:$vA, v2i64:$vB))]>;
+ def VMULHUD : VXForm_1<713, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmulhud $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v2i64:$vD, (mulhu v2i64:$vA, v2i64:$vB))]>;
+ def VMODSW : VXForm_1<1931, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmodsw $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v4i32:$vD, (srem v4i32:$vA, v4i32:$vB))]>;
+ def VMODUW : VXForm_1<1675, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmoduw $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v4i32:$vD, (urem v4i32:$vA, v4i32:$vB))]>;
+ def VMODSD : VXForm_1<1995, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmodsd $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v2i64:$vD, (srem v2i64:$vA, v2i64:$vB))]>;
+ def VMODUD : VXForm_1<1739, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmodud $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v2i64:$vD, (urem v2i64:$vA, v2i64:$vB))]>;
+ def VDIVSW : VXForm_1<395, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vdivsw $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v4i32:$vD, (sdiv v4i32:$vA, v4i32:$vB))]>;
+ def VDIVUW : VXForm_1<139, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vdivuw $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v4i32:$vD, (udiv v4i32:$vA, v4i32:$vB))]>;
+ def VDIVSD : VXForm_1<459, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vdivsd $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v2i64:$vD, (sdiv v2i64:$vA, v2i64:$vB))]>;
+ def VDIVUD : VXForm_1<203, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vdivud $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v2i64:$vD, (udiv v2i64:$vA, v2i64:$vB))]>;
+ def VDIVESW : VXForm_1<907, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vdivesw $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v4i32:$vD, (int_ppc_altivec_vdivesw v4i32:$vA,
+ v4i32:$vB))]>;
+ def VDIVEUW : VXForm_1<651, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vdiveuw $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v4i32:$vD, (int_ppc_altivec_vdiveuw v4i32:$vA,
+ v4i32:$vB))]>;
+ def VDIVESD : VXForm_1<971, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vdivesd $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v2i64:$vD, (int_ppc_altivec_vdivesd v2i64:$vA,
+ v2i64:$vB))]>;
+ def VDIVEUD : VXForm_1<715, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vdiveud $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v2i64:$vD, (int_ppc_altivec_vdiveud v2i64:$vA,
+ v2i64:$vB))]>;
def XVTLSBB : XX2_BF3_XO5_XB6_XO9<60, 2, 475, (outs crrc:$BF), (ins vsrc:$XB),
"xvtlsbb $BF, $XB", IIC_VecGeneral, []>;
@@ -980,10 +2311,204 @@ let Predicates = [IsISA3_1] in {
def STXVRWX : X_XS6_RA5_RB5<31, 205, "stxvrwx", vsrc, []>;
def STXVRDX : X_XS6_RA5_RB5<31, 237, "stxvrdx", vsrc, []>;
}
+
+ def VMULESD : VXForm_1<968, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmulesd $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v1i128:$vD, (int_ppc_altivec_vmulesd v2i64:$vA,
+ v2i64:$vB))]>;
+ def VMULEUD : VXForm_1<712, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmuleud $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v1i128:$vD, (int_ppc_altivec_vmuleud v2i64:$vA,
+ v2i64:$vB))]>;
+ def VMULOSD : VXForm_1<456, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmulosd $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v1i128:$vD, (int_ppc_altivec_vmulosd v2i64:$vA,
+ v2i64:$vB))]>;
+ def VMULOUD : VXForm_1<200, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmuloud $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v1i128:$vD, (int_ppc_altivec_vmuloud v2i64:$vA,
+ v2i64:$vB))]>;
+ def VMSUMCUD : VAForm_1a<23, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
+ "vmsumcud $vD, $vA, $vB, $vC", IIC_VecGeneral,
+ [(set v1i128:$vD, (int_ppc_altivec_vmsumcud
+ v2i64:$vA, v2i64:$vB, v1i128:$vC))]>;
+ def VDIVSQ : VXForm_1<267, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vdivsq $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v1i128:$vD, (sdiv v1i128:$vA, v1i128:$vB))]>;
+ def VDIVUQ : VXForm_1<11, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vdivuq $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v1i128:$vD, (udiv v1i128:$vA, v1i128:$vB))]>;
+ def VDIVESQ : VXForm_1<779, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vdivesq $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v1i128:$vD, (int_ppc_altivec_vdivesq v1i128:$vA,
+ v1i128:$vB))]>;
+ def VDIVEUQ : VXForm_1<523, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vdiveuq $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v1i128:$vD, (int_ppc_altivec_vdiveuq v1i128:$vA,
+ v1i128:$vB))]>;
+ def VCMPEQUQ : VCMP <455, "vcmpequq $vD, $vA, $vB" , v1i128>;
+ def VCMPGTSQ : VCMP <903, "vcmpgtsq $vD, $vA, $vB" , v1i128>;
+ def VCMPGTUQ : VCMP <647, "vcmpgtuq $vD, $vA, $vB" , v1i128>;
+ def VCMPEQUQ_rec : VCMP_rec <455, "vcmpequq. $vD, $vA, $vB" , v1i128>;
+ def VCMPGTSQ_rec : VCMP_rec <903, "vcmpgtsq. $vD, $vA, $vB" , v1i128>;
+ def VCMPGTUQ_rec : VCMP_rec <647, "vcmpgtuq. $vD, $vA, $vB" , v1i128>;
+ def VMODSQ : VXForm_1<1803, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmodsq $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v1i128:$vD, (srem v1i128:$vA, v1i128:$vB))]>;
+ def VMODUQ : VXForm_1<1547, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vmoduq $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v1i128:$vD, (urem v1i128:$vA, v1i128:$vB))]>;
+ def VEXTSD2Q : VXForm_RD5_XO5_RS5<1538, 27, (outs vrrc:$vD), (ins vrrc:$vB),
+ "vextsd2q $vD, $vB", IIC_VecGeneral,
+ [(set v1i128:$vD, (int_ppc_altivec_vextsd2q v2i64:$vB))]>;
+ def VCMPUQ : VXForm_BF3_VAB5<257, (outs crrc:$BF), (ins vrrc:$vA, vrrc:$vB),
+ "vcmpuq $BF, $vA, $vB", IIC_VecGeneral, []>;
+ def VCMPSQ : VXForm_BF3_VAB5<321, (outs crrc:$BF), (ins vrrc:$vA, vrrc:$vB),
+ "vcmpsq $BF, $vA, $vB", IIC_VecGeneral, []>;
+ def VRLQNM : VX1_VT5_VA5_VB5<325, "vrlqnm",
+ [(set v1i128:$vD,
+ (int_ppc_altivec_vrlqnm v1i128:$vA,
+ v1i128:$vB))]>;
+ def VRLQMI : VXForm_1<69, (outs vrrc:$vD),
+ (ins vrrc:$vA, vrrc:$vB, vrrc:$vDi),
+ "vrlqmi $vD, $vA, $vB", IIC_VecFP,
+ [(set v1i128:$vD,
+ (int_ppc_altivec_vrlqmi v1i128:$vA, v1i128:$vB,
+ v1i128:$vDi))]>,
+ RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
+ def VSLQ : VX1_VT5_VA5_VB5<261, "vslq", []>;
+ def VSRAQ : VX1_VT5_VA5_VB5<773, "vsraq", []>;
+ def VSRQ : VX1_VT5_VA5_VB5<517, "vsrq", []>;
+ def VRLQ : VX1_VT5_VA5_VB5<5, "vrlq", []>;
+ def XSCVQPUQZ : X_VT5_XO5_VB5<63, 0, 836, "xscvqpuqz", []>;
+ def XSCVQPSQZ : X_VT5_XO5_VB5<63, 8, 836, "xscvqpsqz", []>;
+ def XSCVUQQP : X_VT5_XO5_VB5<63, 3, 836, "xscvuqqp", []>;
+ def XSCVSQQP : X_VT5_XO5_VB5<63, 11, 836, "xscvsqqp", []>;
+}
+
+let Predicates = [IsISA3_1, HasVSX] in {
+ def XVCVSPBF16 : XX2_XT6_XO5_XB6<60, 17, 475, "xvcvspbf16", vsrc, []>;
+ def XVCVBF16SPN : XX2_XT6_XO5_XB6<60, 16, 475, "xvcvbf16spn", vsrc, []>;
+}
+
+// Multiclass defining patterns for Set Boolean Extension Reverse Instructions.
+// This is analogous to the CRNotPat multiclass but specifically for Power10
+// and newer subtargets since the extended forms use Set Boolean instructions.
+// The first two anonymous patterns defined are actually duplicates of those
+// in CRNotPat, but it is preferable to keep both multiclasses self-contained
+// rather than factoring out that small common section.
+multiclass P10ReverseSetBool<dag pattern, dag result> {
+ def : Pat<pattern, (crnot result)>;
+ def : Pat<(not pattern), result>;
+
+ def : Pat<(i32 (zext pattern)),
+ (SETBCR result)>;
+ def : Pat<(i64 (zext pattern)),
+ (SETBCR8 result)>;
+
+ def : Pat<(i32 (sext pattern)),
+ (SETNBCR result)>;
+ def : Pat<(i64 (sext pattern)),
+ (SETNBCR8 result)>;
+
+ def : Pat<(i32 (anyext pattern)),
+ (SETBCR result)>;
+ def : Pat<(i64 (anyext pattern)),
+ (SETBCR8 result)>;
+}
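+// As a sketch of one expansion: for the 32-bit SETNE instantiation further
+// down, this multiclass effectively produces, among others,
+//   def : Pat<(i32 (zext (i1 (setcc i32:$s1, i32:$s2, SETNE)))),
+//             (SETBCR (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq))>;
+// i.e. setbcr/setnbcr are selected on the CR bit whose sense is the reverse
+// of the requested condition.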
+
+multiclass IntSetP10RevSetBool<SDNode SetCC, ValueType Ty, ImmLeaf ZExtTy,
+ ImmLeaf SExtTy, PatLeaf Cmpi, PatLeaf Cmpli,
+ PatLeaf Cmp, PatLeaf Cmpl> {
+ defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETUGE)),
+ (EXTRACT_SUBREG (Cmpl $s1, $s2), sub_lt)>;
+ defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETGE)),
+ (EXTRACT_SUBREG (Cmp $s1, $s2), sub_lt)>;
+ defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETULE)),
+ (EXTRACT_SUBREG (Cmpl $s1, $s2), sub_gt)>;
+ defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETLE)),
+ (EXTRACT_SUBREG (Cmp $s1, $s2), sub_gt)>;
+ defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETNE)),
+ (EXTRACT_SUBREG (Cmp $s1, $s2), sub_eq)>;
+
+ defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, ZExtTy:$imm, SETUGE)),
+ (EXTRACT_SUBREG (Cmpli $s1, imm:$imm), sub_lt)>;
+ defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, SExtTy:$imm, SETGE)),
+ (EXTRACT_SUBREG (Cmpi $s1, imm:$imm), sub_lt)>;
+ defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, ZExtTy:$imm, SETULE)),
+ (EXTRACT_SUBREG (Cmpli $s1, imm:$imm), sub_gt)>;
+ defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, SExtTy:$imm, SETLE)),
+ (EXTRACT_SUBREG (Cmpi $s1, imm:$imm), sub_gt)>;
+ defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, SExtTy:$imm, SETNE)),
+ (EXTRACT_SUBREG (Cmpi $s1, imm:$imm), sub_eq)>;
+ defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, ZExtTy:$imm, SETNE)),
+ (EXTRACT_SUBREG (Cmpli $s1, imm:$imm), sub_eq)>;
+}
+
+multiclass FSetP10RevSetBool<SDNode SetCC, ValueType Ty, PatLeaf FCmp> {
+ defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETUGE)),
+ (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>;
+ defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETGE)),
+ (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>;
+ defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETULE)),
+ (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>;
+ defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETLE)),
+ (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>;
+ defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETUNE)),
+ (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>;
+ defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETNE)),
+ (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>;
+ defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETO)),
+ (EXTRACT_SUBREG (FCmp $s1, $s2), sub_un)>;
+}
+
+let Predicates = [IsISA3_1] in {
+ def : Pat<(i32 (zext i1:$in)),
+ (SETBC $in)>;
+ def : Pat<(i64 (zext i1:$in)),
+ (SETBC8 $in)>;
+ def : Pat<(i32 (sext i1:$in)),
+ (SETNBC $in)>;
+ def : Pat<(i64 (sext i1:$in)),
+ (SETNBC8 $in)>;
+ def : Pat<(i32 (anyext i1:$in)),
+ (SETBC $in)>;
+ def : Pat<(i64 (anyext i1:$in)),
+ (SETBC8 $in)>;
+
+ // Instantiation of the set boolean reverse patterns for 32-bit integers.
+ defm : IntSetP10RevSetBool<setcc, i32, immZExt16, imm32SExt16,
+ CMPWI, CMPLWI, CMPW, CMPLW>;
+ defm : P10ReverseSetBool<(i1 (setcc i32:$s1, imm:$imm, SETNE)),
+ (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)),
+ (LO16 imm:$imm)), sub_eq)>;
+
+ // Instantiation of the set boolean reverse patterns for 64-bit integers.
+ defm : IntSetP10RevSetBool<setcc, i64, immZExt16, imm64SExt16,
+ CMPDI, CMPLDI, CMPD, CMPLD>;
+ defm : P10ReverseSetBool<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETNE)),
+ (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)),
+ (LO16 imm:$imm)), sub_eq)>;
+}
+
+// Instantiation of the set boolean reverse patterns for f32, f64, f128.
+let Predicates = [IsISA3_1, HasFPU] in {
+ defm : FSetP10RevSetBool<setcc, f32, FCMPUS>;
+ defm : FSetP10RevSetBool<setcc, f64, FCMPUD>;
+ defm : FSetP10RevSetBool<setcc, f128, XSCMPUQP>;
}
//---------------------------- Anonymous Patterns ----------------------------//
let Predicates = [IsISA3_1] in {
+ // Select the new vector multiply-high instructions for their intrinsics.
+ def : Pat<(v4i32 (int_ppc_altivec_vmulhsw v4i32:$vA, v4i32:$vB)),
+ (v4i32 (VMULHSW $vA, $vB))>;
+ def : Pat<(v4i32 (int_ppc_altivec_vmulhuw v4i32:$vA, v4i32:$vB)),
+ (v4i32 (VMULHUW $vA, $vB))>;
+ def : Pat<(v2i64 (int_ppc_altivec_vmulhsd v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VMULHSD $vA, $vB))>;
+ def : Pat<(v2i64 (int_ppc_altivec_vmulhud v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VMULHUD $vA, $vB))>;
def : Pat<(v16i8 (int_ppc_vsx_xxgenpcvbm v16i8:$VRB, imm:$IMM)),
(v16i8 (COPY_TO_REGCLASS (XXGENPCVBM $VRB, imm:$IMM), VRRC))>;
def : Pat<(v8i16 (int_ppc_vsx_xxgenpcvhm v8i16:$VRB, imm:$IMM)),
@@ -992,12 +2517,82 @@ let Predicates = [IsISA3_1] in {
(v4i32 (COPY_TO_REGCLASS (XXGENPCVWM $VRB, imm:$IMM), VRRC))>;
def : Pat<(v2i64 (int_ppc_vsx_xxgenpcvdm v2i64:$VRB, imm:$IMM)),
(v2i64 (COPY_TO_REGCLASS (XXGENPCVDM $VRB, imm:$IMM), VRRC))>;
- def : Pat<(i32 (int_ppc_vsx_xvtlsbb v16i8:$XB, -1)),
+ def : Pat<(i32 (int_ppc_vsx_xvtlsbb v16i8:$XB, 1)),
(EXTRACT_SUBREG (XVTLSBB (COPY_TO_REGCLASS $XB, VSRC)), sub_lt)>;
def : Pat<(i32 (int_ppc_vsx_xvtlsbb v16i8:$XB, 0)),
(EXTRACT_SUBREG (XVTLSBB (COPY_TO_REGCLASS $XB, VSRC)), sub_eq)>;
+
+ def : Pat <(v1i128 (PPClxvrzx xoaddr:$src, 8)),
+ (v1i128 (COPY_TO_REGCLASS (LXVRBX xoaddr:$src), VRRC))>;
+ def : Pat <(v1i128 (PPClxvrzx xoaddr:$src, 16)),
+ (v1i128 (COPY_TO_REGCLASS (LXVRHX xoaddr:$src), VRRC))>;
+ def : Pat <(v1i128 (PPClxvrzx xoaddr:$src, 32)),
+ (v1i128 (COPY_TO_REGCLASS (LXVRWX xoaddr:$src), VRRC))>;
+ def : Pat <(v1i128 (PPClxvrzx xoaddr:$src, 64)),
+ (v1i128 (COPY_TO_REGCLASS (LXVRDX xoaddr:$src), VRRC))>;
+
+ def : Pat<(v1i128 (rotl v1i128:$vA, v1i128:$vB)),
+ (v1i128 (VRLQ v1i128:$vA, v1i128:$vB))>;
+
+ def : Pat <(v2i64 (PPCxxsplti32dx v2i64:$XT, i32:$XI, i32:$IMM32)),
+ (v2i64 (XXSPLTI32DX v2i64:$XT, i32:$XI, i32:$IMM32))>;
+}
+
+let Predicates = [IsISA3_1, HasVSX] in {
+ def : Pat<(v16i8 (int_ppc_vsx_xvcvspbf16 v16i8:$XA)),
+ (COPY_TO_REGCLASS (XVCVSPBF16 RCCp.AToVSRC), VRRC)>;
+ def : Pat<(v16i8 (int_ppc_vsx_xvcvbf16spn v16i8:$XA)),
+ (COPY_TO_REGCLASS (XVCVBF16SPN RCCp.AToVSRC), VRRC)>;
}
+let AddedComplexity = 400, Predicates = [IsISA3_1, IsLittleEndian] in {
+ // Store element 0 of a VSX register to memory
+ def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$src, 0)), xoaddr:$dst),
+ (STXVRBX (COPY_TO_REGCLASS v16i8:$src, VSRC), xoaddr:$dst)>;
+ def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$src, 0)), xoaddr:$dst),
+ (STXVRHX (COPY_TO_REGCLASS v8i16:$src, VSRC), xoaddr:$dst)>;
+ def : Pat<(store (i32 (extractelt v4i32:$src, 0)), xoaddr:$dst),
+ (STXVRWX $src, xoaddr:$dst)>;
+ def : Pat<(store (f32 (extractelt v4f32:$src, 0)), xoaddr:$dst),
+ (STXVRWX $src, xoaddr:$dst)>;
+ def : Pat<(store (i64 (extractelt v2i64:$src, 0)), xoaddr:$dst),
+ (STXVRDX $src, xoaddr:$dst)>;
+ def : Pat<(store (f64 (extractelt v2f64:$src, 0)), xoaddr:$dst),
+ (STXVRDX $src, xoaddr:$dst)>;
+ }
+
+// FIXME: The swap is overkill when the shift amount is a constant.
+// We should just fix the constant in the DAG.
+let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX] in {
+ def : Pat<(v1i128 (shl v1i128:$VRA, v1i128:$VRB)),
+ (v1i128 (VSLQ v1i128:$VRA,
+ (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC),
+ (COPY_TO_REGCLASS $VRB, VSRC), 2)))>;
+ def : Pat<(v1i128 (PPCshl v1i128:$VRA, v1i128:$VRB)),
+ (v1i128 (VSLQ v1i128:$VRA,
+ (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC),
+ (COPY_TO_REGCLASS $VRB, VSRC), 2)))>;
+ def : Pat<(v1i128 (srl v1i128:$VRA, v1i128:$VRB)),
+ (v1i128 (VSRQ v1i128:$VRA,
+ (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC),
+ (COPY_TO_REGCLASS $VRB, VSRC), 2)))>;
+ def : Pat<(v1i128 (PPCsrl v1i128:$VRA, v1i128:$VRB)),
+ (v1i128 (VSRQ v1i128:$VRA,
+ (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC),
+ (COPY_TO_REGCLASS $VRB, VSRC), 2)))>;
+ def : Pat<(v1i128 (sra v1i128:$VRA, v1i128:$VRB)),
+ (v1i128 (VSRAQ v1i128:$VRA,
+ (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC),
+ (COPY_TO_REGCLASS $VRB, VSRC), 2)))>;
+ def : Pat<(v1i128 (PPCsra v1i128:$VRA, v1i128:$VRB)),
+ (v1i128 (VSRAQ v1i128:$VRA,
+ (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC),
+ (COPY_TO_REGCLASS $VRB, VSRC), 2)))>;
+}
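Per the FIXME above, the XXPERMDI swap only exists to move the shift amount into the doubleword that VSLQ/VSRQ/VSRAQ expect; the instructions consume a 7-bit count, so at the DAG level the 128-bit amount behaves as if taken modulo 128. A hedged C++ sketch of that scalar behaviour (illustrative only, using the unsigned __int128 extension):

// Scalar model of the v1i128 shifts selected above: only the low seven bits
// of the shift amount matter.
#include <cassert>

using u128 = unsigned __int128; // GCC/Clang extension

static u128 shlQuad(u128 v, u128 amt) { return v << (unsigned)(amt & 127); }
static u128 srlQuad(u128 v, u128 amt) { return v >> (unsigned)(amt & 127); }

int main() {
  u128 one = 1;
  assert(shlQuad(one, 3) == 8);
  assert(srlQuad(one << 127, 127) == 1);
  assert(shlQuad(one, 128 + 3) == shlQuad(one, 3)); // amount taken modulo 128
  return 0;
}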
+
+class xxevalPattern <dag pattern, bits<8> imm> :
+ Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {}
+
let AddedComplexity = 400, Predicates = [PrefixInstrs] in {
def : Pat<(v4i32 (build_vector i32immNonAllOneNonZero:$A,
i32immNonAllOneNonZero:$A,
@@ -1010,6 +2605,44 @@ let AddedComplexity = 400, Predicates = [PrefixInstrs] in {
def : Pat<(f64 nzFPImmAsi32:$A),
(COPY_TO_REGCLASS (XXSPLTIDP (getFPAs32BitInt fpimm:$A)),
VSFRC)>;
+
+ // Anonymous patterns for XXEVAL
+ // AND
+ // and(A, B, C)
+ def : xxevalPattern<(and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 1>;
+ // and(A, xor(B, C))
+ def : xxevalPattern<(and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 6>;
+ // and(A, or(B, C))
+ def : xxevalPattern<(and v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 7>;
+ // and(A, nor(B, C))
+ def : xxevalPattern<(and v4i32:$vA, (vnot_ppc (or v4i32:$vB, v4i32:$vC))),
+ 8>;
+ // and(A, eqv(B, C))
+ def : xxevalPattern<(and v4i32:$vA, (vnot_ppc (xor v4i32:$vB, v4i32:$vC))),
+ 9>;
+ // and(A, nand(B, C))
+ def : xxevalPattern<(and v4i32:$vA, (vnot_ppc (and v4i32:$vB, v4i32:$vC))),
+ 14>;
+
+ // NAND
+ // nand(A, B, C)
+ def : xxevalPattern<(vnot_ppc (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC))),
+ !sub(255, 1)>;
+ // nand(A, xor(B, C))
+ def : xxevalPattern<(vnot_ppc (and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))),
+ !sub(255, 6)>;
+ // nand(A, or(B, C))
+ def : xxevalPattern<(vnot_ppc (and v4i32:$vA, (or v4i32:$vB, v4i32:$vC))),
+ !sub(255, 7)>;
+ // nand(A, nor(B, C))
+ def : xxevalPattern<(or (vnot_ppc v4i32:$vA), (or v4i32:$vB, v4i32:$vC)),
+ !sub(255, 8)>;
+ // nand(A, eqv(B, C))
+ def : xxevalPattern<(or (vnot_ppc v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)),
+ !sub(255, 9)>;
+ // nand(A, nand(B, C))
+ def : xxevalPattern<(or (vnot_ppc v4i32:$vA), (and v4i32:$vB, v4i32:$vC)),
+ !sub(255, 14)>;
}
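The XXEVAL immediates above are 8-bit truth tables over the three vector operands, which is why each NAND form reuses the corresponding AND form's immediate as !sub(255, imm): complementing the function complements its truth table. The C++ sketch below reproduces the immediates used above (and(A,B,C) gives 1, its NAND gives 254); it is illustrative only, and the bit ordering is inferred from those immediates rather than quoted from the ISA.

// Build an XXEVAL-style immediate: bit (7 - ((a<<2)|(b<<1)|c)) of the
// immediate holds f(a, b, c), so and(A,B,C) yields 1 and its complement 254.
#include <cstdint>
#include <cstdio>

static uint8_t xxevalImm(bool (*f)(bool, bool, bool)) {
  uint8_t imm = 0;
  for (unsigned row = 0; row < 8; ++row) {
    bool a = row & 4, b = row & 2, c = row & 1;
    if (f(a, b, c))
      imm |= (uint8_t)(1u << (7 - row));
  }
  return imm;
}

int main() {
  uint8_t andImm = xxevalImm([](bool a, bool b, bool c) { return a && b && c; });
  uint8_t nandImm = xxevalImm([](bool a, bool b, bool c) { return !(a && b && c); });
  std::printf("and=%d nand=%d\n", (int)andImm, (int)nandImm); // and=1 nand=254
  return 0;
}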
let Predicates = [PrefixInstrs] in {
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrQPX.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrQPX.td
deleted file mode 100644
index 2265af2815cb..000000000000
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrQPX.td
+++ /dev/null
@@ -1,1212 +0,0 @@
-//===- PPCInstrQPX.td - The PowerPC QPX Extension --*- tablegen -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the QPX extension to the PowerPC instruction set.
-// Reference:
-// Book Q: QPX Architecture Definition. IBM (as updated in) 2011.
-//
-//===----------------------------------------------------------------------===//
-
-def PPCRegQFRCAsmOperand : AsmOperandClass {
- let Name = "RegQFRC"; let PredicateMethod = "isRegNumber";
-}
-def qfrc : RegisterOperand<QFRC> {
- let ParserMatchClass = PPCRegQFRCAsmOperand;
-}
-def PPCRegQSRCAsmOperand : AsmOperandClass {
- let Name = "RegQSRC"; let PredicateMethod = "isRegNumber";
-}
-def qsrc : RegisterOperand<QSRC> {
- let ParserMatchClass = PPCRegQSRCAsmOperand;
-}
-def PPCRegQBRCAsmOperand : AsmOperandClass {
- let Name = "RegQBRC"; let PredicateMethod = "isRegNumber";
-}
-def qbrc : RegisterOperand<QBRC> {
- let ParserMatchClass = PPCRegQBRCAsmOperand;
-}
-
-//===----------------------------------------------------------------------===//
-// Helpers for defining instructions that directly correspond to intrinsics.
-
-// QPXA1_Int - A AForm_1 intrinsic definition.
-class QPXA1_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
- : AForm_1<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
- !strconcat(opc, " $FRT, $FRA, $FRC, $FRB"), IIC_FPFused,
- [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>;
-// QPXA1s_Int - A AForm_1 intrinsic definition (simple instructions).
-class QPXA1s_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
- : AForm_1<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
- !strconcat(opc, " $FRT, $FRA, $FRC, $FRB"), IIC_VecPerm,
- [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>;
-// QPXA2_Int - A AForm_2 intrinsic definition.
-class QPXA2_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
- : AForm_2<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
- !strconcat(opc, " $FRT, $FRA, $FRB"), IIC_FPGeneral,
- [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB))]>;
-// QPXA3_Int - A AForm_3 intrinsic definition.
-class QPXA3_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
- : AForm_3<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC),
- !strconcat(opc, " $FRT, $FRA, $FRC"), IIC_FPGeneral,
- [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRC))]>;
-// QPXA4_Int - A AForm_4a intrinsic definition.
-class QPXA4_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
- : AForm_4a<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRB),
- !strconcat(opc, " $FRT, $FRB"), IIC_FPGeneral,
- [(set v4f64:$FRT, (IntID v4f64:$FRB))]>;
-// QPXX18_Int - A XForm_18 intrinsic definition.
-class QPXX18_Int<bits<6> opcode, bits<10> xo, string opc, Intrinsic IntID>
- : XForm_18<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
- !strconcat(opc, " $FRT, $FRA, $FRB"), IIC_FPCompare,
- [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB))]>;
-// QPXX19_Int - A XForm_19 intrinsic definition.
-class QPXX19_Int<bits<6> opcode, bits<10> xo, string opc, Intrinsic IntID>
- : XForm_19<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRB),
- !strconcat(opc, " $FRT, $FRB"), IIC_FPGeneral,
- [(set v4f64:$FRT, (IntID v4f64:$FRB))]>;
-
-//===----------------------------------------------------------------------===//
-// Pattern Frags.
-
-def extloadv4f32 : PatFrag<(ops node:$ptr), (extload node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::v4f32;
-}]>;
-
-def truncstorev4f32 : PatFrag<(ops node:$val, node:$ptr),
- (truncstore node:$val, node:$ptr), [{
- return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4f32;
-}]>;
-def pre_truncstv4f32 : PatFrag<(ops node:$val, node:$base, node:$offset),
- (pre_truncst node:$val,
- node:$base, node:$offset), [{
- return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4f32;
-}]>;
-
-def fround_inexact : PatFrag<(ops node:$val), (fpround node:$val), [{
- return cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() == 0;
-}]>;
-
-def fround_exact : PatFrag<(ops node:$val), (fpround node:$val), [{
- return cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() == 1;
-}]>;
-
-let FastIselShouldIgnore = 1 in // FastIsel should ignore all u12 instrs.
- def u12 : ImmLeaf<i32, [{ return (Imm & 0xFFF) == Imm; }]>;
-
-//===----------------------------------------------------------------------===//
-// Instruction Definitions.
-
-def HasQPX : Predicate<"Subtarget->hasQPX()">;
-let Predicates = [HasQPX] in {
-let DecoderNamespace = "QPX" in {
-let hasSideEffects = 0 in { // QPX instructions don't have side effects.
-let Uses = [RM] in {
- // Add Instructions
- let isCommutable = 1 in {
- def QVFADD : AForm_2<4, 21,
- (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
- "qvfadd $FRT, $FRA, $FRB", IIC_FPGeneral,
- [(set v4f64:$FRT, (fadd v4f64:$FRA, v4f64:$FRB))]>;
- let isCodeGenOnly = 1 in
- def QVFADDS : QPXA2_Int<0, 21, "qvfadds", int_ppc_qpx_qvfadds>;
- def QVFADDSs : AForm_2<0, 21,
- (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
- "qvfadds $FRT, $FRA, $FRB", IIC_FPGeneral,
- [(set v4f32:$FRT, (fadd v4f32:$FRA, v4f32:$FRB))]>;
- }
- def QVFSUB : AForm_2<4, 20,
- (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
- "qvfsub $FRT, $FRA, $FRB", IIC_FPGeneral,
- [(set v4f64:$FRT, (fsub v4f64:$FRA, v4f64:$FRB))]>;
- let isCodeGenOnly = 1 in
- def QVFSUBS : QPXA2_Int<0, 20, "qvfsubs", int_ppc_qpx_qvfsubs>;
- def QVFSUBSs : AForm_2<0, 20,
- (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
- "qvfsubs $FRT, $FRA, $FRB", IIC_FPGeneral,
- [(set v4f32:$FRT, (fsub v4f32:$FRA, v4f32:$FRB))]>;
-
- // Estimate Instructions
- def QVFRE : AForm_4a<4, 24, (outs qfrc:$FRT), (ins qfrc:$FRB),
- "qvfre $FRT, $FRB", IIC_FPGeneral,
- [(set v4f64:$FRT, (PPCfre v4f64:$FRB))]>;
- def QVFRES : QPXA4_Int<0, 24, "qvfres", int_ppc_qpx_qvfres>;
- let isCodeGenOnly = 1 in
- def QVFRESs : AForm_4a<0, 24, (outs qsrc:$FRT), (ins qsrc:$FRB),
- "qvfres $FRT, $FRB", IIC_FPGeneral,
- [(set v4f32:$FRT, (PPCfre v4f32:$FRB))]>;
-
- def QVFRSQRTE : AForm_4a<4, 26, (outs qfrc:$FRT), (ins qfrc:$FRB),
- "qvfrsqrte $FRT, $FRB", IIC_FPGeneral,
- [(set v4f64:$FRT, (PPCfrsqrte v4f64:$FRB))]>;
- def QVFRSQRTES : QPXA4_Int<0, 26, "qvfrsqrtes", int_ppc_qpx_qvfrsqrtes>;
- let isCodeGenOnly = 1 in
- def QVFRSQRTESs : AForm_4a<0, 26, (outs qsrc:$FRT), (ins qsrc:$FRB),
- "qvfrsqrtes $FRT, $FRB", IIC_FPGeneral,
- [(set v4f32:$FRT, (PPCfrsqrte v4f32:$FRB))]>;
-
- // Multiply Instructions
- let isCommutable = 1 in {
- def QVFMUL : AForm_3<4, 25,
- (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC),
- "qvfmul $FRT, $FRA, $FRC", IIC_FPGeneral,
- [(set v4f64:$FRT, (fmul v4f64:$FRA, v4f64:$FRC))]>;
- let isCodeGenOnly = 1 in
- def QVFMULS : QPXA3_Int<0, 25, "qvfmuls", int_ppc_qpx_qvfmuls>;
- def QVFMULSs : AForm_3<0, 25,
- (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC),
- "qvfmuls $FRT, $FRA, $FRC", IIC_FPGeneral,
- [(set v4f32:$FRT, (fmul v4f32:$FRA, v4f32:$FRC))]>;
- }
- def QVFXMUL : QPXA3_Int<4, 17, "qvfxmul", int_ppc_qpx_qvfxmul>;
- def QVFXMULS : QPXA3_Int<0, 17, "qvfxmuls", int_ppc_qpx_qvfxmuls>;
-
- // Multiply-add instructions
- def QVFMADD : AForm_1<4, 29,
- (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB),
- "qvfmadd $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
- [(set v4f64:$FRT, (fma v4f64:$FRA, v4f64:$FRC, v4f64:$FRB))]>;
- let isCodeGenOnly = 1 in
- def QVFMADDS : QPXA1_Int<0, 29, "qvfmadds", int_ppc_qpx_qvfmadds>;
- def QVFMADDSs : AForm_1<0, 29,
- (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB),
- "qvfmadds $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
- [(set v4f32:$FRT, (fma v4f32:$FRA, v4f32:$FRC, v4f32:$FRB))]>;
- def QVFNMADD : AForm_1<4, 31,
- (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB),
- "qvfnmadd $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
- [(set v4f64:$FRT, (fneg (fma v4f64:$FRA, v4f64:$FRC,
- v4f64:$FRB)))]>;
- let isCodeGenOnly = 1 in
- def QVFNMADDS : QPXA1_Int<0, 31, "qvfnmadds", int_ppc_qpx_qvfnmadds>;
- def QVFNMADDSs : AForm_1<0, 31,
- (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB),
- "qvfnmadds $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
- [(set v4f32:$FRT, (fneg (fma v4f32:$FRA, v4f32:$FRC,
- v4f32:$FRB)))]>;
- def QVFMSUB : AForm_1<4, 28,
- (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB),
- "qvfmsub $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
- [(set v4f64:$FRT, (fma v4f64:$FRA, v4f64:$FRC,
- (fneg v4f64:$FRB)))]>;
- let isCodeGenOnly = 1 in
- def QVFMSUBS : QPXA1_Int<0, 28, "qvfmsubs", int_ppc_qpx_qvfmsubs>;
- def QVFMSUBSs : AForm_1<0, 28,
- (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB),
- "qvfmsubs $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
- [(set v4f32:$FRT, (fma v4f32:$FRA, v4f32:$FRC,
- (fneg v4f32:$FRB)))]>;
- def QVFNMSUB : AForm_1<4, 30,
- (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB),
- "qvfnmsub $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
- [(set v4f64:$FRT, (fneg (fma v4f64:$FRA, v4f64:$FRC,
- (fneg v4f64:$FRB))))]>;
- let isCodeGenOnly = 1 in
- def QVFNMSUBS : QPXA1_Int<0, 30, "qvfnmsubs", int_ppc_qpx_qvfnmsubs>;
- def QVFNMSUBSs : AForm_1<0, 30,
- (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB),
- "qvfnmsubs $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
- [(set v4f32:$FRT, (fneg (fma v4f32:$FRA, v4f32:$FRC,
- (fneg v4f32:$FRB))))]>;
- def QVFXMADD : QPXA1_Int<4, 9, "qvfxmadd", int_ppc_qpx_qvfxmadd>;
- def QVFXMADDS : QPXA1_Int<0, 9, "qvfxmadds", int_ppc_qpx_qvfxmadds>;
- def QVFXXNPMADD : QPXA1_Int<4, 11, "qvfxxnpmadd", int_ppc_qpx_qvfxxnpmadd>;
- def QVFXXNPMADDS : QPXA1_Int<0, 11, "qvfxxnpmadds", int_ppc_qpx_qvfxxnpmadds>;
- def QVFXXCPNMADD : QPXA1_Int<4, 3, "qvfxxcpnmadd", int_ppc_qpx_qvfxxcpnmadd>;
- def QVFXXCPNMADDS : QPXA1_Int<0, 3, "qvfxxcpnmadds", int_ppc_qpx_qvfxxcpnmadds>;
- def QVFXXMADD : QPXA1_Int<4, 1, "qvfxxmadd", int_ppc_qpx_qvfxxmadd>;
- def QVFXXMADDS : QPXA1_Int<0, 1, "qvfxxmadds", int_ppc_qpx_qvfxxmadds>;
-
- // Select Instruction
- let isCodeGenOnly = 1 in
- def QVFSEL : QPXA1s_Int<4, 23, "qvfsel", int_ppc_qpx_qvfsel>;
- def QVFSELb : AForm_1<4, 23, (outs qfrc:$FRT),
- (ins qbrc:$FRA, qfrc:$FRB, qfrc:$FRC),
- "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm,
- [(set v4f64:$FRT, (vselect v4i1:$FRA,
- v4f64:$FRC, v4f64:$FRB))]>;
- let isCodeGenOnly = 1 in
- def QVFSELbs : AForm_1<4, 23, (outs qsrc:$FRT),
- (ins qbrc:$FRA, qsrc:$FRB, qsrc:$FRC),
- "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm,
- [(set v4f32:$FRT, (vselect v4i1:$FRA,
- v4f32:$FRC, v4f32:$FRB))]>;
- let isCodeGenOnly = 1 in
- def QVFSELbb: AForm_1<4, 23, (outs qbrc:$FRT),
- (ins qbrc:$FRA, qbrc:$FRB, qbrc:$FRC),
- "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm,
- [(set v4i1:$FRT, (vselect v4i1:$FRA,
- v4i1:$FRC, v4i1:$FRB))]>;
-
- // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after
- // instruction selection into a branch sequence.
- def SELECT_CC_QFRC: PPCCustomInserterPseudo<(outs qfrc:$dst), (ins crrc:$cond, qfrc:$T, qfrc:$F,
- i32imm:$BROPC), "#SELECT_CC_QFRC",
- []>;
- def SELECT_CC_QSRC: PPCCustomInserterPseudo<(outs qsrc:$dst), (ins crrc:$cond, qsrc:$T, qsrc:$F,
- i32imm:$BROPC), "#SELECT_CC_QSRC",
- []>;
- def SELECT_CC_QBRC: PPCCustomInserterPseudo<(outs qbrc:$dst), (ins crrc:$cond, qbrc:$T, qbrc:$F,
- i32imm:$BROPC), "#SELECT_CC_QBRC",
- []>;
-
- // SELECT_* pseudo instructions, like SELECT_CC_* but taking condition
- // register bit directly.
- def SELECT_QFRC: PPCCustomInserterPseudo<(outs qfrc:$dst), (ins crbitrc:$cond,
- qfrc:$T, qfrc:$F), "#SELECT_QFRC",
- [(set v4f64:$dst,
- (select i1:$cond, v4f64:$T, v4f64:$F))]>;
- def SELECT_QSRC: PPCCustomInserterPseudo<(outs qsrc:$dst), (ins crbitrc:$cond,
- qsrc:$T, qsrc:$F), "#SELECT_QSRC",
- [(set v4f32:$dst,
- (select i1:$cond, v4f32:$T, v4f32:$F))]>;
- def SELECT_QBRC: PPCCustomInserterPseudo<(outs qbrc:$dst), (ins crbitrc:$cond,
- qbrc:$T, qbrc:$F), "#SELECT_QBRC",
- [(set v4i1:$dst,
- (select i1:$cond, v4i1:$T, v4i1:$F))]>;
-
- // Convert and Round Instructions
- def QVFCTID : QPXX19_Int<4, 814, "qvfctid", int_ppc_qpx_qvfctid>;
- let isCodeGenOnly = 1 in
- def QVFCTIDb : XForm_19<4, 814, (outs qbrc:$FRT), (ins qbrc:$FRB),
- "qvfctid $FRT, $FRB", IIC_FPGeneral, []>;
-
- def QVFCTIDU : QPXX19_Int<4, 942, "qvfctidu", int_ppc_qpx_qvfctidu>;
- def QVFCTIDZ : QPXX19_Int<4, 815, "qvfctidz", int_ppc_qpx_qvfctidz>;
- def QVFCTIDUZ : QPXX19_Int<4, 943, "qvfctiduz", int_ppc_qpx_qvfctiduz>;
- def QVFCTIW : QPXX19_Int<4, 14, "qvfctiw", int_ppc_qpx_qvfctiw>;
- def QVFCTIWU : QPXX19_Int<4, 142, "qvfctiwu", int_ppc_qpx_qvfctiwu>;
- def QVFCTIWZ : QPXX19_Int<4, 15, "qvfctiwz", int_ppc_qpx_qvfctiwz>;
- def QVFCTIWUZ : QPXX19_Int<4, 143, "qvfctiwuz", int_ppc_qpx_qvfctiwuz>;
- def QVFCFID : QPXX19_Int<4, 846, "qvfcfid", int_ppc_qpx_qvfcfid>;
- let isCodeGenOnly = 1 in
- def QVFCFIDb : XForm_19<4, 846, (outs qbrc:$FRT), (ins qbrc:$FRB),
- "qvfcfid $FRT, $FRB", IIC_FPGeneral, []>;
-
- def QVFCFIDU : QPXX19_Int<4, 974, "qvfcfidu", int_ppc_qpx_qvfcfidu>;
- def QVFCFIDS : QPXX19_Int<0, 846, "qvfcfids", int_ppc_qpx_qvfcfids>;
- def QVFCFIDUS : QPXX19_Int<0, 974, "qvfcfidus", int_ppc_qpx_qvfcfidus>;
-
- let isCodeGenOnly = 1 in
- def QVFRSP : QPXX19_Int<4, 12, "qvfrsp", int_ppc_qpx_qvfrsp>;
- def QVFRSPs : XForm_19<4, 12,
- (outs qsrc:$FRT), (ins qfrc:$FRB),
- "qvfrsp $FRT, $FRB", IIC_FPGeneral,
- [(set v4f32:$FRT, (fround_inexact v4f64:$FRB))]>;
-
- def QVFRIZ : XForm_19<4, 424, (outs qfrc:$FRT), (ins qfrc:$FRB),
- "qvfriz $FRT, $FRB", IIC_FPGeneral,
- [(set v4f64:$FRT, (ftrunc v4f64:$FRB))]>;
- let isCodeGenOnly = 1 in
- def QVFRIZs : XForm_19<4, 424, (outs qsrc:$FRT), (ins qsrc:$FRB),
- "qvfriz $FRT, $FRB", IIC_FPGeneral,
- [(set v4f32:$FRT, (ftrunc v4f32:$FRB))]>;
-
- def QVFRIN : XForm_19<4, 392, (outs qfrc:$FRT), (ins qfrc:$FRB),
- "qvfrin $FRT, $FRB", IIC_FPGeneral,
- [(set v4f64:$FRT, (fround v4f64:$FRB))]>;
- let isCodeGenOnly = 1 in
- def QVFRINs : XForm_19<4, 392, (outs qsrc:$FRT), (ins qsrc:$FRB),
- "qvfrin $FRT, $FRB", IIC_FPGeneral,
- [(set v4f32:$FRT, (fround v4f32:$FRB))]>;
-
- def QVFRIP : XForm_19<4, 456, (outs qfrc:$FRT), (ins qfrc:$FRB),
- "qvfrip $FRT, $FRB", IIC_FPGeneral,
- [(set v4f64:$FRT, (fceil v4f64:$FRB))]>;
- let isCodeGenOnly = 1 in
- def QVFRIPs : XForm_19<4, 456, (outs qsrc:$FRT), (ins qsrc:$FRB),
- "qvfrip $FRT, $FRB", IIC_FPGeneral,
- [(set v4f32:$FRT, (fceil v4f32:$FRB))]>;
-
- def QVFRIM : XForm_19<4, 488, (outs qfrc:$FRT), (ins qfrc:$FRB),
- "qvfrim $FRT, $FRB", IIC_FPGeneral,
- [(set v4f64:$FRT, (ffloor v4f64:$FRB))]>;
- let isCodeGenOnly = 1 in
- def QVFRIMs : XForm_19<4, 488, (outs qsrc:$FRT), (ins qsrc:$FRB),
- "qvfrim $FRT, $FRB", IIC_FPGeneral,
- [(set v4f32:$FRT, (ffloor v4f32:$FRB))]>;
-
- // Move Instructions
- def QVFMR : XForm_19<4, 72,
- (outs qfrc:$FRT), (ins qfrc:$FRB),
- "qvfmr $FRT, $FRB", IIC_VecPerm,
- [/* (set v4f64:$FRT, v4f64:$FRB) */]>;
- let isCodeGenOnly = 1 in {
- def QVFMRs : XForm_19<4, 72,
- (outs qsrc:$FRT), (ins qsrc:$FRB),
- "qvfmr $FRT, $FRB", IIC_VecPerm,
- [/* (set v4f32:$FRT, v4f32:$FRB) */]>;
- def QVFMRb : XForm_19<4, 72,
- (outs qbrc:$FRT), (ins qbrc:$FRB),
- "qvfmr $FRT, $FRB", IIC_VecPerm,
- [/* (set v4i1:$FRT, v4i1:$FRB) */]>;
- }
- def QVFNEG : XForm_19<4, 40,
- (outs qfrc:$FRT), (ins qfrc:$FRB),
- "qvfneg $FRT, $FRB", IIC_VecPerm,
- [(set v4f64:$FRT, (fneg v4f64:$FRB))]>;
- let isCodeGenOnly = 1 in
- def QVFNEGs : XForm_19<4, 40,
- (outs qsrc:$FRT), (ins qsrc:$FRB),
- "qvfneg $FRT, $FRB", IIC_VecPerm,
- [(set v4f32:$FRT, (fneg v4f32:$FRB))]>;
- def QVFABS : XForm_19<4, 264,
- (outs qfrc:$FRT), (ins qfrc:$FRB),
- "qvfabs $FRT, $FRB", IIC_VecPerm,
- [(set v4f64:$FRT, (fabs v4f64:$FRB))]>;
- let isCodeGenOnly = 1 in
- def QVFABSs : XForm_19<4, 264,
- (outs qsrc:$FRT), (ins qsrc:$FRB),
- "qvfabs $FRT, $FRB", IIC_VecPerm,
- [(set v4f32:$FRT, (fabs v4f32:$FRB))]>;
- def QVFNABS : XForm_19<4, 136,
- (outs qfrc:$FRT), (ins qfrc:$FRB),
- "qvfnabs $FRT, $FRB", IIC_VecPerm,
- [(set v4f64:$FRT, (fneg (fabs v4f64:$FRB)))]>;
- let isCodeGenOnly = 1 in
- def QVFNABSs : XForm_19<4, 136,
- (outs qsrc:$FRT), (ins qsrc:$FRB),
- "qvfnabs $FRT, $FRB", IIC_VecPerm,
- [(set v4f32:$FRT, (fneg (fabs v4f32:$FRB)))]>;
- def QVFCPSGN : XForm_18<4, 8,
- (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
- "qvfcpsgn $FRT, $FRA, $FRB", IIC_VecPerm,
- [(set v4f64:$FRT, (fcopysign v4f64:$FRB, v4f64:$FRA))]>;
- let isCodeGenOnly = 1 in
- def QVFCPSGNs : XForm_18<4, 8,
- (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
- "qvfcpsgn $FRT, $FRA, $FRB", IIC_VecPerm,
- [(set v4f32:$FRT, (fcopysign v4f32:$FRB, v4f32:$FRA))]>;
-
- def QVALIGNI : Z23Form_1<4, 5,
- (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, u2imm:$idx),
- "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm,
- [(set v4f64:$FRT,
- (PPCqvaligni v4f64:$FRA, v4f64:$FRB,
- (i32 imm:$idx)))]>;
- let isCodeGenOnly = 1 in
- def QVALIGNIs : Z23Form_1<4, 5,
- (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, u2imm:$idx),
- "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm,
- [(set v4f32:$FRT,
- (PPCqvaligni v4f32:$FRA, v4f32:$FRB,
- (i32 imm:$idx)))]>;
- let isCodeGenOnly = 1 in
- def QVALIGNIb : Z23Form_1<4, 5,
- (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u2imm:$idx),
- "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm,
- [(set v4i1:$FRT,
- (PPCqvaligni v4i1:$FRA, v4i1:$FRB,
- (i32 imm:$idx)))]>;
-
- def QVESPLATI : Z23Form_2<4, 37,
- (outs qfrc:$FRT), (ins qfrc:$FRA, u2imm:$idx),
- "qvesplati $FRT, $FRA, $idx", IIC_VecPerm,
- [(set v4f64:$FRT,
- (PPCqvesplati v4f64:$FRA, (i32 imm:$idx)))]>;
- let isCodeGenOnly = 1 in
- def QVESPLATIs : Z23Form_2<4, 37,
- (outs qsrc:$FRT), (ins qsrc:$FRA, u2imm:$idx),
- "qvesplati $FRT, $FRA, $idx", IIC_VecPerm,
- [(set v4f32:$FRT,
- (PPCqvesplati v4f32:$FRA, (i32 imm:$idx)))]>;
- let isCodeGenOnly = 1 in
- def QVESPLATIb : Z23Form_2<4, 37,
- (outs qbrc:$FRT), (ins qbrc:$FRA, u2imm:$idx),
- "qvesplati $FRT, $FRA, $idx", IIC_VecPerm,
- [(set v4i1:$FRT,
- (PPCqvesplati v4i1:$FRA, (i32 imm:$idx)))]>;
-
- def QVFPERM : AForm_1<4, 6,
- (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
- "qvfperm $FRT, $FRA, $FRB, $FRC", IIC_VecPerm,
- [(set v4f64:$FRT,
- (PPCqvfperm v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>;
- let isCodeGenOnly = 1 in
- def QVFPERMs : AForm_1<4, 6,
- (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qfrc:$FRC),
- "qvfperm $FRT, $FRA, $FRB, $FRC", IIC_VecPerm,
- [(set v4f32:$FRT,
- (PPCqvfperm v4f32:$FRA, v4f32:$FRB, v4f64:$FRC))]>;
-
- let isReMaterializable = 1, isAsCheapAsAMove = 1 in
- def QVGPCI : Z23Form_3<4, 133,
- (outs qfrc:$FRT), (ins u12imm:$idx),
- "qvgpci $FRT, $idx", IIC_VecPerm,
- [(set v4f64:$FRT, (PPCqvgpci (u12:$idx)))]>;
-
- // Compare Instruction
- let isCodeGenOnly = 1 in
- def QVFTSTNAN : QPXX18_Int<4, 64, "qvftstnan", int_ppc_qpx_qvftstnan>;
- def QVFTSTNANb : XForm_18<4, 64, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
- "qvftstnan $FRT, $FRA, $FRB", IIC_FPCompare,
- [(set v4i1:$FRT,
- (setcc v4f64:$FRA, v4f64:$FRB, SETUO))]>;
- let isCodeGenOnly = 1 in
- def QVFTSTNANbs : XForm_18<4, 64, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
- "qvftstnan $FRT, $FRA, $FRB", IIC_FPCompare,
- [(set v4i1:$FRT,
- (setcc v4f32:$FRA, v4f32:$FRB, SETUO))]>;
- let isCodeGenOnly = 1 in
- def QVFCMPLT : QPXX18_Int<4, 96, "qvfcmplt", int_ppc_qpx_qvfcmplt>;
- def QVFCMPLTb : XForm_18<4, 96, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
- "qvfcmplt $FRT, $FRA, $FRB", IIC_FPCompare,
- [(set v4i1:$FRT,
- (setcc v4f64:$FRA, v4f64:$FRB, SETOLT))]>;
- let isCodeGenOnly = 1 in
- def QVFCMPLTbs : XForm_18<4, 96, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
- "qvfcmplt $FRT, $FRA, $FRB", IIC_FPCompare,
- [(set v4i1:$FRT,
- (setcc v4f32:$FRA, v4f32:$FRB, SETOLT))]>;
- let isCodeGenOnly = 1 in
- def QVFCMPGT : QPXX18_Int<4, 32, "qvfcmpgt", int_ppc_qpx_qvfcmpgt>;
- def QVFCMPGTb : XForm_18<4, 32, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
- "qvfcmpgt $FRT, $FRA, $FRB", IIC_FPCompare,
- [(set v4i1:$FRT,
- (setcc v4f64:$FRA, v4f64:$FRB, SETOGT))]>;
- let isCodeGenOnly = 1 in
- def QVFCMPGTbs : XForm_18<4, 32, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
- "qvfcmpgt $FRT, $FRA, $FRB", IIC_FPCompare,
- [(set v4i1:$FRT,
- (setcc v4f32:$FRA, v4f32:$FRB, SETOGT))]>;
- let isCodeGenOnly = 1 in
- def QVFCMPEQ : QPXX18_Int<4, 0, "qvfcmpeq", int_ppc_qpx_qvfcmpeq>;
- def QVFCMPEQb : XForm_18<4, 0, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
- "qvfcmpeq $FRT, $FRA, $FRB", IIC_FPCompare,
- [(set v4i1:$FRT,
- (setcc v4f64:$FRA, v4f64:$FRB, SETOEQ))]>;
- let isCodeGenOnly = 1 in
- def QVFCMPEQbs : XForm_18<4, 0, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
- "qvfcmpeq $FRT, $FRA, $FRB", IIC_FPCompare,
- [(set v4i1:$FRT,
- (setcc v4f32:$FRA, v4f32:$FRB, SETOEQ))]>;
-
- let isCodeGenOnly = 1 in
- def QVFLOGICAL : XForm_20<4, 4,
- (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, u12imm:$tttt),
- "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>;
- def QVFLOGICALb : XForm_20<4, 4,
- (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u12imm:$tttt),
- "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>;
- let isCodeGenOnly = 1 in
- def QVFLOGICALs : XForm_20<4, 4,
- (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u12imm:$tttt),
- "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>;
-
- // Load indexed instructions
- let mayLoad = 1 in {
- def QVLFDX : XForm_1_memOp<31, 583,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlfdx $FRT, $src", IIC_LdStLFD,
- [(set v4f64:$FRT, (load xoaddr:$src))]>;
- let isCodeGenOnly = 1 in
- def QVLFDXb : XForm_1_memOp<31, 583,
- (outs qbrc:$FRT), (ins memrr:$src),
- "qvlfdx $FRT, $src", IIC_LdStLFD, []>;
-
- let RC = 1 in
- def QVLFDXA : XForm_1<31, 583,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlfdxa $FRT, $src", IIC_LdStLFD, []>;
-
- def QVLFDUX : XForm_1<31, 615,
- (outs qfrc:$FRT, ptr_rc_nor0:$ea_result),
- (ins memrr:$src),
- "qvlfdux $FRT, $src", IIC_LdStLFDU, []>,
- RegConstraint<"$src.ptrreg = $ea_result">,
- NoEncode<"$ea_result">;
- let RC = 1 in
- def QVLFDUXA : XForm_1<31, 615,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlfduxa $FRT, $src", IIC_LdStLFD, []>;
-
- def QVLFSX : XForm_1_memOp<31, 519,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlfsx $FRT, $src", IIC_LdStLFD,
- [(set v4f64:$FRT, (extloadv4f32 xoaddr:$src))]>;
-
- let isCodeGenOnly = 1 in
- def QVLFSXb : XForm_1<31, 519,
- (outs qbrc:$FRT), (ins memrr:$src),
- "qvlfsx $FRT, $src", IIC_LdStLFD,
- [(set v4i1:$FRT, (PPCqvlfsb xoaddr:$src))]>;
- let isCodeGenOnly = 1 in
- def QVLFSXs : XForm_1_memOp<31, 519,
- (outs qsrc:$FRT), (ins memrr:$src),
- "qvlfsx $FRT, $src", IIC_LdStLFD,
- [(set v4f32:$FRT, (load xoaddr:$src))]>;
-
- let RC = 1 in
- def QVLFSXA : XForm_1<31, 519,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlfsxa $FRT, $src", IIC_LdStLFD, []>;
-
- def QVLFSUX : XForm_1<31, 551,
- (outs qsrc:$FRT, ptr_rc_nor0:$ea_result),
- (ins memrr:$src),
- "qvlfsux $FRT, $src", IIC_LdStLFDU, []>,
- RegConstraint<"$src.ptrreg = $ea_result">,
- NoEncode<"$ea_result">;
-
- let RC = 1 in
- def QVLFSUXA : XForm_1<31, 551,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlfsuxa $FRT, $src", IIC_LdStLFD, []>;
-
- def QVLFCDX : XForm_1<31, 71,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlfcdx $FRT, $src", IIC_LdStLFD, []>;
- let RC = 1 in
- def QVLFCDXA : XForm_1<31, 71,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlfcdxa $FRT, $src", IIC_LdStLFD, []>;
-
- def QVLFCDUX : XForm_1<31, 103,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlfcdux $FRT, $src", IIC_LdStLFD, []>;
- let RC = 1 in
- def QVLFCDUXA : XForm_1<31, 103,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlfcduxa $FRT, $src", IIC_LdStLFD, []>;
-
- def QVLFCSX : XForm_1<31, 7,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlfcsx $FRT, $src", IIC_LdStLFD, []>;
- let isCodeGenOnly = 1 in
- def QVLFCSXs : XForm_1<31, 7,
- (outs qsrc:$FRT), (ins memrr:$src),
- "qvlfcsx $FRT, $src", IIC_LdStLFD, []>;
-
- let RC = 1 in
- def QVLFCSXA : XForm_1<31, 7,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlfcsxa $FRT, $src", IIC_LdStLFD, []>;
-
- def QVLFCSUX : XForm_1<31, 39,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlfcsux $FRT, $src", IIC_LdStLFD, []>;
- let RC = 1 in
- def QVLFCSUXA : XForm_1<31, 39,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlfcsuxa $FRT, $src", IIC_LdStLFD, []>;
-
- def QVLFIWAX : XForm_1<31, 871,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlfiwax $FRT, $src", IIC_LdStLFD, []>;
- let RC = 1 in
- def QVLFIWAXA : XForm_1<31, 871,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlfiwaxa $FRT, $src", IIC_LdStLFD, []>;
-
- def QVLFIWZX : XForm_1<31, 839,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlfiwzx $FRT, $src", IIC_LdStLFD, []>;
- let RC = 1 in
- def QVLFIWZXA : XForm_1<31, 839,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlfiwzxa $FRT, $src", IIC_LdStLFD, []>;
- }
-
-
- def QVLPCLDX : XForm_1<31, 582,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlpcldx $FRT, $src", IIC_LdStLFD, []>;
- def QVLPCLSX : XForm_1<31, 518,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlpclsx $FRT, $src", IIC_LdStLFD, []>;
- let isCodeGenOnly = 1 in
- def QVLPCLSXint : XForm_11<31, 518,
- (outs qfrc:$FRT), (ins G8RC:$src),
- "qvlpclsx $FRT, 0, $src", IIC_LdStLFD, []>;
- def QVLPCRDX : XForm_1<31, 70,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlpcrdx $FRT, $src", IIC_LdStLFD, []>;
- def QVLPCRSX : XForm_1<31, 6,
- (outs qfrc:$FRT), (ins memrr:$src),
- "qvlpcrsx $FRT, $src", IIC_LdStLFD, []>;
-
- // Store indexed instructions
- let mayStore = 1 in {
- def QVSTFDX : XForm_8_memOp<31, 711,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfdx $FRT, $dst", IIC_LdStSTFD,
- [(store qfrc:$FRT, xoaddr:$dst)]>;
- let isCodeGenOnly = 1 in
- def QVSTFDXb : XForm_8_memOp<31, 711,
- (outs), (ins qbrc:$FRT, memrr:$dst),
- "qvstfdx $FRT, $dst", IIC_LdStSTFD, []>;
-
- let RC = 1 in
- def QVSTFDXA : XForm_8<31, 711,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfdxa $FRT, $dst", IIC_LdStSTFD, []>;
-
- def QVSTFDUX : XForm_8<31, 743, (outs ptr_rc_nor0:$ea_res),
- (ins qfrc:$FRT, memrr:$dst),
- "qvstfdux $FRT, $dst", IIC_LdStSTFDU, []>,
- RegConstraint<"$dst.ptrreg = $ea_res">,
- NoEncode<"$ea_res">;
-
- let RC = 1 in
- def QVSTFDUXA : XForm_8<31, 743,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfduxa $FRT, $dst", IIC_LdStSTFD, []>;
-
- def QVSTFDXI : XForm_8<31, 709,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfdxi $FRT, $dst", IIC_LdStSTFD, []>;
- let RC = 1 in
- def QVSTFDXIA : XForm_8<31, 709,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfdxia $FRT, $dst", IIC_LdStSTFD, []>;
-
- def QVSTFDUXI : XForm_8<31, 741,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfduxi $FRT, $dst", IIC_LdStSTFD, []>;
- let RC = 1 in
- def QVSTFDUXIA : XForm_8<31, 741,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfduxia $FRT, $dst", IIC_LdStSTFD, []>;
-
- def QVSTFSX : XForm_8_memOp<31, 647,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfsx $FRT, $dst", IIC_LdStSTFD,
- [(truncstorev4f32 qfrc:$FRT, xoaddr:$dst)]>;
- let isCodeGenOnly = 1 in
- def QVSTFSXs : XForm_8_memOp<31, 647,
- (outs), (ins qsrc:$FRT, memrr:$dst),
- "qvstfsx $FRT, $dst", IIC_LdStSTFD,
- [(store qsrc:$FRT, xoaddr:$dst)]>;
-
- let RC = 1 in
- def QVSTFSXA : XForm_8<31, 647,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfsxa $FRT, $dst", IIC_LdStSTFD, []>;
-
- def QVSTFSUX : XForm_8<31, 679, (outs ptr_rc_nor0:$ea_res),
- (ins qsrc:$FRT, memrr:$dst),
- "qvstfsux $FRT, $dst", IIC_LdStSTFDU, []>,
- RegConstraint<"$dst.ptrreg = $ea_res">,
- NoEncode<"$ea_res">;
- let isCodeGenOnly = 1 in
- def QVSTFSUXs: XForm_8<31, 679, (outs ptr_rc_nor0:$ea_res),
- (ins qfrc:$FRT, memrr:$dst),
- "qvstfsux $FRT, $dst", IIC_LdStSTFDU, []>,
- RegConstraint<"$dst.ptrreg = $ea_res">,
- NoEncode<"$ea_res">;
-
- let RC = 1 in
- def QVSTFSUXA : XForm_8<31, 679,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfsuxa $FRT, $dst", IIC_LdStSTFD, []>;
-
- def QVSTFSXI : XForm_8<31, 645,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfsxi $FRT, $dst", IIC_LdStSTFD, []>;
- let RC = 1 in
- def QVSTFSXIA : XForm_8<31, 645,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfsxia $FRT, $dst", IIC_LdStSTFD, []>;
-
- def QVSTFSUXI : XForm_8<31, 677,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfsuxi $FRT, $dst", IIC_LdStSTFD, []>;
- let RC = 1 in
- def QVSTFSUXIA : XForm_8<31, 677,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfsuxia $FRT, $dst", IIC_LdStSTFD, []>;
-
- def QVSTFCDX : XForm_8<31, 199,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfcdx $FRT, $dst", IIC_LdStSTFD, []>;
- let RC = 1 in
- def QVSTFCDXA : XForm_8<31, 199,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfcdxa $FRT, $dst", IIC_LdStSTFD, []>;
-
- def QVSTFCSX : XForm_8<31, 135,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfcsx $FRT, $dst", IIC_LdStSTFD, []>;
- let isCodeGenOnly = 1 in
- def QVSTFCSXs : XForm_8<31, 135,
- (outs), (ins qsrc:$FRT, memrr:$dst),
- "qvstfcsx $FRT, $dst", IIC_LdStSTFD, []>;
-
- let RC = 1 in
- def QVSTFCSXA : XForm_8<31, 135,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfcsxa $FRT, $dst", IIC_LdStSTFD, []>;
-
- def QVSTFCDUX : XForm_8<31, 231,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfcdux $FRT, $dst", IIC_LdStSTFD, []>;
- let RC = 1 in
- def QVSTFCDUXA : XForm_8<31, 231,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfcduxa $FRT, $dst", IIC_LdStSTFD, []>;
-
- def QVSTFCSUX : XForm_8<31, 167,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfcsux $FRT, $dst", IIC_LdStSTFD, []>;
- let RC = 1 in
- def QVSTFCSUXA : XForm_8<31, 167,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfcsuxa $FRT, $dst", IIC_LdStSTFD, []>;
-
- def QVSTFCDXI : XForm_8<31, 197,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfcdxi $FRT, $dst", IIC_LdStSTFD, []>;
- let RC = 1 in
- def QVSTFCDXIA : XForm_8<31, 197,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfcdxia $FRT, $dst", IIC_LdStSTFD, []>;
-
- def QVSTFCSXI : XForm_8<31, 133,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfcsxi $FRT, $dst", IIC_LdStSTFD, []>;
- let RC = 1 in
- def QVSTFCSXIA : XForm_8<31, 133,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfcsxia $FRT, $dst", IIC_LdStSTFD, []>;
-
- def QVSTFCDUXI : XForm_8<31, 229,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfcduxi $FRT, $dst", IIC_LdStSTFD, []>;
- let RC = 1 in
- def QVSTFCDUXIA : XForm_8<31, 229,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfcduxia $FRT, $dst", IIC_LdStSTFD, []>;
-
- def QVSTFCSUXI : XForm_8<31, 165,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfcsuxi $FRT, $dst", IIC_LdStSTFD, []>;
- let RC = 1 in
- def QVSTFCSUXIA : XForm_8<31, 165,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfcsuxia $FRT, $dst", IIC_LdStSTFD, []>;
-
- def QVSTFIWX : XForm_8<31, 967,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfiwx $FRT, $dst", IIC_LdStSTFD, []>;
- let RC = 1 in
- def QVSTFIWXA : XForm_8<31, 967,
- (outs), (ins qfrc:$FRT, memrr:$dst),
- "qvstfiwxa $FRT, $dst", IIC_LdStSTFD, []>;
- }
-}
-
-} // neverHasSideEffects
-}
-
-def : InstAlias<"qvfclr $FRT",
- (QVFLOGICALb qbrc:$FRT, qbrc:$FRT, qbrc:$FRT, 0)>;
-def : InstAlias<"qvfand $FRT, $FRA, $FRB",
- (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 1)>;
-def : InstAlias<"qvfandc $FRT, $FRA, $FRB",
- (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 4)>;
-def : InstAlias<"qvfctfb $FRT, $FRA",
- (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRA, 5)>;
-def : InstAlias<"qvfxor $FRT, $FRA, $FRB",
- (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 6)>;
-def : InstAlias<"qvfor $FRT, $FRA, $FRB",
- (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 7)>;
-def : InstAlias<"qvfnor $FRT, $FRA, $FRB",
- (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 8)>;
-def : InstAlias<"qvfequ $FRT, $FRA, $FRB",
- (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 9)>;
-def : InstAlias<"qvfnot $FRT, $FRA",
- (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRA, 10)>;
-def : InstAlias<"qvforc $FRT, $FRA, $FRB",
- (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 13)>;
-def : InstAlias<"qvfnand $FRT, $FRA, $FRB",
- (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 14)>;
-def : InstAlias<"qvfset $FRT",
- (QVFLOGICALb qbrc:$FRT, qbrc:$FRT, qbrc:$FRT, 15)>;
-
-//===----------------------------------------------------------------------===//
-// Additional QPX Patterns
-//
-
-def : Pat<(v4f64 (scalar_to_vector f64:$A)),
- (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), $A, sub_64)>;
-def : Pat<(v4f32 (scalar_to_vector f32:$A)),
- (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $A, sub_64)>;
-
-def : Pat<(f64 (extractelt v4f64:$S, 0)),
- (EXTRACT_SUBREG $S, sub_64)>;
-def : Pat<(f32 (extractelt v4f32:$S, 0)),
- (EXTRACT_SUBREG $S, sub_64)>;
-
-def : Pat<(f64 (extractelt v4f64:$S, 1)),
- (EXTRACT_SUBREG (QVESPLATI $S, 1), sub_64)>;
-def : Pat<(f64 (extractelt v4f64:$S, 2)),
- (EXTRACT_SUBREG (QVESPLATI $S, 2), sub_64)>;
-def : Pat<(f64 (extractelt v4f64:$S, 3)),
- (EXTRACT_SUBREG (QVESPLATI $S, 3), sub_64)>;
-
-def : Pat<(f32 (extractelt v4f32:$S, 1)),
- (EXTRACT_SUBREG (QVESPLATIs $S, 1), sub_64)>;
-def : Pat<(f32 (extractelt v4f32:$S, 2)),
- (EXTRACT_SUBREG (QVESPLATIs $S, 2), sub_64)>;
-def : Pat<(f32 (extractelt v4f32:$S, 3)),
- (EXTRACT_SUBREG (QVESPLATIs $S, 3), sub_64)>;
-
-def : Pat<(f64 (extractelt v4f64:$S, i64:$F)),
- (EXTRACT_SUBREG (QVFPERM $S, $S,
- (QVLPCLSXint (RLDICR $F, 2,
- /* 63-2 = */ 61))),
- sub_64)>;
-def : Pat<(f32 (extractelt v4f32:$S, i64:$F)),
- (EXTRACT_SUBREG (QVFPERMs $S, $S,
- (QVLPCLSXint (RLDICR $F, 2,
- /* 63-2 = */ 61))),
- sub_64)>;
-
-def : Pat<(int_ppc_qpx_qvfperm v4f64:$A, v4f64:$B, v4f64:$C),
- (QVFPERM $A, $B, $C)>;
-
-def : Pat<(int_ppc_qpx_qvfcpsgn v4f64:$A, v4f64:$B),
- (QVFCPSGN $A, $B)>;
-
-// FCOPYSIGN's operand types need not agree.
-def : Pat<(fcopysign v4f64:$frB, v4f32:$frA),
- (QVFCPSGN (COPY_TO_REGCLASS $frA, QFRC), $frB)>;
-def : Pat<(fcopysign QSRC:$frB, QFRC:$frA),
- (QVFCPSGNs (COPY_TO_REGCLASS $frA, QSRC), $frB)>;
-
-def : Pat<(int_ppc_qpx_qvfneg v4f64:$A), (QVFNEG $A)>;
-def : Pat<(int_ppc_qpx_qvfabs v4f64:$A), (QVFABS $A)>;
-def : Pat<(int_ppc_qpx_qvfnabs v4f64:$A), (QVFNABS $A)>;
-
-def : Pat<(int_ppc_qpx_qvfriz v4f64:$A), (QVFRIZ $A)>;
-def : Pat<(int_ppc_qpx_qvfrin v4f64:$A), (QVFRIN $A)>;
-def : Pat<(int_ppc_qpx_qvfrip v4f64:$A), (QVFRIP $A)>;
-def : Pat<(int_ppc_qpx_qvfrim v4f64:$A), (QVFRIM $A)>;
-
-def : Pat<(int_ppc_qpx_qvfre v4f64:$A), (QVFRE $A)>;
-def : Pat<(int_ppc_qpx_qvfrsqrte v4f64:$A), (QVFRSQRTE $A)>;
-
-def : Pat<(int_ppc_qpx_qvfadd v4f64:$A, v4f64:$B),
- (QVFADD $A, $B)>;
-def : Pat<(int_ppc_qpx_qvfsub v4f64:$A, v4f64:$B),
- (QVFSUB $A, $B)>;
-def : Pat<(int_ppc_qpx_qvfmul v4f64:$A, v4f64:$B),
- (QVFMUL $A, $B)>;
-
-// Additional QVFNMSUB patterns: -a*c + b == -(a*c - b)
-def : Pat<(fma (fneg v4f64:$A), v4f64:$C, v4f64:$B),
- (QVFNMSUB $A, $C, $B)>;
-def : Pat<(fma v4f64:$A, (fneg v4f64:$C), v4f64:$B),
- (QVFNMSUB $A, $C, $B)>;
-def : Pat<(fma (fneg v4f32:$A), v4f32:$C, v4f32:$B),
- (QVFNMSUBSs $A, $C, $B)>;
-def : Pat<(fma v4f32:$A, (fneg v4f32:$C), v4f32:$B),
- (QVFNMSUBSs $A, $C, $B)>;
-
-def : Pat<(int_ppc_qpx_qvfmadd v4f64:$A, v4f64:$B, v4f64:$C),
- (QVFMADD $A, $B, $C)>;
-def : Pat<(int_ppc_qpx_qvfnmadd v4f64:$A, v4f64:$B, v4f64:$C),
- (QVFNMADD $A, $B, $C)>;
-def : Pat<(int_ppc_qpx_qvfmsub v4f64:$A, v4f64:$B, v4f64:$C),
- (QVFMSUB $A, $B, $C)>;
-def : Pat<(int_ppc_qpx_qvfnmsub v4f64:$A, v4f64:$B, v4f64:$C),
- (QVFNMSUB $A, $B, $C)>;
-
-def : Pat<(int_ppc_qpx_qvlfd xoaddr:$src),
- (QVLFDX xoaddr:$src)>;
-def : Pat<(int_ppc_qpx_qvlfda xoaddr:$src),
- (QVLFDXA xoaddr:$src)>;
-def : Pat<(int_ppc_qpx_qvlfs xoaddr:$src),
- (QVLFSX xoaddr:$src)>;
-def : Pat<(int_ppc_qpx_qvlfsa xoaddr:$src),
- (QVLFSXA xoaddr:$src)>;
-def : Pat<(int_ppc_qpx_qvlfcda xoaddr:$src),
- (QVLFCDXA xoaddr:$src)>;
-def : Pat<(int_ppc_qpx_qvlfcd xoaddr:$src),
- (QVLFCDX xoaddr:$src)>;
-def : Pat<(int_ppc_qpx_qvlfcsa xoaddr:$src),
- (QVLFCSXA xoaddr:$src)>;
-def : Pat<(int_ppc_qpx_qvlfcs xoaddr:$src),
- (QVLFCSX xoaddr:$src)>;
-def : Pat<(int_ppc_qpx_qvlfda xoaddr:$src),
- (QVLFDXA xoaddr:$src)>;
-def : Pat<(int_ppc_qpx_qvlfiwaa xoaddr:$src),
- (QVLFIWAXA xoaddr:$src)>;
-def : Pat<(int_ppc_qpx_qvlfiwa xoaddr:$src),
- (QVLFIWAX xoaddr:$src)>;
-def : Pat<(int_ppc_qpx_qvlfiwza xoaddr:$src),
- (QVLFIWZXA xoaddr:$src)>;
-def : Pat<(int_ppc_qpx_qvlfiwz xoaddr:$src),
- (QVLFIWZX xoaddr:$src)>;
-def : Pat<(int_ppc_qpx_qvlfsa xoaddr:$src),
- (QVLFSXA xoaddr:$src)>;
-def : Pat<(int_ppc_qpx_qvlpcld xoaddr:$src),
- (QVLPCLDX xoaddr:$src)>;
-def : Pat<(int_ppc_qpx_qvlpcls xoaddr:$src),
- (QVLPCLSX xoaddr:$src)>;
-def : Pat<(int_ppc_qpx_qvlpcrd xoaddr:$src),
- (QVLPCRDX xoaddr:$src)>;
-def : Pat<(int_ppc_qpx_qvlpcrs xoaddr:$src),
- (QVLPCRSX xoaddr:$src)>;
-
-def : Pat<(int_ppc_qpx_qvstfd v4f64:$T, xoaddr:$dst),
- (QVSTFDX $T, xoaddr:$dst)>;
-def : Pat<(int_ppc_qpx_qvstfs v4f64:$T, xoaddr:$dst),
- (QVSTFSX $T, xoaddr:$dst)>;
-def : Pat<(int_ppc_qpx_qvstfcda v4f64:$T, xoaddr:$dst),
- (QVSTFCDXA $T, xoaddr:$dst)>;
-def : Pat<(int_ppc_qpx_qvstfcd v4f64:$T, xoaddr:$dst),
- (QVSTFCDX $T, xoaddr:$dst)>;
-def : Pat<(int_ppc_qpx_qvstfcsa v4f64:$T, xoaddr:$dst),
- (QVSTFCSXA $T, xoaddr:$dst)>;
-def : Pat<(int_ppc_qpx_qvstfcs v4f64:$T, xoaddr:$dst),
- (QVSTFCSX $T, xoaddr:$dst)>;
-def : Pat<(int_ppc_qpx_qvstfda v4f64:$T, xoaddr:$dst),
- (QVSTFDXA $T, xoaddr:$dst)>;
-def : Pat<(int_ppc_qpx_qvstfiwa v4f64:$T, xoaddr:$dst),
- (QVSTFIWXA $T, xoaddr:$dst)>;
-def : Pat<(int_ppc_qpx_qvstfiw v4f64:$T, xoaddr:$dst),
- (QVSTFIWX $T, xoaddr:$dst)>;
-def : Pat<(int_ppc_qpx_qvstfsa v4f64:$T, xoaddr:$dst),
- (QVSTFSXA $T, xoaddr:$dst)>;
-
-def : Pat<(pre_store v4f64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
- (QVSTFDUX $rS, $ptrreg, $ptroff)>;
-def : Pat<(pre_store v4f32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
- (QVSTFSUX $rS, $ptrreg, $ptroff)>;
-def : Pat<(pre_truncstv4f32 v4f64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
- (QVSTFSUXs $rS, $ptrreg, $ptroff)>;
-
-def : Pat<(int_ppc_qpx_qvflogical v4f64:$A, v4f64:$B, (i32 imm:$idx)),
- (QVFLOGICAL $A, $B, imm:$idx)>;
-def : Pat<(int_ppc_qpx_qvgpci (u12:$idx)),
- (QVGPCI imm:$idx)>;
-
-def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETOGE),
- (QVFLOGICALb (QVFCMPLTb $FRA, $FRB),
- (QVFTSTNANb $FRA, $FRB), (i32 8))>;
-def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETOLE),
- (QVFLOGICALb (QVFCMPGTb $FRA, $FRB),
- (QVFTSTNANb $FRA, $FRB), (i32 8))>;
-def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETONE),
- (QVFLOGICALb (QVFCMPEQb $FRA, $FRB),
- (QVFTSTNANb $FRA, $FRB), (i32 8))>;
-def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETO),
- (QVFLOGICALb (QVFTSTNANb $FRA, $FRB),
- (QVFTSTNANb $FRA, $FRB), (i32 10))>;
-def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUEQ),
- (QVFLOGICALb (QVFCMPEQb $FRA, $FRB),
- (QVFTSTNANb $FRA, $FRB), (i32 7))>;
-def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUGT),
- (QVFLOGICALb (QVFCMPGTb $FRA, $FRB),
- (QVFTSTNANb $FRA, $FRB), (i32 7))>;
-def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUGE),
- (QVFLOGICALb (QVFTSTNANb $FRA, $FRB),
- (QVFCMPLTb $FRA, $FRB), (i32 13))>;
-def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETULT),
- (QVFLOGICALb (QVFCMPLTb $FRA, $FRB),
- (QVFTSTNANb $FRA, $FRB), (i32 7))>;
-def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETULE),
- (QVFLOGICALb (QVFTSTNANb $FRA, $FRB),
- (QVFCMPGTb $FRA, $FRB), (i32 13))>;
-def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUNE),
- (QVFLOGICALb (QVFTSTNANb $FRA, $FRB),
- (QVFCMPEQb $FRA, $FRB), (i32 13))>;
-
-def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETEQ),
- (QVFCMPEQb $FRA, $FRB)>;
-def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETGT),
- (QVFCMPGTb $FRA, $FRB)>;
-def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETGE),
- (QVFLOGICALb (QVFCMPLTb $FRA, $FRB),
- (QVFCMPLTb $FRA, $FRB), (i32 10))>;
-def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETLT),
- (QVFCMPLTb $FRA, $FRB)>;
-def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETLE),
- (QVFLOGICALb (QVFCMPGTb $FRA, $FRB),
- (QVFCMPGTb $FRA, $FRB), (i32 10))>;
-def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETNE),
- (QVFLOGICALb (QVFCMPEQb $FRA, $FRB),
- (QVFCMPEQb $FRA, $FRB), (i32 10))>;
-
-def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETOGE),
- (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB),
- (QVFTSTNANbs $FRA, $FRB), (i32 8))>;
-def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETOLE),
- (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB),
- (QVFTSTNANbs $FRA, $FRB), (i32 8))>;
-def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETONE),
- (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB),
- (QVFTSTNANbs $FRA, $FRB), (i32 8))>;
-def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETO),
- (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB),
- (QVFTSTNANbs $FRA, $FRB), (i32 10))>;
-def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUEQ),
- (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB),
- (QVFTSTNANbs $FRA, $FRB), (i32 7))>;
-def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUGT),
- (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB),
- (QVFTSTNANbs $FRA, $FRB), (i32 7))>;
-def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUGE),
- (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB),
- (QVFCMPLTbs $FRA, $FRB), (i32 13))>;
-def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETULT),
- (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB),
- (QVFTSTNANbs $FRA, $FRB), (i32 7))>;
-def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETULE),
- (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB),
- (QVFCMPGTbs $FRA, $FRB), (i32 13))>;
-def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUNE),
- (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB),
- (QVFCMPEQbs $FRA, $FRB), (i32 13))>;
-
-def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETEQ),
- (QVFCMPEQbs $FRA, $FRB)>;
-def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETGT),
- (QVFCMPGTbs $FRA, $FRB)>;
-def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETGE),
- (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB),
- (QVFCMPLTbs $FRA, $FRB), (i32 10))>;
-def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETLT),
- (QVFCMPLTbs $FRA, $FRB)>;
-def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETLE),
- (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB),
- (QVFCMPGTbs $FRA, $FRB), (i32 10))>;
-def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETNE),
- (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB),
- (QVFCMPEQbs $FRA, $FRB), (i32 10))>;
-
-def : Pat<(and v4i1:$FRA, (not v4i1:$FRB)),
- (QVFLOGICALb $FRA, $FRB, (i32 4))>;
-def : Pat<(not (or v4i1:$FRA, v4i1:$FRB)),
- (QVFLOGICALb $FRA, $FRB, (i32 8))>;
-def : Pat<(not (xor v4i1:$FRA, v4i1:$FRB)),
- (QVFLOGICALb $FRA, $FRB, (i32 9))>;
-def : Pat<(or v4i1:$FRA, (not v4i1:$FRB)),
- (QVFLOGICALb $FRA, $FRB, (i32 13))>;
-def : Pat<(not (and v4i1:$FRA, v4i1:$FRB)),
- (QVFLOGICALb $FRA, $FRB, (i32 14))>;
-
-def : Pat<(and v4i1:$FRA, v4i1:$FRB),
- (QVFLOGICALb $FRA, $FRB, (i32 1))>;
-def : Pat<(or v4i1:$FRA, v4i1:$FRB),
- (QVFLOGICALb $FRA, $FRB, (i32 7))>;
-def : Pat<(xor v4i1:$FRA, v4i1:$FRB),
- (QVFLOGICALb $FRA, $FRB, (i32 6))>;
-def : Pat<(not v4i1:$FRA),
- (QVFLOGICALb $FRA, $FRA, (i32 10))>;
-
-def : Pat<(v4f64 (fpextend v4f32:$src)),
- (COPY_TO_REGCLASS $src, QFRC)>;
-
-def : Pat<(v4f32 (fround_exact v4f64:$src)),
- (COPY_TO_REGCLASS $src, QSRC)>;
-
-// Extract the underlying floating-point values from the
-// QPX (-1.0, 1.0) boolean representation.
-def : Pat<(v4f64 (PPCqbflt v4i1:$src)),
- (COPY_TO_REGCLASS $src, QFRC)>;
-
-def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETLT)),
- (SELECT_QFRC (CRANDC $lhs, $rhs), $tval, $fval)>;
-def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETULT)),
- (SELECT_QFRC (CRANDC $rhs, $lhs), $tval, $fval)>;
-def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETLE)),
- (SELECT_QFRC (CRORC $lhs, $rhs), $tval, $fval)>;
-def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETULE)),
- (SELECT_QFRC (CRORC $rhs, $lhs), $tval, $fval)>;
-def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETEQ)),
- (SELECT_QFRC (CREQV $lhs, $rhs), $tval, $fval)>;
-def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETGE)),
- (SELECT_QFRC (CRORC $rhs, $lhs), $tval, $fval)>;
-def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETUGE)),
- (SELECT_QFRC (CRORC $lhs, $rhs), $tval, $fval)>;
-def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETGT)),
- (SELECT_QFRC (CRANDC $rhs, $lhs), $tval, $fval)>;
-def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETUGT)),
- (SELECT_QFRC (CRANDC $lhs, $rhs), $tval, $fval)>;
-def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETNE)),
- (SELECT_QFRC (CRXOR $lhs, $rhs), $tval, $fval)>;
-
-def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETLT)),
- (SELECT_QSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
-def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETULT)),
- (SELECT_QSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
-def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETLE)),
- (SELECT_QSRC (CRORC $lhs, $rhs), $tval, $fval)>;
-def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETULE)),
- (SELECT_QSRC (CRORC $rhs, $lhs), $tval, $fval)>;
-def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETEQ)),
- (SELECT_QSRC (CREQV $lhs, $rhs), $tval, $fval)>;
-def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETGE)),
- (SELECT_QSRC (CRORC $rhs, $lhs), $tval, $fval)>;
-def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETUGE)),
- (SELECT_QSRC (CRORC $lhs, $rhs), $tval, $fval)>;
-def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETGT)),
- (SELECT_QSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
-def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETUGT)),
- (SELECT_QSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
-def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETNE)),
- (SELECT_QSRC (CRXOR $lhs, $rhs), $tval, $fval)>;
-
-def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETLT)),
- (SELECT_QBRC (CRANDC $lhs, $rhs), $tval, $fval)>;
-def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETULT)),
- (SELECT_QBRC (CRANDC $rhs, $lhs), $tval, $fval)>;
-def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETLE)),
- (SELECT_QBRC (CRORC $lhs, $rhs), $tval, $fval)>;
-def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETULE)),
- (SELECT_QBRC (CRORC $rhs, $lhs), $tval, $fval)>;
-def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETEQ)),
- (SELECT_QBRC (CREQV $lhs, $rhs), $tval, $fval)>;
-def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETGE)),
- (SELECT_QBRC (CRORC $rhs, $lhs), $tval, $fval)>;
-def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETUGE)),
- (SELECT_QBRC (CRORC $lhs, $rhs), $tval, $fval)>;
-def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETGT)),
- (SELECT_QBRC (CRANDC $rhs, $lhs), $tval, $fval)>;
-def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETUGT)),
- (SELECT_QBRC (CRANDC $lhs, $rhs), $tval, $fval)>;
-def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETNE)),
- (SELECT_QBRC (CRXOR $lhs, $rhs), $tval, $fval)>;
-
-} // end HasQPX
-
-let Predicates = [HasQPX, NoNaNsFPMath] in {
-def : Pat<(fminnum v4f64:$FRA, v4f64:$FRB),
- (QVFSELb (QVFCMPLTb $FRA, $FRB), $FRB, $FRA)>;
-def : Pat<(fmaxnum v4f64:$FRA, v4f64:$FRB),
- (QVFSELb (QVFCMPGTb $FRA, $FRB), $FRB, $FRA)>;
-
-def : Pat<(fminnum v4f32:$FRA, v4f32:$FRB),
- (QVFSELbs (QVFCMPLTbs $FRA, $FRB), $FRB, $FRA)>;
-def : Pat<(fmaxnum v4f32:$FRA, v4f32:$FRB),
- (QVFSELbs (QVFCMPGTbs $FRA, $FRB), $FRB, $FRA)>;
-}
-
-let Predicates = [HasQPX, NaNsFPMath] in {
-// When either of these operands is NaN, we should return the other operand.
-// QVFCMPLT/QVFCMPGT return false if either operand is NaN, which means we need
-// to explicitly or with a NaN test on the second operand.
-def : Pat<(fminnum v4f64:$FRA, v4f64:$FRB),
- (QVFSELb (QVFLOGICALb (QVFCMPLTb $FRA, $FRB),
- (QVFTSTNANb $FRB, $FRB), (i32 7)),
- $FRB, $FRA)>;
-def : Pat<(fmaxnum v4f64:$FRA, v4f64:$FRB),
- (QVFSELb (QVFLOGICALb (QVFCMPGTb $FRA, $FRB),
- (QVFTSTNANb $FRB, $FRB), (i32 7)),
- $FRB, $FRA)>;
-
-def : Pat<(fminnum v4f32:$FRA, v4f32:$FRB),
- (QVFSELbs (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB),
- (QVFTSTNANbs $FRB, $FRB), (i32 7)),
- $FRB, $FRA)>;
-def : Pat<(fmaxnum v4f32:$FRA, v4f32:$FRB),
- (QVFSELbs (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB),
- (QVFTSTNANbs $FRB, $FRB), (i32 7)),
- $FRB, $FRA)>;
-}
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrSPE.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrSPE.td
index 858eb0c9fe50..299b34ca8283 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrSPE.td
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrSPE.td
@@ -820,16 +820,6 @@ def SPESTWX : XForm_8<31, 151, (outs), (ins spe4rc:$rS, memrr:$dst),
} // HasSPE
let Predicates = [HasSPE] in {
-def : Pat<(f64 (extloadf32 iaddr:$src)),
- (COPY_TO_REGCLASS (SPELWZ iaddr:$src), SPERC)>;
-def : Pat<(f64 (extloadf32 xaddr:$src)),
- (COPY_TO_REGCLASS (SPELWZX xaddr:$src), SPERC)>;
-
-def : Pat<(f64 (fpextend f32:$src)),
- (COPY_TO_REGCLASS $src, SPERC)>;
-}
-
-let Predicates = [HasSPE] in {
def SELECT_CC_SPE4 : PPCCustomInserterPseudo<(outs spe4rc:$dst),
(ins crrc:$cond, spe4rc:$T, spe4rc:$F,
i32imm:$BROPC), "#SELECT_CC_SPE4",
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index 9ba5058a6f81..db6e00c71b89 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -145,6 +145,7 @@ def PPCSToV : SDNode<"PPCISD::SCALAR_TO_VECTOR_PERMUTED",
def HasVSX : Predicate<"Subtarget->hasVSX()">;
def IsLittleEndian : Predicate<"Subtarget->isLittleEndian()">;
def IsBigEndian : Predicate<"!Subtarget->isLittleEndian()">;
+def IsPPC64 : Predicate<"Subtarget->isPPC64()">;
def HasOnlySwappingMemOps : Predicate<"!Subtarget->hasP9Vector()">;
def HasP8Vector : Predicate<"Subtarget->hasP8Vector()">;
def HasDirectMove : Predicate<"Subtarget->hasDirectMove()">;
@@ -167,7 +168,7 @@ multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase,
def _rec : XX3Form_Rc<opcode, xo, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
!strconcat(asmbase, !strconcat(". ", asmstr)), itin,
[(set InTy:$XT,
- (InTy (PPCvcmp_o InTy:$XA, InTy:$XB, xo)))]>,
+ (InTy (PPCvcmp_rec InTy:$XA, InTy:$XB, xo)))]>,
isRecordForm;
}
}
@@ -362,7 +363,8 @@ let hasSideEffects = 0 in {
}
} // mayStore
- let Uses = [RM], mayRaiseFPException = 1 in {
+ let mayRaiseFPException = 1 in {
+ let Uses = [RM] in {
// Add/Mul Instructions
let isCommutable = 1 in {
def XSADDDP : XX3Form<60, 32,
@@ -622,12 +624,30 @@ let hasSideEffects = 0 in {
"xsrsqrtedp $XT, $XB", IIC_VecFP,
[(set f64:$XT, (PPCfrsqrte f64:$XB))]>;
+ let mayRaiseFPException = 0 in {
def XSTDIVDP : XX3Form_1<60, 61,
(outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
"xstdivdp $crD, $XA, $XB", IIC_FPCompare, []>;
def XSTSQRTDP : XX2Form_1<60, 106,
(outs crrc:$crD), (ins vsfrc:$XB),
- "xstsqrtdp $crD, $XB", IIC_FPCompare, []>;
+ "xstsqrtdp $crD, $XB", IIC_FPCompare,
+ [(set i32:$crD, (PPCftsqrt f64:$XB))]>;
+ def XVTDIVDP : XX3Form_1<60, 125,
+ (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB),
+ "xvtdivdp $crD, $XA, $XB", IIC_FPCompare, []>;
+ def XVTDIVSP : XX3Form_1<60, 93,
+ (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB),
+ "xvtdivsp $crD, $XA, $XB", IIC_FPCompare, []>;
+
+ def XVTSQRTDP : XX2Form_1<60, 234,
+ (outs crrc:$crD), (ins vsrc:$XB),
+ "xvtsqrtdp $crD, $XB", IIC_FPCompare,
+ [(set i32:$crD, (PPCftsqrt v2f64:$XB))]>;
+ def XVTSQRTSP : XX2Form_1<60, 170,
+ (outs crrc:$crD), (ins vsrc:$XB),
+ "xvtsqrtsp $crD, $XB", IIC_FPCompare,
+ [(set i32:$crD, (PPCftsqrt v4f32:$XB))]>;
+ }
def XVDIVDP : XX3Form<60, 120,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
@@ -647,20 +667,6 @@ let hasSideEffects = 0 in {
"xvsqrtsp $XT, $XB", IIC_FPSqrtS,
[(set v4f32:$XT, (any_fsqrt v4f32:$XB))]>;
- def XVTDIVDP : XX3Form_1<60, 125,
- (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB),
- "xvtdivdp $crD, $XA, $XB", IIC_FPCompare, []>;
- def XVTDIVSP : XX3Form_1<60, 93,
- (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB),
- "xvtdivsp $crD, $XA, $XB", IIC_FPCompare, []>;
-
- def XVTSQRTDP : XX2Form_1<60, 234,
- (outs crrc:$crD), (ins vsrc:$XB),
- "xvtsqrtdp $crD, $XB", IIC_FPCompare, []>;
- def XVTSQRTSP : XX2Form_1<60, 170,
- (outs crrc:$crD), (ins vsrc:$XB),
- "xvtsqrtsp $crD, $XB", IIC_FPCompare, []>;
-
def XVREDP : XX2Form<60, 218,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvredp $XT, $XB", IIC_VecFP,
@@ -707,6 +713,7 @@ let hasSideEffects = 0 in {
int_ppc_vsx_xvcmpgtsp, v4i32, v4f32>;
// Move Instructions
+ let mayRaiseFPException = 0 in {
def XSABSDP : XX2Form<60, 345,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xsabsdp $XT, $XB", IIC_VecFP,
@@ -760,6 +767,7 @@ let hasSideEffects = 0 in {
(outs vsrc:$XT), (ins vsrc:$XB),
"xvnegsp $XT, $XB", IIC_VecFP,
[(set v4f32:$XT, (fneg v4f32:$XB))]>;
+ }
// Conversion Instructions
def XSCVDPSP : XX2Form<60, 265,
@@ -768,50 +776,50 @@ let hasSideEffects = 0 in {
def XSCVDPSXDS : XX2Form<60, 344,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xscvdpsxds $XT, $XB", IIC_VecFP,
- [(set f64:$XT, (PPCfctidz f64:$XB))]>;
+ [(set f64:$XT, (PPCany_fctidz f64:$XB))]>;
let isCodeGenOnly = 1 in
def XSCVDPSXDSs : XX2Form<60, 344,
(outs vssrc:$XT), (ins vssrc:$XB),
"xscvdpsxds $XT, $XB", IIC_VecFP,
- [(set f32:$XT, (PPCfctidz f32:$XB))]>;
+ [(set f32:$XT, (PPCany_fctidz f32:$XB))]>;
def XSCVDPSXWS : XX2Form<60, 88,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xscvdpsxws $XT, $XB", IIC_VecFP,
- [(set f64:$XT, (PPCfctiwz f64:$XB))]>;
+ [(set f64:$XT, (PPCany_fctiwz f64:$XB))]>;
let isCodeGenOnly = 1 in
def XSCVDPSXWSs : XX2Form<60, 88,
(outs vssrc:$XT), (ins vssrc:$XB),
"xscvdpsxws $XT, $XB", IIC_VecFP,
- [(set f32:$XT, (PPCfctiwz f32:$XB))]>;
+ [(set f32:$XT, (PPCany_fctiwz f32:$XB))]>;
def XSCVDPUXDS : XX2Form<60, 328,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xscvdpuxds $XT, $XB", IIC_VecFP,
- [(set f64:$XT, (PPCfctiduz f64:$XB))]>;
+ [(set f64:$XT, (PPCany_fctiduz f64:$XB))]>;
let isCodeGenOnly = 1 in
def XSCVDPUXDSs : XX2Form<60, 328,
(outs vssrc:$XT), (ins vssrc:$XB),
"xscvdpuxds $XT, $XB", IIC_VecFP,
- [(set f32:$XT, (PPCfctiduz f32:$XB))]>;
+ [(set f32:$XT, (PPCany_fctiduz f32:$XB))]>;
def XSCVDPUXWS : XX2Form<60, 72,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xscvdpuxws $XT, $XB", IIC_VecFP,
- [(set f64:$XT, (PPCfctiwuz f64:$XB))]>;
+ [(set f64:$XT, (PPCany_fctiwuz f64:$XB))]>;
let isCodeGenOnly = 1 in
def XSCVDPUXWSs : XX2Form<60, 72,
(outs vssrc:$XT), (ins vssrc:$XB),
"xscvdpuxws $XT, $XB", IIC_VecFP,
- [(set f32:$XT, (PPCfctiwuz f32:$XB))]>;
+ [(set f32:$XT, (PPCany_fctiwuz f32:$XB))]>;
def XSCVSPDP : XX2Form<60, 329,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xscvspdp $XT, $XB", IIC_VecFP, []>;
def XSCVSXDDP : XX2Form<60, 376,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xscvsxddp $XT, $XB", IIC_VecFP,
- [(set f64:$XT, (PPCfcfid f64:$XB))]>;
+ [(set f64:$XT, (PPCany_fcfid f64:$XB))]>;
def XSCVUXDDP : XX2Form<60, 360,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xscvuxddp $XT, $XB", IIC_VecFP,
- [(set f64:$XT, (PPCfcfidu f64:$XB))]>;
+ [(set f64:$XT, (PPCany_fcfidu f64:$XB))]>;
def XVCVDPSP : XX2Form<60, 393,
(outs vsrc:$XT), (ins vsrc:$XB),
@@ -820,7 +828,7 @@ let hasSideEffects = 0 in {
def XVCVDPSXDS : XX2Form<60, 472,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvdpsxds $XT, $XB", IIC_VecFP,
- [(set v2i64:$XT, (fp_to_sint v2f64:$XB))]>;
+ [(set v2i64:$XT, (any_fp_to_sint v2f64:$XB))]>;
def XVCVDPSXWS : XX2Form<60, 216,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvdpsxws $XT, $XB", IIC_VecFP,
@@ -828,7 +836,7 @@ let hasSideEffects = 0 in {
def XVCVDPUXDS : XX2Form<60, 456,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvdpuxds $XT, $XB", IIC_VecFP,
- [(set v2i64:$XT, (fp_to_uint v2f64:$XB))]>;
+ [(set v2i64:$XT, (any_fp_to_uint v2f64:$XB))]>;
def XVCVDPUXWS : XX2Form<60, 200,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvdpuxws $XT, $XB", IIC_VecFP,
@@ -844,56 +852,105 @@ let hasSideEffects = 0 in {
def XVCVSPSXWS : XX2Form<60, 152,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvspsxws $XT, $XB", IIC_VecFP,
- [(set v4i32:$XT, (fp_to_sint v4f32:$XB))]>;
+ [(set v4i32:$XT, (any_fp_to_sint v4f32:$XB))]>;
def XVCVSPUXDS : XX2Form<60, 392,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvspuxds $XT, $XB", IIC_VecFP, []>;
def XVCVSPUXWS : XX2Form<60, 136,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvspuxws $XT, $XB", IIC_VecFP,
- [(set v4i32:$XT, (fp_to_uint v4f32:$XB))]>;
+ [(set v4i32:$XT, (any_fp_to_uint v4f32:$XB))]>;
def XVCVSXDDP : XX2Form<60, 504,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvsxddp $XT, $XB", IIC_VecFP,
- [(set v2f64:$XT, (sint_to_fp v2i64:$XB))]>;
+ [(set v2f64:$XT, (any_sint_to_fp v2i64:$XB))]>;
def XVCVSXDSP : XX2Form<60, 440,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvsxdsp $XT, $XB", IIC_VecFP,
[(set v4f32:$XT, (int_ppc_vsx_xvcvsxdsp v2i64:$XB))]>;
- def XVCVSXWDP : XX2Form<60, 248,
- (outs vsrc:$XT), (ins vsrc:$XB),
- "xvcvsxwdp $XT, $XB", IIC_VecFP,
- [(set v2f64:$XT, (int_ppc_vsx_xvcvsxwdp v4i32:$XB))]>;
def XVCVSXWSP : XX2Form<60, 184,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvsxwsp $XT, $XB", IIC_VecFP,
- [(set v4f32:$XT, (sint_to_fp v4i32:$XB))]>;
+ [(set v4f32:$XT, (any_sint_to_fp v4i32:$XB))]>;
def XVCVUXDDP : XX2Form<60, 488,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvuxddp $XT, $XB", IIC_VecFP,
- [(set v2f64:$XT, (uint_to_fp v2i64:$XB))]>;
+ [(set v2f64:$XT, (any_uint_to_fp v2i64:$XB))]>;
def XVCVUXDSP : XX2Form<60, 424,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvuxdsp $XT, $XB", IIC_VecFP,
[(set v4f32:$XT, (int_ppc_vsx_xvcvuxdsp v2i64:$XB))]>;
+ def XVCVUXWSP : XX2Form<60, 168,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvuxwsp $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (any_uint_to_fp v4i32:$XB))]>;
+
+ let mayRaiseFPException = 0 in {
+ def XVCVSXWDP : XX2Form<60, 248,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvsxwdp $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (int_ppc_vsx_xvcvsxwdp v4i32:$XB))]>;
def XVCVUXWDP : XX2Form<60, 232,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvcvuxwdp $XT, $XB", IIC_VecFP,
[(set v2f64:$XT, (int_ppc_vsx_xvcvuxwdp v4i32:$XB))]>;
- def XVCVUXWSP : XX2Form<60, 168,
+ }
+
+ // Rounding Instructions respecting current rounding mode
+ def XSRDPIC : XX2Form<60, 107,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsrdpic $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (fnearbyint f64:$XB))]>;
+ def XVRDPIC : XX2Form<60, 235,
(outs vsrc:$XT), (ins vsrc:$XB),
- "xvcvuxwsp $XT, $XB", IIC_VecFP,
- [(set v4f32:$XT, (uint_to_fp v4i32:$XB))]>;
+ "xvrdpic $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fnearbyint v2f64:$XB))]>;
+ def XVRSPIC : XX2Form<60, 171,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrspic $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fnearbyint v4f32:$XB))]>;
+ // Max/Min Instructions
+ let isCommutable = 1 in {
+ def XSMAXDP : XX3Form<60, 160,
+ (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+ "xsmaxdp $XT, $XA, $XB", IIC_VecFP,
+ [(set vsfrc:$XT,
+ (int_ppc_vsx_xsmaxdp vsfrc:$XA, vsfrc:$XB))]>;
+ def XSMINDP : XX3Form<60, 168,
+ (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+ "xsmindp $XT, $XA, $XB", IIC_VecFP,
+ [(set vsfrc:$XT,
+ (int_ppc_vsx_xsmindp vsfrc:$XA, vsfrc:$XB))]>;
+
+ def XVMAXDP : XX3Form<60, 224,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvmaxdp $XT, $XA, $XB", IIC_VecFP,
+ [(set vsrc:$XT,
+ (int_ppc_vsx_xvmaxdp vsrc:$XA, vsrc:$XB))]>;
+ def XVMINDP : XX3Form<60, 232,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvmindp $XT, $XA, $XB", IIC_VecFP,
+ [(set vsrc:$XT,
+ (int_ppc_vsx_xvmindp vsrc:$XA, vsrc:$XB))]>;
+
+ def XVMAXSP : XX3Form<60, 192,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvmaxsp $XT, $XA, $XB", IIC_VecFP,
+ [(set vsrc:$XT,
+ (int_ppc_vsx_xvmaxsp vsrc:$XA, vsrc:$XB))]>;
+ def XVMINSP : XX3Form<60, 200,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvminsp $XT, $XA, $XB", IIC_VecFP,
+ [(set vsrc:$XT,
+ (int_ppc_vsx_xvminsp vsrc:$XA, vsrc:$XB))]>;
+ } // isCommutable
+ } // Uses = [RM]
- // Rounding Instructions
+ // Rounding Instructions with static direction.
def XSRDPI : XX2Form<60, 73,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xsrdpi $XT, $XB", IIC_VecFP,
[(set f64:$XT, (any_fround f64:$XB))]>;
- def XSRDPIC : XX2Form<60, 107,
- (outs vsfrc:$XT), (ins vsfrc:$XB),
- "xsrdpic $XT, $XB", IIC_VecFP,
- [(set f64:$XT, (any_fnearbyint f64:$XB))]>;
def XSRDPIM : XX2Form<60, 121,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xsrdpim $XT, $XB", IIC_VecFP,
@@ -911,10 +968,6 @@ let hasSideEffects = 0 in {
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrdpi $XT, $XB", IIC_VecFP,
[(set v2f64:$XT, (any_fround v2f64:$XB))]>;
- def XVRDPIC : XX2Form<60, 235,
- (outs vsrc:$XT), (ins vsrc:$XB),
- "xvrdpic $XT, $XB", IIC_VecFP,
- [(set v2f64:$XT, (any_fnearbyint v2f64:$XB))]>;
def XVRDPIM : XX2Form<60, 249,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrdpim $XT, $XB", IIC_VecFP,
@@ -932,10 +985,6 @@ let hasSideEffects = 0 in {
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrspi $XT, $XB", IIC_VecFP,
[(set v4f32:$XT, (any_fround v4f32:$XB))]>;
- def XVRSPIC : XX2Form<60, 171,
- (outs vsrc:$XT), (ins vsrc:$XB),
- "xvrspic $XT, $XB", IIC_VecFP,
- [(set v4f32:$XT, (any_fnearbyint v4f32:$XB))]>;
def XVRSPIM : XX2Form<60, 185,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrspim $XT, $XB", IIC_VecFP,
@@ -948,43 +997,7 @@ let hasSideEffects = 0 in {
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrspiz $XT, $XB", IIC_VecFP,
[(set v4f32:$XT, (any_ftrunc v4f32:$XB))]>;
-
- // Max/Min Instructions
- let isCommutable = 1 in {
- def XSMAXDP : XX3Form<60, 160,
- (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
- "xsmaxdp $XT, $XA, $XB", IIC_VecFP,
- [(set vsfrc:$XT,
- (int_ppc_vsx_xsmaxdp vsfrc:$XA, vsfrc:$XB))]>;
- def XSMINDP : XX3Form<60, 168,
- (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
- "xsmindp $XT, $XA, $XB", IIC_VecFP,
- [(set vsfrc:$XT,
- (int_ppc_vsx_xsmindp vsfrc:$XA, vsfrc:$XB))]>;
-
- def XVMAXDP : XX3Form<60, 224,
- (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
- "xvmaxdp $XT, $XA, $XB", IIC_VecFP,
- [(set vsrc:$XT,
- (int_ppc_vsx_xvmaxdp vsrc:$XA, vsrc:$XB))]>;
- def XVMINDP : XX3Form<60, 232,
- (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
- "xvmindp $XT, $XA, $XB", IIC_VecFP,
- [(set vsrc:$XT,
- (int_ppc_vsx_xvmindp vsrc:$XA, vsrc:$XB))]>;
-
- def XVMAXSP : XX3Form<60, 192,
- (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
- "xvmaxsp $XT, $XA, $XB", IIC_VecFP,
- [(set vsrc:$XT,
- (int_ppc_vsx_xvmaxsp vsrc:$XA, vsrc:$XB))]>;
- def XVMINSP : XX3Form<60, 200,
- (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
- "xvminsp $XT, $XA, $XB", IIC_VecFP,
- [(set vsrc:$XT,
- (int_ppc_vsx_xvminsp vsrc:$XA, vsrc:$XB))]>;
- } // isCommutable
- } // Uses = [RM], mayRaiseFPException
+ } // mayRaiseFPException
// Logical Instructions
let isCommutable = 1 in
@@ -1170,7 +1183,7 @@ let Predicates = [HasVSX, HasP8Vector] in {
"xsresp $XT, $XB", IIC_VecFP,
[(set f32:$XT, (PPCfre f32:$XB))]>;
// FIXME: Setting the hasSideEffects flag here to match current behaviour.
- let hasSideEffects = 1, mayRaiseFPException = 1 in
+ let hasSideEffects = 1 in
def XSRSP : XX2Form<60, 281,
(outs vssrc:$XT), (ins vsfrc:$XB),
"xsrsp $XT, $XB", IIC_VecFP,
@@ -1268,18 +1281,18 @@ let Predicates = [HasVSX, HasP8Vector] in {
def XSCVSXDSP : XX2Form<60, 312,
(outs vssrc:$XT), (ins vsfrc:$XB),
"xscvsxdsp $XT, $XB", IIC_VecFP,
- [(set f32:$XT, (PPCfcfids f64:$XB))]>;
+ [(set f32:$XT, (PPCany_fcfids f64:$XB))]>;
def XSCVUXDSP : XX2Form<60, 296,
(outs vssrc:$XT), (ins vsfrc:$XB),
"xscvuxdsp $XT, $XB", IIC_VecFP,
- [(set f32:$XT, (PPCfcfidus f64:$XB))]>;
+ [(set f32:$XT, (PPCany_fcfidus f64:$XB))]>;
+ } // mayRaiseFPException
// Conversions between vector and scalar single precision
def XSCVDPSPN : XX2Form<60, 267, (outs vsrc:$XT), (ins vssrc:$XB),
"xscvdpspn $XT, $XB", IIC_VecFP, []>;
def XSCVSPDPN : XX2Form<60, 331, (outs vssrc:$XT), (ins vsrc:$XB),
"xscvspdpn $XT, $XB", IIC_VecFP, []>;
- } // mayRaiseFPException
let Predicates = [HasVSX, HasDirectMove] in {
// VSX direct move instructions
@@ -1440,15 +1453,16 @@ let Predicates = [HasVSX, HasP9Vector] in {
// FIXME: Setting the hasSideEffects flag here to match current behaviour.
// QP Compare Ordered/Unordered
let hasSideEffects = 1 in {
- def XSCMPOQP : X_BF3_VA5_VB5<63, 132, "xscmpoqp", []>;
- def XSCMPUQP : X_BF3_VA5_VB5<63, 644, "xscmpuqp", []>;
-
// DP/QP Compare Exponents
def XSCMPEXPDP : XX3Form_1<60, 59,
(outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
"xscmpexpdp $crD, $XA, $XB", IIC_FPCompare, []>;
def XSCMPEXPQP : X_BF3_VA5_VB5<63, 164, "xscmpexpqp", []>;
+ let mayRaiseFPException = 1 in {
+ def XSCMPOQP : X_BF3_VA5_VB5<63, 132, "xscmpoqp", []>;
+ def XSCMPUQP : X_BF3_VA5_VB5<63, 644, "xscmpuqp", []>;
+
// DP Compare ==, >=, >, !=
// Use vsrc for XT, because the entire register of XT is set.
// XT.dword[1] = 0x0000_0000_0000_0000
@@ -1458,6 +1472,7 @@ let Predicates = [HasVSX, HasP9Vector] in {
IIC_FPCompare, []>;
def XSCMPGTDP : XX3_XT5_XA5_XB5<60, 11, "xscmpgtdp", vsrc, vsfrc, vsfrc,
IIC_FPCompare, []>;
+ }
}
//===--------------------------------------------------------------------===//
@@ -1476,9 +1491,8 @@ let Predicates = [HasVSX, HasP9Vector] in {
f128:$vB))]>;
}
- // FIXME: Setting the hasSideEffects flag here to match current behaviour.
// Truncate & Convert QP -> (Un)Signed (D)Word (dword[1] is set to zero)
- let hasSideEffects = 1 in {
+ let mayRaiseFPException = 1 in {
def XSCVQPSDZ : X_VT5_XO5_VB5<63, 25, 836, "xscvqpsdz", []>;
def XSCVQPSWZ : X_VT5_XO5_VB5<63, 9, 836, "xscvqpswz", []>;
def XSCVQPUDZ : X_VT5_XO5_VB5<63, 17, 836, "xscvqpudz", []>;
@@ -1494,11 +1508,12 @@ let Predicates = [HasVSX, HasP9Vector] in {
// vsfrc for src and dest register. xscvhpdp's src only use the left 16 bits,
// but we still use vsfrc for it.
// FIXME: Setting the hasSideEffects flag here to match current behaviour.
- let hasSideEffects = 1 in {
+ let hasSideEffects = 1, mayRaiseFPException = 1 in {
def XSCVDPHP : XX2_XT6_XO5_XB6<60, 17, 347, "xscvdphp", vsfrc, []>;
def XSCVHPDP : XX2_XT6_XO5_XB6<60, 16, 347, "xscvhpdp", vsfrc, []>;
}
+ let mayRaiseFPException = 1 in {
// Vector HP -> SP
// FIXME: Setting the hasSideEffects flag here to match current behaviour.
let hasSideEffects = 1 in
@@ -1507,16 +1522,15 @@ let Predicates = [HasVSX, HasP9Vector] in {
[(set v4f32:$XT,
(int_ppc_vsx_xvcvsphp v4f32:$XB))]>;
- let mayRaiseFPException = 1 in {
- // Round to Quad-Precision Integer [with Inexact]
- def XSRQPI : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 0, "xsrqpi" , []>;
- def XSRQPIX : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 1, "xsrqpix", []>;
- }
+ // Round to Quad-Precision Integer [with Inexact]
+ def XSRQPI : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 0, "xsrqpi" , []>;
+ def XSRQPIX : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 1, "xsrqpix", []>;
// Round Quad-Precision to Double-Extended Precision (fp80)
// FIXME: Setting the hasSideEffects flag here to match current behaviour.
let hasSideEffects = 1 in
def XSRQPXP : Z23_VT5_R1_VB5_RMC2_EX1<63, 37, 0, "xsrqpxp", []>;
+ }
//===--------------------------------------------------------------------===//
// Insert/Extract Instructions
@@ -1607,6 +1621,7 @@ let Predicates = [HasVSX, HasP9Vector] in {
(int_ppc_vsx_xvtstdcdp v2f64:$XB, timm:$DCMX))]>;
// Maximum/Minimum Type-C/Type-J DP
+ let mayRaiseFPException = 1 in {
def XSMAXCDP : XX3_XT5_XA5_XB5<60, 128, "xsmaxcdp", vsfrc, vsfrc, vsfrc,
IIC_VecFP,
[(set f64:$XT, (PPCxsmaxc f64:$XA, f64:$XB))]>;
@@ -1621,6 +1636,7 @@ let Predicates = [HasVSX, HasP9Vector] in {
def XSMINJDP : XX3_XT5_XA5_XB5<60, 152, "xsminjdp", vsrc, vsfrc, vsfrc,
IIC_VecFP, []>;
}
+ }
// Vector Byte-Reverse H/W/D/Q Word
// FIXME: Setting the hasSideEffects flag here to match current behaviour.
@@ -2392,33 +2408,48 @@ def MrgWords {
// arbitrarily chosen to be Big, Little.
//
// Predicate combinations available:
+// [HasVSX, IsLittleEndian, HasP8Altivec] Altivec patterns using VSX instr.
+// [HasVSX, IsBigEndian, HasP8Altivec] Altivec patterns using VSX instr.
// [HasVSX]
// [HasVSX, IsBigEndian]
// [HasVSX, IsLittleEndian]
// [HasVSX, NoP9Vector]
+// [HasVSX, NoP9Vector, IsLittleEndian]
// [HasVSX, HasOnlySwappingMemOps]
// [HasVSX, HasOnlySwappingMemOps, IsBigEndian]
// [HasVSX, HasP8Vector]
-// [HasVSX, HasP8Vector, IsBigEndian]
+// [HasVSX, HasP8Vector, IsBigEndian, IsPPC64]
// [HasVSX, HasP8Vector, IsLittleEndian]
-// [HasVSX, HasP8Vector, NoP9Vector, IsBigEndian]
+// [HasVSX, HasP8Vector, NoP9Vector, IsBigEndian, IsPPC64]
// [HasVSX, HasP8Vector, NoP9Vector, IsLittleEndian]
// [HasVSX, HasDirectMove]
// [HasVSX, HasDirectMove, IsBigEndian]
// [HasVSX, HasDirectMove, IsLittleEndian]
-// [HasVSX, HasDirectMove, NoP9Altivec, IsBigEndian]
+// [HasVSX, HasDirectMove, NoP9Altivec, IsBigEndian, IsPPC64]
+// [HasVSX, HasDirectMove, NoP9Vector, IsBigEndian, IsPPC64]
// [HasVSX, HasDirectMove, NoP9Altivec, IsLittleEndian]
-// [HasVSX, HasDirectMove, NoP9Vector, IsBigEndian]
// [HasVSX, HasDirectMove, NoP9Vector, IsLittleEndian]
// [HasVSX, HasP9Vector]
-// [HasVSX, HasP9Vector, IsBigEndian]
+// [HasVSX, HasP9Vector, IsBigEndian, IsPPC64]
// [HasVSX, HasP9Vector, IsLittleEndian]
// [HasVSX, HasP9Altivec]
-// [HasVSX, HasP9Altivec, IsBigEndian]
+// [HasVSX, HasP9Altivec, IsBigEndian, IsPPC64]
// [HasVSX, HasP9Altivec, IsLittleEndian]
-// [HasVSX, IsISA3_0, HasDirectMove, IsBigEndian]
+// [HasVSX, IsISA3_0, HasDirectMove, IsBigEndian, IsPPC64]
// [HasVSX, IsISA3_0, HasDirectMove, IsLittleEndian]
+// These Altivec patterns are here because we need a VSX instruction to match
+// the intrinsic (but only on little-endian systems).
+let Predicates = [HasVSX, IsLittleEndian, HasP8Altivec] in
+ def : Pat<(v16i8 (int_ppc_altivec_crypto_vpermxor v16i8:$a,
+ v16i8:$b, v16i8:$c)),
+ (v16i8 (VPERMXOR $a, $b, (XXLNOR (COPY_TO_REGCLASS $c, VSRC),
+ (COPY_TO_REGCLASS $c, VSRC))))>;
+let Predicates = [HasVSX, IsBigEndian, HasP8Altivec] in
+ def : Pat<(v16i8 (int_ppc_altivec_crypto_vpermxor v16i8:$a,
+ v16i8:$b, v16i8:$c)),
+ (v16i8 (VPERMXOR $a, $b, $c))>;
+
let AddedComplexity = 400 in {
// Valid for any VSX subtarget, regardless of endianness.
let Predicates = [HasVSX] in {
@@ -2450,6 +2481,10 @@ def : Pat<(fneg (PPCfnmsub v4f32:$A, v4f32:$B, v4f32:$C)),
def : Pat<(PPCfnmsub v4f32:$A, v4f32:$B, (fneg v4f32:$C)),
(XVNMADDASP $C, $A, $B)>;
+def : Pat<(PPCfsqrt f64:$frA), (XSSQRTDP $frA)>;
+def : Pat<(PPCfsqrt v2f64:$frA), (XVSQRTDP $frA)>;
+def : Pat<(PPCfsqrt v4f32:$frA), (XVSQRTSP $frA)>;
+
def : Pat<(v2f64 (bitconvert v4f32:$A)),
(COPY_TO_REGCLASS $A, VSRC)>;
def : Pat<(v2f64 (bitconvert v4i32:$A)),
@@ -2579,6 +2614,16 @@ def : Pat<(int_ppc_vsx_xvdivsp v4f32:$A, v4f32:$B),
def : Pat<(int_ppc_vsx_xvdivdp v2f64:$A, v2f64:$B),
(XVDIVDP $A, $B)>;
+// Vector test for software divide and sqrt.
+def : Pat<(i32 (int_ppc_vsx_xvtdivdp v2f64:$A, v2f64:$B)),
+ (COPY_TO_REGCLASS (XVTDIVDP $A, $B), GPRC)>;
+def : Pat<(i32 (int_ppc_vsx_xvtdivsp v4f32:$A, v4f32:$B)),
+ (COPY_TO_REGCLASS (XVTDIVSP $A, $B), GPRC)>;
+def : Pat<(i32 (int_ppc_vsx_xvtsqrtdp v2f64:$A)),
+ (COPY_TO_REGCLASS (XVTSQRTDP $A), GPRC)>;
+def : Pat<(i32 (int_ppc_vsx_xvtsqrtsp v4f32:$A)),
+ (COPY_TO_REGCLASS (XVTSQRTSP $A), GPRC)>;
+
// Reciprocal estimate
def : Pat<(int_ppc_vsx_xvresp v4f32:$A),
(XVRESP $A)>;
@@ -2679,7 +2724,7 @@ def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
def : Pat<(f32 (any_fround f32:$S)),
(f32 (COPY_TO_REGCLASS (XSRDPI
(COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
-def : Pat<(f32 (any_fnearbyint f32:$S)),
+def : Pat<(f32 (fnearbyint f32:$S)),
(f32 (COPY_TO_REGCLASS (XSRDPIC
(COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
def : Pat<(f32 (any_ffloor f32:$S)),
@@ -2694,11 +2739,11 @@ def : Pat<(f32 (any_ftrunc f32:$S)),
def : Pat<(f32 (any_frint f32:$S)),
(f32 (COPY_TO_REGCLASS (XSRDPIC
(COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
-def : Pat<(v4f32 (frint v4f32:$S)), (v4f32 (XVRSPIC $S))>;
+def : Pat<(v4f32 (any_frint v4f32:$S)), (v4f32 (XVRSPIC $S))>;
// Rounding for double precision.
-def : Pat<(f64 (frint f64:$S)), (f64 (XSRDPIC $S))>;
-def : Pat<(v2f64 (frint v2f64:$S)), (v2f64 (XVRDPIC $S))>;
+def : Pat<(f64 (any_frint f64:$S)), (f64 (XSRDPIC $S))>;
+def : Pat<(v2f64 (any_frint v2f64:$S)), (v2f64 (XVRDPIC $S))>;
// Materialize a zero-vector of long long
def : Pat<(v2i64 immAllZerosV),
@@ -2975,6 +3020,19 @@ defm : ScalToVecWPermute<
VSFRC)), sub_64)>;
} // HasVSX, NoP9Vector
+// Any little endian pre-Power9 VSX subtarget.
+let Predicates = [HasVSX, NoP9Vector, IsLittleEndian] in {
+// Load-and-splat using only X-Form VSX loads.
+defm : ScalToVecWPermute<
+ v2i64, (i64 (load xoaddr:$src)),
+ (XXPERMDIs (XFLOADf64 xoaddr:$src), 2),
+ (SUBREG_TO_REG (i64 1), (XFLOADf64 xoaddr:$src), sub_64)>;
+defm : ScalToVecWPermute<
+ v2f64, (f64 (load xoaddr:$src)),
+ (XXPERMDIs (XFLOADf64 xoaddr:$src), 2),
+ (SUBREG_TO_REG (i64 1), (XFLOADf64 xoaddr:$src), sub_64)>;
+} // HasVSX, NoP9Vector, IsLittleEndian
+
// Any VSX subtarget that only has loads and stores that load in big endian
// order regardless of endianness. This is really pre-Power9 subtargets.
let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
@@ -2986,8 +3044,8 @@ let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
def : Pat<(PPCstxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
} // HasVSX, HasOnlySwappingMemOps
-// Big endian VSX subtarget that only has loads and stores that always load
-// in big endian order. Really big endian pre-Power9 subtargets.
+// Big endian VSX subtarget that only has loads and stores that always
+// load in big endian order. Really big endian pre-Power9 subtargets.
let Predicates = [HasVSX, HasOnlySwappingMemOps, IsBigEndian] in {
def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
@@ -3080,7 +3138,7 @@ def : Pat<(v16i8 (bitconvert (v16i8 immAllOnesV))),
} // HasVSX, HasP8Vector
// Big endian Power8 VSX subtarget.
-let Predicates = [HasVSX, HasP8Vector, IsBigEndian] in {
+let Predicates = [HasVSX, HasP8Vector, IsBigEndian, IsPPC64] in {
def : Pat<DWToSPExtractConv.El0SS1,
(f32 (XSCVSXDSP (COPY_TO_REGCLASS $S1, VSFRC)))>;
def : Pat<DWToSPExtractConv.El1SS1,
@@ -3158,7 +3216,7 @@ foreach Idx = [ [0,3], [2,1], [3,2] ] in {
(STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
sub_64), xoaddr:$src)>;
}
-} // HasVSX, HasP8Vector, IsBigEndian
+} // HasVSX, HasP8Vector, IsBigEndian, IsPPC64
// Little endian Power8 VSX subtarget.
let Predicates = [HasVSX, HasP8Vector, IsLittleEndian] in {
@@ -3257,7 +3315,7 @@ foreach Idx = [ [0,2], [1,1], [3,3] ] in {
} // HasVSX, HasP8Vector, IsLittleEndian
// Big endian pre-Power9 VSX subtarget.
-let Predicates = [HasVSX, HasP8Vector, NoP9Vector, IsBigEndian] in {
+let Predicates = [HasVSX, HasP8Vector, NoP9Vector, IsBigEndian, IsPPC64] in {
def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xoaddr:$src),
(XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xoaddr:$src),
@@ -3268,7 +3326,7 @@ def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xoaddr:$src),
def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xoaddr:$src),
(XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
xoaddr:$src)>;
-} // HasVSX, HasP8Vector, NoP9Vector, IsBigEndian
+} // HasVSX, HasP8Vector, NoP9Vector, IsBigEndian, IsPPC64
// Little endian pre-Power9 VSX subtarget.
let Predicates = [HasVSX, HasP8Vector, NoP9Vector, IsLittleEndian] in {
@@ -3525,8 +3583,8 @@ def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
(i32 VectorExtractions.LE_VARIABLE_WORD)>;
} // HasVSX, HasDirectMove, NoP9Altivec, IsLittleEndian
-// Big endian pre-Power9 VSX subtarget that has direct moves.
-let Predicates = [HasVSX, HasDirectMove, NoP9Vector, IsBigEndian] in {
+// Big endian pre-Power9 64Bit VSX subtarget that has direct moves.
+let Predicates = [HasVSX, HasDirectMove, NoP9Vector, IsBigEndian, IsPPC64] in {
// Big endian integer vectors using direct moves.
def : Pat<(v2i64 (build_vector i64:$A, i64:$B)),
(v2i64 (XXPERMDI
@@ -3540,7 +3598,7 @@ def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
(MTVSRD (RLDIMI AnyExts.D, AnyExts.C, 32, 0)), VSRC), 0)>;
def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
(XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>;
-} // HasVSX, HasDirectMove, NoP9Vector, IsBigEndian
+} // HasVSX, HasDirectMove, NoP9Vector, IsBigEndian, IsPPC64
// Little endian pre-Power9 VSX subtarget that has direct moves.
let Predicates = [HasVSX, HasDirectMove, NoP9Vector, IsLittleEndian] in {
@@ -3569,25 +3627,25 @@ def : Pat<(fneg (PPCfnmsub f128:$A, f128:$B, f128:$C)),
def : Pat<(PPCfnmsub f128:$A, f128:$B, (fneg f128:$C)),
(XSNMADDQP $C, $A, $B)>;
-def : Pat<(f128 (sint_to_fp i64:$src)),
+def : Pat<(f128 (any_sint_to_fp i64:$src)),
(f128 (XSCVSDQP (COPY_TO_REGCLASS $src, VFRC)))>;
-def : Pat<(f128 (sint_to_fp (i64 (PPCmfvsr f64:$src)))),
+def : Pat<(f128 (any_sint_to_fp (i64 (PPCmfvsr f64:$src)))),
(f128 (XSCVSDQP $src))>;
-def : Pat<(f128 (sint_to_fp (i32 (PPCmfvsr f64:$src)))),
+def : Pat<(f128 (any_sint_to_fp (i32 (PPCmfvsr f64:$src)))),
(f128 (XSCVSDQP (VEXTSW2Ds $src)))>;
-def : Pat<(f128 (uint_to_fp i64:$src)),
+def : Pat<(f128 (any_uint_to_fp i64:$src)),
(f128 (XSCVUDQP (COPY_TO_REGCLASS $src, VFRC)))>;
-def : Pat<(f128 (uint_to_fp (i64 (PPCmfvsr f64:$src)))),
+def : Pat<(f128 (any_uint_to_fp (i64 (PPCmfvsr f64:$src)))),
(f128 (XSCVUDQP $src))>;
// Convert (Un)Signed Word -> QP.
-def : Pat<(f128 (sint_to_fp i32:$src)),
+def : Pat<(f128 (any_sint_to_fp i32:$src)),
(f128 (XSCVSDQP (MTVSRWA $src)))>;
-def : Pat<(f128 (sint_to_fp (i32 (load xoaddr:$src)))),
+def : Pat<(f128 (any_sint_to_fp (i32 (load xoaddr:$src)))),
(f128 (XSCVSDQP (LIWAX xoaddr:$src)))>;
-def : Pat<(f128 (uint_to_fp i32:$src)),
+def : Pat<(f128 (any_uint_to_fp i32:$src)),
(f128 (XSCVUDQP (MTVSRWZ $src)))>;
-def : Pat<(f128 (uint_to_fp (i32 (load xoaddr:$src)))),
+def : Pat<(f128 (any_uint_to_fp (i32 (load xoaddr:$src)))),
(f128 (XSCVUDQP (LIWZX xoaddr:$src)))>;
// Pattern for matching Vector HP -> Vector SP intrinsic. Defined as a
@@ -3761,11 +3819,11 @@ def : Pat<(f128 (uint_to_fp ScalarLoads.ZELi8)),
(f128 (XSCVUDQP (LXSIBZX xoaddr:$src)))>;
// Truncate & Convert QP -> (Un)Signed (D)Word.
-def : Pat<(i64 (fp_to_sint f128:$src)), (i64 (MFVRD (XSCVQPSDZ $src)))>;
-def : Pat<(i64 (fp_to_uint f128:$src)), (i64 (MFVRD (XSCVQPUDZ $src)))>;
-def : Pat<(i32 (fp_to_sint f128:$src)),
+def : Pat<(i64 (any_fp_to_sint f128:$src)), (i64 (MFVRD (XSCVQPSDZ $src)))>;
+def : Pat<(i64 (any_fp_to_uint f128:$src)), (i64 (MFVRD (XSCVQPUDZ $src)))>;
+def : Pat<(i32 (any_fp_to_sint f128:$src)),
(i32 (MFVSRWZ (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC)))>;
-def : Pat<(i32 (fp_to_uint f128:$src)),
+def : Pat<(i32 (any_fp_to_uint f128:$src)),
(i32 (MFVSRWZ (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC)))>;
// Instructions for store(fptosi).
@@ -3893,8 +3951,8 @@ def : Pat<(v4i32 (PPCldsplat xoaddr:$A)),
(v4i32 (LXVWSX xoaddr:$A))>;
} // HasVSX, HasP9Vector
-// Big endian Power9 subtarget.
-let Predicates = [HasVSX, HasP9Vector, IsBigEndian] in {
+// Big endian 64Bit Power9 subtarget.
+let Predicates = [HasVSX, HasP9Vector, IsBigEndian, IsPPC64] in {
def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
(f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>;
def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
@@ -4067,7 +4125,7 @@ foreach Idx = 0-15 in {
def : Pat<(f128 (uint_to_fp (i32 (PPCmfvsr f64:$src)))),
(f128 (XSCVUDQP
(XXEXTRACTUW (SUBREG_TO_REG (i64 1), $src, sub_64), 4)))>;
-} // HasVSX, HasP9Vector, IsBigEndian
+} // HasVSX, HasP9Vector, IsBigEndian, IsPPC64
// Little endian Power9 subtarget.
let Predicates = [HasVSX, HasP9Vector, IsLittleEndian] in {
@@ -4292,8 +4350,8 @@ def : Pat<(v4i32 (PPCvabsd v4i32:$A, v4i32:$B, (i32 1))),
(v4i32 (VABSDUW (XVNEGSP $A), (XVNEGSP $B)))>;
} // HasVSX, HasP9Altivec
-// Big endian Power9 VSX subtargets with P9 Altivec support.
-let Predicates = [HasVSX, HasP9Altivec, IsBigEndian] in {
+// Big endian Power9 64Bit VSX subtargets with P9 Altivec support.
+let Predicates = [HasVSX, HasP9Altivec, IsBigEndian, IsPPC64] in {
def : Pat<(i64 (anyext (i32 (vector_extract v16i8:$S, i64:$Idx)))),
(VEXTUBLX $Idx, $S)>;
@@ -4426,7 +4484,7 @@ def : Pat<(v4i32 (build_vector ByteToWord.BE_A0, ByteToWord.BE_A1,
(v4i32 (VEXTSB2W $A))>;
def : Pat<(v2i64 (build_vector ByteToDWord.BE_A0, ByteToDWord.BE_A1)),
(v2i64 (VEXTSB2D $A))>;
-} // HasVSX, HasP9Altivec, IsBigEndian
+} // HasVSX, HasP9Altivec, IsBigEndian, IsPPC64
// Little endian Power9 VSX subtargets with P9 Altivec support.
let Predicates = [HasVSX, HasP9Altivec, IsLittleEndian] in {
@@ -4563,8 +4621,9 @@ def : Pat<(v2i64 (build_vector ByteToDWord.LE_A0, ByteToDWord.LE_A1)),
(v2i64 (VEXTSB2D $A))>;
} // HasVSX, HasP9Altivec, IsLittleEndian
-// Big endian VSX subtarget that supports additional direct moves from ISA3.0.
-let Predicates = [HasVSX, IsISA3_0, HasDirectMove, IsBigEndian] in {
+// Big endian 64Bit VSX subtarget that supports additional direct moves from
+// ISA3.0.
+let Predicates = [HasVSX, IsISA3_0, HasDirectMove, IsBigEndian, IsPPC64] in {
def : Pat<(i64 (extractelt v2i64:$A, 1)),
(i64 (MFVSRLD $A))>;
// Better way to build integer vectors if we have MTVSRDD. Big endian.
@@ -4577,7 +4636,7 @@ def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
def : Pat<(f128 (PPCbuild_fp128 i64:$rB, i64:$rA)),
(f128 (COPY_TO_REGCLASS (MTVSRDD $rB, $rA), VRRC))>;
-} // HasVSX, IsISA3_0, HasDirectMove, IsBigEndian
+} // HasVSX, IsISA3_0, HasDirectMove, IsBigEndian, IsPPC64
// Little endian VSX subtarget that supports direct moves from ISA3.0.
let Predicates = [HasVSX, IsISA3_0, HasDirectMove, IsLittleEndian] in {
@@ -4602,20 +4661,24 @@ def : InstAlias<"xvmovdp $XT, $XB",
def : InstAlias<"xvmovsp $XT, $XB",
(XVCPSGNSP vsrc:$XT, vsrc:$XB, vsrc:$XB)>;
-def : InstAlias<"xxspltd $XT, $XB, 0",
- (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 0)>;
-def : InstAlias<"xxspltd $XT, $XB, 1",
- (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 3)>;
+// Certain versions of the AIX assembler may misassemble these mnemonics.
+let Predicates = [ModernAs] in {
+ def : InstAlias<"xxspltd $XT, $XB, 0",
+ (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 0)>;
+ def : InstAlias<"xxspltd $XT, $XB, 1",
+ (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 3)>;
+ def : InstAlias<"xxspltd $XT, $XB, 0",
+ (XXPERMDIs vsrc:$XT, vsfrc:$XB, 0)>;
+ def : InstAlias<"xxspltd $XT, $XB, 1",
+ (XXPERMDIs vsrc:$XT, vsfrc:$XB, 3)>;
+}
+
def : InstAlias<"xxmrghd $XT, $XA, $XB",
(XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 0)>;
def : InstAlias<"xxmrgld $XT, $XA, $XB",
(XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 3)>;
def : InstAlias<"xxswapd $XT, $XB",
(XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 2)>;
-def : InstAlias<"xxspltd $XT, $XB, 0",
- (XXPERMDIs vsrc:$XT, vsfrc:$XB, 0)>;
-def : InstAlias<"xxspltd $XT, $XB, 1",
- (XXPERMDIs vsrc:$XT, vsfrc:$XB, 3)>;
def : InstAlias<"xxswapd $XT, $XB",
(XXPERMDIs vsrc:$XT, vsfrc:$XB, 2)>;
def : InstAlias<"mfvrd $rA, $XT",
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
index a7546d2be5d8..c24240909797 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
@@ -60,6 +60,7 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
@@ -80,10 +81,8 @@
using namespace llvm;
-// By default, we limit this to creating 16 common bases out of loops per
-// function. 16 is a little over half of the allocatable register set.
static cl::opt<unsigned> MaxVarsPrep("ppc-formprep-max-vars",
- cl::Hidden, cl::init(16),
+ cl::Hidden, cl::init(24),
cl::desc("Potential common base number threshold per function for PPC loop "
"prep"));
@@ -93,8 +92,7 @@ static cl::opt<bool> PreferUpdateForm("ppc-formprep-prefer-update",
// Sum of following 3 per loop thresholds for all loops can not be larger
// than MaxVarsPrep.
-// By default, we limit this to creating 9 PHIs for one loop.
-// 9 and 3 for each kind prep are exterimental values on Power9.
+// Now the thresholds for each kind of prep are experimental values on Power9.
static cl::opt<unsigned> MaxVarsUpdateForm("ppc-preinc-prep-max-vars",
cl::Hidden, cl::init(3),
cl::desc("Potential PHI threshold per loop for PPC loop prep of update "
@@ -105,7 +103,7 @@ static cl::opt<unsigned> MaxVarsDSForm("ppc-dsprep-max-vars",
cl::desc("Potential PHI threshold per loop for PPC loop prep of DS form"));
static cl::opt<unsigned> MaxVarsDQForm("ppc-dqprep-max-vars",
- cl::Hidden, cl::init(3),
+ cl::Hidden, cl::init(8),
cl::desc("Potential PHI threshold per loop for PPC loop prep of DQ form"));
@@ -277,8 +275,11 @@ static Value *GetPointerOperand(Value *MemI) {
} else if (StoreInst *SMemI = dyn_cast<StoreInst>(MemI)) {
return SMemI->getPointerOperand();
} else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(MemI)) {
- if (IMemI->getIntrinsicID() == Intrinsic::prefetch)
+ if (IMemI->getIntrinsicID() == Intrinsic::prefetch ||
+ IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp)
return IMemI->getArgOperand(0);
+ if (IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp)
+ return IMemI->getArgOperand(1);
}
return nullptr;
@@ -345,9 +346,13 @@ SmallVector<Bucket, 16> PPCLoopInstrFormPrep::collectCandidates(
MemI = SMemI;
PtrValue = SMemI->getPointerOperand();
} else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(&J)) {
- if (IMemI->getIntrinsicID() == Intrinsic::prefetch) {
+ if (IMemI->getIntrinsicID() == Intrinsic::prefetch ||
+ IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp) {
MemI = IMemI;
PtrValue = IMemI->getArgOperand(0);
+ } else if (IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp) {
+ MemI = IMemI;
+ PtrValue = IMemI->getArgOperand(1);
} else continue;
} else continue;
@@ -606,6 +611,10 @@ bool PPCLoopInstrFormPrep::rewriteLoadStores(Loop *L, Bucket &BucketChain,
NewBasePtr = NewPHI;
}
+ // Clear the rewriter cache, because values that are in the rewriter's cache
+ // can be deleted below, causing the AssertingVH in the cache to trigger.
+ SCEVE.clear();
+
if (Instruction *IDel = dyn_cast<Instruction>(BasePtr))
BBChanged.insert(IDel->getParent());
BasePtr->replaceAllUsesWith(NewBasePtr);
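
The comment in the hunk above explains that the SCEV expander's cache holds asserting value handles, so it has to be emptied before the now-dead base pointers are erased. The toy program below illustrates that ordering constraint only; Value, ExpanderCache and the use-count assert are invented stand-ins, not LLVM's AssertingVH or SCEVExpander.

#include <cassert>
#include <unordered_map>

struct Value {
  // Number of cache entries currently watching this value (a stand-in for an
  // asserting handle): deleting the value while this is non-zero is a bug.
  int AssertingUses = 0;
  ~Value() { assert(AssertingUses == 0 && "value deleted while still cached"); }
};

struct ExpanderCache {
  std::unordered_map<int, Value *> Cached; // expansion key -> cached value
  void remember(int Key, Value *V) {
    ++V->AssertingUses;
    Cached[Key] = V;
  }
  void clear() {
    for (auto &KV : Cached)
      --KV.second->AssertingUses;
    Cached.clear();
  }
};

int main() {
  ExpanderCache SCEVE;
  Value *BasePtr = new Value();
  SCEVE.remember(0, BasePtr);

  SCEVE.clear();  // analogous to the SCEVE.clear() added in the hunk above
  delete BasePtr; // safe: nothing is watching the value any more
  // Without the clear(), the delete would fire the assert in ~Value().
  return 0;
}
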
@@ -791,7 +800,7 @@ bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) {
bool MadeChange = false;
// Only prep. the inner-most loop
- if (!L->empty())
+ if (!L->isInnermost())
return MadeChange;
// Return if already done enough preparation.
@@ -823,6 +832,11 @@ bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) {
if (ST && ST->hasAltivec() &&
PtrValue->getType()->getPointerElementType()->isVectorTy())
return false;
+ // There are no update forms for P10 lxvp/stxvp intrinsic.
+ auto *II = dyn_cast<IntrinsicInst>(I);
+ if (II && ((II->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp) ||
+ II->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp))
+ return false;
// See getPreIndexedAddressParts, the displacement for LDU/STDU has to
// be 4's multiple (DS-form). For i64 loads/stores when the displacement
// fits in a 16-bit signed field but isn't a multiple of 4, it will be
@@ -860,7 +874,13 @@ bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) {
// Check if a load/store has DQ form.
auto isDQFormCandidate = [&] (const Instruction *I, const Value *PtrValue) {
assert((PtrValue && I) && "Invalid parameter!");
- return !isa<IntrinsicInst>(I) && ST && ST->hasP9Vector() &&
+ // Check if it is a P10 lxvp/stxvp intrinsic.
+ auto *II = dyn_cast<IntrinsicInst>(I);
+ if (II)
+ return II->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp ||
+ II->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp;
+ // Check if it is a P9 vector load/store.
+ return ST && ST->hasP9Vector() &&
(PtrValue->getType()->getPointerElementType()->isVectorTy());
};
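
To summarize the lxvp/stxvp handling added across this file, here is a standalone sketch of just the classification rules: the paired intrinsics take their address in operand 0 (lxvp) or operand 1 (stxvp), are DQ-form candidates, and are excluded from update-form prep. MemKind and MemAccess are invented stand-ins for LLVM's IntrinsicInst/Value types; only the decisions mirror the patch.

#include <cstdio>

enum class MemKind { Load, Store, Prefetch, VSXLoadPair /*lxvp*/, VSXStorePair /*stxvp*/ };

struct MemAccess {
  MemKind Kind;
  bool PointerIsVectorTy; // models PtrValue->getType()->getPointerElementType()->isVectorTy()
};

// Which operand carries the pointer (models GetPointerOperand above).
int pointerOperandIndex(MemKind K) {
  switch (K) {
  case MemKind::Prefetch:
  case MemKind::VSXLoadPair:
    return 0; // first argument is the address
  case MemKind::VSXStorePair:
    return 1; // stxvp stores operand 0 to the address in operand 1
  default:
    return 0; // plain load/store pointer operand (simplified)
  }
}

// Models isDQFormCandidate: P10 paired intrinsics, or a P9 vector load/store.
bool isDQFormCandidate(const MemAccess &A, bool HasP9Vector) {
  if (A.Kind == MemKind::VSXLoadPair || A.Kind == MemKind::VSXStorePair)
    return true;
  return HasP9Vector && A.PointerIsVectorTy;
}

// Models the new bail-out in the update-form check: no update form exists for
// the paired load/store intrinsics.
bool isUpdateFormCandidate(const MemAccess &A) {
  return A.Kind != MemKind::VSXLoadPair && A.Kind != MemKind::VSXStorePair;
}

int main() {
  MemAccess Stxvp{MemKind::VSXStorePair, true};
  std::printf("stxvp: ptr operand %d, DQ-form %d, update-form %d\n",
              pointerOperandIndex(Stxvp.Kind), isDQFormCandidate(Stxvp, true),
              isUpdateFormCandidate(Stxvp));
  return 0;
}
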
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp
index 2b0e604e0ccd..27b2c9a628d0 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp
@@ -16,6 +16,7 @@
#include "PPC.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Instructions.h"
@@ -64,8 +65,7 @@ private:
/// Checks if the specified function name represents an entry in the MASSV
/// library.
bool PPCLowerMASSVEntries::isMASSVFunc(StringRef Name) {
- auto Iter = std::find(std::begin(MASSVFuncs), std::end(MASSVFuncs), Name);
- return Iter != std::end(MASSVFuncs);
+ return llvm::is_contained(MASSVFuncs, Name);
}
// FIXME:
@@ -105,7 +105,7 @@ bool PPCLowerMASSVEntries::handlePowSpecialCases(CallInst *CI, Function &Func,
return false;
if (Constant *Exp = dyn_cast<Constant>(CI->getArgOperand(1)))
- if (ConstantFP *CFP = dyn_cast<ConstantFP>(Exp->getSplatValue())) {
+ if (ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(Exp->getSplatValue())) {
// If the argument is 0.75 or 0.25 it is cheaper to turn it into pow
// intrinsic so that it could be optimzed as sequence of sqrt's.
if (!CI->hasNoInfs() || !CI->hasApproxFunc())
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
index 236f98f32e18..5cc180d770b2 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -74,7 +74,9 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
RefKind = MCSymbolRefExpr::VK_PPC_TOC_LO;
break;
case PPCII::MO_TLS:
- RefKind = MCSymbolRefExpr::VK_PPC_TLS;
+ bool IsPCRel = (MO.getTargetFlags() & ~access) == PPCII::MO_PCREL_FLAG;
+ RefKind = IsPCRel ? MCSymbolRefExpr::VK_PPC_TLS_PCREL
+ : MCSymbolRefExpr::VK_PPC_TLS;
break;
}
@@ -84,6 +86,14 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
RefKind = MCSymbolRefExpr::VK_PCREL;
else if (MO.getTargetFlags() == (PPCII::MO_PCREL_FLAG | PPCII::MO_GOT_FLAG))
RefKind = MCSymbolRefExpr::VK_PPC_GOT_PCREL;
+ else if (MO.getTargetFlags() == (PPCII::MO_PCREL_FLAG | PPCII::MO_TPREL_FLAG))
+ RefKind = MCSymbolRefExpr::VK_TPREL;
+ else if (MO.getTargetFlags() == PPCII::MO_GOT_TLSGD_PCREL_FLAG)
+ RefKind = MCSymbolRefExpr::VK_PPC_GOT_TLSGD_PCREL;
+ else if (MO.getTargetFlags() == PPCII::MO_GOT_TLSLD_PCREL_FLAG)
+ RefKind = MCSymbolRefExpr::VK_PPC_GOT_TLSLD_PCREL;
+ else if (MO.getTargetFlags() == PPCII::MO_GOT_TPREL_PCREL_FLAG)
+ RefKind = MCSymbolRefExpr::VK_PPC_GOT_TPREL_PCREL;
const MachineInstr *MI = MO.getParent();
const MachineFunction *MF = MI->getMF();
@@ -100,6 +110,8 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
MIOpcode == PPC::BL8_NOTOC) {
RefKind = MCSymbolRefExpr::VK_PPC_NOTOC;
}
+ if (MO.getTargetFlags() == PPCII::MO_PCREL_OPT_FLAG)
+ RefKind = MCSymbolRefExpr::VK_PPC_PCREL_OPT;
}
const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, RefKind, Ctx);
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
index 227c863685ae..c8b01aaef828 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -267,6 +267,113 @@ void PPCMIPeephole::UpdateTOCSaves(
TOCSaves[MI] = Keep;
}
+// This function returns a list of all PHI nodes in the tree starting from
+// the RootPHI node. We perform a BFS traversal to get an ordered list of nodes.
+// The list initially only contains the root PHI. When we visit a PHI node, we
+// add it to the list. We continue to look for other PHI node operands while
+// there are nodes to visit in the list. The function returns false if the
+// optimization cannot be applied on this tree.
+static bool collectUnprimedAccPHIs(MachineRegisterInfo *MRI,
+ MachineInstr *RootPHI,
+ SmallVectorImpl<MachineInstr *> &PHIs) {
+ PHIs.push_back(RootPHI);
+ unsigned VisitedIndex = 0;
+ while (VisitedIndex < PHIs.size()) {
+ MachineInstr *VisitedPHI = PHIs[VisitedIndex];
+ for (unsigned PHIOp = 1, NumOps = VisitedPHI->getNumOperands();
+ PHIOp != NumOps; PHIOp += 2) {
+ Register RegOp = VisitedPHI->getOperand(PHIOp).getReg();
+ if (!Register::isVirtualRegister(RegOp))
+ return false;
+ MachineInstr *Instr = MRI->getVRegDef(RegOp);
+ // While collecting the PHI nodes, we check if they can be converted (i.e.
+ // all the operands are either copies, implicit defs or PHI nodes).
+ unsigned Opcode = Instr->getOpcode();
+ if (Opcode == PPC::COPY) {
+ Register Reg = Instr->getOperand(1).getReg();
+ if (!Register::isVirtualRegister(Reg) ||
+ MRI->getRegClass(Reg) != &PPC::ACCRCRegClass)
+ return false;
+ } else if (Opcode != PPC::IMPLICIT_DEF && Opcode != PPC::PHI)
+ return false;
+ // If we detect a cycle in the PHI nodes, we exit. It would be
+ // possible to change cycles as well, but that would add a lot
+ // of complexity for a case that is unlikely to occur with MMA
+ // code.
+ if (Opcode != PPC::PHI)
+ continue;
+ if (llvm::is_contained(PHIs, Instr))
+ return false;
+ PHIs.push_back(Instr);
+ }
+ VisitedIndex++;
+ }
+ return true;
+}
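
The function above gathers the PHI web with a simple BFS whose worklist is the output vector itself. The sketch below replays that shape on a toy graph (Node and Kind are invented types, not MachineInstr): the walk aborts if any operand is not a copy, implicit def, or PHI, and also if a PHI is reached a second time, which covers the cycle case the comment mentions.

#include <algorithm>
#include <vector>

enum class Kind { Copy, ImplicitDef, Phi, Other };

struct Node {
  Kind K;
  std::vector<Node *> Operands; // only meaningful for PHI nodes
};

static bool collectPhis(Node *Root, std::vector<Node *> &Phis) {
  Phis.push_back(Root);
  for (size_t Visited = 0; Visited < Phis.size(); ++Visited) {
    for (Node *Op : Phis[Visited]->Operands) {
      // Every operand must be convertible: a copy, an implicit def, or a PHI.
      if (Op->K == Kind::Other)
        return false;
      if (Op->K != Kind::Phi)
        continue;
      // A PHI operand seen twice means a cycle (or a reconverging path);
      // give up rather than handle it, as the real pass does.
      if (std::find(Phis.begin(), Phis.end(), Op) != Phis.end())
        return false;
      Phis.push_back(Op);
    }
  }
  return true;
}

int main() {
  Node Copy{Kind::Copy, {}};
  Node Inner{Kind::Phi, {&Copy}};
  Node Root{Kind::Phi, {&Copy, &Inner}};
  std::vector<Node *> Phis;
  return collectPhis(&Root, Phis) ? 0 : 1; // succeeds: {Root, Inner} in BFS order
}
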
+
+// This function changes the unprimed accumulator PHI nodes in the PHIs list to
+// primed accumulator PHI nodes. The list is traversed in reverse order to
+// change all the PHI operands of a PHI node before changing the node itself.
+// We keep a map to associate each changed PHI node to its non-changed form.
+static void convertUnprimedAccPHIs(const PPCInstrInfo *TII,
+ MachineRegisterInfo *MRI,
+ SmallVectorImpl<MachineInstr *> &PHIs,
+ Register Dst) {
+ DenseMap<MachineInstr *, MachineInstr *> ChangedPHIMap;
+ for (auto It = PHIs.rbegin(), End = PHIs.rend(); It != End; ++It) {
+ MachineInstr *PHI = *It;
+ SmallVector<std::pair<MachineOperand, MachineOperand>, 4> PHIOps;
+ // We check if the current PHI node can be changed by looking at its
+ // operands. If all the operands are either copies from primed
+ // accumulators, implicit definitions or other unprimed accumulator
+ // PHI nodes, we change it.
+ for (unsigned PHIOp = 1, NumOps = PHI->getNumOperands(); PHIOp != NumOps;
+ PHIOp += 2) {
+ Register RegOp = PHI->getOperand(PHIOp).getReg();
+ MachineInstr *PHIInput = MRI->getVRegDef(RegOp);
+ unsigned Opcode = PHIInput->getOpcode();
+ assert((Opcode == PPC::COPY || Opcode == PPC::IMPLICIT_DEF ||
+ Opcode == PPC::PHI) &&
+ "Unexpected instruction");
+ if (Opcode == PPC::COPY) {
+ assert(MRI->getRegClass(PHIInput->getOperand(1).getReg()) ==
+ &PPC::ACCRCRegClass &&
+ "Unexpected register class");
+ PHIOps.push_back({PHIInput->getOperand(1), PHI->getOperand(PHIOp + 1)});
+ } else if (Opcode == PPC::IMPLICIT_DEF) {
+ Register AccReg = MRI->createVirtualRegister(&PPC::ACCRCRegClass);
+ BuildMI(*PHIInput->getParent(), PHIInput, PHIInput->getDebugLoc(),
+ TII->get(PPC::IMPLICIT_DEF), AccReg);
+ PHIOps.push_back({MachineOperand::CreateReg(AccReg, false),
+ PHI->getOperand(PHIOp + 1)});
+ } else if (Opcode == PPC::PHI) {
+ // We found a PHI operand. At this point we know this operand
+ // has already been changed so we get its associated changed form
+ // from the map.
+ assert(ChangedPHIMap.count(PHIInput) == 1 &&
+ "This PHI node should have already been changed.");
+ MachineInstr *PrimedAccPHI = ChangedPHIMap.lookup(PHIInput);
+ PHIOps.push_back({MachineOperand::CreateReg(
+ PrimedAccPHI->getOperand(0).getReg(), false),
+ PHI->getOperand(PHIOp + 1)});
+ }
+ }
+ Register AccReg = Dst;
+ // If the PHI node we are changing is the root node, the register it defines
+ // will be the destination register of the original copy (of the PHI def).
+ // For all other PHI's in the list, we need to create another primed
+ // accumulator virtual register as the PHI will no longer define the
+ // unprimed accumulator.
+ if (PHI != PHIs[0])
+ AccReg = MRI->createVirtualRegister(&PPC::ACCRCRegClass);
+ MachineInstrBuilder NewPHI = BuildMI(
+ *PHI->getParent(), PHI, PHI->getDebugLoc(), TII->get(PPC::PHI), AccReg);
+ for (auto RegMBB : PHIOps)
+ NewPHI.add(RegMBB.first).add(RegMBB.second);
+ ChangedPHIMap[PHI] = NewPHI.getInstr();
+ }
+}
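
The reverse walk above relies on the BFS order: every PHI operand of a node appears later in the list, so it has already been rewritten by the time the node itself is visited and its replacement can be taken from the map. A minimal standalone sketch of that pattern follows, with a toy Phi type in place of MachineInstr and only PHI-typed operands modelled.

#include <cassert>
#include <deque>
#include <map>
#include <vector>

struct Phi {
  std::vector<Phi *> PhiOperands; // PHI-typed operands only, for brevity
  bool Primed;
};

// Rewrites every unprimed PHI in BfsOrder (root first) into a primed clone and
// returns the clone of the root. NewStorage owns the clones.
static Phi *convertPhis(const std::vector<Phi *> &BfsOrder,
                        std::deque<Phi> &NewStorage) {
  std::map<Phi *, Phi *> Changed; // old unprimed PHI -> new primed PHI
  for (auto It = BfsOrder.rbegin(), End = BfsOrder.rend(); It != End; ++It) {
    Phi *Old = *It;
    NewStorage.push_back(Phi{{}, /*Primed=*/true});
    Phi *New = &NewStorage.back();
    for (Phi *Op : Old->PhiOperands) {
      // Operands appear later in BFS order, so they were visited earlier in
      // this reverse walk and must already be in the map.
      assert(Changed.count(Op) && "operand PHI should already be converted");
      New->PhiOperands.push_back(Changed[Op]);
    }
    Changed[Old] = New;
  }
  return Changed[BfsOrder.front()];
}

int main() {
  Phi Leaf{{}, false};          // PHI with no PHI operands
  Phi Root{{&Leaf}, false};     // root PHI feeding the copy
  std::deque<Phi> Storage;
  Phi *NewRoot = convertPhis({&Root, &Leaf}, Storage);
  return (NewRoot->Primed && NewRoot->PhiOperands.front()->Primed) ? 0 : 1;
}
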
+
// Perform peephole optimizations.
bool PPCMIPeephole::simplifyCode(void) {
bool Simplified = false;
@@ -321,6 +428,38 @@ bool PPCMIPeephole::simplifyCode(void) {
default:
break;
+ case PPC::COPY: {
+ Register Src = MI.getOperand(1).getReg();
+ Register Dst = MI.getOperand(0).getReg();
+ if (!Register::isVirtualRegister(Src) ||
+ !Register::isVirtualRegister(Dst))
+ break;
+ if (MRI->getRegClass(Src) != &PPC::UACCRCRegClass ||
+ MRI->getRegClass(Dst) != &PPC::ACCRCRegClass)
+ break;
+
+ // We are copying an unprimed accumulator to a primed accumulator.
+ // If the input to the copy is a PHI that is fed only by (i) copies in
+ // the other direction (ii) implicitly defined unprimed accumulators or
+ // (iii) other PHI nodes satisfying (i) and (ii), we can change
+ // the PHI to a PHI on primed accumulators (as long as we also change
+ // its operands). To detect and change such copies, we first get a list
+ // of all the PHI nodes starting from the root PHI node in BFS order.
+ // We then visit all these PHI nodes to check if they can be changed to
+ // primed accumulator PHI nodes and if so, we change them.
+ MachineInstr *RootPHI = MRI->getVRegDef(Src);
+ if (RootPHI->getOpcode() != PPC::PHI)
+ break;
+
+ SmallVector<MachineInstr *, 4> PHIs;
+ if (!collectUnprimedAccPHIs(MRI, RootPHI, PHIs))
+ break;
+
+ convertUnprimedAccPHIs(TII, MRI, PHIs, Dst);
+
+ ToErase = &MI;
+ break;
+ }
case PPC::LI:
case PPC::LI8: {
// If we are materializing a zero, look for any use operands for which
@@ -573,7 +712,7 @@ bool PPCMIPeephole::simplifyCode(void) {
Simplified = true;
Register ConvReg1 = RoundInstr->getOperand(1).getReg();
Register FRSPDefines = RoundInstr->getOperand(0).getReg();
- MachineInstr &Use = *(MRI->use_instr_begin(FRSPDefines));
+ MachineInstr &Use = *(MRI->use_instr_nodbg_begin(FRSPDefines));
for (int i = 0, e = Use.getNumOperands(); i < e; ++i)
if (Use.getOperand(i).isReg() &&
Use.getOperand(i).getReg() == FRSPDefines)
@@ -848,142 +987,9 @@ bool PPCMIPeephole::simplifyCode(void) {
case PPC::RLWINM_rec:
case PPC::RLWINM8:
case PPC::RLWINM8_rec: {
- unsigned FoldingReg = MI.getOperand(1).getReg();
- if (!Register::isVirtualRegister(FoldingReg))
- break;
-
- MachineInstr *SrcMI = MRI->getVRegDef(FoldingReg);
- if (SrcMI->getOpcode() != PPC::RLWINM &&
- SrcMI->getOpcode() != PPC::RLWINM_rec &&
- SrcMI->getOpcode() != PPC::RLWINM8 &&
- SrcMI->getOpcode() != PPC::RLWINM8_rec)
- break;
- assert((MI.getOperand(2).isImm() && MI.getOperand(3).isImm() &&
- MI.getOperand(4).isImm() && SrcMI->getOperand(2).isImm() &&
- SrcMI->getOperand(3).isImm() && SrcMI->getOperand(4).isImm()) &&
- "Invalid PPC::RLWINM Instruction!");
- uint64_t SHSrc = SrcMI->getOperand(2).getImm();
- uint64_t SHMI = MI.getOperand(2).getImm();
- uint64_t MBSrc = SrcMI->getOperand(3).getImm();
- uint64_t MBMI = MI.getOperand(3).getImm();
- uint64_t MESrc = SrcMI->getOperand(4).getImm();
- uint64_t MEMI = MI.getOperand(4).getImm();
-
- assert((MEMI < 32 && MESrc < 32 && MBMI < 32 && MBSrc < 32) &&
- "Invalid PPC::RLWINM Instruction!");
-
- // If MBMI is bigger than MEMI, we always can not get run of ones.
- // RotatedSrcMask non-wrap:
- // 0........31|32........63
- // RotatedSrcMask: B---E B---E
- // MaskMI: -----------|--E B------
- // Result: ----- --- (Bad candidate)
- //
- // RotatedSrcMask wrap:
- // 0........31|32........63
- // RotatedSrcMask: --E B----|--E B----
- // MaskMI: -----------|--E B------
- // Result: --- -----|--- ----- (Bad candidate)
- //
- // One special case is RotatedSrcMask is a full set mask.
- // RotatedSrcMask full:
- // 0........31|32........63
- // RotatedSrcMask: ------EB---|-------EB---
- // MaskMI: -----------|--E B------
- // Result: -----------|--- ------- (Good candidate)
-
- // Mark special case.
- bool SrcMaskFull = (MBSrc - MESrc == 1) || (MBSrc == 0 && MESrc == 31);
-
- // For other MBMI > MEMI cases, just return.
- if ((MBMI > MEMI) && !SrcMaskFull)
- break;
-
- // Handle MBMI <= MEMI cases.
- APInt MaskMI = APInt::getBitsSetWithWrap(32, 32 - MEMI - 1, 32 - MBMI);
- // In MI, we only need low 32 bits of SrcMI, just consider about low 32
- // bit of SrcMI mask. Note that in APInt, lowerest bit is at index 0,
- // while in PowerPC ISA, lowerest bit is at index 63.
- APInt MaskSrc =
- APInt::getBitsSetWithWrap(32, 32 - MESrc - 1, 32 - MBSrc);
-
- APInt RotatedSrcMask = MaskSrc.rotl(SHMI);
- APInt FinalMask = RotatedSrcMask & MaskMI;
- uint32_t NewMB, NewME;
-
- // If final mask is 0, MI result should be 0 too.
- if (FinalMask.isNullValue()) {
- bool Is64Bit = (MI.getOpcode() == PPC::RLWINM8 ||
- MI.getOpcode() == PPC::RLWINM8_rec);
-
- Simplified = true;
-
- LLVM_DEBUG(dbgs() << "Replace Instr: ");
- LLVM_DEBUG(MI.dump());
-
- if (MI.getOpcode() == PPC::RLWINM || MI.getOpcode() == PPC::RLWINM8) {
- // Replace MI with "LI 0"
- MI.RemoveOperand(4);
- MI.RemoveOperand(3);
- MI.RemoveOperand(2);
- MI.getOperand(1).ChangeToImmediate(0);
- MI.setDesc(TII->get(Is64Bit ? PPC::LI8 : PPC::LI));
- } else {
- // Replace MI with "ANDI_rec reg, 0"
- MI.RemoveOperand(4);
- MI.RemoveOperand(3);
- MI.getOperand(2).setImm(0);
- MI.setDesc(TII->get(Is64Bit ? PPC::ANDI8_rec : PPC::ANDI_rec));
- MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg());
- if (SrcMI->getOperand(1).isKill()) {
- MI.getOperand(1).setIsKill(true);
- SrcMI->getOperand(1).setIsKill(false);
- } else
- // About to replace MI.getOperand(1), clear its kill flag.
- MI.getOperand(1).setIsKill(false);
- }
-
- LLVM_DEBUG(dbgs() << "With: ");
- LLVM_DEBUG(MI.dump());
- } else if ((isRunOfOnes((unsigned)(FinalMask.getZExtValue()), NewMB,
- NewME) && NewMB <= NewME)|| SrcMaskFull) {
- // Here we only handle MBMI <= MEMI case, so NewMB must be no bigger
- // than NewME. Otherwise we get a 64 bit value after folding, but MI
- // return a 32 bit value.
-
- Simplified = true;
- LLVM_DEBUG(dbgs() << "Converting Instr: ");
- LLVM_DEBUG(MI.dump());
-
- uint16_t NewSH = (SHSrc + SHMI) % 32;
- MI.getOperand(2).setImm(NewSH);
- // If SrcMI mask is full, no need to update MBMI and MEMI.
- if (!SrcMaskFull) {
- MI.getOperand(3).setImm(NewMB);
- MI.getOperand(4).setImm(NewME);
- }
- MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg());
- if (SrcMI->getOperand(1).isKill()) {
- MI.getOperand(1).setIsKill(true);
- SrcMI->getOperand(1).setIsKill(false);
- } else
- // About to replace MI.getOperand(1), clear its kill flag.
- MI.getOperand(1).setIsKill(false);
-
- LLVM_DEBUG(dbgs() << "To: ");
- LLVM_DEBUG(MI.dump());
- }
- if (Simplified) {
- // If FoldingReg has no non-debug use and it has no implicit def (it
- // is not RLWINMO or RLWINM8o), it's safe to delete its def SrcMI.
- // Otherwise keep it.
+ Simplified = TII->combineRLWINM(MI, &ToErase);
+ if (Simplified)
++NumRotatesCollapsed;
- if (MRI->use_nodbg_empty(FoldingReg) && !SrcMI->hasImplicitDef()) {
- ToErase = SrcMI;
- LLVM_DEBUG(dbgs() << "Delete dead instruction: ");
- LLVM_DEBUG(SrcMI->dump());
- }
- }
break;
}
}
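
The large block removed above folded two rotate-and-mask (RLWINM) instructions into one; that logic now lives behind TII->combineRLWINM. The arithmetic it relied on is the identity rotl(rotl(V, SHSrc) & MaskSrc, SHMI) & MaskMI == rotl(V, (SHSrc + SHMI) % 32) & (rotl(MaskSrc, SHMI) & MaskMI), plus the extra requirement that the combined mask still be a (possibly wrapping) run of ones so it stays encodable as an RLWINM mask. The program below only checks that identity on one example; rotl32, ppcMask, and rlwinm are illustrative helpers, not LLVM code.

#include <cstdint>
#include <cstdio>

// Rotate a 32-bit value left.
static uint32_t rotl32(uint32_t V, unsigned N) {
  N &= 31;
  return N ? (V << N) | (V >> (32 - N)) : V;
}

// Mask selected by rlwinm's MB/ME fields. PowerPC numbers bits from the MSB
// (bit 0) to the LSB (bit 31); the mask runs from bit MB to bit ME inclusive
// and wraps around when MB > ME.
static uint32_t ppcMask(unsigned MB, unsigned ME) {
  uint32_t FromMB = 0xFFFFFFFFu >> MB;      // bits MB..31 set
  uint32_t ToME = 0xFFFFFFFFu << (31 - ME); // bits 0..ME set
  return MB <= ME ? (FromMB & ToME) : (FromMB | ToME);
}

// rlwinm rT, rS, SH, MB, ME computes rotl32(rS, SH) & mask(MB, ME).
static uint32_t rlwinm(uint32_t RS, unsigned SH, unsigned MB, unsigned ME) {
  return rotl32(RS, SH) & ppcMask(MB, ME);
}

int main() {
  unsigned SHSrc = 4, MBSrc = 28, MESrc = 31; // inner: keep the low nibble
  unsigned SHMI = 8, MBMI = 16, MEMI = 31;    // outer: keep the low halfword
  uint32_t Val = 0xDEADBEEFu;

  uint32_t Chained = rlwinm(rlwinm(Val, SHSrc, MBSrc, MESrc), SHMI, MBMI, MEMI);
  uint32_t FoldedMask = rotl32(ppcMask(MBSrc, MESrc), SHMI) & ppcMask(MBMI, MEMI);
  uint32_t Folded = rotl32(Val, (SHSrc + SHMI) % 32) & FoldedMask;

  std::printf("chained=0x%08X folded=0x%08X\n", (unsigned)Chained, (unsigned)Folded);
  return Chained == Folded ? 0 : 1;
}
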
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
index daf88589bb52..c976a9c62d3b 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
@@ -8,6 +8,7 @@
#include "PPCMachineFunctionInfo.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/XCOFF.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCContext.h"
#include "llvm/Support/CommandLine.h"
@@ -63,3 +64,36 @@ bool PPCFunctionInfo::isLiveInZExt(Register VReg) const {
return LiveIn.second.isZExt();
return false;
}
+
+void PPCFunctionInfo::appendParameterType(ParamType Type) {
+ uint32_t CopyParamType = ParameterType;
+ int Bits = 0;
+
+  // If it is a fixed type, we only need to increment FixedParamNum; a fixed
+  // type is encoded as a single zero bit, so we do not need to change
+  // ParameterType.
+ if (Type == FixedType) {
+ ++FixedParamNum;
+ return;
+ }
+
+ ++FloatingPointParamNum;
+
+ for (int I = 0;
+ I < static_cast<int>(FloatingPointParamNum + FixedParamNum - 1); ++I) {
+ if (CopyParamType & XCOFF::TracebackTable::ParmTypeIsFloatingBit) {
+ // '10'b => floating point short parameter.
+ // '11'b => floating point long parameter.
+ CopyParamType <<= 2;
+ Bits += 2;
+ } else {
+ // '0'b => fixed parameter.
+ CopyParamType <<= 1;
+ ++Bits;
+ }
+ }
+
+ assert(Type != FixedType && "FixedType should already be handled.");
+ if (Bits < 31)
+ ParameterType |= Type << (30 - Bits);
+}
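
The function above builds the XCOFF traceback-table parameter encoding MSB-first: a fixed parameter contributes a single '0' bit, a short (single-precision) floating-point parameter '10', and a long (double-precision) one '11'. The sketch below tracks the bit cursor directly instead of re-scanning the word; it is intended to match the behaviour of appendParameterType, but it is an illustration, not the LLVM implementation.

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

enum ParamType : uint32_t { FixedType = 0x0, ShortFloatPoint = 0x2, LongFloatPoint = 0x3 };

static uint32_t encodeParams(const std::vector<ParamType> &Params) {
  uint32_t Encoding = 0;
  int Bits = 0; // number of bits already consumed, starting at the MSB
  for (ParamType T : Params) {
    int Width = (T == FixedType) ? 1 : 2;
    if (Bits + Width > 32)
      break; // out of room: later parameters contribute no bits
    if (T != FixedType)
      Encoding |= static_cast<uint32_t>(T) << (30 - Bits); // place '10'/'11'
    // A fixed parameter contributes a '0' bit, so there is nothing to OR in.
    Bits += Width;
  }
  return Encoding;
}

int main() {
  // (int, double, int, float) -> bitstring 0 11 0 10 from the MSB.
  uint32_t E = encodeParams({FixedType, LongFloatPoint, FixedType, ShortFloatPoint});
  std::printf("encoding = 0x%08X\n", (unsigned)E); // prints 0x68000000
  assert(E == 0x68000000u);
  return 0;
}
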
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
index 29ca53e273d7..4b73b36318b4 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -22,6 +22,16 @@ namespace llvm {
/// PPCFunctionInfo - This class is derived from MachineFunction private
/// PowerPC target-specific information for each MachineFunction.
class PPCFunctionInfo : public MachineFunctionInfo {
+public:
+  // The values in ParamType indicate the bit strings used in the encoding
+  // format.
+ enum ParamType {
+ FixedType = 0x0,
+ ShortFloatPoint = 0x2,
+ LongFloatPoint = 0x3
+ };
+
+private:
virtual void anchor();
/// FramePointerSaveIndex - Frame index of where the old frame pointer is
@@ -69,9 +79,6 @@ class PPCFunctionInfo : public MachineFunctionInfo {
/// disabled.
bool DisableNonVolatileCR = false;
- /// Indicates whether VRSAVE is spilled in the current function.
- bool SpillsVRSAVE = false;
-
/// LRStoreRequired - The bool indicates whether there is some explicit use of
/// the LR/LR8 stack slot that is not obvious from scanning the code. This
/// requires that the code generator produce a store of LR to the stack on
@@ -110,6 +117,20 @@ class PPCFunctionInfo : public MachineFunctionInfo {
/// register for parameter passing.
unsigned VarArgsNumFPR = 0;
+  /// FixedParamNum - Number of fixed parameters.
+ unsigned FixedParamNum = 0;
+
+  /// FloatingPointParamNum - Number of floating-point parameters.
+ unsigned FloatingPointParamNum = 0;
+
+  /// ParameterType - Encoded type of every parameter,
+  /// in the order in which the parameters are passed.
+ /// Bitstring starts from the most significant (leftmost) bit.
+ /// '0'b => fixed parameter.
+ /// '10'b => floating point short parameter.
+ /// '11'b => floating point long parameter.
+ uint32_t ParameterType = 0;
+
/// CRSpillFrameIndex - FrameIndex for CR spill slot for 32-bit SVR4.
int CRSpillFrameIndex = 0;
@@ -175,9 +196,6 @@ public:
void setDisableNonVolatileCR() { DisableNonVolatileCR = true; }
bool isNonVolatileCRDisabled() const { return DisableNonVolatileCR; }
- void setSpillsVRSAVE() { SpillsVRSAVE = true; }
- bool isVRSAVESpilled() const { return SpillsVRSAVE; }
-
void setLRStoreRequired() { LRStoreRequired = true; }
bool isLRStoreRequired() const { return LRStoreRequired; }
@@ -196,6 +214,13 @@ public:
unsigned getVarArgsNumGPR() const { return VarArgsNumGPR; }
void setVarArgsNumGPR(unsigned Num) { VarArgsNumGPR = Num; }
+ unsigned getFixedParamNum() const { return FixedParamNum; }
+
+ unsigned getFloatingPointParamNum() const { return FloatingPointParamNum; }
+
+ uint32_t getParameterType() const { return ParameterType; }
+ void appendParameterType(ParamType Type);
+
unsigned getVarArgsNumFPR() const { return VarArgsNumFPR; }
void setVarArgsNumFPR(unsigned Num) { VarArgsNumFPR = Num; }
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp
index 5649d7d13966..ce615e554d94 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp
@@ -49,10 +49,103 @@ bool PPCPreRASchedStrategy::biasAddiLoadCandidate(SchedCandidate &Cand,
void PPCPreRASchedStrategy::tryCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand,
SchedBoundary *Zone) const {
- GenericScheduler::tryCandidate(Cand, TryCand, Zone);
+ // From GenericScheduler::tryCandidate
- if (!Cand.isValid() || !Zone)
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
return;
+ }
+
+ // Bias PhysReg Defs and copies to their uses and defined respectively.
+ if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
+ biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
+ return;
+
+ // Avoid exceeding the target's limit.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
+ RegExcess, TRI, DAG->MF))
+ return;
+
+ // Avoid increasing the max critical pressure in the scheduled region.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
+ TryCand, Cand, RegCritical, TRI, DAG->MF))
+ return;
+
+ // We only compare a subset of features when comparing nodes between
+ // Top and Bottom boundary. Some properties are simply incomparable, in many
+ // other instances we should only override the other boundary if something
+ // is a clear good pick on one boundary. Skip heuristics that are more
+ // "tie-breaking" in nature.
+ bool SameBoundary = Zone != nullptr;
+ if (SameBoundary) {
+ // For loops that are acyclic path limited, aggressively schedule for
+ // latency. Within an single cycle, whenever CurrMOps > 0, allow normal
+ // heuristics to take precedence.
+ if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
+ tryLatency(TryCand, Cand, *Zone))
+ return;
+
+ // Prioritize instructions that read unbuffered resources by stall cycles.
+ if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
+ Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+ return;
+ }
+
+ // Keep clustered nodes together to encourage downstream peephole
+ // optimizations which may reduce resource requirements.
+ //
+ // This is a best effort to set things up for a post-RA pass. Optimizations
+ // like generating loads of multiple registers should ideally be done within
+ // the scheduler pass by combining the loads during DAG postprocessing.
+ const SUnit *CandNextClusterSU =
+ Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+ const SUnit *TryCandNextClusterSU =
+ TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+ if (tryGreater(TryCand.SU == TryCandNextClusterSU,
+ Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
+ return;
+
+ if (SameBoundary) {
+ // Weak edges are for clustering and other constraints.
+ if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
+ getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
+ return;
+ }
+
+ // Avoid increasing the max pressure of the entire region.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
+ Cand, RegMax, TRI, DAG->MF))
+ return;
+
+ if (SameBoundary) {
+ // Avoid critical resource consumption and balance the schedule.
+ TryCand.initResourceDelta(DAG, SchedModel);
+ if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+ TryCand, Cand, ResourceReduce))
+ return;
+ if (tryGreater(TryCand.ResDelta.DemandedResources,
+ Cand.ResDelta.DemandedResources, TryCand, Cand,
+ ResourceDemand))
+ return;
+
+ // Avoid serializing long latency dependence chains.
+ // For acyclic path limited loops, latency was already checked above.
+ if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
+ !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
+ return;
+
+ // Fall through to original instruction order.
+ if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
+ (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
+ TryCand.Reason = NodeOrder;
+ }
+ }
+
+ // GenericScheduler::tryCandidate end
// Add powerpc specific heuristic only when TryCand isn't selected or
// selected as node order.
@@ -61,8 +154,10 @@ void PPCPreRASchedStrategy::tryCandidate(SchedCandidate &Cand,
// There are some benefits to schedule the ADDI before the load to hide the
// latency, as RA may create a true dependency between the load and addi.
- if (biasAddiLoadCandidate(Cand, TryCand, *Zone))
- return;
+ if (SameBoundary) {
+ if (biasAddiLoadCandidate(Cand, TryCand, *Zone))
+ return;
+ }
}
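// A minimal standalone sketch of the selection pattern both tryCandidate
// bodies above follow: heuristics are evaluated in a fixed priority order,
// the first one that tells the two candidates apart decides, and everything
// else falls through to original instruction order (NodeOrder). The fields
// and the simple bool result below are illustrative assumptions, not the
// real SchedCandidate machinery.
#include <cstdio>

struct Candidate {
  unsigned NodeNum;      // original program order
  int StallCycles;       // stall cycles on unbuffered resources
  int RegPressureDelta;  // register pressure change if scheduled
};

// Returns true if Try should replace the current best candidate.
static bool tryCandidateSketch(const Candidate &Best, const Candidate &Try,
                               bool IsTopDown) {
  // 1. Avoid increasing register pressure.
  if (Try.RegPressureDelta != Best.RegPressureDelta)
    return Try.RegPressureDelta < Best.RegPressureDelta;
  // 2. Prefer instructions that stall less.
  if (Try.StallCycles != Best.StallCycles)
    return Try.StallCycles < Best.StallCycles;
  // 3. Fall through to original instruction order (NodeOrder).
  return IsTopDown ? Try.NodeNum < Best.NodeNum : Try.NodeNum > Best.NodeNum;
}

int main() {
  Candidate Best{4, 2, 0}, Try{3, 2, 0};
  // Same pressure and stalls, so program order decides: node 3 wins top-down.
  std::printf("prefer Try: %d\n", tryCandidateSketch(Best, Try, true));
}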
bool PPCPostRASchedStrategy::biasAddiCandidate(SchedCandidate &Cand,
@@ -79,11 +174,44 @@ bool PPCPostRASchedStrategy::biasAddiCandidate(SchedCandidate &Cand,
void PPCPostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand) {
- PostGenericScheduler::tryCandidate(Cand, TryCand);
+ // From PostGenericScheduler::tryCandidate
+
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return;
+ }
+
+ // Prioritize instructions that read unbuffered resources by stall cycles.
+ if (tryLess(Top.getLatencyStallCycles(TryCand.SU),
+ Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+ return;
- if (!Cand.isValid())
+ // Keep clustered nodes together.
+ if (tryGreater(TryCand.SU == DAG->getNextClusterSucc(),
+ Cand.SU == DAG->getNextClusterSucc(), TryCand, Cand, Cluster))
return;
+ // Avoid critical resource consumption and balance the schedule.
+ if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+ TryCand, Cand, ResourceReduce))
+ return;
+ if (tryGreater(TryCand.ResDelta.DemandedResources,
+ Cand.ResDelta.DemandedResources, TryCand, Cand,
+ ResourceDemand))
+ return;
+
+ // Avoid serializing long latency dependence chains.
+ if (Cand.Policy.ReduceLatency && tryLatency(TryCand, Cand, Top)) {
+ return;
+ }
+
+ // Fall through to original instruction order.
+ if (TryCand.SU->NodeNum < Cand.SU->NodeNum)
+ TryCand.Reason = NodeOrder;
+
+ // PostGenericScheduler::tryCandidate end
+
// Add powerpc post ra specific heuristic only when TryCand isn't selected or
// selected as node order.
if (TryCand.Reason != NodeOrder && TryCand.Reason != NoCand)
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp
index 815dfd1402f4..d12c6d9cd406 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp
@@ -51,8 +51,8 @@ public:
Kd(Kind), Supported(HasFeature), DepOpIdx(Index), OpSet1(First),
OpSet2(Second) {}
- bool hasOp1(unsigned Opc) const { return OpSet1.count(Opc) != 0; }
- bool hasOp2(unsigned Opc) const { return OpSet2.count(Opc) != 0; }
+ bool hasOp1(unsigned Opc) const { return OpSet1.contains(Opc); }
+ bool hasOp2(unsigned Opc) const { return OpSet2.contains(Opc); }
bool isSupported() const { return Supported; }
Optional<unsigned> depOpIdx() const {
if (DepOpIdx < 0)
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
index 4ea714ff15f7..a8853609a7c8 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
@@ -21,8 +21,8 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/ADT/Statistic.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
@@ -39,10 +39,54 @@ STATISTIC(NumFrameOffFoldInPreEmit,
"Number of folding frame offset by using r+r in pre-emit peephole");
static cl::opt<bool>
+EnablePCRelLinkerOpt("ppc-pcrel-linker-opt", cl::Hidden, cl::init(true),
+ cl::desc("enable PC Relative linker optimization"));
+
+static cl::opt<bool>
RunPreEmitPeephole("ppc-late-peephole", cl::Hidden, cl::init(true),
cl::desc("Run pre-emit peephole optimizations."));
namespace {
+
+static bool hasPCRelativeForm(MachineInstr &Use) {
+ switch (Use.getOpcode()) {
+ default:
+ return false;
+ case PPC::LBZ:
+ case PPC::LBZ8:
+ case PPC::LHA:
+ case PPC::LHA8:
+ case PPC::LHZ:
+ case PPC::LHZ8:
+ case PPC::LWZ:
+ case PPC::LWZ8:
+ case PPC::STB:
+ case PPC::STB8:
+ case PPC::STH:
+ case PPC::STH8:
+ case PPC::STW:
+ case PPC::STW8:
+ case PPC::LD:
+ case PPC::STD:
+ case PPC::LWA:
+ case PPC::LXSD:
+ case PPC::LXSSP:
+ case PPC::LXV:
+ case PPC::STXSD:
+ case PPC::STXSSP:
+ case PPC::STXV:
+ case PPC::LFD:
+ case PPC::LFS:
+ case PPC::STFD:
+ case PPC::STFS:
+ case PPC::DFLOADf32:
+ case PPC::DFLOADf64:
+ case PPC::DFSTOREf32:
+ case PPC::DFSTOREf64:
+ return true;
+ }
+}
+
class PPCPreEmitPeephole : public MachineFunctionPass {
public:
static char ID;
@@ -77,7 +121,7 @@ namespace {
for (auto BBI = MBB.instr_begin(); BBI != MBB.instr_end(); ++BBI) {
// Skip load immediate that is marked to be erased later because it
// cannot be used to replace any other instructions.
- if (InstrsToErase.find(&*BBI) != InstrsToErase.end())
+ if (InstrsToErase.contains(&*BBI))
continue;
// Skip non-load immediate.
unsigned Opc = BBI->getOpcode();
@@ -172,6 +216,196 @@ namespace {
return !InstrsToErase.empty();
}
+ // Check if this instruction is a PLDpc that is part of a GOT indirect
+ // access.
+ bool isGOTPLDpc(MachineInstr &Instr) {
+ if (Instr.getOpcode() != PPC::PLDpc)
+ return false;
+
+ // The result must be a register.
+ const MachineOperand &LoadedAddressReg = Instr.getOperand(0);
+ if (!LoadedAddressReg.isReg())
+ return false;
+
+ // Make sure that this is a global symbol.
+ const MachineOperand &SymbolOp = Instr.getOperand(1);
+ if (!SymbolOp.isGlobal())
+ return false;
+
+ // Finally return true only if the GOT flag is present.
+ return (SymbolOp.getTargetFlags() & PPCII::MO_GOT_FLAG);
+ }
+
+ bool addLinkerOpt(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) {
+ MachineFunction *MF = MBB.getParent();
+ // If the linker opt is disabled then just return.
+ if (!EnablePCRelLinkerOpt)
+ return false;
+
+ // Add this linker opt only if we are using PC Relative memops.
+ if (!MF->getSubtarget<PPCSubtarget>().isUsingPCRelativeCalls())
+ return false;
+
+ // Struct to keep track of one def/use pair for a GOT indirect access.
+ struct GOTDefUsePair {
+ MachineBasicBlock::iterator DefInst;
+ MachineBasicBlock::iterator UseInst;
+ Register DefReg;
+ Register UseReg;
+ bool StillValid;
+ };
+ // Vector of def/use pairs in this basic block.
+ SmallVector<GOTDefUsePair, 4> CandPairs;
+ SmallVector<GOTDefUsePair, 4> ValidPairs;
+ bool MadeChange = false;
+
+ // Run through all of the instructions in the basic block and try to
+ // collect potential pairs of GOT indirect access instructions.
+ for (auto BBI = MBB.instr_begin(); BBI != MBB.instr_end(); ++BBI) {
+ // Look for the initial GOT indirect load.
+ if (isGOTPLDpc(*BBI)) {
+ GOTDefUsePair CurrentPair{BBI, MachineBasicBlock::iterator(),
+ BBI->getOperand(0).getReg(),
+ PPC::NoRegister, true};
+ CandPairs.push_back(CurrentPair);
+ continue;
+ }
+
+ // We haven't encountered any new PLD instructions; nothing to check.
+ if (CandPairs.empty())
+ continue;
+
+ // Run through the candidate pairs and see if any of the registers
+ // defined in the PLD instructions are used by this instruction.
+ // Note: the size of CandPairs can change in the loop.
+ for (unsigned Idx = 0; Idx < CandPairs.size(); Idx++) {
+ GOTDefUsePair &Pair = CandPairs[Idx];
+ // The instruction does not use or modify this PLD's def reg,
+ // ignore it.
+ if (!BBI->readsRegister(Pair.DefReg, TRI) &&
+ !BBI->modifiesRegister(Pair.DefReg, TRI))
+ continue;
+
+ // The use needs to be used in the address computation and not
+ // as the register being stored for a store.
+ const MachineOperand *UseOp =
+ hasPCRelativeForm(*BBI) ? &BBI->getOperand(2) : nullptr;
+
+ // Check for a valid use.
+ if (UseOp && UseOp->isReg() && UseOp->getReg() == Pair.DefReg &&
+ UseOp->isUse() && UseOp->isKill()) {
+ Pair.UseInst = BBI;
+ Pair.UseReg = BBI->getOperand(0).getReg();
+ ValidPairs.push_back(Pair);
+ }
+ CandPairs.erase(CandPairs.begin() + Idx);
+ }
+ }
+
+ // Go through all of the pairs and check for any more valid uses.
+ for (auto Pair = ValidPairs.begin(); Pair != ValidPairs.end(); Pair++) {
+ // We shouldn't be here if we don't have a valid pair.
+ assert(Pair->UseInst.isValid() && Pair->StillValid &&
+ "Kept an invalid def/use pair for GOT PCRel opt");
+ // We have found a potential pair. Search through the instructions
+ // between the def and the use to see if it is valid to mark this as a
+ // linker opt.
+ MachineBasicBlock::iterator BBI = Pair->DefInst;
+ ++BBI;
+ for (; BBI != Pair->UseInst; ++BBI) {
+ if (BBI->readsRegister(Pair->UseReg, TRI) ||
+ BBI->modifiesRegister(Pair->UseReg, TRI)) {
+ Pair->StillValid = false;
+ break;
+ }
+ }
+
+ if (!Pair->StillValid)
+ continue;
+
+ // The load/store instruction that uses the address from the PLD will
+ // either use a register (for a store) or define a register (for the
+ // load). That register will be added as an implicit def to the PLD
+ // and as an implicit use on the second memory op. This is a precaution
+ // to prevent future passes from using that register between the two
+ // instructions.
+ MachineOperand ImplDef =
+ MachineOperand::CreateReg(Pair->UseReg, true, true);
+ MachineOperand ImplUse =
+ MachineOperand::CreateReg(Pair->UseReg, false, true);
+ Pair->DefInst->addOperand(ImplDef);
+ Pair->UseInst->addOperand(ImplUse);
+
+ // Create the symbol.
+ MCContext &Context = MF->getContext();
+ MCSymbol *Symbol = Context.createNamedTempSymbol("pcrel");
+ MachineOperand PCRelLabel =
+ MachineOperand::CreateMCSymbol(Symbol, PPCII::MO_PCREL_OPT_FLAG);
+ Pair->DefInst->addOperand(*MF, PCRelLabel);
+ Pair->UseInst->addOperand(*MF, PCRelLabel);
+ MadeChange |= true;
+ }
+ return MadeChange;
+ }
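// A minimal standalone sketch of the def/use pairing scan that addLinkerOpt
// performs above: remember each candidate def, attach the first instruction
// that reads its register, and drop the candidate if the register is
// redefined first. The Inst model and register numbers are illustrative
// assumptions; the real pass additionally checks kill flags, the operand
// position, and intervening uses of the loaded register.
#include <cstddef>
#include <cstdio>
#include <vector>

struct Inst {
  int Def = -1;                // register defined here (-1: none)
  int Use = -1;                // register read here (-1: none)
  bool IsCandidateDef = false; // e.g. a GOT-indirect PLDpc
};

struct Pair { std::size_t DefIdx, UseIdx; };

static std::vector<Pair> findPairs(const std::vector<Inst> &Block) {
  std::vector<Pair> Valid;
  for (std::size_t D = 0; D < Block.size(); ++D) {
    if (!Block[D].IsCandidateDef)
      continue;
    int Reg = Block[D].Def;
    for (std::size_t U = D + 1; U < Block.size(); ++U) {
      if (Block[U].Use == Reg) { // first use of the candidate def: pair up
        Valid.push_back({D, U});
        break;
      }
      if (Block[U].Def == Reg)   // redefined before any use: drop candidate
        break;
    }
  }
  return Valid;
}

int main() {
  std::vector<Inst> Block = {
      {3, -1, true},  // candidate def of r3 (think: a PC-relative GOT load)
      {5, -1, false}, // unrelated instruction
      {-1, 3, false}, // first use of r3 (think: a load from 0(r3))
  };
  for (const Pair &P : findPairs(Block))
    std::printf("pair: def at %zu, use at %zu\n", P.DefIdx, P.UseIdx);
}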
+
+ // This function removes redundant pairs of accumulator prime/unprime
+ // instructions. In some situations, it's possible the compiler inserts an
+ // accumulator prime instruction followed by an unprime instruction (e.g.
+ // when we store an accumulator after restoring it from a spill). If the
+ // accumulator is not used between the two, they can be removed. This
+ // function removes these redundant pairs from basic blocks.
+ // The algorithm is quite straightforward - every time we encounter a prime
+ // instruction, the primed register is added to a candidate set. Any use
+ // other than a prime removes the candidate from the set and any de-prime
+ // of a current candidate marks both the prime and de-prime for removal.
+ // This way we ensure we only remove prime/de-prime *pairs* with no
+ // intervening uses.
+ bool removeAccPrimeUnprime(MachineBasicBlock &MBB) {
+ DenseSet<MachineInstr *> InstrsToErase;
+ // Initially, none of the acc registers are candidates.
+ SmallVector<MachineInstr *, 8> Candidates(
+ PPC::UACCRCRegClass.getNumRegs(), nullptr);
+
+ for (MachineInstr &BBI : MBB.instrs()) {
+ unsigned Opc = BBI.getOpcode();
+ // If we are visiting a xxmtacc instruction, we add it and its operand
+ // register to the candidate set.
+ if (Opc == PPC::XXMTACC) {
+ Register Acc = BBI.getOperand(0).getReg();
+ assert(PPC::ACCRCRegClass.contains(Acc) &&
+ "Unexpected register for XXMTACC");
+ Candidates[Acc - PPC::ACC0] = &BBI;
+ }
+ // If we are visiting a xxmfacc instruction and its operand register is
+ // in the candidate set, we mark the two instructions for removal.
+ else if (Opc == PPC::XXMFACC) {
+ Register Acc = BBI.getOperand(0).getReg();
+ assert(PPC::ACCRCRegClass.contains(Acc) &&
+ "Unexpected register for XXMFACC");
+ if (!Candidates[Acc - PPC::ACC0])
+ continue;
+ InstrsToErase.insert(&BBI);
+ InstrsToErase.insert(Candidates[Acc - PPC::ACC0]);
+ }
+ // If we are visiting an instruction using an accumulator register
+ // as operand, we remove it from the candidate set.
+ else {
+ for (MachineOperand &Operand : BBI.operands()) {
+ if (!Operand.isReg())
+ continue;
+ Register Reg = Operand.getReg();
+ if (PPC::ACCRCRegClass.contains(Reg))
+ Candidates[Reg - PPC::ACC0] = nullptr;
+ }
+ }
+ }
+
+ for (MachineInstr *MI : InstrsToErase)
+ MI->eraseFromParent();
+ NumRemovedInPreEmit += InstrsToErase.size();
+ return !InstrsToErase.empty();
+ }
+
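// A minimal standalone sketch of the prime/unprime pair elimination above:
// a prime (XXMTACC) makes its accumulator a candidate, any other use of that
// accumulator clears it, and an unprime (XXMFACC) of a live candidate marks
// both instructions for removal. The Op model is an illustrative assumption.
#include <cstddef>
#include <cstdio>
#include <set>
#include <vector>

enum class Kind { Prime, Unprime, OtherUse };
struct Op { Kind K; int Acc; };

static std::set<std::size_t> redundantPairs(const std::vector<Op> &Block) {
  std::vector<int> Candidate(8, -1); // pending prime index per accumulator
  std::set<std::size_t> ToErase;
  for (std::size_t I = 0; I < Block.size(); ++I) {
    const Op &O = Block[I];
    switch (O.K) {
    case Kind::Prime:
      Candidate[O.Acc] = static_cast<int>(I);
      break;
    case Kind::Unprime:
      if (Candidate[O.Acc] >= 0) {
        ToErase.insert(static_cast<std::size_t>(Candidate[O.Acc]));
        ToErase.insert(I);
      }
      break;
    case Kind::OtherUse:
      Candidate[O.Acc] = -1;           // intervening use invalidates the pair
      break;
    }
  }
  return ToErase;
}

int main() {
  // Prime acc0 and immediately unprime it: both instructions are removable.
  std::vector<Op> Block = {{Kind::Prime, 0}, {Kind::Unprime, 0}};
  std::printf("%zu instructions removable\n", redundantPairs(Block).size());
}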
bool runOnMachineFunction(MachineFunction &MF) override {
if (skipFunction(MF.getFunction()) || !RunPreEmitPeephole) {
// Remove UNENCODED_NOP even when this pass is disabled.
@@ -192,6 +426,8 @@ namespace {
SmallVector<MachineInstr *, 4> InstrsToErase;
for (MachineBasicBlock &MBB : MF) {
Changed |= removeRedundantLIs(MBB, TRI);
+ Changed |= addLinkerOpt(MBB, TRI);
+ Changed |= removeAccPrimeUnprime(MBB);
for (MachineInstr &MI : MBB) {
unsigned Opc = MI.getOpcode();
if (Opc == PPC::UNENCODED_NOP) {
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
deleted file mode 100644
index 6e9042643820..000000000000
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
+++ /dev/null
@@ -1,161 +0,0 @@
-//===----- PPCQPXLoadSplat.cpp - QPX Load Splat Simplification ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The QPX vector registers overlay the scalar floating-point registers, and
-// any scalar floating-point loads splat their value across all vector lanes.
-// Thus, if we have a scalar load followed by a splat, we can remove the splat
-// (i.e. replace the load with a load-and-splat pseudo instruction).
-//
-// This pass must run after anything that might do store-to-load forwarding.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PPC.h"
-#include "PPCInstrBuilder.h"
-#include "PPCInstrInfo.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetMachine.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "ppc-qpx-load-splat"
-
-STATISTIC(NumSimplified, "Number of QPX load splats simplified");
-
-namespace {
- struct PPCQPXLoadSplat : public MachineFunctionPass {
- static char ID;
- PPCQPXLoadSplat() : MachineFunctionPass(ID) {
- initializePPCQPXLoadSplatPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &Fn) override;
-
- StringRef getPassName() const override {
- return "PowerPC QPX Load Splat Simplification";
- }
- };
- char PPCQPXLoadSplat::ID = 0;
-}
-
-INITIALIZE_PASS(PPCQPXLoadSplat, "ppc-qpx-load-splat",
- "PowerPC QPX Load Splat Simplification",
- false, false)
-
-FunctionPass *llvm::createPPCQPXLoadSplatPass() {
- return new PPCQPXLoadSplat();
-}
-
-bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) {
- if (skipFunction(MF.getFunction()))
- return false;
-
- bool MadeChange = false;
- const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
-
- for (auto MFI = MF.begin(), MFIE = MF.end(); MFI != MFIE; ++MFI) {
- MachineBasicBlock *MBB = &*MFI;
- SmallVector<MachineInstr *, 4> Splats;
-
- for (auto MBBI = MBB->rbegin(); MBBI != MBB->rend(); ++MBBI) {
- MachineInstr *MI = &*MBBI;
-
- if (MI->hasUnmodeledSideEffects() || MI->isCall()) {
- Splats.clear();
- continue;
- }
-
- // We're looking for a sequence like this:
- // %f0 = LFD 0, killed %x3, implicit-def %qf0; mem:LD8[%a](tbaa=!2)
- // %qf1 = QVESPLATI killed %qf0, 0, implicit %rm
-
- for (auto SI = Splats.begin(); SI != Splats.end();) {
- MachineInstr *SMI = *SI;
- Register SplatReg = SMI->getOperand(0).getReg();
- Register SrcReg = SMI->getOperand(1).getReg();
-
- if (MI->modifiesRegister(SrcReg, TRI)) {
- switch (MI->getOpcode()) {
- default:
- SI = Splats.erase(SI);
- continue;
- case PPC::LFS:
- case PPC::LFD:
- case PPC::LFSU:
- case PPC::LFDU:
- case PPC::LFSUX:
- case PPC::LFDUX:
- case PPC::LFSX:
- case PPC::LFDX:
- case PPC::LFIWAX:
- case PPC::LFIWZX:
- if (SplatReg != SrcReg) {
- // We need to change the load to define the scalar subregister of
- // the QPX splat source register.
- unsigned SubRegIndex =
- TRI->getSubRegIndex(SrcReg, MI->getOperand(0).getReg());
- Register SplatSubReg = TRI->getSubReg(SplatReg, SubRegIndex);
-
- // Substitute both the explicit defined register, and also the
- // implicit def of the containing QPX register.
- MI->getOperand(0).setReg(SplatSubReg);
- MI->substituteRegister(SrcReg, SplatReg, 0, *TRI);
- }
-
- SI = Splats.erase(SI);
-
- // If SMI is directly after MI, then MBBI's base iterator is
- // pointing at SMI. Adjust MBBI around the call to erase SMI to
- // avoid invalidating MBBI.
- ++MBBI;
- SMI->eraseFromParent();
- --MBBI;
-
- ++NumSimplified;
- MadeChange = true;
- continue;
- }
- }
-
- // If this instruction defines the splat register, then we cannot move
- // the previous definition above it. If it reads from the splat
- // register, then it must already be alive from some previous
- // definition, and if the splat register is different from the source
- // register, then this definition must not be the load for which we're
- // searching.
- if (MI->modifiesRegister(SplatReg, TRI) ||
- (SrcReg != SplatReg &&
- MI->readsRegister(SplatReg, TRI))) {
- SI = Splats.erase(SI);
- continue;
- }
-
- ++SI;
- }
-
- if (MI->getOpcode() != PPC::QVESPLATI &&
- MI->getOpcode() != PPC::QVESPLATIs &&
- MI->getOpcode() != PPC::QVESPLATIb)
- continue;
- if (MI->getOperand(2).getImm() != 0)
- continue;
-
- // If there are other uses of the scalar value after this, replacing
- // those uses might be non-trivial.
- if (!MI->getOperand(1).isKill())
- continue;
-
- Splats.push_back(MI);
- }
- }
-
- return MadeChange;
-}
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
index 90cc81beb89d..5cee00c61fc1 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
@@ -206,9 +206,9 @@ static bool splitMBB(BlockSplitInfo &BSI) {
NewMBB->splice(NewMBB->end(), ThisMBB, InsertPoint, ThisMBB->end());
NewMBB->transferSuccessors(ThisMBB);
if (!ProbOrigTarget.isUnknown()) {
- auto MBBI = std::find(NewMBB->succ_begin(), NewMBB->succ_end(), OrigTarget);
+ auto MBBI = find(NewMBB->successors(), OrigTarget);
NewMBB->setSuccProbability(MBBI, ProbOrigTarget);
- MBBI = std::find(NewMBB->succ_begin(), NewMBB->succ_end(), OrigFallThrough);
+ MBBI = find(NewMBB->successors(), OrigFallThrough);
NewMBB->setSuccProbability(MBBI, ProbOrigFallThrough);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index ed8948a63972..178a13443e2a 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -75,6 +75,21 @@ MaxCRBitSpillDist("ppc-max-crbit-spill-dist",
"spill on ppc"),
cl::Hidden, cl::init(100));
+// Copies/moves of physical accumulators are expensive operations
+// that should be avoided whenever possible. MMA instructions are
+// meant to be used in performance-sensitive computational kernels.
+// This option is provided, at least for the time being, to give the
+// user a tool to detect this expensive operation and either rework
+// their code or report a compiler bug if that turns out to be the
+// cause.
+#ifndef NDEBUG
+static cl::opt<bool>
+ReportAccMoves("ppc-report-acc-moves",
+ cl::desc("Emit information about accumulator register spills "
+ "and copies"),
+ cl::Hidden, cl::init(false));
+#endif
+
static unsigned offsetMinAlignForOpcode(unsigned OpC);
PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM)
@@ -141,6 +156,10 @@ PPCRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
const MCPhysReg*
PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const PPCSubtarget &Subtarget = MF->getSubtarget<PPCSubtarget>();
+ if (Subtarget.isAIXABI() &&
+ (Subtarget.hasAltivec() && !TM.getAIXExtendedAltivecABI()))
+ report_fatal_error("the default AIX Altivec ABI is not yet "
+ "supported.");
if (MF->getFunction().getCallingConv() == CallingConv::AnyReg) {
if (!TM.isPPC64() && Subtarget.isAIXABI())
report_fatal_error("AnyReg unimplemented on 32-bit AIX.");
@@ -187,8 +206,11 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return SaveR2 ? CSR_PPC64_R2_SaveList : CSR_PPC64_SaveList;
}
// 32-bit targets.
- if (Subtarget.isAIXABI())
+ if (Subtarget.isAIXABI()) {
+ if (Subtarget.hasAltivec())
+ return CSR_AIX32_Altivec_SaveList;
return CSR_AIX32_SaveList;
+ }
if (Subtarget.hasAltivec())
return CSR_SVR432_Altivec_SaveList;
else if (Subtarget.hasSPE())
@@ -209,8 +231,10 @@ PPCRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
}
if (Subtarget.isAIXABI()) {
- assert(!Subtarget.hasAltivec() && "Altivec is not implemented on AIX yet.");
- return TM.isPPC64() ? CSR_PPC64_RegMask : CSR_AIX32_RegMask;
+ return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_PPC64_Altivec_RegMask
+ : CSR_PPC64_RegMask)
+ : (Subtarget.hasAltivec() ? CSR_AIX32_Altivec_RegMask
+ : CSR_AIX32_RegMask);
}
if (CC == CallingConv::Cold) {
@@ -404,9 +428,6 @@ unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
}
case PPC::F8RCRegClassID:
case PPC::F4RCRegClassID:
- case PPC::QFRCRegClassID:
- case PPC::QSRCRegClassID:
- case PPC::QBRCRegClassID:
case PPC::VRRCRegClassID:
case PPC::VFRCRegClassID:
case PPC::VSLRCRegClassID:
@@ -830,6 +851,16 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II,
SpillsKnownBit = true;
break;
default:
+ // On Power10, we can use SETNBC to spill all CR bits. SETNBC will set all
+ // bits (specifically, it produces a -1 if the CR bit is set). Ultimately,
+ // the bit that is of importance to us is bit 32 (bit 0 of a 32-bit
+ // register), and SETNBC will set this.
+ if (Subtarget.isISA3_1()) {
+ BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::SETNBC8 : PPC::SETNBC), Reg)
+ .addReg(SrcReg, RegState::Undef);
+ break;
+ }
+
// On Power9, we can use SETB to extract the LT bit. This only works for
// the LT bit since SETB produces -1/1/0 for LT/GT/<neither>. So the value
// of the bit we care about (32-bit sign bit) will be set to the value of
@@ -929,54 +960,104 @@ void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II,
MBB.erase(II);
}
-void PPCRegisterInfo::lowerVRSAVESpilling(MachineBasicBlock::iterator II,
- unsigned FrameIndex) const {
- // Get the instruction.
- MachineInstr &MI = *II; // ; SPILL_VRSAVE <SrcReg>, <offset>
- // Get the instruction's basic block.
+void PPCRegisterInfo::emitAccCopyInfo(MachineBasicBlock &MBB,
+ MCRegister DestReg, MCRegister SrcReg) {
+#ifdef NDEBUG
+ return;
+#else
+ if (ReportAccMoves) {
+ std::string Dest = PPC::ACCRCRegClass.contains(DestReg) ? "acc" : "uacc";
+ std::string Src = PPC::ACCRCRegClass.contains(SrcReg) ? "acc" : "uacc";
+ dbgs() << "Emitting copy from " << Src << " to " << Dest << ":\n";
+ MBB.dump();
+ }
+#endif
+}
+
+static void emitAccSpillRestoreInfo(MachineBasicBlock &MBB, bool IsPrimed,
+ bool IsRestore) {
+#ifdef NDEBUG
+ return;
+#else
+ if (ReportAccMoves) {
+ dbgs() << "Emitting " << (IsPrimed ? "acc" : "uacc") << " register "
+ << (IsRestore ? "restore" : "spill") << ":\n";
+ MBB.dump();
+ }
+#endif
+}
+
+/// lowerACCSpilling - Generate the code for spilling the accumulator register.
+/// Similarly to other spills/reloads that use pseudo-ops, we do not actually
+/// eliminate the FrameIndex here nor compute the stack offset. We simply
+/// create a real instruction with an FI and rely on eliminateFrameIndex to
+/// handle the FI elimination.
+void PPCRegisterInfo::lowerACCSpilling(MachineBasicBlock::iterator II,
+ unsigned FrameIndex) const {
+ MachineInstr &MI = *II; // SPILL_ACC <SrcReg>, <offset>
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
- DebugLoc dl = MI.getDebugLoc();
-
- const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
- Register Reg = MF.getRegInfo().createVirtualRegister(GPRC);
+ DebugLoc DL = MI.getDebugLoc();
Register SrcReg = MI.getOperand(0).getReg();
-
- BuildMI(MBB, II, dl, TII.get(PPC::MFVRSAVEv), Reg)
- .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
-
- addFrameReference(
- BuildMI(MBB, II, dl, TII.get(PPC::STW)).addReg(Reg, RegState::Kill),
- FrameIndex);
+ bool IsKilled = MI.getOperand(0).isKill();
+
+ bool IsPrimed = PPC::ACCRCRegClass.contains(SrcReg);
+ Register Reg =
+ PPC::VSRp0 + (SrcReg - (IsPrimed ? PPC::ACC0 : PPC::UACC0)) * 2;
+ bool IsLittleEndian = Subtarget.isLittleEndian();
+
+ emitAccSpillRestoreInfo(MBB, IsPrimed, false);
+
+ // De-prime the register being spilled, create two stores for the pair
+ // subregisters accounting for endianness and then re-prime the register if
+ // it isn't killed. This uses the Offset parameter to addFrameReference() to
+ // adjust the offset of the store that is within the 64-byte stack slot.
+ if (IsPrimed)
+ BuildMI(MBB, II, DL, TII.get(PPC::XXMFACC), SrcReg).addReg(SrcReg);
+ addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+ .addReg(Reg, getKillRegState(IsKilled)),
+ FrameIndex, IsLittleEndian ? 32 : 0);
+ addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+ .addReg(Reg + 1, getKillRegState(IsKilled)),
+ FrameIndex, IsLittleEndian ? 0 : 32);
+ if (IsPrimed && !IsKilled)
+ BuildMI(MBB, II, DL, TII.get(PPC::XXMTACC), SrcReg).addReg(SrcReg);
// Discard the pseudo instruction.
MBB.erase(II);
}
-void PPCRegisterInfo::lowerVRSAVERestore(MachineBasicBlock::iterator II,
- unsigned FrameIndex) const {
- // Get the instruction.
- MachineInstr &MI = *II; // ; <DestReg> = RESTORE_VRSAVE <offset>
- // Get the instruction's basic block.
+/// lowerACCRestore - Generate the code to restore the accumulator register.
+void PPCRegisterInfo::lowerACCRestore(MachineBasicBlock::iterator II,
+ unsigned FrameIndex) const {
+ MachineInstr &MI = *II; // <DestReg> = RESTORE_ACC <offset>
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
- DebugLoc dl = MI.getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
- const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
- Register Reg = MF.getRegInfo().createVirtualRegister(GPRC);
Register DestReg = MI.getOperand(0).getReg();
assert(MI.definesRegister(DestReg) &&
- "RESTORE_VRSAVE does not define its destination");
+ "RESTORE_ACC does not define its destination");
- addFrameReference(BuildMI(MBB, II, dl, TII.get(PPC::LWZ),
- Reg), FrameIndex);
+ bool IsPrimed = PPC::ACCRCRegClass.contains(DestReg);
+ Register Reg =
+ PPC::VSRp0 + (DestReg - (IsPrimed ? PPC::ACC0 : PPC::UACC0)) * 2;
+ bool IsLittleEndian = Subtarget.isLittleEndian();
- BuildMI(MBB, II, dl, TII.get(PPC::MTVRSAVEv), DestReg)
- .addReg(Reg, RegState::Kill);
+ emitAccSpillRestoreInfo(MBB, IsPrimed, true);
+
+ // Create two loads for the pair subregisters accounting for endianness and
+ // then prime the accumulator register being restored.
+ addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LXVP), Reg),
+ FrameIndex, IsLittleEndian ? 32 : 0);
+ addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LXVP), Reg + 1),
+ FrameIndex, IsLittleEndian ? 0 : 32);
+ if (IsPrimed)
+ BuildMI(MBB, II, DL, TII.get(PPC::XXMTACC), DestReg).addReg(DestReg);
// Discard the pseudo instruction.
MBB.erase(II);
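// A small standalone sketch of the offset selection used by the ACC
// spill/restore lowering above: the 64-byte accumulator slot is split into
// two 32-byte VSR-pair halves, and which half lands at offset 0 versus
// offset 32 flips between little- and big-endian targets. The struct below
// is illustrative.
#include <cstdio>

struct Half { unsigned PairIndex; unsigned Offset; };

static void planAccSlot(bool IsLittleEndian, Half Out[2]) {
  Out[0] = {0u, IsLittleEndian ? 32u : 0u}; // first VSR pair of the acc
  Out[1] = {1u, IsLittleEndian ? 0u : 32u}; // second VSR pair of the acc
}

int main() {
  Half LE[2], BE[2];
  planAccSlot(true, LE);
  planAccSlot(false, BE);
  std::printf("LE: pair 0 at +%u, pair 1 at +%u\n", LE[0].Offset, LE[1].Offset);
  std::printf("BE: pair 0 at +%u, pair 1 at +%u\n", BE[0].Offset, BE[1].Offset);
}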
@@ -1113,11 +1194,11 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
} else if (OpC == PPC::RESTORE_CRBIT) {
lowerCRBitRestore(II, FrameIndex);
return;
- } else if (OpC == PPC::SPILL_VRSAVE) {
- lowerVRSAVESpilling(II, FrameIndex);
+ } else if (OpC == PPC::SPILL_ACC || OpC == PPC::SPILL_UACC) {
+ lowerACCSpilling(II, FrameIndex);
return;
- } else if (OpC == PPC::RESTORE_VRSAVE) {
- lowerVRSAVERestore(II, FrameIndex);
+ } else if (OpC == PPC::RESTORE_ACC || OpC == PPC::RESTORE_UACC) {
+ lowerACCRestore(II, FrameIndex);
return;
}
@@ -1294,10 +1375,9 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
/// Insert defining instruction(s) for BaseReg to
/// be a pointer to FrameIdx at the beginning of the basic block.
-void PPCRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
- Register BaseReg,
- int FrameIdx,
- int64_t Offset) const {
+Register PPCRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
+ int FrameIdx,
+ int64_t Offset) const {
unsigned ADDriOpc = TM.isPPC64() ? PPC::ADDI8 : PPC::ADDI;
MachineBasicBlock::iterator Ins = MBB->begin();
@@ -1310,10 +1390,14 @@ void PPCRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
const MCInstrDesc &MCID = TII.get(ADDriOpc);
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ const TargetRegisterClass *RC = getPointerRegClass(MF);
+ Register BaseReg = MRI.createVirtualRegister(RC);
MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF));
BuildMI(*MBB, Ins, DL, MCID, BaseReg)
.addFrameIndex(FrameIdx).addImm(Offset);
+
+ return BaseReg;
}
void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
index 61acd955e1cb..93f330ab56b6 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -119,10 +119,14 @@ public:
unsigned FrameIndex) const;
void lowerCRBitRestore(MachineBasicBlock::iterator II,
unsigned FrameIndex) const;
- void lowerVRSAVESpilling(MachineBasicBlock::iterator II,
- unsigned FrameIndex) const;
- void lowerVRSAVERestore(MachineBasicBlock::iterator II,
- unsigned FrameIndex) const;
+
+ void lowerACCSpilling(MachineBasicBlock::iterator II,
+ unsigned FrameIndex) const;
+ void lowerACCRestore(MachineBasicBlock::iterator II,
+ unsigned FrameIndex) const;
+
+ static void emitAccCopyInfo(MachineBasicBlock &MBB, MCRegister DestReg,
+ MCRegister SrcReg);
bool hasReservedSpillSlot(const MachineFunction &MF, Register Reg,
int &FrameIdx) const override;
@@ -132,9 +136,8 @@ public:
// Support for virtual base registers.
bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
- void materializeFrameBaseRegister(MachineBasicBlock *MBB, Register BaseReg,
- int FrameIdx,
- int64_t Offset) const override;
+ Register materializeFrameBaseRegister(MachineBasicBlock *MBB, int FrameIdx,
+ int64_t Offset) const override;
void resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const override;
bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg,
@@ -151,12 +154,18 @@ public:
/// register name so that only the number is left. Used for Linux asm.
static const char *stripRegisterPrefix(const char *RegName) {
switch (RegName[0]) {
+ case 'a':
+ if (RegName[1] == 'c' && RegName[2] == 'c')
+ return RegName + 3;
+ break;
case 'r':
case 'f':
- case 'q': // for QPX
case 'v':
- if (RegName[1] == 's')
+ if (RegName[1] == 's') {
+ if (RegName[2] == 'p')
+ return RegName + 3;
return RegName + 2;
+ }
return RegName + 1;
case 'c': if (RegName[1] == 'r') return RegName + 2;
}
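// A tiny standalone check of the prefix-stripping rules above; stripPrefix is
// a local illustrative copy of the switch, not the LLVM function itself.
#include <cassert>
#include <cstring>

static const char *stripPrefix(const char *Name) {
  switch (Name[0]) {
  case 'a':
    if (Name[1] == 'c' && Name[2] == 'c')
      return Name + 3;
    break;
  case 'r':
  case 'f':
  case 'v':
    if (Name[1] == 's')
      return Name + (Name[2] == 'p' ? 3 : 2); // "vsp4" -> "4", "vs31" -> "31"
    return Name + 1;                          // "r5" -> "5", "v7" -> "7"
  case 'c':
    if (Name[1] == 'r')
      return Name + 2;                        // "cr2" -> "2"
  }
  return Name;
}

int main() {
  assert(std::strcmp(stripPrefix("acc3"), "3") == 0);
  assert(std::strcmp(stripPrefix("vsp4"), "4") == 0);
  assert(std::strcmp(stripPrefix("vs31"), "31") == 0);
  assert(std::strcmp(stripPrefix("r5"), "5") == 0);
  assert(std::strcmp(stripPrefix("cr2"), "2") == 0);
  return 0;
}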
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
index b45757c1acc5..e03617aa75ff 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -16,6 +16,10 @@ def sub_eq : SubRegIndex<1, 2>;
def sub_un : SubRegIndex<1, 3>;
def sub_32 : SubRegIndex<32>;
def sub_64 : SubRegIndex<64>;
+def sub_vsx0 : SubRegIndex<128>;
+def sub_vsx1 : SubRegIndex<128, 128>;
+def sub_pair0 : SubRegIndex<256>;
+def sub_pair1 : SubRegIndex<256, 256>;
}
@@ -54,13 +58,6 @@ class FPR<bits<5> num, string n> : PPCReg<n> {
let HWEncoding{4-0} = num;
}
-// QFPR - One of the 32 256-bit floating-point vector registers (used for QPX)
-class QFPR<FPR SubReg, string n> : PPCReg<n> {
- let HWEncoding = SubReg.HWEncoding;
- let SubRegs = [SubReg];
- let SubRegIndices = [sub_64];
-}
-
// VF - One of the 32 64-bit floating-point subregisters of the vector
// registers (used by VSX).
class VF<bits<5> num, string n> : PPCReg<n> {
@@ -101,6 +98,27 @@ class CRBIT<bits<5> num, string n> : PPCReg<n> {
let HWEncoding{4-0} = num;
}
+// ACC - One of the 8 512-bit VSX accumulators.
+class ACC<bits<3> num, string n, list<Register> subregs> : PPCReg<n> {
+ let HWEncoding{2-0} = num;
+ let SubRegs = subregs;
+}
+
+// UACC - One of the 8 512-bit VSX accumulators prior to being primed.
+// Without using this register class, the register allocator has no way to
+// differentiate a primed accumulator from an unprimed accumulator.
+// This may result in invalid copies between primed and unprimed accumulators.
+class UACC<bits<3> num, string n, list<Register> subregs> : PPCReg<n> {
+ let HWEncoding{2-0} = num;
+ let SubRegs = subregs;
+}
+
+// VSR Pairs - One of the 32 paired even-odd consecutive VSRs.
+class VSRPair<bits<5> num, string n, list<Register> subregs> : PPCReg<n> {
+ let HWEncoding{4-0} = num;
+ let SubRegs = subregs;
+}
+
// General-purpose registers
foreach Index = 0-31 in {
def R#Index : GPR<Index, "r"#Index>, DwarfRegNum<[-2, Index]>;
@@ -132,12 +150,6 @@ foreach Index = 0-31 in {
DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>;
}
-// QPX Floating-point registers
-foreach Index = 0-31 in {
- def QF#Index : QFPR<!cast<FPR>("F"#Index), "q"#Index>,
- DwarfRegNum<[!add(Index, 32), !add(Index, 32)]>;
-}
-
// Vector registers
foreach Index = 0-31 in {
def V#Index : VR<!cast<VF>("VF"#Index), "v"#Index>,
@@ -156,6 +168,23 @@ foreach Index = 32-63 in {
def VSX#Index : VSXReg<Index, "vs"#Index>;
}
+let SubRegIndices = [sub_vsx0, sub_vsx1] in {
+ // VSR pairs 0 - 15 (corresponding to VSRs 0 - 30 paired with 1 - 31).
+ foreach Index = { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 } in {
+ def VSRp#!srl(Index, 1) : VSRPair<!srl(Index, 1), "vsp"#Index,
+ [!cast<VSRL>("VSL"#Index), !cast<VSRL>("VSL"#!add(Index, 1))]>,
+ DwarfRegNum<[0, 0]>;
+ }
+
+ // VSR pairs 16 - 31 (corresponding to VSRs 32 - 62 paired with 33 - 63).
+ foreach Index = { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 } in {
+ def VSRp#!add(!srl(Index, 1), 16) :
+ VSRPair<!add(!srl(Index, 1), 16), "vsp"#!add(Index, 32),
+ [!cast<VR>("V"#Index), !cast<VR>("V"#!add(Index, 1))]>,
+ DwarfRegNum<[0, 0]>;
+ }
+}
+
// The representation of r0 when treated as the constant 0.
def ZERO : GPR<0, "0">, DwarfRegAlias<R0>;
def ZERO8 : GP8<ZERO, "0">, DwarfRegAlias<X0>;
@@ -343,16 +372,6 @@ def SPILLTOVSRRC : RegisterClass<"PPC", [i64, f64], 64, (add G8RC, (sub VSFRC,
// Register class for single precision scalars in VSX registers
def VSSRC : RegisterClass<"PPC", [f32], 32, (add VSFRC)>;
-// For QPX
-def QFRC : RegisterClass<"PPC", [v4f64], 256, (add (sequence "QF%u", 0, 13),
- (sequence "QF%u", 31, 14))>;
-def QSRC : RegisterClass<"PPC", [v4f32], 128, (add QFRC)>;
-def QBRC : RegisterClass<"PPC", [v4i1], 256, (add QFRC)> {
- // These are actually stored as floating-point values where a positive
- // number is true and anything else (including NaN) is false.
- let Size = 256;
-}
-
def CRBITRC : RegisterClass<"PPC", [i1], 32,
(add CR2LT, CR2GT, CR2EQ, CR2UN,
CR3LT, CR3GT, CR3EQ, CR3UN,
@@ -395,3 +414,44 @@ def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY, XER)> {
let CopyCost = -1;
}
+let SubRegIndices = [sub_pair0, sub_pair1] in {
+ def ACC0 : ACC<0, "acc0", [VSRp0, VSRp1]>, DwarfRegNum<[0, 0]>;
+ def ACC1 : ACC<1, "acc1", [VSRp2, VSRp3]>, DwarfRegNum<[0, 0]>;
+ def ACC2 : ACC<2, "acc2", [VSRp4, VSRp5]>, DwarfRegNum<[0, 0]>;
+ def ACC3 : ACC<3, "acc3", [VSRp6, VSRp7]>, DwarfRegNum<[0, 0]>;
+ def ACC4 : ACC<4, "acc4", [VSRp8, VSRp9]>, DwarfRegNum<[0, 0]>;
+ def ACC5 : ACC<5, "acc5", [VSRp10, VSRp11]>, DwarfRegNum<[0, 0]>;
+ def ACC6 : ACC<6, "acc6", [VSRp12, VSRp13]>, DwarfRegNum<[0, 0]>;
+ def ACC7 : ACC<7, "acc7", [VSRp14, VSRp15]>, DwarfRegNum<[0, 0]>;
+}
+def ACCRC : RegisterClass<"PPC", [v512i1], 128, (add ACC0, ACC1, ACC2, ACC3,
+ ACC4, ACC5, ACC6, ACC7)> {
+ let Size = 512;
+}
+
+let SubRegIndices = [sub_pair0, sub_pair1] in {
+ def UACC0 : UACC<0, "acc0", [VSRp0, VSRp1]>, DwarfRegNum<[0, 0]>;
+ def UACC1 : UACC<1, "acc1", [VSRp2, VSRp3]>, DwarfRegNum<[0, 0]>;
+ def UACC2 : UACC<2, "acc2", [VSRp4, VSRp5]>, DwarfRegNum<[0, 0]>;
+ def UACC3 : UACC<3, "acc3", [VSRp6, VSRp7]>, DwarfRegNum<[0, 0]>;
+ def UACC4 : UACC<4, "acc4", [VSRp8, VSRp9]>, DwarfRegNum<[0, 0]>;
+ def UACC5 : UACC<5, "acc5", [VSRp10, VSRp11]>, DwarfRegNum<[0, 0]>;
+ def UACC6 : UACC<6, "acc6", [VSRp12, VSRp13]>, DwarfRegNum<[0, 0]>;
+ def UACC7 : UACC<7, "acc7", [VSRp14, VSRp15]>, DwarfRegNum<[0, 0]>;
+}
+def UACCRC : RegisterClass<"PPC", [v512i1], 128,
+ (add UACC0, UACC1, UACC2, UACC3,
+ UACC4, UACC5, UACC6, UACC7)> {
+ let Size = 512;
+}
+
+// Allocate in the same order as the underlying VSX registers.
+def VSRpRC :
+ RegisterClass<"PPC", [v256i1], 128,
+ (add (sequence "VSRp%u", 0, 6),
+ (sequence "VSRp%u", 15, 7), VSRp17, VSRp18,
+ VSRp16, VSRp19, VSRp20, VSRp21, VSRp22, VSRp23,
+ VSRp24, VSRp25, VSRp31, VSRp30, VSRp29, VSRp28,
+ VSRp27, VSRp26)> {
+ let Size = 256;
+}
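// A small standalone sketch of how the register definitions above nest:
// accumulator ACCn overlaps VSR pairs VSRp(2n) and VSRp(2n+1), and each VSR
// pair VSRpk overlaps two consecutive VSX registers 2k and 2k+1; the loop
// below just prints that mapping.
#include <cstdio>

int main() {
  for (unsigned Acc = 0; Acc < 8; ++Acc) {
    unsigned Pair0 = 2 * Acc, Pair1 = 2 * Acc + 1;
    std::printf("ACC%u -> VSRp%u + VSRp%u -> VSX %u..%u\n", Acc, Pair0, Pair1,
                2 * Pair0, 2 * Pair1 + 1);
  }
}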
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCScheduleP9.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCScheduleP9.td
index 0a1ae7e55b3c..571cc219ff2b 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCScheduleP9.td
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCScheduleP9.td
@@ -40,12 +40,11 @@ def P9Model : SchedMachineModel {
let CompleteModel = 1;
- // Do not support QPX (Quad Processing eXtension), SPE (Signal Processing
- // Engine), prefixed instructions on Power 9, PC relative mem ops, or
- // instructions introduced in ISA 3.1.
- let UnsupportedFeatures = [HasQPX, HasSPE, PrefixInstrs, PCRelativeMemops,
- IsISA3_1];
-
+ // Do not support SPE (Signal Processing Engine), prefixed instructions on
+ // Power 9, paired vector mem ops, MMA, PC relative mem ops, or instructions
+ // introduced in ISA 3.1.
+ let UnsupportedFeatures = [HasSPE, PrefixInstrs, PairedVectorMemops, MMA,
+ PCRelativeMemops, IsISA3_1];
}
let SchedModel = P9Model in {
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index 3836cc960394..d31195f67ef1 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -11,9 +11,13 @@
//===----------------------------------------------------------------------===//
#include "PPCSubtarget.h"
+#include "GISel/PPCCallLowering.h"
+#include "GISel/PPCLegalizerInfo.h"
+#include "GISel/PPCRegisterBankInfo.h"
#include "PPC.h"
#include "PPCRegisterInfo.h"
#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/Attributes.h"
@@ -35,10 +39,6 @@ using namespace llvm;
static cl::opt<bool> UseSubRegLiveness("ppc-track-subreg-liveness",
cl::desc("Enable subregister liveness tracking for PPC"), cl::Hidden);
-static cl::opt<bool> QPXStackUnaligned("qpx-stack-unaligned",
- cl::desc("Even when QPX is enabled the stack is not 32-byte aligned"),
- cl::Hidden);
-
static cl::opt<bool>
EnableMachinePipeliner("ppc-enable-pipeliner",
cl::desc("Enable Machine Pipeliner for PPC"),
@@ -53,11 +53,19 @@ PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU,
PPCSubtarget::PPCSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const PPCTargetMachine &TM)
- : PPCGenSubtargetInfo(TT, CPU, FS), TargetTriple(TT),
+ : PPCGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), TargetTriple(TT),
IsPPC64(TargetTriple.getArch() == Triple::ppc64 ||
TargetTriple.getArch() == Triple::ppc64le),
TM(TM), FrameLowering(initializeSubtargetDependencies(CPU, FS)),
- InstrInfo(*this), TLInfo(TM, *this) {}
+ InstrInfo(*this), TLInfo(TM, *this) {
+ CallLoweringInfo.reset(new PPCCallLowering(*getTargetLowering()));
+ Legalizer.reset(new PPCLegalizerInfo(*this));
+ auto *RBI = new PPCRegisterBankInfo(*getRegisterInfo());
+ RegBankInfo.reset(RBI);
+
+ InstSelector.reset(createPPCInstructionSelector(
+ *static_cast<const PPCTargetMachine *>(&TM), *this, *RBI));
+}
void PPCSubtarget::initializeEnvironment() {
StackAlignment = Align(16);
@@ -69,8 +77,8 @@ void PPCSubtarget::initializeEnvironment() {
HasHardFloat = false;
HasAltivec = false;
HasSPE = false;
+ HasEFPU2 = false;
HasFPU = false;
- HasQPX = false;
HasVSX = false;
NeedsTwoConstNR = false;
HasP8Vector = false;
@@ -78,6 +86,7 @@ void PPCSubtarget::initializeEnvironment() {
HasP8Crypto = false;
HasP9Vector = false;
HasP9Altivec = false;
+ HasMMA = false;
HasP10Vector = false;
HasPrefixInstrs = false;
HasPCRelativeMemops = false;
@@ -109,10 +118,10 @@ void PPCSubtarget::initializeEnvironment() {
HasInvariantFunctionDescriptors = false;
HasPartwordAtomics = false;
HasDirectMove = false;
- IsQPXStackUnaligned = false;
HasHTM = false;
HasFloat128 = false;
HasFusion = false;
+ HasStoreFusion = false;
HasAddiLoadFusion = false;
HasAddisLoadFusion = false;
IsISA3_0 = false;
@@ -122,7 +131,10 @@ void PPCSubtarget::initializeEnvironment() {
VectorsUseTwoUnits = false;
UsePPCPreRASchedStrategy = false;
UsePPCPostRASchedStrategy = false;
+ PairedVectorMemops = false;
PredictableSelectIsExpensive = false;
+ HasModernAIXAs = false;
+ IsAIX = false;
HasPOPCNTD = POPCNTD_Unavailable;
}
@@ -144,7 +156,7 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
InstrItins = getInstrItineraryForCPU(CPUName);
// Parse features string.
- ParseSubtargetFeatures(CPUName, FS);
+ ParseSubtargetFeatures(CPUName, /*TuneCPU*/ CPUName, FS);
// If the user requested use of 64-bit regs, but the cpu selected doesn't
// support it, ignore.
@@ -158,7 +170,7 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
if (HasSPE && IsPPC64)
report_fatal_error( "SPE is only supported for 32-bit targets.\n", false);
- if (HasSPE && (HasAltivec || HasQPX || HasVSX || HasFPU))
+ if (HasSPE && (HasAltivec || HasVSX || HasFPU))
report_fatal_error(
"SPE and traditional floating point cannot both be enabled.\n", false);
@@ -166,15 +178,12 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
if (!HasSPE)
HasFPU = true;
- // QPX requires a 32-byte aligned stack. Note that we need to do this if
- // we're compiling for a BG/Q system regardless of whether or not QPX
- // is enabled because external functions will assume this alignment.
- IsQPXStackUnaligned = QPXStackUnaligned;
StackAlignment = getPlatformStackAlignment();
// Determine endianness.
// FIXME: Part of the TargetMachine.
- IsLittleEndian = (TargetTriple.getArch() == Triple::ppc64le);
+ IsLittleEndian = (TargetTriple.getArch() == Triple::ppc64le ||
+ TargetTriple.getArch() == Triple::ppcle);
}
bool PPCSubtarget::enableMachineScheduler() const { return true; }
@@ -235,3 +244,20 @@ bool PPCSubtarget::isUsingPCRelativeCalls() const {
return isPPC64() && hasPCRelativeMemops() && isELFv2ABI() &&
CodeModel::Medium == getTargetMachine().getCodeModel();
}
+
+// GlobalISEL
+const CallLowering *PPCSubtarget::getCallLowering() const {
+ return CallLoweringInfo.get();
+}
+
+const RegisterBankInfo *PPCSubtarget::getRegBankInfo() const {
+ return RegBankInfo.get();
+}
+
+const LegalizerInfo *PPCSubtarget::getLegalizerInfo() const {
+ return Legalizer.get();
+}
+
+InstructionSelector *PPCSubtarget::getInstructionSelector() const {
+ return InstSelector.get();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.h
index ec329022c457..50d89390d5bc 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -17,6 +17,9 @@
#include "PPCISelLowering.h"
#include "PPCInstrInfo.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
@@ -97,7 +100,7 @@ protected:
bool HasAltivec;
bool HasFPU;
bool HasSPE;
- bool HasQPX;
+ bool HasEFPU2;
bool HasVSX;
bool NeedsTwoConstNR;
bool HasP8Vector;
@@ -108,6 +111,7 @@ protected:
bool HasP10Vector;
bool HasPrefixInstrs;
bool HasPCRelativeMemops;
+ bool HasMMA;
bool HasFCPSGN;
bool HasFSQRT;
bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES;
@@ -137,6 +141,7 @@ protected:
bool HasHTM;
bool HasFloat128;
bool HasFusion;
+ bool HasStoreFusion;
bool HasAddiLoadFusion;
bool HasAddisLoadFusion;
bool IsISA3_0;
@@ -146,21 +151,25 @@ protected:
bool VectorsUseTwoUnits;
bool UsePPCPreRASchedStrategy;
bool UsePPCPostRASchedStrategy;
+ bool PairedVectorMemops;
bool PredictableSelectIsExpensive;
+ bool HasModernAIXAs;
+ bool IsAIX;
POPCNTDKind HasPOPCNTD;
- /// When targeting QPX running a stock PPC64 Linux kernel where the stack
- /// alignment has not been changed, we need to keep the 16-byte alignment
- /// of the stack.
- bool IsQPXStackUnaligned;
-
const PPCTargetMachine &TM;
PPCFrameLowering FrameLowering;
PPCInstrInfo InstrInfo;
PPCTargetLowering TLInfo;
SelectionDAGTargetInfo TSInfo;
+ /// GlobalISel related APIs.
+ std::unique_ptr<CallLowering> CallLoweringInfo;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+
public:
/// This constructor initializes the data members to match that
/// of the specified triple.
@@ -170,16 +179,13 @@ public:
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
/// getStackAlignment - Returns the minimum alignment known to hold of the
/// stack frame on entry to the function and which must be maintained by every
/// function for this subtarget.
Align getStackAlignment() const { return StackAlignment; }
- /// getDarwinDirective - Returns the -m directive specified for the cpu.
- unsigned getDarwinDirective() const { return CPUDirective; }
-
/// getCPUDirective - Returns the -m directive specified for the cpu.
///
unsigned getCPUDirective() const { return CPUDirective; }
@@ -254,8 +260,8 @@ public:
bool hasFPCVT() const { return HasFPCVT; }
bool hasAltivec() const { return HasAltivec; }
bool hasSPE() const { return HasSPE; }
+ bool hasEFPU2() const { return HasEFPU2; }
bool hasFPU() const { return HasFPU; }
- bool hasQPX() const { return HasQPX; }
bool hasVSX() const { return HasVSX; }
bool needsTwoConstNR() const { return NeedsTwoConstNR; }
bool hasP8Vector() const { return HasP8Vector; }
@@ -266,6 +272,8 @@ public:
bool hasP10Vector() const { return HasP10Vector; }
bool hasPrefixInstrs() const { return HasPrefixInstrs; }
bool hasPCRelativeMemops() const { return HasPCRelativeMemops; }
+ bool hasMMA() const { return HasMMA; }
+ bool pairedVectorMemops() const { return PairedVectorMemops; }
bool hasMFOCRF() const { return HasMFOCRF; }
bool hasISEL() const { return HasISEL; }
bool hasBPERMD() const { return HasBPERMD; }
@@ -291,11 +299,7 @@ public:
bool hasPartwordAtomics() const { return HasPartwordAtomics; }
bool hasDirectMove() const { return HasDirectMove; }
- bool isQPXStackUnaligned() const { return IsQPXStackUnaligned; }
Align getPlatformStackAlignment() const {
- if ((hasQPX() || isBGQ()) && !isQPXStackUnaligned())
- return Align(32);
-
return Align(16);
}
@@ -315,6 +319,7 @@ public:
bool isISA3_1() const { return IsISA3_1; }
bool useLongCalls() const { return UseLongCalls; }
bool hasFusion() const { return HasFusion; }
+ bool hasStoreFusion() const { return HasStoreFusion; }
bool hasAddiLoadFusion() const { return HasAddiLoadFusion; }
bool hasAddisLoadFusion() const { return HasAddisLoadFusion; }
bool needsSwapsForVSXMemOps() const {
@@ -325,9 +330,6 @@ public:
const Triple &getTargetTriple() const { return TargetTriple; }
- /// isBGQ - True if this is a BG/Q platform.
- bool isBGQ() const { return TargetTriple.getVendor() == Triple::BGQ; }
-
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
@@ -404,6 +406,12 @@ public:
bool isPredictableSelectIsExpensive() const {
return PredictableSelectIsExpensive;
}
+
+ // GlobalISEL
+ const CallLowering *getCallLowering() const override;
+ const RegisterBankInfo *getRegBankInfo() const override;
+ const LegalizerInfo *getLegalizerInfo() const override;
+ InstructionSelector *getInstructionSelector() const override;
};
} // End llvm namespace
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
index 4b809e0c8553..43dcc5844c4e 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -50,16 +50,17 @@ protected:
bool Changed = false;
bool NeedFence = true;
bool Is64Bit = MBB.getParent()->getSubtarget<PPCSubtarget>().isPPC64();
+ bool IsPCREL = false;
for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
I != IE;) {
MachineInstr &MI = *I;
+ IsPCREL = isPCREL(MI);
if (MI.getOpcode() != PPC::ADDItlsgdLADDR &&
MI.getOpcode() != PPC::ADDItlsldLADDR &&
MI.getOpcode() != PPC::ADDItlsgdLADDR32 &&
- MI.getOpcode() != PPC::ADDItlsldLADDR32) {
-
+ MI.getOpcode() != PPC::ADDItlsldLADDR32 && !IsPCREL) {
// Although we create ADJCALLSTACKDOWN and ADJCALLSTACKUP
// as scheduling fences, we skip creating fences if we already
// have existing ADJCALLSTACKDOWN/UP to avoid nesting,
@@ -76,12 +77,16 @@ protected:
LLVM_DEBUG(dbgs() << "TLS Dynamic Call Fixup:\n " << MI);
Register OutReg = MI.getOperand(0).getReg();
- Register InReg = MI.getOperand(1).getReg();
- DebugLoc DL = MI.getDebugLoc();
+ Register InReg = PPC::NoRegister;
Register GPR3 = Is64Bit ? PPC::X3 : PPC::R3;
- unsigned Opc1, Opc2;
- const Register OrigRegs[] = {OutReg, InReg, GPR3};
+ SmallVector<Register, 3> OrigRegs = {OutReg, GPR3};
+ if (!IsPCREL) {
+ InReg = MI.getOperand(1).getReg();
+ OrigRegs.push_back(InReg);
+ }
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned Opc1, Opc2;
switch (MI.getOpcode()) {
default:
llvm_unreachable("Opcode inconsistency error");
@@ -101,6 +106,13 @@ protected:
Opc1 = PPC::ADDItlsldL32;
Opc2 = PPC::GETtlsldADDR32;
break;
+ case PPC::PADDI8pc:
+ assert(IsPCREL && "Expecting General/Local Dynamic PCRel");
+ Opc1 = PPC::PADDI8pc;
+ Opc2 = MI.getOperand(2).getTargetFlags() ==
+ PPCII::MO_GOT_TLSGD_PCREL_FLAG
+ ? PPC::GETtlsADDRPCREL
+ : PPC::GETtlsldADDRPCREL;
}
// We create ADJCALLSTACKUP and ADJCALLSTACKDOWN around _tls_get_addr
@@ -113,9 +125,15 @@ protected:
BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0)
.addImm(0);
- // Expand into two ops built prior to the existing instruction.
- MachineInstr *Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3)
- .addReg(InReg);
+ MachineInstr *Addi;
+ if (IsPCREL) {
+ Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3).addImm(0);
+ } else {
+ // Expand into two ops built prior to the existing instruction.
+ assert(InReg != PPC::NoRegister && "Operand must be a register");
+ Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3).addReg(InReg);
+ }
+
Addi->addOperand(MI.getOperand(2));
// The ADDItls* instruction is the first instruction in the
@@ -125,7 +143,10 @@ protected:
MachineInstr *Call = (BuildMI(MBB, I, DL, TII->get(Opc2), GPR3)
.addReg(GPR3));
- Call->addOperand(MI.getOperand(3));
+ if (IsPCREL)
+ Call->addOperand(MI.getOperand(2));
+ else
+ Call->addOperand(MI.getOperand(3));
if (NeedFence)
BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKUP)).addImm(0).addImm(0);
@@ -150,6 +171,14 @@ protected:
}
public:
+ bool isPCREL(const MachineInstr &MI) {
+ return (MI.getOpcode() == PPC::PADDI8pc) &&
+ (MI.getOperand(2).getTargetFlags() ==
+ PPCII::MO_GOT_TLSGD_PCREL_FLAG ||
+ MI.getOperand(2).getTargetFlags() ==
+ PPCII::MO_GOT_TLSLD_PCREL_FLAG);
+ }
+
bool runOnMachineFunction(MachineFunction &MF) override {
TII = MF.getSubtarget<PPCSubtarget>().getInstrInfo();
LIS = &getAnalysis<LiveIntervals>();
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index f15f9c7f4942..0634833e64dc 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -24,12 +24,18 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/Localizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
@@ -64,10 +70,6 @@ opt<bool> DisableVSXSwapRemoval("disable-ppc-vsx-swap-removal", cl::Hidden,
cl::desc("Disable VSX Swap Removal for PPC"));
static cl::
-opt<bool> DisableQPXLoadSplat("disable-ppc-qpx-load-splat", cl::Hidden,
- cl::desc("Disable QPX load splat simplification"));
-
-static cl::
opt<bool> DisableMIPeephole("disable-ppc-peephole", cl::Hidden,
cl::desc("Disable machine peepholes for PPC"));
@@ -98,8 +100,9 @@ static cl::opt<bool>
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTarget() {
// Register the targets
RegisterTargetMachine<PPCTargetMachine> A(getThePPC32Target());
- RegisterTargetMachine<PPCTargetMachine> B(getThePPC64Target());
- RegisterTargetMachine<PPCTargetMachine> C(getThePPC64LETarget());
+ RegisterTargetMachine<PPCTargetMachine> B(getThePPC32LETarget());
+ RegisterTargetMachine<PPCTargetMachine> C(getThePPC64Target());
+ RegisterTargetMachine<PPCTargetMachine> D(getThePPC64LETarget());
PassRegistry &PR = *PassRegistry::getPassRegistry();
#ifndef NDEBUG
@@ -114,13 +117,13 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTarget() {
initializePPCReduceCRLogicalsPass(PR);
initializePPCBSelPass(PR);
initializePPCBranchCoalescingPass(PR);
- initializePPCQPXLoadSplatPass(PR);
initializePPCBoolRetToIntPass(PR);
initializePPCExpandISELPass(PR);
initializePPCPreEmitPeepholePass(PR);
initializePPCTLSDynamicCallPass(PR);
initializePPCMIPeepholePass(PR);
initializePPCLowerMASSVEntriesPass(PR);
+ initializeGlobalISel(PR);
}
/// Return the datalayout string of a subtarget.
@@ -128,8 +131,8 @@ static std::string getDataLayoutString(const Triple &T) {
bool is64Bit = T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le;
std::string Ret;
- // Most PPC* platforms are big endian, PPC64LE is little endian.
- if (T.getArch() == Triple::ppc64le)
+  // Most PPC* platforms are big endian; PPC(64)LE is little endian.
+ if (T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppcle)
Ret = "e";
else
Ret = "E";
@@ -143,10 +146,7 @@ static std::string getDataLayoutString(const Triple &T) {
// Note, the alignment values for f64 and i64 on ppc64 in Darwin
// documentation are wrong; these are correct (i.e. "what gcc does").
- if (is64Bit || !T.isOSDarwin())
- Ret += "-i64:64";
- else
- Ret += "-f64:32:64";
+ Ret += "-i64:64";
// PPC64 has 32 and 64 bit registers, PPC32 has only 32 bit ones.
if (is64Bit)
@@ -154,6 +154,13 @@ static std::string getDataLayoutString(const Triple &T) {
else
Ret += "-n32";
+ // Specify the vector alignment explicitly. For v256i1 and v512i1, the
+ // calculated alignment would be 256*alignment(i1) and 512*alignment(i1),
+ // which is 256 and 512 bytes - way over aligned.
+ if ((T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppc64) &&
+ (T.isOSAIX() || T.isOSLinux()))
+ Ret += "-v256:256:256-v512:512:512";
+
return Ret;
}
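
The hunk above settles the endianness marker, the i64 alignment, the native integer widths, and the new v256/v512 alignment caps of the PPC data layout string. Below is a minimal standalone sketch of just that logic, not the LLVM function itself; the "-n32:64" component for 64-bit targets is an assumption taken from context outside this hunk, and the remaining pieces of the real string are omitted.

#include <iostream>
#include <string>

// Sketch only: mirrors the endianness / i64 / native-width / vector-alignment
// decisions shown in the hunk above.
static std::string sketchPPCDataLayout(bool Is64Bit, bool IsLittleEndian,
                                       bool IsLinuxOrAIX) {
  std::string Ret = IsLittleEndian ? "e" : "E";
  Ret += "-i64:64";                     // i64 is always 64-bit aligned now
  Ret += Is64Bit ? "-n32:64" : "-n32";  // native widths (64-bit case assumed)
  if (Is64Bit && IsLinuxOrAIX)          // cap v256i1/v512i1 over-alignment
    Ret += "-v256:256:256-v512:512:512";
  return Ret;
}

int main() {
  // e.g. a little-endian 64-bit Linux target:
  std::cout << sketchPPCDataLayout(true, true, true) << '\n';
}
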
@@ -183,13 +190,17 @@ static std::string computeFSAdditions(StringRef FS, CodeGenOpt::Level OL,
FullFS = "+invariant-function-descriptors";
}
+ if (TT.isOSAIX()) {
+ if (!FullFS.empty())
+ FullFS = "+aix," + FullFS;
+ else
+ FullFS = "+aix";
+ }
+
return FullFS;
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
- if (TT.isOSDarwin())
- return std::make_unique<TargetLoweringObjectFileMachO>();
-
if (TT.isOSAIX())
return std::make_unique<TargetLoweringObjectFileXCOFF>();
@@ -198,9 +209,6 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT,
const TargetOptions &Options) {
- if (TT.isOSDarwin())
- report_fatal_error("Darwin is no longer supported for PowerPC");
-
if (Options.MCOptions.getABIName().startswith("elfv1"))
return PPCTargetMachine::PPC_ABI_ELFv1;
else if (Options.MCOptions.getABIName().startswith("elfv2"))
@@ -230,10 +238,6 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT,
if (RM.hasValue())
return *RM;
- // Darwin defaults to dynamic-no-pic.
- if (TT.isOSDarwin())
- return Reloc::DynamicNoPIC;
-
// Big Endian PPC and AIX default to PIC.
if (TT.getArch() == Triple::ppc64 || TT.isOSAIX())
return Reloc::PIC_;
@@ -276,6 +280,8 @@ static ScheduleDAGInstrs *createPPCMachineScheduler(MachineSchedContext *C) {
std::make_unique<GenericScheduler>(C));
// add DAG Mutations here.
DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI));
+ if (ST.hasStoreFusion())
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
if (ST.hasFusion())
DAG->addMutation(createPowerPCMacroFusionDAGMutation());
@@ -290,6 +296,8 @@ static ScheduleDAGInstrs *createPPCPostMachineScheduler(
std::make_unique<PPCPostRASchedStrategy>(C) :
std::make_unique<PostGenericScheduler>(C), true);
// add DAG Mutations here.
+ if (ST.hasStoreFusion())
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
if (ST.hasFusion())
DAG->addMutation(createPowerPCMacroFusionDAGMutation());
return DAG;
@@ -321,12 +329,10 @@ PPCTargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
Attribute FSAttr = F.getFnAttribute("target-features");
- std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
- ? CPUAttr.getValueAsString().str()
- : TargetCPU;
- std::string FS = !FSAttr.hasAttribute(Attribute::None)
- ? FSAttr.getValueAsString().str()
- : TargetFS;
+ std::string CPU =
+ CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU;
+ std::string FS =
+ FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS;
// FIXME: This is related to the code below to reset the target options,
// we need to know whether or not the soft float flag is set on the
@@ -388,6 +394,12 @@ public:
void addPreRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
+ // GlobalISEL
+ bool addIRTranslator() override;
+ bool addLegalizeMachineIR() override;
+ bool addRegBankSelect() override;
+ bool addGlobalInstructionSelect() override;
+
ScheduleDAGInstrs *
createMachineScheduler(MachineSchedContext *C) const override {
return createPPCMachineScheduler(C);
@@ -411,14 +423,9 @@ void PPCPassConfig::addIRPasses() {
// Lower generic MASSV routines to PowerPC subtarget-specific entries.
addPass(createPPCLowerMASSVEntriesPass());
-
- // For the BG/Q (or if explicitly requested), add explicit data prefetch
- // intrinsics.
- bool UsePrefetching = TM->getTargetTriple().getVendor() == Triple::BGQ &&
- getOptLevel() != CodeGenOpt::None;
+
+ // If explicitly requested, add explicit data prefetch intrinsics.
if (EnablePrefetch.getNumOccurrences() > 0)
- UsePrefetching = EnablePrefetch;
- if (UsePrefetching)
addPass(createLoopDataPrefetchPass());
if (TM->getOptLevel() >= CodeGenOpt::Default && EnableGEPOpt) {
@@ -515,15 +522,8 @@ void PPCPassConfig::addPreRegAlloc() {
}
void PPCPassConfig::addPreSched2() {
- if (getOptLevel() != CodeGenOpt::None) {
+ if (getOptLevel() != CodeGenOpt::None)
addPass(&IfConverterID);
-
- // This optimization must happen after anything that might do store-to-load
- // forwarding. Here we're after RA (and, thus, when spills are inserted)
- // but before post-RA scheduling.
- if (!DisableQPXLoadSplat)
- addPass(createPPCQPXLoadSplatPass());
- }
}
void PPCPassConfig::addPreEmitPass() {
@@ -550,3 +550,24 @@ static MachineSchedRegistry
PPCPostRASchedRegistry("ppc-postra",
"Run PowerPC PostRA specific scheduler",
createPPCPostMachineScheduler);
+
+// Global ISEL
+bool PPCPassConfig::addIRTranslator() {
+ addPass(new IRTranslator());
+ return false;
+}
+
+bool PPCPassConfig::addLegalizeMachineIR() {
+ addPass(new Legalizer());
+ return false;
+}
+
+bool PPCPassConfig::addRegBankSelect() {
+ addPass(new RegBankSelect());
+ return false;
+}
+
+bool PPCPassConfig::addGlobalInstructionSelect() {
+ addPass(new InstructionSelect());
+ return false;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetMachine.h
index fd1d14ae32d4..21faa4e710e3 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetMachine.h
@@ -58,6 +58,11 @@ public:
const Triple &TT = getTargetTriple();
return (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le);
};
+
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
+ // Addrspacecasts are always noops.
+ return true;
+ }
};
} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index dc10dd80c8fa..b3d8100fe016 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -8,13 +8,19 @@
#include "PPCTargetTransformInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
+#include "llvm/Transforms/Utils/Local.h"
+
using namespace llvm;
#define DEBUG_TYPE "ppctti"
@@ -22,8 +28,7 @@ using namespace llvm;
static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);
-// This is currently only used for the data prefetch pass which is only enabled
-// for BG/Q by default.
+// This is currently only used for the data prefetch pass.
static cl::opt<unsigned>
CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64),
cl::desc("The loop prefetch cache line size"));
@@ -59,6 +64,109 @@ PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
return TTI::PSK_Software;
}
+Optional<Instruction *>
+PPCTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
+ Intrinsic::ID IID = II.getIntrinsicID();
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::ppc_altivec_lvx:
+ case Intrinsic::ppc_altivec_lvxl:
+ // Turn PPC lvx -> load if the pointer is known aligned.
+ if (getOrEnforceKnownAlignment(
+ II.getArgOperand(0), Align(16), IC.getDataLayout(), &II,
+ &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
+ Value *Ptr = IC.Builder.CreateBitCast(
+ II.getArgOperand(0), PointerType::getUnqual(II.getType()));
+ return new LoadInst(II.getType(), Ptr, "", false, Align(16));
+ }
+ break;
+ case Intrinsic::ppc_vsx_lxvw4x:
+ case Intrinsic::ppc_vsx_lxvd2x: {
+ // Turn PPC VSX loads into normal loads.
+ Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(0),
+ PointerType::getUnqual(II.getType()));
+ return new LoadInst(II.getType(), Ptr, Twine(""), false, Align(1));
+ }
+ case Intrinsic::ppc_altivec_stvx:
+ case Intrinsic::ppc_altivec_stvxl:
+ // Turn stvx -> store if the pointer is known aligned.
+ if (getOrEnforceKnownAlignment(
+ II.getArgOperand(1), Align(16), IC.getDataLayout(), &II,
+ &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
+ Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType());
+ Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy);
+ return new StoreInst(II.getArgOperand(0), Ptr, false, Align(16));
+ }
+ break;
+ case Intrinsic::ppc_vsx_stxvw4x:
+ case Intrinsic::ppc_vsx_stxvd2x: {
+ // Turn PPC VSX stores into normal stores.
+ Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType());
+ Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy);
+ return new StoreInst(II.getArgOperand(0), Ptr, false, Align(1));
+ }
+ case Intrinsic::ppc_altivec_vperm:
+ // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
+ // Note that ppc_altivec_vperm has a big-endian bias, so when creating
+ // a vectorshuffle for little endian, we must undo the transformation
+ // performed on vec_perm in altivec.h. That is, we must complement
+ // the permutation mask with respect to 31 and reverse the order of
+ // V1 and V2.
+ if (Constant *Mask = dyn_cast<Constant>(II.getArgOperand(2))) {
+ assert(cast<FixedVectorType>(Mask->getType())->getNumElements() == 16 &&
+ "Bad type for intrinsic!");
+
+ // Check that all of the elements are integer constants or undefs.
+ bool AllEltsOk = true;
+ for (unsigned i = 0; i != 16; ++i) {
+ Constant *Elt = Mask->getAggregateElement(i);
+ if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) {
+ AllEltsOk = false;
+ break;
+ }
+ }
+
+ if (AllEltsOk) {
+ // Cast the input vectors to byte vectors.
+ Value *Op0 =
+ IC.Builder.CreateBitCast(II.getArgOperand(0), Mask->getType());
+ Value *Op1 =
+ IC.Builder.CreateBitCast(II.getArgOperand(1), Mask->getType());
+ Value *Result = UndefValue::get(Op0->getType());
+
+ // Only extract each element once.
+ Value *ExtractedElts[32];
+ memset(ExtractedElts, 0, sizeof(ExtractedElts));
+
+ for (unsigned i = 0; i != 16; ++i) {
+ if (isa<UndefValue>(Mask->getAggregateElement(i)))
+ continue;
+ unsigned Idx =
+ cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue();
+ Idx &= 31; // Match the hardware behavior.
+ if (DL.isLittleEndian())
+ Idx = 31 - Idx;
+
+ if (!ExtractedElts[Idx]) {
+ Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0;
+ Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1;
+ ExtractedElts[Idx] = IC.Builder.CreateExtractElement(
+ Idx < 16 ? Op0ToUse : Op1ToUse, IC.Builder.getInt32(Idx & 15));
+ }
+
+ // Insert this value into the result vector.
+ Result = IC.Builder.CreateInsertElement(Result, ExtractedElts[Idx],
+ IC.Builder.getInt32(i));
+ }
+ return CastInst::Create(Instruction::BitCast, Result, II.getType());
+ }
+ }
+ break;
+ }
+ return None;
+}
+
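
The vperm combine above has to undo the little-endian adjustment made by altivec.h's vec_perm, which complements the mask against 31 and swaps the two source vectors. Below is a standalone sketch of only that index remapping, with illustrative names rather than LLVM API, showing which intrinsic argument and which byte a given mask value ends up reading.

#include <cstdio>

struct Pick {
  int Arg; // which intrinsic argument (0 or 1) supplies the byte
  int Elt; // byte index within that argument
};

// Mirrors the Idx handling in the loop above.
static Pick pickSourceByte(unsigned MaskByte, bool LittleEndian) {
  unsigned Idx = MaskByte & 31; // match the hardware behaviour
  if (LittleEndian)
    Idx = 31 - Idx;             // undo the altivec.h transformation
  bool FromFirstHalf = Idx < 16;
  // On little endian the two arguments are used in swapped order as well.
  int Arg = FromFirstHalf ? (LittleEndian ? 1 : 0) : (LittleEndian ? 0 : 1);
  return {Arg, static_cast<int>(Idx & 15)};
}

int main() {
  Pick BE = pickSourceByte(3, /*LittleEndian=*/false); // argument 0, byte 3
  Pick LE = pickSourceByte(3, /*LittleEndian=*/true);  // argument 0, byte 12
  std::printf("BE: arg %d byte %d; LE: arg %d byte %d\n", BE.Arg, BE.Elt,
              LE.Arg, LE.Elt);
}
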
int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind) {
if (DisablePPCConstHoist)
@@ -126,9 +234,10 @@ int PPCTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
int PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind) {
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst) {
if (DisablePPCConstHoist)
- return BaseT::getIntImmCostInst(Opcode, Idx, Imm, Ty, CostKind);
+ return BaseT::getIntImmCostInst(Opcode, Idx, Imm, Ty, CostKind, Inst);
assert(Ty->isIntegerTy());
@@ -274,8 +383,34 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo,
return false;
};
+ auto supportedHalfPrecisionOp = [](Instruction *Inst) {
+ switch (Inst->getOpcode()) {
+ default:
+ return false;
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::Load:
+ case Instruction::Store:
+ case Instruction::FPToUI:
+ case Instruction::UIToFP:
+ case Instruction::FPToSI:
+ case Instruction::SIToFP:
+ return true;
+ }
+ };
+
for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
J != JE; ++J) {
+ // There are no direct operations on half precision so assume that
+ // anything with that type requires a call except for a few select
+ // operations with Power9.
+ if (Instruction *CurrInst = dyn_cast<Instruction>(J)) {
+ for (const auto &Op : CurrInst->operands()) {
+ if (Op->getType()->getScalarType()->isHalfTy() ||
+ CurrInst->getType()->getScalarType()->isHalfTy())
+ return !(ST->isISA3_0() && supportedHalfPrecisionOp(CurrInst));
+ }
+ }
if (CallInst *CI = dyn_cast<CallInst>(J)) {
// Inline ASM is okay, unless it clobbers the ctr register.
if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledOperand())) {
@@ -297,6 +432,30 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo,
case Intrinsic::loop_decrement:
return true;
+ // Binary operations on 128-bit value will use CTR.
+ case Intrinsic::experimental_constrained_fadd:
+ case Intrinsic::experimental_constrained_fsub:
+ case Intrinsic::experimental_constrained_fmul:
+ case Intrinsic::experimental_constrained_fdiv:
+ case Intrinsic::experimental_constrained_frem:
+ if (F->getType()->getScalarType()->isFP128Ty() ||
+ F->getType()->getScalarType()->isPPC_FP128Ty())
+ return true;
+ break;
+
+ case Intrinsic::experimental_constrained_fptosi:
+ case Intrinsic::experimental_constrained_fptoui:
+ case Intrinsic::experimental_constrained_sitofp:
+ case Intrinsic::experimental_constrained_uitofp: {
+ Type *SrcType = CI->getArgOperand(0)->getType()->getScalarType();
+ Type *DstType = CI->getType()->getScalarType();
+ if (SrcType->isPPC_FP128Ty() || DstType->isPPC_FP128Ty() ||
+ isLargeIntegerTy(!TM.isPPC64(), SrcType) ||
+ isLargeIntegerTy(!TM.isPPC64(), DstType))
+ return true;
+ break;
+ }
+
// Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
// because, although it does clobber the counter register, the
// control can't then return to inside the loop unless there is also
@@ -315,6 +474,15 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo,
case Intrinsic::pow:
case Intrinsic::sin:
case Intrinsic::cos:
+ case Intrinsic::experimental_constrained_powi:
+ case Intrinsic::experimental_constrained_log:
+ case Intrinsic::experimental_constrained_log2:
+ case Intrinsic::experimental_constrained_log10:
+ case Intrinsic::experimental_constrained_exp:
+ case Intrinsic::experimental_constrained_exp2:
+ case Intrinsic::experimental_constrained_pow:
+ case Intrinsic::experimental_constrained_sin:
+ case Intrinsic::experimental_constrained_cos:
return true;
case Intrinsic::copysign:
if (CI->getArgOperand(0)->getType()->getScalarType()->
@@ -336,6 +504,54 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo,
case Intrinsic::llround: Opcode = ISD::LLROUND; break;
case Intrinsic::minnum: Opcode = ISD::FMINNUM; break;
case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break;
+ case Intrinsic::experimental_constrained_fcmp:
+ Opcode = ISD::STRICT_FSETCC;
+ break;
+ case Intrinsic::experimental_constrained_fcmps:
+ Opcode = ISD::STRICT_FSETCCS;
+ break;
+ case Intrinsic::experimental_constrained_fma:
+ Opcode = ISD::STRICT_FMA;
+ break;
+ case Intrinsic::experimental_constrained_sqrt:
+ Opcode = ISD::STRICT_FSQRT;
+ break;
+ case Intrinsic::experimental_constrained_floor:
+ Opcode = ISD::STRICT_FFLOOR;
+ break;
+ case Intrinsic::experimental_constrained_ceil:
+ Opcode = ISD::STRICT_FCEIL;
+ break;
+ case Intrinsic::experimental_constrained_trunc:
+ Opcode = ISD::STRICT_FTRUNC;
+ break;
+ case Intrinsic::experimental_constrained_rint:
+ Opcode = ISD::STRICT_FRINT;
+ break;
+ case Intrinsic::experimental_constrained_lrint:
+ Opcode = ISD::STRICT_LRINT;
+ break;
+ case Intrinsic::experimental_constrained_llrint:
+ Opcode = ISD::STRICT_LLRINT;
+ break;
+ case Intrinsic::experimental_constrained_nearbyint:
+ Opcode = ISD::STRICT_FNEARBYINT;
+ break;
+ case Intrinsic::experimental_constrained_round:
+ Opcode = ISD::STRICT_FROUND;
+ break;
+ case Intrinsic::experimental_constrained_lround:
+ Opcode = ISD::STRICT_LROUND;
+ break;
+ case Intrinsic::experimental_constrained_llround:
+ Opcode = ISD::STRICT_LLROUND;
+ break;
+ case Intrinsic::experimental_constrained_minnum:
+ Opcode = ISD::STRICT_FMINNUM;
+ break;
+ case Intrinsic::experimental_constrained_maxnum:
+ Opcode = ISD::STRICT_FMAXNUM;
+ break;
case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break;
case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break;
}
@@ -597,10 +813,7 @@ bool PPCTTIImpl::useColdCCForColdCall(Function &F) {
}
bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
- // On the A2, always unroll aggressively. For QPX unaligned loads, we depend
- // on combining the loads generated for consecutive accesses, and failure to
- // do so is particularly expensive. This makes it much more likely (compared
- // to only using concatenation unrolling).
+ // On the A2, always unroll aggressively.
if (ST->getCPUDirective() == PPC::DIR_A2)
return true;
@@ -660,7 +873,6 @@ const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const {
unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const {
if (Vector) {
- if (ST->hasQPX()) return 256;
if (ST->hasAltivec()) return 128;
return 0;
}
@@ -689,8 +901,6 @@ unsigned PPCTTIImpl::getCacheLineSize() const {
}
unsigned PPCTTIImpl::getPrefetchDistance() const {
- // This seems like a reasonable default for the BG/Q (this pass is enabled, by
- // default, only on the BG/Q).
return 300;
}
@@ -779,7 +989,7 @@ int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
- // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations
+  // PPC, for both Altivec and VSX, supports cheap arbitrary permutations
// (at least in the sense that there need only be one non-loop-invariant
// instruction). We need one such shuffle instruction for each actual
// register (this is not true for arbitrary shuffles, but is true for the
@@ -796,11 +1006,12 @@ int PPCTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
}
int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::CastContextHint CCH,
TTI::TargetCostKind CostKind,
const Instruction *I) {
assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
- int Cost = BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I);
+ int Cost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
Cost = vectorCostAdjustment(Cost, Opcode, Dst, Src);
// TODO: Allow non-throughput costs that aren't binary.
if (CostKind != TTI::TCK_RecipThroughput)
@@ -809,9 +1020,11 @@ int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
}
int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ CmpInst::Predicate VecPred,
TTI::TargetCostKind CostKind,
const Instruction *I) {
- int Cost = BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
+ int Cost =
+ BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
// TODO: Handle other cost kinds.
if (CostKind != TTI::TCK_RecipThroughput)
return Cost;
@@ -835,13 +1048,6 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
return Cost;
- } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
- // Floating point scalars are already located in index #0.
- if (Index == 0)
- return 0;
-
- return Cost;
-
} else if (Val->getScalarType()->isIntegerTy() && Index != -1U) {
if (ST->hasP9Altivec()) {
if (ISD == ISD::INSERT_VECTOR_ELT)
@@ -865,7 +1071,7 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
// The cost of the load constant for a vector extract is disregarded
// (invariant, easily schedulable).
return vectorCostAdjustment(1, Opcode, Val, nullptr);
-
+
} else if (ST->hasDirectMove())
// Assume permute has standard cost.
// Assume move-to/move-from VSR have 2x standard cost.
@@ -916,8 +1122,6 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
bool IsVSXType = ST->hasVSX() &&
(LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
- bool IsQPXType = ST->hasQPX() &&
- (LT.second == MVT::v4f64 || LT.second == MVT::v4f32);
// VSX has 32b/64b load instructions. Legalization can handle loading of
// 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
@@ -940,8 +1144,7 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
// for Altivec types using the VSX instructions, but that's more expensive
// than using the permutation-based load sequence. On the P8, that's no
// longer true.
- if (Opcode == Instruction::Load &&
- ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
+ if (Opcode == Instruction::Load && (!ST->hasP8Vector() && IsAltivecType) &&
*Alignment >= LT.second.getScalarType().getStoreSize())
return Cost + LT.first; // Add the cost of the permutations.
@@ -994,7 +1197,7 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(
getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), AddressSpace,
CostKind);
- // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations
+  // PPC, for both Altivec and VSX, supports cheap arbitrary permutations
// (at least in the sense that there need only be one non-loop-invariant
// instruction). For each result vector, we need one shuffle per incoming
// vector (except that the first shuffle can take two incoming vectors
@@ -1044,3 +1247,51 @@ bool PPCTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
else
return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
}
+
+bool PPCTTIImpl::isNumRegsMajorCostOfLSR() {
+ return false;
+}
+
+bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
+ MemIntrinsicInfo &Info) {
+ switch (Inst->getIntrinsicID()) {
+ case Intrinsic::ppc_altivec_lvx:
+ case Intrinsic::ppc_altivec_lvxl:
+ case Intrinsic::ppc_altivec_lvebx:
+ case Intrinsic::ppc_altivec_lvehx:
+ case Intrinsic::ppc_altivec_lvewx:
+ case Intrinsic::ppc_vsx_lxvd2x:
+ case Intrinsic::ppc_vsx_lxvw4x:
+ case Intrinsic::ppc_vsx_lxvd2x_be:
+ case Intrinsic::ppc_vsx_lxvw4x_be:
+ case Intrinsic::ppc_vsx_lxvl:
+ case Intrinsic::ppc_vsx_lxvll:
+ case Intrinsic::ppc_vsx_lxvp: {
+ Info.PtrVal = Inst->getArgOperand(0);
+ Info.ReadMem = true;
+ Info.WriteMem = false;
+ return true;
+ }
+ case Intrinsic::ppc_altivec_stvx:
+ case Intrinsic::ppc_altivec_stvxl:
+ case Intrinsic::ppc_altivec_stvebx:
+ case Intrinsic::ppc_altivec_stvehx:
+ case Intrinsic::ppc_altivec_stvewx:
+ case Intrinsic::ppc_vsx_stxvd2x:
+ case Intrinsic::ppc_vsx_stxvw4x:
+ case Intrinsic::ppc_vsx_stxvd2x_be:
+ case Intrinsic::ppc_vsx_stxvw4x_be:
+ case Intrinsic::ppc_vsx_stxvl:
+ case Intrinsic::ppc_vsx_stxvll:
+ case Intrinsic::ppc_vsx_stxvp: {
+ Info.PtrVal = Inst->getArgOperand(1);
+ Info.ReadMem = false;
+ Info.WriteMem = true;
+ return true;
+ }
+ default:
+ break;
+ }
+
+ return false;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index d998521084e1..bc946715156f 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -41,6 +41,9 @@ public:
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
+ Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
+ IntrinsicInst &II) const;
+
/// \name Scalar TTI Implementations
/// @{
@@ -49,7 +52,8 @@ public:
TTI::TargetCostKind CostKind);
int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty, TTI::TargetCostKind CostKind);
+ Type *Ty, TTI::TargetCostKind CostKind,
+ Instruction *Inst = nullptr);
int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
Type *Ty, TTI::TargetCostKind CostKind);
@@ -64,12 +68,14 @@ public:
bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI,
DominatorTree *DT, AssumptionCache *AC,
TargetLibraryInfo *LibInfo);
+ bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP);
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
TargetTransformInfo::LSRCost &C2);
+ bool isNumRegsMajorCostOfLSR();
/// @}
@@ -103,10 +109,11 @@ public:
const Instruction *CxtI = nullptr);
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
- TTI::TargetCostKind CostKind,
+ TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ CmpInst::Predicate VecPred,
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
index 3e6d1c7939f1..e72e29112da7 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
@@ -315,9 +315,9 @@ protected:
// Extend the live interval of the addend source (it might end at the
// copy to be removed, or somewhere in between there and here). This
// is necessary only if it is a physical register.
- if (!Register::isVirtualRegister(AddendSrcReg))
- for (MCRegUnitIterator Units(AddendSrcReg, TRI); Units.isValid();
- ++Units) {
+ if (!AddendSrcReg.isVirtual())
+ for (MCRegUnitIterator Units(AddendSrcReg.asMCReg(), TRI);
+ Units.isValid(); ++Units) {
unsigned Unit = *Units;
LiveRange &AddendSrcRange = LIS->getRegUnit(Unit);
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
index c3729da0b07b..ff251f55afff 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
@@ -254,10 +254,11 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
if (!MO.isReg())
continue;
Register Reg = MO.getReg();
- if (isAnyVecReg(Reg, Partial)) {
+ // All operands need to be checked because there are instructions that
+ // operate on a partial register and produce a full register (such as
+ // XXPERMDIs).
+ if (isAnyVecReg(Reg, Partial))
RelevantInstr = true;
- break;
- }
}
if (!RelevantInstr)
@@ -689,6 +690,29 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
LLVM_DEBUG(UseMI.dump());
LLVM_DEBUG(dbgs() << "\n");
}
+
+ // It is possible that the load feeds a swap and that swap feeds a
+ // store. In such a case, the code is actually trying to store a swapped
+ // vector. We must reject such webs.
+ if (SwapVector[UseIdx].IsSwap && !SwapVector[UseIdx].IsLoad &&
+ !SwapVector[UseIdx].IsStore) {
+ Register SwapDefReg = UseMI.getOperand(0).getReg();
+ for (MachineInstr &UseOfUseMI :
+ MRI->use_nodbg_instructions(SwapDefReg)) {
+ int UseOfUseIdx = SwapMap[&UseOfUseMI];
+ if (SwapVector[UseOfUseIdx].IsStore) {
+ SwapVector[Repr].WebRejected = 1;
+ LLVM_DEBUG(
+ dbgs() << format(
+ "Web %d rejected for load/swap feeding a store\n", Repr));
+ LLVM_DEBUG(dbgs() << " def " << EntryIdx << ": ");
+ LLVM_DEBUG(MI->dump());
+ LLVM_DEBUG(dbgs() << " use " << UseIdx << ": ");
+ LLVM_DEBUG(UseMI.dump());
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+ }
+ }
}
// Reject webs that contain swapping stores that are fed by something
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
index 649bd648a6cf..6bb952f27fee 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
@@ -14,6 +14,10 @@ Target &llvm::getThePPC32Target() {
static Target ThePPC32Target;
return ThePPC32Target;
}
+Target &llvm::getThePPC32LETarget() {
+ static Target ThePPC32LETarget;
+ return ThePPC32LETarget;
+}
Target &llvm::getThePPC64Target() {
static Target ThePPC64Target;
return ThePPC64Target;
@@ -24,9 +28,12 @@ Target &llvm::getThePPC64LETarget() {
}
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTargetInfo() {
- RegisterTarget<Triple::ppc, /*HasJIT=*/true> X(getThePPC32Target(), "ppc32",
+ RegisterTarget<Triple::ppc, /*HasJIT=*/true> W(getThePPC32Target(), "ppc32",
"PowerPC 32", "PPC");
+ RegisterTarget<Triple::ppcle, /*HasJIT=*/true> X(
+ getThePPC32LETarget(), "ppc32le", "PowerPC 32 LE", "PPC");
+
RegisterTarget<Triple::ppc64, /*HasJIT=*/true> Y(getThePPC64Target(), "ppc64",
"PowerPC 64", "PPC");
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.h
index 2d0afbfb1be0..f9d20ef00df8 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.h
@@ -14,6 +14,7 @@ namespace llvm {
class Target;
Target &getThePPC32Target();
+Target &getThePPC32LETarget();
Target &getThePPC64Target();
Target &getThePPC64LETarget();
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 407f980bd35e..e7e590153605 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -7,20 +7,18 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/RISCVAsmBackend.h"
+#include "MCTargetDesc/RISCVBaseInfo.h"
+#include "MCTargetDesc/RISCVInstPrinter.h"
#include "MCTargetDesc/RISCVMCExpr.h"
#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "MCTargetDesc/RISCVMatInt.h"
#include "MCTargetDesc/RISCVTargetStreamer.h"
-#include "RISCVInstrInfo.h"
#include "TargetInfo/RISCVTargetInfo.h"
-#include "Utils/RISCVBaseInfo.h"
-#include "Utils/RISCVMatInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/CodeGen/Register.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -33,6 +31,7 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCValue.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/RISCVAttributes.h"
@@ -99,7 +98,7 @@ class RISCVAsmParser : public MCTargetAsmParser {
// Helper to emit a combination of LUI, ADDI(W), and SLLI instructions that
// synthesize the desired immedate value into the destination register.
- void emitLoadImm(Register DestReg, int64_t Value, MCStreamer &Out);
+ void emitLoadImm(MCRegister DestReg, int64_t Value, MCStreamer &Out);
// Helper to emit a combination of AUIPC and SecondOpcode. Used to implement
// helpers such as emitLoadLocalAddress and emitLoadAddress.
@@ -125,6 +124,13 @@ class RISCVAsmParser : public MCTargetAsmParser {
void emitLoadStoreSymbol(MCInst &Inst, unsigned Opcode, SMLoc IDLoc,
MCStreamer &Out, bool HasTmpReg);
+ // Helper to emit pseudo sign/zero extend instruction.
+ void emitPseudoExtend(MCInst &Inst, bool SignExtend, int64_t Width,
+ SMLoc IDLoc, MCStreamer &Out);
+
+ // Helper to emit pseudo vmsge{u}.vx instruction.
+ void emitVMSGE(MCInst &Inst, unsigned Opcode, SMLoc IDLoc, MCStreamer &Out);
+
// Checks that a PseudoAddTPRel is using x4/tp in its second input operand.
// Enforcing this using a restricted register class for the second input
// operand of PseudoAddTPRel results in a poor diagnostic due to the fact
@@ -217,8 +223,7 @@ public:
};
static bool classifySymbolRef(const MCExpr *Expr,
- RISCVMCExpr::VariantKind &Kind,
- int64_t &Addend);
+ RISCVMCExpr::VariantKind &Kind);
RISCVAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
const MCInstrInfo &MII, const MCTargetOptions &Options)
@@ -262,7 +267,7 @@ struct RISCVOperand : public MCParsedAsmOperand {
bool IsRV64;
struct RegOp {
- Register RegNum;
+ MCRegister RegNum;
};
struct ImmOp {
@@ -277,23 +282,8 @@ struct RISCVOperand : public MCParsedAsmOperand {
// e.g.: read/write or user/supervisor/machine privileges.
};
- enum class VSEW {
- SEW_8 = 0,
- SEW_16,
- SEW_32,
- SEW_64,
- SEW_128,
- SEW_256,
- SEW_512,
- SEW_1024,
- };
-
- enum class VLMUL { LMUL_1 = 0, LMUL_2, LMUL_4, LMUL_8 };
-
struct VTypeOp {
- VSEW Sew;
- VLMUL Lmul;
- unsigned Encoding;
+ unsigned Val;
};
SMLoc StartLoc, EndLoc;
@@ -373,7 +363,7 @@ public:
bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
bool IsValid;
if (!IsConstantImm)
- IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm);
+ IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK);
else
IsValid = isShiftedInt<N - 1, 1>(Imm);
return IsValid && VK == RISCVMCExpr::VK_RISCV_None;
@@ -387,7 +377,7 @@ public:
// Must be of 'immediate' type but not a constant.
if (!isImm() || evaluateConstantImm(getImm(), Imm, VK))
return false;
- return RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm) &&
+ return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
VK == RISCVMCExpr::VK_RISCV_None;
}
@@ -397,7 +387,7 @@ public:
// Must be of 'immediate' type but not a constant.
if (!isImm() || evaluateConstantImm(getImm(), Imm, VK))
return false;
- return RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm) &&
+ return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
(VK == RISCVMCExpr::VK_RISCV_CALL ||
VK == RISCVMCExpr::VK_RISCV_CALL_PLT);
}
@@ -408,7 +398,7 @@ public:
// Must be of 'immediate' type but not a constant.
if (!isImm() || evaluateConstantImm(getImm(), Imm, VK))
return false;
- return RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm) &&
+ return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
VK == RISCVMCExpr::VK_RISCV_CALL;
}
@@ -418,7 +408,7 @@ public:
// Must be of 'immediate' type but not a constant.
if (!isImm() || evaluateConstantImm(getImm(), Imm, VK))
return false;
- return RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm) &&
+ return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
VK == RISCVMCExpr::VK_RISCV_TPREL_ADD;
}
@@ -523,16 +513,6 @@ public:
return IsConstantImm && isUInt<5>(Imm) && VK == RISCVMCExpr::VK_RISCV_None;
}
- bool isUImm5NonZero() const {
- int64_t Imm;
- RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
- if (!isImm())
- return false;
- bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
- return IsConstantImm && isUInt<5>(Imm) && (Imm != 0) &&
- VK == RISCVMCExpr::VK_RISCV_None;
- }
-
bool isSImm5() const {
if (!isImm())
return false;
@@ -549,7 +529,7 @@ public:
int64_t Imm;
bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
return IsConstantImm && isInt<6>(Imm) &&
- VK == RISCVMCExpr::VK_RISCV_None;
+ VK == RISCVMCExpr::VK_RISCV_None;
}
bool isSImm6NonZero() const {
@@ -633,7 +613,7 @@ public:
return false;
bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
if (!IsConstantImm)
- IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm);
+ IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK);
else
IsValid = isInt<12>(Imm);
return IsValid && ((IsConstantImm && VK == RISCVMCExpr::VK_RISCV_None) ||
@@ -664,7 +644,7 @@ public:
return false;
bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
if (!IsConstantImm) {
- IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm);
+ IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK);
return IsValid && (VK == RISCVMCExpr::VK_RISCV_HI ||
VK == RISCVMCExpr::VK_RISCV_TPREL_HI);
} else {
@@ -682,7 +662,7 @@ public:
return false;
bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
if (!IsConstantImm) {
- IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm);
+ IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK);
return IsValid && (VK == RISCVMCExpr::VK_RISCV_PCREL_HI ||
VK == RISCVMCExpr::VK_RISCV_GOT_HI ||
VK == RISCVMCExpr::VK_RISCV_TLS_GOT_HI ||
@@ -730,7 +710,7 @@ public:
}
StringRef getSysReg() const {
- assert(Kind == KindTy::SystemRegister && "Invalid access!");
+ assert(Kind == KindTy::SystemRegister && "Invalid type access!");
return StringRef(SysReg.Data, SysReg.Length);
}
@@ -744,59 +724,25 @@ public:
return Tok;
}
- static StringRef getSEWStr(VSEW Sew) {
- switch (Sew) {
- case VSEW::SEW_8:
- return "e8";
- case VSEW::SEW_16:
- return "e16";
- case VSEW::SEW_32:
- return "e32";
- case VSEW::SEW_64:
- return "e64";
- case VSEW::SEW_128:
- return "e128";
- case VSEW::SEW_256:
- return "e256";
- case VSEW::SEW_512:
- return "e512";
- case VSEW::SEW_1024:
- return "e1024";
- }
- return "";
- }
-
- static StringRef getLMULStr(VLMUL Lmul) {
- switch (Lmul) {
- case VLMUL::LMUL_1:
- return "m1";
- case VLMUL::LMUL_2:
- return "m2";
- case VLMUL::LMUL_4:
- return "m4";
- case VLMUL::LMUL_8:
- return "m8";
- }
- return "";
- }
-
- StringRef getVType(SmallString<32> &Buf) const {
- assert(Kind == KindTy::VType && "Invalid access!");
- Buf.append(getSEWStr(VType.Sew));
- Buf.append(",");
- Buf.append(getLMULStr(VType.Lmul));
-
- return Buf.str();
+ unsigned getVType() const {
+ assert(Kind == KindTy::VType && "Invalid type access!");
+ return VType.Val;
}
void print(raw_ostream &OS) const override {
+ auto RegName = [](unsigned Reg) {
+ if (Reg)
+ return RISCVInstPrinter::getRegisterName(Reg);
+ else
+ return "noreg";
+ };
+
switch (Kind) {
case KindTy::Immediate:
OS << *getImm();
break;
case KindTy::Register:
- OS << "<register x";
- OS << getReg() << ">";
+ OS << "<register " << RegName(getReg()) << ">";
break;
case KindTy::Token:
OS << "'" << getToken() << "'";
@@ -805,8 +751,9 @@ public:
OS << "<sysreg: " << getSysReg() << '>';
break;
case KindTy::VType:
- SmallString<32> VTypeBuf;
- OS << "<vtype: " << getVType(VTypeBuf) << '>';
+ OS << "<vtype: ";
+ RISCVVType::printVType(getVType(), OS);
+ OS << '>';
break;
}
}
@@ -852,15 +799,10 @@ public:
return Op;
}
- static std::unique_ptr<RISCVOperand> createVType(APInt Sew, APInt Lmul,
- SMLoc S, bool IsRV64) {
+ static std::unique_ptr<RISCVOperand> createVType(unsigned VTypeI, SMLoc S,
+ bool IsRV64) {
auto Op = std::make_unique<RISCVOperand>(KindTy::VType);
- Sew.ashrInPlace(3);
- unsigned SewLog2 = Sew.logBase2();
- unsigned LmulLog2 = Lmul.logBase2();
- Op->VType.Sew = static_cast<VSEW>(SewLog2);
- Op->VType.Lmul = static_cast<VLMUL>(LmulLog2);
- Op->VType.Encoding = (SewLog2 << 2) | LmulLog2;
+ Op->VType.Val = VTypeI;
Op->StartLoc = S;
Op->IsRV64 = IsRV64;
return Op;
@@ -889,16 +831,6 @@ public:
addExpr(Inst, getImm());
}
- void addSImm5Plus1Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- int64_t Imm = 0;
- RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
- bool IsConstant = evaluateConstantImm(getImm(), Imm, VK);
- assert(IsConstant && "Expect constant value!");
- (void)IsConstant;
- Inst.addOperand(MCOperand::createImm(Imm - 1));
- }
-
void addFenceArgOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// isFenceArg has validated the operand, meaning this cast is safe
@@ -925,7 +857,7 @@ public:
void addVTypeIOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createImm(VType.Encoding));
+ Inst.addOperand(MCOperand::createImm(getVType()));
}
// Returns the rounding mode represented by this RISCVOperand. Should only
@@ -952,7 +884,12 @@ public:
#define GET_MNEMONIC_SPELL_CHECKER
#include "RISCVGenAsmMatcher.inc"
-static Register convertFPR64ToFPR32(Register Reg) {
+static MCRegister convertFPR64ToFPR16(MCRegister Reg) {
+ assert(Reg >= RISCV::F0_D && Reg <= RISCV::F31_D && "Invalid register");
+ return Reg - RISCV::F0_D + RISCV::F0_H;
+}
+
+static MCRegister convertFPR64ToFPR32(MCRegister Reg) {
assert(Reg >= RISCV::F0_D && Reg <= RISCV::F31_D && "Invalid register");
return Reg - RISCV::F0_D + RISCV::F0_F;
}
@@ -963,7 +900,7 @@ unsigned RISCVAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
if (!Op.isReg())
return Match_InvalidOperand;
- Register Reg = Op.getReg();
+ MCRegister Reg = Op.getReg();
bool IsRegFPR64 =
RISCVMCRegisterClasses[RISCV::FPR64RegClassID].contains(Reg);
bool IsRegFPR64C =
@@ -976,6 +913,12 @@ unsigned RISCVAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
Op.Reg.RegNum = convertFPR64ToFPR32(Reg);
return Match_Success;
}
+ // As the parser couldn't differentiate an FPR16 from an FPR64, coerce the
+ // register from FPR64 to FPR16 if necessary.
+ if (IsRegFPR64 && Kind == MCK_FPR16) {
+ Op.Reg.RegNum = convertFPR64ToFPR16(Reg);
+ return Match_Success;
+ }
return Match_InvalidOperand;
}
@@ -1079,6 +1022,9 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 4) - 1);
case Match_InvalidUImm5:
return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 5) - 1);
+ case Match_InvalidSImm5:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 4),
+ (1 << 4) - 1);
case Match_InvalidSImm6:
return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 5),
(1 << 5) - 1);
@@ -1181,8 +1127,10 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
}
case Match_InvalidVTypeI: {
SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
- return Error(ErrorLoc,
- "operand must be e[8|16|32|64|128|256|512|1024],m[1|2|4|8]");
+ return Error(
+ ErrorLoc,
+ "operand must be "
+ "e[8|16|32|64|128|256|512|1024],m[1|2|4|8|f2|f4|f8],[ta|tu],[ma|mu]");
}
case Match_InvalidVMaskRegister: {
SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
@@ -1202,13 +1150,15 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// alternative ABI names), setting RegNo to the matching register. Upon
// failure, returns true and sets RegNo to 0. If IsRV32E then registers
// x16-x31 will be rejected.
-static bool matchRegisterNameHelper(bool IsRV32E, Register &RegNo,
+static bool matchRegisterNameHelper(bool IsRV32E, MCRegister &RegNo,
StringRef Name) {
RegNo = MatchRegisterName(Name);
- // The 32- and 64-bit FPRs have the same asm name. Check that the initial
- // match always matches the 64-bit variant, and not the 32-bit one.
+ // The 16-/32- and 64-bit FPRs have the same asm name. Check that the initial
+ // match always matches the 64-bit variant, and not the 16/32-bit one.
+ assert(!(RegNo >= RISCV::F0_H && RegNo <= RISCV::F31_H));
assert(!(RegNo >= RISCV::F0_F && RegNo <= RISCV::F31_F));
// The default FPR register class is based on the tablegen enum ordering.
+ static_assert(RISCV::F0_D < RISCV::F0_H, "FPR matching must be updated");
static_assert(RISCV::F0_D < RISCV::F0_F, "FPR matching must be updated");
if (RegNo == RISCV::NoRegister)
RegNo = MatchRegisterAltName(Name);
@@ -1233,7 +1183,7 @@ OperandMatchResultTy RISCVAsmParser::tryParseRegister(unsigned &RegNo,
RegNo = 0;
StringRef Name = getLexer().getTok().getIdentifier();
- if (matchRegisterNameHelper(isRV32E(), (Register &)RegNo, Name))
+ if (matchRegisterNameHelper(isRV32E(), (MCRegister &)RegNo, Name))
return MatchOperand_NoMatch;
getParser().Lex(); // Eat identifier token.
@@ -1265,7 +1215,7 @@ OperandMatchResultTy RISCVAsmParser::parseRegister(OperandVector &Operands,
return MatchOperand_NoMatch;
case AsmToken::Identifier:
StringRef Name = getLexer().getTok().getIdentifier();
- Register RegNo;
+ MCRegister RegNo;
matchRegisterNameHelper(isRV32E(), RegNo, Name);
if (RegNo == RISCV::NoRegister) {
@@ -1549,39 +1499,75 @@ OperandMatchResultTy RISCVAsmParser::parseVTypeI(OperandVector &Operands) {
if (getLexer().getKind() != AsmToken::Identifier)
return MatchOperand_NoMatch;
- // Parse "e8,m1"
+ // Parse "e8,m1,t[a|u],m[a|u]"
StringRef Name = getLexer().getTok().getIdentifier();
if (!Name.consume_front("e"))
return MatchOperand_NoMatch;
- APInt Sew(16, Name, 10);
- if (Sew != 8 && Sew != 16 && Sew != 32 && Sew != 64 && Sew != 128 &&
- Sew != 256 && Sew != 512 && Sew != 1024)
+ unsigned Sew;
+ if (Name.getAsInteger(10, Sew))
+ return MatchOperand_NoMatch;
+ if (!RISCVVType::isValidSEW(Sew))
return MatchOperand_NoMatch;
getLexer().Lex();
- if (getLexer().getKind() == AsmToken::EndOfStatement) {
- Operands.push_back(
- RISCVOperand::createVType(Sew, APInt(16, 1), S, isRV64()));
+ if (!getLexer().is(AsmToken::Comma))
+ return MatchOperand_NoMatch;
+ getLexer().Lex();
- return MatchOperand_Success;
- }
+ Name = getLexer().getTok().getIdentifier();
+ if (!Name.consume_front("m"))
+ return MatchOperand_NoMatch;
+ // "m" or "mf"
+ bool Fractional = Name.consume_front("f");
+ unsigned Lmul;
+ if (Name.getAsInteger(10, Lmul))
+ return MatchOperand_NoMatch;
+ if (!RISCVVType::isValidLMUL(Lmul, Fractional))
+ return MatchOperand_NoMatch;
+ getLexer().Lex();
if (!getLexer().is(AsmToken::Comma))
return MatchOperand_NoMatch;
getLexer().Lex();
Name = getLexer().getTok().getIdentifier();
- if (!Name.consume_front("m"))
+ // ta or tu
+ bool TailAgnostic;
+ if (Name == "ta")
+ TailAgnostic = true;
+ else if (Name == "tu")
+ TailAgnostic = false;
+ else
+ return MatchOperand_NoMatch;
+ getLexer().Lex();
+
+ if (!getLexer().is(AsmToken::Comma))
return MatchOperand_NoMatch;
- APInt Lmul(16, Name, 10);
- if (Lmul != 1 && Lmul != 2 && Lmul != 4 && Lmul != 8)
+ getLexer().Lex();
+
+ Name = getLexer().getTok().getIdentifier();
+ // ma or mu
+ bool MaskAgnostic;
+ if (Name == "ma")
+ MaskAgnostic = true;
+ else if (Name == "mu")
+ MaskAgnostic = false;
+ else
return MatchOperand_NoMatch;
getLexer().Lex();
if (getLexer().getKind() != AsmToken::EndOfStatement)
return MatchOperand_NoMatch;
- Operands.push_back(RISCVOperand::createVType(Sew, Lmul, S, isRV64()));
+ unsigned SewLog2 = Log2_32(Sew / 8);
+ unsigned LmulLog2 = Log2_32(Lmul);
+ RISCVVSEW VSEW = static_cast<RISCVVSEW>(SewLog2);
+ RISCVVLMUL VLMUL =
+ static_cast<RISCVVLMUL>(Fractional ? 8 - LmulLog2 : LmulLog2);
+
+ unsigned VTypeI =
+ RISCVVType::encodeVTYPE(VLMUL, VSEW, TailAgnostic, MaskAgnostic);
+ Operands.push_back(RISCVOperand::createVType(VTypeI, S, isRV64()));
return MatchOperand_Success;
}
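
The end of parseVTypeI above converts the textual SEW and LMUL into the log2-based field values handed to RISCVVType::encodeVTYPE, with fractional LMUL folded in as 8 minus the log. Below is a standalone sketch of just that mapping; the final vtype bit packing done by encodeVTYPE is not reproduced here, and the helper names are illustrative.

#include <cstdio>

static unsigned log2u(unsigned V) {
  unsigned L = 0;
  while (V >>= 1)
    ++L;
  return L;
}

// e8 -> 0, e16 -> 1, e32 -> 2, e64 -> 3, ...
static unsigned sewField(unsigned Sew) { return log2u(Sew / 8); }

// m1..m8 -> 0..3; fractional mf2/mf4/mf8 -> 7/6/5, as in the hunk above.
static unsigned lmulField(unsigned Lmul, bool Fractional) {
  unsigned L = log2u(Lmul);
  return Fractional ? 8 - L : L;
}

int main() {
  std::printf("e32=%u m1=%u mf4=%u\n", sewField(32), lmulField(1, false),
              lmulField(4, true)); // prints: e32=2 m1=0 mf4=6
}
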
@@ -1596,7 +1582,7 @@ OperandMatchResultTy RISCVAsmParser::parseMaskReg(OperandVector &Operands) {
Error(getLoc(), "expected '.t' suffix");
return MatchOperand_ParseFail;
}
- Register RegNo;
+ MCRegister RegNo;
matchRegisterNameHelper(isRV32E(), RegNo, Name);
if (RegNo == RISCV::NoRegister)
@@ -1788,48 +1774,19 @@ bool RISCVAsmParser::ParseInstruction(ParseInstructionInfo &Info,
}
bool RISCVAsmParser::classifySymbolRef(const MCExpr *Expr,
- RISCVMCExpr::VariantKind &Kind,
- int64_t &Addend) {
+ RISCVMCExpr::VariantKind &Kind) {
Kind = RISCVMCExpr::VK_RISCV_None;
- Addend = 0;
if (const RISCVMCExpr *RE = dyn_cast<RISCVMCExpr>(Expr)) {
Kind = RE->getKind();
Expr = RE->getSubExpr();
}
- // It's a simple symbol reference or constant with no addend.
- if (isa<MCConstantExpr>(Expr) || isa<MCSymbolRefExpr>(Expr))
- return true;
-
- const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr);
- if (!BE)
- return false;
-
- if (!isa<MCSymbolRefExpr>(BE->getLHS()))
- return false;
-
- if (BE->getOpcode() != MCBinaryExpr::Add &&
- BE->getOpcode() != MCBinaryExpr::Sub)
- return false;
-
- // We are able to support the subtraction of two symbol references
- if (BE->getOpcode() == MCBinaryExpr::Sub &&
- isa<MCSymbolRefExpr>(BE->getRHS()))
- return true;
-
- // See if the addend is a constant, otherwise there's more going
- // on here than we can deal with.
- auto AddendExpr = dyn_cast<MCConstantExpr>(BE->getRHS());
- if (!AddendExpr)
- return false;
-
- Addend = AddendExpr->getValue();
- if (BE->getOpcode() == MCBinaryExpr::Sub)
- Addend = -Addend;
-
- // It's some symbol reference + a constant addend
- return Kind != RISCVMCExpr::VK_RISCV_Invalid;
+ MCValue Res;
+ MCFixup Fixup;
+ if (Expr->evaluateAsRelocatable(Res, nullptr, &Fixup))
+ return Res.getRefKind() == RISCVMCExpr::VK_RISCV_None;
+ return false;
}
bool RISCVAsmParser::ParseDirective(AsmToken DirectiveID) {
@@ -2040,7 +1997,33 @@ bool RISCVAsmParser::parseDirectiveAttribute() {
else
return Error(ValueExprLoc, "bad arch string " + Arch);
+ // .attribute arch overrides the current architecture, so unset all
+ // currently enabled extensions
+ clearFeatureBits(RISCV::FeatureRV32E, "e");
+ clearFeatureBits(RISCV::FeatureStdExtM, "m");
+ clearFeatureBits(RISCV::FeatureStdExtA, "a");
+ clearFeatureBits(RISCV::FeatureStdExtF, "f");
+ clearFeatureBits(RISCV::FeatureStdExtD, "d");
+ clearFeatureBits(RISCV::FeatureStdExtC, "c");
+ clearFeatureBits(RISCV::FeatureStdExtB, "experimental-b");
+ clearFeatureBits(RISCV::FeatureStdExtV, "experimental-v");
+ clearFeatureBits(RISCV::FeatureExtZfh, "experimental-zfh");
+ clearFeatureBits(RISCV::FeatureExtZba, "experimental-zba");
+ clearFeatureBits(RISCV::FeatureExtZbb, "experimental-zbb");
+ clearFeatureBits(RISCV::FeatureExtZbc, "experimental-zbc");
+ clearFeatureBits(RISCV::FeatureExtZbe, "experimental-zbe");
+ clearFeatureBits(RISCV::FeatureExtZbf, "experimental-zbf");
+ clearFeatureBits(RISCV::FeatureExtZbm, "experimental-zbm");
+ clearFeatureBits(RISCV::FeatureExtZbp, "experimental-zbp");
+ clearFeatureBits(RISCV::FeatureExtZbproposedc, "experimental-zbproposedc");
+ clearFeatureBits(RISCV::FeatureExtZbr, "experimental-zbr");
+ clearFeatureBits(RISCV::FeatureExtZbs, "experimental-zbs");
+ clearFeatureBits(RISCV::FeatureExtZbt, "experimental-zbt");
+ clearFeatureBits(RISCV::FeatureExtZvamo, "experimental-zvamo");
+ clearFeatureBits(RISCV::FeatureStdExtZvlsseg, "experimental-zvlsseg");
+
while (!Arch.empty()) {
+ bool DropFirst = true;
if (Arch[0] == 'i')
clearFeatureBits(RISCV::FeatureRV32E, "e");
else if (Arch[0] == 'e')
@@ -2062,19 +2045,57 @@ bool RISCVAsmParser::parseDirectiveAttribute() {
setFeatureBits(RISCV::FeatureStdExtD, "d");
} else if (Arch[0] == 'c') {
setFeatureBits(RISCV::FeatureStdExtC, "c");
+ } else if (Arch[0] == 'b') {
+ setFeatureBits(RISCV::FeatureStdExtB, "experimental-b");
+ } else if (Arch[0] == 'v') {
+ setFeatureBits(RISCV::FeatureStdExtV, "experimental-v");
+ } else if (Arch[0] == 's' || Arch[0] == 'x' || Arch[0] == 'z') {
+ StringRef Ext =
+ Arch.take_until([](char c) { return ::isdigit(c) || c == '_'; });
+ if (Ext == "zba")
+ setFeatureBits(RISCV::FeatureExtZba, "experimental-zba");
+ else if (Ext == "zbb")
+ setFeatureBits(RISCV::FeatureExtZbb, "experimental-zbb");
+ else if (Ext == "zbc")
+ setFeatureBits(RISCV::FeatureExtZbc, "experimental-zbc");
+ else if (Ext == "zbe")
+ setFeatureBits(RISCV::FeatureExtZbe, "experimental-zbe");
+ else if (Ext == "zbf")
+ setFeatureBits(RISCV::FeatureExtZbf, "experimental-zbf");
+ else if (Ext == "zbm")
+ setFeatureBits(RISCV::FeatureExtZbm, "experimental-zbm");
+ else if (Ext == "zbp")
+ setFeatureBits(RISCV::FeatureExtZbp, "experimental-zbp");
+ else if (Ext == "zbproposedc")
+ setFeatureBits(RISCV::FeatureExtZbproposedc,
+ "experimental-zbproposedc");
+ else if (Ext == "zbr")
+ setFeatureBits(RISCV::FeatureExtZbr, "experimental-zbr");
+ else if (Ext == "zbs")
+ setFeatureBits(RISCV::FeatureExtZbs, "experimental-zbs");
+ else if (Ext == "zbt")
+ setFeatureBits(RISCV::FeatureExtZbt, "experimental-zbt");
+ else if (Ext == "zfh")
+ setFeatureBits(RISCV::FeatureExtZfh, "experimental-zfh");
+ else if (Ext == "zvamo")
+ setFeatureBits(RISCV::FeatureExtZvamo, "experimental-zvamo");
+ else if (Ext == "zvlsseg")
+ setFeatureBits(RISCV::FeatureStdExtZvlsseg, "experimental-zvlsseg");
+ else
+ return Error(ValueExprLoc, "bad arch string " + Ext);
+ Arch = Arch.drop_until([](char c) { return ::isdigit(c) || c == '_'; });
+ DropFirst = false;
} else
return Error(ValueExprLoc, "bad arch string " + Arch);
- Arch = Arch.drop_front(1);
+ if (DropFirst)
+ Arch = Arch.drop_front(1);
int major = 0;
int minor = 0;
Arch.consumeInteger(10, major);
Arch.consume_front("p");
Arch.consumeInteger(10, minor);
- if (major != 0 || minor != 0) {
- Arch = Arch.drop_until([](char c) { return c == '_' || c == '"'; });
- Arch = Arch.drop_while([](char c) { return c == '_'; });
- }
+ Arch = Arch.drop_while([](char c) { return c == '_'; });
}
}
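
The loop above walks the architecture string one extension at a time: single-letter extensions consume one character, s/x/z multi-letter extensions consume up to the first digit or underscore, and an optional <major>p<minor> version plus any '_' separators are then skipped. Below is a standalone sketch of that tokenisation, assuming the leading rv32/rv64 prefix has already been removed as the elided code before this loop appears to do; the names are illustrative only.

#include <cctype>
#include <iostream>
#include <string>
#include <vector>

struct ExtToken {
  std::string Name;
  int Major = 0;
  int Minor = 0;
};

static std::vector<ExtToken> scanExtensions(const std::string &Arch) {
  std::vector<ExtToken> Out;
  size_t I = 0;
  while (I < Arch.size()) {
    ExtToken T;
    if (Arch[I] == 's' || Arch[I] == 'x' || Arch[I] == 'z') {
      size_t J = I;
      while (J < Arch.size() && !std::isdigit((unsigned char)Arch[J]) &&
             Arch[J] != '_')
        ++J;
      T.Name = Arch.substr(I, J - I); // multi-letter extension name
      I = J;
    } else {
      T.Name = Arch.substr(I, 1);     // single-letter extension
      ++I;
    }
    // Optional version of the form <major>p<minor>.
    while (I < Arch.size() && std::isdigit((unsigned char)Arch[I]))
      T.Major = T.Major * 10 + (Arch[I++] - '0');
    if (I < Arch.size() && Arch[I] == 'p') {
      ++I;
      while (I < Arch.size() && std::isdigit((unsigned char)Arch[I]))
        T.Minor = T.Minor * 10 + (Arch[I++] - '0');
    }
    while (I < Arch.size() && Arch[I] == '_') // skip separators
      ++I;
    Out.push_back(T);
  }
  return Out;
}

int main() {
  for (const ExtToken &T : scanExtensions("i2p0_m2p0_zba0p93_zbb0p93"))
    std::cout << T.Name << ' ' << T.Major << '.' << T.Minor << '\n';
}
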
@@ -2102,6 +2123,38 @@ bool RISCVAsmParser::parseDirectiveAttribute() {
formalArchStr = (Twine(formalArchStr) + "_d2p0").str();
if (getFeatureBits(RISCV::FeatureStdExtC))
formalArchStr = (Twine(formalArchStr) + "_c2p0").str();
+ if (getFeatureBits(RISCV::FeatureStdExtB))
+ formalArchStr = (Twine(formalArchStr) + "_b0p93").str();
+ if (getFeatureBits(RISCV::FeatureStdExtV))
+ formalArchStr = (Twine(formalArchStr) + "_v1p0").str();
+ if (getFeatureBits(RISCV::FeatureExtZfh))
+ formalArchStr = (Twine(formalArchStr) + "_zfh0p1").str();
+ if (getFeatureBits(RISCV::FeatureExtZba))
+ formalArchStr = (Twine(formalArchStr) + "_zba0p93").str();
+ if (getFeatureBits(RISCV::FeatureExtZbb))
+ formalArchStr = (Twine(formalArchStr) + "_zbb0p93").str();
+ if (getFeatureBits(RISCV::FeatureExtZbc))
+ formalArchStr = (Twine(formalArchStr) + "_zbc0p93").str();
+ if (getFeatureBits(RISCV::FeatureExtZbe))
+ formalArchStr = (Twine(formalArchStr) + "_zbe0p93").str();
+ if (getFeatureBits(RISCV::FeatureExtZbf))
+ formalArchStr = (Twine(formalArchStr) + "_zbf0p93").str();
+ if (getFeatureBits(RISCV::FeatureExtZbm))
+ formalArchStr = (Twine(formalArchStr) + "_zbm0p93").str();
+ if (getFeatureBits(RISCV::FeatureExtZbp))
+ formalArchStr = (Twine(formalArchStr) + "_zbp0p93").str();
+ if (getFeatureBits(RISCV::FeatureExtZbproposedc))
+ formalArchStr = (Twine(formalArchStr) + "_zbproposedc0p93").str();
+ if (getFeatureBits(RISCV::FeatureExtZbr))
+ formalArchStr = (Twine(formalArchStr) + "_zbr0p93").str();
+ if (getFeatureBits(RISCV::FeatureExtZbs))
+ formalArchStr = (Twine(formalArchStr) + "_zbs0p93").str();
+ if (getFeatureBits(RISCV::FeatureExtZbt))
+ formalArchStr = (Twine(formalArchStr) + "_zbt0p93").str();
+ if (getFeatureBits(RISCV::FeatureExtZvamo))
+ formalArchStr = (Twine(formalArchStr) + "_zvamo1p0").str();
+ if (getFeatureBits(RISCV::FeatureStdExtZvlsseg))
+ formalArchStr = (Twine(formalArchStr) + "_zvlsseg1p0").str();
getTargetStreamer().emitTextAttribute(Tag, formalArchStr);
}
@@ -2118,12 +2171,12 @@ void RISCVAsmParser::emitToStreamer(MCStreamer &S, const MCInst &Inst) {
S.emitInstruction((Res ? CInst : Inst), getSTI());
}
-void RISCVAsmParser::emitLoadImm(Register DestReg, int64_t Value,
+void RISCVAsmParser::emitLoadImm(MCRegister DestReg, int64_t Value,
MCStreamer &Out) {
RISCVMatInt::InstSeq Seq;
RISCVMatInt::generateInstSeq(Value, isRV64(), Seq);
- Register SrcReg = RISCV::X0;
+ MCRegister SrcReg = RISCV::X0;
for (RISCVMatInt::Inst &Inst : Seq) {
if (Inst.Opc == RISCV::LUI) {
emitToStreamer(
@@ -2149,8 +2202,7 @@ void RISCVAsmParser::emitAuipcInstPair(MCOperand DestReg, MCOperand TmpReg,
// OP DestReg, TmpReg, %pcrel_lo(TmpLabel)
MCContext &Ctx = getContext();
- MCSymbol *TmpLabel = Ctx.createTempSymbol(
- "pcrel_hi", /* AlwaysAddSuffix */ true, /* CanBeUnnamed */ false);
+ MCSymbol *TmpLabel = Ctx.createNamedTempSymbol("pcrel_hi");
Out.emitLabel(TmpLabel);
const RISCVMCExpr *SymbolHi = RISCVMCExpr::create(Symbol, VKHi, Ctx);
@@ -2254,6 +2306,88 @@ void RISCVAsmParser::emitLoadStoreSymbol(MCInst &Inst, unsigned Opcode,
Opcode, IDLoc, Out);
}
+void RISCVAsmParser::emitPseudoExtend(MCInst &Inst, bool SignExtend,
+ int64_t Width, SMLoc IDLoc,
+ MCStreamer &Out) {
+ // The sign/zero extend pseudo-instruction does two shifts, with the shift
+ // amounts dependent on the XLEN.
+ //
+  // The expansion looks like this:
+  //
+  //     SLLI rd, rs, XLEN - Width
+  //     SR[A|L]I rd, rd, XLEN - Width
+ MCOperand DestReg = Inst.getOperand(0);
+ MCOperand SourceReg = Inst.getOperand(1);
+
+ unsigned SecondOpcode = SignExtend ? RISCV::SRAI : RISCV::SRLI;
+ int64_t ShAmt = (isRV64() ? 64 : 32) - Width;
+
+ assert(ShAmt > 0 && "Shift amount must be non-zero.");
+
+ emitToStreamer(Out, MCInstBuilder(RISCV::SLLI)
+ .addOperand(DestReg)
+ .addOperand(SourceReg)
+ .addImm(ShAmt));
+
+ emitToStreamer(Out, MCInstBuilder(SecondOpcode)
+ .addOperand(DestReg)
+ .addOperand(DestReg)
+ .addImm(ShAmt));
+}
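As a quick illustration of the two-shift expansion described in the comment above, here is the same arithmetic in plain C++ for RV64 (a sketch, not part of the patch; the helper names are made up, and the casts assume the usual two's-complement behaviour, guaranteed since C++20):

    #include <cassert>
    #include <cstdint>

    // sext.b rd, rs  ->  slli rd, rs, 56 ; srai rd, rd, 56   (XLEN = 64)
    int64_t sextB(uint64_t Rs) {
      const unsigned ShAmt = 64 - 8;                      // XLEN - Width
      return static_cast<int64_t>(Rs << ShAmt) >> ShAmt;  // SLLI, then SRAI
    }

    // zext.h rd, rs  ->  slli rd, rs, 48 ; srli rd, rd, 48
    uint64_t zextH(uint64_t Rs) {
      const unsigned ShAmt = 64 - 16;
      return (Rs << ShAmt) >> ShAmt;                      // SLLI, then SRLI
    }

    int main() {
      assert(sextB(0xFF) == -1);         // low byte sign-extends to all ones
      assert(zextH(0x1FFFF) == 0xFFFF);  // bits above 15 are cleared
    }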
+
+void RISCVAsmParser::emitVMSGE(MCInst &Inst, unsigned Opcode, SMLoc IDLoc,
+ MCStreamer &Out) {
+ if (Inst.getNumOperands() == 3) {
+ // unmasked va >= x
+ //
+ // pseudoinstruction: vmsge{u}.vx vd, va, x
+ // expansion: vmslt{u}.vx vd, va, x; vmnand.mm vd, vd, vd
+ emitToStreamer(Out, MCInstBuilder(Opcode)
+ .addOperand(Inst.getOperand(0))
+ .addOperand(Inst.getOperand(1))
+ .addOperand(Inst.getOperand(2))
+ .addReg(RISCV::NoRegister));
+ emitToStreamer(Out, MCInstBuilder(RISCV::VMNAND_MM)
+ .addOperand(Inst.getOperand(0))
+ .addOperand(Inst.getOperand(0))
+ .addOperand(Inst.getOperand(0)));
+ } else if (Inst.getNumOperands() == 4) {
+ // masked va >= x, vd != v0
+ //
+ // pseudoinstruction: vmsge{u}.vx vd, va, x, v0.t
+ // expansion: vmslt{u}.vx vd, va, x, v0.t; vmxor.mm vd, vd, v0
+ assert(Inst.getOperand(0).getReg() != RISCV::V0 &&
+ "The destination register should not be V0.");
+ emitToStreamer(Out, MCInstBuilder(Opcode)
+ .addOperand(Inst.getOperand(0))
+ .addOperand(Inst.getOperand(1))
+ .addOperand(Inst.getOperand(2))
+ .addOperand(Inst.getOperand(3)));
+ emitToStreamer(Out, MCInstBuilder(RISCV::VMXOR_MM)
+ .addOperand(Inst.getOperand(0))
+ .addOperand(Inst.getOperand(0))
+ .addReg(RISCV::V0));
+ } else if (Inst.getNumOperands() == 5) {
+ // masked va >= x, vd == v0
+ //
+ // pseudoinstruction: vmsge{u}.vx vd, va, x, v0.t, vt
+ // expansion: vmslt{u}.vx vt, va, x; vmandnot.mm vd, vd, vt
+ assert(Inst.getOperand(0).getReg() == RISCV::V0 &&
+ "The destination register should be V0.");
+ assert(Inst.getOperand(1).getReg() != RISCV::V0 &&
+ "The temporary vector register should not be V0.");
+ emitToStreamer(Out, MCInstBuilder(Opcode)
+ .addOperand(Inst.getOperand(1))
+ .addOperand(Inst.getOperand(2))
+ .addOperand(Inst.getOperand(3))
+ .addOperand(Inst.getOperand(4)));
+ emitToStreamer(Out, MCInstBuilder(RISCV::VMANDNOT_MM)
+ .addOperand(Inst.getOperand(0))
+ .addOperand(Inst.getOperand(0))
+ .addOperand(Inst.getOperand(1)));
+ }
+}
+
bool RISCVAsmParser::checkPseudoAddTPRel(MCInst &Inst,
OperandVector &Operands) {
assert(Inst.getOpcode() == RISCV::PseudoAddTPRel && "Invalid instruction");
@@ -2275,77 +2409,48 @@ std::unique_ptr<RISCVOperand> RISCVAsmParser::defaultMaskRegOp() const {
bool RISCVAsmParser::validateInstruction(MCInst &Inst,
OperandVector &Operands) {
const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
- unsigned TargetFlags =
- (MCID.TSFlags >> RISCV::ConstraintOffset) & RISCV::ConstraintMask;
- if (TargetFlags == RISCV::NoConstraint)
+ unsigned Constraints =
+ (MCID.TSFlags & RISCVII::ConstraintMask) >> RISCVII::ConstraintShift;
+ if (Constraints == RISCVII::NoConstraint)
return false;
unsigned DestReg = Inst.getOperand(0).getReg();
// Operands[1] will be the first operand, DestReg.
SMLoc Loc = Operands[1]->getStartLoc();
- if ((TargetFlags == RISCV::WidenV) || (TargetFlags == RISCV::WidenW) ||
- (TargetFlags == RISCV::SlideUp) || (TargetFlags == RISCV::Vrgather) ||
- (TargetFlags == RISCV::Vcompress)) {
- if (TargetFlags != RISCV::WidenW) {
- unsigned Src2Reg = Inst.getOperand(1).getReg();
- if (DestReg == Src2Reg)
- return Error(Loc, "The destination vector register group cannot overlap"
- " the source vector register group.");
- if (TargetFlags == RISCV::WidenV) {
- // Assume DestReg LMUL is 2 at least for widening/narrowing operations.
- if (DestReg + 1 == Src2Reg)
- return Error(Loc,
- "The destination vector register group cannot overlap"
- " the source vector register group.");
- }
- }
- if (Inst.getOperand(2).isReg()) {
- unsigned Src1Reg = Inst.getOperand(2).getReg();
- if (DestReg == Src1Reg)
- return Error(Loc, "The destination vector register group cannot overlap"
- " the source vector register group.");
- if (TargetFlags == RISCV::WidenV || TargetFlags == RISCV::WidenW) {
- // Assume DestReg LMUL is 2 at least for widening/narrowing operations.
- if (DestReg + 1 == Src1Reg)
- return Error(Loc,
- "The destination vector register group cannot overlap"
- " the source vector register group.");
- }
- }
- if (Inst.getNumOperands() == 4) {
- unsigned MaskReg = Inst.getOperand(3).getReg();
-
- if (DestReg == MaskReg)
- return Error(Loc, "The destination vector register group cannot overlap"
- " the mask register.");
- }
- } else if (TargetFlags == RISCV::Narrow) {
- unsigned Src2Reg = Inst.getOperand(1).getReg();
- if (DestReg == Src2Reg)
+ if (Constraints & RISCVII::VS2Constraint) {
+ unsigned CheckReg = Inst.getOperand(1).getReg();
+ if (DestReg == CheckReg)
return Error(Loc, "The destination vector register group cannot overlap"
" the source vector register group.");
- // Assume Src2Reg LMUL is 2 at least for widening/narrowing operations.
- if (DestReg == Src2Reg + 1)
+ }
+ if ((Constraints & RISCVII::VS1Constraint) && (Inst.getOperand(2).isReg())) {
+ unsigned CheckReg = Inst.getOperand(2).getReg();
+ if (DestReg == CheckReg)
return Error(Loc, "The destination vector register group cannot overlap"
" the source vector register group.");
- } else if (TargetFlags == RISCV::WidenCvt || TargetFlags == RISCV::Iota) {
- unsigned Src2Reg = Inst.getOperand(1).getReg();
- if (DestReg == Src2Reg)
+ }
+ if ((Constraints & RISCVII::VMConstraint) && (DestReg == RISCV::V0)) {
+ // vadc, vsbc are special cases. These instructions have no mask register.
+    // The destination register cannot be V0.
+ unsigned Opcode = Inst.getOpcode();
+ if (Opcode == RISCV::VADC_VVM || Opcode == RISCV::VADC_VXM ||
+ Opcode == RISCV::VADC_VIM || Opcode == RISCV::VSBC_VVM ||
+ Opcode == RISCV::VSBC_VXM || Opcode == RISCV::VFMERGE_VFM ||
+ Opcode == RISCV::VMERGE_VIM || Opcode == RISCV::VMERGE_VVM ||
+ Opcode == RISCV::VMERGE_VXM)
+ return Error(Loc, "The destination vector register group cannot be V0.");
+
+    // Regardless of whether it is the masked or unmasked version, the number
+    // of operands is the same. For example, "viota.m v0, v2" is actually
+    // "viota.m v0, v2, NoRegister". We need to check the last operand to tell
+    // whether the instruction is masked or not.
+ unsigned CheckReg = Inst.getOperand(Inst.getNumOperands() - 1).getReg();
+ assert((CheckReg == RISCV::V0 || CheckReg == RISCV::NoRegister) &&
+ "Unexpected register for mask operand");
+
+ if (DestReg == CheckReg)
return Error(Loc, "The destination vector register group cannot overlap"
- " the source vector register group.");
- if (TargetFlags == RISCV::WidenCvt) {
- // Assume DestReg LMUL is 2 at least for widening/narrowing operations.
- if (DestReg + 1 == Src2Reg)
- return Error(Loc, "The destination vector register group cannot overlap"
- " the source vector register group.");
- }
- if (Inst.getNumOperands() == 3) {
- unsigned MaskReg = Inst.getOperand(2).getReg();
-
- if (DestReg == MaskReg)
- return Error(Loc, "The destination vector register group cannot overlap"
- " the mask register.");
- }
+ " the mask register.");
}
return false;
}
@@ -2359,7 +2464,7 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
default:
break;
case RISCV::PseudoLI: {
- Register Reg = Inst.getOperand(0).getReg();
+ MCRegister Reg = Inst.getOperand(0).getReg();
const MCOperand &Op1 = Inst.getOperand(1);
if (Op1.isExpr()) {
// We must have li reg, %lo(sym) or li reg, %pcrel_lo(sym) or similar.
@@ -2412,6 +2517,9 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
case RISCV::PseudoLD:
emitLoadStoreSymbol(Inst, RISCV::LD, IDLoc, Out, /*HasTmpReg=*/false);
return false;
+ case RISCV::PseudoFLH:
+ emitLoadStoreSymbol(Inst, RISCV::FLH, IDLoc, Out, /*HasTmpReg=*/true);
+ return false;
case RISCV::PseudoFLW:
emitLoadStoreSymbol(Inst, RISCV::FLW, IDLoc, Out, /*HasTmpReg=*/true);
return false;
@@ -2430,6 +2538,9 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
case RISCV::PseudoSD:
emitLoadStoreSymbol(Inst, RISCV::SD, IDLoc, Out, /*HasTmpReg=*/true);
return false;
+ case RISCV::PseudoFSH:
+ emitLoadStoreSymbol(Inst, RISCV::FSH, IDLoc, Out, /*HasTmpReg=*/true);
+ return false;
case RISCV::PseudoFSW:
emitLoadStoreSymbol(Inst, RISCV::FSW, IDLoc, Out, /*HasTmpReg=*/true);
return false;
@@ -2440,6 +2551,72 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
if (checkPseudoAddTPRel(Inst, Operands))
return true;
break;
+ case RISCV::PseudoSEXT_B:
+ emitPseudoExtend(Inst, /*SignExtend=*/true, /*Width=*/8, IDLoc, Out);
+ return false;
+ case RISCV::PseudoSEXT_H:
+ emitPseudoExtend(Inst, /*SignExtend=*/true, /*Width=*/16, IDLoc, Out);
+ return false;
+ case RISCV::PseudoZEXT_H:
+ emitPseudoExtend(Inst, /*SignExtend=*/false, /*Width=*/16, IDLoc, Out);
+ return false;
+ case RISCV::PseudoZEXT_W:
+ emitPseudoExtend(Inst, /*SignExtend=*/false, /*Width=*/32, IDLoc, Out);
+ return false;
+ case RISCV::PseudoVMSGEU_VX:
+ case RISCV::PseudoVMSGEU_VX_M:
+ case RISCV::PseudoVMSGEU_VX_M_T:
+ emitVMSGE(Inst, RISCV::VMSLTU_VX, IDLoc, Out);
+ return false;
+ case RISCV::PseudoVMSGE_VX:
+ case RISCV::PseudoVMSGE_VX_M:
+ case RISCV::PseudoVMSGE_VX_M_T:
+ emitVMSGE(Inst, RISCV::VMSLT_VX, IDLoc, Out);
+ return false;
+ case RISCV::PseudoVMSGE_VI:
+ case RISCV::PseudoVMSLT_VI: {
+    // These instructions are signed, and so is the immediate, so we can
+    // subtract one and change the opcode.
+ int64_t Imm = Inst.getOperand(2).getImm();
+ unsigned Opc = Inst.getOpcode() == RISCV::PseudoVMSGE_VI ? RISCV::VMSGT_VI
+ : RISCV::VMSLE_VI;
+ emitToStreamer(Out, MCInstBuilder(Opc)
+ .addOperand(Inst.getOperand(0))
+ .addOperand(Inst.getOperand(1))
+ .addImm(Imm - 1)
+ .addOperand(Inst.getOperand(3)));
+ return false;
+ }
+ case RISCV::PseudoVMSGEU_VI:
+ case RISCV::PseudoVMSLTU_VI: {
+ int64_t Imm = Inst.getOperand(2).getImm();
+    // Unsigned comparisons are tricky because the immediate is signed. If the
+    // immediate is 0 we can't just subtract one: vmsltu.vi v0, v1, 0 is always
+    // false, but vmsleu.vi v0, v1, -1 is always true. Instead we compare a
+    // register with itself: vmsne.vv (always false) for vmsltu, or vmseq.vv
+    // (always true) for vmsgeu.
+ if (Imm == 0) {
+ unsigned Opc = Inst.getOpcode() == RISCV::PseudoVMSGEU_VI
+ ? RISCV::VMSEQ_VV
+ : RISCV::VMSNE_VV;
+ emitToStreamer(Out, MCInstBuilder(Opc)
+ .addOperand(Inst.getOperand(0))
+ .addOperand(Inst.getOperand(1))
+ .addOperand(Inst.getOperand(1))
+ .addOperand(Inst.getOperand(3)));
+ } else {
+ // Other immediate values can subtract one like signed.
+ unsigned Opc = Inst.getOpcode() == RISCV::PseudoVMSGEU_VI
+ ? RISCV::VMSGTU_VI
+ : RISCV::VMSLEU_VI;
+ emitToStreamer(Out, MCInstBuilder(Opc)
+ .addOperand(Inst.getOperand(0))
+ .addOperand(Inst.getOperand(1))
+ .addImm(Imm - 1)
+ .addOperand(Inst.getOperand(3)));
+ }
+
+ return false;
+ }
}
emitToStreamer(Out, Inst);
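The imm == 0 special case for PseudoVMSGEU_VI/PseudoVMSLTU_VI above comes down to ordinary unsigned-comparison identities. A standalone sketch, with plain C++ scalars standing in for the vector compares:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t X = 123;                           // any element value
      uint64_t Zero = 0, AllOnes = ~uint64_t(0);
      // vmsltu ..., 0: "x < 0" is never true for unsigned x ...
      assert(!(X < Zero));
      // ... and the usual "subtract one" rewrite would compare against -1,
      // i.e. the largest unsigned value, which is always true.
      assert(X <= AllOnes);
      // Hence the expansion compares a register with itself instead:
      assert(!(X != X));  // vmsne.vv -> always false (for vmsltu ..., 0)
      assert(X == X);     // vmseq.vv -> always true  (for vmsgeu ..., 0)
    }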
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 37edc19398a5..623552390f53 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -10,10 +10,9 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/RISCVBaseInfo.h"
#include "MCTargetDesc/RISCVMCTargetDesc.h"
#include "TargetInfo/RISCVTargetInfo.h"
-#include "Utils/RISCVBaseInfo.h"
-#include "llvm/CodeGen/Register.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
@@ -71,7 +70,18 @@ static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, uint64_t RegNo,
if (RegNo >= 32 || (IsRV32E && RegNo >= 16))
return MCDisassembler::Fail;
- Register Reg = RISCV::X0 + RegNo;
+ MCRegister Reg = RISCV::X0 + RegNo;
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo >= 32)
+ return MCDisassembler::Fail;
+
+ MCRegister Reg = RISCV::F0_H + RegNo;
Inst.addOperand(MCOperand::createReg(Reg));
return MCDisassembler::Success;
}
@@ -82,7 +92,7 @@ static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, uint64_t RegNo,
if (RegNo >= 32)
return MCDisassembler::Fail;
- Register Reg = RISCV::F0_F + RegNo;
+ MCRegister Reg = RISCV::F0_F + RegNo;
Inst.addOperand(MCOperand::createReg(Reg));
return MCDisassembler::Success;
}
@@ -93,7 +103,7 @@ static DecodeStatus DecodeFPR32CRegisterClass(MCInst &Inst, uint64_t RegNo,
if (RegNo >= 8) {
return MCDisassembler::Fail;
}
- Register Reg = RISCV::F8_F + RegNo;
+ MCRegister Reg = RISCV::F8_F + RegNo;
Inst.addOperand(MCOperand::createReg(Reg));
return MCDisassembler::Success;
}
@@ -104,7 +114,7 @@ static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, uint64_t RegNo,
if (RegNo >= 32)
return MCDisassembler::Fail;
- Register Reg = RISCV::F0_D + RegNo;
+ MCRegister Reg = RISCV::F0_D + RegNo;
Inst.addOperand(MCOperand::createReg(Reg));
return MCDisassembler::Success;
}
@@ -115,7 +125,7 @@ static DecodeStatus DecodeFPR64CRegisterClass(MCInst &Inst, uint64_t RegNo,
if (RegNo >= 8) {
return MCDisassembler::Fail;
}
- Register Reg = RISCV::F8_D + RegNo;
+ MCRegister Reg = RISCV::F8_D + RegNo;
Inst.addOperand(MCOperand::createReg(Reg));
return MCDisassembler::Success;
}
@@ -146,7 +156,7 @@ static DecodeStatus DecodeGPRCRegisterClass(MCInst &Inst, uint64_t RegNo,
if (RegNo >= 8)
return MCDisassembler::Fail;
- Register Reg = RISCV::X8 + RegNo;
+ MCRegister Reg = RISCV::X8 + RegNo;
Inst.addOperand(MCOperand::createReg(Reg));
return MCDisassembler::Success;
}
@@ -157,14 +167,14 @@ static DecodeStatus DecodeVRRegisterClass(MCInst &Inst, uint64_t RegNo,
if (RegNo >= 32)
return MCDisassembler::Fail;
- Register Reg = RISCV::V0 + RegNo;
+ MCRegister Reg = RISCV::V0 + RegNo;
Inst.addOperand(MCOperand::createReg(Reg));
return MCDisassembler::Success;
}
static DecodeStatus decodeVMaskReg(MCInst &Inst, uint64_t RegNo,
uint64_t Address, const void *Decoder) {
- Register Reg = RISCV::NoRegister;
+ MCRegister Reg = RISCV::NoRegister;
switch (RegNo) {
default:
return MCDisassembler::Fail;
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
index 090132af3585..56991ccf010a 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
@@ -9,9 +9,9 @@
#ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVASMBACKEND_H
#define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVASMBACKEND_H
+#include "MCTargetDesc/RISCVBaseInfo.h"
#include "MCTargetDesc/RISCVFixupKinds.h"
#include "MCTargetDesc/RISCVMCTargetDesc.h"
-#include "Utils/RISCVBaseInfo.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
index 43b1f8b80c5f..fa36234d0f5f 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
@@ -1,3 +1,16 @@
+//===-- RISCVBaseInfo.cpp - Top level definitions for RISCV MC ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone enum definitions for the RISCV target
+// useful for the compiler back-end and the MC libraries.
+//
+//===----------------------------------------------------------------------===//
+
#include "RISCVBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Triple.h"
@@ -6,7 +19,7 @@
namespace llvm {
namespace RISCVSysReg {
#define GET_SysRegsList_IMPL
-#include "RISCVGenSystemOperands.inc"
+#include "RISCVGenSearchableTables.inc"
} // namespace RISCVSysReg
namespace RISCVABI {
@@ -65,7 +78,10 @@ ABI getTargetABI(StringRef ABIName) {
// To avoid the BP value clobbered by a function call, we need to choose a
// callee saved register to save the value. RV32E only has X8 and X9 as callee
// saved registers and X8 will be used as fp. So we choose X9 as bp.
-Register getBPReg() { return RISCV::X9; }
+MCRegister getBPReg() { return RISCV::X9; }
+
+// Returns the register holding shadow call stack pointer.
+MCRegister getSCSPReg() { return RISCV::X18; }
} // namespace RISCVABI
@@ -78,4 +94,49 @@ void validate(const Triple &TT, const FeatureBitset &FeatureBits) {
} // namespace RISCVFeatures
+namespace RISCVVPseudosTable {
+
+#define GET_RISCVVPseudosTable_IMPL
+#include "RISCVGenSearchableTables.inc"
+
+} // namespace RISCVVPseudosTable
+
+void RISCVVType::printVType(unsigned VType, raw_ostream &OS) {
+ RISCVVSEW VSEW = getVSEW(VType);
+ RISCVVLMUL VLMUL = getVLMUL(VType);
+
+ unsigned Sew = 1 << (static_cast<unsigned>(VSEW) + 3);
+ OS << "e" << Sew;
+
+ switch (VLMUL) {
+ case RISCVVLMUL::LMUL_RESERVED:
+ llvm_unreachable("Unexpected LMUL value!");
+ case RISCVVLMUL::LMUL_1:
+ case RISCVVLMUL::LMUL_2:
+ case RISCVVLMUL::LMUL_4:
+ case RISCVVLMUL::LMUL_8: {
+ unsigned LMul = 1 << static_cast<unsigned>(VLMUL);
+ OS << ",m" << LMul;
+ break;
+ }
+ case RISCVVLMUL::LMUL_F2:
+ case RISCVVLMUL::LMUL_F4:
+ case RISCVVLMUL::LMUL_F8: {
+ unsigned LMul = 1 << (8 - static_cast<unsigned>(VLMUL));
+ OS << ",mf" << LMul;
+ break;
+ }
+ }
+
+ if (isTailAgnostic(VType))
+ OS << ",ta";
+ else
+ OS << ",tu";
+
+ if (isMaskAgnostic(VType))
+ OS << ",ma";
+ else
+ OS << ",mu";
+}
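The decode above relies on the vlmul encoding defined in RISCVBaseInfo.h later in this diff: integer multipliers 1, 2, 4 and 8 encode as 0..3, and the fractions 1/8, 1/4 and 1/2 as 5..7. A quick check of the arithmetic (a standalone sketch, not LLVM code):

    #include <cassert>

    int main() {
      // Integer LMUL: the multiplier is 1 << value.
      assert((1 << 0) == 1 && (1 << 1) == 2 && (1 << 3) == 8);  // m1, m2, m8
      // Fractional LMUL: the denominator is 1 << (8 - value).
      assert((1 << (8 - 7)) == 2);  // LMUL_F2 -> mf2
      assert((1 << (8 - 6)) == 4);  // LMUL_F4 -> mf4
      assert((1 << (8 - 5)) == 8);  // LMUL_F8 -> mf8
    }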
+
} // namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
new file mode 100644
index 000000000000..6c9f860c204c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -0,0 +1,406 @@
+//===-- RISCVBaseInfo.h - Top level definitions for RISCV MC ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone enum definitions for the RISCV target
+// useful for the compiler back-end and the MC libraries.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVBASEINFO_H
+#define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVBASEINFO_H
+
+#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/MachineValueType.h"
+
+namespace llvm {
+
+// RISCVII - This namespace holds all of the target specific flags that
+// instruction info tracks. All definitions must match RISCVInstrFormats.td.
+namespace RISCVII {
+enum {
+ InstFormatPseudo = 0,
+ InstFormatR = 1,
+ InstFormatR4 = 2,
+ InstFormatI = 3,
+ InstFormatS = 4,
+ InstFormatB = 5,
+ InstFormatU = 6,
+ InstFormatJ = 7,
+ InstFormatCR = 8,
+ InstFormatCI = 9,
+ InstFormatCSS = 10,
+ InstFormatCIW = 11,
+ InstFormatCL = 12,
+ InstFormatCS = 13,
+ InstFormatCA = 14,
+ InstFormatCB = 15,
+ InstFormatCJ = 16,
+ InstFormatOther = 17,
+
+ InstFormatMask = 31,
+
+ ConstraintShift = 5,
+ ConstraintMask = 0b111 << ConstraintShift,
+
+ VLMulShift = ConstraintShift + 3,
+ VLMulMask = 0b111 << VLMulShift,
+
+ // Do we need to add a dummy mask op when converting RVV Pseudo to MCInst.
+ HasDummyMaskOpShift = VLMulShift + 3,
+ HasDummyMaskOpMask = 1 << HasDummyMaskOpShift,
+
+  // Does this instruction only update element 0 of the destination register.
+ WritesElement0Shift = HasDummyMaskOpShift + 1,
+ WritesElement0Mask = 1 << WritesElement0Shift,
+
+ // Does this instruction have a merge operand that must be removed when
+ // converting to MCInst. It will be the first explicit use operand. Used by
+ // RVV Pseudos.
+ HasMergeOpShift = WritesElement0Shift + 1,
+ HasMergeOpMask = 1 << HasMergeOpShift,
+
+ // Does this instruction have a SEW operand. It will be the last explicit
+ // operand. Used by RVV Pseudos.
+ HasSEWOpShift = HasMergeOpShift + 1,
+ HasSEWOpMask = 1 << HasSEWOpShift,
+
+ // Does this instruction have a VL operand. It will be the second to last
+ // explicit operand. Used by RVV Pseudos.
+ HasVLOpShift = HasSEWOpShift + 1,
+ HasVLOpMask = 1 << HasVLOpShift,
+};
+
+// Match with the definitions in RISCVInstrFormatsV.td
+enum RVVConstraintType {
+ NoConstraint = 0,
+ VS2Constraint = 0b001,
+ VS1Constraint = 0b010,
+ VMConstraint = 0b100,
+};
+
+// RISC-V Specific Machine Operand Flags
+enum {
+ MO_None = 0,
+ MO_CALL = 1,
+ MO_PLT = 2,
+ MO_LO = 3,
+ MO_HI = 4,
+ MO_PCREL_LO = 5,
+ MO_PCREL_HI = 6,
+ MO_GOT_HI = 7,
+ MO_TPREL_LO = 8,
+ MO_TPREL_HI = 9,
+ MO_TPREL_ADD = 10,
+ MO_TLS_GOT_HI = 11,
+ MO_TLS_GD_HI = 12,
+
+ // Used to differentiate between target-specific "direct" flags and "bitmask"
+ // flags. A machine operand can only have one "direct" flag, but can have
+ // multiple "bitmask" flags.
+ MO_DIRECT_FLAG_MASK = 15
+};
+} // namespace RISCVII
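A small sketch of how these packed TSFlags fields are read back, mirroring the shift/mask pairs above (the TSFlags value itself is invented for illustration):

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned ConstraintShift = 5, VLMulShift = ConstraintShift + 3;
      const uint64_t ConstraintMask = 0b111u << ConstraintShift;
      const uint64_t VLMulMask = 0b111u << VLMulShift;
      // Hypothetical instruction: VS2Constraint|VMConstraint (0b101), LMUL=2.
      uint64_t TSFlags = (0b101u << ConstraintShift) | (2u << VLMulShift);
      assert(((TSFlags & ConstraintMask) >> ConstraintShift) == 0b101);
      assert(((TSFlags & VLMulMask) >> VLMulShift) == 2);
    }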
+
+namespace RISCVOp {
+enum OperandType : unsigned {
+ OPERAND_FIRST_RISCV_IMM = MCOI::OPERAND_FIRST_TARGET,
+ OPERAND_UIMM4 = OPERAND_FIRST_RISCV_IMM,
+ OPERAND_UIMM5,
+ OPERAND_UIMM12,
+ OPERAND_SIMM12,
+ OPERAND_UIMM20,
+ OPERAND_UIMMLOG2XLEN,
+ OPERAND_LAST_RISCV_IMM = OPERAND_UIMMLOG2XLEN
+};
+} // namespace RISCVOp
+
+// Describes the predecessor/successor bits used in the FENCE instruction.
+namespace RISCVFenceField {
+enum FenceField {
+ I = 8,
+ O = 4,
+ R = 2,
+ W = 1
+};
+}
+
+// Describes the supported floating point rounding mode encodings.
+namespace RISCVFPRndMode {
+enum RoundingMode {
+ RNE = 0,
+ RTZ = 1,
+ RDN = 2,
+ RUP = 3,
+ RMM = 4,
+ DYN = 7,
+ Invalid
+};
+
+inline static StringRef roundingModeToString(RoundingMode RndMode) {
+ switch (RndMode) {
+ default:
+ llvm_unreachable("Unknown floating point rounding mode");
+ case RISCVFPRndMode::RNE:
+ return "rne";
+ case RISCVFPRndMode::RTZ:
+ return "rtz";
+ case RISCVFPRndMode::RDN:
+ return "rdn";
+ case RISCVFPRndMode::RUP:
+ return "rup";
+ case RISCVFPRndMode::RMM:
+ return "rmm";
+ case RISCVFPRndMode::DYN:
+ return "dyn";
+ }
+}
+
+inline static RoundingMode stringToRoundingMode(StringRef Str) {
+ return StringSwitch<RoundingMode>(Str)
+ .Case("rne", RISCVFPRndMode::RNE)
+ .Case("rtz", RISCVFPRndMode::RTZ)
+ .Case("rdn", RISCVFPRndMode::RDN)
+ .Case("rup", RISCVFPRndMode::RUP)
+ .Case("rmm", RISCVFPRndMode::RMM)
+ .Case("dyn", RISCVFPRndMode::DYN)
+ .Default(RISCVFPRndMode::Invalid);
+}
+
+inline static bool isValidRoundingMode(unsigned Mode) {
+ switch (Mode) {
+ default:
+ return false;
+ case RISCVFPRndMode::RNE:
+ case RISCVFPRndMode::RTZ:
+ case RISCVFPRndMode::RDN:
+ case RISCVFPRndMode::RUP:
+ case RISCVFPRndMode::RMM:
+ case RISCVFPRndMode::DYN:
+ return true;
+ }
+}
+} // namespace RISCVFPRndMode
+
+namespace RISCVSysReg {
+struct SysReg {
+ const char *Name;
+ unsigned Encoding;
+ const char *AltName;
+ // FIXME: add these additional fields when needed.
+ // Privilege Access: Read, Write, Read-Only.
+ // unsigned ReadWrite;
+ // Privilege Mode: User, System or Machine.
+ // unsigned Mode;
+ // Check field name.
+ // unsigned Extra;
+ // Register number without the privilege bits.
+ // unsigned Number;
+ FeatureBitset FeaturesRequired;
+ bool isRV32Only;
+
+ bool haveRequiredFeatures(FeatureBitset ActiveFeatures) const {
+ // Not in 32-bit mode.
+ if (isRV32Only && ActiveFeatures[RISCV::Feature64Bit])
+ return false;
+ // No required feature associated with the system register.
+ if (FeaturesRequired.none())
+ return true;
+ return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
+ }
+};
+
+#define GET_SysRegsList_DECL
+#include "RISCVGenSearchableTables.inc"
+} // end namespace RISCVSysReg
+
+namespace RISCVABI {
+
+enum ABI {
+ ABI_ILP32,
+ ABI_ILP32F,
+ ABI_ILP32D,
+ ABI_ILP32E,
+ ABI_LP64,
+ ABI_LP64F,
+ ABI_LP64D,
+ ABI_Unknown
+};
+
+// Returns the target ABI, or else a StringError if the requested ABIName is
+// not supported for the given TT and FeatureBits combination.
+ABI computeTargetABI(const Triple &TT, FeatureBitset FeatureBits,
+ StringRef ABIName);
+
+ABI getTargetABI(StringRef ABIName);
+
+// Returns the register used to hold the stack pointer after realignment.
+MCRegister getBPReg();
+
+// Returns the register holding shadow call stack pointer.
+MCRegister getSCSPReg();
+
+} // namespace RISCVABI
+
+namespace RISCVFeatures {
+
+// Validates if the given combination of features are valid for the target
+// triple. Exits with report_fatal_error if not.
+void validate(const Triple &TT, const FeatureBitset &FeatureBits);
+
+} // namespace RISCVFeatures
+
+namespace RISCVVMVTs {
+
+constexpr MVT vint8mf8_t = MVT::nxv1i8;
+constexpr MVT vint8mf4_t = MVT::nxv2i8;
+constexpr MVT vint8mf2_t = MVT::nxv4i8;
+constexpr MVT vint8m1_t = MVT::nxv8i8;
+constexpr MVT vint8m2_t = MVT::nxv16i8;
+constexpr MVT vint8m4_t = MVT::nxv32i8;
+constexpr MVT vint8m8_t = MVT::nxv64i8;
+
+constexpr MVT vint16mf4_t = MVT::nxv1i16;
+constexpr MVT vint16mf2_t = MVT::nxv2i16;
+constexpr MVT vint16m1_t = MVT::nxv4i16;
+constexpr MVT vint16m2_t = MVT::nxv8i16;
+constexpr MVT vint16m4_t = MVT::nxv16i16;
+constexpr MVT vint16m8_t = MVT::nxv32i16;
+
+constexpr MVT vint32mf2_t = MVT::nxv1i32;
+constexpr MVT vint32m1_t = MVT::nxv2i32;
+constexpr MVT vint32m2_t = MVT::nxv4i32;
+constexpr MVT vint32m4_t = MVT::nxv8i32;
+constexpr MVT vint32m8_t = MVT::nxv16i32;
+
+constexpr MVT vint64m1_t = MVT::nxv1i64;
+constexpr MVT vint64m2_t = MVT::nxv2i64;
+constexpr MVT vint64m4_t = MVT::nxv4i64;
+constexpr MVT vint64m8_t = MVT::nxv8i64;
+
+constexpr MVT vfloat16mf4_t = MVT::nxv1f16;
+constexpr MVT vfloat16mf2_t = MVT::nxv2f16;
+constexpr MVT vfloat16m1_t = MVT::nxv4f16;
+constexpr MVT vfloat16m2_t = MVT::nxv8f16;
+constexpr MVT vfloat16m4_t = MVT::nxv16f16;
+constexpr MVT vfloat16m8_t = MVT::nxv32f16;
+
+constexpr MVT vfloat32mf2_t = MVT::nxv1f32;
+constexpr MVT vfloat32m1_t = MVT::nxv2f32;
+constexpr MVT vfloat32m2_t = MVT::nxv4f32;
+constexpr MVT vfloat32m4_t = MVT::nxv8f32;
+constexpr MVT vfloat32m8_t = MVT::nxv16f32;
+
+constexpr MVT vfloat64m1_t = MVT::nxv1f64;
+constexpr MVT vfloat64m2_t = MVT::nxv2f64;
+constexpr MVT vfloat64m4_t = MVT::nxv4f64;
+constexpr MVT vfloat64m8_t = MVT::nxv8f64;
+
+constexpr MVT vbool1_t = MVT::nxv64i1;
+constexpr MVT vbool2_t = MVT::nxv32i1;
+constexpr MVT vbool4_t = MVT::nxv16i1;
+constexpr MVT vbool8_t = MVT::nxv8i1;
+constexpr MVT vbool16_t = MVT::nxv4i1;
+constexpr MVT vbool32_t = MVT::nxv2i1;
+constexpr MVT vbool64_t = MVT::nxv1i1;
+
+} // namespace RISCVVMVTs
+
+enum class RISCVVSEW {
+ SEW_8 = 0,
+ SEW_16,
+ SEW_32,
+ SEW_64,
+ SEW_128,
+ SEW_256,
+ SEW_512,
+ SEW_1024,
+};
+
+enum class RISCVVLMUL {
+ LMUL_1 = 0,
+ LMUL_2,
+ LMUL_4,
+ LMUL_8,
+ LMUL_RESERVED,
+ LMUL_F8,
+ LMUL_F4,
+ LMUL_F2
+};
+
+namespace RISCVVType {
+// Is this a SEW value that can be encoded into the VTYPE format.
+inline static bool isValidSEW(unsigned SEW) {
+ return isPowerOf2_32(SEW) && SEW >= 8 && SEW <= 1024;
+}
+
+// Is this a LMUL value that can be encoded into the VTYPE format.
+inline static bool isValidLMUL(unsigned LMUL, bool Fractional) {
+ return isPowerOf2_32(LMUL) && LMUL <= 8 && (!Fractional || LMUL != 1);
+}
+
+// Encode VTYPE into the binary format used by the VSETVLI instruction, which
+// is used by our MC layer representation.
+//
+// Bits | Name | Description
+// -----+------------+------------------------------------------------
+// 7 | vma | Vector mask agnostic
+// 6 | vta | Vector tail agnostic
+// 5:3 | vsew[2:0] | Standard element width (SEW) setting
+// 2:0 | vlmul[2:0] | Vector register group multiplier (LMUL) setting
+inline static unsigned encodeVTYPE(RISCVVLMUL VLMUL, RISCVVSEW VSEW,
+ bool TailAgnostic, bool MaskAgnostic) {
+ unsigned VLMULBits = static_cast<unsigned>(VLMUL);
+ unsigned VSEWBits = static_cast<unsigned>(VSEW);
+ unsigned VTypeI = (VSEWBits << 3) | (VLMULBits & 0x7);
+ if (TailAgnostic)
+ VTypeI |= 0x40;
+ if (MaskAgnostic)
+ VTypeI |= 0x80;
+
+ return VTypeI;
+}
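A worked example of the encoding above (recomputed standalone, not via the LLVM helpers): e32, m2, tail agnostic, mask undisturbed.

    #include <cassert>

    int main() {
      unsigned VSEW = 2;   // SEW_32: SEW = 8 << 2 = 32
      unsigned VLMUL = 1;  // LMUL_2
      unsigned VType = (VSEW << 3) | (VLMUL & 0x7);  // vsew[2:0] | vlmul[2:0]
      VType |= 0x40;                                 // vta set: tail agnostic
      // vma left clear: mask undisturbed ("mu").
      assert(VType == 0x51);  // printed by printVType as "e32,m2,ta,mu"
    }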
+
+inline static RISCVVLMUL getVLMUL(unsigned VType) {
+ unsigned VLMUL = VType & 0x7;
+ return static_cast<RISCVVLMUL>(VLMUL);
+}
+
+inline static RISCVVSEW getVSEW(unsigned VType) {
+ unsigned VSEW = (VType >> 3) & 0x7;
+ return static_cast<RISCVVSEW>(VSEW);
+}
+
+inline static bool isTailAgnostic(unsigned VType) { return VType & 0x40; }
+
+inline static bool isMaskAgnostic(unsigned VType) { return VType & 0x80; }
+
+void printVType(unsigned VType, raw_ostream &OS);
+
+} // namespace RISCVVType
+
+namespace RISCVVPseudosTable {
+
+struct PseudoInfo {
+#include "MCTargetDesc/RISCVBaseInfo.h"
+ uint16_t Pseudo;
+ uint16_t BaseInstr;
+};
+
+using namespace RISCV;
+
+#define GET_RISCVVPseudosTable_DECL
+#include "RISCVGenSearchableTables.inc"
+
+} // end namespace RISCVVPseudosTable
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
index 079dc919928a..7df454be8729 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
@@ -11,9 +11,9 @@
//===----------------------------------------------------------------------===//
#include "RISCVELFStreamer.h"
-#include "MCTargetDesc/RISCVAsmBackend.h"
+#include "RISCVAsmBackend.h"
+#include "RISCVBaseInfo.h"
#include "RISCVMCTargetDesc.h"
-#include "Utils/RISCVBaseInfo.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
index eae3e13dbe40..5f8d6e137518 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
@@ -11,8 +11,8 @@
//===----------------------------------------------------------------------===//
#include "RISCVInstPrinter.h"
-#include "MCTargetDesc/RISCVMCExpr.h"
-#include "Utils/RISCVBaseInfo.h"
+#include "RISCVBaseInfo.h"
+#include "RISCVMCExpr.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
@@ -102,6 +102,24 @@ void RISCVInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
MO.getExpr()->print(O, &MAI);
}
+void RISCVInstPrinter::printBranchOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNo);
+ if (!MO.isImm())
+ return printOperand(MI, OpNo, STI, O);
+
+ if (PrintBranchImmAsAddress) {
+ uint64_t Target = Address + MO.getImm();
+ if (!STI.hasFeature(RISCV::Feature64Bit))
+ Target &= 0xffffffff;
+ O << formatHex(Target);
+ } else {
+ O << MO.getImm();
+ }
+}
+
void RISCVInstPrinter::printCSRSystemRegister(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -147,18 +165,12 @@ void RISCVInstPrinter::printAtomicMemOp(const MCInst *MI, unsigned OpNo,
O << "(";
printRegName(O, MO.getReg());
O << ")";
- return;
}
void RISCVInstPrinter::printVTypeI(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
unsigned Imm = MI->getOperand(OpNo).getImm();
- unsigned Sew = (Imm >> 2) & 0x7;
- unsigned Lmul = Imm & 0x3;
-
- Lmul = 0x1 << Lmul;
- Sew = 0x1 << (Sew + 3);
- O << "e" << Sew << ",m" << Lmul;
+ RISCVVType::printVType(Imm, O);
}
void RISCVInstPrinter::printVMaskReg(const MCInst *MI, unsigned OpNo,
@@ -174,15 +186,6 @@ void RISCVInstPrinter::printVMaskReg(const MCInst *MI, unsigned OpNo,
O << ".t";
}
-void RISCVInstPrinter::printSImm5Plus1(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- const MCOperand &MO = MI->getOperand(OpNo);
-
- assert(MO.isImm() && "printSImm5Plus1 can only print constant operands");
- O << MO.getImm() + 1;
-}
-
const char *RISCVInstPrinter::getRegisterName(unsigned RegNo) {
return getRegisterName(RegNo, ArchRegNames ? RISCV::NoRegAltName
: RISCV::ABIRegAltName);
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h
index fdaa00c5f8eb..d078ead2c8ad 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h
@@ -32,6 +32,8 @@ public:
void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O, const char *Modifier = nullptr);
+ void printBranchOperand(const MCInst *MI, uint64_t Address, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printCSRSystemRegister(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printFenceArg(const MCInst *MI, unsigned OpNo,
@@ -44,10 +46,9 @@ public:
raw_ostream &O);
void printVMaskReg(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printSImm5Plus1(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O);
// Autogenerated by tblgen.
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
void printInstruction(const MCInst *MI, uint64_t Address,
const MCSubtargetInfo &STI, raw_ostream &O);
bool printAliasInstr(const MCInst *MI, uint64_t Address,
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index 816206c477df..b299541939ec 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -10,12 +10,11 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/RISCVBaseInfo.h"
#include "MCTargetDesc/RISCVFixupKinds.h"
#include "MCTargetDesc/RISCVMCExpr.h"
#include "MCTargetDesc/RISCVMCTargetDesc.h"
-#include "Utils/RISCVBaseInfo.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/Register.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
@@ -84,6 +83,12 @@ public:
unsigned getVMaskReg(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+
+private:
+ FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
+ void
+ verifyInstructionPredicates(const MCInst &MI,
+ const FeatureBitset &AvailableFeatures) const;
};
} // end anonymous namespace
@@ -106,7 +111,7 @@ void RISCVMCCodeEmitter::expandFunctionCall(const MCInst &MI, raw_ostream &OS,
const MCSubtargetInfo &STI) const {
MCInst TmpInst;
MCOperand Func;
- Register Ra;
+ MCRegister Ra;
if (MI.getOpcode() == RISCV::PseudoTAIL) {
Func = MI.getOperand(0);
Ra = RISCV::X6;
@@ -185,6 +190,9 @@ void RISCVMCCodeEmitter::expandAddTPRel(const MCInst &MI, raw_ostream &OS,
void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
+ verifyInstructionPredicates(MI,
+ computeAvailableFeatures(STI.getFeatureBits()));
+
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
// Get byte count of instruction.
unsigned Size = Desc.getSize();
@@ -397,4 +405,5 @@ unsigned RISCVMCCodeEmitter::getVMaskReg(const MCInst &MI, unsigned OpNo,
}
}
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "RISCVGenMCCodeEmitter.inc"
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
index 2a6f372e50be..8ce2184c7a41 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
@@ -13,7 +13,6 @@
#include "RISCVMCExpr.h"
#include "MCTargetDesc/RISCVAsmBackend.h"
-#include "RISCV.h"
#include "RISCVFixupKinds.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmLayout.h"
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
index a474224e1a4e..093118518db6 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -11,14 +11,13 @@
//===----------------------------------------------------------------------===//
#include "RISCVMCTargetDesc.h"
+#include "RISCVBaseInfo.h"
#include "RISCVELFStreamer.h"
#include "RISCVInstPrinter.h"
#include "RISCVMCAsmInfo.h"
#include "RISCVTargetStreamer.h"
#include "TargetInfo/RISCVTargetInfo.h"
-#include "Utils/RISCVBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/CodeGen/Register.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -56,7 +55,7 @@ static MCAsmInfo *createRISCVMCAsmInfo(const MCRegisterInfo &MRI,
const MCTargetOptions &Options) {
MCAsmInfo *MAI = new RISCVMCAsmInfo(TT);
- Register SP = MRI.getDwarfRegNum(RISCV::X2, true);
+ MCRegister SP = MRI.getDwarfRegNum(RISCV::X2, true);
MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(nullptr, SP, 0);
MAI->addInitialFrameState(Inst);
@@ -68,7 +67,7 @@ static MCSubtargetInfo *createRISCVMCSubtargetInfo(const Triple &TT,
std::string CPUName = std::string(CPU);
if (CPUName.empty())
CPUName = TT.isArch64Bit() ? "generic-rv64" : "generic-rv32";
- return createRISCVMCSubtargetInfoImpl(TT, CPUName, FS);
+ return createRISCVMCSubtargetInfoImpl(TT, CPUName, /*TuneCPU*/ CPUName, FS);
}
static MCInstPrinter *createRISCVMCInstPrinter(const Triple &T,
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/Utils/RISCVMatInt.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
index f390ddb89e3c..1f3dead61011 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/Utils/RISCVMatInt.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
@@ -8,10 +8,8 @@
#include "RISCVMatInt.h"
#include "MCTargetDesc/RISCVMCTargetDesc.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/MachineValueType.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/Support/MathExtras.h"
-#include <cstdint>
namespace llvm {
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/Utils/RISCVMatInt.h b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h
index b12ae2eade99..17ca57458b49 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/Utils/RISCVMatInt.h
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h
@@ -9,12 +9,11 @@
#ifndef LLVM_LIB_TARGET_RISCV_MATINT_H
#define LLVM_LIB_TARGET_RISCV_MATINT_H
-#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/MachineValueType.h"
#include <cstdint>
namespace llvm {
+class APInt;
namespace RISCVMatInt {
struct Inst {
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
index 54a2fb288579..72434a15bedb 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
@@ -11,7 +11,7 @@
//===----------------------------------------------------------------------===//
#include "RISCVTargetStreamer.h"
-#include "RISCVSubtarget.h"
+#include "RISCVMCTargetDesc.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/RISCVAttributes.h"
@@ -60,6 +60,38 @@ void RISCVTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) {
Arch += "_d2p0";
if (STI.hasFeature(RISCV::FeatureStdExtC))
Arch += "_c2p0";
+ if (STI.hasFeature(RISCV::FeatureStdExtB))
+ Arch += "_b0p93";
+ if (STI.hasFeature(RISCV::FeatureStdExtV))
+ Arch += "_v1p0";
+ if (STI.hasFeature(RISCV::FeatureExtZfh))
+ Arch += "_zfh0p1";
+ if (STI.hasFeature(RISCV::FeatureExtZba))
+ Arch += "_zba0p93";
+ if (STI.hasFeature(RISCV::FeatureExtZbb))
+ Arch += "_zbb0p93";
+ if (STI.hasFeature(RISCV::FeatureExtZbc))
+ Arch += "_zbc0p93";
+ if (STI.hasFeature(RISCV::FeatureExtZbe))
+ Arch += "_zbe0p93";
+ if (STI.hasFeature(RISCV::FeatureExtZbf))
+ Arch += "_zbf0p93";
+ if (STI.hasFeature(RISCV::FeatureExtZbm))
+ Arch += "_zbm0p93";
+ if (STI.hasFeature(RISCV::FeatureExtZbp))
+ Arch += "_zbp0p93";
+ if (STI.hasFeature(RISCV::FeatureExtZbproposedc))
+ Arch += "_zbproposedc0p93";
+ if (STI.hasFeature(RISCV::FeatureExtZbr))
+ Arch += "_zbr0p93";
+ if (STI.hasFeature(RISCV::FeatureExtZbs))
+ Arch += "_zbs0p93";
+ if (STI.hasFeature(RISCV::FeatureExtZbt))
+ Arch += "_zbt0p93";
+ if (STI.hasFeature(RISCV::FeatureExtZvamo))
+ Arch += "_zvamo1p0";
+ if (STI.hasFeature(RISCV::FeatureStdExtZvlsseg))
+ Arch += "_zvlsseg1p0";
emitTextAttribute(RISCVAttrs::ARCH, Arch);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCV.h b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCV.h
index 9baa2cc2741a..2538d9992de7 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCV.h
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCV.h
@@ -14,7 +14,7 @@
#ifndef LLVM_LIB_TARGET_RISCV_RISCV_H
#define LLVM_LIB_TARGET_RISCV_RISCV_H
-#include "Utils/RISCVBaseInfo.h"
+#include "MCTargetDesc/RISCVBaseInfo.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -46,6 +46,9 @@ void initializeRISCVExpandPseudoPass(PassRegistry &);
FunctionPass *createRISCVExpandAtomicPseudoPass();
void initializeRISCVExpandAtomicPseudoPass(PassRegistry &);
+FunctionPass *createRISCVCleanupVSETVLIPass();
+void initializeRISCVCleanupVSETVLIPass(PassRegistry &);
+
InstructionSelector *createRISCVInstructionSelector(const RISCVTargetMachine &,
RISCVSubtarget &,
RISCVRegisterBankInfo &);
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCV.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCV.td
index 57e7c41c4271..83811dadc9ab 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCV.td
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCV.td
@@ -41,6 +41,14 @@ def HasStdExtD : Predicate<"Subtarget->hasStdExtD()">,
AssemblerPredicate<(all_of FeatureStdExtD),
"'D' (Double-Precision Floating-Point)">;
+def FeatureExtZfh
+ : SubtargetFeature<"experimental-zfh", "HasStdExtZfh", "true",
+ "'Zfh' (Half-Precision Floating-Point)",
+ [FeatureStdExtF]>;
+def HasStdExtZfh : Predicate<"Subtarget->hasStdExtZfh()">,
+ AssemblerPredicate<(all_of FeatureExtZfh),
+ "'Zfh' (Half-Precision Floating-Point)">;
+
def FeatureStdExtC
: SubtargetFeature<"c", "HasStdExtC", "true",
"'C' (Compressed Instructions)">;
@@ -48,6 +56,14 @@ def HasStdExtC : Predicate<"Subtarget->hasStdExtC()">,
AssemblerPredicate<(all_of FeatureStdExtC),
"'C' (Compressed Instructions)">;
+def FeatureExtZba
+ : SubtargetFeature<"experimental-zba", "HasStdExtZba", "true",
+ "'Zba' (Address calculation 'B' Instructions)">;
+def HasStdExtZba : Predicate<"Subtarget->hasStdExtZba()">,
+ AssemblerPredicate<(all_of FeatureExtZba),
+ "'Zba' (Address calculation 'B' Instructions)">;
+def NotHasStdExtZba : Predicate<"!Subtarget->hasStdExtZba()">;
+
def FeatureExtZbb
: SubtargetFeature<"experimental-zbb", "HasStdExtZbb", "true",
"'Zbb' (Base 'B' Instructions)">;
@@ -115,7 +131,9 @@ def HasStdExtZbt : Predicate<"Subtarget->hasStdExtZbt()">,
// subextensions. They should be enabled if either has been specified.
def HasStdExtZbbOrZbp
: Predicate<"Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbp()">,
- AssemblerPredicate<(any_of FeatureExtZbb, FeatureExtZbp)>;
+ AssemblerPredicate<(any_of FeatureExtZbb, FeatureExtZbp),
+ "'Zbb' (Base 'B' Instructions) or "
+ "'Zbp' (Permutation 'B' Instructions)">;
def FeatureExtZbproposedc
: SubtargetFeature<"experimental-zbproposedc", "HasStdExtZbproposedc", "true",
@@ -127,7 +145,8 @@ def HasStdExtZbproposedc : Predicate<"Subtarget->hasStdExtZbproposedc()">,
def FeatureStdExtB
: SubtargetFeature<"experimental-b", "HasStdExtB", "true",
"'B' (Bit Manipulation Instructions)",
- [FeatureExtZbb,
+ [FeatureExtZba,
+ FeatureExtZbb,
FeatureExtZbc,
FeatureExtZbe,
FeatureExtZbf,
@@ -145,16 +164,30 @@ def FeatureNoRVCHints
"Disable RVC Hint Instructions.">;
def HasRVCHints : Predicate<"Subtarget->enableRVCHintInstrs()">,
AssemblerPredicate<(all_of(not FeatureNoRVCHints)),
- "RVC Hint Instructions">;
+ "RVC Hint Instructions">;
def FeatureStdExtV
: SubtargetFeature<"experimental-v", "HasStdExtV", "true",
- "'V' (Vector Instructions)",
- [FeatureStdExtF]>;
+ "'V' (Vector Instructions)">;
def HasStdExtV : Predicate<"Subtarget->hasStdExtV()">,
AssemblerPredicate<(all_of FeatureStdExtV),
"'V' (Vector Instructions)">;
+def FeatureStdExtZvlsseg
+ : SubtargetFeature<"experimental-zvlsseg", "HasStdExtZvlsseg", "true",
+ "'Zvlsseg' (Vector segment load/store instructions)",
+ [FeatureStdExtV]>;
+def HasStdExtZvlsseg : Predicate<"Subtarget->hasStdExtZvlsseg()">,
+ AssemblerPredicate<(all_of FeatureStdExtZvlsseg),
+ "'Zvlsseg' (Vector segment load/store instructions)">;
+def FeatureExtZvamo
+ : SubtargetFeature<"experimental-zvamo", "HasStdExtZvamo", "true",
+ "'Zvamo'(Vector AMO Operations)",
+ [FeatureStdExtV]>;
+def HasStdExtZvamo : Predicate<"Subtarget->hasStdExtZvamo()">,
+ AssemblerPredicate<(all_of FeatureExtZvamo),
+ "'Zvamo'(Vector AMO Operations)">;
+
def Feature64Bit
: SubtargetFeature<"64bit", "HasRV64", "true", "Implements RV64">;
def IsRV64 : Predicate<"Subtarget->is64Bit()">,
@@ -164,8 +197,8 @@ def IsRV32 : Predicate<"!Subtarget->is64Bit()">,
AssemblerPredicate<(all_of (not Feature64Bit)),
"RV32I Base Instruction Set">;
+defvar RV32 = DefaultMode;
def RV64 : HwMode<"+64bit">;
-def RV32 : HwMode<"-64bit">;
def FeatureRV32E
: SubtargetFeature<"e", "IsRV32E", "true",
@@ -200,31 +233,44 @@ include "RISCVRegisterInfo.td"
include "RISCVCallingConv.td"
include "RISCVInstrInfo.td"
include "RISCVRegisterBanks.td"
-include "RISCVSchedRocket32.td"
-include "RISCVSchedRocket64.td"
+include "RISCVSchedRocket.td"
+include "RISCVSchedSiFive7.td"
//===----------------------------------------------------------------------===//
// RISC-V processors supported.
//===----------------------------------------------------------------------===//
def : ProcessorModel<"generic-rv32", NoSchedModel, []>;
-
def : ProcessorModel<"generic-rv64", NoSchedModel, [Feature64Bit]>;
-def : ProcessorModel<"rocket-rv32", Rocket32Model, []>;
-
-def : ProcessorModel<"rocket-rv64", Rocket64Model, [Feature64Bit]>;
-
-def : ProcessorModel<"sifive-e31", Rocket32Model, [FeatureStdExtM,
- FeatureStdExtA,
- FeatureStdExtC]>;
-
-def : ProcessorModel<"sifive-u54", Rocket64Model, [Feature64Bit,
- FeatureStdExtM,
- FeatureStdExtA,
- FeatureStdExtF,
- FeatureStdExtD,
- FeatureStdExtC]>;
+def : ProcessorModel<"rocket-rv32", RocketModel, []>;
+def : ProcessorModel<"rocket-rv64", RocketModel, [Feature64Bit]>;
+
+def : ProcessorModel<"sifive-7-rv32", SiFive7Model, []>;
+def : ProcessorModel<"sifive-7-rv64", SiFive7Model, [Feature64Bit]>;
+
+def : ProcessorModel<"sifive-e31", RocketModel, [FeatureStdExtM,
+ FeatureStdExtA,
+ FeatureStdExtC]>;
+
+def : ProcessorModel<"sifive-u54", RocketModel, [Feature64Bit,
+ FeatureStdExtM,
+ FeatureStdExtA,
+ FeatureStdExtF,
+ FeatureStdExtD,
+ FeatureStdExtC]>;
+
+def : ProcessorModel<"sifive-e76", SiFive7Model, [FeatureStdExtM,
+ FeatureStdExtA,
+ FeatureStdExtF,
+ FeatureStdExtC]>;
+
+def : ProcessorModel<"sifive-u74", SiFive7Model, [Feature64Bit,
+ FeatureStdExtM,
+ FeatureStdExtA,
+ FeatureStdExtF,
+ FeatureStdExtD,
+ FeatureStdExtC]>;
//===----------------------------------------------------------------------===//
// Define the RISC-V target.
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 8955994b1c2e..0a915cbcc1af 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -198,7 +198,9 @@ void RISCVAsmPrinter::emitAttributes() {
StringRef CPU = TM.getTargetCPU();
StringRef FS = TM.getTargetFeatureString();
const RISCVTargetMachine &RTM = static_cast<const RISCVTargetMachine &>(TM);
- const RISCVSubtarget STI(TT, CPU, FS, /*ABIName=*/"", RTM);
+  /* TuneCPU doesn't impact the emission of ELF attributes; ELF attributes only
+     care about arch-related features, so we can set TuneCPU to CPU. */
+ const RISCVSubtarget STI(TT, CPU, /*TuneCPU=*/CPU, FS, /*ABIName=*/"", RTM);
RTS.emitTargetAttributes(STI);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVCallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVCallLowering.cpp
index c63a84739c4a..d265f3a12b7f 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVCallLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVCallLowering.cpp
@@ -22,8 +22,8 @@ RISCVCallLowering::RISCVCallLowering(const RISCVTargetLowering &TLI)
: CallLowering(&TLI) {}
bool RISCVCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
- const Value *Val,
- ArrayRef<Register> VRegs) const {
+ const Value *Val, ArrayRef<Register> VRegs,
+ FunctionLoweringInfo &FLI) const {
MachineInstrBuilder Ret = MIRBuilder.buildInstrNoInsert(RISCV::PseudoRET);
@@ -34,9 +34,10 @@ bool RISCVCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
return true;
}
-bool RISCVCallLowering::lowerFormalArguments(
- MachineIRBuilder &MIRBuilder, const Function &F,
- ArrayRef<ArrayRef<Register>> VRegs) const {
+bool RISCVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+ const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs,
+ FunctionLoweringInfo &FLI) const {
if (F.arg_empty())
return true;
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVCallLowering.h b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVCallLowering.h
index 7ce074a61f0a..cd7fc4c76123 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVCallLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVCallLowering.h
@@ -28,10 +28,12 @@ public:
RISCVCallLowering(const RISCVTargetLowering &TLI);
bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val,
- ArrayRef<Register> VRegs) const override;
+ ArrayRef<Register> VRegs,
+ FunctionLoweringInfo &FLI) const override;
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
- ArrayRef<ArrayRef<Register>> VRegs) const override;
+ ArrayRef<ArrayRef<Register>> VRegs,
+ FunctionLoweringInfo &FLI) const override;
bool lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const override;
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVCleanupVSETVLI.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVCleanupVSETVLI.cpp
new file mode 100644
index 000000000000..6a12f99b8903
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVCleanupVSETVLI.cpp
@@ -0,0 +1,131 @@
+//===- RISCVCleanupVSETVLI.cpp - Cleanup unneeded VSETVLI instructions ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a function pass that removes duplicate vsetvli
+// instructions within a basic block.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVSubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-cleanup-vsetvli"
+#define RISCV_CLEANUP_VSETVLI_NAME "RISCV Cleanup VSETVLI pass"
+
+namespace {
+
+class RISCVCleanupVSETVLI : public MachineFunctionPass {
+public:
+ static char ID;
+
+ RISCVCleanupVSETVLI() : MachineFunctionPass(ID) {
+ initializeRISCVCleanupVSETVLIPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::IsSSA);
+ }
+
+ // This pass modifies the program, but does not modify the CFG
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override { return RISCV_CLEANUP_VSETVLI_NAME; }
+};
+
+} // end anonymous namespace
+
+char RISCVCleanupVSETVLI::ID = 0;
+
+INITIALIZE_PASS(RISCVCleanupVSETVLI, DEBUG_TYPE,
+ RISCV_CLEANUP_VSETVLI_NAME, false, false)
+
+bool RISCVCleanupVSETVLI::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+ bool Changed = false;
+ MachineInstr *PrevVSETVLI = nullptr;
+
+ for (auto MII = MBB.begin(), MIE = MBB.end(); MII != MIE;) {
+ MachineInstr &MI = *MII++;
+
+ if (MI.getOpcode() != RISCV::PseudoVSETVLI) {
+ if (PrevVSETVLI &&
+ (MI.isCall() || MI.modifiesRegister(RISCV::VL) ||
+ MI.modifiesRegister(RISCV::VTYPE))) {
+ // Old VL/VTYPE is overwritten.
+ PrevVSETVLI = nullptr;
+ }
+ continue;
+ }
+
+ // If we don't have a previous VSETVLI or the VL output isn't dead, we
+ // can't remove this VSETVLI.
+ if (!PrevVSETVLI || !MI.getOperand(0).isDead()) {
+ PrevVSETVLI = &MI;
+ continue;
+ }
+
+ Register PrevAVLReg = PrevVSETVLI->getOperand(1).getReg();
+ Register AVLReg = MI.getOperand(1).getReg();
+ int64_t PrevVTYPEImm = PrevVSETVLI->getOperand(2).getImm();
+ int64_t VTYPEImm = MI.getOperand(2).getImm();
+
+ // Does this VSETVLI use the same AVL register and VTYPE immediate?
+ if (PrevAVLReg != AVLReg || PrevVTYPEImm != VTYPEImm) {
+ PrevVSETVLI = &MI;
+ continue;
+ }
+
+ // If the AVLReg is X0 we need to look at the output VL of both VSETVLIs.
+ if (AVLReg == RISCV::X0) {
+ Register PrevOutVL = PrevVSETVLI->getOperand(0).getReg();
+ Register OutVL = MI.getOperand(0).getReg();
+ // We can't remove if the previous VSETVLI left VL unchanged and the
+ // current instruction is setting it to VLMAX. Without knowing the VL
+ // before the previous instruction we don't know if this is a change.
+ if (PrevOutVL == RISCV::X0 && OutVL != RISCV::X0) {
+ PrevVSETVLI = &MI;
+ continue;
+ }
+ }
+
+ // This VSETVLI is redundant, remove it.
+ MI.eraseFromParent();
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+bool RISCVCleanupVSETVLI::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ // Skip if the vector extension is not enabled.
+ const RISCVSubtarget &ST = MF.getSubtarget<RISCVSubtarget>();
+ if (!ST.hasStdExtV())
+ return false;
+
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : MF)
+ Changed |= runOnMachineBasicBlock(MBB);
+
+ return Changed;
+}
+
+/// Returns an instance of the Cleanup VSETVLI pass.
+FunctionPass *llvm::createRISCVCleanupVSETVLIPass() {
+ return new RISCVCleanupVSETVLI();
+}
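The per-block conditions in runOnMachineBasicBlock reduce to a small predicate. A sketch under simplifying assumptions (plain structs instead of MachineInstr, X0 as a stand-in constant), not part of the pass itself:

    #include <cassert>
    #include <cstdint>

    struct VSetVLI { unsigned OutVL, AVLReg; int64_t VTypeImm; };
    constexpr unsigned X0 = 0;  // stand-in for RISCV::X0

    // Can Cur be removed, given the closest preceding vsetvli Prev?
    bool isRedundant(const VSetVLI &Prev, const VSetVLI &Cur, bool CurVLDead) {
      if (!CurVLDead)                         // its VL result is still used
        return false;
      if (Prev.AVLReg != Cur.AVLReg || Prev.VTypeImm != Cur.VTypeImm)
        return false;                         // different AVL or VTYPE
      if (Cur.AVLReg == X0 && Prev.OutVL == X0 && Cur.OutVL != X0)
        return false;                         // "keep VL" vs. "set to VLMAX"
      return true;
    }

    int main() {
      VSetVLI A{/*OutVL=*/5, /*AVLReg=*/10, /*VTypeImm=*/0x51};
      VSetVLI B{/*OutVL=*/6, /*AVLReg=*/10, /*VTypeImm=*/0x51};
      assert(isRedundant(A, B, /*CurVLDead=*/true));
    }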
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index 504355fb8bf8..5f50892ca886 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -59,6 +59,9 @@ private:
bool expandLoadTLSGDAddress(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI);
+ bool expandVSetVL(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
+ bool expandVMSET_VMCLR(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, unsigned Opcode);
};
char RISCVExpandPseudo::ID = 0;
@@ -99,6 +102,26 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB,
return expandLoadTLSIEAddress(MBB, MBBI, NextMBBI);
case RISCV::PseudoLA_TLS_GD:
return expandLoadTLSGDAddress(MBB, MBBI, NextMBBI);
+ case RISCV::PseudoVSETVLI:
+ return expandVSetVL(MBB, MBBI);
+ case RISCV::PseudoVMCLR_M_B1:
+ case RISCV::PseudoVMCLR_M_B2:
+ case RISCV::PseudoVMCLR_M_B4:
+ case RISCV::PseudoVMCLR_M_B8:
+ case RISCV::PseudoVMCLR_M_B16:
+ case RISCV::PseudoVMCLR_M_B32:
+ case RISCV::PseudoVMCLR_M_B64:
+ // vmclr.m vd => vmxor.mm vd, vd, vd
+ return expandVMSET_VMCLR(MBB, MBBI, RISCV::VMXOR_MM);
+ case RISCV::PseudoVMSET_M_B1:
+ case RISCV::PseudoVMSET_M_B2:
+ case RISCV::PseudoVMSET_M_B4:
+ case RISCV::PseudoVMSET_M_B8:
+ case RISCV::PseudoVMSET_M_B16:
+ case RISCV::PseudoVMSET_M_B32:
+ case RISCV::PseudoVMSET_M_B64:
+ // vmset.m vd => vmxnor.mm vd, vd, vd
+ return expandVMSET_VMCLR(MBB, MBBI, RISCV::VMXNOR_MM);
}
return false;
@@ -188,6 +211,41 @@ bool RISCVExpandPseudo::expandLoadTLSGDAddress(
RISCV::ADDI);
}
+bool RISCVExpandPseudo::expandVSetVL(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) {
+ assert(MBBI->getNumOperands() == 5 && "Unexpected instruction format");
+
+ DebugLoc DL = MBBI->getDebugLoc();
+
+ assert(MBBI->getOpcode() == RISCV::PseudoVSETVLI &&
+ "Unexpected pseudo instruction");
+ const MCInstrDesc &Desc = TII->get(RISCV::VSETVLI);
+ assert(Desc.getNumOperands() == 3 && "Unexpected instruction format");
+
+ Register DstReg = MBBI->getOperand(0).getReg();
+ bool DstIsDead = MBBI->getOperand(0).isDead();
+ BuildMI(MBB, MBBI, DL, Desc)
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .add(MBBI->getOperand(1)) // VL
+ .add(MBBI->getOperand(2)); // VType
+
+ MBBI->eraseFromParent(); // The pseudo instruction is gone now.
+ return true;
+}
+
+bool RISCVExpandPseudo::expandVMSET_VMCLR(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned Opcode) {
+ DebugLoc DL = MBBI->getDebugLoc();
+ Register DstReg = MBBI->getOperand(0).getReg();
+ const MCInstrDesc &Desc = TII->get(Opcode);
+ BuildMI(MBB, MBBI, DL, Desc, DstReg)
+ .addReg(DstReg, RegState::Undef)
+ .addReg(DstReg, RegState::Undef);
+ MBBI->eraseFromParent(); // The pseudo instruction is gone now.
+ return true;
+}
+
} // end of anonymous namespace
INITIALIZE_PASS(RISCVExpandPseudo, "riscv-expand-pseudo",
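The vmclr.m/vmset.m expansions above rest on two mask identities: XOR-ing a value with itself always yields all zeros, and XNOR-ing it with itself yields all ones, which is why vd can serve as both source operands even when its contents are undef. A minimal sketch of those identities, in plain C++ detached from the backend:

#include <cassert>
#include <cstdint>

int main() {
  // Each value stands in for a chunk of the mask register vd.
  const uint64_t Lanes[] = {0x0, 0x1, 0xdeadbeef, ~0ULL};
  for (uint64_t V : Lanes) {
    assert((V ^ V) == 0);         // vmclr.m vd  ==  vmxor.mm  vd, vd, vd
    assert((~(V ^ V)) == ~0ULL);  // vmset.m vd  ==  vmxnor.mm vd, vd, vd
  }
  return 0;
}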
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 43adc7426c79..564d97f47d9e 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -23,6 +23,105 @@
using namespace llvm;
+// For now we use x18, a.k.a. s2, as the pointer to the shadow call stack.
+// Users should explicitly pass -ffixed-x18 and must not use x18 in their asm.
+static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL) {
+ if (!MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack))
+ return;
+
+ const auto &STI = MF.getSubtarget<RISCVSubtarget>();
+ Register RAReg = STI.getRegisterInfo()->getRARegister();
+
+ // Do not save RA to the SCS if it's not saved to the regular stack,
+ // i.e. RA is not at risk of being overwritten.
+ std::vector<CalleeSavedInfo> &CSI = MF.getFrameInfo().getCalleeSavedInfo();
+ if (std::none_of(CSI.begin(), CSI.end(),
+ [&](CalleeSavedInfo &CSR) { return CSR.getReg() == RAReg; }))
+ return;
+
+ Register SCSPReg = RISCVABI::getSCSPReg();
+
+ auto &Ctx = MF.getFunction().getContext();
+ if (!STI.isRegisterReservedByUser(SCSPReg)) {
+ Ctx.diagnose(DiagnosticInfoUnsupported{
+ MF.getFunction(), "x18 not reserved by user for Shadow Call Stack."});
+ return;
+ }
+
+ const auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
+ if (RVFI->useSaveRestoreLibCalls(MF)) {
+ Ctx.diagnose(DiagnosticInfoUnsupported{
+ MF.getFunction(),
+ "Shadow Call Stack cannot be combined with Save/Restore LibCalls."});
+ return;
+ }
+
+ const RISCVInstrInfo *TII = STI.getInstrInfo();
+ bool IsRV64 = STI.hasFeature(RISCV::Feature64Bit);
+ int64_t SlotSize = STI.getXLen() / 8;
+ // Store return address to shadow call stack
+ // s[w|d] ra, 0(s2)
+ // addi s2, s2, [4|8]
+ BuildMI(MBB, MI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
+ .addReg(RAReg)
+ .addReg(SCSPReg)
+ .addImm(0);
+ BuildMI(MBB, MI, DL, TII->get(RISCV::ADDI))
+ .addReg(SCSPReg, RegState::Define)
+ .addReg(SCSPReg)
+ .addImm(SlotSize);
+}
+
+static void emitSCSEpilogue(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL) {
+ if (!MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack))
+ return;
+
+ const auto &STI = MF.getSubtarget<RISCVSubtarget>();
+ Register RAReg = STI.getRegisterInfo()->getRARegister();
+
+ // See emitSCSPrologue() above.
+ std::vector<CalleeSavedInfo> &CSI = MF.getFrameInfo().getCalleeSavedInfo();
+ if (std::none_of(CSI.begin(), CSI.end(),
+ [&](CalleeSavedInfo &CSR) { return CSR.getReg() == RAReg; }))
+ return;
+
+ Register SCSPReg = RISCVABI::getSCSPReg();
+
+ auto &Ctx = MF.getFunction().getContext();
+ if (!STI.isRegisterReservedByUser(SCSPReg)) {
+ Ctx.diagnose(DiagnosticInfoUnsupported{
+ MF.getFunction(), "x18 not reserved by user for Shadow Call Stack."});
+ return;
+ }
+
+ const auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
+ if (RVFI->useSaveRestoreLibCalls(MF)) {
+ Ctx.diagnose(DiagnosticInfoUnsupported{
+ MF.getFunction(),
+ "Shadow Call Stack cannot be combined with Save/Restore LibCalls."});
+ return;
+ }
+
+ const RISCVInstrInfo *TII = STI.getInstrInfo();
+ bool IsRV64 = STI.hasFeature(RISCV::Feature64Bit);
+ int64_t SlotSize = STI.getXLen() / 8;
+ // Load return address from shadow call stack
+ // l[w|d] ra, -[4|8](s2)
+ // addi s2, s2, -[4|8]
+ BuildMI(MBB, MI, DL, TII->get(IsRV64 ? RISCV::LD : RISCV::LW))
+ .addReg(RAReg, RegState::Define)
+ .addReg(SCSPReg)
+ .addImm(-SlotSize);
+ BuildMI(MBB, MI, DL, TII->get(RISCV::ADDI))
+ .addReg(SCSPReg, RegState::Define)
+ .addReg(SCSPReg)
+ .addImm(-SlotSize);
+}
+
// Get the ID of the libcall used for spilling and restoring callee saved
// registers. The ID is representative of the number of registers saved or
// restored by the libcall, except it is zero-indexed - ID 0 corresponds to a
@@ -39,7 +138,7 @@ static int getLibCallID(const MachineFunction &MF,
// RISCVRegisterInfo::hasReservedSpillSlot assigns negative frame indexes to
// registers which can be saved by libcall.
if (CS.getFrameIdx() < 0)
- MaxReg = std::max(MaxReg.id(), CS.getReg());
+ MaxReg = std::max(MaxReg.id(), CS.getReg().id());
if (MaxReg == RISCV::NoRegister)
return -1;
@@ -136,18 +235,12 @@ bool RISCVFrameLowering::hasBP(const MachineFunction &MF) const {
// Determines the size of the frame and maximum call frame size.
void RISCVFrameLowering::determineFrameLayout(MachineFunction &MF) const {
MachineFrameInfo &MFI = MF.getFrameInfo();
- const RISCVRegisterInfo *RI = STI.getRegisterInfo();
// Get the number of bytes to allocate from the FrameInfo.
uint64_t FrameSize = MFI.getStackSize();
// Get the alignment.
Align StackAlign = getStackAlign();
- if (RI->needsStackRealignment(MF)) {
- Align MaxStackAlign = std::max(StackAlign, MFI.getMaxAlign());
- FrameSize += (MaxStackAlign.value() - StackAlign.value());
- StackAlign = MaxStackAlign;
- }
// Set Max Call Frame Size
uint64_t MaxCallSize = alignTo(MFI.getMaxCallFrameSize(), StackAlign);
@@ -222,15 +315,23 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
Register SPReg = getSPReg(STI);
Register BPReg = RISCVABI::getBPReg();
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc DL;
+
+ // All calls are tail calls in GHC calling conv, and functions have no
+ // prologue/epilogue.
+ if (MF.getFunction().getCallingConv() == CallingConv::GHC)
+ return;
+
+ // Emit prologue for shadow call stack.
+ emitSCSPrologue(MF, MBB, MBBI, DL);
+
// Since spillCalleeSavedRegisters may have inserted a libcall, skip past
// any instructions marked as FrameSetup
while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
++MBBI;
- // Debug location must be unknown since the first debug location is used
- // to determine the end of the prologue.
- DebugLoc DL;
-
// Determine the correct frame layout
determineFrameLayout(MF);
@@ -398,6 +499,11 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
Register FPReg = getFPReg(STI);
Register SPReg = getSPReg(STI);
+ // All calls are tail calls in GHC calling conv, and functions have no
+ // prologue/epilogue.
+ if (MF.getFunction().getCallingConv() == CallingConv::GHC)
+ return;
+
// Get the insert location for the epilogue. If there were no terminators in
// the block, get the last instruction.
MachineBasicBlock::iterator MBBI = MBB.end();
@@ -457,11 +563,14 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
// Deallocate stack
adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackSize, MachineInstr::FrameDestroy);
+
+ // Emit epilogue for shadow call stack.
+ emitSCSEpilogue(MF, MBB, MBBI, DL);
}
-int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF,
- int FI,
- Register &FrameReg) const {
+StackOffset
+RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
const auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
@@ -513,7 +622,7 @@ int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF,
Offset += RVFI->getLibCallStackSize();
}
}
- return Offset;
+ return StackOffset::getFixed(Offset);
}
void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF,
@@ -547,14 +656,14 @@ void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF,
for (unsigned i = 0; CSRegs[i]; ++i)
SavedRegs.set(CSRegs[i]);
- if (MF.getSubtarget<RISCVSubtarget>().hasStdExtD() ||
- MF.getSubtarget<RISCVSubtarget>().hasStdExtF()) {
+ if (MF.getSubtarget<RISCVSubtarget>().hasStdExtF()) {
// If interrupt is enabled, this list contains all FP registers.
const MCPhysReg * Regs = MF.getRegInfo().getCalleeSavedRegs();
for (unsigned i = 0; Regs[i]; ++i)
- if (RISCV::FPR32RegClass.contains(Regs[i]) ||
+ if (RISCV::FPR16RegClass.contains(Regs[i]) ||
+ RISCV::FPR32RegClass.contains(Regs[i]) ||
RISCV::FPR64RegClass.contains(Regs[i]))
SavedRegs.set(Regs[i]);
}
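The shadow-call-stack prologue and epilogue introduced above amount to a push and a pop of the return address through a dedicated pointer (s2/x18), with the slot width equal to XLEN/8. The sketch below models that behaviour with an ordinary array cursor; the names and the fixed stack depth are hypothetical and only meant to show the LIFO discipline, not the backend API.

#include <cassert>
#include <cstdint>

static uintptr_t ShadowStack[64];      // backing storage (size is arbitrary here)
static uintptr_t *SCSP = ShadowStack;  // stands in for the s2/x18 SCS pointer

// Prologue:  s[w|d] ra, 0(s2) ; addi s2, s2, XLEN/8
static void scsPush(uintptr_t RA) {
  *SCSP = RA;
  ++SCSP;
}

// Epilogue:  l[w|d] ra, -(XLEN/8)(s2) ; addi s2, s2, -(XLEN/8)
static uintptr_t scsPop() {
  --SCSP;
  return *SCSP;
}

int main() {
  scsPush(0x1000);             // outer frame saves its return address
  scsPush(0x2000);             // nested frame saves its return address
  assert(scsPop() == 0x2000);  // restored in LIFO order
  assert(scsPop() == 0x1000);
  return 0;
}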
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.h
index 1517c847a04c..889b9ce2e1a9 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.h
@@ -14,6 +14,7 @@
#define LLVM_LIB_TARGET_RISCV_RISCVFRAMELOWERING_H
#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/Support/TypeSize.h"
namespace llvm {
class RISCVSubtarget;
@@ -29,8 +30,8 @@ public:
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
- int getFrameIndexReference(const MachineFunction &MF, int FI,
- Register &FrameReg) const override;
+ StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const override;
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS) const override;
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 7570385e38e3..7b0f38671f06 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -12,8 +12,9 @@
#include "RISCVISelDAGToDAG.h"
#include "MCTargetDesc/RISCVMCTargetDesc.h"
-#include "Utils/RISCVMatInt.h"
+#include "MCTargetDesc/RISCVMatInt.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
@@ -48,15 +49,439 @@ static SDNode *selectImm(SelectionDAG *CurDAG, const SDLoc &DL, int64_t Imm,
return Result;
}
-// Returns true if the Node is an ISD::AND with a constant argument. If so,
-// set Mask to that constant value.
-static bool isConstantMask(SDNode *Node, uint64_t &Mask) {
- if (Node->getOpcode() == ISD::AND &&
- Node->getOperand(1).getOpcode() == ISD::Constant) {
- Mask = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
- return true;
+static RISCVVLMUL getLMUL(EVT VT) {
+ switch (VT.getSizeInBits().getKnownMinValue() / 8) {
+ default:
+ llvm_unreachable("Invalid LMUL.");
+ case 1:
+ return RISCVVLMUL::LMUL_F8;
+ case 2:
+ return RISCVVLMUL::LMUL_F4;
+ case 4:
+ return RISCVVLMUL::LMUL_F2;
+ case 8:
+ return RISCVVLMUL::LMUL_1;
+ case 16:
+ return RISCVVLMUL::LMUL_2;
+ case 32:
+ return RISCVVLMUL::LMUL_4;
+ case 64:
+ return RISCVVLMUL::LMUL_8;
}
- return false;
+}
+
+static unsigned getSubregIndexByEVT(EVT VT, unsigned Index) {
+ RISCVVLMUL LMUL = getLMUL(VT);
+ if (LMUL == RISCVVLMUL::LMUL_F8 || LMUL == RISCVVLMUL::LMUL_F4 ||
+ LMUL == RISCVVLMUL::LMUL_F2 || LMUL == RISCVVLMUL::LMUL_1) {
+ static_assert(RISCV::sub_vrm1_7 == RISCV::sub_vrm1_0 + 7,
+ "Unexpected subreg numbering");
+ return RISCV::sub_vrm1_0 + Index;
+ } else if (LMUL == RISCVVLMUL::LMUL_2) {
+ static_assert(RISCV::sub_vrm2_3 == RISCV::sub_vrm2_0 + 3,
+ "Unexpected subreg numbering");
+ return RISCV::sub_vrm2_0 + Index;
+ } else if (LMUL == RISCVVLMUL::LMUL_4) {
+ static_assert(RISCV::sub_vrm4_1 == RISCV::sub_vrm4_0 + 1,
+ "Unexpected subreg numbering");
+ return RISCV::sub_vrm4_0 + Index;
+ }
+ llvm_unreachable("Invalid vector type.");
+}
+
+static SDValue createTupleImpl(SelectionDAG &CurDAG, ArrayRef<SDValue> Regs,
+ unsigned RegClassID, unsigned SubReg0) {
+ assert(Regs.size() >= 2 && Regs.size() <= 8);
+
+ SDLoc DL(Regs[0]);
+ SmallVector<SDValue, 8> Ops;
+
+ Ops.push_back(CurDAG.getTargetConstant(RegClassID, DL, MVT::i32));
+
+ for (unsigned I = 0; I < Regs.size(); ++I) {
+ Ops.push_back(Regs[I]);
+ Ops.push_back(CurDAG.getTargetConstant(SubReg0 + I, DL, MVT::i32));
+ }
+ SDNode *N =
+ CurDAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops);
+ return SDValue(N, 0);
+}
+
+static SDValue createM1Tuple(SelectionDAG &CurDAG, ArrayRef<SDValue> Regs,
+ unsigned NF) {
+ static const unsigned RegClassIDs[] = {
+ RISCV::VRN2M1RegClassID, RISCV::VRN3M1RegClassID, RISCV::VRN4M1RegClassID,
+ RISCV::VRN5M1RegClassID, RISCV::VRN6M1RegClassID, RISCV::VRN7M1RegClassID,
+ RISCV::VRN8M1RegClassID};
+
+ return createTupleImpl(CurDAG, Regs, RegClassIDs[NF - 2], RISCV::sub_vrm1_0);
+}
+
+static SDValue createM2Tuple(SelectionDAG &CurDAG, ArrayRef<SDValue> Regs,
+ unsigned NF) {
+ static const unsigned RegClassIDs[] = {RISCV::VRN2M2RegClassID,
+ RISCV::VRN3M2RegClassID,
+ RISCV::VRN4M2RegClassID};
+
+ return createTupleImpl(CurDAG, Regs, RegClassIDs[NF - 2], RISCV::sub_vrm2_0);
+}
+
+static SDValue createM4Tuple(SelectionDAG &CurDAG, ArrayRef<SDValue> Regs,
+ unsigned NF) {
+ return createTupleImpl(CurDAG, Regs, RISCV::VRN2M4RegClassID,
+ RISCV::sub_vrm4_0);
+}
+
+static SDValue createTuple(SelectionDAG &CurDAG, ArrayRef<SDValue> Regs,
+ unsigned NF, RISCVVLMUL LMUL) {
+ switch (LMUL) {
+ default:
+ llvm_unreachable("Invalid LMUL.");
+ case RISCVVLMUL::LMUL_F8:
+ case RISCVVLMUL::LMUL_F4:
+ case RISCVVLMUL::LMUL_F2:
+ case RISCVVLMUL::LMUL_1:
+ return createM1Tuple(CurDAG, Regs, NF);
+ case RISCVVLMUL::LMUL_2:
+ return createM2Tuple(CurDAG, Regs, NF);
+ case RISCVVLMUL::LMUL_4:
+ return createM4Tuple(CurDAG, Regs, NF);
+ }
+}
+
+void RISCVDAGToDAGISel::selectVLSEG(SDNode *Node, unsigned IntNo,
+ bool IsStrided) {
+ SDLoc DL(Node);
+ unsigned NF = Node->getNumValues() - 1;
+ EVT VT = Node->getValueType(0);
+ unsigned ScalarSize = VT.getScalarSizeInBits();
+ MVT XLenVT = Subtarget->getXLenVT();
+ RISCVVLMUL LMUL = getLMUL(VT);
+ SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
+ SmallVector<SDValue, 5> Operands;
+ Operands.push_back(Node->getOperand(2)); // Base pointer.
+ if (IsStrided) {
+ Operands.push_back(Node->getOperand(3)); // Stride.
+ Operands.push_back(Node->getOperand(4)); // VL.
+ } else {
+ Operands.push_back(Node->getOperand(3)); // VL.
+ }
+ Operands.push_back(SEW);
+ Operands.push_back(Node->getOperand(0)); // Chain.
+ const RISCVZvlssegTable::RISCVZvlsseg *P = RISCVZvlssegTable::getPseudo(
+ IntNo, ScalarSize, static_cast<unsigned>(LMUL),
+ static_cast<unsigned>(RISCVVLMUL::LMUL_1));
+ SDNode *Load =
+ CurDAG->getMachineNode(P->Pseudo, DL, MVT::Untyped, MVT::Other, Operands);
+ SDValue SuperReg = SDValue(Load, 0);
+ for (unsigned I = 0; I < NF; ++I)
+ ReplaceUses(SDValue(Node, I),
+ CurDAG->getTargetExtractSubreg(getSubregIndexByEVT(VT, I), DL,
+ VT, SuperReg));
+
+ ReplaceUses(SDValue(Node, NF), SDValue(Load, 1));
+ CurDAG->RemoveDeadNode(Node);
+}
+
+void RISCVDAGToDAGISel::selectVLSEGMask(SDNode *Node, unsigned IntNo,
+ bool IsStrided) {
+ SDLoc DL(Node);
+ unsigned NF = Node->getNumValues() - 1;
+ EVT VT = Node->getValueType(0);
+ unsigned ScalarSize = VT.getScalarSizeInBits();
+ MVT XLenVT = Subtarget->getXLenVT();
+ RISCVVLMUL LMUL = getLMUL(VT);
+ SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
+ SmallVector<SDValue, 8> Regs(Node->op_begin() + 2, Node->op_begin() + 2 + NF);
+ SDValue MaskedOff = createTuple(*CurDAG, Regs, NF, LMUL);
+ SmallVector<SDValue, 7> Operands;
+ Operands.push_back(MaskedOff);
+ Operands.push_back(Node->getOperand(NF + 2)); // Base pointer.
+ if (IsStrided) {
+ Operands.push_back(Node->getOperand(NF + 3)); // Stride.
+ Operands.push_back(Node->getOperand(NF + 4)); // Mask.
+ Operands.push_back(Node->getOperand(NF + 5)); // VL.
+ } else {
+ Operands.push_back(Node->getOperand(NF + 3)); // Mask.
+ Operands.push_back(Node->getOperand(NF + 4)); // VL.
+ }
+ Operands.push_back(SEW);
+ Operands.push_back(Node->getOperand(0)); // Chain.
+ const RISCVZvlssegTable::RISCVZvlsseg *P = RISCVZvlssegTable::getPseudo(
+ IntNo, ScalarSize, static_cast<unsigned>(LMUL),
+ static_cast<unsigned>(RISCVVLMUL::LMUL_1));
+ SDNode *Load =
+ CurDAG->getMachineNode(P->Pseudo, DL, MVT::Untyped, MVT::Other, Operands);
+ SDValue SuperReg = SDValue(Load, 0);
+ for (unsigned I = 0; I < NF; ++I)
+ ReplaceUses(SDValue(Node, I),
+ CurDAG->getTargetExtractSubreg(getSubregIndexByEVT(VT, I), DL,
+ VT, SuperReg));
+
+ ReplaceUses(SDValue(Node, NF), SDValue(Load, 1));
+ CurDAG->RemoveDeadNode(Node);
+}
+
+void RISCVDAGToDAGISel::selectVLSEGFF(SDNode *Node) {
+ SDLoc DL(Node);
+ unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ unsigned NF = Node->getNumValues() - 2; // Do not count Chain and Glue.
+ EVT VT = Node->getValueType(0);
+ unsigned ScalarSize = VT.getScalarSizeInBits();
+ MVT XLenVT = Subtarget->getXLenVT();
+ RISCVVLMUL LMUL = getLMUL(VT);
+ SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
+ SmallVector<SDValue, 5> Operands;
+ Operands.push_back(Node->getOperand(2)); // Base pointer.
+ Operands.push_back(Node->getOperand(3)); // VL.
+ Operands.push_back(SEW);
+ Operands.push_back(Node->getOperand(0)); // Chain.
+ const RISCVZvlssegTable::RISCVZvlsseg *P = RISCVZvlssegTable::getPseudo(
+ IntNo, ScalarSize, static_cast<unsigned>(LMUL),
+ static_cast<unsigned>(RISCVVLMUL::LMUL_1));
+ SDNode *Load = CurDAG->getMachineNode(P->Pseudo, DL, MVT::Untyped, MVT::Other,
+ MVT::Glue, Operands);
+ SDValue SuperReg = SDValue(Load, 0);
+ for (unsigned I = 0; I < NF; ++I)
+ ReplaceUses(SDValue(Node, I),
+ CurDAG->getTargetExtractSubreg(getSubregIndexByEVT(VT, I), DL,
+ VT, SuperReg));
+
+ ReplaceUses(SDValue(Node, NF), SDValue(Load, 1)); // Chain.
+ ReplaceUses(SDValue(Node, NF + 1), SDValue(Load, 2)); // Glue.
+ CurDAG->RemoveDeadNode(Node);
+}
+
+void RISCVDAGToDAGISel::selectVLSEGFFMask(SDNode *Node) {
+ SDLoc DL(Node);
+ unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ unsigned NF = Node->getNumValues() - 2; // Do not count Chain and Glue.
+ EVT VT = Node->getValueType(0);
+ unsigned ScalarSize = VT.getScalarSizeInBits();
+ MVT XLenVT = Subtarget->getXLenVT();
+ RISCVVLMUL LMUL = getLMUL(VT);
+ SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
+ SmallVector<SDValue, 8> Regs(Node->op_begin() + 2, Node->op_begin() + 2 + NF);
+ SDValue MaskedOff = createTuple(*CurDAG, Regs, NF, LMUL);
+ SmallVector<SDValue, 7> Operands;
+ Operands.push_back(MaskedOff);
+ Operands.push_back(Node->getOperand(NF + 2)); // Base pointer.
+ Operands.push_back(Node->getOperand(NF + 3)); // Mask.
+ Operands.push_back(Node->getOperand(NF + 4)); // VL.
+ Operands.push_back(SEW);
+ Operands.push_back(Node->getOperand(0)); // Chain.
+ const RISCVZvlssegTable::RISCVZvlsseg *P = RISCVZvlssegTable::getPseudo(
+ IntNo, ScalarSize, static_cast<unsigned>(LMUL),
+ static_cast<unsigned>(RISCVVLMUL::LMUL_1));
+ SDNode *Load = CurDAG->getMachineNode(P->Pseudo, DL, MVT::Untyped, MVT::Other,
+ MVT::Glue, Operands);
+ SDValue SuperReg = SDValue(Load, 0);
+ for (unsigned I = 0; I < NF; ++I)
+ ReplaceUses(SDValue(Node, I),
+ CurDAG->getTargetExtractSubreg(getSubregIndexByEVT(VT, I), DL,
+ VT, SuperReg));
+
+ ReplaceUses(SDValue(Node, NF), SDValue(Load, 1)); // Chain.
+ ReplaceUses(SDValue(Node, NF + 1), SDValue(Load, 2)); // Glue.
+ CurDAG->RemoveDeadNode(Node);
+}
+
+void RISCVDAGToDAGISel::selectVLXSEG(SDNode *Node, unsigned IntNo) {
+ SDLoc DL(Node);
+ unsigned NF = Node->getNumValues() - 1;
+ EVT VT = Node->getValueType(0);
+ unsigned ScalarSize = VT.getScalarSizeInBits();
+ MVT XLenVT = Subtarget->getXLenVT();
+ RISCVVLMUL LMUL = getLMUL(VT);
+ SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
+ SDValue Operands[] = {
+ Node->getOperand(2), // Base pointer.
+ Node->getOperand(3), // Index.
+ Node->getOperand(4), // VL.
+ SEW, Node->getOperand(0) // Chain.
+ };
+
+ EVT IndexVT = Node->getOperand(3)->getValueType(0);
+ RISCVVLMUL IndexLMUL = getLMUL(IndexVT);
+ unsigned IndexScalarSize = IndexVT.getScalarSizeInBits();
+ const RISCVZvlssegTable::RISCVZvlsseg *P = RISCVZvlssegTable::getPseudo(
+ IntNo, IndexScalarSize, static_cast<unsigned>(LMUL),
+ static_cast<unsigned>(IndexLMUL));
+ SDNode *Load =
+ CurDAG->getMachineNode(P->Pseudo, DL, MVT::Untyped, MVT::Other, Operands);
+ SDValue SuperReg = SDValue(Load, 0);
+ for (unsigned I = 0; I < NF; ++I)
+ ReplaceUses(SDValue(Node, I),
+ CurDAG->getTargetExtractSubreg(getSubregIndexByEVT(VT, I), DL,
+ VT, SuperReg));
+
+ ReplaceUses(SDValue(Node, NF), SDValue(Load, 1));
+ CurDAG->RemoveDeadNode(Node);
+}
+
+void RISCVDAGToDAGISel::selectVLXSEGMask(SDNode *Node, unsigned IntNo) {
+ SDLoc DL(Node);
+ unsigned NF = Node->getNumValues() - 1;
+ EVT VT = Node->getValueType(0);
+ unsigned ScalarSize = VT.getScalarSizeInBits();
+ MVT XLenVT = Subtarget->getXLenVT();
+ RISCVVLMUL LMUL = getLMUL(VT);
+ SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
+ SmallVector<SDValue, 8> Regs(Node->op_begin() + 2, Node->op_begin() + 2 + NF);
+ SDValue MaskedOff = createTuple(*CurDAG, Regs, NF, LMUL);
+ SDValue Operands[] = {
+ MaskedOff,
+ Node->getOperand(NF + 2), // Base pointer.
+ Node->getOperand(NF + 3), // Index.
+ Node->getOperand(NF + 4), // Mask.
+ Node->getOperand(NF + 5), // VL.
+ SEW,
+ Node->getOperand(0) // Chain.
+ };
+
+ EVT IndexVT = Node->getOperand(NF + 3)->getValueType(0);
+ RISCVVLMUL IndexLMUL = getLMUL(IndexVT);
+ unsigned IndexScalarSize = IndexVT.getScalarSizeInBits();
+ const RISCVZvlssegTable::RISCVZvlsseg *P = RISCVZvlssegTable::getPseudo(
+ IntNo, IndexScalarSize, static_cast<unsigned>(LMUL),
+ static_cast<unsigned>(IndexLMUL));
+ SDNode *Load =
+ CurDAG->getMachineNode(P->Pseudo, DL, MVT::Untyped, MVT::Other, Operands);
+ SDValue SuperReg = SDValue(Load, 0);
+ for (unsigned I = 0; I < NF; ++I)
+ ReplaceUses(SDValue(Node, I),
+ CurDAG->getTargetExtractSubreg(getSubregIndexByEVT(VT, I), DL,
+ VT, SuperReg));
+
+ ReplaceUses(SDValue(Node, NF), SDValue(Load, 1));
+ CurDAG->RemoveDeadNode(Node);
+}
+
+void RISCVDAGToDAGISel::selectVSSEG(SDNode *Node, unsigned IntNo,
+ bool IsStrided) {
+ SDLoc DL(Node);
+ unsigned NF = Node->getNumOperands() - 4;
+ if (IsStrided)
+ NF--;
+ EVT VT = Node->getOperand(2)->getValueType(0);
+ unsigned ScalarSize = VT.getScalarSizeInBits();
+ MVT XLenVT = Subtarget->getXLenVT();
+ RISCVVLMUL LMUL = getLMUL(VT);
+ SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
+ SmallVector<SDValue, 8> Regs(Node->op_begin() + 2, Node->op_begin() + 2 + NF);
+ SDValue StoreVal = createTuple(*CurDAG, Regs, NF, LMUL);
+ SmallVector<SDValue, 6> Operands;
+ Operands.push_back(StoreVal);
+ Operands.push_back(Node->getOperand(2 + NF)); // Base pointer.
+ if (IsStrided) {
+ Operands.push_back(Node->getOperand(3 + NF)); // Stride.
+ Operands.push_back(Node->getOperand(4 + NF)); // VL.
+ } else {
+ Operands.push_back(Node->getOperand(3 + NF)); // VL.
+ }
+ Operands.push_back(SEW);
+ Operands.push_back(Node->getOperand(0)); // Chain.
+ const RISCVZvlssegTable::RISCVZvlsseg *P = RISCVZvlssegTable::getPseudo(
+ IntNo, ScalarSize, static_cast<unsigned>(LMUL),
+ static_cast<unsigned>(RISCVVLMUL::LMUL_1));
+ SDNode *Store =
+ CurDAG->getMachineNode(P->Pseudo, DL, Node->getValueType(0), Operands);
+ ReplaceNode(Node, Store);
+}
+
+void RISCVDAGToDAGISel::selectVSSEGMask(SDNode *Node, unsigned IntNo,
+ bool IsStrided) {
+ SDLoc DL(Node);
+ unsigned NF = Node->getNumOperands() - 5;
+ if (IsStrided)
+ NF--;
+ EVT VT = Node->getOperand(2)->getValueType(0);
+ unsigned ScalarSize = VT.getScalarSizeInBits();
+ MVT XLenVT = Subtarget->getXLenVT();
+ RISCVVLMUL LMUL = getLMUL(VT);
+ SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
+ SmallVector<SDValue, 8> Regs(Node->op_begin() + 2, Node->op_begin() + 2 + NF);
+ SDValue StoreVal = createTuple(*CurDAG, Regs, NF, LMUL);
+ SmallVector<SDValue, 7> Operands;
+ Operands.push_back(StoreVal);
+ Operands.push_back(Node->getOperand(2 + NF)); // Base pointer.
+ if (IsStrided) {
+ Operands.push_back(Node->getOperand(3 + NF)); // Stride.
+ Operands.push_back(Node->getOperand(4 + NF)); // Mask.
+ Operands.push_back(Node->getOperand(5 + NF)); // VL.
+ } else {
+ Operands.push_back(Node->getOperand(3 + NF)); // Mask.
+ Operands.push_back(Node->getOperand(4 + NF)); // VL.
+ }
+ Operands.push_back(SEW);
+ Operands.push_back(Node->getOperand(0)); // Chain.
+ const RISCVZvlssegTable::RISCVZvlsseg *P = RISCVZvlssegTable::getPseudo(
+ IntNo, ScalarSize, static_cast<unsigned>(LMUL),
+ static_cast<unsigned>(RISCVVLMUL::LMUL_1));
+ SDNode *Store =
+ CurDAG->getMachineNode(P->Pseudo, DL, Node->getValueType(0), Operands);
+ ReplaceNode(Node, Store);
+}
+
+void RISCVDAGToDAGISel::selectVSXSEG(SDNode *Node, unsigned IntNo) {
+ SDLoc DL(Node);
+ unsigned NF = Node->getNumOperands() - 5;
+ EVT VT = Node->getOperand(2)->getValueType(0);
+ unsigned ScalarSize = VT.getScalarSizeInBits();
+ MVT XLenVT = Subtarget->getXLenVT();
+ RISCVVLMUL LMUL = getLMUL(VT);
+ SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
+ SmallVector<SDValue, 8> Regs(Node->op_begin() + 2, Node->op_begin() + 2 + NF);
+ SDValue StoreVal = createTuple(*CurDAG, Regs, NF, LMUL);
+ SDValue Operands[] = {
+ StoreVal,
+ Node->getOperand(2 + NF), // Base pointer.
+ Node->getOperand(3 + NF), // Index.
+ Node->getOperand(4 + NF), // VL.
+ SEW,
+ Node->getOperand(0) // Chain.
+ };
+
+ EVT IndexVT = Node->getOperand(3 + NF)->getValueType(0);
+ RISCVVLMUL IndexLMUL = getLMUL(IndexVT);
+ unsigned IndexScalarSize = IndexVT.getScalarSizeInBits();
+ const RISCVZvlssegTable::RISCVZvlsseg *P = RISCVZvlssegTable::getPseudo(
+ IntNo, IndexScalarSize, static_cast<unsigned>(LMUL),
+ static_cast<unsigned>(IndexLMUL));
+ SDNode *Store =
+ CurDAG->getMachineNode(P->Pseudo, DL, Node->getValueType(0), Operands);
+ ReplaceNode(Node, Store);
+}
+
+void RISCVDAGToDAGISel::selectVSXSEGMask(SDNode *Node, unsigned IntNo) {
+ SDLoc DL(Node);
+ unsigned NF = Node->getNumOperands() - 6;
+ EVT VT = Node->getOperand(2)->getValueType(0);
+ unsigned ScalarSize = VT.getScalarSizeInBits();
+ MVT XLenVT = Subtarget->getXLenVT();
+ RISCVVLMUL LMUL = getLMUL(VT);
+ SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
+ SmallVector<SDValue, 8> Regs(Node->op_begin() + 2, Node->op_begin() + 2 + NF);
+ SDValue StoreVal = createTuple(*CurDAG, Regs, NF, LMUL);
+ SDValue Operands[] = {
+ StoreVal,
+ Node->getOperand(2 + NF), // Base pointer.
+ Node->getOperand(3 + NF), // Index.
+ Node->getOperand(4 + NF), // Mask.
+ Node->getOperand(5 + NF), // VL.
+ SEW,
+ Node->getOperand(0) // Chain.
+ };
+
+ EVT IndexVT = Node->getOperand(3 + NF)->getValueType(0);
+ RISCVVLMUL IndexLMUL = getLMUL(IndexVT);
+ unsigned IndexScalarSize = IndexVT.getScalarSizeInBits();
+ const RISCVZvlssegTable::RISCVZvlsseg *P = RISCVZvlssegTable::getPseudo(
+ IntNo, IndexScalarSize, static_cast<unsigned>(LMUL),
+ static_cast<unsigned>(IndexLMUL));
+ SDNode *Store =
+ CurDAG->getMachineNode(P->Pseudo, DL, Node->getValueType(0), Operands);
+ ReplaceNode(Node, Store);
}
void RISCVDAGToDAGISel::Select(SDNode *Node) {
@@ -86,7 +511,6 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
if (!(-4096 <= Imm && Imm <= -2049) && !(2048 <= Imm && Imm <= 4094))
break;
// Break the imm to imm0+imm1.
- SDLoc DL(Node);
EVT VT = Node->getValueType(0);
const SDValue ImmOp0 = CurDAG->getTargetConstant(Imm - Imm / 2, DL, VT);
const SDValue ImmOp1 = CurDAG->getTargetConstant(Imm / 2, DL, VT);
@@ -102,14 +526,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
case ISD::Constant: {
auto ConstNode = cast<ConstantSDNode>(Node);
if (VT == XLenVT && ConstNode->isNullValue()) {
- SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
- RISCV::X0, XLenVT);
+ SDValue New =
+ CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, RISCV::X0, XLenVT);
ReplaceNode(Node, New.getNode());
return;
}
int64_t Imm = ConstNode->getSExtValue();
if (XLenVT == MVT::i64) {
- ReplaceNode(Node, selectImm(CurDAG, SDLoc(Node), Imm, XLenVT));
+ ReplaceNode(Node, selectImm(CurDAG, DL, Imm, XLenVT));
return;
}
break;
@@ -121,38 +545,233 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, CurDAG->getMachineNode(RISCV::ADDI, DL, VT, TFI, Imm));
return;
}
- case ISD::SRL: {
- if (!Subtarget->is64Bit())
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ // By default we do not custom select any intrinsic.
+ default:
break;
- SDValue Op0 = Node->getOperand(0);
- SDValue Op1 = Node->getOperand(1);
- uint64_t Mask;
- // Match (srl (and val, mask), imm) where the result would be a
- // zero-extended 32-bit integer. i.e. the mask is 0xffffffff or the result
- // is equivalent to this (SimplifyDemandedBits may have removed lower bits
- // from the mask that aren't necessary due to the right-shifting).
- if (Op1.getOpcode() == ISD::Constant &&
- isConstantMask(Op0.getNode(), Mask)) {
- uint64_t ShAmt = cast<ConstantSDNode>(Op1.getNode())->getZExtValue();
-
- if ((Mask | maskTrailingOnes<uint64_t>(ShAmt)) == 0xffffffff) {
- SDValue ShAmtVal =
- CurDAG->getTargetConstant(ShAmt, SDLoc(Node), XLenVT);
- CurDAG->SelectNodeTo(Node, RISCV::SRLIW, XLenVT, Op0.getOperand(0),
- ShAmtVal);
- return;
+
+ case Intrinsic::riscv_vsetvli: {
+ if (!Subtarget->hasStdExtV())
+ break;
+
+ assert(Node->getNumOperands() == 5);
+
+ RISCVVSEW VSEW =
+ static_cast<RISCVVSEW>(Node->getConstantOperandVal(3) & 0x7);
+ RISCVVLMUL VLMul =
+ static_cast<RISCVVLMUL>(Node->getConstantOperandVal(4) & 0x7);
+
+ unsigned VTypeI = RISCVVType::encodeVTYPE(
+ VLMul, VSEW, /*TailAgnostic*/ true, /*MaskAgnostic*/ false);
+ SDValue VTypeIOp = CurDAG->getTargetConstant(VTypeI, DL, XLenVT);
+
+ SDValue VLOperand = Node->getOperand(2);
+ if (auto *C = dyn_cast<ConstantSDNode>(VLOperand)) {
+ if (C->isNullValue()) {
+ VLOperand = SDValue(
+ CurDAG->getMachineNode(RISCV::ADDI, DL, XLenVT,
+ CurDAG->getRegister(RISCV::X0, XLenVT),
+ CurDAG->getTargetConstant(0, DL, XLenVT)),
+ 0);
+ }
}
+
+ ReplaceNode(Node,
+ CurDAG->getMachineNode(RISCV::PseudoVSETVLI, DL, XLenVT,
+ MVT::Other, VLOperand, VTypeIOp,
+ /* Chain */ Node->getOperand(0)));
+ return;
+ }
+ case Intrinsic::riscv_vsetvlimax: {
+ if (!Subtarget->hasStdExtV())
+ break;
+
+ assert(Node->getNumOperands() == 4);
+
+ RISCVVSEW VSEW =
+ static_cast<RISCVVSEW>(Node->getConstantOperandVal(2) & 0x7);
+ RISCVVLMUL VLMul =
+ static_cast<RISCVVLMUL>(Node->getConstantOperandVal(3) & 0x7);
+
+ unsigned VTypeI = RISCVVType::encodeVTYPE(
+ VLMul, VSEW, /*TailAgnostic*/ true, /*MaskAgnostic*/ false);
+ SDValue VTypeIOp = CurDAG->getTargetConstant(VTypeI, DL, XLenVT);
+
+ SDValue VLOperand = CurDAG->getRegister(RISCV::X0, XLenVT);
+ ReplaceNode(Node,
+ CurDAG->getMachineNode(RISCV::PseudoVSETVLI, DL, XLenVT,
+ MVT::Other, VLOperand, VTypeIOp,
+ /* Chain */ Node->getOperand(0)));
+ return;
+ }
+ case Intrinsic::riscv_vlseg2:
+ case Intrinsic::riscv_vlseg3:
+ case Intrinsic::riscv_vlseg4:
+ case Intrinsic::riscv_vlseg5:
+ case Intrinsic::riscv_vlseg6:
+ case Intrinsic::riscv_vlseg7:
+ case Intrinsic::riscv_vlseg8: {
+ selectVLSEG(Node, IntNo, /*IsStrided=*/false);
+ return;
+ }
+ case Intrinsic::riscv_vlseg2_mask:
+ case Intrinsic::riscv_vlseg3_mask:
+ case Intrinsic::riscv_vlseg4_mask:
+ case Intrinsic::riscv_vlseg5_mask:
+ case Intrinsic::riscv_vlseg6_mask:
+ case Intrinsic::riscv_vlseg7_mask:
+ case Intrinsic::riscv_vlseg8_mask: {
+ selectVLSEGMask(Node, IntNo, /*IsStrided=*/false);
+ return;
+ }
+ case Intrinsic::riscv_vlsseg2:
+ case Intrinsic::riscv_vlsseg3:
+ case Intrinsic::riscv_vlsseg4:
+ case Intrinsic::riscv_vlsseg5:
+ case Intrinsic::riscv_vlsseg6:
+ case Intrinsic::riscv_vlsseg7:
+ case Intrinsic::riscv_vlsseg8: {
+ selectVLSEG(Node, IntNo, /*IsStrided=*/true);
+ return;
+ }
+ case Intrinsic::riscv_vlsseg2_mask:
+ case Intrinsic::riscv_vlsseg3_mask:
+ case Intrinsic::riscv_vlsseg4_mask:
+ case Intrinsic::riscv_vlsseg5_mask:
+ case Intrinsic::riscv_vlsseg6_mask:
+ case Intrinsic::riscv_vlsseg7_mask:
+ case Intrinsic::riscv_vlsseg8_mask: {
+ selectVLSEGMask(Node, IntNo, /*IsStrided=*/true);
+ return;
+ }
+ case Intrinsic::riscv_vloxseg2:
+ case Intrinsic::riscv_vloxseg3:
+ case Intrinsic::riscv_vloxseg4:
+ case Intrinsic::riscv_vloxseg5:
+ case Intrinsic::riscv_vloxseg6:
+ case Intrinsic::riscv_vloxseg7:
+ case Intrinsic::riscv_vloxseg8:
+ case Intrinsic::riscv_vluxseg2:
+ case Intrinsic::riscv_vluxseg3:
+ case Intrinsic::riscv_vluxseg4:
+ case Intrinsic::riscv_vluxseg5:
+ case Intrinsic::riscv_vluxseg6:
+ case Intrinsic::riscv_vluxseg7:
+ case Intrinsic::riscv_vluxseg8: {
+ selectVLXSEG(Node, IntNo);
+ return;
+ }
+ case Intrinsic::riscv_vloxseg2_mask:
+ case Intrinsic::riscv_vloxseg3_mask:
+ case Intrinsic::riscv_vloxseg4_mask:
+ case Intrinsic::riscv_vloxseg5_mask:
+ case Intrinsic::riscv_vloxseg6_mask:
+ case Intrinsic::riscv_vloxseg7_mask:
+ case Intrinsic::riscv_vloxseg8_mask:
+ case Intrinsic::riscv_vluxseg2_mask:
+ case Intrinsic::riscv_vluxseg3_mask:
+ case Intrinsic::riscv_vluxseg4_mask:
+ case Intrinsic::riscv_vluxseg5_mask:
+ case Intrinsic::riscv_vluxseg6_mask:
+ case Intrinsic::riscv_vluxseg7_mask:
+ case Intrinsic::riscv_vluxseg8_mask: {
+ selectVLXSEGMask(Node, IntNo);
+ return;
+ }
}
break;
}
- case RISCVISD::READ_CYCLE_WIDE:
- assert(!Subtarget->is64Bit() && "READ_CYCLE_WIDE is only used on riscv32");
-
- ReplaceNode(Node, CurDAG->getMachineNode(RISCV::ReadCycleWide, DL, MVT::i32,
- MVT::i32, MVT::Other,
- Node->getOperand(0)));
+ case ISD::INTRINSIC_VOID: {
+ unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ case Intrinsic::riscv_vsseg2:
+ case Intrinsic::riscv_vsseg3:
+ case Intrinsic::riscv_vsseg4:
+ case Intrinsic::riscv_vsseg5:
+ case Intrinsic::riscv_vsseg6:
+ case Intrinsic::riscv_vsseg7:
+ case Intrinsic::riscv_vsseg8: {
+ selectVSSEG(Node, IntNo, /*IsStrided=*/false);
+ return;
+ }
+ case Intrinsic::riscv_vsseg2_mask:
+ case Intrinsic::riscv_vsseg3_mask:
+ case Intrinsic::riscv_vsseg4_mask:
+ case Intrinsic::riscv_vsseg5_mask:
+ case Intrinsic::riscv_vsseg6_mask:
+ case Intrinsic::riscv_vsseg7_mask:
+ case Intrinsic::riscv_vsseg8_mask: {
+ selectVSSEGMask(Node, IntNo, /*IsStrided=*/false);
+ return;
+ }
+ case Intrinsic::riscv_vssseg2:
+ case Intrinsic::riscv_vssseg3:
+ case Intrinsic::riscv_vssseg4:
+ case Intrinsic::riscv_vssseg5:
+ case Intrinsic::riscv_vssseg6:
+ case Intrinsic::riscv_vssseg7:
+ case Intrinsic::riscv_vssseg8: {
+ selectVSSEG(Node, IntNo, /*IsStrided=*/true);
+ return;
+ }
+ case Intrinsic::riscv_vssseg2_mask:
+ case Intrinsic::riscv_vssseg3_mask:
+ case Intrinsic::riscv_vssseg4_mask:
+ case Intrinsic::riscv_vssseg5_mask:
+ case Intrinsic::riscv_vssseg6_mask:
+ case Intrinsic::riscv_vssseg7_mask:
+ case Intrinsic::riscv_vssseg8_mask: {
+ selectVSSEGMask(Node, IntNo, /*IsStrided=*/true);
+ return;
+ }
+ case Intrinsic::riscv_vsoxseg2:
+ case Intrinsic::riscv_vsoxseg3:
+ case Intrinsic::riscv_vsoxseg4:
+ case Intrinsic::riscv_vsoxseg5:
+ case Intrinsic::riscv_vsoxseg6:
+ case Intrinsic::riscv_vsoxseg7:
+ case Intrinsic::riscv_vsoxseg8:
+ case Intrinsic::riscv_vsuxseg2:
+ case Intrinsic::riscv_vsuxseg3:
+ case Intrinsic::riscv_vsuxseg4:
+ case Intrinsic::riscv_vsuxseg5:
+ case Intrinsic::riscv_vsuxseg6:
+ case Intrinsic::riscv_vsuxseg7:
+ case Intrinsic::riscv_vsuxseg8: {
+ selectVSXSEG(Node, IntNo);
+ return;
+ }
+ case Intrinsic::riscv_vsoxseg2_mask:
+ case Intrinsic::riscv_vsoxseg3_mask:
+ case Intrinsic::riscv_vsoxseg4_mask:
+ case Intrinsic::riscv_vsoxseg5_mask:
+ case Intrinsic::riscv_vsoxseg6_mask:
+ case Intrinsic::riscv_vsoxseg7_mask:
+ case Intrinsic::riscv_vsoxseg8_mask:
+ case Intrinsic::riscv_vsuxseg2_mask:
+ case Intrinsic::riscv_vsuxseg3_mask:
+ case Intrinsic::riscv_vsuxseg4_mask:
+ case Intrinsic::riscv_vsuxseg5_mask:
+ case Intrinsic::riscv_vsuxseg6_mask:
+ case Intrinsic::riscv_vsuxseg7_mask:
+ case Intrinsic::riscv_vsuxseg8_mask: {
+ selectVSXSEGMask(Node, IntNo);
+ return;
+ }
+ }
+ break;
+ }
+ case RISCVISD::VLSEGFF: {
+ selectVLSEGFF(Node);
+ return;
+ }
+ case RISCVISD::VLSEGFF_MASK: {
+ selectVLSEGFFMask(Node);
return;
}
+ }
// Select the default instruction.
SelectCode(Node);
@@ -184,328 +803,202 @@ bool RISCVDAGToDAGISel::SelectAddrFI(SDValue Addr, SDValue &Base) {
return false;
}
-// Check that it is a SLOI (Shift Left Ones Immediate). We first check that
-// it is the right node tree:
+// Match (srl (and val, mask), imm) where the result would be a
+// zero-extended 32-bit integer, i.e. the mask is 0xffffffff or is equivalent
+// to it once the shifted-out bits are ignored (SimplifyDemandedBits may have
+// removed lower bits from the mask that aren't necessary due to the shift).
+bool RISCVDAGToDAGISel::MatchSRLIW(SDNode *N) const {
+ assert(N->getOpcode() == ISD::SRL);
+ assert(N->getOperand(0).getOpcode() == ISD::AND);
+ assert(isa<ConstantSDNode>(N->getOperand(1)));
+ assert(isa<ConstantSDNode>(N->getOperand(0).getOperand(1)));
+
+ // The IsRV64 predicate is checked after PatFrag predicates so we can get
+ // here even on RV32.
+ if (!Subtarget->is64Bit())
+ return false;
+
+ SDValue And = N->getOperand(0);
+ uint64_t ShAmt = N->getConstantOperandVal(1);
+ uint64_t Mask = And.getConstantOperandVal(1);
+ return (Mask | maskTrailingOnes<uint64_t>(ShAmt)) == 0xffffffff;
+}
+
+// Check that it is a SLOI (Shift Left Ones Immediate). A PatFrag has already
+// determined it has the right structure:
//
// (OR (SHL RS1, VC2), VC1)
//
-// and then we check that VC1, the mask used to fill with ones, is compatible
+// Check that VC1, the mask used to fill with ones, is compatible
// with VC2, the shamt:
//
-// VC1 == maskTrailingOnes<uint64_t>(VC2)
+// VC1 == maskTrailingOnes(VC2)
+//
+bool RISCVDAGToDAGISel::MatchSLOI(SDNode *N) const {
+ assert(N->getOpcode() == ISD::OR);
+ assert(N->getOperand(0).getOpcode() == ISD::SHL);
+ assert(isa<ConstantSDNode>(N->getOperand(1)));
+ assert(isa<ConstantSDNode>(N->getOperand(0).getOperand(1)));
-bool RISCVDAGToDAGISel::SelectSLOI(SDValue N, SDValue &RS1, SDValue &Shamt) {
- MVT XLenVT = Subtarget->getXLenVT();
- if (N.getOpcode() == ISD::OR) {
- SDValue Or = N;
- if (Or.getOperand(0).getOpcode() == ISD::SHL) {
- SDValue Shl = Or.getOperand(0);
- if (isa<ConstantSDNode>(Shl.getOperand(1)) &&
- isa<ConstantSDNode>(Or.getOperand(1))) {
- if (XLenVT == MVT::i64) {
- uint64_t VC1 = Or.getConstantOperandVal(1);
- uint64_t VC2 = Shl.getConstantOperandVal(1);
- if (VC1 == maskTrailingOnes<uint64_t>(VC2)) {
- RS1 = Shl.getOperand(0);
- Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N),
- Shl.getOperand(1).getValueType());
- return true;
- }
- }
- if (XLenVT == MVT::i32) {
- uint32_t VC1 = Or.getConstantOperandVal(1);
- uint32_t VC2 = Shl.getConstantOperandVal(1);
- if (VC1 == maskTrailingOnes<uint32_t>(VC2)) {
- RS1 = Shl.getOperand(0);
- Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N),
- Shl.getOperand(1).getValueType());
- return true;
- }
- }
- }
- }
+ SDValue Shl = N->getOperand(0);
+ if (Subtarget->is64Bit()) {
+ uint64_t VC1 = N->getConstantOperandVal(1);
+ uint64_t VC2 = Shl.getConstantOperandVal(1);
+ return VC1 == maskTrailingOnes<uint64_t>(VC2);
}
- return false;
+
+ uint32_t VC1 = N->getConstantOperandVal(1);
+ uint32_t VC2 = Shl.getConstantOperandVal(1);
+ return VC1 == maskTrailingOnes<uint32_t>(VC2);
}
-// Check that it is a SROI (Shift Right Ones Immediate). We first check that
-// it is the right node tree:
+// Check that it is a SROI (Shift Right Ones Immediate). A PatFrag has already
+// determined it has the right structure:
//
// (OR (SRL RS1, VC2), VC1)
//
-// and then we check that VC1, the mask used to fill with ones, is compatible
+// Check that VC1, the mask used to fill with ones, is compatible
// with VC2, the shamt:
//
-// VC1 == maskLeadingOnes<uint64_t>(VC2)
+// VC1 == maskLeadingOnes(VC2)
+//
+bool RISCVDAGToDAGISel::MatchSROI(SDNode *N) const {
+ assert(N->getOpcode() == ISD::OR);
+ assert(N->getOperand(0).getOpcode() == ISD::SRL);
+ assert(isa<ConstantSDNode>(N->getOperand(1)));
+ assert(isa<ConstantSDNode>(N->getOperand(0).getOperand(1)));
-bool RISCVDAGToDAGISel::SelectSROI(SDValue N, SDValue &RS1, SDValue &Shamt) {
- MVT XLenVT = Subtarget->getXLenVT();
- if (N.getOpcode() == ISD::OR) {
- SDValue Or = N;
- if (Or.getOperand(0).getOpcode() == ISD::SRL) {
- SDValue Srl = Or.getOperand(0);
- if (isa<ConstantSDNode>(Srl.getOperand(1)) &&
- isa<ConstantSDNode>(Or.getOperand(1))) {
- if (XLenVT == MVT::i64) {
- uint64_t VC1 = Or.getConstantOperandVal(1);
- uint64_t VC2 = Srl.getConstantOperandVal(1);
- if (VC1 == maskLeadingOnes<uint64_t>(VC2)) {
- RS1 = Srl.getOperand(0);
- Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N),
- Srl.getOperand(1).getValueType());
- return true;
- }
- }
- if (XLenVT == MVT::i32) {
- uint32_t VC1 = Or.getConstantOperandVal(1);
- uint32_t VC2 = Srl.getConstantOperandVal(1);
- if (VC1 == maskLeadingOnes<uint32_t>(VC2)) {
- RS1 = Srl.getOperand(0);
- Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N),
- Srl.getOperand(1).getValueType());
- return true;
- }
- }
- }
- }
+ SDValue Srl = N->getOperand(0);
+ if (Subtarget->is64Bit()) {
+ uint64_t VC1 = N->getConstantOperandVal(1);
+ uint64_t VC2 = Srl.getConstantOperandVal(1);
+ return VC1 == maskLeadingOnes<uint64_t>(VC2);
}
- return false;
+
+ uint32_t VC1 = N->getConstantOperandVal(1);
+ uint32_t VC2 = Srl.getConstantOperandVal(1);
+ return VC1 == maskLeadingOnes<uint32_t>(VC2);
}
-// Check that it is a RORI (Rotate Right Immediate). We first check that
-// it is the right node tree:
+// Check that it is a SROIW (Shift Right Ones Immediate i32 on RV64). A PatFrag
+// has already determined it has the right structure:
//
-// (ROTL RS1, VC)
+// (OR (SRL RS1, VC2), VC1)
//
-// The compiler translates immediate rotations to the right given by the call
-// to the rotateright32/rotateright64 intrinsics as rotations to the left.
-// Since the rotation to the left can be easily emulated as a rotation to the
-// right by negating the constant, there is no encoding for ROLI.
-// We then select the immediate left rotations as RORI by the complementary
-// constant:
+// and then we check that VC1, the mask used to fill with ones, is compatible
+// with VC2, the shamt:
+//
+// VC2 < 32
+// VC1 == maskTrailingZeros<uint64_t>(32 - VC2)
//
-// Shamt == XLen - VC
+bool RISCVDAGToDAGISel::MatchSROIW(SDNode *N) const {
+ assert(N->getOpcode() == ISD::OR);
+ assert(N->getOperand(0).getOpcode() == ISD::SRL);
+ assert(isa<ConstantSDNode>(N->getOperand(1)));
+ assert(isa<ConstantSDNode>(N->getOperand(0).getOperand(1)));
-bool RISCVDAGToDAGISel::SelectRORI(SDValue N, SDValue &RS1, SDValue &Shamt) {
- MVT XLenVT = Subtarget->getXLenVT();
- if (N.getOpcode() == ISD::ROTL) {
- if (isa<ConstantSDNode>(N.getOperand(1))) {
- if (XLenVT == MVT::i64) {
- uint64_t VC = N.getConstantOperandVal(1);
- Shamt = CurDAG->getTargetConstant((64 - VC), SDLoc(N),
- N.getOperand(1).getValueType());
- RS1 = N.getOperand(0);
- return true;
- }
- if (XLenVT == MVT::i32) {
- uint32_t VC = N.getConstantOperandVal(1);
- Shamt = CurDAG->getTargetConstant((32 - VC), SDLoc(N),
- N.getOperand(1).getValueType());
- RS1 = N.getOperand(0);
- return true;
- }
- }
- }
- return false;
-}
+ // The IsRV64 predicate is checked after PatFrag predicates so we can get
+ // here even on RV32.
+ if (!Subtarget->is64Bit())
+ return false;
+ SDValue Srl = N->getOperand(0);
+ uint64_t VC1 = N->getConstantOperandVal(1);
+ uint64_t VC2 = Srl.getConstantOperandVal(1);
+
+ // Immediate range should be enforced by uimm5 predicate.
+ assert(VC2 < 32 && "Unexpected immediate");
+ return VC1 == maskTrailingZeros<uint64_t>(32 - VC2);
+}
// Check that it is a SLLIUW (Shift Logical Left Immediate Unsigned i32
// on RV64).
// SLLIUW is the same as SLLI except for the fact that it clears the bits
// XLEN-1:32 of the input RS1 before shifting.
-// We first check that it is the right node tree:
+// A PatFrag has already checked that it has the right structure:
//
// (AND (SHL RS1, VC2), VC1)
//
// We check that VC2, the shamt is less than 32, otherwise the pattern is
// exactly the same as SLLI and we give priority to that.
-// Eventually we check that that VC1, the mask used to clear the upper 32 bits
+// Eventually we check that VC1, the mask used to clear the upper 32 bits
// of RS1, is correct:
//
// VC1 == (0xFFFFFFFF << VC2)
+//
+bool RISCVDAGToDAGISel::MatchSLLIUW(SDNode *N) const {
+ assert(N->getOpcode() == ISD::AND);
+ assert(N->getOperand(0).getOpcode() == ISD::SHL);
+ assert(isa<ConstantSDNode>(N->getOperand(1)));
+ assert(isa<ConstantSDNode>(N->getOperand(0).getOperand(1)));
-bool RISCVDAGToDAGISel::SelectSLLIUW(SDValue N, SDValue &RS1, SDValue &Shamt) {
- if (N.getOpcode() == ISD::AND && Subtarget->getXLenVT() == MVT::i64) {
- SDValue And = N;
- if (And.getOperand(0).getOpcode() == ISD::SHL) {
- SDValue Shl = And.getOperand(0);
- if (isa<ConstantSDNode>(Shl.getOperand(1)) &&
- isa<ConstantSDNode>(And.getOperand(1))) {
- uint64_t VC1 = And.getConstantOperandVal(1);
- uint64_t VC2 = Shl.getConstantOperandVal(1);
- if (VC2 < 32 && VC1 == ((uint64_t)0xFFFFFFFF << VC2)) {
- RS1 = Shl.getOperand(0);
- Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N),
- Shl.getOperand(1).getValueType());
- return true;
- }
- }
- }
- }
- return false;
-}
+ // The IsRV64 predicate is checked after PatFrag predicates so we can get
+ // here even on RV32.
+ if (!Subtarget->is64Bit())
+ return false;
-// Check that it is a SLOIW (Shift Left Ones Immediate i32 on RV64).
-// We first check that it is the right node tree:
-//
-// (SIGN_EXTEND_INREG (OR (SHL RS1, VC2), VC1))
-//
-// and then we check that VC1, the mask used to fill with ones, is compatible
-// with VC2, the shamt:
-//
-// VC1 == maskTrailingOnes<uint32_t>(VC2)
-
-bool RISCVDAGToDAGISel::SelectSLOIW(SDValue N, SDValue &RS1, SDValue &Shamt) {
- if (Subtarget->getXLenVT() == MVT::i64 &&
- N.getOpcode() == ISD::SIGN_EXTEND_INREG &&
- cast<VTSDNode>(N.getOperand(1))->getVT() == MVT::i32) {
- if (N.getOperand(0).getOpcode() == ISD::OR) {
- SDValue Or = N.getOperand(0);
- if (Or.getOperand(0).getOpcode() == ISD::SHL) {
- SDValue Shl = Or.getOperand(0);
- if (isa<ConstantSDNode>(Shl.getOperand(1)) &&
- isa<ConstantSDNode>(Or.getOperand(1))) {
- uint32_t VC1 = Or.getConstantOperandVal(1);
- uint32_t VC2 = Shl.getConstantOperandVal(1);
- if (VC1 == maskTrailingOnes<uint32_t>(VC2)) {
- RS1 = Shl.getOperand(0);
- Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N),
- Shl.getOperand(1).getValueType());
- return true;
- }
- }
- }
- }
- }
- return false;
+ SDValue Shl = N->getOperand(0);
+ uint64_t VC1 = N->getConstantOperandVal(1);
+ uint64_t VC2 = Shl.getConstantOperandVal(1);
+
+ // Immediate range should be enforced by uimm5 predicate.
+ assert(VC2 < 32 && "Unexpected immediate");
+ return (VC1 >> VC2) == UINT64_C(0xFFFFFFFF);
}
-// Check that it is a SROIW (Shift Right Ones Immediate i32 on RV64).
-// We first check that it is the right node tree:
-//
-// (OR (SHL RS1, VC2), VC1)
-//
-// and then we check that VC1, the mask used to fill with ones, is compatible
-// with VC2, the shamt:
-//
-// VC1 == maskLeadingOnes<uint32_t>(VC2)
-
-bool RISCVDAGToDAGISel::SelectSROIW(SDValue N, SDValue &RS1, SDValue &Shamt) {
- if (N.getOpcode() == ISD::OR && Subtarget->getXLenVT() == MVT::i64) {
- SDValue Or = N;
- if (Or.getOperand(0).getOpcode() == ISD::SRL) {
- SDValue Srl = Or.getOperand(0);
- if (isa<ConstantSDNode>(Srl.getOperand(1)) &&
- isa<ConstantSDNode>(Or.getOperand(1))) {
- uint32_t VC1 = Or.getConstantOperandVal(1);
- uint32_t VC2 = Srl.getConstantOperandVal(1);
- if (VC1 == maskLeadingOnes<uint32_t>(VC2)) {
- RS1 = Srl.getOperand(0);
- Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N),
- Srl.getOperand(1).getValueType());
- return true;
- }
- }
- }
- }
- return false;
+bool RISCVDAGToDAGISel::selectVSplat(SDValue N, SDValue &SplatVal) {
+ if (N.getOpcode() != ISD::SPLAT_VECTOR &&
+ N.getOpcode() != RISCVISD::SPLAT_VECTOR_I64)
+ return false;
+ SplatVal = N.getOperand(0);
+ return true;
}
-// Check that it is a RORIW (i32 Right Rotate Immediate on RV64).
-// We first check that it is the right node tree:
-//
-// (SIGN_EXTEND_INREG (OR (SHL (AsserSext RS1, i32), VC2),
-// (SRL (AND (AssertSext RS2, i32), VC3), VC1)))
-//
-// Then we check that the constant operands respect these constraints:
-//
-// VC2 == 32 - VC1
-// VC3 == maskLeadingOnes<uint32_t>(VC2)
-//
-// being VC1 the Shamt we need, VC2 the complementary of Shamt over 32
-// and VC3 a 32 bit mask of (32 - VC1) leading ones.
-
-bool RISCVDAGToDAGISel::SelectRORIW(SDValue N, SDValue &RS1, SDValue &Shamt) {
- if (N.getOpcode() == ISD::SIGN_EXTEND_INREG &&
- Subtarget->getXLenVT() == MVT::i64 &&
- cast<VTSDNode>(N.getOperand(1))->getVT() == MVT::i32) {
- if (N.getOperand(0).getOpcode() == ISD::OR) {
- SDValue Or = N.getOperand(0);
- if (Or.getOperand(0).getOpcode() == ISD::SHL &&
- Or.getOperand(1).getOpcode() == ISD::SRL) {
- SDValue Shl = Or.getOperand(0);
- SDValue Srl = Or.getOperand(1);
- if (Srl.getOperand(0).getOpcode() == ISD::AND) {
- SDValue And = Srl.getOperand(0);
- if (isa<ConstantSDNode>(Srl.getOperand(1)) &&
- isa<ConstantSDNode>(Shl.getOperand(1)) &&
- isa<ConstantSDNode>(And.getOperand(1))) {
- uint32_t VC1 = Srl.getConstantOperandVal(1);
- uint32_t VC2 = Shl.getConstantOperandVal(1);
- uint32_t VC3 = And.getConstantOperandVal(1);
- if (VC2 == (32 - VC1) &&
- VC3 == maskLeadingOnes<uint32_t>(VC2)) {
- RS1 = Shl.getOperand(0);
- Shamt = CurDAG->getTargetConstant(VC1, SDLoc(N),
- Srl.getOperand(1).getValueType());
- return true;
- }
- }
- }
- }
- }
+bool RISCVDAGToDAGISel::selectVSplatSimm5(SDValue N, SDValue &SplatVal) {
+ if ((N.getOpcode() != ISD::SPLAT_VECTOR &&
+ N.getOpcode() != RISCVISD::SPLAT_VECTOR_I64) ||
+ !isa<ConstantSDNode>(N.getOperand(0)))
+ return false;
+
+ int64_t SplatImm = cast<ConstantSDNode>(N.getOperand(0))->getSExtValue();
+
+ // Both ISD::SPLAT_VECTOR and RISCVISD::SPLAT_VECTOR_I64 share semantics when
+ // the operand type is wider than the resulting vector element type: an
+ // implicit truncation first takes place. Therefore, perform a manual
+ // truncation/sign-extension in order to ignore any truncated bits and catch
+ // any zero-extended immediate.
+ // For example, we wish to match (i8 -1) -> (XLenVT 255) as a simm5 by first
+ // sign-extending to (XLenVT -1).
+ auto XLenVT = Subtarget->getXLenVT();
+ assert(XLenVT == N.getOperand(0).getSimpleValueType() &&
+ "Unexpected splat operand type");
+ auto EltVT = N.getValueType().getVectorElementType();
+ if (EltVT.bitsLT(XLenVT)) {
+ SplatImm = SignExtend64(SplatImm, EltVT.getSizeInBits());
}
- return false;
+
+ if (!isInt<5>(SplatImm))
+ return false;
+
+ SplatVal = CurDAG->getTargetConstant(SplatImm, SDLoc(N), XLenVT);
+ return true;
}
-// Check that it is a FSRIW (i32 Funnel Shift Right Immediate on RV64).
-// We first check that it is the right node tree:
-//
-// (SIGN_EXTEND_INREG (OR (SHL (AsserSext RS1, i32), VC2),
-// (SRL (AND (AssertSext RS2, i32), VC3), VC1)))
-//
-// Then we check that the constant operands respect these constraints:
-//
-// VC2 == 32 - VC1
-// VC3 == maskLeadingOnes<uint32_t>(VC2)
-//
-// being VC1 the Shamt we need, VC2 the complementary of Shamt over 32
-// and VC3 a 32 bit mask of (32 - VC1) leading ones.
-
-bool RISCVDAGToDAGISel::SelectFSRIW(SDValue N, SDValue &RS1, SDValue &RS2,
- SDValue &Shamt) {
- if (N.getOpcode() == ISD::SIGN_EXTEND_INREG &&
- Subtarget->getXLenVT() == MVT::i64 &&
- cast<VTSDNode>(N.getOperand(1))->getVT() == MVT::i32) {
- if (N.getOperand(0).getOpcode() == ISD::OR) {
- SDValue Or = N.getOperand(0);
- if (Or.getOperand(0).getOpcode() == ISD::SHL &&
- Or.getOperand(1).getOpcode() == ISD::SRL) {
- SDValue Shl = Or.getOperand(0);
- SDValue Srl = Or.getOperand(1);
- if (Srl.getOperand(0).getOpcode() == ISD::AND) {
- SDValue And = Srl.getOperand(0);
- if (isa<ConstantSDNode>(Srl.getOperand(1)) &&
- isa<ConstantSDNode>(Shl.getOperand(1)) &&
- isa<ConstantSDNode>(And.getOperand(1))) {
- uint32_t VC1 = Srl.getConstantOperandVal(1);
- uint32_t VC2 = Shl.getConstantOperandVal(1);
- uint32_t VC3 = And.getConstantOperandVal(1);
- if (VC2 == (32 - VC1) &&
- VC3 == maskLeadingOnes<uint32_t>(VC2)) {
- RS1 = Shl.getOperand(0);
- RS2 = And.getOperand(0);
- Shamt = CurDAG->getTargetConstant(VC1, SDLoc(N),
- Srl.getOperand(1).getValueType());
- return true;
- }
- }
- }
- }
- }
- }
- return false;
+bool RISCVDAGToDAGISel::selectVSplatUimm5(SDValue N, SDValue &SplatVal) {
+ if ((N.getOpcode() != ISD::SPLAT_VECTOR &&
+ N.getOpcode() != RISCVISD::SPLAT_VECTOR_I64) ||
+ !isa<ConstantSDNode>(N.getOperand(0)))
+ return false;
+
+ int64_t SplatImm = cast<ConstantSDNode>(N.getOperand(0))->getSExtValue();
+
+ if (!isUInt<5>(SplatImm))
+ return false;
+
+ SplatVal =
+ CurDAG->getTargetConstant(SplatImm, SDLoc(N), Subtarget->getXLenVT());
+
+ return true;
}
// Merge an ADDI into the offset of a load/store instruction where possible.
@@ -536,6 +1029,7 @@ void RISCVDAGToDAGISel::doPeepholeLoadStoreADDI() {
case RISCV::LHU:
case RISCV::LWU:
case RISCV::LD:
+ case RISCV::FLH:
case RISCV::FLW:
case RISCV::FLD:
BaseOpIdx = 0;
@@ -545,6 +1039,7 @@ void RISCVDAGToDAGISel::doPeepholeLoadStoreADDI() {
case RISCV::SH:
case RISCV::SW:
case RISCV::SD:
+ case RISCV::FSH:
case RISCV::FSW:
case RISCV::FSD:
BaseOpIdx = 1;
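For the bit-manipulation matchers earlier in this file's diff (MatchSLOI, MatchSROI, MatchSLLIUW), the essential check is that the constant OR/AND mask covers exactly the bits the shift leaves open. Below is a self-contained worked example of those mask relations; the two helpers are local stand-ins re-derived for illustration, not the LLVM maskTrailingOnes/maskLeadingOnes templates themselves.

#include <cassert>
#include <cstdint>

static uint64_t trailingOnes(unsigned N) { return N == 0 ? 0 : (~0ULL >> (64 - N)); }
static uint64_t leadingOnes(unsigned N)  { return N == 0 ? 0 : (~0ULL << (64 - N)); }

int main() {
  // SLOI: (or (shl rs1, 3), VC1) shifts three ones in from the right,
  // so VC1 must equal maskTrailingOnes(3).
  assert(trailingOnes(3) == 0x7);

  // SROI: (or (srl rs1, 4), VC1) shifts four ones in from the left,
  // so VC1 must equal maskLeadingOnes(4).
  assert(leadingOnes(4) == 0xF000000000000000ULL);

  // SLLIUW: (and (shl rs1, VC2), VC1) keeps only the low 32 bits of rs1,
  // so VC1 >> VC2 must be exactly 0xFFFFFFFF -- the test in MatchSLLIUW.
  const uint64_t VC2 = 5;
  const uint64_t VC1 = 0xFFFFFFFFULL << VC2;
  assert((VC1 >> VC2) == 0xFFFFFFFFULL);
  return 0;
}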
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index 0ca12510a230..23601c3b8f06 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -45,14 +45,26 @@ public:
bool SelectAddrFI(SDValue Addr, SDValue &Base);
- bool SelectSLOI(SDValue N, SDValue &RS1, SDValue &Shamt);
- bool SelectSROI(SDValue N, SDValue &RS1, SDValue &Shamt);
- bool SelectRORI(SDValue N, SDValue &RS1, SDValue &Shamt);
- bool SelectSLLIUW(SDValue N, SDValue &RS1, SDValue &Shamt);
- bool SelectSLOIW(SDValue N, SDValue &RS1, SDValue &Shamt);
- bool SelectSROIW(SDValue N, SDValue &RS1, SDValue &Shamt);
- bool SelectRORIW(SDValue N, SDValue &RS1, SDValue &Shamt);
- bool SelectFSRIW(SDValue N, SDValue &RS1, SDValue &RS2, SDValue &Shamt);
+ bool MatchSRLIW(SDNode *N) const;
+ bool MatchSLOI(SDNode *N) const;
+ bool MatchSROI(SDNode *N) const;
+ bool MatchSROIW(SDNode *N) const;
+ bool MatchSLLIUW(SDNode *N) const;
+
+ bool selectVSplat(SDValue N, SDValue &SplatVal);
+ bool selectVSplatSimm5(SDValue N, SDValue &SplatVal);
+ bool selectVSplatUimm5(SDValue N, SDValue &SplatVal);
+
+ void selectVLSEG(SDNode *Node, unsigned IntNo, bool IsStrided);
+ void selectVLSEGMask(SDNode *Node, unsigned IntNo, bool IsStrided);
+ void selectVLSEGFF(SDNode *Node);
+ void selectVLSEGFFMask(SDNode *Node);
+ void selectVLXSEG(SDNode *Node, unsigned IntNo);
+ void selectVLXSEGMask(SDNode *Node, unsigned IntNo);
+ void selectVSSEG(SDNode *Node, unsigned IntNo, bool IsStrided);
+ void selectVSSEGMask(SDNode *Node, unsigned IntNo, bool IsStrided);
+ void selectVSXSEG(SDNode *Node, unsigned IntNo);
+ void selectVSXSEGMask(SDNode *Node, unsigned IntNo);
// Include the pieces autogenerated from the target description.
#include "RISCVGenDAGISel.inc"
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 03d9eefd59d0..97f46d9731b1 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -12,12 +12,12 @@
//===----------------------------------------------------------------------===//
#include "RISCVISelLowering.h"
+#include "MCTargetDesc/RISCVMatInt.h"
#include "RISCV.h"
#include "RISCVMachineFunctionInfo.h"
#include "RISCVRegisterInfo.h"
#include "RISCVSubtarget.h"
#include "RISCVTargetMachine.h"
-#include "Utils/RISCVMatInt.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/CallingConvLower.h"
@@ -25,7 +25,6 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
@@ -33,6 +32,7 @@
#include "llvm/IR/IntrinsicsRISCV.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
@@ -83,11 +83,73 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
// Set up the register classes.
addRegisterClass(XLenVT, &RISCV::GPRRegClass);
+ if (Subtarget.hasStdExtZfh())
+ addRegisterClass(MVT::f16, &RISCV::FPR16RegClass);
if (Subtarget.hasStdExtF())
addRegisterClass(MVT::f32, &RISCV::FPR32RegClass);
if (Subtarget.hasStdExtD())
addRegisterClass(MVT::f64, &RISCV::FPR64RegClass);
+ if (Subtarget.hasStdExtV()) {
+ addRegisterClass(RISCVVMVTs::vbool64_t, &RISCV::VRRegClass);
+ addRegisterClass(RISCVVMVTs::vbool32_t, &RISCV::VRRegClass);
+ addRegisterClass(RISCVVMVTs::vbool16_t, &RISCV::VRRegClass);
+ addRegisterClass(RISCVVMVTs::vbool8_t, &RISCV::VRRegClass);
+ addRegisterClass(RISCVVMVTs::vbool4_t, &RISCV::VRRegClass);
+ addRegisterClass(RISCVVMVTs::vbool2_t, &RISCV::VRRegClass);
+ addRegisterClass(RISCVVMVTs::vbool1_t, &RISCV::VRRegClass);
+
+ addRegisterClass(RISCVVMVTs::vint8mf8_t, &RISCV::VRRegClass);
+ addRegisterClass(RISCVVMVTs::vint8mf4_t, &RISCV::VRRegClass);
+ addRegisterClass(RISCVVMVTs::vint8mf2_t, &RISCV::VRRegClass);
+ addRegisterClass(RISCVVMVTs::vint8m1_t, &RISCV::VRRegClass);
+ addRegisterClass(RISCVVMVTs::vint8m2_t, &RISCV::VRM2RegClass);
+ addRegisterClass(RISCVVMVTs::vint8m4_t, &RISCV::VRM4RegClass);
+ addRegisterClass(RISCVVMVTs::vint8m8_t, &RISCV::VRM8RegClass);
+
+ addRegisterClass(RISCVVMVTs::vint16mf4_t, &RISCV::VRRegClass);
+ addRegisterClass(RISCVVMVTs::vint16mf2_t, &RISCV::VRRegClass);
+ addRegisterClass(RISCVVMVTs::vint16m1_t, &RISCV::VRRegClass);
+ addRegisterClass(RISCVVMVTs::vint16m2_t, &RISCV::VRM2RegClass);
+ addRegisterClass(RISCVVMVTs::vint16m4_t, &RISCV::VRM4RegClass);
+ addRegisterClass(RISCVVMVTs::vint16m8_t, &RISCV::VRM8RegClass);
+
+ addRegisterClass(RISCVVMVTs::vint32mf2_t, &RISCV::VRRegClass);
+ addRegisterClass(RISCVVMVTs::vint32m1_t, &RISCV::VRRegClass);
+ addRegisterClass(RISCVVMVTs::vint32m2_t, &RISCV::VRM2RegClass);
+ addRegisterClass(RISCVVMVTs::vint32m4_t, &RISCV::VRM4RegClass);
+ addRegisterClass(RISCVVMVTs::vint32m8_t, &RISCV::VRM8RegClass);
+
+ addRegisterClass(RISCVVMVTs::vint64m1_t, &RISCV::VRRegClass);
+ addRegisterClass(RISCVVMVTs::vint64m2_t, &RISCV::VRM2RegClass);
+ addRegisterClass(RISCVVMVTs::vint64m4_t, &RISCV::VRM4RegClass);
+ addRegisterClass(RISCVVMVTs::vint64m8_t, &RISCV::VRM8RegClass);
+
+ if (Subtarget.hasStdExtZfh()) {
+ addRegisterClass(RISCVVMVTs::vfloat16mf4_t, &RISCV::VRRegClass);
+ addRegisterClass(RISCVVMVTs::vfloat16mf2_t, &RISCV::VRRegClass);
+ addRegisterClass(RISCVVMVTs::vfloat16m1_t, &RISCV::VRRegClass);
+ addRegisterClass(RISCVVMVTs::vfloat16m2_t, &RISCV::VRM2RegClass);
+ addRegisterClass(RISCVVMVTs::vfloat16m4_t, &RISCV::VRM4RegClass);
+ addRegisterClass(RISCVVMVTs::vfloat16m8_t, &RISCV::VRM8RegClass);
+ }
+
+ if (Subtarget.hasStdExtF()) {
+ addRegisterClass(RISCVVMVTs::vfloat32mf2_t, &RISCV::VRRegClass);
+ addRegisterClass(RISCVVMVTs::vfloat32m1_t, &RISCV::VRRegClass);
+ addRegisterClass(RISCVVMVTs::vfloat32m2_t, &RISCV::VRM2RegClass);
+ addRegisterClass(RISCVVMVTs::vfloat32m4_t, &RISCV::VRM4RegClass);
+ addRegisterClass(RISCVVMVTs::vfloat32m8_t, &RISCV::VRM8RegClass);
+ }
+
+ if (Subtarget.hasStdExtD()) {
+ addRegisterClass(RISCVVMVTs::vfloat64m1_t, &RISCV::VRRegClass);
+ addRegisterClass(RISCVVMVTs::vfloat64m2_t, &RISCV::VRM2RegClass);
+ addRegisterClass(RISCVVMVTs::vfloat64m4_t, &RISCV::VRM4RegClass);
+ addRegisterClass(RISCVVMVTs::vfloat64m8_t, &RISCV::VRM8RegClass);
+ }
+ }
+
// Compute derived properties from the register classes.
computeRegisterProperties(STI.getRegisterInfo());
@@ -101,7 +163,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::BR_CC, XLenVT, Expand);
- setOperationAction(ISD::SELECT, XLenVT, Custom);
setOperationAction(ISD::SELECT_CC, XLenVT, Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
@@ -112,8 +173,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
- for (auto VT : {MVT::i1, MVT::i8, MVT::i16})
- setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+ if (!Subtarget.hasStdExtZbb()) {
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+ }
if (Subtarget.is64Bit()) {
setOperationAction(ISD::ADD, MVT::i32, Custom);
@@ -135,6 +199,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (Subtarget.is64Bit() && Subtarget.hasStdExtM()) {
setOperationAction(ISD::MUL, MVT::i32, Custom);
+
+ setOperationAction(ISD::SDIV, MVT::i8, Custom);
+ setOperationAction(ISD::UDIV, MVT::i8, Custom);
+ setOperationAction(ISD::UREM, MVT::i8, Custom);
+ setOperationAction(ISD::SDIV, MVT::i16, Custom);
+ setOperationAction(ISD::UDIV, MVT::i16, Custom);
+ setOperationAction(ISD::UREM, MVT::i16, Custom);
setOperationAction(ISD::SDIV, MVT::i32, Custom);
setOperationAction(ISD::UDIV, MVT::i32, Custom);
setOperationAction(ISD::UREM, MVT::i32, Custom);
@@ -149,46 +220,90 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SRL_PARTS, XLenVT, Custom);
setOperationAction(ISD::SRA_PARTS, XLenVT, Custom);
- if (!(Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp())) {
+ if (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp()) {
+ if (Subtarget.is64Bit()) {
+ setOperationAction(ISD::ROTL, MVT::i32, Custom);
+ setOperationAction(ISD::ROTR, MVT::i32, Custom);
+ }
+ } else {
setOperationAction(ISD::ROTL, XLenVT, Expand);
setOperationAction(ISD::ROTR, XLenVT, Expand);
}
- if (!Subtarget.hasStdExtZbp())
- setOperationAction(ISD::BSWAP, XLenVT, Expand);
+ if (Subtarget.hasStdExtZbp()) {
+ // Custom lower bswap/bitreverse so we can convert them to GREVI to enable
+ // more combining.
+ setOperationAction(ISD::BITREVERSE, XLenVT, Custom);
+ setOperationAction(ISD::BSWAP, XLenVT, Custom);
- if (!Subtarget.hasStdExtZbb()) {
+ if (Subtarget.is64Bit()) {
+ setOperationAction(ISD::BITREVERSE, MVT::i32, Custom);
+ setOperationAction(ISD::BSWAP, MVT::i32, Custom);
+ }
+ } else {
+ // With Zbb we have an XLen rev8 instruction, but not GREVI. So we'll
+ // pattern match it directly in isel.
+ setOperationAction(ISD::BSWAP, XLenVT,
+ Subtarget.hasStdExtZbb() ? Legal : Expand);
+ }
+
+ if (Subtarget.hasStdExtZbb()) {
+ setOperationAction(ISD::SMIN, XLenVT, Legal);
+ setOperationAction(ISD::SMAX, XLenVT, Legal);
+ setOperationAction(ISD::UMIN, XLenVT, Legal);
+ setOperationAction(ISD::UMAX, XLenVT, Legal);
+ } else {
setOperationAction(ISD::CTTZ, XLenVT, Expand);
setOperationAction(ISD::CTLZ, XLenVT, Expand);
setOperationAction(ISD::CTPOP, XLenVT, Expand);
}
- if (Subtarget.hasStdExtZbp())
- setOperationAction(ISD::BITREVERSE, XLenVT, Legal);
-
if (Subtarget.hasStdExtZbt()) {
setOperationAction(ISD::FSHL, XLenVT, Legal);
setOperationAction(ISD::FSHR, XLenVT, Legal);
+ setOperationAction(ISD::SELECT, XLenVT, Legal);
+
+ if (Subtarget.is64Bit()) {
+ setOperationAction(ISD::FSHL, MVT::i32, Custom);
+ setOperationAction(ISD::FSHR, MVT::i32, Custom);
+ }
+ } else {
+ setOperationAction(ISD::SELECT, XLenVT, Custom);
}
- ISD::CondCode FPCCToExtend[] = {
+ ISD::CondCode FPCCToExpand[] = {
ISD::SETOGT, ISD::SETOGE, ISD::SETONE, ISD::SETUEQ, ISD::SETUGT,
ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE, ISD::SETGT,
- ISD::SETGE, ISD::SETNE};
+ ISD::SETGE, ISD::SETNE, ISD::SETO, ISD::SETUO};
- ISD::NodeType FPOpToExtend[] = {
+ ISD::NodeType FPOpToExpand[] = {
ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOW, ISD::FREM, ISD::FP16_TO_FP,
ISD::FP_TO_FP16};
+ if (Subtarget.hasStdExtZfh())
+ setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+
+ if (Subtarget.hasStdExtZfh()) {
+ setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
+ for (auto CC : FPCCToExpand)
+ setCondCodeAction(CC, MVT::f16, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
+ setOperationAction(ISD::SELECT, MVT::f16, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f16, Expand);
+ for (auto Op : FPOpToExpand)
+ setOperationAction(Op, MVT::f16, Expand);
+ }
+
if (Subtarget.hasStdExtF()) {
setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
- for (auto CC : FPCCToExtend)
+ for (auto CC : FPCCToExpand)
setCondCodeAction(CC, MVT::f32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
setOperationAction(ISD::SELECT, MVT::f32, Custom);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
- for (auto Op : FPOpToExtend)
+ for (auto Op : FPOpToExpand)
setOperationAction(Op, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
@@ -200,21 +315,20 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (Subtarget.hasStdExtD()) {
setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
- for (auto CC : FPCCToExtend)
+ for (auto CC : FPCCToExpand)
setCondCodeAction(CC, MVT::f64, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
setOperationAction(ISD::SELECT, MVT::f64, Custom);
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
- for (auto Op : FPOpToExtend)
+ for (auto Op : FPOpToExpand)
setOperationAction(Op, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
}
- if (Subtarget.is64Bit() &&
- !(Subtarget.hasStdExtD() || Subtarget.hasStdExtF())) {
+ if (Subtarget.is64Bit()) {
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
@@ -224,6 +338,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::GlobalAddress, XLenVT, Custom);
setOperationAction(ISD::BlockAddress, XLenVT, Custom);
setOperationAction(ISD::ConstantPool, XLenVT, Custom);
+ setOperationAction(ISD::JumpTable, XLenVT, Custom);
setOperationAction(ISD::GlobalTLSAddress, XLenVT, Custom);
@@ -245,25 +360,133 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setBooleanContents(ZeroOrOneBooleanContent);
+ if (Subtarget.hasStdExtV()) {
+ setBooleanVectorContents(ZeroOrOneBooleanContent);
+
+ setOperationAction(ISD::VSCALE, XLenVT, Custom);
+
+ // RVV intrinsics may have illegal operands.
+ // We also need to custom legalize vmv.x.s.
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i32, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i32, Custom);
+
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+
+ if (Subtarget.is64Bit()) {
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
+ }
+
+ for (auto VT : MVT::integer_scalable_vector_valuetypes()) {
+ setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
+
+ setOperationAction(ISD::SMIN, VT, Legal);
+ setOperationAction(ISD::SMAX, VT, Legal);
+ setOperationAction(ISD::UMIN, VT, Legal);
+ setOperationAction(ISD::UMAX, VT, Legal);
+
+ setOperationAction(ISD::ROTL, VT, Expand);
+ setOperationAction(ISD::ROTR, VT, Expand);
+
+ if (isTypeLegal(VT)) {
+ // Custom-lower extensions and truncations from/to mask types.
+ setOperationAction(ISD::ANY_EXTEND, VT, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
+
+ // We custom-lower all legally-typed vector truncates:
+ // 1. Mask VTs are custom-expanded into a series of standard nodes
+ // 2. Integer VTs are lowered as a series of "RISCVISD::TRUNCATE_VECTOR"
+ // nodes which truncate by one power of two at a time.
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+
+ // Custom-lower insert/extract operations to simplify patterns.
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ }
+ }
+
+ // We must custom-lower certain vXi64 operations on RV32 due to the vector
+ // element type being illegal.
+ if (!Subtarget.is64Bit()) {
+ setOperationAction(ISD::SPLAT_VECTOR, MVT::i64, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::i64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::i64, Custom);
+ }
+
+ // Expand various CCs to best match the RVV ISA, which natively supports UNE
+ // but no other unordered comparisons, and supports all ordered comparisons
+ // except ONE. Additionally, we expand GT,OGT,GE,OGE for optimization
+ // purposes; they are expanded to their swapped-operand CCs (LT,OLT,LE,OLE),
+ // and we pattern-match those back to the "original", swapping operands once
+ // more. This way we catch both operations and both "vf" and "fv" forms with
+ // fewer patterns.
+ ISD::CondCode VFPCCToExpand[] = {
+ ISD::SETO, ISD::SETONE, ISD::SETUEQ, ISD::SETUGT,
+ ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUO,
+ ISD::SETGT, ISD::SETOGT, ISD::SETGE, ISD::SETOGE,
+ };
+
+ // Sets common operation actions on RVV floating-point vector types.
+ const auto SetCommonVFPActions = [&](MVT VT) {
+ setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
+ // Custom-lower insert/extract operations to simplify patterns.
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ for (auto CC : VFPCCToExpand)
+ setCondCodeAction(CC, VT, Expand);
+ };
+
+ if (Subtarget.hasStdExtZfh()) {
+ for (auto VT : {RISCVVMVTs::vfloat16mf4_t, RISCVVMVTs::vfloat16mf2_t,
+ RISCVVMVTs::vfloat16m1_t, RISCVVMVTs::vfloat16m2_t,
+ RISCVVMVTs::vfloat16m4_t, RISCVVMVTs::vfloat16m8_t})
+ SetCommonVFPActions(VT);
+ }
+
+ if (Subtarget.hasStdExtF()) {
+ for (auto VT : {RISCVVMVTs::vfloat32mf2_t, RISCVVMVTs::vfloat32m1_t,
+ RISCVVMVTs::vfloat32m2_t, RISCVVMVTs::vfloat32m4_t,
+ RISCVVMVTs::vfloat32m8_t})
+ SetCommonVFPActions(VT);
+ }
+
+ if (Subtarget.hasStdExtD()) {
+ for (auto VT : {RISCVVMVTs::vfloat64m1_t, RISCVVMVTs::vfloat64m2_t,
+ RISCVVMVTs::vfloat64m4_t, RISCVVMVTs::vfloat64m8_t})
+ SetCommonVFPActions(VT);
+ }
+ }
+
// Function alignments.
const Align FunctionAlignment(Subtarget.hasStdExtC() ? 2 : 4);
setMinFunctionAlignment(FunctionAlignment);
setPrefFunctionAlignment(FunctionAlignment);
- // Effectively disable jump table generation.
- setMinimumJumpTableEntries(INT_MAX);
+ setMinimumJumpTableEntries(5);
// Jumps are expensive, compared to logic
setJumpIsExpensive();
// We can use any register for comparisons
setHasMultipleConditionRegisters();
+
+ setTargetDAGCombine(ISD::SETCC);
+ if (Subtarget.hasStdExtZbp()) {
+ setTargetDAGCombine(ISD::OR);
+ }
}
EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
EVT VT) const {
if (!VT.isVector())
return getPointerTy(DL);
+ if (Subtarget.hasStdExtV())
+ return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
return VT.changeVectorElementTypeToInteger();
}
@@ -367,8 +590,18 @@ bool RISCVTargetLowering::isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const {
return Subtarget.is64Bit() && SrcVT == MVT::i32 && DstVT == MVT::i64;
}
+bool RISCVTargetLowering::isCheapToSpeculateCttz() const {
+ return Subtarget.hasStdExtZbb();
+}
+
+bool RISCVTargetLowering::isCheapToSpeculateCtlz() const {
+ return Subtarget.hasStdExtZbb();
+}
+
bool RISCVTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const {
+ if (VT == MVT::f16 && !Subtarget.hasStdExtZfh())
+ return false;
if (VT == MVT::f32 && !Subtarget.hasStdExtF())
return false;
if (VT == MVT::f64 && !Subtarget.hasStdExtD())
@@ -379,7 +612,8 @@ bool RISCVTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
}
bool RISCVTargetLowering::hasBitPreservingFPLogic(EVT VT) const {
- return (VT == MVT::f32 && Subtarget.hasStdExtF()) ||
+ return (VT == MVT::f16 && Subtarget.hasStdExtZfh()) ||
+ (VT == MVT::f32 && Subtarget.hasStdExtF()) ||
(VT == MVT::f64 && Subtarget.hasStdExtD());
}
@@ -433,6 +667,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return lowerBlockAddress(Op, DAG);
case ISD::ConstantPool:
return lowerConstantPool(Op, DAG);
+ case ISD::JumpTable:
+ return lowerJumpTable(Op, DAG);
case ISD::GlobalTLSAddress:
return lowerGlobalTLSAddress(Op, DAG);
case ISD::SELECT:
@@ -450,18 +686,105 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
case ISD::SRL_PARTS:
return lowerShiftRightParts(Op, DAG, false);
case ISD::BITCAST: {
- assert(Subtarget.is64Bit() && Subtarget.hasStdExtF() &&
+ assert(((Subtarget.is64Bit() && Subtarget.hasStdExtF()) ||
+ Subtarget.hasStdExtZfh()) &&
"Unexpected custom legalisation");
SDLoc DL(Op);
SDValue Op0 = Op.getOperand(0);
- if (Op.getValueType() != MVT::f32 || Op0.getValueType() != MVT::i32)
- return SDValue();
- SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
- SDValue FPConv = DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, NewOp0);
- return FPConv;
+ if (Op.getValueType() == MVT::f16 && Subtarget.hasStdExtZfh()) {
+ if (Op0.getValueType() != MVT::i16)
+ return SDValue();
+ SDValue NewOp0 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, Subtarget.getXLenVT(), Op0);
+ SDValue FPConv = DAG.getNode(RISCVISD::FMV_H_X, DL, MVT::f16, NewOp0);
+ return FPConv;
+ } else if (Op.getValueType() == MVT::f32 && Subtarget.is64Bit() &&
+ Subtarget.hasStdExtF()) {
+ if (Op0.getValueType() != MVT::i32)
+ return SDValue();
+ SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
+ SDValue FPConv =
+ DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, NewOp0);
+ return FPConv;
+ }
+ return SDValue();
}
case ISD::INTRINSIC_WO_CHAIN:
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::INTRINSIC_W_CHAIN:
+ return LowerINTRINSIC_W_CHAIN(Op, DAG);
+ case ISD::BSWAP:
+ case ISD::BITREVERSE: {
+ // Convert BSWAP/BITREVERSE to GREVI to enable GREVI combining.
+ assert(Subtarget.hasStdExtZbp() && "Unexpected custom legalisation");
+ MVT VT = Op.getSimpleValueType();
+ SDLoc DL(Op);
+ // Start with the maximum immediate value which is the bitwidth - 1.
+ unsigned Imm = VT.getSizeInBits() - 1;
+ // If this is BSWAP rather than BITREVERSE, clear the lower 3 bits.
+ if (Op.getOpcode() == ISD::BSWAP)
+ Imm &= ~0x7U;
+ return DAG.getNode(RISCVISD::GREVI, DL, VT, Op.getOperand(0),
+ DAG.getTargetConstant(Imm, DL, Subtarget.getXLenVT()));
+ }
+ case ISD::TRUNCATE: {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ // Only custom-lower vector truncates
+ if (!VT.isVector())
+ return Op;
+
+ // Truncates to mask types are handled differently
+ if (VT.getVectorElementType() == MVT::i1)
+ return lowerVectorMaskTrunc(Op, DAG);
+
+ // RVV only has truncates which operate from SEW*2->SEW, so lower arbitrary
+ // truncates as a series of "RISCVISD::TRUNCATE_VECTOR" nodes which
+ // truncate by one power of two at a time.
+ EVT DstEltVT = VT.getVectorElementType();
+
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ EVT SrcEltVT = SrcVT.getVectorElementType();
+
+ assert(DstEltVT.bitsLT(SrcEltVT) &&
+ isPowerOf2_64(DstEltVT.getSizeInBits()) &&
+ isPowerOf2_64(SrcEltVT.getSizeInBits()) &&
+ "Unexpected vector truncate lowering");
+
+ SDValue Result = Src;
+ LLVMContext &Context = *DAG.getContext();
+ const ElementCount Count = SrcVT.getVectorElementCount();
+ do {
+ SrcEltVT = EVT::getIntegerVT(Context, SrcEltVT.getSizeInBits() / 2);
+ EVT ResultVT = EVT::getVectorVT(Context, SrcEltVT, Count);
+ Result = DAG.getNode(RISCVISD::TRUNCATE_VECTOR, DL, ResultVT, Result);
+ } while (SrcEltVT != DstEltVT);
+
+ return Result;
+ }
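
As a rough scalar model of the halving loop above (truncateBySteps is an illustrative name, not part of the source), each TRUNCATE_VECTOR step simply keeps the low half of the current element width:

#include <cassert>
#include <cstdint>

// An i64 -> i8 truncate becomes three halving steps: 64 -> 32 -> 16 -> 8.
// Assumes SrcBits <= 64 and both widths are powers of two.
static uint64_t truncateBySteps(uint64_t Elt, unsigned SrcBits, unsigned DstBits) {
  while (SrcBits != DstBits) {
    SrcBits /= 2;                          // one RISCVISD::TRUNCATE_VECTOR step
    Elt &= (1ull << SrcBits) - 1;
  }
  return Elt;
}

int main() {
  assert(truncateBySteps(0x1122334455667788ull, 64, 8) == 0x88);
  assert(truncateBySteps(0xABCDull, 16, 8) == 0xCD);
  return 0;
}
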
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND:
+ return lowerVectorMaskExt(Op, DAG, /*ExtVal*/ 1);
+ case ISD::SIGN_EXTEND:
+ return lowerVectorMaskExt(Op, DAG, /*ExtVal*/ -1);
+ case ISD::SPLAT_VECTOR:
+ return lowerSPLATVECTOR(Op, DAG);
+ case ISD::INSERT_VECTOR_ELT:
+ return lowerINSERT_VECTOR_ELT(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT:
+ return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::VSCALE: {
+ MVT VT = Op.getSimpleValueType();
+ SDLoc DL(Op);
+ SDValue VLENB = DAG.getNode(RISCVISD::READ_VLENB, DL, VT);
+ // We define our scalable vector types for lmul=1 to use a 64-bit known
+ // minimum size, e.g. <vscale x 2 x i32>. VLENB is in bytes, so we calculate
+ // vscale as VLENB / 8.
+ SDValue VScale = DAG.getNode(ISD::SRL, DL, VT, VLENB,
+ DAG.getConstant(3, DL, VT));
+ return DAG.getNode(ISD::MUL, DL, VT, VScale, Op.getOperand(0));
+ }
}
}
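
A small standalone sketch of the VSCALE arithmetic in the last case above (the VLENB values are illustrative examples, not taken from the source): since the scalable types assume a 64-bit known minimum, vscale is the vector register width in bytes divided by 8.

#include <cassert>
#include <cstdint>

// vscale = VLENB / 8, i.e. VLENB >> 3.
static uint64_t vscaleFromVLENB(uint64_t VLENB) { return VLENB >> 3; }

int main() {
  assert(vscaleFromVLENB(16) == 2); // VLEN = 128 bits: <vscale x 2 x i32> holds 4 lanes
  assert(vscaleFromVLENB(64) == 8); // VLEN = 512 bits: 16 lanes
  return 0;
}
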
@@ -482,6 +805,11 @@ static SDValue getTargetNode(ConstantPoolSDNode *N, SDLoc DL, EVT Ty,
N->getOffset(), Flags);
}
+static SDValue getTargetNode(JumpTableSDNode *N, SDLoc DL, EVT Ty,
+ SelectionDAG &DAG, unsigned Flags) {
+ return DAG.getTargetJumpTable(N->getIndex(), Ty, Flags);
+}
+
template <class NodeTy>
SDValue RISCVTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
bool IsLocal) const {
@@ -559,6 +887,13 @@ SDValue RISCVTargetLowering::lowerConstantPool(SDValue Op,
return getAddr(N, DAG);
}
+SDValue RISCVTargetLowering::lowerJumpTable(SDValue Op,
+ SelectionDAG &DAG) const {
+ JumpTableSDNode *N = cast<JumpTableSDNode>(Op);
+
+ return getAddr(N, DAG);
+}
+
SDValue RISCVTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N,
SelectionDAG &DAG,
bool UseGOT) const {
@@ -642,6 +977,10 @@ SDValue RISCVTargetLowering::lowerGlobalTLSAddress(SDValue Op,
TLSModel::Model Model = getTargetMachine().getTLSModel(N->getGlobal());
+ if (DAG.getMachineFunction().getFunction().getCallingConv() ==
+ CallingConv::GHC)
+ report_fatal_error("In GHC calling convention TLS is not supported");
+
SDValue Addr;
switch (Model) {
case TLSModel::LocalExec:
@@ -689,9 +1028,8 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
normaliseSetCC(LHS, RHS, CCVal);
SDValue TargetCC = DAG.getConstant(CCVal, DL, XLenVT);
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
SDValue Ops[] = {LHS, RHS, TargetCC, TrueV, FalseV};
- return DAG.getNode(RISCVISD::SELECT_CC, DL, VTs, Ops);
+ return DAG.getNode(RISCVISD::SELECT_CC, DL, Op.getValueType(), Ops);
}
// Otherwise:
@@ -700,10 +1038,9 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Zero = DAG.getConstant(0, DL, XLenVT);
SDValue SetNE = DAG.getConstant(ISD::SETNE, DL, XLenVT);
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
SDValue Ops[] = {CondV, Zero, SetNE, TrueV, FalseV};
- return DAG.getNode(RISCVISD::SELECT_CC, DL, VTs, Ops);
+ return DAG.getNode(RISCVISD::SELECT_CC, DL, Op.getValueType(), Ops);
}
SDValue RISCVTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
@@ -865,10 +1202,226 @@ SDValue RISCVTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
return DAG.getMergeValues(Parts, DL);
}
+// Custom-lower a SPLAT_VECTOR where XLEN<SEW, as the SEW element type is
+// illegal (currently only vXi64 RV32).
+// FIXME: We could also catch non-constant sign-extended i32 values and lower
+// them to SPLAT_VECTOR_I64
+SDValue RISCVTargetLowering::lowerSPLATVECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT VecVT = Op.getValueType();
+ assert(!Subtarget.is64Bit() && VecVT.getVectorElementType() == MVT::i64 &&
+ "Unexpected SPLAT_VECTOR lowering");
+ SDValue SplatVal = Op.getOperand(0);
+
+ // If we can prove that the value is a sign-extended 32-bit value, lower this
+ // as a custom node in order to try and match RVV vector/scalar instructions.
+ if (auto *CVal = dyn_cast<ConstantSDNode>(SplatVal)) {
+ if (isInt<32>(CVal->getSExtValue()))
+ return DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT,
+ DAG.getConstant(CVal->getSExtValue(), DL, MVT::i32));
+ }
+
+ if (SplatVal.getOpcode() == ISD::SIGN_EXTEND &&
+ SplatVal.getOperand(0).getValueType() == MVT::i32) {
+ return DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT,
+ SplatVal.getOperand(0));
+ }
+
+ // Else, on RV32 we lower an i64-element SPLAT_VECTOR thus, being careful not
+ // to accidentally sign-extend the 32-bit halves to the e64 SEW:
+ // vmv.v.x vX, hi
+ // vsll.vx vX, vX, /*32*/
+ // vmv.v.x vY, lo
+ // vsll.vx vY, vY, /*32*/
+ // vsrl.vx vY, vY, /*32*/
+ // vor.vv vX, vX, vY
+ SDValue One = DAG.getConstant(1, DL, MVT::i32);
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
+ SDValue ThirtyTwoV = DAG.getConstant(32, DL, VecVT);
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, SplatVal, Zero);
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, SplatVal, One);
+
+ Lo = DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT, Lo);
+ Lo = DAG.getNode(ISD::SHL, DL, VecVT, Lo, ThirtyTwoV);
+ Lo = DAG.getNode(ISD::SRL, DL, VecVT, Lo, ThirtyTwoV);
+
+ if (isNullConstant(Hi))
+ return Lo;
+
+ Hi = DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT, Hi);
+ Hi = DAG.getNode(ISD::SHL, DL, VecVT, Hi, ThirtyTwoV);
+
+ return DAG.getNode(ISD::OR, DL, VecVT, Lo, Hi);
+}
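
A standalone scalar model of the instruction sequence sketched in the comment above (splatElem is an illustrative name): vmv.v.x sign-extends its 32-bit scalar to the 64-bit element, so the low half is shifted left and back right by 32 to strip that sign extension before the OR.

#include <cassert>
#include <cstdint>

static uint64_t splatElem(uint32_t LoHalf, uint32_t HiHalf) {
  // vmv.v.x: the 32-bit scalars arrive sign-extended to 64 bits.
  uint64_t Lo = static_cast<uint64_t>(static_cast<int64_t>(static_cast<int32_t>(LoHalf)));
  uint64_t Hi = static_cast<uint64_t>(static_cast<int64_t>(static_cast<int32_t>(HiHalf)));
  Lo = (Lo << 32) >> 32; // vsll.vx 32 then vsrl.vx 32: zero-extend the low half
  Hi = Hi << 32;         // vsll.vx 32: move the high half into place
  return Hi | Lo;        // vor.vv
}

int main() {
  assert(splatElem(0xFFFFFFFFu, 0x00000000u) == 0x00000000FFFFFFFFull);
  assert(splatElem(0x80000000u, 0x12345678u) == 0x1234567880000000ull);
  return 0;
}
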
+
+// Custom-lower extensions from mask vectors by using a vselect either with 1
+// for zero/any-extension or -1 for sign-extension:
+// (vXiN = (s|z)ext vXi1:vmask) -> (vXiN = vselect vmask, (-1 or 1), 0)
+// Note that any-extension is lowered identically to zero-extension.
+SDValue RISCVTargetLowering::lowerVectorMaskExt(SDValue Op, SelectionDAG &DAG,
+ int64_t ExtTrueVal) const {
+ SDLoc DL(Op);
+ EVT VecVT = Op.getValueType();
+ SDValue Src = Op.getOperand(0);
+ // Only custom-lower extensions from mask types
+ if (!Src.getValueType().isVector() ||
+ Src.getValueType().getVectorElementType() != MVT::i1)
+ return Op;
+
+ // Be careful not to introduce illegal scalar types at this stage, and be
+ // careful also about splatting constants as on RV32, vXi64 SPLAT_VECTOR is
+ // illegal and must be expanded. Since we know that the constants are
+ // sign-extended 32-bit values, we use SPLAT_VECTOR_I64 directly.
+ bool IsRV32E64 =
+ !Subtarget.is64Bit() && VecVT.getVectorElementType() == MVT::i64;
+ SDValue SplatZero = DAG.getConstant(0, DL, Subtarget.getXLenVT());
+ SDValue SplatTrueVal = DAG.getConstant(ExtTrueVal, DL, Subtarget.getXLenVT());
+
+ if (!IsRV32E64) {
+ SplatZero = DAG.getSplatVector(VecVT, DL, SplatZero);
+ SplatTrueVal = DAG.getSplatVector(VecVT, DL, SplatTrueVal);
+ } else {
+ SplatZero = DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT, SplatZero);
+ SplatTrueVal =
+ DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT, SplatTrueVal);
+ }
+
+ return DAG.getNode(ISD::VSELECT, DL, VecVT, Src, SplatTrueVal, SplatZero);
+}
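
A per-lane scalar sketch of the vselect above (the fixed lane count and names are illustrative): every set mask lane becomes ExtTrueVal (1 for zero/any-extension, -1 for sign-extension) and every clear lane becomes 0.

#include <array>
#include <cassert>
#include <cstdint>

static std::array<int32_t, 4> extendMask(const std::array<bool, 4> &Mask,
                                         int32_t ExtTrueVal) {
  std::array<int32_t, 4> Out{};
  for (size_t I = 0; I < Mask.size(); ++I)
    Out[I] = Mask[I] ? ExtTrueVal : 0; // vselect mask, splat(ExtTrueVal), splat(0)
  return Out;
}

int main() {
  std::array<bool, 4> M = {true, false, true, false};
  auto ZExt = extendMask(M, 1);  // zext / anyext
  auto SExt = extendMask(M, -1); // sext
  assert(ZExt[0] == 1 && ZExt[1] == 0);
  assert(SExt[0] == -1 && SExt[3] == 0);
  return 0;
}
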
+
+// Custom-lower truncations from vectors to mask vectors by using a mask and a
+// setcc operation:
+// (vXi1 = trunc vXiN vec) -> (vXi1 = setcc (and vec, 1), 0, ne)
+SDValue RISCVTargetLowering::lowerVectorMaskTrunc(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT MaskVT = Op.getValueType();
+ // Only expect to custom-lower truncations to mask types
+ assert(MaskVT.isVector() && MaskVT.getVectorElementType() == MVT::i1 &&
+ "Unexpected type for vector mask lowering");
+ SDValue Src = Op.getOperand(0);
+ EVT VecVT = Src.getValueType();
+
+ // Be careful not to introduce illegal scalar types at this stage, and be
+ // careful also about splatting constants as on RV32, vXi64 SPLAT_VECTOR is
+ // illegal and must be expanded. Since we know that the constants are
+ // sign-extended 32-bit values, we use SPLAT_VECTOR_I64 directly.
+ bool IsRV32E64 =
+ !Subtarget.is64Bit() && VecVT.getVectorElementType() == MVT::i64;
+ SDValue SplatOne = DAG.getConstant(1, DL, Subtarget.getXLenVT());
+ SDValue SplatZero = DAG.getConstant(0, DL, Subtarget.getXLenVT());
+
+ if (!IsRV32E64) {
+ SplatOne = DAG.getSplatVector(VecVT, DL, SplatOne);
+ SplatZero = DAG.getSplatVector(VecVT, DL, SplatZero);
+ } else {
+ SplatOne = DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT, SplatOne);
+ SplatZero = DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT, SplatZero);
+ }
+
+ SDValue Trunc = DAG.getNode(ISD::AND, DL, VecVT, Src, SplatOne);
+
+ return DAG.getSetCC(DL, MaskVT, Trunc, SplatZero, ISD::SETNE);
+}
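
And the reverse direction as a one-line per-lane check (purely illustrative): a lane truncates to a set mask bit exactly when its lowest bit is set, which is what the (and x, 1) / setcc-ne pair computes.

#include <cassert>
#include <cstdint>

static bool truncToMaskLane(uint64_t Lane) { return (Lane & 1) != 0; }

int main() {
  assert(truncToMaskLane(3));  // ...11 -> true
  assert(!truncToMaskLane(2)); // ...10 -> false
  return 0;
}
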
+
+SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT VecVT = Op.getValueType();
+ SDValue Vec = Op.getOperand(0);
+ SDValue Val = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+
+ // Custom-legalize INSERT_VECTOR_ELT where XLEN>=SEW, so that the vector is
+ // first slid down into position, the value is inserted into the first
+ // position, and the vector is slid back up. We do this to simplify patterns.
+ // (slideup vec, (insertelt (slidedown impdef, vec, idx), val, 0), idx),
+ if (Subtarget.is64Bit() || VecVT.getVectorElementType() != MVT::i64) {
+ if (isNullConstant(Idx))
+ return Op;
+ SDValue Slidedown = DAG.getNode(RISCVISD::VSLIDEDOWN, DL, VecVT,
+ DAG.getUNDEF(VecVT), Vec, Idx);
+ SDValue InsertElt0 =
+ DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, Slidedown, Val,
+ DAG.getConstant(0, DL, Subtarget.getXLenVT()));
+
+ return DAG.getNode(RISCVISD::VSLIDEUP, DL, VecVT, Vec, InsertElt0, Idx);
+ }
+
+ // Custom-legalize INSERT_VECTOR_ELT where XLEN<SEW, as the SEW element type
+ // is illegal (currently only vXi64 RV32).
+ // Since there is no easy way of getting a single element into a vector when
+ // XLEN<SEW, we lower the operation to the following sequence:
+ // splat vVal, rVal
+ // vid.v vVid
+ // vmseq.vx mMask, vVid, rIdx
+ // vmerge.vvm vDest, vSrc, vVal, mMask
+ // This essentially merges the original vector with the inserted element by
+ // using a mask whose only set bit is the one corresponding to the insert
+ // index.
+ SDValue SplattedVal = DAG.getSplatVector(VecVT, DL, Val);
+ SDValue SplattedIdx = DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT, Idx);
+
+ SDValue VID = DAG.getNode(RISCVISD::VID, DL, VecVT);
+ auto SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VecVT);
+ SDValue Mask = DAG.getSetCC(DL, SetCCVT, VID, SplattedIdx, ISD::SETEQ);
+
+ return DAG.getNode(ISD::VSELECT, DL, VecVT, Mask, SplattedVal, Vec);
+}
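
A per-lane sketch of the vid/vmseq/vmerge sequence above (the lane count and names are illustrative): comparing the lane index against the splatted insert index yields a one-hot mask, and the merge takes the new value only on that lane.

#include <array>
#include <cassert>
#include <cstdint>

static std::array<uint64_t, 4> insertElt(std::array<uint64_t, 4> Vec,
                                         uint64_t Val, unsigned Idx) {
  for (unsigned I = 0; I < Vec.size(); ++I) { // vid.v yields 0, 1, 2, 3, ...
    bool MaskBit = (I == Idx);                // vmseq.vx against the splatted index
    if (MaskBit)
      Vec[I] = Val;                           // vmerge.vvm picks the splatted value
  }
  return Vec;
}

int main() {
  auto V = insertElt({10, 11, 12, 13}, 99, 2);
  assert(V[0] == 10 && V[2] == 99 && V[3] == 13);
  return 0;
}
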
+
+// Custom-lower EXTRACT_VECTOR_ELT operations to slide the vector down, then
+// extract the first element: (extractelt (slidedown vec, idx), 0). This is
+// done to maintain parity with the RV32 vXi64 legalization.
+SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue Idx = Op.getOperand(1);
+ if (isNullConstant(Idx))
+ return Op;
+
+ SDValue Vec = Op.getOperand(0);
+ EVT EltVT = Op.getValueType();
+ EVT VecVT = Vec.getValueType();
+ SDValue Slidedown = DAG.getNode(RISCVISD::VSLIDEDOWN, DL, VecVT,
+ DAG.getUNDEF(VecVT), Vec, Idx);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Slidedown,
+ DAG.getConstant(0, DL, Subtarget.getXLenVT()));
+}
+
SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDLoc DL(Op);
+
+ if (Subtarget.hasStdExtV()) {
+ // Some RVV intrinsics may claim that they want an integer operand to be
+ // extended.
+ if (const RISCVVIntrinsicsTable::RISCVVIntrinsicInfo *II =
+ RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IntNo)) {
+ if (II->ExtendedOperand) {
+ assert(II->ExtendedOperand < Op.getNumOperands());
+ SmallVector<SDValue, 8> Operands(Op->op_begin(), Op->op_end());
+ SDValue &ScalarOp = Operands[II->ExtendedOperand];
+ EVT OpVT = ScalarOp.getValueType();
+ if (OpVT == MVT::i8 || OpVT == MVT::i16 ||
+ (OpVT == MVT::i32 && Subtarget.is64Bit())) {
+ // If the operand is a constant, sign extend to increase our chances
+ // of being able to use a .vi instruction. ANY_EXTEND would become a
+ // zero extend and the simm5 check in isel would fail.
+ // FIXME: Should we ignore the upper bits in isel instead?
+ unsigned ExtOpc = isa<ConstantSDNode>(ScalarOp) ? ISD::SIGN_EXTEND
+ : ISD::ANY_EXTEND;
+ ScalarOp = DAG.getNode(ExtOpc, DL, Subtarget.getXLenVT(), ScalarOp);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
+ Operands);
+ }
+ }
+ }
+ }
+
switch (IntNo) {
default:
return SDValue(); // Don't custom lower most intrinsics.
@@ -876,6 +1429,151 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getRegister(RISCV::X4, PtrVT);
}
+ case Intrinsic::riscv_vmv_x_s:
+ assert(Op.getValueType() == Subtarget.getXLenVT() && "Unexpected VT!");
+ return DAG.getNode(RISCVISD::VMV_X_S, DL, Op.getValueType(),
+ Op.getOperand(1));
+ }
+}
+
+SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ SDLoc DL(Op);
+
+ if (Subtarget.hasStdExtV()) {
+ // Some RVV intrinsics may claim that they want an integer operand to be
+ // extended.
+ if (const RISCVVIntrinsicsTable::RISCVVIntrinsicInfo *II =
+ RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IntNo)) {
+ if (II->ExtendedOperand) {
+ // The operands start from the second argument in INTRINSIC_W_CHAIN.
+ unsigned ExtendOp = II->ExtendedOperand + 1;
+ assert(ExtendOp < Op.getNumOperands());
+ SmallVector<SDValue, 8> Operands(Op->op_begin(), Op->op_end());
+ SDValue &ScalarOp = Operands[ExtendOp];
+ EVT OpVT = ScalarOp.getValueType();
+ if (OpVT == MVT::i8 || OpVT == MVT::i16 ||
+ (OpVT == MVT::i32 && Subtarget.is64Bit())) {
+ // If the operand is a constant, sign extend to increase our chances
+ // of being able to use a .vi instruction. ANY_EXTEND would become a
+ // zero extend and the simm5 check in isel would fail.
+ // FIXME: Should we ignore the upper bits in isel instead?
+ unsigned ExtOpc = isa<ConstantSDNode>(ScalarOp) ? ISD::SIGN_EXTEND
+ : ISD::ANY_EXTEND;
+ ScalarOp = DAG.getNode(ExtOpc, DL, Subtarget.getXLenVT(), ScalarOp);
+ return DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, Op->getVTList(),
+ Operands);
+ }
+ }
+ }
+ }
+
+ unsigned NF = 1;
+ switch (IntNo) {
+ default:
+ return SDValue(); // Don't custom lower most intrinsics.
+ case Intrinsic::riscv_vleff: {
+ SDLoc DL(Op);
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other, MVT::Glue);
+ SDValue Load = DAG.getNode(RISCVISD::VLEFF, DL, VTs, Op.getOperand(0),
+ Op.getOperand(2), Op.getOperand(3));
+ VTs = DAG.getVTList(Op->getValueType(1), MVT::Other);
+ SDValue ReadVL = DAG.getNode(RISCVISD::READ_VL, DL, VTs, Load.getValue(2));
+ return DAG.getMergeValues({Load, ReadVL, Load.getValue(1)}, DL);
+ }
+ case Intrinsic::riscv_vleff_mask: {
+ SDLoc DL(Op);
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other, MVT::Glue);
+ SDValue Load = DAG.getNode(RISCVISD::VLEFF_MASK, DL, VTs, Op.getOperand(0),
+ Op.getOperand(2), Op.getOperand(3),
+ Op.getOperand(4), Op.getOperand(5));
+ VTs = DAG.getVTList(Op->getValueType(1), MVT::Other);
+ SDValue ReadVL = DAG.getNode(RISCVISD::READ_VL, DL, VTs, Load.getValue(2));
+ return DAG.getMergeValues({Load, ReadVL, Load.getValue(1)}, DL);
+ }
+ case Intrinsic::riscv_vlseg8ff:
+ NF++;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::riscv_vlseg7ff:
+ NF++;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::riscv_vlseg6ff:
+ NF++;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::riscv_vlseg5ff:
+ NF++;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::riscv_vlseg4ff:
+ NF++;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::riscv_vlseg3ff:
+ NF++;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::riscv_vlseg2ff: {
+ NF++;
+ SDLoc DL(Op);
+ SmallVector<EVT, 8> EVTs(NF, Op.getValueType());
+ EVTs.push_back(MVT::Other);
+ EVTs.push_back(MVT::Glue);
+ SDVTList VTs = DAG.getVTList(EVTs);
+ SDValue Load =
+ DAG.getNode(RISCVISD::VLSEGFF, DL, VTs, Op.getOperand(0),
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ VTs = DAG.getVTList(Op->getValueType(NF), MVT::Other);
+ SDValue ReadVL = DAG.getNode(RISCVISD::READ_VL, DL, VTs,
+ /*Glue*/ Load.getValue(NF + 1));
+ SmallVector<SDValue, 8> Results;
+ for (unsigned i = 0; i < NF; ++i)
+ Results.push_back(Load.getValue(i));
+ Results.push_back(ReadVL);
+ Results.push_back(Load.getValue(NF)); // Chain.
+ return DAG.getMergeValues(Results, DL);
+ }
+ case Intrinsic::riscv_vlseg8ff_mask:
+ NF++;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::riscv_vlseg7ff_mask:
+ NF++;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::riscv_vlseg6ff_mask:
+ NF++;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::riscv_vlseg5ff_mask:
+ NF++;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::riscv_vlseg4ff_mask:
+ NF++;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::riscv_vlseg3ff_mask:
+ NF++;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::riscv_vlseg2ff_mask: {
+ NF++;
+ SDLoc DL(Op);
+ SmallVector<EVT, 8> EVTs(NF, Op.getValueType());
+ EVTs.push_back(MVT::Other);
+ EVTs.push_back(MVT::Glue);
+ SDVTList VTs = DAG.getVTList(EVTs);
+ SmallVector<SDValue, 13> LoadOps;
+ LoadOps.push_back(Op.getOperand(0)); // Chain.
+ LoadOps.push_back(Op.getOperand(1)); // Intrinsic ID.
+ for (unsigned i = 0; i < NF; ++i)
+ LoadOps.push_back(Op.getOperand(2 + i)); // MaskedOff.
+ LoadOps.push_back(Op.getOperand(2 + NF)); // Base.
+ LoadOps.push_back(Op.getOperand(3 + NF)); // Mask.
+ LoadOps.push_back(Op.getOperand(4 + NF)); // VL.
+ SDValue Load = DAG.getNode(RISCVISD::VLSEGFF_MASK, DL, VTs, LoadOps);
+ VTs = DAG.getVTList(Op->getValueType(NF), MVT::Other);
+ SDValue ReadVL = DAG.getNode(RISCVISD::READ_VL, DL, VTs,
+ /*Glue*/ Load.getValue(NF + 1));
+ SmallVector<SDValue, 8> Results;
+ for (unsigned i = 0; i < NF; ++i)
+ Results.push_back(Load.getValue(i));
+ Results.push_back(ReadVL);
+ Results.push_back(Load.getValue(NF)); // Chain.
+ return DAG.getMergeValues(Results, DL);
+ }
}
}
@@ -897,6 +1595,14 @@ static RISCVISD::NodeType getRISCVWOpcode(unsigned Opcode) {
return RISCVISD::DIVUW;
case ISD::UREM:
return RISCVISD::REMUW;
+ case ISD::ROTL:
+ return RISCVISD::ROLW;
+ case ISD::ROTR:
+ return RISCVISD::RORW;
+ case RISCVISD::GREVI:
+ return RISCVISD::GREVIW;
+ case RISCVISD::GORCI:
+ return RISCVISD::GORCIW;
}
}
@@ -905,14 +1611,15 @@ static RISCVISD::NodeType getRISCVWOpcode(unsigned Opcode) {
// be promoted to i64, making it difficult to select the SLLW/DIVUW/.../*W
// later one because the fact the operation was originally of type i32 is
// lost.
-static SDValue customLegalizeToWOp(SDNode *N, SelectionDAG &DAG) {
+static SDValue customLegalizeToWOp(SDNode *N, SelectionDAG &DAG,
+ unsigned ExtOpc = ISD::ANY_EXTEND) {
SDLoc DL(N);
RISCVISD::NodeType WOpcode = getRISCVWOpcode(N->getOpcode());
- SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
- SDValue NewOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
+ SDValue NewOp0 = DAG.getNode(ExtOpc, DL, MVT::i64, N->getOperand(0));
+ SDValue NewOp1 = DAG.getNode(ExtOpc, DL, MVT::i64, N->getOperand(1));
SDValue NewRes = DAG.getNode(WOpcode, DL, MVT::i64, NewOp0, NewOp1);
// ReplaceNodeResults requires we maintain the same type for the return value.
- return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes);
+ return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewRes);
}
// Converts the given 32-bit operation to a i64 operation with signed extension
@@ -942,6 +1649,13 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
"Unexpected custom legalisation");
SDValue Op0 = IsStrict ? N->getOperand(1) : N->getOperand(0);
+ // If the FP type needs to be softened, emit a library call using the 'si'
+ // version. If we left it to default legalization we'd end up with 'di'. If
+ // the FP type doesn't need to be softened just let generic type
+ // legalization promote the result type.
+ if (getTypeAction(*DAG.getContext(), Op0.getValueType()) !=
+ TargetLowering::TypeSoftenFloat)
+ return;
RTLIB::Libcall LC;
if (N->getOpcode() == ISD::FP_TO_SINT ||
N->getOpcode() == ISD::STRICT_FP_TO_SINT)
@@ -991,31 +1705,377 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
return;
Results.push_back(customLegalizeToWOp(N, DAG));
break;
+ case ISD::ROTL:
+ case ISD::ROTR:
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ "Unexpected custom legalisation");
+ Results.push_back(customLegalizeToWOp(N, DAG));
+ break;
case ISD::SDIV:
case ISD::UDIV:
- case ISD::UREM:
- assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
- Subtarget.hasStdExtM() && "Unexpected custom legalisation");
+ case ISD::UREM: {
+ MVT VT = N->getSimpleValueType(0);
+ assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) &&
+ Subtarget.is64Bit() && Subtarget.hasStdExtM() &&
+ "Unexpected custom legalisation");
if (N->getOperand(0).getOpcode() == ISD::Constant ||
N->getOperand(1).getOpcode() == ISD::Constant)
return;
- Results.push_back(customLegalizeToWOp(N, DAG));
+
+ // If the input is i32, use ANY_EXTEND since the W instructions don't read
+ // the upper 32 bits. For other types we need to sign or zero extend
+ // based on the opcode.
+ unsigned ExtOpc = ISD::ANY_EXTEND;
+ if (VT != MVT::i32)
+ ExtOpc = N->getOpcode() == ISD::SDIV ? ISD::SIGN_EXTEND
+ : ISD::ZERO_EXTEND;
+
+ Results.push_back(customLegalizeToWOp(N, DAG, ExtOpc));
break;
+ }
case ISD::BITCAST: {
+ assert(((N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ Subtarget.hasStdExtF()) ||
+ (N->getValueType(0) == MVT::i16 && Subtarget.hasStdExtZfh())) &&
+ "Unexpected custom legalisation");
+ SDValue Op0 = N->getOperand(0);
+ if (N->getValueType(0) == MVT::i16 && Subtarget.hasStdExtZfh()) {
+ if (Op0.getValueType() != MVT::f16)
+ return;
+ SDValue FPConv =
+ DAG.getNode(RISCVISD::FMV_X_ANYEXTH, DL, Subtarget.getXLenVT(), Op0);
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FPConv));
+ } else if (N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ Subtarget.hasStdExtF()) {
+ if (Op0.getValueType() != MVT::f32)
+ return;
+ SDValue FPConv =
+ DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Op0);
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPConv));
+ }
+ break;
+ }
+ case RISCVISD::GREVI:
+ case RISCVISD::GORCI: {
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
- Subtarget.hasStdExtF() && "Unexpected custom legalisation");
+ "Unexpected custom legalisation");
+ // This is similar to customLegalizeToWOp, except that we pass the second
+ // operand (a TargetConstant) straight through: it is already of type
+ // XLenVT.
SDLoc DL(N);
- SDValue Op0 = N->getOperand(0);
- if (Op0.getValueType() != MVT::f32)
- return;
- SDValue FPConv =
- DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Op0);
- Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPConv));
+ RISCVISD::NodeType WOpcode = getRISCVWOpcode(N->getOpcode());
+ SDValue NewOp0 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
+ SDValue NewRes =
+ DAG.getNode(WOpcode, DL, MVT::i64, NewOp0, N->getOperand(1));
+ // ReplaceNodeResults requires we maintain the same type for the return
+ // value.
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes));
+ break;
+ }
+ case ISD::BSWAP:
+ case ISD::BITREVERSE: {
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ Subtarget.hasStdExtZbp() && "Unexpected custom legalisation");
+ SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64,
+ N->getOperand(0));
+ unsigned Imm = N->getOpcode() == ISD::BITREVERSE ? 31 : 24;
+ SDValue GREVIW = DAG.getNode(RISCVISD::GREVIW, DL, MVT::i64, NewOp0,
+ DAG.getTargetConstant(Imm, DL,
+ Subtarget.getXLenVT()));
+ // ReplaceNodeResults requires we maintain the same type for the return
+ // value.
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, GREVIW));
+ break;
+ }
+ case ISD::FSHL:
+ case ISD::FSHR: {
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ Subtarget.hasStdExtZbt() && "Unexpected custom legalisation");
+ SDValue NewOp0 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
+ SDValue NewOp1 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
+ SDValue NewOp2 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2));
+ // FSLW/FSRW take a 6 bit shift amount but i32 FSHL/FSHR only use 5 bits.
+ // Mask the shift amount to 5 bits.
+ NewOp2 = DAG.getNode(ISD::AND, DL, MVT::i64, NewOp2,
+ DAG.getConstant(0x1f, DL, MVT::i64));
+ unsigned Opc =
+ N->getOpcode() == ISD::FSHL ? RISCVISD::FSLW : RISCVISD::FSRW;
+ SDValue NewOp = DAG.getNode(Opc, DL, MVT::i64, NewOp0, NewOp1, NewOp2);
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewOp));
+ break;
+ }
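
A standalone sketch of why the AND with 0x1f is inserted (fshl32 is an illustrative helper, not the source's code): a 32-bit funnel shift only uses the low 5 bits of the shift amount, while FSLW/FSRW read 6 bits, so masking keeps the two agreeing for any input amount.

#include <cassert>
#include <cstdint>

static uint32_t fshl32(uint32_t Hi, uint32_t Lo, uint32_t Sh) {
  Sh &= 0x1f; // the mask added by the legalization above
  if (Sh == 0)
    return Hi; // avoid the out-of-range 32-bit shift below
  return (Hi << Sh) | (Lo >> (32 - Sh));
}

int main() {
  assert(fshl32(0x12345678u, 0x9ABCDEF0u, 8) == 0x3456789Au);
  assert(fshl32(0x12345678u, 0x9ABCDEF0u, 40) ==
         fshl32(0x12345678u, 0x9ABCDEF0u, 8)); // only the low 5 bits matter
  return 0;
}
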
+ case ISD::EXTRACT_VECTOR_ELT: {
+ // Custom-legalize an EXTRACT_VECTOR_ELT where XLEN<SEW, as the SEW element
+ // type is illegal (currently only vXi64 RV32).
+ // With vmv.x.s, when SEW > XLEN, only the least-significant XLEN bits are
+ // transferred to the destination register. We issue two of these from the
+ // upper- and lower- halves of the SEW-bit vector element, slid down to the
+ // first element.
+ SDLoc DL(N);
+ SDValue Vec = N->getOperand(0);
+ SDValue Idx = N->getOperand(1);
+ EVT VecVT = Vec.getValueType();
+ assert(!Subtarget.is64Bit() && N->getValueType(0) == MVT::i64 &&
+ VecVT.getVectorElementType() == MVT::i64 &&
+ "Unexpected EXTRACT_VECTOR_ELT legalization");
+
+ SDValue Slidedown = Vec;
+ // Unless the index is known to be 0, we must slide the vector down to get
+ // the desired element into index 0.
+ if (!isNullConstant(Idx))
+ Slidedown = DAG.getNode(RISCVISD::VSLIDEDOWN, DL, VecVT,
+ DAG.getUNDEF(VecVT), Vec, Idx);
+
+ MVT XLenVT = Subtarget.getXLenVT();
+ // Extract the lower XLEN bits of the correct vector element.
+ SDValue EltLo = DAG.getNode(RISCVISD::VMV_X_S, DL, XLenVT, Slidedown, Idx);
+
+ // To extract the upper XLEN bits of the vector element, shift the first
+ // element right by 32 bits and re-extract the lower XLEN bits.
+ SDValue ThirtyTwoV =
+ DAG.getNode(RISCVISD::SPLAT_VECTOR_I64, DL, VecVT,
+ DAG.getConstant(32, DL, Subtarget.getXLenVT()));
+ SDValue LShr32 = DAG.getNode(ISD::SRL, DL, VecVT, Slidedown, ThirtyTwoV);
+
+ SDValue EltHi = DAG.getNode(RISCVISD::VMV_X_S, DL, XLenVT, LShr32, Idx);
+
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, EltLo, EltHi));
+ break;
+ }
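
A scalar model of that two-read sequence (extractElt64 is an illustrative name): vmv.x.s transfers only the low 32 bits, so the element is read once directly and once after a 32-bit right shift, and the two GPR halves are paired back into an i64.

#include <cassert>
#include <cstdint>

static uint64_t extractElt64(uint64_t Elt) {
  uint32_t Lo = static_cast<uint32_t>(Elt);       // vmv.x.s of the element
  uint32_t Hi = static_cast<uint32_t>(Elt >> 32); // vmv.x.s after vsrl by 32
  return (static_cast<uint64_t>(Hi) << 32) | Lo;  // BUILD_PAIR
}

int main() {
  assert(extractElt64(0x0123456789ABCDEFull) == 0x0123456789ABCDEFull);
  return 0;
}
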
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ switch (IntNo) {
+ default:
+ llvm_unreachable(
+ "Don't know how to custom type legalize this intrinsic!");
+ case Intrinsic::riscv_vmv_x_s: {
+ EVT VT = N->getValueType(0);
+ assert((VT == MVT::i8 || VT == MVT::i16 ||
+ (Subtarget.is64Bit() && VT == MVT::i32)) &&
+ "Unexpected custom legalisation!");
+ SDValue Extract = DAG.getNode(RISCVISD::VMV_X_S, DL,
+ Subtarget.getXLenVT(), N->getOperand(1));
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Extract));
+ break;
+ }
+ }
break;
}
}
}
+// A structure to hold one of the bit-manipulation patterns below. Together, a
+// SHL and non-SHL pattern may form a bit-manipulation pair on a single source:
+// (or (and (shl x, 1), 0xAAAAAAAA),
+// (and (srl x, 1), 0x55555555))
+struct RISCVBitmanipPat {
+ SDValue Op;
+ unsigned ShAmt;
+ bool IsSHL;
+
+ bool formsPairWith(const RISCVBitmanipPat &Other) const {
+ return Op == Other.Op && ShAmt == Other.ShAmt && IsSHL != Other.IsSHL;
+ }
+};
+
+// Matches any of the following bit-manipulation patterns:
+// (and (shl x, 1), (0x55555555 << 1))
+// (and (srl x, 1), 0x55555555)
+// (shl (and x, 0x55555555), 1)
+// (srl (and x, (0x55555555 << 1)), 1)
+// where the shift amount and mask may vary thus:
+// [1] = 0x55555555 / 0xAAAAAAAA
+// [2] = 0x33333333 / 0xCCCCCCCC
+// [4] = 0x0F0F0F0F / 0xF0F0F0F0
+// [8] = 0x00FF00FF / 0xFF00FF00
+// [16] = 0x0000FFFF / 0xFFFF0000
+// [32] = 0x00000000FFFFFFFF / 0xFFFFFFFF00000000 (for RV64)
+static Optional<RISCVBitmanipPat> matchRISCVBitmanipPat(SDValue Op) {
+ Optional<uint64_t> Mask;
+ // Optionally consume a mask around the shift operation.
+ if (Op.getOpcode() == ISD::AND && isa<ConstantSDNode>(Op.getOperand(1))) {
+ Mask = Op.getConstantOperandVal(1);
+ Op = Op.getOperand(0);
+ }
+ if (Op.getOpcode() != ISD::SHL && Op.getOpcode() != ISD::SRL)
+ return None;
+ bool IsSHL = Op.getOpcode() == ISD::SHL;
+
+ if (!isa<ConstantSDNode>(Op.getOperand(1)))
+ return None;
+ auto ShAmt = Op.getConstantOperandVal(1);
+
+ if (!isPowerOf2_64(ShAmt))
+ return None;
+
+ // These are the unshifted masks which we use to match bit-manipulation
+ // patterns. They may be shifted left in certain circumstances.
+ static const uint64_t BitmanipMasks[] = {
+ 0x5555555555555555ULL, 0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL,
+ 0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL,
+ };
+
+ unsigned MaskIdx = Log2_64(ShAmt);
+ if (MaskIdx >= array_lengthof(BitmanipMasks))
+ return None;
+
+ auto Src = Op.getOperand(0);
+
+ unsigned Width = Op.getValueType() == MVT::i64 ? 64 : 32;
+ auto ExpMask = BitmanipMasks[MaskIdx] & maskTrailingOnes<uint64_t>(Width);
+
+ // The expected mask is shifted left when the AND is found around SHL
+ // patterns.
+ // ((x >> 1) & 0x55555555)
+ // ((x << 1) & 0xAAAAAAAA)
+ bool SHLExpMask = IsSHL;
+
+ if (!Mask) {
+ // Sometimes LLVM keeps the mask as an operand of the shift, typically when
+ // the mask is all ones: consume that now.
+ if (Src.getOpcode() == ISD::AND && isa<ConstantSDNode>(Src.getOperand(1))) {
+ Mask = Src.getConstantOperandVal(1);
+ Src = Src.getOperand(0);
+ // The expected mask is now in fact shifted left for SRL, so reverse the
+ // decision.
+ // ((x & 0xAAAAAAAA) >> 1)
+ // ((x & 0x55555555) << 1)
+ SHLExpMask = !SHLExpMask;
+ } else {
+ // Use a default shifted mask of all-ones if there's no AND, truncated
+ // down to the expected width. This simplifies the logic later on.
+ Mask = maskTrailingOnes<uint64_t>(Width);
+ *Mask &= (IsSHL ? *Mask << ShAmt : *Mask >> ShAmt);
+ }
+ }
+
+ if (SHLExpMask)
+ ExpMask <<= ShAmt;
+
+ if (Mask != ExpMask)
+ return None;
+
+ return RISCVBitmanipPat{Src, (unsigned)ShAmt, IsSHL};
+}
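
As a concrete instance of the [1] row of the table above (grevStage1 is an illustrative helper): one SHL/SRL pair with the 0x55555555/0xAAAAAAAA masks swaps adjacent bits, which is exactly the shape that gets folded into a GREVI with shift amount 1.

#include <cassert>
#include <cstdint>

static uint32_t grevStage1(uint32_t X) {
  return ((X << 1) & 0xAAAAAAAAu) | ((X >> 1) & 0x55555555u);
}

int main() {
  assert(grevStage1(1u) == 2u);                               // bits 0 and 1 swap
  assert(grevStage1(grevStage1(0xDEADBEEFu)) == 0xDEADBEEFu); // a stage is its own inverse
  return 0;
}
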
+
+// Match the following pattern as a GREVI(W) operation
+// (or (BITMANIP_SHL x), (BITMANIP_SRL x))
+static SDValue combineORToGREV(SDValue Op, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ EVT VT = Op.getValueType();
+
+ if (VT == Subtarget.getXLenVT() || (Subtarget.is64Bit() && VT == MVT::i32)) {
+ auto LHS = matchRISCVBitmanipPat(Op.getOperand(0));
+ auto RHS = matchRISCVBitmanipPat(Op.getOperand(1));
+ if (LHS && RHS && LHS->formsPairWith(*RHS)) {
+ SDLoc DL(Op);
+ return DAG.getNode(
+ RISCVISD::GREVI, DL, VT, LHS->Op,
+ DAG.getTargetConstant(LHS->ShAmt, DL, Subtarget.getXLenVT()));
+ }
+ }
+ return SDValue();
+}
+
+// Matches any of the following patterns as a GORCI(W) operation
+// 1. (or (GREVI x, shamt), x) if shamt is a power of 2
+// 2. (or x, (GREVI x, shamt)) if shamt is a power of 2
+// 3. (or (or (BITMANIP_SHL x), x), (BITMANIP_SRL x))
+// Note that with the variant of 3.,
+// (or (or (BITMANIP_SHL x), (BITMANIP_SRL x)), x)
+// the inner pattern will first be matched as GREVI and then the outer
+// pattern will be matched to GORC via the first rule above.
+// 4. (or (rotl/rotr x, bitwidth/2), x)
+static SDValue combineORToGORC(SDValue Op, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ EVT VT = Op.getValueType();
+
+ if (VT == Subtarget.getXLenVT() || (Subtarget.is64Bit() && VT == MVT::i32)) {
+ SDLoc DL(Op);
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ auto MatchOROfReverse = [&](SDValue Reverse, SDValue X) {
+ if (Reverse.getOpcode() == RISCVISD::GREVI && Reverse.getOperand(0) == X &&
+ isPowerOf2_32(Reverse.getConstantOperandVal(1)))
+ return DAG.getNode(RISCVISD::GORCI, DL, VT, X, Reverse.getOperand(1));
+ // We can also form GORCI from ROTL/ROTR by half the bitwidth.
+ if ((Reverse.getOpcode() == ISD::ROTL ||
+ Reverse.getOpcode() == ISD::ROTR) &&
+ Reverse.getOperand(0) == X &&
+ isa<ConstantSDNode>(Reverse.getOperand(1))) {
+ uint64_t RotAmt = Reverse.getConstantOperandVal(1);
+ if (RotAmt == (VT.getSizeInBits() / 2))
+ return DAG.getNode(
+ RISCVISD::GORCI, DL, VT, X,
+ DAG.getTargetConstant(RotAmt, DL, Subtarget.getXLenVT()));
+ }
+ return SDValue();
+ };
+
+ // Check for either commutable permutation of (or (GREVI x, shamt), x)
+ if (SDValue V = MatchOROfReverse(Op0, Op1))
+ return V;
+ if (SDValue V = MatchOROfReverse(Op1, Op0))
+ return V;
+
+ // OR is commutable so canonicalize its OR operand to the left
+ if (Op0.getOpcode() != ISD::OR && Op1.getOpcode() == ISD::OR)
+ std::swap(Op0, Op1);
+ if (Op0.getOpcode() != ISD::OR)
+ return SDValue();
+ SDValue OrOp0 = Op0.getOperand(0);
+ SDValue OrOp1 = Op0.getOperand(1);
+ auto LHS = matchRISCVBitmanipPat(OrOp0);
+ // OR is commutable so swap the operands and try again: x might have been
+ // on the left
+ if (!LHS) {
+ std::swap(OrOp0, OrOp1);
+ LHS = matchRISCVBitmanipPat(OrOp0);
+ }
+ auto RHS = matchRISCVBitmanipPat(Op1);
+ if (LHS && RHS && LHS->formsPairWith(*RHS) && LHS->Op == OrOp1) {
+ return DAG.getNode(
+ RISCVISD::GORCI, DL, VT, LHS->Op,
+ DAG.getTargetConstant(LHS->ShAmt, DL, Subtarget.getXLenVT()));
+ }
+ }
+ return SDValue();
+}
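
A quick standalone check of rule 4 above (gorc16/rotl32 are illustrative helpers): OR-ing a 32-bit value with its rotation by half the bit width produces the same result as a single GORC stage with shift amount 16.

#include <cassert>
#include <cstdint>

static uint32_t gorc16(uint32_t X) {
  return X | ((X << 16) & 0xFFFF0000u) | ((X >> 16) & 0x0000FFFFu);
}
static uint32_t rotl32(uint32_t X, unsigned N) {
  N &= 31;
  return N ? ((X << N) | (X >> (32 - N))) : X;
}

int main() {
  const uint32_t Tests[] = {0x00000001u, 0xDEADBEEFu, 0x12340000u};
  for (uint32_t X : Tests)
    assert((X | rotl32(X, 16)) == gorc16(X));
  return 0;
}
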
+
+// Combine (GREVI (GREVI x, C2), C1) -> (GREVI x, C1^C2) when C1^C2 is
+// non-zero, and to x when it is. Any repeated GREVI stage undoes itself.
+// Combine (GORCI (GORCI x, C2), C1) -> (GORCI x, C1|C2). A repeated GORCI
+// stage does not undo itself, but it is redundant.
+static SDValue combineGREVI_GORCI(SDNode *N, SelectionDAG &DAG) {
+ unsigned ShAmt1 = N->getConstantOperandVal(1);
+ SDValue Src = N->getOperand(0);
+
+ if (Src.getOpcode() != N->getOpcode())
+ return SDValue();
+
+ unsigned ShAmt2 = Src.getConstantOperandVal(1);
+ Src = Src.getOperand(0);
+
+ unsigned CombinedShAmt;
+ if (N->getOpcode() == RISCVISD::GORCI || N->getOpcode() == RISCVISD::GORCIW)
+ CombinedShAmt = ShAmt1 | ShAmt2;
+ else
+ CombinedShAmt = ShAmt1 ^ ShAmt2;
+
+ if (CombinedShAmt == 0)
+ return Src;
+
+ SDLoc DL(N);
+ return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), Src,
+ DAG.getTargetConstant(CombinedShAmt, DL,
+ N->getOperand(1).getValueType()));
+}
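
A standalone sketch of why those combines are sound (grev32 is an illustrative reimplementation of the generalized bit reverse, and __builtin_bswap32 is assumed to be the usual GCC/Clang builtin): each GREV stage is its own inverse and the stages commute, so chained controls combine by XOR, collapsing to the identity when they cancel.

#include <cassert>
#include <cstdint>

static uint32_t grev32(uint32_t X, unsigned K) {
  static const uint32_t Masks[5] = {0x55555555u, 0x33333333u, 0x0F0F0F0Fu,
                                    0x00FF00FFu, 0x0000FFFFu};
  for (unsigned Stage = 0; Stage < 5; ++Stage)
    if (K & (1u << Stage)) {
      unsigned Sh = 1u << Stage;
      X = ((X & Masks[Stage]) << Sh) | ((X >> Sh) & Masks[Stage]);
    }
  return X;
}

int main() {
  uint32_t X = 0xCAFEBABEu;
  assert(grev32(grev32(X, 12), 10) == grev32(X, 12 ^ 10)); // controls combine by XOR
  assert(grev32(grev32(X, 7), 7) == X);                    // C1 ^ C2 == 0 folds to x
  assert(grev32(X, 24) == __builtin_bswap32(X));           // grev 24 is a byte swap
  return 0;
}
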
+
SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -1067,17 +2127,53 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
}
case RISCVISD::SLLW:
case RISCVISD::SRAW:
- case RISCVISD::SRLW: {
+ case RISCVISD::SRLW:
+ case RISCVISD::ROLW:
+ case RISCVISD::RORW: {
// Only the lower 32 bits of LHS and lower 5 bits of RHS are read.
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
APInt LHSMask = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 32);
APInt RHSMask = APInt::getLowBitsSet(RHS.getValueSizeInBits(), 5);
- if ((SimplifyDemandedBits(N->getOperand(0), LHSMask, DCI)) ||
- (SimplifyDemandedBits(N->getOperand(1), RHSMask, DCI)))
- return SDValue();
+ if (SimplifyDemandedBits(N->getOperand(0), LHSMask, DCI) ||
+ SimplifyDemandedBits(N->getOperand(1), RHSMask, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+ break;
+ }
+ case RISCVISD::FSLW:
+ case RISCVISD::FSRW: {
+ // Only the lower 32 bits of the values and the lower 6 bits of the shift
+ // amount are read.
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ SDValue ShAmt = N->getOperand(2);
+ APInt OpMask = APInt::getLowBitsSet(Op0.getValueSizeInBits(), 32);
+ APInt ShAmtMask = APInt::getLowBitsSet(ShAmt.getValueSizeInBits(), 6);
+ if (SimplifyDemandedBits(Op0, OpMask, DCI) ||
+ SimplifyDemandedBits(Op1, OpMask, DCI) ||
+ SimplifyDemandedBits(ShAmt, ShAmtMask, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
break;
}
+ case RISCVISD::GREVIW:
+ case RISCVISD::GORCIW: {
+ // Only the lower 32 bits of the first operand are read
+ SDValue Op0 = N->getOperand(0);
+ APInt Mask = APInt::getLowBitsSet(Op0.getValueSizeInBits(), 32);
+ if (SimplifyDemandedBits(Op0, Mask, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+
+ return combineGREVI_GORCI(N, DCI.DAG);
+ }
case RISCVISD::FMV_X_ANYEXTW_RV64: {
SDLoc DL(N);
SDValue Op0 = N->getOperand(0);
@@ -1085,9 +2181,9 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
// conversion is unnecessary and can be replaced with an ANY_EXTEND
// of the FMV_W_X_RV64 operand.
if (Op0->getOpcode() == RISCVISD::FMV_W_X_RV64) {
- SDValue AExtOp =
- DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0.getOperand(0));
- return DCI.CombineTo(N, AExtOp);
+ assert(Op0.getOperand(0).getValueType() == MVT::i64 &&
+ "Unexpected value type!");
+ return Op0.getOperand(0);
}
// This is a target-specific version of a DAGCombine performed in
@@ -1100,15 +2196,61 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
SDValue NewFMV = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64,
Op0.getOperand(0));
APInt SignBit = APInt::getSignMask(32).sext(64);
- if (Op0.getOpcode() == ISD::FNEG) {
- return DCI.CombineTo(N,
- DAG.getNode(ISD::XOR, DL, MVT::i64, NewFMV,
- DAG.getConstant(SignBit, DL, MVT::i64)));
- }
+ if (Op0.getOpcode() == ISD::FNEG)
+ return DAG.getNode(ISD::XOR, DL, MVT::i64, NewFMV,
+ DAG.getConstant(SignBit, DL, MVT::i64));
+
assert(Op0.getOpcode() == ISD::FABS);
- return DCI.CombineTo(N,
- DAG.getNode(ISD::AND, DL, MVT::i64, NewFMV,
- DAG.getConstant(~SignBit, DL, MVT::i64)));
+ return DAG.getNode(ISD::AND, DL, MVT::i64, NewFMV,
+ DAG.getConstant(~SignBit, DL, MVT::i64));
+ }
+ case RISCVISD::GREVI:
+ case RISCVISD::GORCI:
+ return combineGREVI_GORCI(N, DCI.DAG);
+ case ISD::OR:
+ if (auto GREV = combineORToGREV(SDValue(N, 0), DCI.DAG, Subtarget))
+ return GREV;
+ if (auto GORC = combineORToGORC(SDValue(N, 0), DCI.DAG, Subtarget))
+ return GORC;
+ break;
+ case RISCVISD::SELECT_CC: {
+ // Transform
+ // (select_cc (xor X, 1), 0, setne, trueV, falseV) ->
+ // (select_cc X, 0, seteq, trueV, falseV) if we can prove X is 0/1.
+ // This can occur when legalizing some floating point comparisons.
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ auto CCVal = static_cast<ISD::CondCode>(N->getConstantOperandVal(2));
+ APInt Mask = APInt::getBitsSetFrom(LHS.getValueSizeInBits(), 1);
+ if (ISD::isIntEqualitySetCC(CCVal) && isNullConstant(RHS) &&
+ LHS.getOpcode() == ISD::XOR && isOneConstant(LHS.getOperand(1)) &&
+ DAG.MaskedValueIsZero(LHS.getOperand(0), Mask)) {
+ SDLoc DL(N);
+ CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType());
+ SDValue TargetCC = DAG.getConstant(CCVal, DL, Subtarget.getXLenVT());
+ return DAG.getNode(RISCVISD::SELECT_CC, DL, N->getValueType(0),
+ {LHS.getOperand(0), RHS, TargetCC, N->getOperand(3),
+ N->getOperand(4)});
+ }
+ break;
+ }
+ case ISD::SETCC: {
+ // (setcc X, 1, setne) -> (setcc X, 0, seteq) if we can prove X is 0/1.
+ // Comparing with 0 may allow us to fold into bnez/beqz.
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ if (LHS.getValueType().isScalableVector())
+ break;
+ auto CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ APInt Mask = APInt::getBitsSetFrom(LHS.getValueSizeInBits(), 1);
+ if (isOneConstant(RHS) && ISD::isIntEqualitySetCC(CC) &&
+ DAG.MaskedValueIsZero(LHS, Mask)) {
+ SDLoc DL(N);
+ SDValue Zero = DAG.getConstant(0, DL, LHS.getValueType());
+ CC = ISD::getSetCCInverse(CC, LHS.getValueType());
+ return DAG.getSetCC(DL, N->getValueType(0), LHS, Zero, CC);
+ }
+ break;
}
}
@@ -1129,7 +2271,7 @@ bool RISCVTargetLowering::isDesirableToCommuteWithShift(
auto *C1 = dyn_cast<ConstantSDNode>(N0->getOperand(1));
auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (C1 && C2) {
- APInt C1Int = C1->getAPIntValue();
+ const APInt &C1Int = C1->getAPIntValue();
APInt ShiftedC1Int = C1Int << C2->getAPIntValue();
// We can materialise `c1 << c2` into an add immediate, so it's "free",
@@ -1161,6 +2303,116 @@ bool RISCVTargetLowering::isDesirableToCommuteWithShift(
return true;
}
+bool RISCVTargetLowering::targetShrinkDemandedConstant(
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ TargetLoweringOpt &TLO) const {
+ // Delay this optimization as late as possible.
+ if (!TLO.LegalOps)
+ return false;
+
+ EVT VT = Op.getValueType();
+ if (VT.isVector())
+ return false;
+
+ // Only handle AND for now.
+ if (Op.getOpcode() != ISD::AND)
+ return false;
+
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (!C)
+ return false;
+
+ const APInt &Mask = C->getAPIntValue();
+
+ // Clear all non-demanded bits initially.
+ APInt ShrunkMask = Mask & DemandedBits;
+
+ // If the shrunk mask fits in a sign-extended 12-bit immediate, let the
+ // target-independent code apply it.
+ if (ShrunkMask.isSignedIntN(12))
+ return false;
+
+ // Try to make a smaller immediate by setting undemanded bits.
+
+ // We need to be able to make a negative number through a combination of mask
+ // and undemanded bits.
+ APInt ExpandedMask = Mask | ~DemandedBits;
+ if (!ExpandedMask.isNegative())
+ return false;
+
+ // Determine the minimum number of bits needed to represent the negative
+ // number.
+ unsigned MinSignedBits = ExpandedMask.getMinSignedBits();
+
+ // Try to make a 12-bit negative immediate. If that fails, try to make a
+ // 32-bit negative immediate unless the shrunk immediate already fits in
+ // 32 bits.
+ APInt NewMask = ShrunkMask;
+ if (MinSignedBits <= 12)
+ NewMask.setBitsFrom(11);
+ else if (MinSignedBits <= 32 && !ShrunkMask.isSignedIntN(32))
+ NewMask.setBitsFrom(31);
+ else
+ return false;
+
+ // Sanity check that our new mask is a subset of the expanded mask.
+ assert(NewMask.isSubsetOf(ExpandedMask));
+
+ // If we aren't changing the mask, just return true to keep it and prevent
+ // the caller from optimizing.
+ if (NewMask == Mask)
+ return true;
+
+ // Replace the constant with the new mask.
+ SDLoc DL(Op);
+ SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
+ SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
+ return TLO.CombineTo(Op, NewOp);
+}
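A worked example of the mask-widening idea, with values chosen purely for illustration: when only the low 16 bits of (x & 0x0000FF00) are demanded, the constant can be replaced by 0xFFFFFF00 (that is, -256, a valid ANDI immediate) without changing any demanded bit.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Mask = 0x0000FF00u;     // original AND constant, not a simm12
  const uint32_t Demanded = 0x0000FFFFu; // bits the user actually reads
  const uint32_t NewMask = 0xFFFFFF00u;  // == (uint32_t)-256, fits in a simm12

  // The new constant differs from the old one only in undemanded bits...
  assert(((Mask ^ NewMask) & Demanded) == 0);

  // ...so the demanded bits of the AND result are unchanged for any input.
  for (uint32_t x : {0x12345678u, 0xFFFFFFFFu, 0x0000ABCDu})
    assert(((x & Mask) & Demanded) == ((x & NewMask) & Demanded));
  return 0;
}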
+
+void RISCVTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+ KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ unsigned BitWidth = Known.getBitWidth();
+ unsigned Opc = Op.getOpcode();
+ assert((Opc >= ISD::BUILTIN_OP_END ||
+ Opc == ISD::INTRINSIC_WO_CHAIN ||
+ Opc == ISD::INTRINSIC_W_CHAIN ||
+ Opc == ISD::INTRINSIC_VOID) &&
+ "Should use MaskedValueIsZero if you don't know whether Op"
+ " is a target node!");
+
+ Known.resetAll();
+ switch (Opc) {
+ default: break;
+ case RISCVISD::REMUW: {
+ KnownBits Known2;
+ Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ // We only care about the lower 32 bits.
+ Known = KnownBits::urem(Known.trunc(32), Known2.trunc(32));
+ // Restore the original width by sign extending.
+ Known = Known.sext(BitWidth);
+ break;
+ }
+ case RISCVISD::DIVUW: {
+ KnownBits Known2;
+ Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ // We only care about the lower 32 bits.
+ Known = KnownBits::udiv(Known.trunc(32), Known2.trunc(32));
+ // Restore the original width by sign extending.
+ Known = Known.sext(BitWidth);
+ break;
+ }
+ case RISCVISD::READ_VLENB:
+ // We assume VLENB is at least 8 bytes.
+ // FIXME: The 1.0 draft spec defines minimum VLEN as 128 bits.
+ Known.Zero.setLowBits(3);
+ break;
+ }
+}
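A small standalone model of the REMUW reasoning (plain C++, not the LLVM KnownBits API): the instruction reads only the low 32 bits of its operands and sign-extends a 32-bit result, so known bits can be computed at 32 bits and then sign-extended back to 64.

#include <cassert>
#include <cstdint>

// REMUW in miniature: unsigned remainder of the low 32 bits, sign-extended
// from bit 31 to 64 bits.
static uint64_t remuw(uint64_t a, uint64_t b) {
  uint32_t r = static_cast<uint32_t>(a) % static_cast<uint32_t>(b);
  return static_cast<uint64_t>(static_cast<int64_t>(static_cast<int32_t>(r)));
}

int main() {
  // If the low 32 bits of both inputs fit in 16 bits, the 32-bit remainder
  // does too; bit 31 is therefore zero and the sign-extension keeps bits
  // 16..63 known zero, which is what 32-bit KnownBits plus a sext recovers.
  for (uint64_t a : {0xAAAA00001234ull, 0xFFFFFFFF0000FFFFull})
    for (uint64_t b : {0x7ull, 0x1234ull})
      assert((remuw(a, b) >> 16) == 0);
  return 0;
}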
+
unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
unsigned Depth) const {
@@ -1173,10 +2425,25 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
case RISCVISD::DIVW:
case RISCVISD::DIVUW:
case RISCVISD::REMUW:
+ case RISCVISD::ROLW:
+ case RISCVISD::RORW:
+ case RISCVISD::GREVIW:
+ case RISCVISD::GORCIW:
+ case RISCVISD::FSLW:
+ case RISCVISD::FSRW:
// TODO: As the result is sign-extended, this is conservatively correct. A
// more precise answer could be calculated for SRAW depending on known
// bits in the shift amount.
return 33;
+ case RISCVISD::VMV_X_S:
+ // The number of sign bits of the scalar result is computed by obtaining the
+ // element type of the input vector operand, subtracting its width from the
+ // XLEN, and then adding one (sign bit within the element type). If the
+ // element type is wider than XLen, the least-significant XLEN bits are
+ // taken.
+ if (Op.getOperand(0).getScalarValueSizeInBits() > Subtarget.getXLen())
+ return 1;
+ return Subtarget.getXLen() - Op.getOperand(0).getScalarValueSizeInBits() + 1;
}
return 1;
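A numeric illustration of the VMV_X_S rule above, assuming RV64 (XLen = 64); the helper name is invented for the sketch.

#include <cassert>

static unsigned numSignBitsOfVMV_X_S(unsigned XLen, unsigned EltBits) {
  // Elements wider than XLen are truncated to their low XLen bits, so only
  // the usual single sign bit is known; otherwise the element is
  // sign-extended from its own width.
  if (EltBits > XLen)
    return 1;
  return XLen - EltBits + 1;
}

int main() {
  assert(numSignBitsOfVMV_X_S(64, 8) == 57);  // i8 element: 64 - 8 + 1
  assert(numSignBitsOfVMV_X_S(64, 16) == 49); // i16 element: 64 - 16 + 1
  assert(numSignBitsOfVMV_X_S(64, 64) == 1);  // i64 element: just the sign bit
  return 0;
}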
@@ -1260,17 +2527,19 @@ static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI,
TII.storeRegToStackSlot(*BB, MI, SrcReg, MI.getOperand(2).isKill(), FI, SrcRC,
RI);
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
- MachineMemOperand::MOLoad, 8, Align(8));
+ MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
+ MachineMemOperand *MMOLo =
+ MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 4, Align(8));
+ MachineMemOperand *MMOHi = MF.getMachineMemOperand(
+ MPI.getWithOffset(4), MachineMemOperand::MOLoad, 4, Align(8));
BuildMI(*BB, MI, DL, TII.get(RISCV::LW), LoReg)
.addFrameIndex(FI)
.addImm(0)
- .addMemOperand(MMO);
+ .addMemOperand(MMOLo);
BuildMI(*BB, MI, DL, TII.get(RISCV::LW), HiReg)
.addFrameIndex(FI)
.addImm(4)
- .addMemOperand(MMO);
+ .addMemOperand(MMOHi);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
@@ -1290,19 +2559,21 @@ static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI,
const TargetRegisterClass *DstRC = &RISCV::FPR64RegClass;
int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex(MF);
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
- MachineMemOperand::MOStore, 8, Align(8));
+ MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
+ MachineMemOperand *MMOLo =
+ MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, Align(8));
+ MachineMemOperand *MMOHi = MF.getMachineMemOperand(
+ MPI.getWithOffset(4), MachineMemOperand::MOStore, 4, Align(8));
BuildMI(*BB, MI, DL, TII.get(RISCV::SW))
.addReg(LoReg, getKillRegState(MI.getOperand(1).isKill()))
.addFrameIndex(FI)
.addImm(0)
- .addMemOperand(MMO);
+ .addMemOperand(MMOLo);
BuildMI(*BB, MI, DL, TII.get(RISCV::SW))
.addReg(HiReg, getKillRegState(MI.getOperand(2).isKill()))
.addFrameIndex(FI)
.addImm(4)
- .addMemOperand(MMO);
+ .addMemOperand(MMOHi);
TII.loadRegFromStackSlot(*BB, MI, DstReg, FI, DstRC, RI);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
@@ -1313,6 +2584,7 @@ static bool isSelectPseudo(MachineInstr &MI) {
default:
return false;
case RISCV::Select_GPR_Using_CC_GPR:
+ case RISCV::Select_FPR16_Using_CC_GPR:
case RISCV::Select_FPR32_Using_CC_GPR:
case RISCV::Select_FPR64_Using_CC_GPR:
return true;
@@ -1442,9 +2714,80 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
return TailMBB;
}
+static MachineBasicBlock *addVSetVL(MachineInstr &MI, MachineBasicBlock *BB,
+ int VLIndex, unsigned SEWIndex,
+ RISCVVLMUL VLMul, bool WritesElement0) {
+ MachineFunction &MF = *BB->getParent();
+ DebugLoc DL = MI.getDebugLoc();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+
+ unsigned SEW = MI.getOperand(SEWIndex).getImm();
+ assert(RISCVVType::isValidSEW(SEW) && "Unexpected SEW");
+ RISCVVSEW ElementWidth = static_cast<RISCVVSEW>(Log2_32(SEW / 8));
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // VL and VTYPE are alive here.
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII.get(RISCV::PseudoVSETVLI));
+
+ if (VLIndex >= 0) {
+ // Set VL (rs1 != X0).
+ Register DestReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ MIB.addReg(DestReg, RegState::Define | RegState::Dead)
+ .addReg(MI.getOperand(VLIndex).getReg());
+ } else
+ // With no VL operand in the pseudo, do not modify VL (rd = X0, rs1 = X0).
+ MIB.addReg(RISCV::X0, RegState::Define | RegState::Dead)
+ .addReg(RISCV::X0, RegState::Kill);
+
+ // Default to tail agnostic unless the destination is tied to a source. In
+ // that case the user would have some control over the tail values. The tail
+ // policy is also ignored on instructions that only update element 0, like
+ // vmv.s.x or reductions, so use agnostic there to match the common case.
+ // FIXME: This is conservatively correct, but we might want to detect that
+ // the input is undefined.
+ bool TailAgnostic = true;
+ unsigned UseOpIdx;
+ if (MI.isRegTiedToUseOperand(0, &UseOpIdx) && !WritesElement0) {
+ TailAgnostic = false;
+ // If the tied operand is an IMPLICIT_DEF we can keep TailAgnostic.
+ const MachineOperand &UseMO = MI.getOperand(UseOpIdx);
+ MachineInstr *UseMI = MRI.getVRegDef(UseMO.getReg());
+ if (UseMI && UseMI->isImplicitDef())
+ TailAgnostic = true;
+ }
+
+ // For simplicity we reuse the vtype representation here.
+ MIB.addImm(RISCVVType::encodeVTYPE(VLMul, ElementWidth,
+ /*TailAgnostic*/ TailAgnostic,
+ /*MaskAgnostic*/ false));
+
+ // Remove (now) redundant operands from pseudo
+ MI.getOperand(SEWIndex).setImm(-1);
+ if (VLIndex >= 0) {
+ MI.getOperand(VLIndex).setReg(RISCV::NoRegister);
+ MI.getOperand(VLIndex).setIsKill(false);
+ }
+
+ return BB;
+}
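A rough standalone illustration (not the RISCVVType helpers) of the two encodings used above, assuming the vtype layout of the V-extension draft this code targets: vlmul in bits 2:0, vsew in bits 5:3, vta in bit 6 and vma in bit 7.

#include <cassert>

static unsigned encodeSEW(unsigned SEWBits) { // 8 -> 0, 16 -> 1, 32 -> 2, ...
  unsigned Enc = 0;
  for (unsigned S = SEWBits / 8; S > 1; S >>= 1)
    ++Enc;
  return Enc;
}

static unsigned encodeVType(unsigned VLMulEnc, unsigned SEWBits,
                            bool TailAgnostic, bool MaskAgnostic) {
  return (unsigned(MaskAgnostic) << 7) | (unsigned(TailAgnostic) << 6) |
         (encodeSEW(SEWBits) << 3) | (VLMulEnc & 0x7);
}

int main() {
  assert(encodeSEW(8) == 0 && encodeSEW(32) == 2 && encodeSEW(64) == 3);
  // SEW=32, LMUL=1 (encoding 0), tail agnostic, mask undisturbed -> 0x50.
  assert(encodeVType(/*VLMulEnc=*/0, /*SEWBits=*/32, /*TailAgnostic=*/true,
                     /*MaskAgnostic=*/false) == 0x50);
  return 0;
}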
+
MachineBasicBlock *
RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
+ uint64_t TSFlags = MI.getDesc().TSFlags;
+
+ if (TSFlags & RISCVII::HasSEWOpMask) {
+ unsigned NumOperands = MI.getNumExplicitOperands();
+ int VLIndex = (TSFlags & RISCVII::HasVLOpMask) ? NumOperands - 2 : -1;
+ unsigned SEWIndex = NumOperands - 1;
+ bool WritesElement0 = TSFlags & RISCVII::WritesElement0Mask;
+
+ RISCVVLMUL VLMul = static_cast<RISCVVLMUL>((TSFlags & RISCVII::VLMulMask) >>
+ RISCVII::VLMulShift);
+ return addVSetVL(MI, BB, VLIndex, SEWIndex, VLMul, WritesElement0);
+ }
+
switch (MI.getOpcode()) {
default:
llvm_unreachable("Unexpected instr type to insert");
@@ -1453,6 +2796,7 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
"ReadCycleWrite is only to be used on riscv32");
return emitReadCycleWidePseudo(MI, BB);
case RISCV::Select_GPR_Using_CC_GPR:
+ case RISCV::Select_FPR16_Using_CC_GPR:
case RISCV::Select_FPR32_Using_CC_GPR:
case RISCV::Select_FPR64_Using_CC_GPR:
return emitSelectPseudo(MI, BB);
@@ -1492,6 +2836,10 @@ static const MCPhysReg ArgGPRs[] = {
RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13,
RISCV::X14, RISCV::X15, RISCV::X16, RISCV::X17
};
+static const MCPhysReg ArgFPR16s[] = {
+ RISCV::F10_H, RISCV::F11_H, RISCV::F12_H, RISCV::F13_H,
+ RISCV::F14_H, RISCV::F15_H, RISCV::F16_H, RISCV::F17_H
+};
static const MCPhysReg ArgFPR32s[] = {
RISCV::F10_F, RISCV::F11_F, RISCV::F12_F, RISCV::F13_F,
RISCV::F14_F, RISCV::F15_F, RISCV::F16_F, RISCV::F17_F
@@ -1500,6 +2848,17 @@ static const MCPhysReg ArgFPR64s[] = {
RISCV::F10_D, RISCV::F11_D, RISCV::F12_D, RISCV::F13_D,
RISCV::F14_D, RISCV::F15_D, RISCV::F16_D, RISCV::F17_D
};
+// This is an interim calling convention and it may be changed in the future.
+static const MCPhysReg ArgVRs[] = {
+ RISCV::V8, RISCV::V9, RISCV::V10, RISCV::V11, RISCV::V12, RISCV::V13,
+ RISCV::V14, RISCV::V15, RISCV::V16, RISCV::V17, RISCV::V18, RISCV::V19,
+ RISCV::V20, RISCV::V21, RISCV::V22, RISCV::V23};
+static const MCPhysReg ArgVRM2s[] = {RISCV::V8M2, RISCV::V10M2, RISCV::V12M2,
+ RISCV::V14M2, RISCV::V16M2, RISCV::V18M2,
+ RISCV::V20M2, RISCV::V22M2};
+static const MCPhysReg ArgVRM4s[] = {RISCV::V8M4, RISCV::V12M4, RISCV::V16M4,
+ RISCV::V20M4};
+static const MCPhysReg ArgVRM8s[] = {RISCV::V8M8, RISCV::V16M8};
// Pass a 2*XLEN argument that has been split into two XLEN values through
// registers or the stack as necessary.
@@ -1544,7 +2903,8 @@ static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1,
static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed,
- bool IsRet, Type *OrigTy) {
+ bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI,
+ Optional<unsigned> FirstMaskArgument) {
unsigned XLen = DL.getLargestLegalIntTypeSizeInBits();
assert(XLen == 32 || XLen == 64);
MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64;
@@ -1554,9 +2914,9 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
if (IsRet && ValNo > 1)
return true;
- // UseGPRForF32 if targeting one of the soft-float ABIs, if passing a
- // variadic argument, or if no F32 argument registers are available.
- bool UseGPRForF32 = true;
+ // UseGPRForF16_F32 if targeting one of the soft-float ABIs, if passing a
+ // variadic argument, or if no F16/F32 argument registers are available.
+ bool UseGPRForF16_F32 = true;
// UseGPRForF64 if targeting soft-float ABIs or an FLEN=32 ABI, if passing a
// variadic argument, or if no F64 argument registers are available.
bool UseGPRForF64 = true;
@@ -1569,24 +2929,26 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
break;
case RISCVABI::ABI_ILP32F:
case RISCVABI::ABI_LP64F:
- UseGPRForF32 = !IsFixed;
+ UseGPRForF16_F32 = !IsFixed;
break;
case RISCVABI::ABI_ILP32D:
case RISCVABI::ABI_LP64D:
- UseGPRForF32 = !IsFixed;
+ UseGPRForF16_F32 = !IsFixed;
UseGPRForF64 = !IsFixed;
break;
}
- if (State.getFirstUnallocated(ArgFPR32s) == array_lengthof(ArgFPR32s))
- UseGPRForF32 = true;
- if (State.getFirstUnallocated(ArgFPR64s) == array_lengthof(ArgFPR64s))
+ // FPR16, FPR32, and FPR64 alias each other.
+ if (State.getFirstUnallocated(ArgFPR32s) == array_lengthof(ArgFPR32s)) {
+ UseGPRForF16_F32 = true;
UseGPRForF64 = true;
+ }
- // From this point on, rely on UseGPRForF32, UseGPRForF64 and similar local
- // variables rather than directly checking against the target ABI.
+ // From this point on, rely on UseGPRForF16_F32, UseGPRForF64 and
+ // similar local variables rather than directly checking against the target
+ // ABI.
- if (UseGPRForF32 && ValVT == MVT::f32) {
+ if (UseGPRForF16_F32 && (ValVT == MVT::f16 || ValVT == MVT::f32)) {
LocVT = XLenVT;
LocInfo = CCValAssign::BCvt;
} else if (UseGPRForF64 && XLen == 64 && ValVT == MVT::f64) {
@@ -1669,11 +3031,40 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
// Allocate to a register if possible, or else a stack slot.
Register Reg;
- if (ValVT == MVT::f32 && !UseGPRForF32)
- Reg = State.AllocateReg(ArgFPR32s, ArgFPR64s);
+ if (ValVT == MVT::f16 && !UseGPRForF16_F32)
+ Reg = State.AllocateReg(ArgFPR16s);
+ else if (ValVT == MVT::f32 && !UseGPRForF16_F32)
+ Reg = State.AllocateReg(ArgFPR32s);
else if (ValVT == MVT::f64 && !UseGPRForF64)
- Reg = State.AllocateReg(ArgFPR64s, ArgFPR32s);
- else
+ Reg = State.AllocateReg(ArgFPR64s);
+ else if (ValVT.isScalableVector()) {
+ const TargetRegisterClass *RC = TLI.getRegClassFor(ValVT);
+ if (RC == &RISCV::VRRegClass) {
+ // Assign the first mask argument to V0.
+ // This is an interim calling convention and it may be changed in the
+ // future.
+ if (FirstMaskArgument.hasValue() &&
+ ValNo == FirstMaskArgument.getValue()) {
+ Reg = State.AllocateReg(RISCV::V0);
+ } else {
+ Reg = State.AllocateReg(ArgVRs);
+ }
+ } else if (RC == &RISCV::VRM2RegClass) {
+ Reg = State.AllocateReg(ArgVRM2s);
+ } else if (RC == &RISCV::VRM4RegClass) {
+ Reg = State.AllocateReg(ArgVRM4s);
+ } else if (RC == &RISCV::VRM8RegClass) {
+ Reg = State.AllocateReg(ArgVRM8s);
+ } else {
+ llvm_unreachable("Unhandled class register for ValueType");
+ }
+ if (!Reg) {
+ LocInfo = CCValAssign::Indirect;
+ // Try using a GPR to pass the address
+ Reg = State.AllocateReg(ArgGPRs);
+ LocVT = XLenVT;
+ }
+ } else
Reg = State.AllocateReg(ArgGPRs);
unsigned StackOffset =
Reg ? 0 : State.AllocateStack(XLen / 8, Align(XLen / 8));
@@ -1696,16 +3087,18 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
return false;
}
- assert((!UseGPRForF32 || !UseGPRForF64 || LocVT == XLenVT) &&
- "Expected an XLenVT at this stage");
+ assert((!UseGPRForF16_F32 || !UseGPRForF64 || LocVT == XLenVT ||
+ (TLI.getSubtarget().hasStdExtV() && ValVT.isScalableVector())) &&
+ "Expected an XLenVT or scalable vector types at this stage");
if (Reg) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
}
- // When an f32 or f64 is passed on the stack, no bit-conversion is needed.
- if (ValVT == MVT::f32 || ValVT == MVT::f64) {
+ // When a floating-point value is passed on the stack, no bit-conversion is
+ // needed.
+ if (ValVT.isFloatingPoint()) {
LocVT = ValVT;
LocInfo = CCValAssign::Full;
}
@@ -1713,12 +3106,27 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
return false;
}
+template <typename ArgTy>
+static Optional<unsigned> preAssignMask(const ArgTy &Args) {
+ for (const auto &ArgIdx : enumerate(Args)) {
+ MVT ArgVT = ArgIdx.value().VT;
+ if (ArgVT.isScalableVector() &&
+ ArgVT.getVectorElementType().SimpleTy == MVT::i1)
+ return ArgIdx.index();
+ }
+ return None;
+}
+
void RISCVTargetLowering::analyzeInputArgs(
MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet) const {
unsigned NumArgs = Ins.size();
FunctionType *FType = MF.getFunction().getFunctionType();
+ Optional<unsigned> FirstMaskArgument;
+ if (Subtarget.hasStdExtV())
+ FirstMaskArgument = preAssignMask(Ins);
+
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ArgVT = Ins[i].VT;
ISD::ArgFlagsTy ArgFlags = Ins[i].Flags;
@@ -1731,7 +3139,8 @@ void RISCVTargetLowering::analyzeInputArgs(
RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
if (CC_RISCV(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full,
- ArgFlags, CCInfo, /*IsFixed=*/true, IsRet, ArgTy)) {
+ ArgFlags, CCInfo, /*IsFixed=*/true, IsRet, ArgTy, *this,
+ FirstMaskArgument)) {
LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type "
<< EVT(ArgVT).getEVTString() << '\n');
llvm_unreachable(nullptr);
@@ -1745,6 +3154,10 @@ void RISCVTargetLowering::analyzeOutputArgs(
CallLoweringInfo *CLI) const {
unsigned NumArgs = Outs.size();
+ Optional<unsigned> FirstMaskArgument;
+ if (Subtarget.hasStdExtV())
+ FirstMaskArgument = preAssignMask(Outs);
+
for (unsigned i = 0; i != NumArgs; i++) {
MVT ArgVT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
@@ -1752,7 +3165,8 @@ void RISCVTargetLowering::analyzeOutputArgs(
RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
if (CC_RISCV(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full,
- ArgFlags, CCInfo, Outs[i].IsFixed, IsRet, OrigTy)) {
+ ArgFlags, CCInfo, Outs[i].IsFixed, IsRet, OrigTy, *this,
+ FirstMaskArgument)) {
LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type "
<< EVT(ArgVT).getEVTString() << "\n");
llvm_unreachable(nullptr);
@@ -1770,11 +3184,12 @@ static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val,
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
- if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) {
+ if (VA.getLocVT().isInteger() && VA.getValVT() == MVT::f16)
+ Val = DAG.getNode(RISCVISD::FMV_H_X, DL, MVT::f16, Val);
+ else if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32)
Val = DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, Val);
- break;
- }
- Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
+ else
+ Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
break;
}
return Val;
@@ -1783,28 +3198,13 @@ static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val,
// The caller is responsible for loading the full value if the argument is
// passed with CCValAssign::Indirect.
static SDValue unpackFromRegLoc(SelectionDAG &DAG, SDValue Chain,
- const CCValAssign &VA, const SDLoc &DL) {
+ const CCValAssign &VA, const SDLoc &DL,
+ const RISCVTargetLowering &TLI) {
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
EVT LocVT = VA.getLocVT();
SDValue Val;
- const TargetRegisterClass *RC;
-
- switch (LocVT.getSimpleVT().SimpleTy) {
- default:
- llvm_unreachable("Unexpected register type");
- case MVT::i32:
- case MVT::i64:
- RC = &RISCV::GPRRegClass;
- break;
- case MVT::f32:
- RC = &RISCV::FPR32RegClass;
- break;
- case MVT::f64:
- RC = &RISCV::FPR64RegClass;
- break;
- }
-
+ const TargetRegisterClass *RC = TLI.getRegClassFor(LocVT.getSimpleVT());
Register VReg = RegInfo.createVirtualRegister(RC);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
Val = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
@@ -1825,11 +3225,12 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val,
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
- if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) {
+ if (VA.getLocVT().isInteger() && VA.getValVT() == MVT::f16)
+ Val = DAG.getNode(RISCVISD::FMV_X_ANYEXTH, DL, VA.getLocVT(), Val);
+ else if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32)
Val = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Val);
- break;
- }
- Val = DAG.getNode(ISD::BITCAST, DL, LocVT, Val);
+ else
+ Val = DAG.getNode(ISD::BITCAST, DL, LocVT, Val);
break;
}
return Val;
@@ -1920,6 +3321,18 @@ static bool CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT,
}
}
+ if (LocVT == MVT::f16) {
+ static const MCPhysReg FPR16List[] = {
+ RISCV::F10_H, RISCV::F11_H, RISCV::F12_H, RISCV::F13_H, RISCV::F14_H,
+ RISCV::F15_H, RISCV::F16_H, RISCV::F17_H, RISCV::F0_H, RISCV::F1_H,
+ RISCV::F2_H, RISCV::F3_H, RISCV::F4_H, RISCV::F5_H, RISCV::F6_H,
+ RISCV::F7_H, RISCV::F28_H, RISCV::F29_H, RISCV::F30_H, RISCV::F31_H};
+ if (unsigned Reg = State.AllocateReg(FPR16List)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return false;
+ }
+ }
+
if (LocVT == MVT::f32) {
static const MCPhysReg FPR32List[] = {
RISCV::F10_F, RISCV::F11_F, RISCV::F12_F, RISCV::F13_F, RISCV::F14_F,
@@ -1959,22 +3372,71 @@ static bool CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT,
return true; // CC didn't match.
}
+static bool CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+
+ if (LocVT == MVT::i32 || LocVT == MVT::i64) {
+ // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, R7, SpLim
+ // s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11
+ static const MCPhysReg GPRList[] = {
+ RISCV::X9, RISCV::X18, RISCV::X19, RISCV::X20, RISCV::X21, RISCV::X22,
+ RISCV::X23, RISCV::X24, RISCV::X25, RISCV::X26, RISCV::X27};
+ if (unsigned Reg = State.AllocateReg(GPRList)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return false;
+ }
+ }
+
+ if (LocVT == MVT::f32) {
+ // Pass in STG registers: F1, ..., F6
+ // fs0 ... fs5
+ static const MCPhysReg FPR32List[] = {RISCV::F8_F, RISCV::F9_F,
+ RISCV::F18_F, RISCV::F19_F,
+ RISCV::F20_F, RISCV::F21_F};
+ if (unsigned Reg = State.AllocateReg(FPR32List)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return false;
+ }
+ }
+
+ if (LocVT == MVT::f64) {
+ // Pass in STG registers: D1, ..., D6
+ // fs6 ... fs11
+ static const MCPhysReg FPR64List[] = {RISCV::F22_D, RISCV::F23_D,
+ RISCV::F24_D, RISCV::F25_D,
+ RISCV::F26_D, RISCV::F27_D};
+ if (unsigned Reg = State.AllocateReg(FPR64List)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return false;
+ }
+ }
+
+ report_fatal_error("No registers left in GHC calling convention");
+ return true;
+}
+
// Transform physical registers into virtual registers.
SDValue RISCVTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+
switch (CallConv) {
default:
report_fatal_error("Unsupported calling convention");
case CallingConv::C:
case CallingConv::Fast:
break;
+ case CallingConv::GHC:
+ if (!MF.getSubtarget().getFeatureBits()[RISCV::FeatureStdExtF] ||
+ !MF.getSubtarget().getFeatureBits()[RISCV::FeatureStdExtD])
+ report_fatal_error(
+ "GHC calling convention requires the F and D instruction set extensions");
}
- MachineFunction &MF = DAG.getMachineFunction();
-
const Function &Func = MF.getFunction();
if (Func.hasFnAttribute("interrupt")) {
if (!Func.arg_empty())
@@ -2001,6 +3463,8 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
if (CallConv == CallingConv::Fast)
CCInfo.AnalyzeFormalArguments(Ins, CC_RISCV_FastCC);
+ else if (CallConv == CallingConv::GHC)
+ CCInfo.AnalyzeFormalArguments(Ins, CC_RISCV_GHC);
else
analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false);
@@ -2012,7 +3476,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64)
ArgValue = unpackF64OnRV32DSoftABI(DAG, Chain, VA, DL);
else if (VA.isRegLoc())
- ArgValue = unpackFromRegLoc(DAG, Chain, VA, DL);
+ ArgValue = unpackFromRegLoc(DAG, Chain, VA, DL, *this);
else
ArgValue = unpackFromMemLoc(DAG, Chain, VA, DL);
@@ -2201,6 +3665,8 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
if (CallConv == CallingConv::Fast)
ArgCCInfo.AnalyzeCallOperands(Outs, CC_RISCV_FastCC);
+ else if (CallConv == CallingConv::GHC)
+ ArgCCInfo.AnalyzeCallOperands(Outs, CC_RISCV_GHC);
else
analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI);
@@ -2458,12 +3924,18 @@ bool RISCVTargetLowering::CanLowerReturn(
const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
+
+ Optional<unsigned> FirstMaskArgument;
+ if (Subtarget.hasStdExtV())
+ FirstMaskArgument = preAssignMask(Outs);
+
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
MVT VT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
if (CC_RISCV(MF.getDataLayout(), ABI, i, VT, VT, CCValAssign::Full,
- ArgFlags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true, nullptr))
+ ArgFlags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true, nullptr,
+ *this, FirstMaskArgument))
return false;
}
return true;
@@ -2488,6 +3960,9 @@ RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
analyzeOutputArgs(DAG.getMachineFunction(), CCInfo, Outs, /*IsRet=*/true,
nullptr);
+ if (CallConv == CallingConv::GHC && !RVLocs.empty())
+ report_fatal_error("GHC functions return void only");
+
SDValue Glue;
SmallVector<SDValue, 4> RetOps(1, Chain);
@@ -2574,7 +4049,7 @@ void RISCVTargetLowering::validateCCReservedRegs(
const Function &F = MF.getFunction();
const RISCVSubtarget &STI = MF.getSubtarget<RISCVSubtarget>();
- if (std::any_of(std::begin(Regs), std::end(Regs), [&STI](auto Reg) {
+ if (llvm::any_of(Regs, [&STI](auto Reg) {
return STI.isRegisterReservedByUser(Reg.first);
}))
F.getContext().diagnose(DiagnosticInfoUnsupported{
@@ -2586,47 +4061,57 @@ bool RISCVTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
}
const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
+#define NODE_NAME_CASE(NODE) \
+ case RISCVISD::NODE: \
+ return "RISCVISD::" #NODE;
+ // clang-format off
switch ((RISCVISD::NodeType)Opcode) {
case RISCVISD::FIRST_NUMBER:
break;
- case RISCVISD::RET_FLAG:
- return "RISCVISD::RET_FLAG";
- case RISCVISD::URET_FLAG:
- return "RISCVISD::URET_FLAG";
- case RISCVISD::SRET_FLAG:
- return "RISCVISD::SRET_FLAG";
- case RISCVISD::MRET_FLAG:
- return "RISCVISD::MRET_FLAG";
- case RISCVISD::CALL:
- return "RISCVISD::CALL";
- case RISCVISD::SELECT_CC:
- return "RISCVISD::SELECT_CC";
- case RISCVISD::BuildPairF64:
- return "RISCVISD::BuildPairF64";
- case RISCVISD::SplitF64:
- return "RISCVISD::SplitF64";
- case RISCVISD::TAIL:
- return "RISCVISD::TAIL";
- case RISCVISD::SLLW:
- return "RISCVISD::SLLW";
- case RISCVISD::SRAW:
- return "RISCVISD::SRAW";
- case RISCVISD::SRLW:
- return "RISCVISD::SRLW";
- case RISCVISD::DIVW:
- return "RISCVISD::DIVW";
- case RISCVISD::DIVUW:
- return "RISCVISD::DIVUW";
- case RISCVISD::REMUW:
- return "RISCVISD::REMUW";
- case RISCVISD::FMV_W_X_RV64:
- return "RISCVISD::FMV_W_X_RV64";
- case RISCVISD::FMV_X_ANYEXTW_RV64:
- return "RISCVISD::FMV_X_ANYEXTW_RV64";
- case RISCVISD::READ_CYCLE_WIDE:
- return "RISCVISD::READ_CYCLE_WIDE";
+ NODE_NAME_CASE(RET_FLAG)
+ NODE_NAME_CASE(URET_FLAG)
+ NODE_NAME_CASE(SRET_FLAG)
+ NODE_NAME_CASE(MRET_FLAG)
+ NODE_NAME_CASE(CALL)
+ NODE_NAME_CASE(SELECT_CC)
+ NODE_NAME_CASE(BuildPairF64)
+ NODE_NAME_CASE(SplitF64)
+ NODE_NAME_CASE(TAIL)
+ NODE_NAME_CASE(SLLW)
+ NODE_NAME_CASE(SRAW)
+ NODE_NAME_CASE(SRLW)
+ NODE_NAME_CASE(DIVW)
+ NODE_NAME_CASE(DIVUW)
+ NODE_NAME_CASE(REMUW)
+ NODE_NAME_CASE(ROLW)
+ NODE_NAME_CASE(RORW)
+ NODE_NAME_CASE(FSLW)
+ NODE_NAME_CASE(FSRW)
+ NODE_NAME_CASE(FMV_H_X)
+ NODE_NAME_CASE(FMV_X_ANYEXTH)
+ NODE_NAME_CASE(FMV_W_X_RV64)
+ NODE_NAME_CASE(FMV_X_ANYEXTW_RV64)
+ NODE_NAME_CASE(READ_CYCLE_WIDE)
+ NODE_NAME_CASE(GREVI)
+ NODE_NAME_CASE(GREVIW)
+ NODE_NAME_CASE(GORCI)
+ NODE_NAME_CASE(GORCIW)
+ NODE_NAME_CASE(VMV_X_S)
+ NODE_NAME_CASE(SPLAT_VECTOR_I64)
+ NODE_NAME_CASE(READ_VLENB)
+ NODE_NAME_CASE(TRUNCATE_VECTOR)
+ NODE_NAME_CASE(VLEFF)
+ NODE_NAME_CASE(VLEFF_MASK)
+ NODE_NAME_CASE(VLSEGFF)
+ NODE_NAME_CASE(VLSEGFF_MASK)
+ NODE_NAME_CASE(READ_VL)
+ NODE_NAME_CASE(VSLIDEUP)
+ NODE_NAME_CASE(VSLIDEDOWN)
+ NODE_NAME_CASE(VID)
}
+ // clang-format on
return nullptr;
+#undef NODE_NAME_CASE
}
/// getConstraintType - Given a constraint letter, return the type of
@@ -2661,6 +4146,8 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case 'r':
return std::make_pair(0U, &RISCV::GPRRegClass);
case 'f':
+ if (Subtarget.hasStdExtZfh() && VT == MVT::f16)
+ return std::make_pair(0U, &RISCV::FPR16RegClass);
if (Subtarget.hasStdExtF() && VT == MVT::f32)
return std::make_pair(0U, &RISCV::FPR32RegClass);
if (Subtarget.hasStdExtD() && VT == MVT::f64)
@@ -2675,7 +4162,7 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// official names. However, other frontends like `rustc` do not. This allows
// users of these frontends to use the ABI names for registers in LLVM-style
// register constraints.
- Register XRegFromAlias = StringSwitch<Register>(Constraint.lower())
+ unsigned XRegFromAlias = StringSwitch<unsigned>(Constraint.lower())
.Case("{zero}", RISCV::X0)
.Case("{ra}", RISCV::X1)
.Case("{sp}", RISCV::X2)
@@ -2719,46 +4206,50 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
//
// The second case is the ABI name of the register, so that frontends can also
// use the ABI names in register constraint lists.
- if (Subtarget.hasStdExtF() || Subtarget.hasStdExtD()) {
- std::pair<Register, Register> FReg =
- StringSwitch<std::pair<Register, Register>>(Constraint.lower())
- .Cases("{f0}", "{ft0}", {RISCV::F0_F, RISCV::F0_D})
- .Cases("{f1}", "{ft1}", {RISCV::F1_F, RISCV::F1_D})
- .Cases("{f2}", "{ft2}", {RISCV::F2_F, RISCV::F2_D})
- .Cases("{f3}", "{ft3}", {RISCV::F3_F, RISCV::F3_D})
- .Cases("{f4}", "{ft4}", {RISCV::F4_F, RISCV::F4_D})
- .Cases("{f5}", "{ft5}", {RISCV::F5_F, RISCV::F5_D})
- .Cases("{f6}", "{ft6}", {RISCV::F6_F, RISCV::F6_D})
- .Cases("{f7}", "{ft7}", {RISCV::F7_F, RISCV::F7_D})
- .Cases("{f8}", "{fs0}", {RISCV::F8_F, RISCV::F8_D})
- .Cases("{f9}", "{fs1}", {RISCV::F9_F, RISCV::F9_D})
- .Cases("{f10}", "{fa0}", {RISCV::F10_F, RISCV::F10_D})
- .Cases("{f11}", "{fa1}", {RISCV::F11_F, RISCV::F11_D})
- .Cases("{f12}", "{fa2}", {RISCV::F12_F, RISCV::F12_D})
- .Cases("{f13}", "{fa3}", {RISCV::F13_F, RISCV::F13_D})
- .Cases("{f14}", "{fa4}", {RISCV::F14_F, RISCV::F14_D})
- .Cases("{f15}", "{fa5}", {RISCV::F15_F, RISCV::F15_D})
- .Cases("{f16}", "{fa6}", {RISCV::F16_F, RISCV::F16_D})
- .Cases("{f17}", "{fa7}", {RISCV::F17_F, RISCV::F17_D})
- .Cases("{f18}", "{fs2}", {RISCV::F18_F, RISCV::F18_D})
- .Cases("{f19}", "{fs3}", {RISCV::F19_F, RISCV::F19_D})
- .Cases("{f20}", "{fs4}", {RISCV::F20_F, RISCV::F20_D})
- .Cases("{f21}", "{fs5}", {RISCV::F21_F, RISCV::F21_D})
- .Cases("{f22}", "{fs6}", {RISCV::F22_F, RISCV::F22_D})
- .Cases("{f23}", "{fs7}", {RISCV::F23_F, RISCV::F23_D})
- .Cases("{f24}", "{fs8}", {RISCV::F24_F, RISCV::F24_D})
- .Cases("{f25}", "{fs9}", {RISCV::F25_F, RISCV::F25_D})
- .Cases("{f26}", "{fs10}", {RISCV::F26_F, RISCV::F26_D})
- .Cases("{f27}", "{fs11}", {RISCV::F27_F, RISCV::F27_D})
- .Cases("{f28}", "{ft8}", {RISCV::F28_F, RISCV::F28_D})
- .Cases("{f29}", "{ft9}", {RISCV::F29_F, RISCV::F29_D})
- .Cases("{f30}", "{ft10}", {RISCV::F30_F, RISCV::F30_D})
- .Cases("{f31}", "{ft11}", {RISCV::F31_F, RISCV::F31_D})
- .Default({RISCV::NoRegister, RISCV::NoRegister});
- if (FReg.first != RISCV::NoRegister)
- return Subtarget.hasStdExtD()
- ? std::make_pair(FReg.second, &RISCV::FPR64RegClass)
- : std::make_pair(FReg.first, &RISCV::FPR32RegClass);
+ if (Subtarget.hasStdExtF()) {
+ unsigned FReg = StringSwitch<unsigned>(Constraint.lower())
+ .Cases("{f0}", "{ft0}", RISCV::F0_F)
+ .Cases("{f1}", "{ft1}", RISCV::F1_F)
+ .Cases("{f2}", "{ft2}", RISCV::F2_F)
+ .Cases("{f3}", "{ft3}", RISCV::F3_F)
+ .Cases("{f4}", "{ft4}", RISCV::F4_F)
+ .Cases("{f5}", "{ft5}", RISCV::F5_F)
+ .Cases("{f6}", "{ft6}", RISCV::F6_F)
+ .Cases("{f7}", "{ft7}", RISCV::F7_F)
+ .Cases("{f8}", "{fs0}", RISCV::F8_F)
+ .Cases("{f9}", "{fs1}", RISCV::F9_F)
+ .Cases("{f10}", "{fa0}", RISCV::F10_F)
+ .Cases("{f11}", "{fa1}", RISCV::F11_F)
+ .Cases("{f12}", "{fa2}", RISCV::F12_F)
+ .Cases("{f13}", "{fa3}", RISCV::F13_F)
+ .Cases("{f14}", "{fa4}", RISCV::F14_F)
+ .Cases("{f15}", "{fa5}", RISCV::F15_F)
+ .Cases("{f16}", "{fa6}", RISCV::F16_F)
+ .Cases("{f17}", "{fa7}", RISCV::F17_F)
+ .Cases("{f18}", "{fs2}", RISCV::F18_F)
+ .Cases("{f19}", "{fs3}", RISCV::F19_F)
+ .Cases("{f20}", "{fs4}", RISCV::F20_F)
+ .Cases("{f21}", "{fs5}", RISCV::F21_F)
+ .Cases("{f22}", "{fs6}", RISCV::F22_F)
+ .Cases("{f23}", "{fs7}", RISCV::F23_F)
+ .Cases("{f24}", "{fs8}", RISCV::F24_F)
+ .Cases("{f25}", "{fs9}", RISCV::F25_F)
+ .Cases("{f26}", "{fs10}", RISCV::F26_F)
+ .Cases("{f27}", "{fs11}", RISCV::F27_F)
+ .Cases("{f28}", "{ft8}", RISCV::F28_F)
+ .Cases("{f29}", "{ft9}", RISCV::F29_F)
+ .Cases("{f30}", "{ft10}", RISCV::F30_F)
+ .Cases("{f31}", "{ft11}", RISCV::F31_F)
+ .Default(RISCV::NoRegister);
+ if (FReg != RISCV::NoRegister) {
+ assert(RISCV::F0_F <= FReg && FReg <= RISCV::F31_F && "Unknown fp-reg");
+ if (Subtarget.hasStdExtD()) {
+ unsigned RegNo = FReg - RISCV::F0_F;
+ unsigned DReg = RISCV::F0_D + RegNo;
+ return std::make_pair(DReg, &RISCV::FPR64RegClass);
+ }
+ return std::make_pair(FReg, &RISCV::FPR32RegClass);
+ }
}
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
@@ -2974,6 +4465,27 @@ Value *RISCVTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
return Result;
}
+bool RISCVTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+ EVT VT) const {
+ VT = VT.getScalarType();
+
+ if (!VT.isSimple())
+ return false;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::f16:
+ return Subtarget.hasStdExtZfh();
+ case MVT::f32:
+ return Subtarget.hasStdExtF();
+ case MVT::f64:
+ return Subtarget.hasStdExtD();
+ default:
+ break;
+ }
+
+ return false;
+}
+
Register RISCVTargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
return RISCV::X10;
@@ -2994,20 +4506,39 @@ bool RISCVTargetLowering::shouldExtendTypeInLibCall(EVT Type) const {
return true;
}
+bool RISCVTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const {
+ if (Subtarget.is64Bit() && Type == MVT::i32)
+ return true;
+
+ return IsSigned;
+}
+
bool RISCVTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
SDValue C) const {
// Check integral scalar types.
if (VT.isScalarInteger()) {
- // Do not perform the transformation on riscv32 with the M extension.
- if (!Subtarget.is64Bit() && Subtarget.hasStdExtM())
+ // Omit the optimization if the subtarget has the M extension and the data
+ // size exceeds XLen.
+ if (Subtarget.hasStdExtM() && VT.getSizeInBits() > Subtarget.getXLen())
return false;
if (auto *ConstNode = dyn_cast<ConstantSDNode>(C.getNode())) {
- if (ConstNode->getAPIntValue().getBitWidth() > 8 * sizeof(int64_t))
+ // Break the MUL into an SLLI and an ADD/SUB.
+ const APInt &Imm = ConstNode->getAPIntValue();
+ if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2() ||
+ (1 - Imm).isPowerOf2() || (-1 - Imm).isPowerOf2())
+ return true;
+ // Omit the following optimization if the subtarget has the M extension
+ // and the data size >= XLen.
+ if (Subtarget.hasStdExtM() && VT.getSizeInBits() >= Subtarget.getXLen())
return false;
- int64_t Imm = ConstNode->getSExtValue();
- if (isPowerOf2_64(Imm + 1) || isPowerOf2_64(Imm - 1) ||
- isPowerOf2_64(1 - Imm) || isPowerOf2_64(-1 - Imm))
+ // Break the MUL into two SLLI instructions and an ADD/SUB if materializing
+ // Imm would otherwise need a pair of LUI/ADDI.
+ if (!Imm.isSignedIntN(12) && Imm.countTrailingZeros() < 12) {
+ APInt ImmS = Imm.ashr(Imm.countTrailingZeros());
+ if ((ImmS + 1).isPowerOf2() || (ImmS - 1).isPowerOf2() ||
+ (1 - ImmS).isPowerOf2())
return true;
+ }
}
}
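A standalone sketch, on plain uint64_t values rather than SelectionDAG nodes, of the arithmetic identities behind this heuristic; the actual expansion happens in target-independent code once the hook returns true.

#include <cassert>
#include <cstdint>

int main() {
  for (uint64_t x : {1ull, 7ull, 0x123456789abcdefull}) {
    // Imm = 2^k + 1 (e.g. 9): one SLLI plus one ADD.
    assert(x * 9 == (x << 3) + x);
    // Imm = 2^k - 1 (e.g. 31): one SLLI plus one SUB.
    assert(x * 31 == (x << 5) - x);
    // Imm = 0x1800 = 3 << 11 would need a LUI/ADDI pair to materialize; two
    // SLLIs plus an ADD avoid that.
    assert(x * 0x1800 == (((x << 1) + x) << 11));
  }
  return 0;
}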
@@ -3032,3 +4563,19 @@ RISCVTargetLowering::getRegisterByName(const char *RegName, LLT VT,
StringRef(RegName) + "\"."));
return Reg;
}
+
+namespace llvm {
+namespace RISCVVIntrinsicsTable {
+
+#define GET_RISCVVIntrinsicsTable_IMPL
+#include "RISCVGenSearchableTables.inc"
+
+} // namespace RISCVVIntrinsicsTable
+
+namespace RISCVZvlssegTable {
+
+#define GET_RISCVZvlssegTable_IMPL
+#include "RISCVGenSearchableTables.inc"
+
+} // namespace RISCVZvlssegTable
+} // namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.h b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.h
index e420e879efc9..40b1a45c6d15 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -28,6 +28,12 @@ enum NodeType : unsigned {
SRET_FLAG,
MRET_FLAG,
CALL,
+ /// Select with condition operator - This selects between a true value and
+ /// a false value (ops #3 and #4) based on the boolean result of comparing
+ /// the lhs and rhs (ops #0 and #1) of a conditional expression with the
+ /// condition code in op #2, an XLenVT constant from the ISD::CondCode enum.
+ /// The lhs and rhs are XLenVT integers. The true and false values can be
+ /// integer or floating point.
SELECT_CC,
BuildPairF64,
SplitF64,
@@ -38,22 +44,75 @@ enum NodeType : unsigned {
SRAW,
SRLW,
// 32-bit operations from RV64M that can't be simply matched with a pattern
- // at instruction selection time.
+ // at instruction selection time. These have undefined behavior for division
+ // by 0 or overflow (divw) like their target independent counterparts.
DIVW,
DIVUW,
REMUW,
- // FPR32<->GPR transfer operations for RV64. Needed as an i32<->f32 bitcast
- // is not legal on RV64. FMV_W_X_RV64 matches the semantics of the FMV.W.X.
+ // RV64IB rotates, directly matching the semantics of the named RISC-V
+ // instructions.
+ ROLW,
+ RORW,
+ // RV64IB funnel shifts, with the semantics of the named RISC-V instructions,
+ // but the same operand order as fshl/fshr intrinsics.
+ FSRW,
+ FSLW,
+ // FPR<->GPR transfer operations when the FPR is smaller than XLEN, needed as
+ // XLEN is the only legal integer width.
+ //
+ // FMV_H_X matches the semantics of the FMV.H.X.
+ // FMV_X_ANYEXTH is similar to FMV.X.H but has an any-extended result.
+ // FMV_W_X_RV64 matches the semantics of the FMV.W.X.
// FMV_X_ANYEXTW_RV64 is similar to FMV.X.W but has an any-extended result.
+ //
// This is a more convenient semantic for producing dagcombines that remove
// unnecessary GPR->FPR->GPR moves.
+ FMV_H_X,
+ FMV_X_ANYEXTH,
FMV_W_X_RV64,
FMV_X_ANYEXTW_RV64,
// READ_CYCLE_WIDE - A read of the 64-bit cycle CSR on a 32-bit target
// (returns (Lo, Hi)). It takes a chain operand.
- READ_CYCLE_WIDE
+ READ_CYCLE_WIDE,
+ // Generalized Reverse and Generalized Or-Combine - directly matching the
+ // semantics of the named RISC-V instructions. Lowered as custom nodes as
+ // TableGen chokes when faced with commutative permutations in deeply-nested
+ // DAGs. Each node takes an input operand and a TargetConstant immediate
+ // shift amount, and outputs a bit-manipulated version of the input. All operands
+ // are of type XLenVT.
+ GREVI,
+ GREVIW,
+ GORCI,
+ GORCIW,
+ // Vector Extension
+ // VMV_X_S matches the semantics of vmv.x.s. The result is always XLenVT
+ // sign extended from the vector element size. NOTE: The result size will
+ // never be less than the vector element size.
+ VMV_X_S,
+ // Splats an i64 scalar to a vector type (with element type i64) where the
+ // scalar is a sign-extended i32.
+ SPLAT_VECTOR_I64,
+ // Read VLENB CSR
+ READ_VLENB,
+ // Truncates an RVV integer vector by one power-of-two.
+ TRUNCATE_VECTOR,
+ // Unit-stride fault-only-first load
+ VLEFF,
+ VLEFF_MASK,
+ // Unit-stride fault-only-first segment load
+ VLSEGFF,
+ VLSEGFF_MASK,
+ // Read VL CSR
+ READ_VL,
+ // Matches the semantics of vslideup/vslidedown. The first operand is the
+ // pass-thru operand, the second is the source vector, and the third is the
+ // XLenVT index (either constant or non-constant).
+ VSLIDEUP,
+ VSLIDEDOWN,
+ // Matches the semantics of the unmasked vid.v instruction.
+ VID,
};
-}
+} // namespace RISCVISD
class RISCVTargetLowering : public TargetLowering {
const RISCVSubtarget &Subtarget;
@@ -62,6 +121,8 @@ public:
explicit RISCVTargetLowering(const TargetMachine &TM,
const RISCVSubtarget &STI);
+ const RISCVSubtarget &getSubtarget() const { return Subtarget; }
+
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const override;
@@ -74,6 +135,8 @@ public:
bool isTruncateFree(EVT SrcVT, EVT DstVT) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override;
+ bool isCheapToSpeculateCttz() const override;
+ bool isCheapToSpeculateCtlz() const override;
bool isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const override;
@@ -86,6 +149,15 @@ public:
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+ bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits,
+ const APInt &DemandedElts,
+ TargetLoweringOpt &TLO) const override;
+
+ void computeKnownBitsForTargetNode(const SDValue Op,
+ KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth) const override;
unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
const APInt &DemandedElts,
const SelectionDAG &DAG,
@@ -126,6 +198,9 @@ public:
Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
+ bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+ EVT VT) const override;
+
ISD::NodeType getExtendForAtomicOps() const override {
return ISD::SIGN_EXTEND;
}
@@ -153,6 +228,7 @@ public:
getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
bool shouldExtendTypeInLibCall(EVT Type) const override;
+ bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const override;
/// Returns the register with the specified architectural or ABI name. This
/// method is necessary to lower the llvm.read_register.* and
@@ -220,6 +296,7 @@ private:
SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
@@ -227,7 +304,14 @@ private:
SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerShiftRightParts(SDValue Op, SelectionDAG &DAG, bool IsSRA) const;
+ SDValue lowerSPLATVECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVectorMaskExt(SDValue Op, SelectionDAG &DAG,
+ int64_t ExtTrueVal) const;
+ SDValue lowerVectorMaskTrunc(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
bool isEligibleForTailCallOptimization(
CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
@@ -239,6 +323,37 @@ private:
const SmallVectorImpl<std::pair<llvm::Register, llvm::SDValue>> &Regs,
MachineFunction &MF) const;
};
+
+namespace RISCVVIntrinsicsTable {
+
+struct RISCVVIntrinsicInfo {
+ unsigned int IntrinsicID;
+ unsigned int ExtendedOperand;
+};
+
+using namespace RISCV;
+
+#define GET_RISCVVIntrinsicsTable_DECL
+#include "RISCVGenSearchableTables.inc"
+
+} // end namespace RISCVVIntrinsicsTable
+
+namespace RISCVZvlssegTable {
+
+struct RISCVZvlsseg {
+ unsigned int IntrinsicID;
+ unsigned int SEW;
+ unsigned int LMUL;
+ unsigned int IndexLMUL;
+ unsigned int Pseudo;
+};
+
+using namespace RISCV;
+
+#define GET_RISCVZvlssegTable_DECL
+#include "RISCVGenSearchableTables.inc"
+
+} // namespace RISCVZvlssegTable
}
#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrFormats.td
index a47945a6a515..7be74b79d99b 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrFormats.td
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrFormats.td
@@ -49,18 +49,61 @@ def InstFormatCB : InstFormat<15>;
def InstFormatCJ : InstFormat<16>;
def InstFormatOther : InstFormat<17>;
-class RISCVVConstraint<bits<4> val> {
- bits<4> Value = val;
+class RISCVVConstraint<bits<3> val> {
+ bits<3> Value = val;
}
-def NoConstraint : RISCVVConstraint<0>;
-def WidenV : RISCVVConstraint<1>;
-def WidenW : RISCVVConstraint<2>;
-def WidenCvt : RISCVVConstraint<3>;
-def Narrow : RISCVVConstraint<4>;
-def Iota : RISCVVConstraint<5>;
-def SlideUp : RISCVVConstraint<6>;
-def Vrgather : RISCVVConstraint<7>;
-def Vcompress : RISCVVConstraint<8>;
+def NoConstraint : RISCVVConstraint<0b000>;
+def VS2Constraint : RISCVVConstraint<0b001>;
+def VS1Constraint : RISCVVConstraint<0b010>;
+def VMConstraint : RISCVVConstraint<0b100>;
+
+// Illegal instructions:
+//
+// * The destination vector register group for a masked vector instruction
+// cannot overlap the source mask register (v0), unless the destination vector
+// register is being written with a mask value (e.g., comparisons) or the
+// scalar result of a reduction.
+//
+// * Widening: The destination EEW is greater than the source EEW and the source
+// EMUL is at least 1. The destination vector register group cannot overlap
+// with the source vector register groups besides the highest-numbered part of
+// the destination register group.
+//
+// * Narrowing: The destination EEW is smaller than the source EEW. The
+// destination vector register group cannot overlap with the source vector
+// register groups besides the lowest-numbered part of the source register
+// group.
+//
+// * vmsbf.m/vmsif.m/vmsof.m: The destination register cannot overlap the
+// source register and, if masked, cannot overlap the mask register ('v0').
+//
+// * viota: The destination register cannot overlap the source register and,
+// if masked, cannot overlap the mask register ('v0').
+//
+// * v[f]slide[1]up: The destination vector register group cannot overlap the
+// source vector register group.
+//
+// * vrgather: The destination vector register group cannot overlap with the
+// source vector register groups.
+//
+// * vcompress: The destination vector register group cannot overlap the
+// source vector register group or the source mask register
+def WidenV : RISCVVConstraint<!or(VS2Constraint.Value,
+ VS1Constraint.Value,
+ VMConstraint.Value)>;
+def WidenW : RISCVVConstraint<!or(VS1Constraint.Value,
+ VMConstraint.Value)>;
+def WidenCvt : RISCVVConstraint<!or(VS2Constraint.Value,
+ VMConstraint.Value)>;
+def Iota : RISCVVConstraint<!or(VS2Constraint.Value,
+ VMConstraint.Value)>;
+def SlideUp : RISCVVConstraint<!or(VS2Constraint.Value,
+ VMConstraint.Value)>;
+def Vrgather : RISCVVConstraint<!or(VS2Constraint.Value,
+ VS1Constraint.Value,
+ VMConstraint.Value)>;
+def Vcompress : RISCVVConstraint<!or(VS2Constraint.Value,
+ VS1Constraint.Value)>;
// The following opcode names match those given in Table 19.1 in the
// RISC-V User-level ISA specification ("RISC-V base opcode map").
@@ -116,7 +159,25 @@ class RVInst<dag outs, dag ins, string opcodestr, string argstr,
// Defaults
RISCVVConstraint RVVConstraint = NoConstraint;
- let TSFlags{8-5} = RVVConstraint.Value;
+ let TSFlags{7-5} = RVVConstraint.Value;
+
+ bits<3> VLMul = 0;
+ let TSFlags{10-8} = VLMul;
+
+ bit HasDummyMask = 0;
+ let TSFlags{11} = HasDummyMask;
+
+ bit WritesElement0 = 0;
+ let TSFlags{12} = WritesElement0;
+
+ bit HasMergeOp = 0;
+ let TSFlags{13} = HasMergeOp;
+
+ bit HasSEWOp = 0;
+ let TSFlags{14} = HasSEWOp;
+
+ bit HasVLOp = 0;
+ let TSFlags{15} = HasVLOp;
}
// Pseudo instructions
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td
index e5f154966ba6..147993127e78 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td
@@ -21,20 +21,18 @@ def OPIVX : RISCVVFormat<0b100>;
def OPFVF : RISCVVFormat<0b101>;
def OPMVX : RISCVVFormat<0b110>;
-class RISCVMOP<bits<3> val> {
- bits<3> Value = val;
+class RISCVMOP<bits<2> val> {
+ bits<2> Value = val;
}
-def MOPLDUnitStrideU : RISCVMOP<0b000>;
-def MOPLDStridedU : RISCVMOP<0b010>;
-def MOPLDIndexedU : RISCVMOP<0b011>;
-def MOPLDUnitStrideS : RISCVMOP<0b100>;
-def MOPLDStridedS : RISCVMOP<0b110>;
-def MOPLDIndexedS : RISCVMOP<0b111>;
-
-def MOPSTUnitStride : RISCVMOP<0b000>;
-def MOPSTStrided : RISCVMOP<0b010>;
-def MOPSTIndexedOrder: RISCVMOP<0b011>;
-def MOPSTIndexedUnOrd: RISCVMOP<0b111>;
+def MOPLDUnitStride : RISCVMOP<0b00>;
+def MOPLDIndexedUnord : RISCVMOP<0b01>;
+def MOPLDStrided : RISCVMOP<0b10>;
+def MOPLDIndexedOrder : RISCVMOP<0b11>;
+
+def MOPSTUnitStride : RISCVMOP<0b00>;
+def MOPSTIndexedUnord : RISCVMOP<0b01>;
+def MOPSTStrided : RISCVMOP<0b10>;
+def MOPSTIndexedOrder : RISCVMOP<0b11>;
class RISCVLSUMOP<bits<5> val> {
bits<5> Value = val;
@@ -45,13 +43,30 @@ def LUMOPUnitStrideFF: RISCVLSUMOP<0b10000>;
def SUMOPUnitStride : RISCVLSUMOP<0b00000>;
def SUMOPUnitStrideWholeReg : RISCVLSUMOP<0b01000>;
-class RISCVWidth<bits<3> val> {
- bits<3> Value = val;
+class RISCVAMOOP<bits<5> val> {
+ bits<5> Value = val;
+}
+def AMOOPVamoSwap : RISCVAMOOP<0b00001>;
+def AMOOPVamoAdd : RISCVAMOOP<0b00000>;
+def AMOOPVamoXor : RISCVAMOOP<0b00100>;
+def AMOOPVamoAnd : RISCVAMOOP<0b01100>;
+def AMOOPVamoOr : RISCVAMOOP<0b01000>;
+def AMOOPVamoMin : RISCVAMOOP<0b10000>;
+def AMOOPVamoMax : RISCVAMOOP<0b10100>;
+def AMOOPVamoMinu : RISCVAMOOP<0b11000>;
+def AMOOPVamoMaxu : RISCVAMOOP<0b11100>;
+
+class RISCVWidth<bits<4> val> {
+ bits<4> Value = val;
}
-def LSWidthVByte : RISCVWidth<0b000>;
-def LSWidthVHalf : RISCVWidth<0b101>;
-def LSWidthVWord : RISCVWidth<0b110>;
-def LSWidthVSEW : RISCVWidth<0b111>;
+def LSWidth8 : RISCVWidth<0b0000>;
+def LSWidth16 : RISCVWidth<0b0101>;
+def LSWidth32 : RISCVWidth<0b0110>;
+def LSWidth64 : RISCVWidth<0b0111>;
+def LSWidth128 : RISCVWidth<0b1000>;
+def LSWidth256 : RISCVWidth<0b1101>;
+def LSWidth512 : RISCVWidth<0b1110>;
+def LSWidth1024 : RISCVWidth<0b1111>;
class RVInstSetVLi<dag outs, dag ins, string opcodestr, string argstr>
: RVInst<outs, ins, opcodestr, argstr, [], InstFormatI> {
@@ -103,6 +118,7 @@ class RVInstVV<bits<6> funct6, RISCVVFormat opv, dag outs, dag ins,
let Opcode = OPC_OP_V.Value;
let Uses = [VTYPE, VL];
+ let RVVConstraint = VMConstraint;
}
class RVInstVX<bits<6> funct6, RISCVVFormat opv, dag outs, dag ins,
@@ -122,6 +138,7 @@ class RVInstVX<bits<6> funct6, RISCVVFormat opv, dag outs, dag ins,
let Opcode = OPC_OP_V.Value;
let Uses = [VTYPE, VL];
+ let RVVConstraint = VMConstraint;
}
class RVInstV2<bits<6> funct6, bits<5> vs2, RISCVVFormat opv, dag outs, dag ins,
@@ -140,6 +157,7 @@ class RVInstV2<bits<6> funct6, bits<5> vs2, RISCVVFormat opv, dag outs, dag ins,
let Opcode = OPC_OP_V.Value;
let Uses = [VTYPE, VL];
+ let RVVConstraint = VMConstraint;
}
class RVInstIVI<bits<6> funct6, dag outs, dag ins, string opcodestr,
@@ -159,6 +177,7 @@ class RVInstIVI<bits<6> funct6, dag outs, dag ins, string opcodestr,
let Opcode = OPC_OP_V.Value;
let Uses = [VTYPE, VL];
+ let RVVConstraint = VMConstraint;
}
class RVInstV<bits<6> funct6, bits<5> vs1, RISCVVFormat opv, dag outs,
@@ -177,10 +196,11 @@ class RVInstV<bits<6> funct6, bits<5> vs1, RISCVVFormat opv, dag outs,
let Opcode = OPC_OP_V.Value;
let Uses = [VTYPE, VL];
+ let RVVConstraint = VMConstraint;
}
-class RVInstVLU<bits<3> nf, RISCVMOP mop, RISCVLSUMOP lumop,
- RISCVWidth width, dag outs, dag ins, string opcodestr,
+class RVInstVLU<bits<3> nf, bit mew, RISCVLSUMOP lumop,
+ bits<3> width, dag outs, dag ins, string opcodestr,
string argstr>
: RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> {
bits<5> rs1;
@@ -188,18 +208,20 @@ class RVInstVLU<bits<3> nf, RISCVMOP mop, RISCVLSUMOP lumop,
bit vm;
let Inst{31-29} = nf;
- let Inst{28-26} = mop.Value;
+ let Inst{28} = mew;
+ let Inst{27-26} = MOPLDUnitStride.Value;
let Inst{25} = vm;
let Inst{24-20} = lumop.Value;
let Inst{19-15} = rs1;
- let Inst{14-12} = width.Value;
+ let Inst{14-12} = width;
let Inst{11-7} = vd;
let Opcode = OPC_LOAD_FP.Value;
let Uses = [VTYPE, VL];
+ let RVVConstraint = VMConstraint;
}
-class RVInstVLS<bits<3> nf, RISCVMOP mop, RISCVWidth width,
+class RVInstVLS<bits<3> nf, bit mew, bits<3> width,
dag outs, dag ins, string opcodestr, string argstr>
: RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> {
bits<5> rs2;
@@ -208,18 +230,20 @@ class RVInstVLS<bits<3> nf, RISCVMOP mop, RISCVWidth width,
bit vm;
let Inst{31-29} = nf;
- let Inst{28-26} = mop.Value;
+ let Inst{28} = mew;
+ let Inst{27-26} = MOPLDStrided.Value;
let Inst{25} = vm;
let Inst{24-20} = rs2;
let Inst{19-15} = rs1;
- let Inst{14-12} = width.Value;
+ let Inst{14-12} = width;
let Inst{11-7} = vd;
let Opcode = OPC_LOAD_FP.Value;
let Uses = [VTYPE, VL];
+ let RVVConstraint = VMConstraint;
}
-class RVInstVLX<bits<3> nf, RISCVMOP mop, RISCVWidth width,
+class RVInstVLX<bits<3> nf, bit mew, RISCVMOP mop, bits<3> width,
dag outs, dag ins, string opcodestr, string argstr>
: RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> {
bits<5> vs2;
@@ -228,19 +252,21 @@ class RVInstVLX<bits<3> nf, RISCVMOP mop, RISCVWidth width,
bit vm;
let Inst{31-29} = nf;
- let Inst{28-26} = mop.Value;
+ let Inst{28} = mew;
+ let Inst{27-26} = mop.Value;
let Inst{25} = vm;
let Inst{24-20} = vs2;
let Inst{19-15} = rs1;
- let Inst{14-12} = width.Value;
+ let Inst{14-12} = width;
let Inst{11-7} = vd;
let Opcode = OPC_LOAD_FP.Value;
let Uses = [VTYPE, VL];
+ let RVVConstraint = VMConstraint;
}
-class RVInstVSU<bits<3> nf, RISCVMOP mop, RISCVLSUMOP sumop,
- RISCVWidth width, dag outs, dag ins, string opcodestr,
+class RVInstVSU<bits<3> nf, bit mew, RISCVLSUMOP sumop,
+ bits<3> width, dag outs, dag ins, string opcodestr,
string argstr>
: RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> {
bits<5> rs1;
@@ -248,18 +274,19 @@ class RVInstVSU<bits<3> nf, RISCVMOP mop, RISCVLSUMOP sumop,
bit vm;
let Inst{31-29} = nf;
- let Inst{28-26} = mop.Value;
+ let Inst{28} = mew;
+ let Inst{27-26} = MOPSTUnitStride.Value;
let Inst{25} = vm;
let Inst{24-20} = sumop.Value;
let Inst{19-15} = rs1;
- let Inst{14-12} = width.Value;
+ let Inst{14-12} = width;
let Inst{11-7} = vs3;
let Opcode = OPC_STORE_FP.Value;
let Uses = [VTYPE, VL];
}
-class RVInstVSS<bits<3> nf, RISCVMOP mop, RISCVWidth width,
+class RVInstVSS<bits<3> nf, bit mew, bits<3> width,
dag outs, dag ins, string opcodestr, string argstr>
: RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> {
bits<5> rs2;
@@ -268,18 +295,19 @@ class RVInstVSS<bits<3> nf, RISCVMOP mop, RISCVWidth width,
bit vm;
let Inst{31-29} = nf;
- let Inst{28-26} = mop.Value;
+ let Inst{28} = mew;
+ let Inst{27-26} = MOPSTStrided.Value;
let Inst{25} = vm;
let Inst{24-20} = rs2;
let Inst{19-15} = rs1;
- let Inst{14-12} = width.Value;
+ let Inst{14-12} = width;
let Inst{11-7} = vs3;
let Opcode = OPC_STORE_FP.Value;
let Uses = [VTYPE, VL];
}
-class RVInstVSX<bits<3> nf, RISCVMOP mop, RISCVWidth width,
+class RVInstVSX<bits<3> nf, bit mew, RISCVMOP mop, bits<3> width,
dag outs, dag ins, string opcodestr, string argstr>
: RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> {
bits<5> vs2;
@@ -288,13 +316,33 @@ class RVInstVSX<bits<3> nf, RISCVMOP mop, RISCVWidth width,
bit vm;
let Inst{31-29} = nf;
- let Inst{28-26} = mop.Value;
+ let Inst{28} = mew;
+ let Inst{27-26} = mop.Value;
let Inst{25} = vm;
let Inst{24-20} = vs2;
let Inst{19-15} = rs1;
- let Inst{14-12} = width.Value;
+ let Inst{14-12} = width;
let Inst{11-7} = vs3;
let Opcode = OPC_STORE_FP.Value;
let Uses = [VTYPE, VL];
}
+
+class RVInstVAMO<RISCVAMOOP amoop, bits<3> width, dag outs,
+ dag ins, string opcodestr, string argstr>
+ : RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> {
+ bits<5> vs2;
+ bits<5> rs1;
+ bit wd;
+ bit vm;
+
+ let Inst{31-27} = amoop.Value;
+ let Inst{26} = wd;
+ let Inst{25} = vm;
+ let Inst{24-20} = vs2;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = width;
+ let Opcode = OPC_AMO.Value;
+
+ let Uses = [VTYPE, VL];
+}
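Taken together, the vector load/store formats above pack nf, mew, mop, vm, a 5-bit lumop/sumop/rs2/vs2 field, rs1, width and vd/vs3 into fixed bit ranges of the 32-bit word. A host-side sketch of that packing; the 7-bit major opcode sitting in bits 6-0 and the LOAD-FP value passed in main() are assumptions based on the base opcode map rather than anything shown in this hunk.

#include <cassert>
#include <cstdint>

// Field packing used by RVInstVLU and friends above: nf in 31-29, mew in 28,
// mop in 27-26, vm in 25, lumop/rs2/vs2 in 24-20, rs1 in 19-15, width in
// 14-12, vd/vs3 in 11-7, and (assumed) the major opcode in 6-0.
static uint32_t encodeVMem(unsigned Nf, unsigned Mew, unsigned Mop, unsigned Vm,
                           unsigned Field24_20, unsigned Rs1, unsigned Width,
                           unsigned Field11_7, unsigned Opcode) {
  return (Nf & 0x7) << 29 | (Mew & 0x1) << 28 | (Mop & 0x3) << 26 |
         (Vm & 0x1) << 25 | (Field24_20 & 0x1f) << 20 | (Rs1 & 0x1f) << 15 |
         (Width & 0x7) << 12 | (Field11_7 & 0x1f) << 7 | (Opcode & 0x7f);
}

int main() {
  // A unit-stride load: mop = MOPLDUnitStride (0b00), lumop = 0b00000,
  // 32-bit elements (width = 0b110), arbitrary register numbers, and the
  // LOAD-FP major opcode (0x07, an assumption here).
  uint32_t Inst = encodeVMem(/*Nf=*/0, /*Mew=*/0, /*Mop=*/0b00, /*Vm=*/1,
                             /*lumop=*/0b00000, /*Rs1=*/10, /*Width=*/0b110,
                             /*Vd=*/8, /*Opcode=*/0x07);
  assert(((Inst >> 26) & 0x3) == 0b00);   // mop field round-trips
  assert(((Inst >> 12) & 0x7) == 0b110);  // width field round-trips
  return 0;
}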
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 7b6ea002c7b7..45a5e10e26a3 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -11,10 +11,10 @@
//===----------------------------------------------------------------------===//
#include "RISCVInstrInfo.h"
+#include "MCTargetDesc/RISCVMatInt.h"
#include "RISCV.h"
#include "RISCVSubtarget.h"
#include "RISCVTargetMachine.h"
-#include "Utils/RISCVMatInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -45,6 +45,7 @@ unsigned RISCVInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
case RISCV::LBU:
case RISCV::LH:
case RISCV::LHU:
+ case RISCV::FLH:
case RISCV::LW:
case RISCV::FLW:
case RISCV::LWU:
@@ -70,6 +71,7 @@ unsigned RISCVInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
case RISCV::SB:
case RISCV::SH:
case RISCV::SW:
+ case RISCV::FSH:
case RISCV::FSW:
case RISCV::SD:
case RISCV::FSD:
@@ -96,18 +98,37 @@ void RISCVInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- // FPR->FPR copies
+ // FPR->FPR copies and VR->VR copies.
unsigned Opc;
- if (RISCV::FPR32RegClass.contains(DstReg, SrcReg))
+ bool IsScalableVector = false;
+ if (RISCV::FPR16RegClass.contains(DstReg, SrcReg))
+ Opc = RISCV::FSGNJ_H;
+ else if (RISCV::FPR32RegClass.contains(DstReg, SrcReg))
Opc = RISCV::FSGNJ_S;
else if (RISCV::FPR64RegClass.contains(DstReg, SrcReg))
Opc = RISCV::FSGNJ_D;
- else
+ else if (RISCV::VRRegClass.contains(DstReg, SrcReg)) {
+ Opc = RISCV::PseudoVMV1R_V;
+ IsScalableVector = true;
+ } else if (RISCV::VRM2RegClass.contains(DstReg, SrcReg)) {
+ Opc = RISCV::PseudoVMV2R_V;
+ IsScalableVector = true;
+ } else if (RISCV::VRM4RegClass.contains(DstReg, SrcReg)) {
+ Opc = RISCV::PseudoVMV4R_V;
+ IsScalableVector = true;
+ } else if (RISCV::VRM8RegClass.contains(DstReg, SrcReg)) {
+ Opc = RISCV::PseudoVMV8R_V;
+ IsScalableVector = true;
+ } else
llvm_unreachable("Impossible reg-to-reg copy");
- BuildMI(MBB, MBBI, DL, get(Opc), DstReg)
- .addReg(SrcReg, getKillRegState(KillSrc))
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (IsScalableVector)
+ BuildMI(MBB, MBBI, DL, get(Opc), DstReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else
+ BuildMI(MBB, MBBI, DL, get(Opc), DstReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addReg(SrcReg, getKillRegState(KillSrc));
}
void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -119,11 +140,18 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
if (I != MBB.end())
DL = I->getDebugLoc();
- unsigned Opcode;
+ MachineFunction *MF = MBB.getParent();
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore,
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
+ unsigned Opcode;
if (RISCV::GPRRegClass.hasSubClassEq(RC))
Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ?
RISCV::SW : RISCV::SD;
+ else if (RISCV::FPR16RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::FSH;
else if (RISCV::FPR32RegClass.hasSubClassEq(RC))
Opcode = RISCV::FSW;
else if (RISCV::FPR64RegClass.hasSubClassEq(RC))
@@ -134,7 +162,8 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, get(Opcode))
.addReg(SrcReg, getKillRegState(IsKill))
.addFrameIndex(FI)
- .addImm(0);
+ .addImm(0)
+ .addMemOperand(MMO);
}
void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
@@ -146,11 +175,18 @@ void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
if (I != MBB.end())
DL = I->getDebugLoc();
- unsigned Opcode;
+ MachineFunction *MF = MBB.getParent();
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad,
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
+ unsigned Opcode;
if (RISCV::GPRRegClass.hasSubClassEq(RC))
Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ?
RISCV::LW : RISCV::LD;
+ else if (RISCV::FPR16RegClass.hasSubClassEq(RC))
+ Opcode = RISCV::FLH;
else if (RISCV::FPR32RegClass.hasSubClassEq(RC))
Opcode = RISCV::FLW;
else if (RISCV::FPR64RegClass.hasSubClassEq(RC))
@@ -158,7 +194,10 @@ void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
else
llvm_unreachable("Can't load this register from stack slot");
- BuildMI(MBB, I, DL, get(Opcode), DstReg).addFrameIndex(FI).addImm(0);
+ BuildMI(MBB, I, DL, get(Opcode), DstReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO);
}
void RISCVInstrInfo::movImm(MachineBasicBlock &MBB,
@@ -512,17 +551,48 @@ unsigned RISCVInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
bool RISCVInstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
const unsigned Opcode = MI.getOpcode();
- switch(Opcode) {
- default:
- break;
- case RISCV::ADDI:
- case RISCV::ORI:
- case RISCV::XORI:
- return (MI.getOperand(1).isReg() && MI.getOperand(1).getReg() == RISCV::X0);
+ switch (Opcode) {
+ default:
+ break;
+ case RISCV::FSGNJ_D:
+ case RISCV::FSGNJ_S:
+ // The canonical floating-point move is fsgnj rd, rs, rs.
+ return MI.getOperand(1).isReg() && MI.getOperand(2).isReg() &&
+ MI.getOperand(1).getReg() == MI.getOperand(2).getReg();
+ case RISCV::ADDI:
+ case RISCV::ORI:
+ case RISCV::XORI:
+ return (MI.getOperand(1).isReg() &&
+ MI.getOperand(1).getReg() == RISCV::X0) ||
+ (MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0);
}
return MI.isAsCheapAsAMove();
}
+Optional<DestSourcePair>
+RISCVInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
+ if (MI.isMoveReg())
+ return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case RISCV::ADDI:
+ // Operand 1 can be a frameindex but callers expect registers
+ if (MI.getOperand(1).isReg() && MI.getOperand(2).isImm() &&
+ MI.getOperand(2).getImm() == 0)
+ return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
+ break;
+ case RISCV::FSGNJ_D:
+ case RISCV::FSGNJ_S:
+ // The canonical floating-point move is fsgnj rd, rs, rs.
+ if (MI.getOperand(1).isReg() && MI.getOperand(2).isReg() &&
+ MI.getOperand(1).getReg() == MI.getOperand(2).getReg())
+ return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
+ break;
+ }
+ return None;
+}
+
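Both additions above (isAsCheapAsAMove and isCopyInstrImpl) lean on the fact that fsgnj rd, rs, rs, i.e. the magnitude of rs combined with the sign of rs itself, reproduces rs exactly, which is why it is treated as the canonical floating-point move. A quick host-side check of that identity using std::copysign, comparing raw bits since fsgnj is a pure bit operation (sketch only):

#include <cassert>
#include <cmath>
#include <cstring>

static bool sameBits(double A, double B) {
  return std::memcmp(&A, &B, sizeof(double)) == 0;
}

int main() {
  const double Samples[] = {0.0, -0.0, 1.5, -2.25, 1e300, -1e-300};
  for (double V : Samples)
    assert(sameBits(std::copysign(V, V), V)); // copysign(x, x) == x, bit for bit
  return 0;
}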
bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const {
const MCInstrInfo *MCII = STI.getInstrInfo();
@@ -551,15 +621,9 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI,
case RISCVOp::OPERAND_SIMM12:
Ok = isInt<12>(Imm);
break;
- case RISCVOp::OPERAND_SIMM13_LSB0:
- Ok = isShiftedInt<12, 1>(Imm);
- break;
case RISCVOp::OPERAND_UIMM20:
Ok = isUInt<20>(Imm);
break;
- case RISCVOp::OPERAND_SIMM21_LSB0:
- Ok = isShiftedInt<20, 1>(Imm);
- break;
case RISCVOp::OPERAND_UIMMLOG2XLEN:
if (STI.getTargetTriple().isArch64Bit())
Ok = isUInt<6>(Imm);
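The remaining operand checks rely on the llvm::isInt/isUInt/isShiftedInt templates; a standalone sketch of their semantics using the immediate kinds that appear around this switch (the helper names below are local stand-ins, not the LLVM ones):

#include <cassert>
#include <cstdint>

static bool isIntN(unsigned N, int64_t X) {
  return X >= -(INT64_C(1) << (N - 1)) && X < (INT64_C(1) << (N - 1));
}
static bool isUIntN(unsigned N, int64_t X) {
  return X >= 0 && X < (INT64_C(1) << N);
}
// isShiftedInt<N, S>: a signed N-bit value scaled by 2^S (low S bits clear).
static bool isShiftedIntN(unsigned N, unsigned S, int64_t X) {
  return isIntN(N + S, X) && (X % (INT64_C(1) << S)) == 0;
}

int main() {
  assert(isIntN(12, 2047) && !isIntN(12, 2048));            // OPERAND_SIMM12
  assert(isUIntN(20, 0xFFFFF) && !isUIntN(20, 0x100000));   // OPERAND_UIMM20
  // Branch offsets (cf. isShiftedInt<12, 1> in simm13_lsb0's predicate):
  // 13-bit signed with the low bit clear.
  assert(isShiftedIntN(12, 1, 4094) && !isShiftedIntN(12, 1, 4095));
  return 0;
}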
@@ -699,10 +763,7 @@ outliner::OutlinedFunction RISCVInstrInfo::getOutliningCandidateInfo(
return !LRU.available(RISCV::X5);
};
- RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
- RepeatedSequenceLocs.end(),
- CannotInsertCall),
- RepeatedSequenceLocs.end());
+ llvm::erase_if(RepeatedSequenceLocs, CannotInsertCall);
// If the sequence doesn't have enough candidates left, then we're done.
if (RepeatedSequenceLocs.size() < 2)
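llvm::erase_if is shorthand for the erase(remove_if(...)) idiom it replaces in the hunk above; a minimal standalone equivalent over std::vector, with a local eraseIf helper standing in for the LLVM function:

#include <algorithm>
#include <cassert>
#include <vector>

template <typename Container, typename Pred>
static void eraseIf(Container &C, Pred P) {
  C.erase(std::remove_if(C.begin(), C.end(), P), C.end());
}

int main() {
  std::vector<int> Candidates = {1, 2, 3, 4, 5, 6};
  eraseIf(Candidates, [](int X) { return X % 2 == 0; }); // drop unsuitable entries
  assert((Candidates == std::vector<int>{1, 3, 5}));
  return 0;
}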
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 21bc508cdc9c..0b034210aa55 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -83,6 +83,9 @@ public:
bool isAsCheapAsAMove(const MachineInstr &MI) const override;
+ Optional<DestSourcePair>
+ isCopyInstrImpl(const MachineInstr &MI) const override;
+
bool verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const override;
@@ -134,23 +137,5 @@ protected:
const RISCVSubtarget &STI;
};
-namespace RISCV {
-// Match with the definitions in RISCVInstrFormatsV.td
-enum RVVConstraintType {
- NoConstraint = 0,
- WidenV = 1,
- WidenW = 2,
- WidenCvt = 3,
- Narrow = 4,
- Iota = 5,
- SlideUp = 6,
- Vrgather = 7,
- Vcompress = 8,
-
- ConstraintOffset = 5,
- ConstraintMask = 0b1111
-};
-} // end namespace RISCV
-
} // end namespace llvm
#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 8547f791092b..a07b589e77fb 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -25,6 +25,8 @@ def SDT_RISCVCall : SDTypeProfile<0, -1, [SDTCisVT<0, XLenVT>]>;
def SDT_RISCVSelectCC : SDTypeProfile<1, 5, [SDTCisSameAs<1, 2>,
SDTCisSameAs<0, 4>,
SDTCisSameAs<4, 5>]>;
+def SDT_RISCVReadCycleWide : SDTypeProfile<2, 0, [SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
// Target-independent nodes, but with target-specific formats.
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_CallSeqStart,
@@ -44,8 +46,7 @@ def riscv_sret_flag : SDNode<"RISCVISD::SRET_FLAG", SDTNone,
[SDNPHasChain, SDNPOptInGlue]>;
def riscv_mret_flag : SDNode<"RISCVISD::MRET_FLAG", SDTNone,
[SDNPHasChain, SDNPOptInGlue]>;
-def riscv_selectcc : SDNode<"RISCVISD::SELECT_CC", SDT_RISCVSelectCC,
- [SDNPInGlue]>;
+def riscv_selectcc : SDNode<"RISCVISD::SELECT_CC", SDT_RISCVSelectCC>;
def riscv_tail : SDNode<"RISCVISD::TAIL", SDT_RISCVCall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
SDNPVariadic]>;
@@ -53,6 +54,10 @@ def riscv_sllw : SDNode<"RISCVISD::SLLW", SDTIntShiftOp>;
def riscv_sraw : SDNode<"RISCVISD::SRAW", SDTIntShiftOp>;
def riscv_srlw : SDNode<"RISCVISD::SRLW", SDTIntShiftOp>;
+def riscv_read_cycle_wide : SDNode<"RISCVISD::READ_CYCLE_WIDE",
+ SDT_RISCVReadCycleWide,
+ [SDNPHasChain, SDNPSideEffect]>;
+
//===----------------------------------------------------------------------===//
// Operand and SDNode transformation definitions.
//===----------------------------------------------------------------------===//
@@ -161,6 +166,7 @@ def simm12_plus1 : Operand<XLenVT>, ImmLeaf<XLenVT,
// A 13-bit signed immediate where the least significant bit is zero.
def simm13_lsb0 : Operand<OtherVT> {
let ParserMatchClass = SImmAsmOperand<13, "Lsb0">;
+ let PrintMethod = "printBranchOperand";
let EncoderMethod = "getImmOpValueAsr1";
let DecoderMethod = "decodeSImmOperandAndLsl1<13>";
let MCOperandPredicate = [{
@@ -169,8 +175,7 @@ def simm13_lsb0 : Operand<OtherVT> {
return isShiftedInt<12, 1>(Imm);
return MCOp.isBareSymbolRef();
}];
- let OperandType = "OPERAND_SIMM13_LSB0";
- let OperandNamespace = "RISCVOp";
+ let OperandType = "OPERAND_PCREL";
}
class UImm20Operand : Operand<XLenVT> {
@@ -200,6 +205,7 @@ def Simm21Lsb0JALAsmOperand : SImmAsmOperand<21, "Lsb0JAL"> {
// A 21-bit signed immediate where the least significant bit is zero.
def simm21_lsb0_jal : Operand<OtherVT> {
let ParserMatchClass = Simm21Lsb0JALAsmOperand;
+ let PrintMethod = "printBranchOperand";
let EncoderMethod = "getImmOpValueAsr1";
let DecoderMethod = "decodeSImmOperandAndLsl1<21>";
let MCOperandPredicate = [{
@@ -208,8 +214,7 @@ def simm21_lsb0_jal : Operand<OtherVT> {
return isShiftedInt<20, 1>(Imm);
return MCOp.isBareSymbolRef();
}];
- let OperandType = "OPERAND_SIMM21_LSB0";
- let OperandNamespace = "RISCVOp";
+ let OperandType = "OPERAND_PCREL";
}
def BareSymbol : AsmOperandClass {
@@ -291,6 +296,11 @@ def immbottomxlenset : ImmLeaf<XLenVT, [{
return countTrailingOnes<uint64_t>(Imm) >= 5;
}]>;
+// A 6-bit constant greater than 32.
+def uimm6gt32 : ImmLeaf<XLenVT, [{
+ return isUInt<6>(Imm) && Imm > 32;
+}]>;
+
// Addressing modes.
// Necessary because a frameindex can't be matched directly in a pattern.
def AddrFI : ComplexPattern<iPTR, 1, "SelectAddrFI", [frameindex], []>;
@@ -316,6 +326,25 @@ def NegImm : SDNodeXForm<imm, [{
N->getValueType(0));
}]>;
+// Return an immediate value minus 32.
+def ImmSub32 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue() - 32, SDLoc(N),
+ N->getValueType(0));
+}]>;
+
+// Return an immediate subtracted from XLen.
+def ImmSubFromXLen : SDNodeXForm<imm, [{
+ uint64_t XLen = Subtarget->getXLen();
+ return CurDAG->getTargetConstant(XLen - N->getZExtValue(), SDLoc(N),
+ N->getValueType(0));
+}]>;
+
+// Return an immediate subtracted from 32.
+def ImmSubFrom32 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N),
+ N->getValueType(0));
+}]>;
+
//===----------------------------------------------------------------------===//
// Instruction Formats
//===----------------------------------------------------------------------===//
@@ -368,12 +397,14 @@ class ALU_rr<bits<7> funct7, bits<3> funct3, string opcodestr>
: RVInstR<funct7, funct3, OPC_OP, (outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2),
opcodestr, "$rd, $rs1, $rs2">;
-let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
+let hasNoSchedulingInfo = 1,
+ hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
class CSR_ir<bits<3> funct3, string opcodestr>
: RVInstI<funct3, OPC_SYSTEM, (outs GPR:$rd), (ins csr_sysreg:$imm12, GPR:$rs1),
opcodestr, "$rd, $imm12, $rs1">, Sched<[WriteCSR, ReadCSR]>;
-let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
+let hasNoSchedulingInfo = 1,
+ hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
class CSR_ii<bits<3> funct3, string opcodestr>
: RVInstI<funct3, OPC_SYSTEM, (outs GPR:$rd),
(ins csr_sysreg:$imm12, uimm5:$rs1),
@@ -791,6 +822,11 @@ def : MnemonicAlias<"move", "mv">;
def : MnemonicAlias<"scall", "ecall">;
def : MnemonicAlias<"sbreak", "ebreak">;
+// This alias was added to the spec in December 2020. Don't print it by default
+// to allow assembly we print to be compatible with versions of GNU assembler
+// that don't support this alias.
+def : InstAlias<"zext.b $rd, $rs", (ANDI GPR:$rd, GPR:$rs, 0xFF), 0>;
+
//===----------------------------------------------------------------------===//
// Pseudo-instructions and codegen patterns
//
@@ -815,18 +851,30 @@ def IsOrAdd: PatFrag<(ops node:$A, node:$B), (or node:$A, node:$B), [{
return isOrEquivalentToAdd(N);
}]>;
def assertsexti32 : PatFrag<(ops node:$src), (assertsext node:$src), [{
- return cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i32;
+ return cast<VTSDNode>(N->getOperand(1))->getVT().bitsLE(MVT::i32);
}]>;
def sexti32 : PatFrags<(ops node:$src),
[(sext_inreg node:$src, i32),
(assertsexti32 node:$src)]>;
def assertzexti32 : PatFrag<(ops node:$src), (assertzext node:$src), [{
- return cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i32;
+ return cast<VTSDNode>(N->getOperand(1))->getVT().bitsLE(MVT::i32);
}]>;
def zexti32 : PatFrags<(ops node:$src),
[(and node:$src, 0xffffffff),
(assertzexti32 node:$src)]>;
+def SRLIWPat : PatFrag<(ops node:$A, node:$B),
+ (srl (and node:$A, imm), node:$B), [{
+ return MatchSRLIW(N);
+}]>;
+
+// Check that it is a SLLIUW (Shift Logical Left Immediate Unsigned i32
+// on RV64). Also used to optimize the same sequence without SLLIUW.
+def SLLIUWPat : PatFrag<(ops node:$A, node:$B),
+ (and (shl node:$A, node:$B), imm), [{
+ return MatchSLLIUW(N);
+}]>;
+
/// Immediates
def : Pat<(simm12:$imm), (ADDI X0, simm12:$imm)>;
@@ -857,6 +905,10 @@ class shiftop<SDPatternOperator operator>
: PatFrags<(ops node:$val, node:$count),
[(operator node:$val, node:$count),
(operator node:$val, (and node:$count, immbottomxlenset))]>;
+class shiftopw<SDPatternOperator operator>
+ : PatFrags<(ops node:$val, node:$count),
+ [(operator node:$val, node:$count),
+ (operator node:$val, (and node:$count, (XLenVT 31)))]>;
def : PatGprGpr<shiftop<shl>, SLL>;
def : PatGprGpr<shiftop<srl>, SRL>;
@@ -873,10 +925,10 @@ def PseudoAddTPRel : Pseudo<(outs GPR:$rd),
/// FrameIndex calculations
-def : Pat<(add (i32 AddrFI:$Rs), simm12:$imm12),
- (ADDI (i32 AddrFI:$Rs), simm12:$imm12)>;
-def : Pat<(IsOrAdd (i32 AddrFI:$Rs), simm12:$imm12),
- (ADDI (i32 AddrFI:$Rs), simm12:$imm12)>;
+def : Pat<(add (XLenVT AddrFI:$Rs), simm12:$imm12),
+ (ADDI (XLenVT AddrFI:$Rs), simm12:$imm12)>;
+def : Pat<(IsOrAdd (XLenVT AddrFI:$Rs), simm12:$imm12),
+ (ADDI (XLenVT AddrFI:$Rs), simm12:$imm12)>;
/// Setcc
@@ -938,15 +990,18 @@ def : BccSwapPat<setle, BGE>;
def : BccSwapPat<setugt, BLTU>;
def : BccSwapPat<setule, BGEU>;
-// An extra pattern is needed for a brcond without a setcc (i.e. where the
+// Extra patterns are needed for a brcond without a setcc (i.e. where the
// condition was calculated elsewhere).
def : Pat<(brcond GPR:$cond, bb:$imm12), (BNE GPR:$cond, X0, bb:$imm12)>;
+// In this pattern, the `(xor $cond, 1)` functions like (boolean) `not`, as the
+// `brcond` only uses the lowest bit.
+def : Pat<(brcond (XLenVT (xor GPR:$cond, 1)), bb:$imm12),
+ (BEQ GPR:$cond, X0, bb:$imm12)>;
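A small host-side check of why the xor-with-1 form is safe to lower as BEQ against X0 when only the low bit of the condition is meaningful (sketch only):

#include <cassert>
#include <cstdint>

int main() {
  for (uint64_t Cond : {UINT64_C(0), UINT64_C(1)}) {
    bool BranchOnNot    = ((Cond ^ 1) & 1) != 0; // what (xor $cond, 1) tests
    bool BranchOnEqZero = (Cond == 0);           // what "beq cond, x0" tests
    assert(BranchOnNot == BranchOnEqZero);
  }
  return 0;
}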
let isBarrier = 1, isBranch = 1, isTerminator = 1 in
def PseudoBR : Pseudo<(outs), (ins simm21_lsb0_jal:$imm20), [(br bb:$imm20)]>,
PseudoInstExpansion<(JAL X0, simm21_lsb0_jal:$imm20)>;
-let isCall = 1, Defs=[X1] in
let isBarrier = 1, isBranch = 1, isIndirectBranch = 1, isTerminator = 1 in
def PseudoBRIND : Pseudo<(outs), (ins GPR:$rs1, simm12:$imm12), []>,
PseudoInstExpansion<(JALR X0, GPR:$rs1, simm12:$imm12)>;
@@ -1038,6 +1093,25 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0,
def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
"la.tls.gd", "$dst, $src">;
+
+/// Sign/Zero Extends
+
+// There are single-instruction versions of these in Zbb, so disable these
+// Pseudos if that extension is present.
+let hasSideEffects = 0, mayLoad = 0,
+ mayStore = 0, isCodeGenOnly = 0, isAsmParserOnly = 1 in {
+def PseudoSEXT_B : Pseudo<(outs GPR:$rd), (ins GPR:$rs), [], "sext.b", "$rd, $rs">;
+def PseudoSEXT_H : Pseudo<(outs GPR:$rd), (ins GPR:$rs), [], "sext.h", "$rd, $rs">;
+// rv64's sext.w is defined above, using InstAlias<"sext.w ...
+// zext.b is defined above, using InstAlias<"zext.b ...
+def PseudoZEXT_H : Pseudo<(outs GPR:$rd), (ins GPR:$rs), [], "zext.h", "$rd, $rs">;
+} // hasSideEffects = 0, ...
+
+let Predicates = [IsRV64], hasSideEffects = 0, mayLoad = 0, mayStore = 0,
+ isCodeGenOnly = 0, isAsmParserOnly = 1 in {
+def PseudoZEXT_W : Pseudo<(outs GPR:$rd), (ins GPR:$rs), [], "zext.w", "$rd, $rs">;
+} // Predicates = [IsRV64], ...
+
/// Loads
multiclass LdPat<PatFrag LoadOp, RVInst Inst> {
@@ -1108,12 +1182,23 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
/// RV64 patterns
+let Predicates = [IsRV64, NotHasStdExtZba] in {
+def : Pat<(and GPR:$rs1, 0xffffffff), (SRLI (SLLI GPR:$rs1, 32), 32)>;
+
+// If we're shifting a 32-bit zero extended value left by 0-31 bits, use 2
+// shifts instead of 3. This can occur when unsigned is used to index an array.
+def : Pat<(shl (and GPR:$rs1, 0xffffffff), uimm5:$shamt),
+ (SRLI (SLLI GPR:$rs1, 32), (ImmSubFrom32 uimm5:$shamt))>;
+// shl/and can appear in the other order too.
+def : Pat<(SLLIUWPat GPR:$rs1, uimm5:$shamt),
+ (SRLI (SLLI GPR:$rs1, 32), (ImmSubFrom32 uimm5:$shamt))>;
+}
+
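A quick host-side check of the identity behind these patterns: for 0 <= c < 32, ((x & 0xffffffff) << c) equals ((x << 32) >> (32 - c)) in 64-bit unsigned arithmetic, which is exactly what the SLLI/SRLI pair with ImmSubFrom32 computes.

#include <cassert>
#include <cstdint>

static uint64_t threeOp(uint64_t X, unsigned C) { return (X & 0xffffffffu) << C; }
static uint64_t twoShift(uint64_t X, unsigned C) { return (X << 32) >> (32 - C); }

int main() {
  const uint64_t Samples[] = {0, 1, 0x9abcdef0, 0x123456789abcdef0ULL, ~0ULL};
  for (uint64_t X : Samples)
    for (unsigned C = 0; C < 32; ++C)
      assert(threeOp(X, C) == twoShift(X, C));
  return 0;
}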
let Predicates = [IsRV64] in {
/// sext and zext
def : Pat<(sext_inreg GPR:$rs1, i32), (ADDIW GPR:$rs1, 0)>;
-def : Pat<(and GPR:$rs1, 0xffffffff), (SRLI (SLLI GPR:$rs1, 32), 32)>;
/// ALU operations
@@ -1125,14 +1210,18 @@ def : Pat<(sext_inreg (sub GPR:$rs1, GPR:$rs2), i32),
(SUBW GPR:$rs1, GPR:$rs2)>;
def : Pat<(sext_inreg (shl GPR:$rs1, uimm5:$shamt), i32),
(SLLIW GPR:$rs1, uimm5:$shamt)>;
-// (srl (zexti32 ...), uimm5:$shamt) is matched with custom code due to the
-// need to undo manipulation of the mask value performed by DAGCombine.
+def : Pat<(SRLIWPat GPR:$rs1, uimm5:$shamt),
+ (SRLIW GPR:$rs1, uimm5:$shamt)>;
+def : Pat<(srl (shl GPR:$rs1, (i64 32)), uimm6gt32:$shamt),
+ (SRLIW GPR:$rs1, (ImmSub32 uimm6gt32:$shamt))>;
def : Pat<(sra (sext_inreg GPR:$rs1, i32), uimm5:$shamt),
(SRAIW GPR:$rs1, uimm5:$shamt)>;
+def : Pat<(sra (shl GPR:$rs1, (i64 32)), uimm6gt32:$shamt),
+ (SRAIW GPR:$rs1, (ImmSub32 uimm6gt32:$shamt))>;
-def : PatGprGpr<riscv_sllw, SLLW>;
-def : PatGprGpr<riscv_srlw, SRLW>;
-def : PatGprGpr<riscv_sraw, SRAW>;
+def : PatGprGpr<shiftopw<riscv_sllw>, SLLW>;
+def : PatGprGpr<shiftopw<riscv_srlw>, SRLW>;
+def : PatGprGpr<shiftopw<riscv_sraw>, SRAW>;
/// Loads
@@ -1153,9 +1242,10 @@ let Predicates = [IsRV64] in
def : Pat<(readcyclecounter), (CSRRS CYCLE.Encoding, X0)>;
// On RV32, ReadCycleWide will be expanded to the suggested loop reading both
// halves of the 64-bit "cycle" CSR.
-let Predicates = [IsRV32], usesCustomInserter = 1, hasSideEffects = 0,
-mayLoad = 0, mayStore = 0, hasNoSchedulingInfo = 1 in
-def ReadCycleWide : Pseudo<(outs GPR:$lo, GPR:$hi), (ins), [], "", "">;
+let Predicates = [IsRV32], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in
+def ReadCycleWide : Pseudo<(outs GPR:$lo, GPR:$hi), (ins),
+ [(set GPR:$lo, GPR:$hi, (riscv_read_cycle_wide))],
+ "", "">;
/// traps
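The custom inserter for ReadCycleWide emits the retry loop the comment refers to: read cycleh, read cycle, then re-read cycleh and retry if it changed. A host-side sketch of that loop; read_cycle()/read_cycleh() below are hypothetical stand-ins that simulate the two 32-bit CSR halves, not a real API.

#include <cstdint>
#include <cstdio>

// Simulated free-running 64-bit counter exposed through two 32-bit reads
// (placeholders for csrr from cycle/cycleh on RV32).
static uint64_t FakeCounter = 0xFFFFFFFEull; // start near a low-half wraparound
static uint32_t read_cycle()  { return static_cast<uint32_t>(FakeCounter++); }
static uint32_t read_cycleh() { return static_cast<uint32_t>(FakeCounter >> 32); }

// Re-read the high half until it is stable across the low-half read, then
// combine the two halves into one 64-bit value.
static uint64_t read_cycle64() {
  uint32_t Hi, Lo;
  do {
    Hi = read_cycleh();
    Lo = read_cycle();
  } while (read_cycleh() != Hi);
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

int main() {
  std::printf("cycle64 = %llu\n", static_cast<unsigned long long>(read_cycle64()));
  return 0;
}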
@@ -1178,3 +1268,4 @@ include "RISCVInstrInfoD.td"
include "RISCVInstrInfoC.td"
include "RISCVInstrInfoB.td"
include "RISCVInstrInfoV.td"
+include "RISCVInstrInfoZfh.td"
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoB.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
index afac509f743d..1bc288b5177c 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
@@ -7,16 +7,21 @@
//===----------------------------------------------------------------------===//
//
// This file describes the RISC-V instructions from the standard 'B' Bitmanip
-// extension, version 0.92.
+// extension, version 0.93.
// This version is still experimental as the 'B' extension hasn't been
// ratified yet.
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// Operand definitions.
+// Operand and SDNode transformation definitions.
//===----------------------------------------------------------------------===//
+def riscv_rolw : SDNode<"RISCVISD::ROLW", SDTIntShiftOp>;
+def riscv_rorw : SDNode<"RISCVISD::RORW", SDTIntShiftOp>;
+def riscv_fslw : SDNode<"RISCVISD::FSLW", SDTIntShiftDOp>;
+def riscv_fsrw : SDNode<"RISCVISD::FSRW", SDTIntShiftDOp>;
+
def UImmLog2XLenHalfAsmOperand : AsmOperandClass {
let Name = "UImmLog2XLenHalf";
let RenderMethod = "addImmOperands";
@@ -40,6 +45,63 @@ def shfl_uimm : Operand<XLenVT>, ImmLeaf<XLenVT, [{
}];
}
+
+// Check that it is a SLOI (Shift Left Ones Immediate).
+def SLOIPat : PatFrag<(ops node:$A, node:$B),
+ (or (shl node:$A, node:$B), imm), [{
+ return MatchSLOI(N);
+}]>;
+
+// Check that it is a SROI (Shift Right Ones Immediate).
+def SROIPat : PatFrag<(ops node:$A, node:$B),
+ (or (srl node:$A, node:$B), imm), [{
+ return MatchSROI(N);
+}]>;
+
+// Check that it is a SROIW (Shift Right Ones Immediate i32 on RV64).
+def SROIWPat : PatFrag<(ops node:$A, node:$B),
+ (or (srl node:$A, node:$B), imm), [{
+ return MatchSROIW(N);
+}]>;
+
+// Checks if this mask has a single 0 bit and cannot be used with ANDI.
+def BCLRMask : ImmLeaf<XLenVT, [{
+ if (Subtarget->is64Bit())
+ return !isInt<12>(Imm) && isPowerOf2_64(~Imm);
+ return !isInt<12>(Imm) && isPowerOf2_32(~Imm);
+}]>;
+
+// Checks if this mask has a single 1 bit and cannot be used with ORI/XORI.
+def BSETINVMask : ImmLeaf<XLenVT, [{
+ if (Subtarget->is64Bit())
+ return !isInt<12>(Imm) && isPowerOf2_64(Imm);
+ return !isInt<12>(Imm) && isPowerOf2_32(Imm);
+}]>;
+
+def BCLRXForm : SDNodeXForm<imm, [{
+ // Find the lowest 0.
+ return CurDAG->getTargetConstant(N->getAPIntValue().countTrailingOnes(),
+ SDLoc(N), N->getValueType(0));
+}]>;
+
+def BSETINVXForm : SDNodeXForm<imm, [{
+ // Find the lowest 1.
+ return CurDAG->getTargetConstant(N->getAPIntValue().countTrailingZeros(),
+ SDLoc(N), N->getValueType(0));
+}]>;
+
+// Similar to above, but makes sure the immediate has 33 sign bits. When used
+// with an AND/OR/XOR where the other operand has at least 33 sign bits, the
+// result will have 33 sign bits. This can match BCLRIW/BSETIW/BINVIW.
+def BCLRWMask : ImmLeaf<i64, [{
+ // After checking the sign bits, truncate to 32 bits for power of 2 check.
+ return isInt<32>(Imm) && !isInt<12>(Imm) && isPowerOf2_32(~Imm);
+}]>;
+
+def BSETINVWMask : ImmLeaf<i64, [{
+ return isInt<32>(Imm) && !isInt<12>(Imm) && isPowerOf2_32(Imm);
+}]>;
+
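A host-side mirror of what BCLRMask/BSETINVMask and the two XForms above compute, in the RV64 flavour: masks with a single clear (or set) bit that ANDI/ORI/XORI cannot encode, plus the bit index those patterns feed to BCLRI/BSETI/BINVI. The helpers are local stand-ins, using the GCC/Clang ctz builtin in place of the APInt calls.

#include <cassert>
#include <cstdint>

static bool isPowerOf2(uint64_t X) { return X != 0 && (X & (X - 1)) == 0; }
static bool isSimm12(int64_t X) { return X >= -2048 && X < 2048; }

static bool isBCLRMask(int64_t Imm) {
  return !isSimm12(Imm) && isPowerOf2(~static_cast<uint64_t>(Imm));
}
static bool isBSETINVMask(int64_t Imm) {
  return !isSimm12(Imm) && isPowerOf2(static_cast<uint64_t>(Imm));
}

static unsigned bclrIndex(uint64_t Imm)    { return __builtin_ctzll(~Imm); } // lowest 0
static unsigned bsetInvIndex(uint64_t Imm) { return __builtin_ctzll(Imm);  } // lowest 1

int main() {
  // and x, ~(1 << 44)  ->  bclri x, 44
  uint64_t ClearBit44 = ~(UINT64_C(1) << 44);
  assert(isBCLRMask(static_cast<int64_t>(ClearBit44)) && bclrIndex(ClearBit44) == 44);
  // or/xor x, (1 << 44)  ->  bseti/binvi x, 44
  uint64_t SetBit44 = UINT64_C(1) << 44;
  assert(isBSETINVMask(static_cast<int64_t>(SetBit44)) && bsetInvIndex(SetBit44) == 44);
  // Masks that fit a 12-bit immediate stay with ANDI/ORI/XORI.
  assert(!isBCLRMask(~INT64_C(4)) && !isBSETINVMask(4));
  return 0;
}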
//===----------------------------------------------------------------------===//
// Instruction class templates
//===----------------------------------------------------------------------===//
@@ -56,11 +118,6 @@ class RVBUnary<bits<7> funct7, bits<5> funct5, bits<3> funct3,
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class RVBALUW_ri<bits<3> funct3, string opcodestr>
- : RVInstI<funct3, OPC_OP_IMM_32, (outs GPR:$rd),
- (ins GPR:$rs1, simm12:$imm12), opcodestr, "$rd, $rs1, $imm12">;
-
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVBShift_ri<bits<5> funct5, bits<3> funct3, RISCVOpcode opcode,
string opcodestr>
: RVInstI<funct3, opcode, (outs GPR:$rd),
@@ -147,10 +204,16 @@ def ORN : ALU_rr<0b0100000, 0b110, "orn">, Sched<[]>;
def XNOR : ALU_rr<0b0100000, 0b100, "xnor">, Sched<[]>;
} // Predicates = [HasStdExtZbbOrZbp]
-let Predicates = [HasStdExtZbb] in {
+let Predicates = [HasStdExtZba] in {
+def SH1ADD : ALU_rr<0b0010000, 0b010, "sh1add">, Sched<[]>;
+def SH2ADD : ALU_rr<0b0010000, 0b100, "sh2add">, Sched<[]>;
+def SH3ADD : ALU_rr<0b0010000, 0b110, "sh3add">, Sched<[]>;
+} // Predicates = [HasStdExtZba]
+
+let Predicates = [HasStdExtZbp] in {
def SLO : ALU_rr<0b0010000, 0b001, "slo">, Sched<[]>;
def SRO : ALU_rr<0b0010000, 0b101, "sro">, Sched<[]>;
-} // Predicates = [HasStdExtZbb]
+} // Predicates = [HasStdExtZbp]
let Predicates = [HasStdExtZbbOrZbp] in {
def ROL : ALU_rr<0b0110000, 0b001, "rol">, Sched<[]>;
@@ -158,10 +221,10 @@ def ROR : ALU_rr<0b0110000, 0b101, "ror">, Sched<[]>;
} // Predicates = [HasStdExtZbbOrZbp]
let Predicates = [HasStdExtZbs] in {
-def SBCLR : ALU_rr<0b0100100, 0b001, "sbclr">, Sched<[]>;
-def SBSET : ALU_rr<0b0010100, 0b001, "sbset">, Sched<[]>;
-def SBINV : ALU_rr<0b0110100, 0b001, "sbinv">, Sched<[]>;
-def SBEXT : ALU_rr<0b0100100, 0b101, "sbext">, Sched<[]>;
+def BCLR : ALU_rr<0b0100100, 0b001, "bclr">, Sched<[]>;
+def BSET : ALU_rr<0b0010100, 0b001, "bset">, Sched<[]>;
+def BINV : ALU_rr<0b0110100, 0b001, "binv">, Sched<[]>;
+def BEXT : ALU_rr<0b0100100, 0b101, "bext">, Sched<[]>;
} // Predicates = [HasStdExtZbs]
let Predicates = [HasStdExtZbp] in {
@@ -169,19 +232,25 @@ def GORC : ALU_rr<0b0010100, 0b101, "gorc">, Sched<[]>;
def GREV : ALU_rr<0b0110100, 0b101, "grev">, Sched<[]>;
} // Predicates = [HasStdExtZbp]
-let Predicates = [HasStdExtZbb] in {
+let Predicates = [HasStdExtZbp] in {
+def XPERMN : ALU_rr<0b0010100, 0b010, "xperm.n">, Sched<[]>;
+def XPERMB : ALU_rr<0b0010100, 0b100, "xperm.b">, Sched<[]>;
+def XPERMH : ALU_rr<0b0010100, 0b110, "xperm.h">, Sched<[]>;
+} // Predicates = [HasStdExtZbp]
+
+let Predicates = [HasStdExtZbp] in {
def SLOI : RVBShift_ri<0b00100, 0b001, OPC_OP_IMM, "sloi">, Sched<[]>;
def SROI : RVBShift_ri<0b00100, 0b101, OPC_OP_IMM, "sroi">, Sched<[]>;
-} // Predicates = [HasStdExtZbb]
+} // Predicates = [HasStdExtZbp]
let Predicates = [HasStdExtZbbOrZbp] in
def RORI : RVBShift_ri<0b01100, 0b101, OPC_OP_IMM, "rori">, Sched<[]>;
let Predicates = [HasStdExtZbs] in {
-def SBCLRI : RVBShift_ri<0b01001, 0b001, OPC_OP_IMM, "sbclri">, Sched<[]>;
-def SBSETI : RVBShift_ri<0b00101, 0b001, OPC_OP_IMM, "sbseti">, Sched<[]>;
-def SBINVI : RVBShift_ri<0b01101, 0b001, OPC_OP_IMM, "sbinvi">, Sched<[]>;
-def SBEXTI : RVBShift_ri<0b01001, 0b101, OPC_OP_IMM, "sbexti">, Sched<[]>;
+def BCLRI : RVBShift_ri<0b01001, 0b001, OPC_OP_IMM, "bclri">, Sched<[]>;
+def BSETI : RVBShift_ri<0b00101, 0b001, OPC_OP_IMM, "bseti">, Sched<[]>;
+def BINVI : RVBShift_ri<0b01101, 0b001, OPC_OP_IMM, "binvi">, Sched<[]>;
+def BEXTI : RVBShift_ri<0b01001, 0b101, OPC_OP_IMM, "bexti">, Sched<[]>;
} // Predicates = [HasStdExtZbs]
let Predicates = [HasStdExtZbp] in {
@@ -207,7 +276,7 @@ def CLZ : RVBUnary<0b0110000, 0b00000, 0b001, RISCVOpcode<0b0010011>, "clz">,
Sched<[]>;
def CTZ : RVBUnary<0b0110000, 0b00001, 0b001, RISCVOpcode<0b0010011>, "ctz">,
Sched<[]>;
-def PCNT : RVBUnary<0b0110000, 0b00010, 0b001, RISCVOpcode<0b0010011>, "pcnt">,
+def CPOP : RVBUnary<0b0110000, 0b00010, 0b001, RISCVOpcode<0b0010011>, "cpop">,
Sched<[]>;
} // Predicates = [HasStdExtZbb]
@@ -256,8 +325,8 @@ def CLMULH : ALU_rr<0b0000101, 0b011, "clmulh">, Sched<[]>;
let Predicates = [HasStdExtZbb] in {
def MIN : ALU_rr<0b0000101, 0b100, "min">, Sched<[]>;
-def MAX : ALU_rr<0b0000101, 0b101, "max">, Sched<[]>;
-def MINU : ALU_rr<0b0000101, 0b110, "minu">, Sched<[]>;
+def MINU : ALU_rr<0b0000101, 0b101, "minu">, Sched<[]>;
+def MAX : ALU_rr<0b0000101, 0b110, "max">, Sched<[]>;
def MAXU : ALU_rr<0b0000101, 0b111, "maxu">, Sched<[]>;
} // Predicates = [HasStdExtZbb]
@@ -267,23 +336,23 @@ def UNSHFL : ALU_rr<0b0000100, 0b101, "unshfl">, Sched<[]>;
} // Predicates = [HasStdExtZbp]
let Predicates = [HasStdExtZbe] in {
-def BDEP : ALU_rr<0b0100100, 0b110, "bdep">, Sched<[]>;
-def BEXT : ALU_rr<0b0000100, 0b110, "bext">, Sched<[]>;
+// NOTE: These mnemonics are from the 0.94 spec. There is a name conflict with
+// bext in the 0.93 spec.
+def BDECOMPRESS : ALU_rr<0b0100100, 0b110, "bdecompress">, Sched<[]>;
+def BCOMPRESS : ALU_rr<0b0000100, 0b110, "bcompress">, Sched<[]>;
} // Predicates = [HasStdExtZbe]
-let Predicates = [HasStdExtZbbOrZbp] in {
+let Predicates = [HasStdExtZbp] in {
def PACK : ALU_rr<0b0000100, 0b100, "pack">, Sched<[]>;
def PACKU : ALU_rr<0b0100100, 0b100, "packu">, Sched<[]>;
-} // Predicates = [HasStdExtZbbOrZbp]
+def PACKH : ALU_rr<0b0000100, 0b111, "packh">, Sched<[]>;
+} // Predicates = [HasStdExtZbp]
let Predicates = [HasStdExtZbm, IsRV64] in {
def BMATOR : ALU_rr<0b0000100, 0b011, "bmator">, Sched<[]>;
def BMATXOR : ALU_rr<0b0100100, 0b011, "bmatxor">, Sched<[]>;
} // Predicates = [HasStdExtZbm, IsRV64]
-let Predicates = [HasStdExtZbbOrZbp] in
-def PACKH : ALU_rr<0b0000100, 0b111, "packh">, Sched<[]>;
-
let Predicates = [HasStdExtZbf] in
def BFP : ALU_rr<0b0100100, 0b111, "bfp">, Sched<[]>;
@@ -292,19 +361,18 @@ def SHFLI : RVBShfl_ri<0b000010, 0b001, OPC_OP_IMM, "shfli">, Sched<[]>;
def UNSHFLI : RVBShfl_ri<0b000010, 0b101, OPC_OP_IMM, "unshfli">, Sched<[]>;
} // Predicates = [HasStdExtZbp]
-let Predicates = [HasStdExtZbb, IsRV64] in {
-def ADDIWU : RVBALUW_ri<0b100, "addiwu">, Sched<[]>;
-def SLLIUW : RVBShift_ri<0b00001, 0b001, OPC_OP_IMM_32, "slliu.w">, Sched<[]>;
-def ADDWU : ALUW_rr<0b0000101, 0b000, "addwu">, Sched<[]>;
-def SUBWU : ALUW_rr<0b0100101, 0b000, "subwu">, Sched<[]>;
-def ADDUW : ALUW_rr<0b0000100, 0b000, "addu.w">, Sched<[]>;
-def SUBUW : ALUW_rr<0b0100100, 0b000, "subu.w">, Sched<[]>;
+let Predicates = [HasStdExtZba, IsRV64] in {
+def SLLIUW : RVBShift_ri<0b00001, 0b001, OPC_OP_IMM_32, "slli.uw">, Sched<[]>;
+def ADDUW : ALUW_rr<0b0000100, 0b000, "add.uw">, Sched<[]>;
+def SH1ADDUW : ALUW_rr<0b0010000, 0b010, "sh1add.uw">, Sched<[]>;
+def SH2ADDUW : ALUW_rr<0b0010000, 0b100, "sh2add.uw">, Sched<[]>;
+def SH3ADDUW : ALUW_rr<0b0010000, 0b110, "sh3add.uw">, Sched<[]>;
} // Predicates = [HasStdExtZbb, IsRV64]
-let Predicates = [HasStdExtZbb, IsRV64] in {
+let Predicates = [HasStdExtZbp, IsRV64] in {
def SLOW : ALUW_rr<0b0010000, 0b001, "slow">, Sched<[]>;
def SROW : ALUW_rr<0b0010000, 0b101, "srow">, Sched<[]>;
-} // Predicates = [HasStdExtZbb, IsRV64]
+} // Predicates = [HasStdExtZbp, IsRV64]
let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
def ROLW : ALUW_rr<0b0110000, 0b001, "rolw">, Sched<[]>;
@@ -312,10 +380,10 @@ def RORW : ALUW_rr<0b0110000, 0b101, "rorw">, Sched<[]>;
} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
let Predicates = [HasStdExtZbs, IsRV64] in {
-def SBCLRW : ALUW_rr<0b0100100, 0b001, "sbclrw">, Sched<[]>;
-def SBSETW : ALUW_rr<0b0010100, 0b001, "sbsetw">, Sched<[]>;
-def SBINVW : ALUW_rr<0b0110100, 0b001, "sbinvw">, Sched<[]>;
-def SBEXTW : ALUW_rr<0b0100100, 0b101, "sbextw">, Sched<[]>;
+def BCLRW : ALUW_rr<0b0100100, 0b001, "bclrw">, Sched<[]>;
+def BSETW : ALUW_rr<0b0010100, 0b001, "bsetw">, Sched<[]>;
+def BINVW : ALUW_rr<0b0110100, 0b001, "binvw">, Sched<[]>;
+def BEXTW : ALUW_rr<0b0100100, 0b101, "bextw">, Sched<[]>;
} // Predicates = [HasStdExtZbs, IsRV64]
let Predicates = [HasStdExtZbp, IsRV64] in {
@@ -323,20 +391,24 @@ def GORCW : ALUW_rr<0b0010100, 0b101, "gorcw">, Sched<[]>;
def GREVW : ALUW_rr<0b0110100, 0b101, "grevw">, Sched<[]>;
} // Predicates = [HasStdExtZbp, IsRV64]
-let Predicates = [HasStdExtZbb, IsRV64] in {
+let Predicates = [HasStdExtZbp, IsRV64] in {
+def XPERMW : ALU_rr<0b0010100, 0b000, "xperm.w">, Sched<[]>;
+} // Predicates = [HasStdExtZbp, IsRV64]
+
+let Predicates = [HasStdExtZbp, IsRV64] in {
def SLOIW : RVBShiftW_ri<0b0010000, 0b001, OPC_OP_IMM_32, "sloiw">, Sched<[]>;
def SROIW : RVBShiftW_ri<0b0010000, 0b101, OPC_OP_IMM_32, "sroiw">, Sched<[]>;
-} // Predicates = [HasStdExtZbb, IsRV64]
+} // Predicates = [HasStdExtZbp, IsRV64]
let Predicates = [HasStdExtZbbOrZbp, IsRV64] in
def RORIW : RVBShiftW_ri<0b0110000, 0b101, OPC_OP_IMM_32, "roriw">, Sched<[]>;
let Predicates = [HasStdExtZbs, IsRV64] in {
-def SBCLRIW : RVBShiftW_ri<0b0100100, 0b001, OPC_OP_IMM_32, "sbclriw">,
+def BCLRIW : RVBShiftW_ri<0b0100100, 0b001, OPC_OP_IMM_32, "bclriw">,
Sched<[]>;
-def SBSETIW : RVBShiftW_ri<0b0010100, 0b001, OPC_OP_IMM_32, "sbsetiw">,
+def BSETIW : RVBShiftW_ri<0b0010100, 0b001, OPC_OP_IMM_32, "bsetiw">,
Sched<[]>;
-def SBINVIW : RVBShiftW_ri<0b0110100, 0b001, OPC_OP_IMM_32, "sbinviw">,
+def BINVIW : RVBShiftW_ri<0b0110100, 0b001, OPC_OP_IMM_32, "binviw">,
Sched<[]>;
} // Predicates = [HasStdExtZbs, IsRV64]
@@ -359,34 +431,77 @@ def CLZW : RVBUnary<0b0110000, 0b00000, 0b001, RISCVOpcode<0b0011011>,
"clzw">, Sched<[]>;
def CTZW : RVBUnary<0b0110000, 0b00001, 0b001, RISCVOpcode<0b0011011>,
"ctzw">, Sched<[]>;
-def PCNTW : RVBUnary<0b0110000, 0b00010, 0b001, RISCVOpcode<0b0011011>,
- "pcntw">, Sched<[]>;
+def CPOPW : RVBUnary<0b0110000, 0b00010, 0b001, RISCVOpcode<0b0011011>,
+ "cpopw">, Sched<[]>;
} // Predicates = [HasStdExtZbb, IsRV64]
-let Predicates = [HasStdExtZbc, IsRV64] in {
-def CLMULW : ALUW_rr<0b0000101, 0b001, "clmulw">, Sched<[]>;
-def CLMULRW : ALUW_rr<0b0000101, 0b010, "clmulrw">, Sched<[]>;
-def CLMULHW : ALUW_rr<0b0000101, 0b011, "clmulhw">, Sched<[]>;
-} // Predicates = [HasStdExtZbc, IsRV64]
-
let Predicates = [HasStdExtZbp, IsRV64] in {
def SHFLW : ALUW_rr<0b0000100, 0b001, "shflw">, Sched<[]>;
def UNSHFLW : ALUW_rr<0b0000100, 0b101, "unshflw">, Sched<[]>;
} // Predicates = [HasStdExtZbp, IsRV64]
let Predicates = [HasStdExtZbe, IsRV64] in {
-def BDEPW : ALUW_rr<0b0100100, 0b110, "bdepw">, Sched<[]>;
-def BEXTW : ALUW_rr<0b0000100, 0b110, "bextw">, Sched<[]>;
+// NOTE: These mnemonics are from the 0.94 spec. There is a name conflict with
+// bextw in the 0.93 spec.
+def BDECOMPRESSW : ALUW_rr<0b0100100, 0b110, "bdecompressw">, Sched<[]>;
+def BCOMPRESSW : ALUW_rr<0b0000100, 0b110, "bcompressw">, Sched<[]>;
} // Predicates = [HasStdExtZbe, IsRV64]
-let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
+let Predicates = [HasStdExtZbp, IsRV64] in {
def PACKW : ALUW_rr<0b0000100, 0b100, "packw">, Sched<[]>;
def PACKUW : ALUW_rr<0b0100100, 0b100, "packuw">, Sched<[]>;
-} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
+} // Predicates = [HasStdExtZbp, IsRV64]
let Predicates = [HasStdExtZbf, IsRV64] in
def BFPW : ALUW_rr<0b0100100, 0b111, "bfpw">, Sched<[]>;
+let Predicates = [HasStdExtZbbOrZbp, IsRV32] in {
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+def ZEXTH_RV32 : RVInstR<0b0000100, 0b100, OPC_OP, (outs GPR:$rd),
+ (ins GPR:$rs1), "zext.h", "$rd, $rs1">, Sched<[]> {
+ let rs2 = 0b00000;
+}
+} // Predicates = [HasStdExtZbbOrZbp, IsRV32]
+
+let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+def ZEXTH_RV64 : RVInstR<0b0000100, 0b100, OPC_OP_32, (outs GPR:$rd),
+ (ins GPR:$rs1), "zext.h", "$rd, $rs1">, Sched<[]> {
+ let rs2 = 0b00000;
+}
+} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
+
+// We treat rev8 and orc.b as standalone instructions even though they use a
+// portion of the encodings for grevi and gorci. This allows us to support only
+// those encodings when only Zbb is enabled. We do this even when grevi and
+// gorci are available with Zbp. Trying to use 'HasStdExtZbb, NotHasStdExtZbp'
+// causes diagnostics to suggest that Zbp rather than Zbb is required for rev8
+// or gorci. Since Zbb is closer to being finalized than Zbp this will be
+// misleading to users.
+let Predicates = [HasStdExtZbbOrZbp, IsRV32] in {
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+def REV8_RV32 : RVInstI<0b101, OPC_OP_IMM, (outs GPR:$rd), (ins GPR:$rs1),
+ "rev8", "$rd, $rs1">, Sched<[]> {
+ let imm12 = { 0b01101, 0b0011000 };
+}
+} // Predicates = [HasStdExtZbbOrZbp, IsRV32]
+
+let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+def REV8_RV64 : RVInstI<0b101, OPC_OP_IMM, (outs GPR:$rd), (ins GPR:$rs1),
+ "rev8", "$rd, $rs1">, Sched<[]> {
+ let imm12 = { 0b01101, 0b0111000 };
+}
+} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
+
+let Predicates = [HasStdExtZbbOrZbp] in {
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+def ORCB : RVInstI<0b101, OPC_OP_IMM, (outs GPR:$rd), (ins GPR:$rs1),
+ "orc.b", "$rd, $rs1">, Sched<[]> {
+ let imm12 = { 0b00101, 0b0000111 };
+}
+} // Predicates = [HasStdExtZbbOrZbp]
+
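Reference semantics for the two standalone encodings above, sketched for the RV64 case as described for Zbb: rev8 reverses the bytes of the register, and orc.b ORs the bits within each byte so every nonzero byte becomes 0xff (a common building block for byte-oriented string loops). A small host-side sketch:

#include <cassert>
#include <cstdint>

static uint64_t rev8(uint64_t X) {
  uint64_t R = 0;
  for (int I = 0; I < 8; ++I)
    R |= ((X >> (8 * I)) & 0xff) << (8 * (7 - I)); // move byte I to byte 7-I
  return R;
}

static uint64_t orc_b(uint64_t X) {
  uint64_t R = 0;
  for (int I = 0; I < 8; ++I)
    if ((X >> (8 * I)) & 0xff)
      R |= UINT64_C(0xff) << (8 * I); // saturate every nonzero byte to 0xff
  return R;
}

int main() {
  assert(rev8(UINT64_C(0x0102030405060708)) == UINT64_C(0x0807060504030201));
  assert(orc_b(UINT64_C(0x0012000034005600)) == UINT64_C(0x00ff0000ff00ff00));
  return 0;
}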
//===----------------------------------------------------------------------===//
// Future compressed instructions
//===----------------------------------------------------------------------===//
@@ -415,208 +530,123 @@ def C_NOT : RVBInstC<0b00, "c.not">, Sched<[]>;
def C_NEG : RVBInstC<0b01, "c.neg">, Sched<[]>;
} // DecoderNamespace = "RVBC", Predicates = [HasStdExtZbproposedc, HasStdExtC]
-let DecoderNamespace = "RVBC", Predicates = [HasStdExtZbproposedc, HasStdExtZbbOrZbp, HasStdExtC, IsRV64] in
+let DecoderNamespace = "RVBC", Predicates = [HasStdExtZbproposedc, HasStdExtZba, HasStdExtC, IsRV64] in
def C_ZEXTW : RVBInstC<0b10, "c.zext.w">, Sched<[]>;
//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtZbb, IsRV32] in {
-def : InstAlias<"zext.b $rd, $rs", (ANDI GPR:$rd, GPR:$rs, 0xFF)>;
-def : InstAlias<"zext.h $rd, $rs", (PACK GPR:$rd, GPR:$rs, X0)>;
-} // Predicates = [HasStdExtZbb, IsRV32]
-
-let Predicates = [HasStdExtZbb, IsRV64] in {
-def : InstAlias<"zext.b $rd, $rs", (ANDI GPR:$rd, GPR:$rs, 0xFF)>;
-def : InstAlias<"zext.h $rd, $rs", (PACKW GPR:$rd, GPR:$rs, X0)>;
-def : InstAlias<"zext.w $rd, $rs", (PACK GPR:$rd, GPR:$rs, X0)>;
-} // Predicates = [HasStdExtZbb, IsRV64]
+let Predicates = [HasStdExtZba, IsRV64] in {
+// NOTE: The 0.93 spec shows zext.w as an alias of pack/packw. It has been
+// changed to add.uw in a draft after 0.94.
+def : InstAlias<"zext.w $rd, $rs", (ADDUW GPR:$rd, GPR:$rs, X0)>;
+}
-let Predicates = [HasStdExtZbbOrZbp] in {
-def : InstAlias<"rev.p $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00001)>,
- Sched<[]>;
-def : InstAlias<"rev2.n $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00010)>,
- Sched<[]>;
-def : InstAlias<"rev.n $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00011)>,
- Sched<[]>;
-def : InstAlias<"rev4.b $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00100)>,
- Sched<[]>;
-def : InstAlias<"rev2.b $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00110)>,
- Sched<[]>;
-def : InstAlias<"rev.b $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00111)>,
- Sched<[]>;
-def : InstAlias<"rev8.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01000)>,
- Sched<[]>;
-def : InstAlias<"rev4.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01100)>,
- Sched<[]>;
-def : InstAlias<"rev2.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01110)>,
- Sched<[]>;
-def : InstAlias<"rev.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01111)>,
- Sched<[]>;
-
-def : InstAlias<"zip.n $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0001)>,
- Sched<[]>;
-def : InstAlias<"unzip.n $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0001)>,
- Sched<[]>;
-def : InstAlias<"zip2.b $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0010)>,
- Sched<[]>;
-def : InstAlias<"unzip2.b $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0010)>,
- Sched<[]>;
-def : InstAlias<"zip.b $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0011)>,
- Sched<[]>;
-def : InstAlias<"unzip.b $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0011)>,
- Sched<[]>;
-def : InstAlias<"zip4.h $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0100)>,
- Sched<[]>;
-def : InstAlias<"unzip4.h $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0100)>,
- Sched<[]>;
-def : InstAlias<"zip2.h $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0110)>,
- Sched<[]>;
-def : InstAlias<"unzip2.h $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0110)>,
- Sched<[]>;
-def : InstAlias<"zip.h $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0111)>,
- Sched<[]>;
-def : InstAlias<"unzip.h $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0111)>,
- Sched<[]>;
-
-def : InstAlias<"orc.p $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00001)>,
- Sched<[]>;
-def : InstAlias<"orc2.n $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00010)>,
- Sched<[]>;
-def : InstAlias<"orc.n $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00011)>,
- Sched<[]>;
-def : InstAlias<"orc4.b $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00100)>,
- Sched<[]>;
-def : InstAlias<"orc2.b $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00110)>,
- Sched<[]>;
-def : InstAlias<"orc.b $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00111)>,
- Sched<[]>;
-def : InstAlias<"orc8.h $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b01000)>,
- Sched<[]>;
-def : InstAlias<"orc4.h $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b01100)>,
- Sched<[]>;
-def : InstAlias<"orc2.h $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b01110)>,
- Sched<[]>;
-def : InstAlias<"orc.h $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b01111)>,
- Sched<[]>;
-} // Predicates = [HasStdExtZbbOrZbp]
+let Predicates = [HasStdExtZbp] in {
+def : InstAlias<"rev.p $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00001)>;
+def : InstAlias<"rev2.n $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00010)>;
+def : InstAlias<"rev.n $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00011)>;
+def : InstAlias<"rev4.b $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00100)>;
+def : InstAlias<"rev2.b $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00110)>;
+def : InstAlias<"rev.b $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00111)>;
+def : InstAlias<"rev8.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01000)>;
+def : InstAlias<"rev4.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01100)>;
+def : InstAlias<"rev2.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01110)>;
+def : InstAlias<"rev.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01111)>;
+
+def : InstAlias<"zip.n $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0001)>;
+def : InstAlias<"unzip.n $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0001)>;
+def : InstAlias<"zip2.b $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0010)>;
+def : InstAlias<"unzip2.b $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0010)>;
+def : InstAlias<"zip.b $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0011)>;
+def : InstAlias<"unzip.b $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0011)>;
+def : InstAlias<"zip4.h $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0100)>;
+def : InstAlias<"unzip4.h $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0100)>;
+def : InstAlias<"zip2.h $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0110)>;
+def : InstAlias<"unzip2.h $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0110)>;
+def : InstAlias<"zip.h $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0111)>;
+def : InstAlias<"unzip.h $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0111)>;
+
+def : InstAlias<"orc.p $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00001)>;
+def : InstAlias<"orc2.n $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00010)>;
+def : InstAlias<"orc.n $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00011)>;
+def : InstAlias<"orc4.b $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00100)>;
+def : InstAlias<"orc2.b $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00110)>;
+// orc.b is considered an instruction rather than an alias.
+def : InstAlias<"orc8.h $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b01000)>;
+def : InstAlias<"orc4.h $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b01100)>;
+def : InstAlias<"orc2.h $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b01110)>;
+def : InstAlias<"orc.h $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b01111)>;
+} // Predicates = [HasStdExtZbp]
-let Predicates = [HasStdExtZbbOrZbp, IsRV32] in {
-def : InstAlias<"rev16 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b10000)>, Sched<[]>;
-def : InstAlias<"rev8 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b11000)>, Sched<[]>;
-def : InstAlias<"rev4 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b11100)>, Sched<[]>;
-def : InstAlias<"rev2 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b11110)>, Sched<[]>;
-def : InstAlias<"rev $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b11111)>, Sched<[]>;
-
-def : InstAlias<"zip8 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1000)>,
- Sched<[]>;
-def : InstAlias<"unzip8 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1000)>,
- Sched<[]>;
-def : InstAlias<"zip4 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1100)>,
- Sched<[]>;
-def : InstAlias<"unzip4 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1100)>,
- Sched<[]>;
-def : InstAlias<"zip2 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1110)>,
- Sched<[]>;
-def : InstAlias<"unzip2 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1110)>,
- Sched<[]>;
-def : InstAlias<"zip $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1111)>,
- Sched<[]>;
-def : InstAlias<"unzip $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1111)>,
- Sched<[]>;
-
-def : InstAlias<"orc16 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b10000)>, Sched<[]>;
-def : InstAlias<"orc8 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b11000)>, Sched<[]>;
-def : InstAlias<"orc4 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b11100)>, Sched<[]>;
-def : InstAlias<"orc2 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b11110)>, Sched<[]>;
-def : InstAlias<"orc $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b11111)>, Sched<[]>;
-} // Predicates = [HasStdExtZbbOrZbp, IsRV32]
+let Predicates = [HasStdExtZbp, IsRV32] in {
+def : InstAlias<"rev16 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b10000)>;
+// rev8 is considered an instruction rather than an alias.
+def : InstAlias<"rev4 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b11100)>;
+def : InstAlias<"rev2 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b11110)>;
+def : InstAlias<"rev $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b11111)>;
+
+def : InstAlias<"zip8 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1000)>;
+def : InstAlias<"unzip8 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1000)>;
+def : InstAlias<"zip4 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1100)>;
+def : InstAlias<"unzip4 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1100)>;
+def : InstAlias<"zip2 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1110)>;
+def : InstAlias<"unzip2 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1110)>;
+def : InstAlias<"zip $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1111)>;
+def : InstAlias<"unzip $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1111)>;
+
+def : InstAlias<"orc16 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b10000)>;
+def : InstAlias<"orc8 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b11000)>;
+def : InstAlias<"orc4 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b11100)>;
+def : InstAlias<"orc2 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b11110)>;
+def : InstAlias<"orc $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b11111)>;
+} // Predicates = [HasStdExtZbp, IsRV32]
-let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
-def : InstAlias<"rev16.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b010000)>,
- Sched<[]>;
-def : InstAlias<"rev8.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b011000)>,
- Sched<[]>;
-def : InstAlias<"rev4.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b011100)>,
- Sched<[]>;
-def : InstAlias<"rev2.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b011110)>,
- Sched<[]>;
-def : InstAlias<"rev.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b011111)>,
- Sched<[]>;
-def : InstAlias<"rev32 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b100000)>,
- Sched<[]>;
-def : InstAlias<"rev16 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b110000)>,
- Sched<[]>;
-def : InstAlias<"rev8 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b111000)>,
- Sched<[]>;
-def : InstAlias<"rev4 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b111100)>,
- Sched<[]>;
-def : InstAlias<"rev2 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b111110)>,
- Sched<[]>;
-def : InstAlias<"rev $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b111111)>,
- Sched<[]>;
-
-def : InstAlias<"zip8.w $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b01000)>,
- Sched<[]>;
-def : InstAlias<"unzip8.w $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b01000)>,
- Sched<[]>;
-def : InstAlias<"zip4.w $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b01100)>,
- Sched<[]>;
-def : InstAlias<"unzip4.w $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b01100)>,
- Sched<[]>;
-def : InstAlias<"zip2.w $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b01110)>,
- Sched<[]>;
-def : InstAlias<"unzip2.w $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b01110)>,
- Sched<[]>;
-def : InstAlias<"zip.w $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b01111)>,
- Sched<[]>;
-def : InstAlias<"unzip.w $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b01111)>,
- Sched<[]>;
-def : InstAlias<"zip16 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b10000)>,
- Sched<[]>;
-def : InstAlias<"unzip16 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b10000)>,
- Sched<[]>;
-def : InstAlias<"zip8 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b11000)>,
- Sched<[]>;
-def : InstAlias<"unzip8 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b11000)>,
- Sched<[]>;
-def : InstAlias<"zip4 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b11100)>,
- Sched<[]>;
-def : InstAlias<"unzip4 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b11100)>,
- Sched<[]>;
-def : InstAlias<"zip2 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b11110)>,
- Sched<[]>;
-def : InstAlias<"unzip2 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b11110)>,
- Sched<[]>;
-def : InstAlias<"zip $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b11111)>,
- Sched<[]>;
-def : InstAlias<"unzip $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b11111)>,
- Sched<[]>;
-
-def : InstAlias<"orc16.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b010000)>,
- Sched<[]>;
-def : InstAlias<"orc8.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b011000)>,
- Sched<[]>;
-def : InstAlias<"orc4.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b011100)>,
- Sched<[]>;
-def : InstAlias<"orc2.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b011110)>,
- Sched<[]>;
-def : InstAlias<"orc.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b011111)>,
- Sched<[]>;
-def : InstAlias<"orc32 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b100000)>,
- Sched<[]>;
-def : InstAlias<"orc16 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b110000)>,
- Sched<[]>;
-def : InstAlias<"orc8 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b111000)>,
- Sched<[]>;
-def : InstAlias<"orc4 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b111100)>,
- Sched<[]>;
-def : InstAlias<"orc2 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b111110)>,
- Sched<[]>;
-def : InstAlias<"orc $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b111111)>,
- Sched<[]>;
-} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
+let Predicates = [HasStdExtZbp, IsRV64] in {
+def : InstAlias<"rev16.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b010000)>;
+def : InstAlias<"rev8.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b011000)>;
+def : InstAlias<"rev4.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b011100)>;
+def : InstAlias<"rev2.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b011110)>;
+def : InstAlias<"rev.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b011111)>;
+def : InstAlias<"rev32 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b100000)>;
+def : InstAlias<"rev16 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b110000)>;
+// rev8 is considered an instruction rather than an alias.
+def : InstAlias<"rev4 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b111100)>;
+def : InstAlias<"rev2 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b111110)>;
+def : InstAlias<"rev $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b111111)>;
+
+def : InstAlias<"zip8.w $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b01000)>;
+def : InstAlias<"unzip8.w $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b01000)>;
+def : InstAlias<"zip4.w $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b01100)>;
+def : InstAlias<"unzip4.w $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b01100)>;
+def : InstAlias<"zip2.w $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b01110)>;
+def : InstAlias<"unzip2.w $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b01110)>;
+def : InstAlias<"zip.w $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b01111)>;
+def : InstAlias<"unzip.w $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b01111)>;
+def : InstAlias<"zip16 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b10000)>;
+def : InstAlias<"unzip16 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b10000)>;
+def : InstAlias<"zip8 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b11000)>;
+def : InstAlias<"unzip8 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b11000)>;
+def : InstAlias<"zip4 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b11100)>;
+def : InstAlias<"unzip4 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b11100)>;
+def : InstAlias<"zip2 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b11110)>;
+def : InstAlias<"unzip2 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b11110)>;
+def : InstAlias<"zip $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b11111)>;
+def : InstAlias<"unzip $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b11111)>;
+
+def : InstAlias<"orc16.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b010000)>;
+def : InstAlias<"orc8.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b011000)>;
+def : InstAlias<"orc4.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b011100)>;
+def : InstAlias<"orc2.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b011110)>;
+def : InstAlias<"orc.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b011111)>;
+def : InstAlias<"orc32 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b100000)>;
+def : InstAlias<"orc16 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b110000)>;
+def : InstAlias<"orc8 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b111000)>;
+def : InstAlias<"orc4 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b111100)>;
+def : InstAlias<"orc2 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b111110)>;
+def : InstAlias<"orc $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b111111)>;
+} // Predicates = [HasStdExtZbp, IsRV64]
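As a point of reference for these control values: grevi performs a staged bit permutation in which control bit k swaps adjacent 2^k-bit blocks, and gorci is the same network with OR instead of swap, so the rev*/orc* spellings are just fixed control immediates. A minimal 32-bit C model (illustrative only; the helper name grev32 is made up, and the 64-bit form adds one more stage that exchanges the 32-bit halves):

    #include <stdint.h>

    /* Control bit k swaps adjacent 2^k-bit blocks.  The rev-type aliases
       are particular control values, e.g. a full bit reverse enables all
       stages and a byte swap uses only the 8- and 16-bit stages. */
    static uint32_t grev32(uint32_t x, unsigned ctrl) {
        if (ctrl & 1)  x = ((x & 0x55555555u) << 1)  | ((x & 0xAAAAAAAAu) >> 1);
        if (ctrl & 2)  x = ((x & 0x33333333u) << 2)  | ((x & 0xCCCCCCCCu) >> 2);
        if (ctrl & 4)  x = ((x & 0x0F0F0F0Fu) << 4)  | ((x & 0xF0F0F0F0u) >> 4);
        if (ctrl & 8)  x = ((x & 0x00FF00FFu) << 8)  | ((x & 0xFF00FF00u) >> 8);
        if (ctrl & 16) x = (x << 16) | (x >> 16);
        return x;
    }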
//===----------------------------------------------------------------------===//
// Compressed Instruction patterns
@@ -628,22 +658,14 @@ def : CompressPat<(SUB GPRC:$rs1, X0, GPRC:$rs1),
(C_NEG GPRC:$rs1)>;
} // Predicates = [HasStdExtZbproposedc, HasStdExtC]
-let Predicates = [HasStdExtZbproposedc, HasStdExtZbbOrZbp, HasStdExtC, IsRV64] in {
-def : CompressPat<(PACK GPRC:$rs1, GPRC:$rs1, X0),
+let Predicates = [HasStdExtZbproposedc, HasStdExtZba, HasStdExtC, IsRV64] in {
+def : CompressPat<(ADDUW GPRC:$rs1, GPRC:$rs1, X0),
(C_ZEXTW GPRC:$rs1)>;
} // Predicates = [HasStdExtZbproposedc, HasStdExtC, IsRV64]
//===----------------------------------------------------------------------===//
// Codegen patterns
//===----------------------------------------------------------------------===//
-def SLOIPat : ComplexPattern<XLenVT, 2, "SelectSLOI", [or]>;
-def SROIPat : ComplexPattern<XLenVT, 2, "SelectSROI", [or]>;
-def RORIPat : ComplexPattern<XLenVT, 2, "SelectRORI", [rotl]>;
-def SLLIUWPat : ComplexPattern<i64, 2, "SelectSLLIUW", [and]>;
-def SLOIWPat : ComplexPattern<i64, 2, "SelectSLOIW", [sext_inreg]>;
-def SROIWPat : ComplexPattern<i64, 2, "SelectSROIW", [or]>;
-def RORIWPat : ComplexPattern<i64, 2, "SelectRORIW", [sext_inreg]>;
-def FSRIWPat : ComplexPattern<i64, 3, "SelectFSRIW", [sext_inreg]>;
let Predicates = [HasStdExtZbbOrZbp] in {
def : Pat<(and GPR:$rs1, (not GPR:$rs2)), (ANDN GPR:$rs1, GPR:$rs2)>;
@@ -651,221 +673,194 @@ def : Pat<(or GPR:$rs1, (not GPR:$rs2)), (ORN GPR:$rs1, GPR:$rs2)>;
def : Pat<(xor GPR:$rs1, (not GPR:$rs2)), (XNOR GPR:$rs1, GPR:$rs2)>;
} // Predicates = [HasStdExtZbbOrZbp]
-let Predicates = [HasStdExtZbb] in {
-def : Pat<(xor (shl (xor GPR:$rs1, -1), GPR:$rs2), -1),
+let Predicates = [HasStdExtZbp] in {
+def : Pat<(not (shiftop<shl> (not GPR:$rs1), GPR:$rs2)),
(SLO GPR:$rs1, GPR:$rs2)>;
-def : Pat<(xor (srl (xor GPR:$rs1, -1), GPR:$rs2), -1),
+def : Pat<(not (shiftop<srl> (not GPR:$rs1), GPR:$rs2)),
(SRO GPR:$rs1, GPR:$rs2)>;
-} // Predicates = [HasStdExtZbb]
+} // Predicates = [HasStdExtZbp]
let Predicates = [HasStdExtZbbOrZbp] in {
def : Pat<(rotl GPR:$rs1, GPR:$rs2), (ROL GPR:$rs1, GPR:$rs2)>;
-def : Pat<(fshl GPR:$rs1, GPR:$rs1, GPR:$rs2), (ROL GPR:$rs1, GPR:$rs2)>;
def : Pat<(rotr GPR:$rs1, GPR:$rs2), (ROR GPR:$rs1, GPR:$rs2)>;
-def : Pat<(fshr GPR:$rs1, GPR:$rs1, GPR:$rs2), (ROR GPR:$rs1, GPR:$rs2)>;
} // Predicates = [HasStdExtZbbOrZbp]
-let Predicates = [HasStdExtZbs, IsRV32] in
-def : Pat<(and (xor (shl 1, (and GPR:$rs2, 31)), -1), GPR:$rs1),
- (SBCLR GPR:$rs1, GPR:$rs2)>;
-let Predicates = [HasStdExtZbs, IsRV64] in
-def : Pat<(and (xor (shl 1, (and GPR:$rs2, 63)), -1), GPR:$rs1),
- (SBCLR GPR:$rs1, GPR:$rs2)>;
-
-let Predicates = [HasStdExtZbs] in
-def : Pat<(and (rotl -2, GPR:$rs2), GPR:$rs1), (SBCLR GPR:$rs1, GPR:$rs2)>;
-
-let Predicates = [HasStdExtZbs, IsRV32] in
-def : Pat<(or (shl 1, (and GPR:$rs2, 31)), GPR:$rs1),
- (SBSET GPR:$rs1, GPR:$rs2)>;
-let Predicates = [HasStdExtZbs, IsRV64] in
-def : Pat<(or (shl 1, (and GPR:$rs2, 63)), GPR:$rs1),
- (SBSET GPR:$rs1, GPR:$rs2)>;
-
-let Predicates = [HasStdExtZbs, IsRV32] in
-def : Pat<(xor (shl 1, (and GPR:$rs2, 31)), GPR:$rs1),
- (SBINV GPR:$rs1, GPR:$rs2)>;
-let Predicates = [HasStdExtZbs, IsRV64] in
-def : Pat<(xor (shl 1, (and GPR:$rs2, 63)), GPR:$rs1),
- (SBINV GPR:$rs1, GPR:$rs2)>;
-
-let Predicates = [HasStdExtZbs, IsRV32] in
-def : Pat<(and (srl GPR:$rs1, (and GPR:$rs2, 31)), 1),
- (SBEXT GPR:$rs1, GPR:$rs2)>;
-
-let Predicates = [HasStdExtZbs, IsRV64] in
-def : Pat<(and (srl GPR:$rs1, (and GPR:$rs2, 63)), 1),
- (SBEXT GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbs] in {
+def : Pat<(and (not (shiftop<shl> 1, GPR:$rs2)), GPR:$rs1),
+ (BCLR GPR:$rs1, GPR:$rs2)>;
+def : Pat<(and (rotl -2, GPR:$rs2), GPR:$rs1), (BCLR GPR:$rs1, GPR:$rs2)>;
+def : Pat<(or (shiftop<shl> 1, GPR:$rs2), GPR:$rs1),
+ (BSET GPR:$rs1, GPR:$rs2)>;
+def : Pat<(xor (shiftop<shl> 1, GPR:$rs2), GPR:$rs1),
+ (BINV GPR:$rs1, GPR:$rs2)>;
+def : Pat<(and (shiftop<srl> GPR:$rs1, GPR:$rs2), 1),
+ (BEXT GPR:$rs1, GPR:$rs2)>;
+
+def : Pat<(shiftop<shl> 1, GPR:$rs2),
+ (BSET X0, GPR:$rs2)>;
+
+def : Pat<(and GPR:$rs1, BCLRMask:$mask),
+ (BCLRI GPR:$rs1, (BCLRXForm imm:$mask))>;
+def : Pat<(or GPR:$rs1, BSETINVMask:$mask),
+ (BSETI GPR:$rs1, (BSETINVXForm imm:$mask))>;
+def : Pat<(xor GPR:$rs1, BSETINVMask:$mask),
+ (BINVI GPR:$rs1, (BSETINVXForm imm:$mask))>;
-let Predicates = [HasStdExtZbb] in {
+def : Pat<(and (srl GPR:$rs1, uimmlog2xlen:$shamt), (XLenVT 1)),
+ (BEXTI GPR:$rs1, uimmlog2xlen:$shamt)>;
+}
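The single-bit instructions selected here have direct C equivalents, which is all the shiftop patterns express (including the implicit masking of the shift amount). A rough sketch assuming XLEN = 64; the helper names are illustrative:

    #include <stdint.h>

    typedef uint64_t xlen_t;                       /* sketch assumes RV64 */

    xlen_t bclr(xlen_t rs1, xlen_t rs2) { return rs1 & ~((xlen_t)1 << (rs2 & 63)); }
    xlen_t bset(xlen_t rs1, xlen_t rs2) { return rs1 |  ((xlen_t)1 << (rs2 & 63)); }
    xlen_t binv(xlen_t rs1, xlen_t rs2) { return rs1 ^  ((xlen_t)1 << (rs2 & 63)); }
    xlen_t bext(xlen_t rs1, xlen_t rs2) { return (rs1 >> (rs2 & 63)) & 1; }

    /* The immediate forms cover and/or/xor against a single-bit or
       inverted-single-bit constant, e.g. x & ~(1ull << 5) -> bclri x, 5. */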
+
+let Predicates = [HasStdExtZbp] in {
def : Pat<(SLOIPat GPR:$rs1, uimmlog2xlen:$shamt),
(SLOI GPR:$rs1, uimmlog2xlen:$shamt)>;
def : Pat<(SROIPat GPR:$rs1, uimmlog2xlen:$shamt),
(SROI GPR:$rs1, uimmlog2xlen:$shamt)>;
-} // Predicates = [HasStdExtZbb]
+} // Predicates = [HasStdExtZbp]
-// There's no encoding for roli in the current version of the 'B' extension
-// (v0.92) as it can be implemented with rori by negating the immediate.
-// For this reason we pattern-match only against rori[w].
-let Predicates = [HasStdExtZbbOrZbp] in
-def : Pat<(RORIPat GPR:$rs1, uimmlog2xlen:$shamt),
+// There's no encoding for roli in the 'B' extension as it can be
+// implemented with rori by negating the immediate.
+let Predicates = [HasStdExtZbbOrZbp] in {
+def : Pat<(rotr GPR:$rs1, uimmlog2xlen:$shamt),
(RORI GPR:$rs1, uimmlog2xlen:$shamt)>;
+def : Pat<(rotl GPR:$rs1, uimmlog2xlen:$shamt),
+ (RORI GPR:$rs1, (ImmSubFromXLen uimmlog2xlen:$shamt))>;
+}
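A rotate-left by a constant is therefore matched as a rotate-right by XLEN minus that constant, which is what ImmSubFromXLen supplies. A quick C illustration (assuming XLEN = 32 and 0 < s < 32, to stay clear of undefined shifts):

    #include <stdint.h>

    static uint32_t rotr32(uint32_t x, unsigned s) { return (x >> s) | (x << (32 - s)); }
    static uint32_t rotl32(uint32_t x, unsigned s) { return (x << s) | (x >> (32 - s)); }
    /* For 0 < s < 32: rotl32(x, s) == rotr32(x, 32 - s), so rori covers both. */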
-// We don't pattern-match sbclri[w], sbseti[w], sbinvi[w] because they are
-// pattern-matched by simple andi, ori, and xori.
-let Predicates = [HasStdExtZbs] in
-def : Pat<(and (srl GPR:$rs1, uimmlog2xlen:$shamt), (XLenVT 1)),
- (SBEXTI GPR:$rs1, uimmlog2xlen:$shamt)>;
+def riscv_grevi : SDNode<"RISCVISD::GREVI", SDTIntBinOp, []>;
+def riscv_greviw : SDNode<"RISCVISD::GREVIW", SDTIntBinOp, []>;
+def riscv_gorci : SDNode<"RISCVISD::GORCI", SDTIntBinOp, []>;
+def riscv_gorciw : SDNode<"RISCVISD::GORCIW", SDTIntBinOp, []>;
-let Predicates = [HasStdExtZbp, IsRV32] in {
-def : Pat<(or (or (and (srl GPR:$rs1, (i32 1)), (i32 0x55555555)), GPR:$rs1),
- (and (shl GPR:$rs1, (i32 1)), (i32 0xAAAAAAAA))),
- (GORCI GPR:$rs1, (i32 1))>;
-def : Pat<(or (or (and (srl GPR:$rs1, (i32 2)), (i32 0x33333333)), GPR:$rs1),
- (and (shl GPR:$rs1, (i32 2)), (i32 0xCCCCCCCC))),
- (GORCI GPR:$rs1, (i32 2))>;
-def : Pat<(or (or (and (srl GPR:$rs1, (i32 4)), (i32 0x0F0F0F0F)), GPR:$rs1),
- (and (shl GPR:$rs1, (i32 4)), (i32 0xF0F0F0F0))),
- (GORCI GPR:$rs1, (i32 4))>;
-def : Pat<(or (or (and (srl GPR:$rs1, (i32 8)), (i32 0x00FF00FF)), GPR:$rs1),
- (and (shl GPR:$rs1, (i32 8)), (i32 0xFF00FF00))),
- (GORCI GPR:$rs1, (i32 8))>;
-def : Pat<(or (or (srl GPR:$rs1, (i32 16)), GPR:$rs1),
- (shl GPR:$rs1, (i32 16))),
- (GORCI GPR:$rs1, (i32 16))>;
-} // Predicates = [HasStdExtZbp, IsRV32]
+let Predicates = [HasStdExtZbp] in {
+def : Pat<(riscv_grevi GPR:$rs1, timm:$shamt), (GREVI GPR:$rs1, timm:$shamt)>;
+def : Pat<(riscv_gorci GPR:$rs1, timm:$shamt), (GORCI GPR:$rs1, timm:$shamt)>;
-let Predicates = [HasStdExtZbp, IsRV64] in {
-def : Pat<(or (or (and (srl GPR:$rs1, (i64 1)), (i64 0x5555555555555555)),
- GPR:$rs1),
- (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAAAAAAAAAA))),
- (GORCI GPR:$rs1, (i64 1))>;
-def : Pat<(or (or (and (srl GPR:$rs1, (i64 2)), (i64 0x3333333333333333)),
- GPR:$rs1),
- (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCCCCCCCCCC))),
- (GORCI GPR:$rs1, (i64 2))>;
-def : Pat<(or (or (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F0F0F0F0F)),
- GPR:$rs1),
- (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0F0F0F0F0))),
- (GORCI GPR:$rs1, (i64 4))>;
-def : Pat<(or (or (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF00FF00FF)),
- GPR:$rs1),
- (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00FF00FF00))),
- (GORCI GPR:$rs1, (i64 8))>;
-def : Pat<(or (or (and (srl GPR:$rs1, (i64 16)), (i64 0x0000FFFF0000FFFF)),
- GPR:$rs1),
- (and (shl GPR:$rs1, (i64 16)), (i64 0xFFFF0000FFFF0000))),
- (GORCI GPR:$rs1, (i64 16))>;
-def : Pat<(or (or (srl GPR:$rs1, (i64 32)), GPR:$rs1),
- (shl GPR:$rs1, (i64 32))),
- (GORCI GPR:$rs1, (i64 32))>;
-} // Predicates = [HasStdExtZbp, IsRV64]
+// We treat orc.b as a separate instruction, so match it directly.
+def : Pat<(riscv_gorci GPR:$rs1, (XLenVT 7)), (ORCB GPR:$rs1)>;
+} // Predicates = [HasStdExtZbp]
let Predicates = [HasStdExtZbp, IsRV32] in {
-def : Pat<(or (and (shl GPR:$rs1, (i32 1)), (i32 0xAAAAAAAA)),
- (and (srl GPR:$rs1, (i32 1)), (i32 0x55555555))),
- (GREVI GPR:$rs1, (i32 1))>;
-def : Pat<(or (and (shl GPR:$rs1, (i32 2)), (i32 0xCCCCCCCC)),
- (and (srl GPR:$rs1, (i32 2)), (i32 0x33333333))),
- (GREVI GPR:$rs1, (i32 2))>;
-def : Pat<(or (and (shl GPR:$rs1, (i32 4)), (i32 0xF0F0F0F0)),
- (and (srl GPR:$rs1, (i32 4)), (i32 0x0F0F0F0F))),
- (GREVI GPR:$rs1, (i32 4))>;
-def : Pat<(or (and (shl GPR:$rs1, (i32 8)), (i32 0xFF00FF00)),
- (and (srl GPR:$rs1, (i32 8)), (i32 0x00FF00FF))),
- (GREVI GPR:$rs1, (i32 8))>;
-def : Pat<(rotr (bswap GPR:$rs1), (i32 16)), (GREVI GPR:$rs1, (i32 8))>;
-def : Pat<(or (shl GPR:$rs1, (i32 16)), (srl GPR:$rs1, (i32 16))),
- (GREVI GPR:$rs1, (i32 16))>;
-def : Pat<(rotl GPR:$rs1, (i32 16)), (GREVI GPR:$rs1, (i32 16))>;
-def : Pat<(bswap GPR:$rs1), (GREVI GPR:$rs1, (i32 24))>;
-def : Pat<(bitreverse GPR:$rs1), (GREVI GPR:$rs1, (i32 31))>;
+def : Pat<(rotr (riscv_grevi GPR:$rs1, (i32 24)), (i32 16)), (GREVI GPR:$rs1, 8)>;
+def : Pat<(rotl (riscv_grevi GPR:$rs1, (i32 24)), (i32 16)), (GREVI GPR:$rs1, 8)>;
+
+// We treat rev8 as a separate instruction, so match it directly.
+def : Pat<(riscv_grevi GPR:$rs1, (i32 24)), (REV8_RV32 GPR:$rs1)>;
} // Predicates = [HasStdExtZbp, IsRV32]
let Predicates = [HasStdExtZbp, IsRV64] in {
-def : Pat<(or (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAAAAAAAAAA)),
- (and (srl GPR:$rs1, (i64 1)), (i64 0x5555555555555555))),
- (GREVI GPR:$rs1, (i64 1))>;
-def : Pat<(or (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCCCCCCCCCC)),
- (and (srl GPR:$rs1, (i64 2)), (i64 0x3333333333333333))),
- (GREVI GPR:$rs1, (i64 2))>;
-def : Pat<(or (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0F0F0F0F0)),
- (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F0F0F0F0F))),
- (GREVI GPR:$rs1, (i64 4))>;
-def : Pat<(or (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00FF00FF00)),
- (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF00FF00FF))),
- (GREVI GPR:$rs1, (i64 8))>;
-def : Pat<(or (and (shl GPR:$rs1, (i64 16)), (i64 0xFFFF0000FFFF0000)),
- (and (srl GPR:$rs1, (i64 16)), (i64 0x0000FFFF0000FFFF))),
- (GREVI GPR:$rs1, (i64 16))>;
-def : Pat<(or (shl GPR:$rs1, (i64 32)), (srl GPR:$rs1, (i64 32))),
- (GREVI GPR:$rs1, (i64 32))>;
-def : Pat<(rotl GPR:$rs1, (i64 32)), (GREVI GPR:$rs1, (i64 32))>;
-def : Pat<(bswap GPR:$rs1), (GREVI GPR:$rs1, (i64 56))>;
-def : Pat<(bitreverse GPR:$rs1), (GREVI GPR:$rs1, (i64 63))>;
+// We treat rev8 as a separate instruction, so match it directly.
+def : Pat<(riscv_grevi GPR:$rs1, (i64 56)), (REV8_RV64 GPR:$rs1)>;
} // Predicates = [HasStdExtZbp, IsRV64]
let Predicates = [HasStdExtZbt] in {
-def : Pat<(or (and (xor GPR:$rs2, -1), GPR:$rs3), (and GPR:$rs2, GPR:$rs1)),
+def : Pat<(or (and (not GPR:$rs2), GPR:$rs3), (and GPR:$rs2, GPR:$rs1)),
(CMIX GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
-def : Pat<(riscv_selectcc GPR:$rs2, (XLenVT 0), (XLenVT 17), GPR:$rs3, GPR:$rs1),
+
+def : Pat<(select (XLenVT (setne GPR:$rs2, 0)), GPR:$rs1, GPR:$rs3),
+ (CMOV GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
+def : Pat<(select (XLenVT (seteq GPR:$rs2, 0)), GPR:$rs3, GPR:$rs1),
(CMOV GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
-def : Pat<(fshl GPR:$rs1, GPR:$rs2, GPR:$rs3),
- (FSL GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
-def : Pat<(fshr GPR:$rs1, GPR:$rs2, GPR:$rs3),
- (FSR GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
-def : Pat<(fshr GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt),
- (FSRI GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt)>;
+def : Pat<(select (XLenVT (setne GPR:$x, simm12_plus1:$y)), GPR:$rs1, GPR:$rs3),
+ (CMOV GPR:$rs1, (ADDI GPR:$x, (NegImm simm12_plus1:$y)), GPR:$rs3)>;
+def : Pat<(select (XLenVT (seteq GPR:$x, simm12_plus1:$y)), GPR:$rs3, GPR:$rs1),
+ (CMOV GPR:$rs1, (ADDI GPR:$x, (NegImm simm12_plus1:$y)), GPR:$rs3)>;
+def : Pat<(select (XLenVT (setne GPR:$x, GPR:$y)), GPR:$rs1, GPR:$rs3),
+ (CMOV GPR:$rs1, (XOR GPR:$x, GPR:$y), GPR:$rs3)>;
+def : Pat<(select (XLenVT (seteq GPR:$x, GPR:$y)), GPR:$rs3, GPR:$rs1),
+ (CMOV GPR:$rs1, (XOR GPR:$x, GPR:$y), GPR:$rs3)>;
+def : Pat<(select (XLenVT (setuge GPR:$x, GPR:$y)), GPR:$rs3, GPR:$rs1),
+ (CMOV GPR:$rs1, (SLTU GPR:$x, GPR:$y), GPR:$rs3)>;
+def : Pat<(select (XLenVT (setule GPR:$y, GPR:$x)), GPR:$rs3, GPR:$rs1),
+ (CMOV GPR:$rs1, (SLTU GPR:$x, GPR:$y), GPR:$rs3)>;
+def : Pat<(select (XLenVT (setge GPR:$x, GPR:$y)), GPR:$rs3, GPR:$rs1),
+ (CMOV GPR:$rs1, (SLT GPR:$x, GPR:$y), GPR:$rs3)>;
+def : Pat<(select (XLenVT (setle GPR:$y, GPR:$x)), GPR:$rs3, GPR:$rs1),
+ (CMOV GPR:$rs1, (SLT GPR:$x, GPR:$y), GPR:$rs3)>;
+def : Pat<(select GPR:$rs2, GPR:$rs1, GPR:$rs3),
+ (CMOV GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
+} // Predicates = [HasStdExtZbt]
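All of these patterns reduce the select condition to a single "zero / non-zero" value feeding cmov: compare-with-immediate becomes a subtract (addi of the negated immediate), compare-with-register becomes an xor, and the orderings become slt/sltu with the select arms arranged accordingly. A rough C model of that reduction (illustrative helper names; cmov here mirrors the (CMOV $rs1, $rs2, $rs3) output pattern):

    #include <stdint.h>

    typedef uint64_t xlen_t;

    /* Matches (select (setne $rs2, 0), $rs1, $rs3) -> (CMOV $rs1, $rs2, $rs3). */
    static xlen_t cmov(xlen_t rs1, xlen_t rs2, xlen_t rs3) {
        return rs2 != 0 ? rs1 : rs3;
    }

    /* select (x != imm) ? a : b  ==>  cmov(a, x - imm, b)  (the ADDI NegImm form) */
    static xlen_t select_ne_imm(xlen_t x, xlen_t imm, xlen_t a, xlen_t b) {
        return cmov(a, x - imm, b);
    }

    /* select (x != y) ? a : b    ==>  cmov(a, x ^ y, b)
       setuge/setule and setge/setle feed cmov with sltu/slt, with the
       select operands swapped so the "true" value rides in $rs1. */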
+
+// fshl and fshr concatenate their operands in the same order. fsr and fsl
+// instructions use different orders. fshl will return its first operand for
+// a shift of zero; fshr will return its second operand. fsl and fsr both return
+// $rs1, so the patterns need to have different operand orders.
+//
+// fshl and fshr only read the lower log2(xlen) bits of the shift amount, but
+// fsl/fsr instructions read log2(xlen)+1 bits. DAG combine may have removed
+// an AND mask on the shift amount that we need to add back to avoid a one in
+// the extra bit.
+// FIXME: If we can prove that the extra bit in the shift amount is zero, we
+// don't need this mask.
+let Predicates = [HasStdExtZbt, IsRV32] in {
+def : Pat<(fshl GPR:$rs1, GPR:$rs3, GPR:$rs2),
+ (FSL GPR:$rs1, (ANDI GPR:$rs2, 31), GPR:$rs3)>;
+def : Pat<(fshr GPR:$rs3, GPR:$rs1, GPR:$rs2),
+ (FSR GPR:$rs1, (ANDI GPR:$rs2, 31), GPR:$rs3)>;
+}
+let Predicates = [HasStdExtZbt, IsRV64] in {
+def : Pat<(fshl GPR:$rs1, GPR:$rs3, GPR:$rs2),
+ (FSL GPR:$rs1, (ANDI GPR:$rs2, 63), GPR:$rs3)>;
+def : Pat<(fshr GPR:$rs3, GPR:$rs1, GPR:$rs2),
+ (FSR GPR:$rs1, (ANDI GPR:$rs2, 63), GPR:$rs3)>;
+}
+let Predicates = [HasStdExtZbt] in {
+def : Pat<(fshr GPR:$rs3, GPR:$rs1, uimmlog2xlen:$shamt),
+ (FSRI GPR:$rs1, GPR:$rs3, uimmlog2xlen:$shamt)>;
+// We can use FSRI for fshl by immediate if we subtract the immediate from
+// XLen and swap the operands.
+def : Pat<(fshl GPR:$rs3, GPR:$rs1, uimmlog2xlen:$shamt),
+ (FSRI GPR:$rs1, GPR:$rs3, (ImmSubFromXLen uimmlog2xlen:$shamt))>;
} // Predicates = [HasStdExtZbt]
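For reference (illustrative C, not LLVM code): fshl/fshr are the generic funnel shifts, which only consume the low log2(XLEN) bits of the shift amount, whereas fsl/fsr consume one bit more, which is exactly why the output patterns insert the andi:

    #include <stdint.h>

    /* LLVM funnel shift on i32: concatenate a:b with a in the high half,
       shift by c mod 32, take the high word (fshl) or the low word (fshr). */
    static uint32_t fshl32(uint32_t a, uint32_t b, uint32_t c) {
        unsigned s = c & 31;
        return s ? (a << s) | (b >> (32 - s)) : a;
    }
    static uint32_t fshr32(uint32_t a, uint32_t b, uint32_t c) {
        unsigned s = c & 31;
        return s ? (a << (32 - s)) | (b >> s) : b;
    }
    /* fsl/fsr look at a 6-bit amount on RV32; "andi rs2, rs2, 31" clears
       the extra bit so they behave like fshl32/fshr32 above. */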
let Predicates = [HasStdExtZbb] in {
def : Pat<(ctlz GPR:$rs1), (CLZ GPR:$rs1)>;
def : Pat<(cttz GPR:$rs1), (CTZ GPR:$rs1)>;
-def : Pat<(ctpop GPR:$rs1), (PCNT GPR:$rs1)>;
+def : Pat<(ctpop GPR:$rs1), (CPOP GPR:$rs1)>;
} // Predicates = [HasStdExtZbb]
-let Predicates = [HasStdExtZbb, IsRV32] in
-def : Pat<(sra (shl GPR:$rs1, (i32 24)), (i32 24)), (SEXTB GPR:$rs1)>;
-let Predicates = [HasStdExtZbb, IsRV64] in
-def : Pat<(sra (shl GPR:$rs1, (i64 56)), (i64 56)), (SEXTB GPR:$rs1)>;
-
-let Predicates = [HasStdExtZbb, IsRV32] in
-def : Pat<(sra (shl GPR:$rs1, (i32 16)), (i32 16)), (SEXTH GPR:$rs1)>;
-let Predicates = [HasStdExtZbb, IsRV64] in
-def : Pat<(sra (shl GPR:$rs1, (i64 48)), (i64 48)), (SEXTH GPR:$rs1)>;
+let Predicates = [HasStdExtZbb] in {
+def : Pat<(sext_inreg GPR:$rs1, i8), (SEXTB GPR:$rs1)>;
+def : Pat<(sext_inreg GPR:$rs1, i16), (SEXTH GPR:$rs1)>;
+}
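A one-line C illustration of what sext_inreg i8/i16 means, and hence why the shl+sra idioms the removed patterns matched collapse to a single instruction:

    #include <stdint.h>

    /* sext.b / sext.h: sign-extend the low 8 / 16 bits to XLEN. */
    static int64_t sext_b(int64_t x) { return (int8_t)x;  }
    static int64_t sext_h(int64_t x) { return (int16_t)x; }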
let Predicates = [HasStdExtZbb] in {
def : Pat<(smin GPR:$rs1, GPR:$rs2), (MIN GPR:$rs1, GPR:$rs2)>;
-def : Pat<(riscv_selectcc GPR:$rs1, GPR:$rs2, (XLenVT 20), GPR:$rs1, GPR:$rs2),
- (MIN GPR:$rs1, GPR:$rs2)>;
def : Pat<(smax GPR:$rs1, GPR:$rs2), (MAX GPR:$rs1, GPR:$rs2)>;
-def : Pat<(riscv_selectcc GPR:$rs2, GPR:$rs1, (XLenVT 20), GPR:$rs1, GPR:$rs2),
- (MAX GPR:$rs1, GPR:$rs2)>;
def : Pat<(umin GPR:$rs1, GPR:$rs2), (MINU GPR:$rs1, GPR:$rs2)>;
-def : Pat<(riscv_selectcc GPR:$rs1, GPR:$rs2, (XLenVT 12), GPR:$rs1, GPR:$rs2),
- (MINU GPR:$rs1, GPR:$rs2)>;
def : Pat<(umax GPR:$rs1, GPR:$rs2), (MAXU GPR:$rs1, GPR:$rs2)>;
-def : Pat<(riscv_selectcc GPR:$rs2, GPR:$rs1, (XLenVT 12), GPR:$rs1, GPR:$rs2),
- (MAXU GPR:$rs1, GPR:$rs2)>;
} // Predicates = [HasStdExtZbb]
-let Predicates = [HasStdExtZbbOrZbp, IsRV32] in
+let Predicates = [HasStdExtZbb, IsRV32] in {
+def : Pat<(bswap GPR:$rs1), (REV8_RV32 GPR:$rs1)>;
+} // Predicates = [HasStdExtZbb, IsRV32]
+
+let Predicates = [HasStdExtZbb, IsRV64] in {
+def : Pat<(bswap GPR:$rs1), (REV8_RV64 GPR:$rs1)>;
+} // Predicates = [HasStdExtZbb, IsRV64]
+
+let Predicates = [HasStdExtZbp, IsRV32] in
def : Pat<(or (and GPR:$rs1, 0x0000FFFF), (shl GPR:$rs2, (i32 16))),
(PACK GPR:$rs1, GPR:$rs2)>;
-let Predicates = [HasStdExtZbbOrZbp, IsRV64] in
+let Predicates = [HasStdExtZbp, IsRV64] in
def : Pat<(or (and GPR:$rs1, 0x00000000FFFFFFFF), (shl GPR:$rs2, (i64 32))),
(PACK GPR:$rs1, GPR:$rs2)>;
-let Predicates = [HasStdExtZbbOrZbp, IsRV32] in
+let Predicates = [HasStdExtZbp, IsRV32] in
def : Pat<(or (and GPR:$rs2, 0xFFFF0000), (srl GPR:$rs1, (i32 16))),
(PACKU GPR:$rs1, GPR:$rs2)>;
-let Predicates = [HasStdExtZbbOrZbp, IsRV64] in
+let Predicates = [HasStdExtZbp, IsRV64] in
def : Pat<(or (and GPR:$rs2, 0xFFFFFFFF00000000), (srl GPR:$rs1, (i64 32))),
(PACKU GPR:$rs1, GPR:$rs2)>;
-let Predicates = [HasStdExtZbbOrZbp] in
+let Predicates = [HasStdExtZbp] in
def : Pat<(or (and (shl GPR:$rs2, (XLenVT 8)), 0xFF00),
(and GPR:$rs1, 0x00FF)),
(PACKH GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbbOrZbp, IsRV32] in
+def : Pat<(and GPR:$rs, 0x0000FFFF), (ZEXTH_RV32 GPR:$rs)>;
+let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
+def : Pat<(and GPR:$rs, 0x000000000000FFFF), (ZEXTH_RV64 GPR:$rs)>;
+}
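pack/packu/packh concatenate register halves (or the low bytes), which is exactly the shape of the and/or/shift trees above; zext.h is the plain low-halfword mask. Rough RV64 C equivalents (illustrative helper names):

    #include <stdint.h>

    static uint64_t pack(uint64_t rs1, uint64_t rs2)  { return (uint64_t)(uint32_t)rs1 | (rs2 << 32); }
    static uint64_t packu(uint64_t rs1, uint64_t rs2) { return (rs1 >> 32) | (rs2 & 0xFFFFFFFF00000000ull); }
    static uint64_t packh(uint64_t rs1, uint64_t rs2) { return (rs1 & 0xFF) | ((rs2 & 0xFF) << 8); }
    static uint64_t zext_h(uint64_t rs)               { return rs & 0xFFFF; }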
+
let Predicates = [HasStdExtZbp, IsRV32] in {
def : Pat<(or (or (and (shl GPR:$rs1, (i32 8)), (i32 0x00FF0000)),
(and GPR:$rs1, (i32 0xFF0000FF))),
@@ -908,156 +903,129 @@ def : Pat<(or (or (and (shl GPR:$rs1, (i64 1)), (i64 0x4444444444444444)),
(SHFLI GPR:$rs1, (i64 1))>;
} // Predicates = [HasStdExtZbp, IsRV64]
-let Predicates = [HasStdExtZbb, IsRV64] in {
-def : Pat<(and (add GPR:$rs, simm12:$simm12), (i64 0xFFFFFFFF)),
- (ADDIWU GPR:$rs, simm12:$simm12)>;
-def : Pat<(SLLIUWPat GPR:$rs1, uimmlog2xlen:$shamt),
- (SLLIUW GPR:$rs1, uimmlog2xlen:$shamt)>;
-def : Pat<(and (add GPR:$rs1, GPR:$rs2), (i64 0xFFFFFFFF)),
- (ADDWU GPR:$rs1, GPR:$rs2)>;
-def : Pat<(and (sub GPR:$rs1, GPR:$rs2), (i64 0xFFFFFFFF)),
- (SUBWU GPR:$rs1, GPR:$rs2)>;
-def : Pat<(add GPR:$rs1, (and GPR:$rs2, (i64 0xFFFFFFFF))),
+let Predicates = [HasStdExtZba] in {
+def : Pat<(add (shl GPR:$rs1, (XLenVT 1)), GPR:$rs2),
+ (SH1ADD GPR:$rs1, GPR:$rs2)>;
+def : Pat<(add (shl GPR:$rs1, (XLenVT 2)), GPR:$rs2),
+ (SH2ADD GPR:$rs1, GPR:$rs2)>;
+def : Pat<(add (shl GPR:$rs1, (XLenVT 3)), GPR:$rs2),
+ (SH3ADD GPR:$rs1, GPR:$rs2)>;
+} // Predicates = [HasStdExtZba]
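shNadd computes rs2 + (rs1 << N), i.e. base-plus-scaled-index addressing in one instruction; a typical use (illustrative):

    #include <stdint.h>

    /* The address of base[i] for 8-byte elements is base + (i << 3),
       which becomes a single sh3add once this pattern matches. */
    static int64_t load_elem(const int64_t *base, int64_t i) {
        return base[i];
    }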
+
+let Predicates = [HasStdExtZba, IsRV64] in {
+def : Pat<(SLLIUWPat GPR:$rs1, uimm5:$shamt),
+ (SLLIUW GPR:$rs1, uimm5:$shamt)>;
+def : Pat<(shl (and GPR:$rs1, 0xFFFFFFFF), uimm5:$shamt),
+ (SLLIUW GPR:$rs1, uimm5:$shamt)>;
+def : Pat<(add (and GPR:$rs1, (i64 0xFFFFFFFF)), GPR:$rs2),
(ADDUW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(sub GPR:$rs1, (and GPR:$rs2, (i64 0xFFFFFFFF))),
- (SUBUW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(xor (riscv_sllw (xor GPR:$rs1, -1), GPR:$rs2), -1),
+def : Pat<(and GPR:$rs, 0x00000000FFFFFFFF), (ADDUW GPR:$rs, X0)>;
+
+def : Pat<(add (shl (and GPR:$rs1, (i64 0xFFFFFFFF)), (XLenVT 1)), GPR:$rs2),
+ (SH1ADDUW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(add (shl (and GPR:$rs1, (i64 0xFFFFFFFF)), (XLenVT 2)), GPR:$rs2),
+ (SH2ADDUW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(add (shl (and GPR:$rs1, (i64 0xFFFFFFFF)), (XLenVT 3)), GPR:$rs2),
+ (SH3ADDUW GPR:$rs1, GPR:$rs2)>;
+
+def : Pat<(add (SLLIUWPat GPR:$rs1, (XLenVT 1)), GPR:$rs2),
+ (SH1ADDUW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(add (SLLIUWPat GPR:$rs1, (XLenVT 2)), GPR:$rs2),
+ (SH2ADDUW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(add (SLLIUWPat GPR:$rs1, (XLenVT 3)), GPR:$rs2),
+ (SH3ADDUW GPR:$rs1, GPR:$rs2)>;
+} // Predicates = [HasStdExtZba, IsRV64]
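The .uw variants zero-extend the low 32 bits of rs1 before shifting/adding, which is why the and-with-0xffffffff trees feed them and why zext.w itself is matched as add.uw with x0. Rough C equivalents (illustrative helper names, RV64 assumed):

    #include <stdint.h>

    static uint64_t add_uw(uint64_t rs1, uint64_t rs2) { return (uint64_t)(uint32_t)rs1 + rs2; }
    static uint64_t slli_uw(uint64_t rs1, unsigned sh) { return (uint64_t)(uint32_t)rs1 << sh; }
    static uint64_t sh2add_uw(uint64_t rs1, uint64_t rs2) {
        return ((uint64_t)(uint32_t)rs1 << 2) + rs2;   /* unsigned-index addressing */
    }
    /* zext.w x == add.uw x, x0, i.e. the (and x, 0xffffffff) pattern above. */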
+
+let Predicates = [HasStdExtZbp, IsRV64] in {
+def : Pat<(not (shiftopw<riscv_sllw> (not GPR:$rs1), GPR:$rs2)),
(SLOW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(xor (riscv_srlw (xor GPR:$rs1, -1), GPR:$rs2), -1),
+def : Pat<(not (shiftopw<riscv_srlw> (not GPR:$rs1), GPR:$rs2)),
(SROW GPR:$rs1, GPR:$rs2)>;
-} // Predicates = [HasStdExtZbb, IsRV64]
+} // Predicates = [HasStdExtZbp, IsRV64]
let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
-def : Pat<(or (riscv_sllw (assertsexti32 GPR:$rs1), (assertsexti32 GPR:$rs2)),
- (riscv_srlw (assertsexti32 GPR:$rs1),
- (sub (i64 0), (assertsexti32 GPR:$rs2)))),
+def : Pat<(riscv_rolw GPR:$rs1, GPR:$rs2),
(ROLW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(or (riscv_sllw (assertsexti32 GPR:$rs1),
- (sub (i64 0), (assertsexti32 GPR:$rs2))),
- (riscv_srlw (assertsexti32 GPR:$rs1), (assertsexti32 GPR:$rs2))),
+def : Pat<(riscv_rorw GPR:$rs1, GPR:$rs2),
(RORW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(riscv_rorw GPR:$rs1, uimm5:$rs2),
+ (RORIW GPR:$rs1, uimm5:$rs2)>;
+def : Pat<(riscv_rolw GPR:$rs1, uimm5:$rs2),
+ (RORIW GPR:$rs1, (ImmSubFrom32 uimm5:$rs2))>;
} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
let Predicates = [HasStdExtZbs, IsRV64] in {
-def : Pat<(and (xor (riscv_sllw 1, (assertsexti32 GPR:$rs2)), -1),
- (assertsexti32 GPR:$rs1)),
- (SBCLRW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(or (riscv_sllw 1, (assertsexti32 GPR:$rs2)),
- (assertsexti32 GPR:$rs1)),
- (SBSETW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(xor (riscv_sllw 1, (assertsexti32 GPR:$rs2)),
- (assertsexti32 GPR:$rs1)),
- (SBINVW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(and (riscv_srlw (assertsexti32 GPR:$rs1), (assertsexti32 GPR:$rs2)),
- 1),
- (SBEXTW GPR:$rs1, GPR:$rs2)>;
-} // Predicates = [HasStdExtZbs, IsRV64]
+def : Pat<(and (not (riscv_sllw 1, GPR:$rs2)), (assertsexti32 GPR:$rs1)),
+ (BCLRW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(sext_inreg (and (not (riscv_sllw 1, GPR:$rs2)), GPR:$rs1), i32),
+ (BCLRW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(or (riscv_sllw 1, GPR:$rs2), (assertsexti32 GPR:$rs1)),
+ (BSETW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(sext_inreg (or (riscv_sllw 1, GPR:$rs2), GPR:$rs1), i32),
+ (BSETW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(xor (riscv_sllw 1, GPR:$rs2), (assertsexti32 GPR:$rs1)),
+ (BINVW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(sext_inreg (xor (riscv_sllw 1, GPR:$rs2), GPR:$rs1), i32),
+ (BINVW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(and (riscv_srlw GPR:$rs1, GPR:$rs2), 1),
+ (BEXTW GPR:$rs1, GPR:$rs2)>;
+
+def : Pat<(riscv_sllw 1, GPR:$rs2),
+ (BSETW X0, GPR:$rs2)>;
+
+def : Pat<(and (assertsexti32 GPR:$rs1), BCLRWMask:$mask),
+ (BCLRIW GPR:$rs1, (BCLRXForm imm:$mask))>;
+def : Pat<(or (assertsexti32 GPR:$rs1), BSETINVWMask:$mask),
+ (BSETIW GPR:$rs1, (BSETINVXForm imm:$mask))>;
+def : Pat<(xor (assertsexti32 GPR:$rs1), BSETINVWMask:$mask),
+ (BINVIW GPR:$rs1, (BSETINVXForm imm:$mask))>;
-let Predicates = [HasStdExtZbb, IsRV64] in {
-def : Pat<(SLOIWPat GPR:$rs1, uimmlog2xlen:$shamt),
- (SLOIW GPR:$rs1, uimmlog2xlen:$shamt)>;
-def : Pat<(SROIWPat GPR:$rs1, uimmlog2xlen:$shamt),
- (SROIW GPR:$rs1, uimmlog2xlen:$shamt)>;
-} // Predicates = [HasStdExtZbb, IsRV64]
+} // Predicates = [HasStdExtZbs, IsRV64]
-let Predicates = [HasStdExtZbbOrZbp, IsRV64] in
-def : Pat<(RORIWPat GPR:$rs1, uimmlog2xlen:$shamt),
- (RORIW GPR:$rs1, uimmlog2xlen:$shamt)>;
+let Predicates = [HasStdExtZbp, IsRV64] in {
+def : Pat<(sext_inreg (SLOIPat GPR:$rs1, uimm5:$shamt), i32),
+ (SLOIW GPR:$rs1, uimm5:$shamt)>;
+def : Pat<(SROIWPat GPR:$rs1, uimm5:$shamt),
+ (SROIW GPR:$rs1, uimm5:$shamt)>;
+} // Predicates = [HasStdExtZbp, IsRV64]
let Predicates = [HasStdExtZbp, IsRV64] in {
-def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 1)), (i64 0x55555555)),
- GPR:$rs1),
- (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAA))),
- i32),
- (GORCIW GPR:$rs1, (i64 1))>;
-def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 2)), (i64 0x33333333)),
- GPR:$rs1),
- (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCC))),
- i32),
- (GORCIW GPR:$rs1, (i64 2))>;
-def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F)),
- GPR:$rs1),
- (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0))),
- i32),
- (GORCIW GPR:$rs1, (i64 4))>;
-def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF)),
- GPR:$rs1),
- (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00))),
- i32),
- (GORCIW GPR:$rs1, (i64 8))>;
-def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 16)), (i64 0x0000FFFF)),
- GPR:$rs1),
- (and (shl GPR:$rs1, (i64 16)), (i64 0xFFFF0000))),
- i32),
- (GORCIW GPR:$rs1, (i64 16))>;
-def : Pat<(sext_inreg (or (or (srl (and GPR:$rs1, (i64 0xFFFF0000)), (i64 16)),
- GPR:$rs1),
- (shl GPR:$rs1, (i64 16))), i32),
- (GORCIW GPR:$rs1, (i64 16))>;
-
-def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAA)),
- (and (srl GPR:$rs1, (i64 1)), (i64 0x55555555))),
- i32),
- (GREVIW GPR:$rs1, (i64 1))>;
-def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCC)),
- (and (srl GPR:$rs1, (i64 2)), (i64 0x33333333))),
- i32),
- (GREVIW GPR:$rs1, (i64 2))>;
-def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0)),
- (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F))),
- i32),
- (GREVIW GPR:$rs1, (i64 4))>;
-def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00)),
- (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF))),
- i32),
- (GREVIW GPR:$rs1, (i64 8))>;
-def : Pat<(sext_inreg (or (shl GPR:$rs1, (i64 16)),
- (srl (and GPR:$rs1, 0xFFFF0000), (i64 16))), i32),
- (GREVIW GPR:$rs1, (i64 16))>;
-def : Pat<(sra (bswap GPR:$rs1), (i64 32)), (GREVIW GPR:$rs1, (i64 24))>;
-def : Pat<(sra (bitreverse GPR:$rs1), (i64 32)), (GREVIW GPR:$rs1, (i64 31))>;
+def : Pat<(riscv_rorw (riscv_greviw GPR:$rs1, 24), (i64 16)), (GREVIW GPR:$rs1, 8)>;
+def : Pat<(riscv_rolw (riscv_greviw GPR:$rs1, 24), (i64 16)), (GREVIW GPR:$rs1, 8)>;
+def : Pat<(riscv_greviw GPR:$rs1, timm:$shamt), (GREVIW GPR:$rs1, timm:$shamt)>;
+def : Pat<(riscv_gorciw GPR:$rs1, timm:$shamt), (GORCIW GPR:$rs1, timm:$shamt)>;
} // Predicates = [HasStdExtZbp, IsRV64]
let Predicates = [HasStdExtZbt, IsRV64] in {
-def : Pat<(riscv_selectcc (and (assertsexti32 GPR:$rs3), 31),
- (i64 0),
- (i64 17),
- (assertsexti32 GPR:$rs1),
- (or (riscv_sllw (assertsexti32 GPR:$rs1),
- (and (assertsexti32 GPR:$rs3), 31)),
- (riscv_srlw (assertsexti32 GPR:$rs2),
- (sub (i64 32),
- (assertsexti32 GPR:$rs3))))),
+def : Pat<(riscv_fslw GPR:$rs1, GPR:$rs3, GPR:$rs2),
(FSLW GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
-def : Pat<(riscv_selectcc (and (assertsexti32 GPR:$rs3), 31),
- (i64 0),
- (i64 17),
- (assertsexti32 GPR:$rs2),
- (or (riscv_sllw (assertsexti32 GPR:$rs1),
- (sub (i64 32),
- (assertsexti32 GPR:$rs3))),
- (riscv_srlw (assertsexti32 GPR:$rs2),
- (and (assertsexti32 GPR:$rs3), 31)))),
+def : Pat<(riscv_fsrw GPR:$rs3, GPR:$rs1, GPR:$rs2),
(FSRW GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
-def : Pat<(FSRIWPat GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt),
- (FSRIW GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt)>;
+def : Pat<(riscv_fsrw GPR:$rs3, GPR:$rs1, uimm5:$shamt),
+ (FSRIW GPR:$rs1, GPR:$rs3, uimm5:$shamt)>;
+def : Pat<(riscv_fslw GPR:$rs3, GPR:$rs1, uimm5:$shamt),
+ (FSRIW GPR:$rs1, GPR:$rs3, (ImmSubFrom32 uimm5:$shamt))>;
} // Predicates = [HasStdExtZbt, IsRV64]
let Predicates = [HasStdExtZbb, IsRV64] in {
def : Pat<(add (ctlz (and GPR:$rs1, (i64 0xFFFFFFFF))), (i64 -32)),
(CLZW GPR:$rs1)>;
-// We don't pattern-match CTZW here as it has the same pattern and result as
-// RV64 CTZ
-def : Pat<(ctpop (and GPR:$rs1, (i64 0xFFFFFFFF))), (PCNTW GPR:$rs1)>;
+// computeKnownBits can't figure out that the and mask on the add result is
+// unnecessary, so we need to pattern-match it away.
+def : Pat<(and (add (ctlz (and GPR:$rs1, (i64 0xFFFFFFFF))), (i64 -32)),
+ (i64 0xFFFFFFFF)),
+ (CLZW GPR:$rs1)>;
+def : Pat<(cttz (or GPR:$rs1, (i64 0x100000000))),
+ (CTZW GPR:$rs1)>;
+def : Pat<(ctpop (and GPR:$rs1, (i64 0xFFFFFFFF))), (CPOPW GPR:$rs1)>;
} // Predicates = [HasStdExtZbb, IsRV64]
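The (cttz (or x, 1<<32)) form works because the injected bit caps the count at 32 when the low word is all zeroes, which is precisely ctzw; clzw and cpopw likewise act on the low 32 bits. A small C illustration using the GCC/Clang builtins (for exposition only):

    #include <stdint.h>

    /* ctzw: trailing zeros of the low 32 bits, 32 if they are all zero.
       OR-ing in bit 32 makes the 64-bit count give that directly. */
    static int ctzw(uint64_t x)  { return __builtin_ctzll(x | (1ull << 32)); }

    /* clzw: leading zeros of the low 32 bits; the "+ (i64 -32)" in the
       pattern is the same subtraction done here. */
    static int clzw(uint64_t x) {
        uint32_t lo = (uint32_t)x;
        return lo ? __builtin_clzll((uint64_t)lo) - 32 : 32;
    }

    static int cpopw(uint64_t x) { return __builtin_popcountll(x & 0xFFFFFFFFull); }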
-let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
-def : Pat<(sext_inreg (or (shl (assertsexti32 GPR:$rs2), (i64 16)),
- (and (assertsexti32 GPR:$rs1), 0x000000000000FFFF)),
+let Predicates = [HasStdExtZbp, IsRV64] in {
+def : Pat<(sext_inreg (or (shl GPR:$rs2, (i64 16)),
+ (and GPR:$rs1, 0x000000000000FFFF)),
i32),
(PACKW GPR:$rs1, GPR:$rs2)>;
def : Pat<(or (and (assertsexti32 GPR:$rs2), 0xFFFFFFFFFFFF0000),
- (srl (and (assertsexti32 GPR:$rs1), 0x00000000FFFF0000),
- (i64 16))),
+ (SRLIWPat GPR:$rs1, (i64 16))),
(PACKUW GPR:$rs1, GPR:$rs2)>;
-} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
+} // Predicates = [HasStdExtZbp, IsRV64]
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
index f68767847ade..30df455c1927 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
@@ -140,6 +140,7 @@ def uimm8_lsb000 : Operand<XLenVT>,
def simm9_lsb0 : Operand<OtherVT>,
ImmLeaf<XLenVT, [{return isShiftedInt<8, 1>(Imm);}]> {
let ParserMatchClass = SImmAsmOperand<9, "Lsb0">;
+ let PrintMethod = "printBranchOperand";
let EncoderMethod = "getImmOpValueAsr1";
let DecoderMethod = "decodeSImmOperandAndLsl1<9>";
let MCOperandPredicate = [{
@@ -149,6 +150,7 @@ def simm9_lsb0 : Operand<OtherVT>,
return MCOp.isBareSymbolRef();
}];
+ let OperandType = "OPERAND_PCREL";
}
// A 9-bit unsigned immediate where the least significant three bits are zero.
@@ -200,6 +202,7 @@ def simm10_lsb0000nonzero : Operand<XLenVT>,
def simm12_lsb0 : Operand<XLenVT>,
ImmLeaf<XLenVT, [{return isShiftedInt<11, 1>(Imm);}]> {
let ParserMatchClass = SImmAsmOperand<12, "Lsb0">;
+ let PrintMethod = "printBranchOperand";
let EncoderMethod = "getImmOpValueAsr1";
let DecoderMethod = "decodeSImmOperandAndLsl1<12>";
let MCOperandPredicate = [{
@@ -208,6 +211,7 @@ def simm12_lsb0 : Operand<XLenVT>,
return isShiftedInt<11, 1>(Imm);
return MCOp.isBareSymbolRef();
}];
+ let OperandType = "OPERAND_PCREL";
}
//===----------------------------------------------------------------------===//
@@ -239,7 +243,7 @@ class CStore_rri<bits<3> funct3, string OpcodeStr,
OpcodeStr, "$rs2, ${imm}(${rs1})">;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class Bcz<bits<3> funct3, string OpcodeStr, PatFrag CondOp,
+class Bcz<bits<3> funct3, string OpcodeStr,
RegisterClass cls>
: RVInst16CB<funct3, 0b01, (outs), (ins cls:$rs1, simm9_lsb0:$imm),
OpcodeStr, "$rs1, $imm"> {
@@ -469,8 +473,8 @@ def C_J : RVInst16CJ<0b101, 0b01, (outs), (ins simm12_lsb0:$offset),
let isBarrier=1;
}
-def C_BEQZ : Bcz<0b110, "c.beqz", seteq, GPRC>, Sched<[WriteJmp]>;
-def C_BNEZ : Bcz<0b111, "c.bnez", setne, GPRC>, Sched<[WriteJmp]>;
+def C_BEQZ : Bcz<0b110, "c.beqz", GPRC>, Sched<[WriteJmp]>;
+def C_BNEZ : Bcz<0b111, "c.bnez", GPRC>, Sched<[WriteJmp]>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_SLLI : RVInst16CI<0b000, 0b10, (outs GPRNoX0:$rd_wb),
@@ -519,7 +523,8 @@ def C_JR : RVInst16CR<0b1000, 0b10, (outs), (ins GPRNoX0:$rs1),
let rs2 = 0;
}
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isMoveReg = 1,
+ isAsCheapAsAMove = 1 in
def C_MV : RVInst16CR<0b1000, 0b10, (outs GPRNoX0:$rs1), (ins GPRNoX0:$rs2),
"c.mv", "$rs1, $rs2">,
Sched<[WriteIALU, ReadIALU]>;
@@ -744,6 +749,7 @@ class CompressPat<dag input, dag output> {
dag Input = input;
dag Output = output;
list<Predicate> Predicates = [];
+ bit isCompressOnly = false;
}
// Patterns are defined in the same order the compressed instructions appear
@@ -829,25 +835,30 @@ def : CompressPat<(SUB GPRC:$rs1, GPRC:$rs1, GPRC:$rs2),
(C_SUB GPRC:$rs1, GPRC:$rs2)>;
def : CompressPat<(XOR GPRC:$rs1, GPRC:$rs1, GPRC:$rs2),
(C_XOR GPRC:$rs1, GPRC:$rs2)>;
+let isCompressOnly = true in
def : CompressPat<(XOR GPRC:$rs1, GPRC:$rs2, GPRC:$rs1),
(C_XOR GPRC:$rs1, GPRC:$rs2)>;
def : CompressPat<(OR GPRC:$rs1, GPRC:$rs1, GPRC:$rs2),
(C_OR GPRC:$rs1, GPRC:$rs2)>;
+let isCompressOnly = true in
def : CompressPat<(OR GPRC:$rs1, GPRC:$rs2, GPRC:$rs1),
(C_OR GPRC:$rs1, GPRC:$rs2)>;
def : CompressPat<(AND GPRC:$rs1, GPRC:$rs1, GPRC:$rs2),
(C_AND GPRC:$rs1, GPRC:$rs2)>;
+let isCompressOnly = true in
def : CompressPat<(AND GPRC:$rs1, GPRC:$rs2, GPRC:$rs1),
(C_AND GPRC:$rs1, GPRC:$rs2)>;
} // Predicates = [HasStdExtC]
let Predicates = [HasStdExtC, IsRV64] in {
+let isCompressOnly = true in
def : CompressPat<(ADDIW GPRNoX0:$rd, X0, simm6:$imm),
(C_LI GPRNoX0:$rd, simm6:$imm)>;
def : CompressPat<(SUBW GPRC:$rs1, GPRC:$rs1, GPRC:$rs2),
(C_SUBW GPRC:$rs1, GPRC:$rs2)>;
def : CompressPat<(ADDW GPRC:$rs1, GPRC:$rs1, GPRC:$rs2),
(C_ADDW GPRC:$rs1, GPRC:$rs2)>;
+let isCompressOnly = true in
def : CompressPat<(ADDW GPRC:$rs1, GPRC:$rs2, GPRC:$rs1),
(C_ADDW GPRC:$rs1, GPRC:$rs2)>;
} // Predicates = [HasStdExtC, IsRV64]
@@ -890,10 +901,12 @@ def : CompressPat<(LD GPRNoX0:$rd, SP:$rs1, uimm9_lsb000:$imm),
let Predicates = [HasStdExtC] in {
def : CompressPat<(JALR X0, GPRNoX0:$rs1, 0),
(C_JR GPRNoX0:$rs1)>;
+let isCompressOnly = true in {
def : CompressPat<(ADD GPRNoX0:$rs1, X0, GPRNoX0:$rs2),
(C_MV GPRNoX0:$rs1, GPRNoX0:$rs2)>;
def : CompressPat<(ADD GPRNoX0:$rs1, GPRNoX0:$rs2, X0),
(C_MV GPRNoX0:$rs1, GPRNoX0:$rs2)>;
+}
def : CompressPat<(ADDI GPRNoX0:$rs1, GPRNoX0:$rs2, 0),
(C_MV GPRNoX0:$rs1, GPRNoX0:$rs2)>;
def : CompressPat<(EBREAK), (C_EBREAK)>;
@@ -902,6 +915,7 @@ def : CompressPat<(JALR X1, GPRNoX0:$rs1, 0),
(C_JALR GPRNoX0:$rs1)>;
def : CompressPat<(ADD GPRNoX0:$rs1, GPRNoX0:$rs1, GPRNoX0:$rs2),
(C_ADD GPRNoX0:$rs1, GPRNoX0:$rs2)>;
+let isCompressOnly = true in
def : CompressPat<(ADD GPRNoX0:$rs1, GPRNoX0:$rs2, GPRNoX0:$rs1),
(C_ADD GPRNoX0:$rs1, GPRNoX0:$rs2)>;
} // Predicates = [HasStdExtC]
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
index 6c36f53cd563..133599e13b8b 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
@@ -299,23 +299,6 @@ def : PatFpr64Fpr64<setolt, FLT_D>;
def : PatFpr64Fpr64<setle, FLE_D>;
def : PatFpr64Fpr64<setole, FLE_D>;
-// Define pattern expansions for setcc operations which aren't directly
-// handled by a RISC-V instruction and aren't expanded in the SelectionDAG
-// Legalizer.
-
-def : Pat<(seto FPR64:$rs1, FPR64:$rs2),
- (AND (FEQ_D FPR64:$rs1, FPR64:$rs1),
- (FEQ_D FPR64:$rs2, FPR64:$rs2))>;
-def : Pat<(seto FPR64:$rs1, FPR64:$rs1),
- (FEQ_D $rs1, $rs1)>;
-
-def : Pat<(setuo FPR64:$rs1, FPR64:$rs2),
- (SLTIU (AND (FEQ_D FPR64:$rs1, FPR64:$rs1),
- (FEQ_D FPR64:$rs2, FPR64:$rs2)),
- 1)>;
-def : Pat<(setuo FPR64:$rs1, FPR64:$rs1),
- (SLTIU (FEQ_D $rs1, $rs1), 1)>;
-
def Select_FPR64_Using_CC_GPR : SelectCC_rrirr<FPR64, GPR>;
/// Loads
@@ -361,6 +344,7 @@ let Predicates = [HasStdExtD, IsRV64] in {
/// Float constants
def : Pat<(f64 (fpimm0)), (FMV_D_X X0)>;
+// Moves (no conversion)
def : Pat<(bitconvert GPR:$rs1), (FMV_D_X GPR:$rs1)>;
def : Pat<(bitconvert FPR64:$rs1), (FMV_X_D FPR64:$rs1)>;
@@ -368,11 +352,11 @@ def : Pat<(bitconvert FPR64:$rs1), (FMV_X_D FPR64:$rs1)>;
// because fpto[u|s]i produce poison if the value can't fit into the target.
// We match the single case below because fcvt.wu.d sign-extends its result so
// is cheaper than fcvt.lu.d+sext.w.
-def : Pat<(sext_inreg (zexti32 (fp_to_uint FPR64:$rs1)), i32),
+def : Pat<(sext_inreg (assertzexti32 (fp_to_uint FPR64:$rs1)), i32),
(FCVT_WU_D $rs1, 0b001)>;
// [u]int32->fp
-def : Pat<(sint_to_fp (sext_inreg GPR:$rs1, i32)), (FCVT_D_W $rs1)>;
+def : Pat<(sint_to_fp (sexti32 GPR:$rs1)), (FCVT_D_W $rs1)>;
def : Pat<(uint_to_fp (zexti32 GPR:$rs1)), (FCVT_D_WU $rs1)>;
def : Pat<(fp_to_sint FPR64:$rs1), (FCVT_L_D FPR64:$rs1, 0b001)>;
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
index ce5c3abb6a06..4529949f693e 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
@@ -303,10 +303,6 @@ def : Pat<(f32 (fpimm0)), (FMV_W_X X0)>;
/// Float conversion operations
-// Moves (no conversion)
-def : Pat<(bitconvert GPR:$rs1), (FMV_W_X GPR:$rs1)>;
-def : Pat<(bitconvert FPR32:$rs1), (FMV_X_W FPR32:$rs1)>;
-
// [u]int32<->float conversion patterns must be gated on IsRV32 or IsRV64, so
// are defined later.
@@ -359,23 +355,6 @@ def : PatFpr32Fpr32<setolt, FLT_S>;
def : PatFpr32Fpr32<setle, FLE_S>;
def : PatFpr32Fpr32<setole, FLE_S>;
-// Define pattern expansions for setcc operations which aren't directly
-// handled by a RISC-V instruction and aren't expanded in the SelectionDAG
-// Legalizer.
-
-def : Pat<(seto FPR32:$rs1, FPR32:$rs2),
- (AND (FEQ_S FPR32:$rs1, FPR32:$rs1),
- (FEQ_S FPR32:$rs2, FPR32:$rs2))>;
-def : Pat<(seto FPR32:$rs1, FPR32:$rs1),
- (FEQ_S $rs1, $rs1)>;
-
-def : Pat<(setuo FPR32:$rs1, FPR32:$rs2),
- (SLTIU (AND (FEQ_S FPR32:$rs1, FPR32:$rs1),
- (FEQ_S FPR32:$rs2, FPR32:$rs2)),
- 1)>;
-def : Pat<(setuo FPR32:$rs1, FPR32:$rs1),
- (SLTIU (FEQ_S $rs1, $rs1), 1)>;
-
def Select_FPR32_Using_CC_GPR : SelectCC_rrirr<FPR32, GPR>;
/// Loads
@@ -389,6 +368,10 @@ defm : StPat<store, FSW, FPR32>;
} // Predicates = [HasStdExtF]
let Predicates = [HasStdExtF, IsRV32] in {
+// Moves (no conversion)
+def : Pat<(bitconvert GPR:$rs1), (FMV_W_X GPR:$rs1)>;
+def : Pat<(bitconvert FPR32:$rs1), (FMV_X_W FPR32:$rs1)>;
+
// float->[u]int. Round-to-zero must be used.
def : Pat<(fp_to_sint FPR32:$rs1), (FCVT_W_S $rs1, 0b001)>;
def : Pat<(fp_to_uint FPR32:$rs1), (FCVT_WU_S $rs1, 0b001)>;
@@ -399,9 +382,10 @@ def : Pat<(uint_to_fp GPR:$rs1), (FCVT_S_WU $rs1, 0b111)>;
} // Predicates = [HasStdExtF, IsRV32]
let Predicates = [HasStdExtF, IsRV64] in {
+// Moves (no conversion)
def : Pat<(riscv_fmv_w_x_rv64 GPR:$src), (FMV_W_X GPR:$src)>;
def : Pat<(riscv_fmv_x_anyextw_rv64 FPR32:$src), (FMV_X_W FPR32:$src)>;
-def : Pat<(sexti32 (riscv_fmv_x_anyextw_rv64 FPR32:$src)),
+def : Pat<(sext_inreg (riscv_fmv_x_anyextw_rv64 FPR32:$src), i32),
(FMV_X_W FPR32:$src)>;
// FP->[u]int32 is mostly handled by the FP->[u]int64 patterns. This is safe
@@ -416,7 +400,7 @@ def : Pat<(fp_to_sint FPR32:$rs1), (FCVT_L_S $rs1, 0b001)>;
def : Pat<(fp_to_uint FPR32:$rs1), (FCVT_LU_S $rs1, 0b001)>;
// [u]int->fp. Match GCC and default to using dynamic rounding mode.
-def : Pat<(sint_to_fp (sext_inreg GPR:$rs1, i32)), (FCVT_S_W $rs1, 0b111)>;
+def : Pat<(sint_to_fp (sexti32 GPR:$rs1)), (FCVT_S_W $rs1, 0b111)>;
def : Pat<(uint_to_fp (zexti32 GPR:$rs1)), (FCVT_S_WU $rs1, 0b111)>;
def : Pat<(sint_to_fp GPR:$rs1), (FCVT_S_L $rs1, 0b111)>;
def : Pat<(uint_to_fp GPR:$rs1), (FCVT_S_LU $rs1, 0b111)>;
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoM.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
index 987534aadd79..8cfb903a173c 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
@@ -81,9 +81,11 @@ def : PatGprGpr<riscv_remuw, REMUW>;
// Handle the specific cases where using DIVU/REMU would be correct and result
// in fewer instructions than emitting DIVUW/REMUW then zero-extending the
// result.
-def : Pat<(zexti32 (riscv_divuw (zexti32 GPR:$rs1), (zexti32 GPR:$rs2))),
+def : Pat<(and (riscv_divuw (assertzexti32 GPR:$rs1),
+ (assertzexti32 GPR:$rs2)), 0xffffffff),
(DIVU GPR:$rs1, GPR:$rs2)>;
-def : Pat<(zexti32 (riscv_remuw (zexti32 GPR:$rs1), (zexti32 GPR:$rs2))),
+def : Pat<(and (riscv_remuw (assertzexti32 GPR:$rs1),
+ (assertzexti32 GPR:$rs2)), 0xffffffff),
(REMU GPR:$rs1, GPR:$rs2)>;
// Although the sexti32 operands may not have originated from an i32 srem,
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
index 1c7f53fecb8c..4f9e9cfbdb98 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
///
/// This file describes the RISC-V instructions from the standard 'V' Vector
-/// extension, version 0.8.
+/// extension, version 0.9.
/// This version is still experimental as the 'V' extension hasn't been
/// ratified yet.
///
@@ -31,18 +31,6 @@ def VTypeIOp : Operand<XLenVT> {
let DecoderMethod = "decodeUImmOperand<11>";
}
-def VRegAsmOperand : AsmOperandClass {
- let Name = "RVVRegOpOperand";
- let RenderMethod = "addRegOperands";
- let PredicateMethod = "isReg";
- let ParserMethod = "parseRegister";
-}
-
-def VRegOp : RegisterOperand<VR> {
- let ParserMatchClass = VRegAsmOperand;
- let PrintMethod = "printOperand";
-}
-
def VMaskAsmOperand : AsmOperandClass {
let Name = "RVVMaskRegOpOperand";
let RenderMethod = "addRegOperands";
@@ -74,14 +62,13 @@ def simm5 : Operand<XLenVT>, ImmLeaf<XLenVT, [{return isInt<5>(Imm);}]> {
def SImm5Plus1AsmOperand : AsmOperandClass {
let Name = "SImm5Plus1";
- let RenderMethod = "addSImm5Plus1Operands";
+ let RenderMethod = "addImmOperands";
let DiagnosticType = "InvalidSImm5Plus1";
}
def simm5_plus1 : Operand<XLenVT>, ImmLeaf<XLenVT,
[{return isInt<5>(Imm - 1);}]> {
let ParserMatchClass = SImm5Plus1AsmOperand;
- let PrintMethod = "printSImm5Plus1";
let MCOperandPredicate = [{
int64_t Imm;
if (MCOp.evaluateAsConstantImm(Imm))
@@ -96,161 +83,229 @@ def simm5_plus1 : Operand<XLenVT>, ImmLeaf<XLenVT,
let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
// load vd, (rs1), vm
-class VUnitStrideLoad<RISCVMOP mop, RISCVLSUMOP lumop, RISCVWidth width,
- string opcodestr>
- : RVInstVLU<0b000, mop, lumop, width, (outs VRegOp:$vd),
+class VUnitStrideLoad<RISCVLSUMOP lumop, RISCVWidth width,
+ string opcodestr>
+ : RVInstVLU<0b000, width.Value{3}, lumop, width.Value{2-0},
+ (outs VR:$vd),
(ins GPR:$rs1, VMaskOp:$vm), opcodestr, "$vd, (${rs1})$vm">;
// load vd, (rs1), rs2, vm
-class VStridedLoad<RISCVMOP mop, RISCVWidth width, string opcodestr>
- : RVInstVLS<0b000, mop, width, (outs VRegOp:$vd),
+class VStridedLoad<RISCVWidth width, string opcodestr>
+ : RVInstVLS<0b000, width.Value{3}, width.Value{2-0},
+ (outs VR:$vd),
(ins GPR:$rs1, GPR:$rs2, VMaskOp:$vm), opcodestr,
"$vd, (${rs1}), $rs2$vm">;
// load vd, (rs1), vs2, vm
class VIndexedLoad<RISCVMOP mop, RISCVWidth width, string opcodestr>
- : RVInstVLX<0b000, mop, width, (outs VRegOp:$vd),
- (ins GPR:$rs1, VRegOp:$vs2, VMaskOp:$vm), opcodestr,
+ : RVInstVLX<0b000, width.Value{3}, mop, width.Value{2-0},
+ (outs VR:$vd),
+ (ins GPR:$rs1, VR:$vs2, VMaskOp:$vm), opcodestr,
"$vd, (${rs1}), $vs2$vm">;
// vl<nf>r.v vd, (rs1)
-class VWholeLoad<bits<3> nf, string opcodestr>
- : RVInstVLU<nf, MOPLDUnitStrideU, LUMOPUnitStrideWholeReg,
- LSWidthVSEW, (outs VRegOp:$vd), (ins GPR:$rs1),
+class VWholeLoad<bits<3> nf, RISCVWidth width, string opcodestr>
+ : RVInstVLU<nf, width.Value{3}, LUMOPUnitStrideWholeReg,
+ width.Value{2-0}, (outs VR:$vd), (ins GPR:$rs1),
opcodestr, "$vd, (${rs1})"> {
let vm = 1;
let Uses = [];
+ let RVVConstraint = NoConstraint;
}
+
+// segment load vd, (rs1), vm
+class VUnitStrideSegmentLoad<bits<3> nf, RISCVLSUMOP lumop,
+ RISCVWidth width, string opcodestr>
+ : RVInstVLU<nf, width.Value{3}, lumop, width.Value{2-0},
+ (outs VR:$vd),
+ (ins GPR:$rs1, VMaskOp:$vm), opcodestr, "$vd, (${rs1})$vm">;
+
+// segment load vd, (rs1), rs2, vm
+class VStridedSegmentLoad<bits<3> nf, RISCVWidth width, string opcodestr>
+ : RVInstVLS<nf, width.Value{3}, width.Value{2-0},
+ (outs VR:$vd),
+ (ins GPR:$rs1, GPR:$rs2, VMaskOp:$vm), opcodestr,
+ "$vd, (${rs1}), $rs2$vm">;
+
+// segment load vd, (rs1), vs2, vm
+class VIndexedSegmentLoad<bits<3> nf, RISCVMOP mop, RISCVWidth width,
+ string opcodestr>
+ : RVInstVLX<nf, width.Value{3}, mop, width.Value{2-0},
+ (outs VR:$vd),
+ (ins GPR:$rs1, VR:$vs2, VMaskOp:$vm), opcodestr,
+ "$vd, (${rs1}), $vs2$vm">;
} // hasSideEffects = 0, mayLoad = 1, mayStore = 0
let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
// store vd, vs3, (rs1), vm
-class VUnitStrideStore<RISCVMOP mop, RISCVLSUMOP sumop, RISCVWidth width,
+class VUnitStrideStore<RISCVLSUMOP sumop, RISCVWidth width,
string opcodestr>
- : RVInstVSU<0b000, mop, sumop, width, (outs),
- (ins VRegOp:$vs3, GPR:$rs1, VMaskOp:$vm), opcodestr,
+ : RVInstVSU<0b000, width.Value{3}, sumop, width.Value{2-0},
+ (outs), (ins VR:$vs3, GPR:$rs1, VMaskOp:$vm), opcodestr,
"$vs3, (${rs1})$vm">;
// store vd, vs3, (rs1), rs2, vm
-class VStridedStore<RISCVMOP mop, RISCVWidth width, string opcodestr>
- : RVInstVSS<0b000, mop, width, (outs),
- (ins VRegOp:$vs3, GPR:$rs1, GPR:$rs2, VMaskOp:$vm),
+class VStridedStore<RISCVWidth width, string opcodestr>
+ : RVInstVSS<0b000, width.Value{3}, width.Value{2-0}, (outs),
+ (ins VR:$vs3, GPR:$rs1, GPR:$rs2, VMaskOp:$vm),
opcodestr, "$vs3, (${rs1}), $rs2$vm">;
// store vd, vs3, (rs1), vs2, vm
class VIndexedStore<RISCVMOP mop, RISCVWidth width, string opcodestr>
- : RVInstVSX<0b000, mop, width, (outs),
- (ins VRegOp:$vs3, GPR:$rs1, VRegOp:$vs2, VMaskOp:$vm),
+ : RVInstVSX<0b000, width.Value{3}, mop, width.Value{2-0}, (outs),
+ (ins VR:$vs3, GPR:$rs1, VR:$vs2, VMaskOp:$vm),
opcodestr, "$vs3, (${rs1}), $vs2$vm">;
// vs<nf>r.v vd, (rs1)
class VWholeStore<bits<3> nf, string opcodestr>
- : RVInstVSU<nf, MOPSTUnitStride, SUMOPUnitStrideWholeReg,
- LSWidthVSEW, (outs), (ins VRegOp:$vs3, GPR:$rs1),
+ : RVInstVSU<nf, 0, SUMOPUnitStrideWholeReg,
+ 0b000, (outs), (ins VR:$vs3, GPR:$rs1),
opcodestr, "$vs3, (${rs1})"> {
let vm = 1;
let Uses = [];
}
+
+// segment store vd, vs3, (rs1), vm
+class VUnitStrideSegmentStore<bits<3> nf, RISCVWidth width, string opcodestr>
+ : RVInstVSU<nf, width.Value{3}, SUMOPUnitStride, width.Value{2-0},
+ (outs), (ins VR:$vs3, GPR:$rs1, VMaskOp:$vm), opcodestr,
+ "$vs3, (${rs1})$vm">;
+
+// segment store vd, vs3, (rs1), rs2, vm
+class VStridedSegmentStore<bits<3> nf, RISCVWidth width, string opcodestr>
+ : RVInstVSS<nf, width.Value{3}, width.Value{2-0}, (outs),
+ (ins VR:$vs3, GPR:$rs1, GPR:$rs2, VMaskOp:$vm),
+ opcodestr, "$vs3, (${rs1}), $rs2$vm">;
+
+// segment store vd, vs3, (rs1), vs2, vm
+class VIndexedSegmentStore<bits<3> nf, RISCVMOP mop, RISCVWidth width,
+ string opcodestr>
+ : RVInstVSX<nf, width.Value{3}, mop, width.Value{2-0}, (outs),
+ (ins VR:$vs3, GPR:$rs1, VR:$vs2, VMaskOp:$vm),
+ opcodestr, "$vs3, (${rs1}), $vs2$vm">;
} // hasSideEffects = 0, mayLoad = 0, mayStore = 1
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
// op vd, vs2, vs1, vm
class VALUVV<bits<6> funct6, RISCVVFormat opv, string opcodestr>
- : RVInstVV<funct6, opv, (outs VRegOp:$vd),
- (ins VRegOp:$vs2, VRegOp:$vs1, VMaskOp:$vm),
+ : RVInstVV<funct6, opv, (outs VR:$vd),
+ (ins VR:$vs2, VR:$vs1, VMaskOp:$vm),
opcodestr, "$vd, $vs2, $vs1$vm">;
// op vd, vs2, vs1, v0 (without mask, use v0 as carry input)
class VALUmVV<bits<6> funct6, RISCVVFormat opv, string opcodestr>
- : RVInstVV<funct6, opv, (outs VRegOp:$vd),
- (ins VRegOp:$vs2, VRegOp:$vs1, VMV0:$v0),
+ : RVInstVV<funct6, opv, (outs VR:$vd),
+ (ins VR:$vs2, VR:$vs1, VMV0:$v0),
opcodestr, "$vd, $vs2, $vs1, v0"> {
let vm = 0;
}
// op vd, vs1, vs2, vm (reverse the order of vs1 and vs2)
class VALUrVV<bits<6> funct6, RISCVVFormat opv, string opcodestr>
- : RVInstVV<funct6, opv, (outs VRegOp:$vd),
- (ins VRegOp:$vs1, VRegOp:$vs2, VMaskOp:$vm),
+ : RVInstVV<funct6, opv, (outs VR:$vd),
+ (ins VR:$vs1, VR:$vs2, VMaskOp:$vm),
opcodestr, "$vd, $vs1, $vs2$vm">;
-// op vd, vs1, vs2
+// op vd, vs2, vs1
class VALUVVNoVm<bits<6> funct6, RISCVVFormat opv, string opcodestr>
- : RVInstVV<funct6, opv, (outs VRegOp:$vd),
- (ins VRegOp:$vs2, VRegOp:$vs1),
+ : RVInstVV<funct6, opv, (outs VR:$vd),
+ (ins VR:$vs2, VR:$vs1),
opcodestr, "$vd, $vs2, $vs1"> {
let vm = 1;
}
// op vd, vs2, rs1, vm
class VALUVX<bits<6> funct6, RISCVVFormat opv, string opcodestr>
- : RVInstVX<funct6, opv, (outs VRegOp:$vd),
- (ins VRegOp:$vs2, GPR:$rs1, VMaskOp:$vm),
+ : RVInstVX<funct6, opv, (outs VR:$vd),
+ (ins VR:$vs2, GPR:$rs1, VMaskOp:$vm),
opcodestr, "$vd, $vs2, $rs1$vm">;
// op vd, vs2, rs1, v0 (without mask, use v0 as carry input)
class VALUmVX<bits<6> funct6, RISCVVFormat opv, string opcodestr>
- : RVInstVX<funct6, opv, (outs VRegOp:$vd),
- (ins VRegOp:$vs2, GPR:$rs1, VMV0:$v0),
+ : RVInstVX<funct6, opv, (outs VR:$vd),
+ (ins VR:$vs2, GPR:$rs1, VMV0:$v0),
opcodestr, "$vd, $vs2, $rs1, v0"> {
let vm = 0;
}
// op vd, rs1, vs2, vm (reverse the order of rs1 and vs2)
class VALUrVX<bits<6> funct6, RISCVVFormat opv, string opcodestr>
- : RVInstVX<funct6, opv, (outs VRegOp:$vd),
- (ins GPR:$rs1, VRegOp:$vs2, VMaskOp:$vm),
+ : RVInstVX<funct6, opv, (outs VR:$vd),
+ (ins GPR:$rs1, VR:$vs2, VMaskOp:$vm),
opcodestr, "$vd, $rs1, $vs2$vm">;
// op vd, vs2, rs1
class VALUVXNoVm<bits<6> funct6, RISCVVFormat opv, string opcodestr>
- : RVInstVX<funct6, opv, (outs VRegOp:$vd),
- (ins VRegOp:$vs2, GPR:$rs1),
+ : RVInstVX<funct6, opv, (outs VR:$vd),
+ (ins VR:$vs2, GPR:$rs1),
opcodestr, "$vd, $vs2, $rs1"> {
let vm = 1;
}
// op vd, vs2, imm, vm
class VALUVI<bits<6> funct6, string opcodestr, Operand optype = simm5>
- : RVInstIVI<funct6, (outs VRegOp:$vd),
- (ins VRegOp:$vs2, optype:$imm, VMaskOp:$vm),
+ : RVInstIVI<funct6, (outs VR:$vd),
+ (ins VR:$vs2, optype:$imm, VMaskOp:$vm),
opcodestr, "$vd, $vs2, $imm$vm">;
// op vd, vs2, imm, v0 (without mask, use v0 as carry input)
class VALUmVI<bits<6> funct6, string opcodestr, Operand optype = simm5>
- : RVInstIVI<funct6, (outs VRegOp:$vd),
- (ins VRegOp:$vs2, optype:$imm, VMV0:$v0),
+ : RVInstIVI<funct6, (outs VR:$vd),
+ (ins VR:$vs2, optype:$imm, VMV0:$v0),
opcodestr, "$vd, $vs2, $imm, v0"> {
let vm = 0;
}
// op vd, vs2, imm, vm
class VALUVINoVm<bits<6> funct6, string opcodestr, Operand optype = simm5>
- : RVInstIVI<funct6, (outs VRegOp:$vd),
- (ins VRegOp:$vs2, optype:$imm),
+ : RVInstIVI<funct6, (outs VR:$vd),
+ (ins VR:$vs2, optype:$imm),
opcodestr, "$vd, $vs2, $imm"> {
let vm = 1;
}
// op vd, vs2, rs1, vm (Float)
class VALUVF<bits<6> funct6, RISCVVFormat opv, string opcodestr>
- : RVInstVX<funct6, opv, (outs VRegOp:$vd),
- (ins VRegOp:$vs2, FPR32:$rs1, VMaskOp:$vm),
+ : RVInstVX<funct6, opv, (outs VR:$vd),
+ (ins VR:$vs2, FPR32:$rs1, VMaskOp:$vm),
opcodestr, "$vd, $vs2, $rs1$vm">;
// op vd, rs1, vs2, vm (Float) (with mask, reverse the order of rs1 and vs2)
class VALUrVF<bits<6> funct6, RISCVVFormat opv, string opcodestr>
- : RVInstVX<funct6, opv, (outs VRegOp:$vd),
- (ins FPR32:$rs1, VRegOp:$vs2, VMaskOp:$vm),
+ : RVInstVX<funct6, opv, (outs VR:$vd),
+ (ins FPR32:$rs1, VR:$vs2, VMaskOp:$vm),
opcodestr, "$vd, $rs1, $vs2$vm">;
// op vd, vs2, vm (use vs1 as instruction encoding)
class VALUVs2<bits<6> funct6, bits<5> vs1, RISCVVFormat opv, string opcodestr>
- : RVInstV<funct6, vs1, opv, (outs VRegOp:$vd),
- (ins VRegOp:$vs2, VMaskOp:$vm),
+ : RVInstV<funct6, vs1, opv, (outs VR:$vd),
+ (ins VR:$vs2, VMaskOp:$vm),
opcodestr, "$vd, $vs2$vm">;
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
+let hasSideEffects = 0, mayLoad = 1, mayStore = 1 in {
+// vamo vd, (rs1), vs2, vd, vm
+class VAMOWd<RISCVAMOOP amoop, RISCVWidth width, string opcodestr>
+ : RVInstVAMO<amoop, width.Value{2-0}, (outs VR:$vd_wd),
+ (ins GPR:$rs1, VR:$vs2, VR:$vd, VMaskOp:$vm),
+ opcodestr, "$vd_wd, (${rs1}), $vs2, $vd$vm"> {
+ let Constraints = "$vd_wd = $vd";
+ let wd = 1;
+ bits<5> vd;
+ let Inst{11-7} = vd;
+}
+
+// vamo x0, (rs1), vs2, vs3, vm
+class VAMONoWd<RISCVAMOOP amoop, RISCVWidth width, string opcodestr>
+ : RVInstVAMO<amoop, width.Value{2-0}, (outs),
+ (ins GPR:$rs1, VR:$vs2, VR:$vs3, VMaskOp:$vm),
+ opcodestr, "x0, (${rs1}), $vs2, $vs3$vm"> {
+ bits<5> vs3;
+ let Inst{11-7} = vs3;
+}
+
+} // hasSideEffects = 0, mayLoad = 1, mayStore = 1
+
//===----------------------------------------------------------------------===//
// Combination of instruction classes.
// Use these multiclasses to define instructions more easily.
@@ -358,6 +413,22 @@ multiclass VALU_FV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>;
}
+multiclass VAMO<RISCVAMOOP amoop, RISCVWidth width, string opcodestr> {
+ def _WD : VAMOWd<amoop, width, opcodestr>;
+ def _UNWD : VAMONoWd<amoop, width, opcodestr>;
+}
+
+multiclass VWholeLoad<bits<3> nf, string opcodestr> {
+ def E8_V : VWholeLoad<nf, LSWidth8, opcodestr # "e8.v">;
+ def E16_V : VWholeLoad<nf, LSWidth16, opcodestr # "e16.v">;
+ def E32_V : VWholeLoad<nf, LSWidth32, opcodestr # "e32.v">;
+ def E64_V : VWholeLoad<nf, LSWidth64, opcodestr # "e64.v">;
+ def E128_V : VWholeLoad<nf, LSWidth128, opcodestr # "e128.v">;
+ def E256_V : VWholeLoad<nf, LSWidth256, opcodestr # "e256.v">;
+ def E512_V : VWholeLoad<nf, LSWidth512, opcodestr # "e512.v">;
+ def E1024_V : VWholeLoad<nf, LSWidth1024, opcodestr # "e1024.v">;
+}
+
//===----------------------------------------------------------------------===//
// Instructions
//===----------------------------------------------------------------------===//
@@ -372,77 +443,94 @@ def VSETVL : RVInstSetVL<(outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2),
} // hasSideEffects = 1, mayLoad = 0, mayStore = 0
// Vector Unit-Stride Instructions
-def VLB_V : VUnitStrideLoad<MOPLDUnitStrideS, LUMOPUnitStride, LSWidthVByte, "vlb.v">;
-def VLH_V : VUnitStrideLoad<MOPLDUnitStrideS, LUMOPUnitStride, LSWidthVHalf, "vlh.v">;
-def VLW_V : VUnitStrideLoad<MOPLDUnitStrideS, LUMOPUnitStride, LSWidthVWord, "vlw.v">;
-
-def VLBU_V : VUnitStrideLoad<MOPLDUnitStrideU, LUMOPUnitStride, LSWidthVByte, "vlbu.v">;
-def VLHU_V : VUnitStrideLoad<MOPLDUnitStrideU, LUMOPUnitStride, LSWidthVHalf, "vlhu.v">;
-def VLWU_V : VUnitStrideLoad<MOPLDUnitStrideU, LUMOPUnitStride, LSWidthVWord, "vlwu.v">;
-
-def VLE_V : VUnitStrideLoad<MOPLDUnitStrideU, LUMOPUnitStride, LSWidthVSEW, "vle.v">;
-
-def VLBFF_V : VUnitStrideLoad<MOPLDUnitStrideS, LUMOPUnitStrideFF, LSWidthVByte, "vlbff.v">;
-def VLHFF_V : VUnitStrideLoad<MOPLDUnitStrideS, LUMOPUnitStrideFF, LSWidthVHalf, "vlhff.v">;
-def VLWFF_V : VUnitStrideLoad<MOPLDUnitStrideS, LUMOPUnitStrideFF, LSWidthVWord, "vlwff.v">;
-
-def VLBUFF_V : VUnitStrideLoad<MOPLDUnitStrideU, LUMOPUnitStrideFF, LSWidthVByte, "vlbuff.v">;
-def VLHUFF_V : VUnitStrideLoad<MOPLDUnitStrideU, LUMOPUnitStrideFF, LSWidthVHalf, "vlhuff.v">;
-def VLWUFF_V : VUnitStrideLoad<MOPLDUnitStrideU, LUMOPUnitStrideFF, LSWidthVWord, "vlwuff.v">;
-
-def VLEFF_V : VUnitStrideLoad<MOPLDUnitStrideU, LUMOPUnitStrideFF, LSWidthVSEW, "vleff.v">;
-
-def VSB_V : VUnitStrideStore<MOPSTUnitStride, SUMOPUnitStride, LSWidthVByte, "vsb.v">;
-def VSH_V : VUnitStrideStore<MOPSTUnitStride, SUMOPUnitStride, LSWidthVHalf, "vsh.v">;
-def VSW_V : VUnitStrideStore<MOPSTUnitStride, SUMOPUnitStride, LSWidthVWord, "vsw.v">;
-
-def VSE_V : VUnitStrideStore<MOPSTUnitStride, SUMOPUnitStride, LSWidthVSEW, "vse.v">;
+def VLE8_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth8, "vle8.v">;
+def VLE16_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth16, "vle16.v">;
+def VLE32_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth32, "vle32.v">;
+def VLE64_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth64, "vle64.v">;
+def VLE128_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth128, "vle128.v">;
+def VLE256_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth256, "vle256.v">;
+def VLE512_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth512, "vle512.v">;
+def VLE1024_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth1024, "vle1024.v">;
+
+def VLE8FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth8, "vle8ff.v">;
+def VLE16FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth16, "vle16ff.v">;
+def VLE32FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth32, "vle32ff.v">;
+def VLE64FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth64, "vle64ff.v">;
+def VLE128FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth128, "vle128ff.v">;
+def VLE256FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth256, "vle256ff.v">;
+def VLE512FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth512, "vle512ff.v">;
+def VLE1024FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth1024, "vle1024ff.v">;
+
+def VSE8_V : VUnitStrideStore<SUMOPUnitStride, LSWidth8, "vse8.v">;
+def VSE16_V : VUnitStrideStore<SUMOPUnitStride, LSWidth16, "vse16.v">;
+def VSE32_V : VUnitStrideStore<SUMOPUnitStride, LSWidth32, "vse32.v">;
+def VSE64_V : VUnitStrideStore<SUMOPUnitStride, LSWidth64, "vse64.v">;
+def VSE128_V : VUnitStrideStore<SUMOPUnitStride, LSWidth128, "vse128.v">;
+def VSE256_V : VUnitStrideStore<SUMOPUnitStride, LSWidth256, "vse256.v">;
+def VSE512_V : VUnitStrideStore<SUMOPUnitStride, LSWidth512, "vse512.v">;
+def VSE1024_V : VUnitStrideStore<SUMOPUnitStride, LSWidth1024, "vse1024.v">;
// Vector Strided Instructions
-def VLSB_V : VStridedLoad<MOPLDStridedS, LSWidthVByte, "vlsb.v">;
-def VLSH_V : VStridedLoad<MOPLDStridedS, LSWidthVHalf, "vlsh.v">;
-def VLSW_V : VStridedLoad<MOPLDStridedS, LSWidthVWord, "vlsw.v">;
-
-def VLSBU_V : VStridedLoad<MOPLDStridedU, LSWidthVByte, "vlsbu.v">;
-def VLSHU_V : VStridedLoad<MOPLDStridedU, LSWidthVHalf, "vlshu.v">;
-def VLSWU_V : VStridedLoad<MOPLDStridedU, LSWidthVWord, "vlswu.v">;
-
-def VLSE_V : VStridedLoad<MOPLDStridedU, LSWidthVSEW, "vlse.v">;
-
-def VSSB_V : VStridedStore<MOPSTStrided, LSWidthVByte, "vssb.v">;
-def VSSH_V : VStridedStore<MOPSTStrided, LSWidthVHalf, "vssh.v">;
-def VSSW_V : VStridedStore<MOPSTStrided, LSWidthVWord, "vssw.v">;
-def VSSE_V : VStridedStore<MOPSTStrided, LSWidthVSEW, "vsse.v">;
+def VLSE8_V : VStridedLoad<LSWidth8, "vlse8.v">;
+def VLSE16_V : VStridedLoad<LSWidth16, "vlse16.v">;
+def VLSE32_V : VStridedLoad<LSWidth32, "vlse32.v">;
+def VLSE64_V : VStridedLoad<LSWidth64, "vlse64.v">;
+def VLSE128_V : VStridedLoad<LSWidth128, "vlse128.v">;
+def VLSE256_V : VStridedLoad<LSWidth256, "vlse256.v">;
+def VLSE512_V : VStridedLoad<LSWidth512, "vlse512.v">;
+def VLSE1024_V : VStridedLoad<LSWidth1024, "vlse1024.v">;
+
+def VSSE8_V : VStridedStore<LSWidth8, "vsse8.v">;
+def VSSE16_V : VStridedStore<LSWidth16, "vsse16.v">;
+def VSSE32_V : VStridedStore<LSWidth32, "vsse32.v">;
+def VSSE64_V : VStridedStore<LSWidth64, "vsse64.v">;
+def VSSE128_V : VStridedStore<LSWidth128, "vsse128.v">;
+def VSSE256_V : VStridedStore<LSWidth256, "vsse256.v">;
+def VSSE512_V : VStridedStore<LSWidth512, "vsse512.v">;
+def VSSE1024_V : VStridedStore<LSWidth1024, "vsse1024.v">;
// Vector Indexed Instructions
-def VLXB_V : VIndexedLoad<MOPLDIndexedS, LSWidthVByte, "vlxb.v">;
-def VLXH_V : VIndexedLoad<MOPLDIndexedS, LSWidthVHalf, "vlxh.v">;
-def VLXW_V : VIndexedLoad<MOPLDIndexedS, LSWidthVWord, "vlxw.v">;
-
-def VLXBU_V : VIndexedLoad<MOPLDIndexedU, LSWidthVByte, "vlxbu.v">;
-def VLXHU_V : VIndexedLoad<MOPLDIndexedU, LSWidthVHalf, "vlxhu.v">;
-def VLXWU_V : VIndexedLoad<MOPLDIndexedU, LSWidthVWord, "vlxwu.v">;
-
-def VLXE_V : VIndexedLoad<MOPLDIndexedU, LSWidthVSEW, "vlxe.v">;
-
-def VSXB_V : VIndexedStore<MOPSTIndexedOrder, LSWidthVByte, "vsxb.v">;
-def VSXH_V : VIndexedStore<MOPSTIndexedOrder, LSWidthVHalf, "vsxh.v">;
-def VSXW_V : VIndexedStore<MOPSTIndexedOrder, LSWidthVWord, "vsxw.v">;
-def VSXE_V : VIndexedStore<MOPSTIndexedOrder, LSWidthVSEW, "vsxe.v">;
-
-def VSUXB_V : VIndexedStore<MOPSTIndexedUnOrd, LSWidthVByte, "vsuxb.v">;
-def VSUXH_V : VIndexedStore<MOPSTIndexedUnOrd, LSWidthVHalf, "vsuxh.v">;
-def VSUXW_V : VIndexedStore<MOPSTIndexedUnOrd, LSWidthVWord, "vsuxw.v">;
-def VSUXE_V : VIndexedStore<MOPSTIndexedUnOrd, LSWidthVSEW, "vsuxe.v">;
-
-def VL1R_V : VWholeLoad<0, "vl1r.v">;
-def VS1R_V : VWholeStore<0, "vs1r.v">;
+def VLUXEI8_V : VIndexedLoad<MOPLDIndexedUnord, LSWidth8, "vluxei8.v">;
+def VLUXEI16_V : VIndexedLoad<MOPLDIndexedUnord, LSWidth16, "vluxei16.v">;
+def VLUXEI32_V : VIndexedLoad<MOPLDIndexedUnord, LSWidth32, "vluxei32.v">;
+def VLUXEI64_V : VIndexedLoad<MOPLDIndexedUnord, LSWidth64, "vluxei64.v">;
+
+def VLOXEI8_V : VIndexedLoad<MOPLDIndexedOrder, LSWidth8, "vloxei8.v">;
+def VLOXEI16_V : VIndexedLoad<MOPLDIndexedOrder, LSWidth16, "vloxei16.v">;
+def VLOXEI32_V : VIndexedLoad<MOPLDIndexedOrder, LSWidth32, "vloxei32.v">;
+def VLOXEI64_V : VIndexedLoad<MOPLDIndexedOrder, LSWidth64, "vloxei64.v">;
+
+def VSUXEI8_V : VIndexedStore<MOPSTIndexedUnord, LSWidth8, "vsuxei8.v">;
+def VSUXEI16_V : VIndexedStore<MOPSTIndexedUnord, LSWidth16, "vsuxei16.v">;
+def VSUXEI32_V : VIndexedStore<MOPSTIndexedUnord, LSWidth32, "vsuxei32.v">;
+def VSUXEI64_V : VIndexedStore<MOPSTIndexedUnord, LSWidth64, "vsuxei64.v">;
+
+def VSOXEI8_V : VIndexedStore<MOPSTIndexedOrder, LSWidth8, "vsoxei8.v">;
+def VSOXEI16_V : VIndexedStore<MOPSTIndexedOrder, LSWidth16, "vsoxei16.v">;
+def VSOXEI32_V : VIndexedStore<MOPSTIndexedOrder, LSWidth32, "vsoxei32.v">;
+def VSOXEI64_V : VIndexedStore<MOPSTIndexedOrder, LSWidth64, "vsoxei64.v">;
+
+defm VL1R : VWholeLoad<1, "vl1r">;
+defm VL2R : VWholeLoad<2, "vl2r">;
+defm VL4R : VWholeLoad<4, "vl4r">;
+defm VL8R : VWholeLoad<8, "vl8r">;
+def : InstAlias<"vl1r.v $vd, (${rs1})", (VL1RE8_V VR:$vd, GPR:$rs1)>;
+def : InstAlias<"vl2r.v $vd, (${rs1})", (VL2RE8_V VR:$vd, GPR:$rs1)>;
+def : InstAlias<"vl4r.v $vd, (${rs1})", (VL4RE8_V VR:$vd, GPR:$rs1)>;
+def : InstAlias<"vl8r.v $vd, (${rs1})", (VL8RE8_V VR:$vd, GPR:$rs1)>;
+
+def VS1R_V : VWholeStore<1, "vs1r.v">;
+def VS2R_V : VWholeStore<2, "vs2r.v">;
+def VS4R_V : VWholeStore<4, "vs4r.v">;
+def VS8R_V : VWholeStore<8, "vs8r.v">;
// Vector Single-Width Integer Add and Subtract
defm VADD_V : VALU_IV_V_X_I<"vadd", 0b000000>;
defm VSUB_V : VALU_IV_V_X<"vsub", 0b000010>;
defm VRSUB_V : VALU_IV_X_I<"vrsub", 0b000011>;
+def : InstAlias<"vneg.v $vd, $vs$vm", (VRSUB_VX VR:$vd, VR:$vs, X0, VMaskOp:$vm)>;
+
// Vector Widening Integer Add/Subtract
// Refer to 11.2 Widening Vector Arithmetic Instructions
// The destination vector register group cannot overlap a source vector
@@ -468,17 +556,29 @@ defm VWSUB_W : VALU_MV_V_X<"vwsub", 0b110111, "w">;
} // Constraints = "@earlyclobber $vd"
def : InstAlias<"vwcvt.x.x.v $vd, $vs$vm",
- (VWADD_VX VRegOp:$vd, VRegOp:$vs, X0, VMaskOp:$vm)>;
+ (VWADD_VX VR:$vd, VR:$vs, X0, VMaskOp:$vm)>;
def : InstAlias<"vwcvtu.x.x.v $vd, $vs$vm",
- (VWADDU_VX VRegOp:$vd, VRegOp:$vs, X0, VMaskOp:$vm)>;
+ (VWADDU_VX VR:$vd, VR:$vs, X0, VMaskOp:$vm)>;
+
+// Vector Integer Extension
+defm VZEXT_VF8 : VALU_MV_VS2<"vzext.vf8", 0b010010, 0b00010>;
+defm VSEXT_VF8 : VALU_MV_VS2<"vsext.vf8", 0b010010, 0b00011>;
+defm VZEXT_VF4 : VALU_MV_VS2<"vzext.vf4", 0b010010, 0b00100>;
+defm VSEXT_VF4 : VALU_MV_VS2<"vsext.vf4", 0b010010, 0b00101>;
+defm VZEXT_VF2 : VALU_MV_VS2<"vzext.vf2", 0b010010, 0b00110>;
+defm VSEXT_VF2 : VALU_MV_VS2<"vsext.vf2", 0b010010, 0b00111>;
// Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
defm VADC_V : VALUm_IV_V_X_I<"vadc", 0b010000>;
+let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint in {
defm VMADC_V : VALUm_IV_V_X_I<"vmadc", 0b010001>;
defm VMADC_V : VALUNoVm_IV_V_X_I<"vmadc", 0b010001>;
+} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint
defm VSBC_V : VALUm_IV_V_X<"vsbc", 0b010010>;
+let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint in {
defm VMSBC_V : VALUm_IV_V_X<"vmsbc", 0b010011>;
defm VMSBC_V : VALUNoVm_IV_V_X<"vmsbc", 0b010011>;
+} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint
// Vector Bitwise Logical Instructions
defm VAND_V : VALU_IV_V_X_I<"vand", 0b001001>;
@@ -486,7 +586,7 @@ defm VOR_V : VALU_IV_V_X_I<"vor", 0b001010>;
defm VXOR_V : VALU_IV_V_X_I<"vxor", 0b001011>;
def : InstAlias<"vnot.v $vd, $vs$vm",
- (VXOR_VI VRegOp:$vd, VRegOp:$vs, -1, VMaskOp:$vm)>;
+ (VXOR_VI VR:$vd, VR:$vs, -1, VMaskOp:$vm)>;
// Vector Single-Width Bit Shift Instructions
defm VSLL_V : VALU_IV_V_X_I<"vsll", 0b100101, uimm5>;
@@ -498,12 +598,16 @@ defm VSRA_V : VALU_IV_V_X_I<"vsra", 0b101001, uimm5>;
// The destination vector register group cannot overlap the first source
// vector register group (specified by vs2). The destination vector register
// group cannot overlap the mask register if used, unless LMUL=1.
-let Constraints = "@earlyclobber $vd", RVVConstraint = Narrow in {
+let Constraints = "@earlyclobber $vd" in {
defm VNSRL_W : VALU_IV_V_X_I<"vnsrl", 0b101100, uimm5, "w">;
defm VNSRA_W : VALU_IV_V_X_I<"vnsra", 0b101101, uimm5, "w">;
-} // Constraints = "@earlyclobber $vd", RVVConstraint = Narrow
+} // Constraints = "@earlyclobber $vd"
+
+def : InstAlias<"vncvt.x.x.w $vd, $vs$vm",
+ (VNSRL_WX VR:$vd, VR:$vs, X0, VMaskOp:$vm)>;
// Vector Integer Comparison Instructions
+let RVVConstraint = NoConstraint in {
defm VMSEQ_V : VALU_IV_V_X_I<"vmseq", 0b011000>;
defm VMSNE_V : VALU_IV_V_X_I<"vmsne", 0b011001>;
defm VMSLTU_V : VALU_IV_V_X<"vmsltu", 0b011010>;
@@ -512,27 +616,61 @@ defm VMSLEU_V : VALU_IV_V_X_I<"vmsleu", 0b011100>;
defm VMSLE_V : VALU_IV_V_X_I<"vmsle", 0b011101>;
defm VMSGTU_V : VALU_IV_X_I<"vmsgtu", 0b011110>;
defm VMSGT_V : VALU_IV_X_I<"vmsgt", 0b011111>;
+} // RVVConstraint = NoConstraint
def : InstAlias<"vmsgtu.vv $vd, $va, $vb$vm",
- (VMSLTU_VV VRegOp:$vd, VRegOp:$vb, VRegOp:$va, VMaskOp:$vm), 0>;
+ (VMSLTU_VV VR:$vd, VR:$vb, VR:$va, VMaskOp:$vm), 0>;
def : InstAlias<"vmsgt.vv $vd, $va, $vb$vm",
- (VMSLT_VV VRegOp:$vd, VRegOp:$vb, VRegOp:$va, VMaskOp:$vm), 0>;
+ (VMSLT_VV VR:$vd, VR:$vb, VR:$va, VMaskOp:$vm), 0>;
def : InstAlias<"vmsgeu.vv $vd, $va, $vb$vm",
- (VMSLEU_VV VRegOp:$vd, VRegOp:$vb, VRegOp:$va, VMaskOp:$vm), 0>;
+ (VMSLEU_VV VR:$vd, VR:$vb, VR:$va, VMaskOp:$vm), 0>;
def : InstAlias<"vmsge.vv $vd, $va, $vb$vm",
- (VMSLE_VV VRegOp:$vd, VRegOp:$vb, VRegOp:$va, VMaskOp:$vm), 0>;
-def : InstAlias<"vmsltu.vi $vd, $va, $imm$vm",
- (VMSLEU_VI VRegOp:$vd, VRegOp:$va, simm5_plus1:$imm,
- VMaskOp:$vm), 0>;
-def : InstAlias<"vmslt.vi $vd, $va, $imm$vm",
- (VMSLE_VI VRegOp:$vd, VRegOp:$va, simm5_plus1:$imm,
- VMaskOp:$vm), 0>;
-def : InstAlias<"vmsgeu.vi $vd, $va, $imm$vm",
- (VMSGTU_VI VRegOp:$vd, VRegOp:$va, simm5_plus1:$imm,
- VMaskOp:$vm), 0>;
-def : InstAlias<"vmsge.vi $vd, $va, $imm$vm",
- (VMSGT_VI VRegOp:$vd, VRegOp:$va, simm5_plus1:$imm,
- VMaskOp:$vm), 0>;
+ (VMSLE_VV VR:$vd, VR:$vb, VR:$va, VMaskOp:$vm), 0>;
+
+let isCodeGenOnly = 0, isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 0,
+ mayStore = 0 in {
+// For unsigned comparisons we need to special-case an immediate of 0 to keep
+// the always-true/always-false semantics, which would be inverted if we simply
+// decremented the immediate as we do for the signed forms. To match the GNU
+// assembler we will use vmseq.vv/vmsne.vv with the same register for both
+// operands, which we cannot express from an InstAlias.
+def PseudoVMSGEU_VI : Pseudo<(outs VR:$vd),
+ (ins VR:$vs2, simm5_plus1:$imm, VMaskOp:$vm),
+ [], "vmsgeu.vi", "$vd, $vs2, $imm$vm">;
+def PseudoVMSLTU_VI : Pseudo<(outs VR:$vd),
+ (ins VR:$vs2, simm5_plus1:$imm, VMaskOp:$vm),
+ [], "vmsltu.vi", "$vd, $vs2, $imm$vm">;
+// Handle signed with pseudos as well for more consistency in the
+// implementation.
+def PseudoVMSGE_VI : Pseudo<(outs VR:$vd),
+ (ins VR:$vs2, simm5_plus1:$imm, VMaskOp:$vm),
+ [], "vmsge.vi", "$vd, $vs2, $imm$vm">;
+def PseudoVMSLT_VI : Pseudo<(outs VR:$vd),
+ (ins VR:$vs2, simm5_plus1:$imm, VMaskOp:$vm),
+ [], "vmslt.vi", "$vd, $vs2, $imm$vm">;
+}
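+// Illustrative expansion of the .vi pseudos above (assumed, mirroring the
+// comment): "vmsgeu.vi vd, va, 0" would become the always-true
+// "vmseq.vv vd, va, va", while a nonzero immediate is simply decremented,
+// e.g. "vmsgeu.vi vd, va, 4" becoming "vmsgtu.vi vd, va, 3".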
+
+let isCodeGenOnly = 0, isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 0,
+ mayStore = 0 in {
+def PseudoVMSGEU_VX : Pseudo<(outs VR:$vd),
+ (ins VR:$vs2, GPR:$rs1),
+ [], "vmsgeu.vx", "$vd, $vs2, $rs1">;
+def PseudoVMSGE_VX : Pseudo<(outs VR:$vd),
+ (ins VR:$vs2, GPR:$rs1),
+ [], "vmsge.vx", "$vd, $vs2, $rs1">;
+def PseudoVMSGEU_VX_M : Pseudo<(outs VRNoV0:$vd),
+ (ins VR:$vs2, GPR:$rs1, VMaskOp:$vm),
+ [], "vmsgeu.vx", "$vd, $vs2, $rs1$vm">;
+def PseudoVMSGE_VX_M : Pseudo<(outs VRNoV0:$vd),
+ (ins VR:$vs2, GPR:$rs1, VMaskOp:$vm),
+ [], "vmsge.vx", "$vd, $vs2, $rs1$vm">;
+def PseudoVMSGEU_VX_M_T : Pseudo<(outs VMV0:$vd, VR:$scratch),
+ (ins VR:$vs2, GPR:$rs1, VMaskOp:$vm),
+ [], "vmsgeu.vx", "$vd, $vs2, $rs1$vm, $scratch">;
+def PseudoVMSGE_VX_M_T : Pseudo<(outs VMV0:$vd, VR:$scratch),
+ (ins VR:$vs2, GPR:$rs1, VMaskOp:$vm),
+ [], "vmsge.vx", "$vd, $vs2, $rs1$vm, $scratch">;
+}
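+// Illustrative note for the .vx pseudos above: following the expansion the
+// V spec suggests, the unmasked "vmsge.vx vd, va, rs1" form can be emitted as
+// "vmslt.vx vd, va, rs1" followed by "vmnot.m vd, vd"; the _M/_M_T variants
+// additionally carry the mask and, for _M_T, a scratch register.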
// Vector Integer Min/Max Instructions
defm VMINU_V : VALU_IV_V_X<"vminu", 0b000100>;
@@ -577,15 +715,16 @@ defm VWMACCUS_V : VALUr_MV_X<"vwmaccus", 0b111110>;
defm VMERGE_V : VALUm_IV_V_X_I<"vmerge", 0b010111>;
// Vector Integer Move Instructions
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0, vs2 = 0, vm = 1 in {
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, vs2 = 0, vm = 1,
+ RVVConstraint = NoConstraint in {
// op vd, vs1
-def VMV_V_V : RVInstVV<0b010111, OPIVV, (outs VRegOp:$vd),
- (ins VRegOp:$vs1), "vmv.v.v", "$vd, $vs1">;
+def VMV_V_V : RVInstVV<0b010111, OPIVV, (outs VR:$vd),
+ (ins VR:$vs1), "vmv.v.v", "$vd, $vs1">;
// op vd, rs1
-def VMV_V_X : RVInstVX<0b010111, OPIVX, (outs VRegOp:$vd),
+def VMV_V_X : RVInstVX<0b010111, OPIVX, (outs VR:$vd),
(ins GPR:$rs1), "vmv.v.x", "$vd, $rs1">;
// op vd, imm
-def VMV_V_I : RVInstIVI<0b010111, (outs VRegOp:$vd),
+def VMV_V_I : RVInstIVI<0b010111, (outs VR:$vd),
(ins simm5:$imm), "vmv.v.i", "$vd, $imm">;
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
@@ -609,11 +748,13 @@ defm VSSRL_V : VALU_IV_V_X_I<"vssrl", 0b101010, uimm5>;
defm VSSRA_V : VALU_IV_V_X_I<"vssra", 0b101011, uimm5>;
// Vector Narrowing Fixed-Point Clip Instructions
-let Constraints = "@earlyclobber $vd", RVVConstraint = Narrow in {
+let Constraints = "@earlyclobber $vd" in {
defm VNCLIPU_W : VALU_IV_V_X_I<"vnclipu", 0b101110, uimm5, "w">;
defm VNCLIP_W : VALU_IV_V_X_I<"vnclip", 0b101111, uimm5, "w">;
-} // Constraints = "@earlyclobber $vd", RVVConstraint = Narrow
+} // Constraints = "@earlyclobber $vd"
+} // Predicates = [HasStdExtV]
+let Predicates = [HasStdExtV, HasStdExtF] in {
// Vector Single-Width Floating-Point Add/Subtract Instructions
defm VFADD_V : VALU_FV_V_F<"vfadd", 0b000000>;
defm VFSUB_V : VALU_FV_V_F<"vfsub", 0b000010>;
@@ -664,7 +805,9 @@ defm VFWNMSAC_V : VALUr_FV_V_F<"vfwnmsac", 0b111111>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV
// Vector Floating-Point Square-Root Instruction
-defm VFSQRT_V : VALU_FV_VS2<"vfsqrt.v", 0b100011, 0b00000>;
+defm VFSQRT_V : VALU_FV_VS2<"vfsqrt.v", 0b010011, 0b00000>;
+defm VFRSQRTE7_V : VALU_FV_VS2<"vfrsqrte7.v", 0b010011, 0b00100>;
+defm VFRECE7_V : VALU_FV_VS2<"vfrece7.v", 0b010011, 0b00101>;
// Vector Floating-Point MIN/MAX Instructions
defm VFMIN_V : VALU_FV_V_F<"vfmin", 0b000100>;
@@ -675,32 +818,38 @@ defm VFSGNJ_V : VALU_FV_V_F<"vfsgnj", 0b001000>;
defm VFSGNJN_V : VALU_FV_V_F<"vfsgnjn", 0b001001>;
defm VFSGNJX_V : VALU_FV_V_F<"vfsgnjx", 0b001010>;
+def : InstAlias<"vfneg.v $vd, $vs$vm",
+ (VFSGNJN_VV VR:$vd, VR:$vs, VR:$vs, VMaskOp:$vm)>;
+
// Vector Floating-Point Compare Instructions
+let RVVConstraint = NoConstraint in {
defm VMFEQ_V : VALU_FV_V_F<"vmfeq", 0b011000>;
defm VMFNE_V : VALU_FV_V_F<"vmfne", 0b011100>;
defm VMFLT_V : VALU_FV_V_F<"vmflt", 0b011011>;
defm VMFLE_V : VALU_FV_V_F<"vmfle", 0b011001>;
defm VMFGT_V : VALU_FV_F<"vmfgt", 0b011101>;
defm VMFGE_V : VALU_FV_F<"vmfge", 0b011111>;
+} // RVVConstraint = NoConstraint
def : InstAlias<"vmfgt.vv $vd, $va, $vb$vm",
- (VMFLT_VV VRegOp:$vd, VRegOp:$vb, VRegOp:$va, VMaskOp:$vm), 0>;
+ (VMFLT_VV VR:$vd, VR:$vb, VR:$va, VMaskOp:$vm), 0>;
def : InstAlias<"vmfge.vv $vd, $va, $vb$vm",
- (VMFLE_VV VRegOp:$vd, VRegOp:$vb, VRegOp:$va, VMaskOp:$vm), 0>;
+ (VMFLE_VV VR:$vd, VR:$vb, VR:$va, VMaskOp:$vm), 0>;
// Vector Floating-Point Classify Instruction
-defm VFCLASS_V : VALU_FV_VS2<"vfclass.v", 0b100011, 0b10000>;
+defm VFCLASS_V : VALU_FV_VS2<"vfclass.v", 0b010011, 0b10000>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
// Vector Floating-Point Merge Instruction
-def VFMERGE_VFM : RVInstVX<0b010111, OPFVF, (outs VRegOp:$vd),
- (ins VRegOp:$vs2, FPR32:$rs1, VMV0:$v0),
+def VFMERGE_VFM : RVInstVX<0b010111, OPFVF, (outs VR:$vd),
+ (ins VR:$vs2, FPR32:$rs1, VMV0:$v0),
"vfmerge.vfm", "$vd, $vs2, $rs1, v0"> {
let vm = 0;
}
// Vector Floating-Point Move Instruction
-def VFMV_V_F : RVInstVX<0b010111, OPFVF, (outs VRegOp:$vd),
+let RVVConstraint = NoConstraint in
+def VFMV_V_F : RVInstVX<0b010111, OPFVF, (outs VR:$vd),
(ins FPR32:$rs1), "vfmv.v.f", "$vd, $rs1"> {
let vs2 = 0;
let vm = 1;
@@ -708,31 +857,40 @@ def VFMV_V_F : RVInstVX<0b010111, OPFVF, (outs VRegOp:$vd),
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
// Single-Width Floating-Point/Integer Type-Convert Instructions
-defm VFCVT_XU_F_V : VALU_FV_VS2<"vfcvt.xu.f.v", 0b100010, 0b00000>;
-defm VFCVT_X_F_V : VALU_FV_VS2<"vfcvt.x.f.v", 0b100010, 0b00001>;
-defm VFCVT_F_XU_V : VALU_FV_VS2<"vfcvt.f.xu.v", 0b100010, 0b00010>;
-defm VFCVT_F_X_V : VALU_FV_VS2<"vfcvt.f.x.v", 0b100010, 0b00011>;
+defm VFCVT_XU_F_V : VALU_FV_VS2<"vfcvt.xu.f.v", 0b010010, 0b00000>;
+defm VFCVT_X_F_V : VALU_FV_VS2<"vfcvt.x.f.v", 0b010010, 0b00001>;
+defm VFCVT_RTZ_XU_F_V : VALU_FV_VS2<"vfcvt.rtz.xu.f.v", 0b010010, 0b00110>;
+defm VFCVT_RTZ_X_F_V : VALU_FV_VS2<"vfcvt.rtz.x.f.v", 0b010010, 0b00111>;
+defm VFCVT_F_XU_V : VALU_FV_VS2<"vfcvt.f.xu.v", 0b010010, 0b00010>;
+defm VFCVT_F_X_V : VALU_FV_VS2<"vfcvt.f.x.v", 0b010010, 0b00011>;
// Widening Floating-Point/Integer Type-Convert Instructions
let Constraints = "@earlyclobber $vd", RVVConstraint = WidenCvt in {
-defm VFWCVT_XU_F_V : VALU_FV_VS2<"vfwcvt.xu.f.v", 0b100010, 0b01000>;
-defm VFWCVT_X_F_V : VALU_FV_VS2<"vfwcvt.x.f.v", 0b100010, 0b01001>;
-defm VFWCVT_F_XU_V : VALU_FV_VS2<"vfwcvt.f.xu.v", 0b100010, 0b01010>;
-defm VFWCVT_F_X_V : VALU_FV_VS2<"vfwcvt.f.x.v", 0b100010, 0b01011>;
-defm VFWCVT_F_F_V : VALU_FV_VS2<"vfwcvt.f.f.v", 0b100010, 0b01100>;
+defm VFWCVT_XU_F_V : VALU_FV_VS2<"vfwcvt.xu.f.v", 0b010010, 0b01000>;
+defm VFWCVT_X_F_V : VALU_FV_VS2<"vfwcvt.x.f.v", 0b010010, 0b01001>;
+defm VFWCVT_RTZ_XU_F_V : VALU_FV_VS2<"vfwcvt.rtz.xu.f.v", 0b010010, 0b01110>;
+defm VFWCVT_RTZ_X_F_V : VALU_FV_VS2<"vfwcvt.rtz.x.f.v", 0b010010, 0b01111>;
+defm VFWCVT_F_XU_V : VALU_FV_VS2<"vfwcvt.f.xu.v", 0b010010, 0b01010>;
+defm VFWCVT_F_X_V : VALU_FV_VS2<"vfwcvt.f.x.v", 0b010010, 0b01011>;
+defm VFWCVT_F_F_V : VALU_FV_VS2<"vfwcvt.f.f.v", 0b010010, 0b01100>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenCvt
// Narrowing Floating-Point/Integer Type-Convert Instructions
-let Constraints = "@earlyclobber $vd", RVVConstraint = Narrow in {
-defm VFNCVT_XU_F_W : VALU_FV_VS2<"vfncvt.xu.f.w", 0b100010, 0b10000>;
-defm VFNCVT_X_F_W : VALU_FV_VS2<"vfncvt.x.f.w", 0b100010, 0b10001>;
-defm VFNCVT_F_XU_W : VALU_FV_VS2<"vfncvt.f.xu.w", 0b100010, 0b10010>;
-defm VFNCVT_F_X_W : VALU_FV_VS2<"vfncvt.f.x.w", 0b100010, 0b10011>;
-defm VFNCVT_F_F_W : VALU_FV_VS2<"vfncvt.f.f.w", 0b100010, 0b10100>;
-defm VFNCVT_ROD_F_F_W : VALU_FV_VS2<"vfncvt.rod.f.f.w", 0b100010, 0b10101>;
-} // Constraints = "@earlyclobber $vd", RVVConstraint = Narrow
+let Constraints = "@earlyclobber $vd" in {
+defm VFNCVT_XU_F_W : VALU_FV_VS2<"vfncvt.xu.f.w", 0b010010, 0b10000>;
+defm VFNCVT_X_F_W : VALU_FV_VS2<"vfncvt.x.f.w", 0b010010, 0b10001>;
+defm VFNCVT_RTZ_XU_F_W : VALU_FV_VS2<"vfncvt.rtz.xu.f.w", 0b010010, 0b10110>;
+defm VFNCVT_RTZ_X_F_W : VALU_FV_VS2<"vfncvt.rtz.x.f.w", 0b010010, 0b10111>;
+defm VFNCVT_F_XU_W : VALU_FV_VS2<"vfncvt.f.xu.w", 0b010010, 0b10010>;
+defm VFNCVT_F_X_W : VALU_FV_VS2<"vfncvt.f.x.w", 0b010010, 0b10011>;
+defm VFNCVT_F_F_W : VALU_FV_VS2<"vfncvt.f.f.w", 0b010010, 0b10100>;
+defm VFNCVT_ROD_F_F_W : VALU_FV_VS2<"vfncvt.rod.f.f.w", 0b010010, 0b10101>;
+} // Constraints = "@earlyclobber $vd"
+} // Predicates = [HasStdExtV, HasStdExtF]
+let Predicates = [HasStdExtV] in {
// Vector Single-Width Integer Reduction Instructions
+let RVVConstraint = NoConstraint in {
defm VREDSUM : VALU_MV_V<"vredsum", 0b000000>;
defm VREDMAXU : VALU_MV_V<"vredmaxu", 0b000110>;
defm VREDMAX : VALU_MV_V<"vredmax", 0b000111>;
@@ -741,34 +899,42 @@ defm VREDMIN : VALU_MV_V<"vredmin", 0b000101>;
defm VREDAND : VALU_MV_V<"vredand", 0b000001>;
defm VREDOR : VALU_MV_V<"vredor", 0b000010>;
defm VREDXOR : VALU_MV_V<"vredxor", 0b000011>;
+} // RVVConstraint = NoConstraint
// Vector Widening Integer Reduction Instructions
-let Constraints = "@earlyclobber $vd" in {
+let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint in {
// Set earlyclobber on the following instructions for the second and mask operands.
// This has the downside that the earlyclobber constraint is too coarse and
// will impose unnecessary restrictions by not allowing the destination to
// overlap with the first (wide) operand.
defm VWREDSUMU : VALU_IV_V<"vwredsumu", 0b110000>;
defm VWREDSUM : VALU_IV_V<"vwredsum", 0b110001>;
-} // Constraints = "@earlyclobber $vd"
+} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint
+} // Predicates = [HasStdExtV]
+let Predicates = [HasStdExtV, HasStdExtF] in {
// Vector Single-Width Floating-Point Reduction Instructions
+let RVVConstraint = NoConstraint in {
defm VFREDOSUM : VALU_FV_V<"vfredosum", 0b000011>;
defm VFREDSUM : VALU_FV_V<"vfredsum", 0b000001>;
defm VFREDMAX : VALU_FV_V<"vfredmax", 0b000111>;
defm VFREDMIN : VALU_FV_V<"vfredmin", 0b000101>;
+} // RVVConstraint = NoConstraint
// Vector Widening Floating-Point Reduction Instructions
-let Constraints = "@earlyclobber $vd" in {
+let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint in {
// Set earlyclobber on the following instructions for the second and mask operands.
// This has the downside that the earlyclobber constraint is too coarse and
// will impose unnecessary restrictions by not allowing the destination to
// overlap with the first (wide) operand.
defm VFWREDOSUM : VALU_FV_V<"vfwredosum", 0b110011>;
defm VFWREDSUM : VALU_FV_V<"vfwredsum", 0b110001>;
-} // Constraints = "@earlyclobber $vd"
+} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint
+} // Predicates = [HasStdExtV, HasStdExtF]
+let Predicates = [HasStdExtV] in {
// Vector Mask-Register Logical Instructions
+let RVVConstraint = NoConstraint in {
defm VMAND_M : VALU_MV_Mask<"vmand", 0b011001, "m">;
defm VMNAND_M : VALU_MV_Mask<"vmnand", 0b011101, "m">;
defm VMANDNOT_M : VALU_MV_Mask<"vmandnot", 0b011000, "m">;
@@ -777,82 +943,95 @@ defm VMOR_M : VALU_MV_Mask<"vmor", 0b011010, "m">;
defm VMNOR_M : VALU_MV_Mask<"vmnor", 0b011110, "m">;
defm VMORNOT_M : VALU_MV_Mask<"vmornot", 0b011100, "m">;
defm VMXNOR_M : VALU_MV_Mask<"vmxnor", 0b011111, "m">;
+}
-def : InstAlias<"vmcpy.m $vd, $vs",
- (VMAND_MM VRegOp:$vd, VRegOp:$vs, VRegOp:$vs)>;
+def : InstAlias<"vmmv.m $vd, $vs",
+ (VMAND_MM VR:$vd, VR:$vs, VR:$vs)>;
def : InstAlias<"vmclr.m $vd",
- (VMXOR_MM VRegOp:$vd, VRegOp:$vd, VRegOp:$vd)>;
+ (VMXOR_MM VR:$vd, VR:$vd, VR:$vd)>;
def : InstAlias<"vmset.m $vd",
- (VMXNOR_MM VRegOp:$vd, VRegOp:$vd, VRegOp:$vd)>;
+ (VMXNOR_MM VR:$vd, VR:$vd, VR:$vd)>;
def : InstAlias<"vmnot.m $vd, $vs",
- (VMNAND_MM VRegOp:$vd, VRegOp:$vs, VRegOp:$vs)>;
+ (VMNAND_MM VR:$vd, VR:$vs, VR:$vs)>;
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0,
+ RVVConstraint = NoConstraint in {
// Vector mask population count vpopc
def VPOPC_M : RVInstV<0b010000, 0b10000, OPMVV, (outs GPR:$vd),
- (ins VRegOp:$vs2, VMaskOp:$vm),
+ (ins VR:$vs2, VMaskOp:$vm),
"vpopc.m", "$vd, $vs2$vm">;
// vfirst find-first-set mask bit
def VFIRST_M : RVInstV<0b010000, 0b10001, OPMVV, (outs GPR:$vd),
- (ins VRegOp:$vs2, VMaskOp:$vm),
+ (ins VR:$vs2, VMaskOp:$vm),
"vfirst.m", "$vd, $vs2$vm">;
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
+let Constraints = "@earlyclobber $vd", RVVConstraint = Iota in {
// vmsbf.m set-before-first mask bit
defm VMSBF_M : VALU_MV_VS2<"vmsbf.m", 0b010100, 0b00001>;
-
// vmsif.m set-including-first mask bit
defm VMSIF_M : VALU_MV_VS2<"vmsif.m", 0b010100, 0b00011>;
-
// vmsof.m set-only-first mask bit
defm VMSOF_M : VALU_MV_VS2<"vmsof.m", 0b010100, 0b00010>;
-
// Vector Iota Instruction
-let Constraints = "@earlyclobber $vd", RVVConstraint = Iota in {
defm VIOTA_M : VALU_MV_VS2<"viota.m", 0b010100, 0b10000>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = Iota
// Vector Element Index Instruction
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
-def VID_V : RVInstV<0b010100, 0b10001, OPMVV, (outs VRegOp:$vd),
+def VID_V : RVInstV<0b010100, 0b10001, OPMVV, (outs VR:$vd),
(ins VMaskOp:$vm), "vid.v", "$vd$vm"> {
let vs2 = 0;
}
// Integer Scalar Move Instructions
-let vm = 1 in {
+let vm = 1, RVVConstraint = NoConstraint in {
def VMV_X_S : RVInstV<0b010000, 0b00000, OPMVV, (outs GPR:$vd),
- (ins VRegOp:$vs2), "vmv.x.s", "$vd, $vs2">;
-def VMV_S_X : RVInstV2<0b010000, 0b00000, OPMVX, (outs VRegOp:$vd),
- (ins GPR:$rs1), "vmv.s.x", "$vd, $rs1">;
+ (ins VR:$vs2), "vmv.x.s", "$vd, $vs2">;
+let Constraints = "$vd = $vd_wb" in
+def VMV_S_X : RVInstV2<0b010000, 0b00000, OPMVX, (outs VR:$vd_wb),
+ (ins VR:$vd, GPR:$rs1), "vmv.s.x", "$vd, $rs1">;
}
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
+} // Predicates = [HasStdExtV]
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0, vm = 1 in {
+let Predicates = [HasStdExtV, HasStdExtF] in {
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, vm = 1,
+ RVVConstraint = NoConstraint in {
// Floating-Point Scalar Move Instructions
def VFMV_F_S : RVInstV<0b010000, 0b00000, OPFVV, (outs FPR32:$vd),
- (ins VRegOp:$vs2), "vfmv.f.s", "$vd, $vs2">;
-def VFMV_S_F : RVInstV2<0b010000, 0b00000, OPFVF, (outs VRegOp:$vd),
- (ins FPR32:$rs1), "vfmv.s.f", "$vd, $rs1">;
+ (ins VR:$vs2), "vfmv.f.s", "$vd, $vs2">;
+let Constraints = "$vd = $vd_wb" in
+def VFMV_S_F : RVInstV2<0b010000, 0b00000, OPFVF, (outs VR:$vd_wb),
+ (ins VR:$vd, FPR32:$rs1), "vfmv.s.f", "$vd, $rs1">;
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0, vm = 1
+} // Predicates = [HasStdExtV, HasStdExtF]
+let Predicates = [HasStdExtV] in {
// Vector Slide Instructions
let Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp in {
defm VSLIDEUP_V : VALU_IV_X_I<"vslideup", 0b001110, uimm5>;
+defm VSLIDE1UP_V : VALU_MV_X<"vslide1up", 0b001110>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp
defm VSLIDEDOWN_V : VALU_IV_X_I<"vslidedown", 0b001111, uimm5>;
+defm VSLIDE1DOWN_V : VALU_MV_X<"vslide1down", 0b001111>;
+} // Predicates = [HasStdExtV]
+let Predicates = [HasStdExtV, HasStdExtF] in {
let Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp in {
-defm VSLIDE1UP_V : VALU_MV_X<"vslide1up", 0b001110>;
+defm VFSLIDE1UP_V : VALU_FV_F<"vfslide1up", 0b001110>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp
-defm VSLIDE1DOWN_V : VALU_MV_X<"vslide1down", 0b001111>;
+defm VFSLIDE1DOWN_V : VALU_FV_F<"vfslide1down", 0b001111>;
+} // Predicates = [HasStdExtV, HasStdExtF]
+let Predicates = [HasStdExtV] in {
// Vector Register Gather Instruction
let Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather in {
defm VRGATHER_V : VALU_IV_V_X_I<"vrgather", 0b001100, uimm5>;
+def VRGATHEREI16_VV : VALUVV<0b001110, OPIVV, "vrgatherei16.vv">;
} // Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather
// Vector Compress Instruction
@@ -860,10 +1039,11 @@ let Constraints = "@earlyclobber $vd", RVVConstraint = Vcompress in {
defm VCOMPRESS_V : VALU_MV_Mask<"vcompress", 0b010111>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = Vcompress
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0,
+ RVVConstraint = NoConstraint in {
foreach nf = [1, 2, 4, 8] in {
- def VMV#nf#R_V : RVInstV<0b100111, !add(nf, -1), OPIVI, (outs VRegOp:$vd),
- (ins VRegOp:$vs2), "vmv" # nf # "r.v",
+ def VMV#nf#R_V : RVInstV<0b100111, !add(nf, -1), OPIVI, (outs VR:$vd),
+ (ins VR:$vs2), "vmv" # nf # "r.v",
"$vd, $vs2"> {
let Uses = [];
let vm = 1;
@@ -871,3 +1051,174 @@ foreach nf = [1, 2, 4, 8] in {
}
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
} // Predicates = [HasStdExtV]
+
+let Predicates = [HasStdExtZvlsseg] in {
+ foreach nf=2-8 in {
+ def VLSEG#nf#E8_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStride, LSWidth8, "vlseg"#nf#"e8.v">;
+ def VLSEG#nf#E16_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStride, LSWidth16, "vlseg"#nf#"e16.v">;
+ def VLSEG#nf#E32_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStride, LSWidth32, "vlseg"#nf#"e32.v">;
+ def VLSEG#nf#E64_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStride, LSWidth64, "vlseg"#nf#"e64.v">;
+ def VLSEG#nf#E128_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStride, LSWidth128, "vlseg"#nf#"e128.v">;
+ def VLSEG#nf#E256_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStride, LSWidth256, "vlseg"#nf#"e256.v">;
+ def VLSEG#nf#E512_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStride, LSWidth512, "vlseg"#nf#"e512.v">;
+ def VLSEG#nf#E1024_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStride, LSWidth1024, "vlseg"#nf#"e1024.v">;
+
+ def VLSEG#nf#E8FF_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStrideFF, LSWidth8, "vlseg"#nf#"e8ff.v">;
+ def VLSEG#nf#E16FF_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStrideFF, LSWidth16, "vlseg"#nf#"e16ff.v">;
+ def VLSEG#nf#E32FF_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStrideFF, LSWidth32, "vlseg"#nf#"e32ff.v">;
+ def VLSEG#nf#E64FF_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStrideFF, LSWidth64, "vlseg"#nf#"e64ff.v">;
+ def VLSEG#nf#E128FF_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStrideFF, LSWidth128, "vlseg"#nf#"e128ff.v">;
+ def VLSEG#nf#E256FF_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStrideFF, LSWidth256, "vlseg"#nf#"e256ff.v">;
+ def VLSEG#nf#E512FF_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStrideFF, LSWidth512, "vlseg"#nf#"e512ff.v">;
+ def VLSEG#nf#E1024FF_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStrideFF, LSWidth1024, "vlseg"#nf#"e1024ff.v">;
+
+ def VSSEG#nf#E8_V : VUnitStrideSegmentStore<!add(nf, -1), LSWidth8, "vsseg"#nf#"e8.v">;
+ def VSSEG#nf#E16_V : VUnitStrideSegmentStore<!add(nf, -1), LSWidth16, "vsseg"#nf#"e16.v">;
+ def VSSEG#nf#E32_V : VUnitStrideSegmentStore<!add(nf, -1), LSWidth32, "vsseg"#nf#"e32.v">;
+ def VSSEG#nf#E64_V : VUnitStrideSegmentStore<!add(nf, -1), LSWidth64, "vsseg"#nf#"e64.v">;
+ def VSSEG#nf#E128_V : VUnitStrideSegmentStore<!add(nf, -1), LSWidth128, "vsseg"#nf#"e128.v">;
+ def VSSEG#nf#E256_V : VUnitStrideSegmentStore<!add(nf, -1), LSWidth256, "vsseg"#nf#"e256.v">;
+ def VSSEG#nf#E512_V : VUnitStrideSegmentStore<!add(nf, -1), LSWidth512, "vsseg"#nf#"e512.v">;
+ def VSSEG#nf#E1024_V : VUnitStrideSegmentStore<!add(nf, -1), LSWidth1024, "vsseg"#nf#"e1024.v">;
+
+ // Vector Strided Instructions
+ def VLSSEG#nf#E8_V : VStridedSegmentLoad<!add(nf, -1), LSWidth8, "vlsseg"#nf#"e8.v">;
+ def VLSSEG#nf#E16_V : VStridedSegmentLoad<!add(nf, -1), LSWidth16, "vlsseg"#nf#"e16.v">;
+ def VLSSEG#nf#E32_V : VStridedSegmentLoad<!add(nf, -1), LSWidth32, "vlsseg"#nf#"e32.v">;
+ def VLSSEG#nf#E64_V : VStridedSegmentLoad<!add(nf, -1), LSWidth64, "vlsseg"#nf#"e64.v">;
+ def VLSSEG#nf#E128_V : VStridedSegmentLoad<!add(nf, -1), LSWidth128, "vlsseg"#nf#"e128.v">;
+ def VLSSEG#nf#E256_V : VStridedSegmentLoad<!add(nf, -1), LSWidth256, "vlsseg"#nf#"e256.v">;
+ def VLSSEG#nf#E512_V : VStridedSegmentLoad<!add(nf, -1), LSWidth512, "vlsseg"#nf#"e512.v">;
+ def VLSSEG#nf#E1024_V : VStridedSegmentLoad<!add(nf, -1), LSWidth1024, "vlsseg"#nf#"e1024.v">;
+
+ def VSSSEG#nf#E8_V : VStridedSegmentStore<!add(nf, -1), LSWidth8, "vssseg"#nf#"e8.v">;
+ def VSSSEG#nf#E16_V : VStridedSegmentStore<!add(nf, -1), LSWidth16, "vssseg"#nf#"e16.v">;
+ def VSSSEG#nf#E32_V : VStridedSegmentStore<!add(nf, -1), LSWidth32, "vssseg"#nf#"e32.v">;
+ def VSSSEG#nf#E64_V : VStridedSegmentStore<!add(nf, -1), LSWidth64, "vssseg"#nf#"e64.v">;
+ def VSSSEG#nf#E128_V : VStridedSegmentStore<!add(nf, -1), LSWidth128, "vssseg"#nf#"e128.v">;
+ def VSSSEG#nf#E256_V : VStridedSegmentStore<!add(nf, -1), LSWidth256, "vssseg"#nf#"e256.v">;
+ def VSSSEG#nf#E512_V : VStridedSegmentStore<!add(nf, -1), LSWidth512, "vssseg"#nf#"e512.v">;
+ def VSSSEG#nf#E1024_V : VStridedSegmentStore<!add(nf, -1), LSWidth1024, "vssseg"#nf#"e1024.v">;
+
+ // Vector Indexed Instructions
+ def VLUXSEG#nf#EI8_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord,
+ LSWidth8, "vluxseg"#nf#"ei8.v">;
+ def VLUXSEG#nf#EI16_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord,
+ LSWidth16, "vluxseg"#nf#"ei16.v">;
+ def VLUXSEG#nf#EI32_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord,
+ LSWidth32, "vluxseg"#nf#"ei32.v">;
+ def VLUXSEG#nf#EI64_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord,
+ LSWidth64, "vluxseg"#nf#"ei64.v">;
+ def VLUXSEG#nf#EI128_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord,
+ LSWidth128, "vluxseg"#nf#"ei128.v">;
+ def VLUXSEG#nf#EI256_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord,
+ LSWidth256, "vluxseg"#nf#"ei256.v">;
+ def VLUXSEG#nf#EI512_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord,
+ LSWidth512, "vluxseg"#nf#"ei512.v">;
+ def VLUXSEG#nf#EI1024_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord,
+ LSWidth1024, "vluxseg"#nf#"ei1024.v">;
+
+ def VLOXSEG#nf#EI8_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder,
+ LSWidth8, "vloxseg"#nf#"ei8.v">;
+ def VLOXSEG#nf#EI16_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder,
+ LSWidth16, "vloxseg"#nf#"ei16.v">;
+ def VLOXSEG#nf#EI32_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder,
+ LSWidth32, "vloxseg"#nf#"ei32.v">;
+ def VLOXSEG#nf#EI64_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder,
+ LSWidth64, "vloxseg"#nf#"ei64.v">;
+ def VLOXSEG#nf#EI128_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder,
+ LSWidth128, "vloxseg"#nf#"ei128.v">;
+ def VLOXSEG#nf#EI256_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder,
+ LSWidth256, "vloxseg"#nf#"ei256.v">;
+ def VLOXSEG#nf#EI512_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder,
+ LSWidth512, "vloxseg"#nf#"ei512.v">;
+ def VLOXSEG#nf#EI1024_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder,
+ LSWidth1024, "vloxseg"#nf#"ei1024.v">;
+
+ def VSUXSEG#nf#EI8_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord,
+ LSWidth8, "vsuxseg"#nf#"ei8.v">;
+ def VSUXSEG#nf#EI16_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord,
+ LSWidth16, "vsuxseg"#nf#"ei16.v">;
+ def VSUXSEG#nf#EI32_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord,
+ LSWidth32, "vsuxseg"#nf#"ei32.v">;
+ def VSUXSEG#nf#EI64_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord,
+ LSWidth64, "vsuxseg"#nf#"ei64.v">;
+ def VSUXSEG#nf#EI128_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord,
+ LSWidth128, "vsuxseg"#nf#"ei128.v">;
+ def VSUXSEG#nf#EI256_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord,
+ LSWidth256, "vsuxseg"#nf#"ei256.v">;
+ def VSUXSEG#nf#EI512_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord,
+ LSWidth512, "vsuxseg"#nf#"ei512.v">;
+ def VSUXSEG#nf#EI1024_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord,
+ LSWidth1024, "vsuxseg"#nf#"ei1024.v">;
+
+ def VSOXSEG#nf#EI8_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder,
+ LSWidth8, "vsoxseg"#nf#"ei8.v">;
+ def VSOXSEG#nf#EI16_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder,
+ LSWidth16, "vsoxseg"#nf#"ei16.v">;
+ def VSOXSEG#nf#EI32_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder,
+ LSWidth32, "vsoxseg"#nf#"ei32.v">;
+ def VSOXSEG#nf#EI64_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder,
+ LSWidth64, "vsoxseg"#nf#"ei64.v">;
+ def VSOXSEG#nf#EI128_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder,
+ LSWidth128, "vsoxseg"#nf#"ei128.v">;
+ def VSOXSEG#nf#EI256_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder,
+ LSWidth256, "vsoxseg"#nf#"ei256.v">;
+ def VSOXSEG#nf#EI512_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder,
+ LSWidth512, "vsoxseg"#nf#"ei512.v">;
+ def VSOXSEG#nf#EI1024_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder,
+ LSWidth1024, "vsoxseg"#nf#"ei1024.v">;
+ }
+} // Predicates = [HasStdExtZvlsseg]
+
+let Predicates = [HasStdExtZvamo, HasStdExtA] in {
+ defm VAMOSWAPEI8 : VAMO<AMOOPVamoSwap, LSWidth8, "vamoswapei8.v">;
+ defm VAMOSWAPEI16 : VAMO<AMOOPVamoSwap, LSWidth16, "vamoswapei16.v">;
+ defm VAMOSWAPEI32 : VAMO<AMOOPVamoSwap, LSWidth32, "vamoswapei32.v">;
+
+ defm VAMOADDEI8 : VAMO<AMOOPVamoAdd, LSWidth8, "vamoaddei8.v">;
+ defm VAMOADDEI16 : VAMO<AMOOPVamoAdd, LSWidth16, "vamoaddei16.v">;
+ defm VAMOADDEI32 : VAMO<AMOOPVamoAdd, LSWidth32, "vamoaddei32.v">;
+
+ defm VAMOXOREI8 : VAMO<AMOOPVamoXor, LSWidth8, "vamoxorei8.v">;
+ defm VAMOXOREI16 : VAMO<AMOOPVamoXor, LSWidth16, "vamoxorei16.v">;
+ defm VAMOXOREI32 : VAMO<AMOOPVamoXor, LSWidth32, "vamoxorei32.v">;
+
+ defm VAMOANDEI8 : VAMO<AMOOPVamoAnd, LSWidth8, "vamoandei8.v">;
+ defm VAMOANDEI16 : VAMO<AMOOPVamoAnd, LSWidth16, "vamoandei16.v">;
+ defm VAMOANDEI32 : VAMO<AMOOPVamoAnd, LSWidth32, "vamoandei32.v">;
+
+ defm VAMOOREI8 : VAMO<AMOOPVamoOr, LSWidth8, "vamoorei8.v">;
+ defm VAMOOREI16 : VAMO<AMOOPVamoOr, LSWidth16, "vamoorei16.v">;
+ defm VAMOOREI32 : VAMO<AMOOPVamoOr, LSWidth32, "vamoorei32.v">;
+
+ defm VAMOMINEI8 : VAMO<AMOOPVamoMin, LSWidth8, "vamominei8.v">;
+ defm VAMOMINEI16 : VAMO<AMOOPVamoMin, LSWidth16, "vamominei16.v">;
+ defm VAMOMINEI32 : VAMO<AMOOPVamoMin, LSWidth32, "vamominei32.v">;
+
+ defm VAMOMAXEI8 : VAMO<AMOOPVamoMax, LSWidth8, "vamomaxei8.v">;
+ defm VAMOMAXEI16 : VAMO<AMOOPVamoMax, LSWidth16, "vamomaxei16.v">;
+ defm VAMOMAXEI32 : VAMO<AMOOPVamoMax, LSWidth32, "vamomaxei32.v">;
+
+ defm VAMOMINUEI8 : VAMO<AMOOPVamoMinu, LSWidth8, "vamominuei8.v">;
+ defm VAMOMINUEI16 : VAMO<AMOOPVamoMinu, LSWidth16, "vamominuei16.v">;
+ defm VAMOMINUEI32 : VAMO<AMOOPVamoMinu, LSWidth32, "vamominuei32.v">;
+
+ defm VAMOMAXUEI8 : VAMO<AMOOPVamoMaxu, LSWidth8, "vamomaxuei8.v">;
+ defm VAMOMAXUEI16 : VAMO<AMOOPVamoMaxu, LSWidth16, "vamomaxuei16.v">;
+ defm VAMOMAXUEI32 : VAMO<AMOOPVamoMaxu, LSWidth32, "vamomaxuei32.v">;
+} // Predicates = [HasStdExtZvamo, HasStdExtA]
+
+let Predicates = [HasStdExtZvamo, HasStdExtA, IsRV64] in {
+ defm VAMOSWAPEI64 : VAMO<AMOOPVamoSwap, LSWidth64, "vamoswapei64.v">;
+ defm VAMOADDEI64 : VAMO<AMOOPVamoAdd, LSWidth64, "vamoaddei64.v">;
+ defm VAMOXOREI64 : VAMO<AMOOPVamoXor, LSWidth64, "vamoxorei64.v">;
+ defm VAMOANDEI64 : VAMO<AMOOPVamoAnd, LSWidth64, "vamoandei64.v">;
+ defm VAMOOREI64 : VAMO<AMOOPVamoOr, LSWidth64, "vamoorei64.v">;
+ defm VAMOMINEI64 : VAMO<AMOOPVamoMin, LSWidth64, "vamominei64.v">;
+ defm VAMOMAXEI64 : VAMO<AMOOPVamoMax, LSWidth64, "vamomaxei64.v">;
+ defm VAMOMINUEI64 : VAMO<AMOOPVamoMinu, LSWidth64, "vamominuei64.v">;
+ defm VAMOMAXUEI64 : VAMO<AMOOPVamoMaxu, LSWidth64, "vamomaxuei64.v">;
+} // Predicates = [HasStdExtZvamo, HasStdExtA, IsRV64]
+
+include "RISCVInstrInfoVPseudos.td"
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
new file mode 100644
index 000000000000..06e4d053d5d7
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -0,0 +1,4397 @@
+//===-- RISCVInstrInfoVPseudos.td - RISC-V 'V' Pseudos -----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file contains the required infrastructure to support code generation
+/// for the standard 'V' (Vector) extension, version 0.9. This version is still
+/// experimental as the 'V' extension hasn't been ratified yet.
+///
+/// This file is included from RISCVInstrInfoV.td
+///
+//===----------------------------------------------------------------------===//
+
+def riscv_vmv_x_s : SDNode<"RISCVISD::VMV_X_S",
+ SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVec<1>,
+ SDTCisInt<1>]>>;
+def riscv_read_vlenb : SDNode<"RISCVISD::READ_VLENB",
+ SDTypeProfile<1, 0, [SDTCisVT<0, XLenVT>]>>;
+
+def riscv_vleff : SDNode<"RISCVISD::VLEFF",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisPtrTy<1>,
+ SDTCisVT<2, XLenVT>]>,
+ [SDNPHasChain, SDNPOutGlue, SDNPMayLoad,
+ SDNPSideEffect]>;
+def riscv_vleff_mask : SDNode<"RISCVISD::VLEFF_MASK",
+ SDTypeProfile<1, 4, [SDTCisVec<0>,
+ SDTCisSameAs<0, 1>,
+ SDTCisPtrTy<2>,
+ SDTCVecEltisVT<3, i1>,
+ SDTCisVT<4, XLenVT>]>,
+ [SDNPHasChain, SDNPOutGlue, SDNPMayLoad,
+ SDNPSideEffect]>;
+def riscv_read_vl : SDNode<"RISCVISD::READ_VL",
+ SDTypeProfile<1, 0, [SDTCisVT<0, XLenVT>]>,
+ [SDNPInGlue]>;
+
+// X0 has special meaning for vsetvl/vsetvli.
+// rd | rs1 | AVL value | Effect on vl
+//--------------------------------------------------------------
+// !X0 | X0 | VLMAX | Set vl to VLMAX
+// X0 | X0 | Value in vl | Keep current vl, just change vtype.
+def NoX0 : SDNodeXForm<undef,
+[{
+ auto *C = dyn_cast<ConstantSDNode>(N);
+ if (C && C->isNullValue()) {
+ SDLoc DL(N);
+ return SDValue(CurDAG->getMachineNode(RISCV::ADDI, DL, Subtarget->getXLenVT(),
+ CurDAG->getRegister(RISCV::X0, Subtarget->getXLenVT()),
+ CurDAG->getTargetConstant(0, DL, Subtarget->getXLenVT())), 0);
+ }
+ return SDValue(N, 0);
+}]>;
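+// Put differently: if the selected AVL is the literal constant 0, NoX0
+// rematerializes it as "addi <reg>, x0, 0" so that register allocation should
+// not end up reading X0 itself and accidentally select the "keep current vl"
+// form described in the table above.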
+
+def DecImm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue() - 1, SDLoc(N),
+ N->getValueType(0));
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Utilities.
+//===----------------------------------------------------------------------===//
+
+// This class describes information associated with the LMUL.
+class LMULInfo<int lmul, VReg regclass, VReg wregclass,
+ VReg f2regclass, VReg f4regclass, VReg f8regclass, string mx> {
+ bits<3> value = lmul; // This is encoded as the vlmul field of vtype.
+ VReg vrclass = regclass;
+ VReg wvrclass = wregclass;
+ VReg f8vrclass = f8regclass;
+ VReg f4vrclass = f4regclass;
+ VReg f2vrclass = f2regclass;
+ string MX = mx;
+}
+
+// Associate LMUL with tablegen records of register classes.
+def V_M1 : LMULInfo<0b000, VR, VRM2, VR, VR, VR, "M1">;
+def V_M2 : LMULInfo<0b001, VRM2, VRM4, VR, VR, VR, "M2">;
+def V_M4 : LMULInfo<0b010, VRM4, VRM8, VRM2, VR, VR, "M4">;
+def V_M8 : LMULInfo<0b011, VRM8,/*NoVReg*/VR, VRM4, VRM2, VR, "M8">;
+
+def V_MF8 : LMULInfo<0b101, VR, VR,/*NoVReg*/VR,/*NoVReg*/VR,/*NoVReg*/VR, "MF8">;
+def V_MF4 : LMULInfo<0b110, VR, VR, VR,/*NoVReg*/VR,/*NoVReg*/VR, "MF4">;
+def V_MF2 : LMULInfo<0b111, VR, VR, VR, VR,/*NoVReg*/VR, "MF2">;
+
+// Used to iterate over all possible LMULs.
+def MxList {
+ list<LMULInfo> m = [V_MF8, V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8];
+}
+
+class FPR_Info<RegisterClass regclass, string fx> {
+ RegisterClass fprclass = regclass;
+ string FX = fx;
+}
+
+def SCALAR_F16 : FPR_Info<FPR16, "F16">;
+def SCALAR_F32 : FPR_Info<FPR32, "F32">;
+def SCALAR_F64 : FPR_Info<FPR64, "F64">;
+
+def FPList {
+ list<FPR_Info> fpinfo = [SCALAR_F16, SCALAR_F32, SCALAR_F64];
+}
+
+class MxSet<int eew> {
+ list<LMULInfo> m = !cond(!eq(eew, 8) : [V_MF8, V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8],
+ !eq(eew, 16) : [V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8],
+ !eq(eew, 32) : [V_MF2, V_M1, V_M2, V_M4, V_M8],
+ !eq(eew, 64) : [V_M1, V_M2, V_M4, V_M8]);
+}
+
+class NFSet<LMULInfo m> {
+ list<int> L = !cond(!eq(m.value, V_M8.value): [],
+ !eq(m.value, V_M4.value): [2],
+ !eq(m.value, V_M2.value): [2, 3, 4],
+ true: [2, 3, 4, 5, 6, 7, 8]);
+}
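+// Illustrative note: NFSet keeps only the segment counts nf for which the
+// register group still fits, presumably because nf * LMUL may not exceed the
+// 8 registers of a group (so M8 allows none, M4 only nf=2, M2 up to nf=4).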
+
+class shift_amount<int num> {
+ int val = !if(!eq(num, 1), 0, !add(1, shift_amount<!srl(num, 1)>.val));
+}
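+// For powers of two this computes log2, e.g. shift_amount<1>.val = 0,
+// shift_amount<4>.val = 2 and shift_amount<64>.val = 6.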
+
+class octuple_from_str<string MX> {
+ int ret = !cond(!eq(MX, "MF8") : 1,
+ !eq(MX, "MF4") : 2,
+ !eq(MX, "MF2") : 4,
+ !eq(MX, "M1") : 8,
+ !eq(MX, "M2") : 16,
+ !eq(MX, "M4") : 32,
+ !eq(MX, "M8") : 64);
+}
+
+class octuple_to_str<int octuple> {
+ string ret = !if(!eq(octuple, 1), "MF8",
+ !if(!eq(octuple, 2), "MF4",
+ !if(!eq(octuple, 4), "MF2",
+ !if(!eq(octuple, 8), "M1",
+ !if(!eq(octuple, 16), "M2",
+ !if(!eq(octuple, 32), "M4",
+ !if(!eq(octuple, 64), "M8",
+ "NoDef")))))));
+}
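+// The "octuple" encoding is simply LMUL scaled by 8 so fractional LMULs map
+// to integers (MF8 -> 1, M1 -> 8, M8 -> 64); e.g.
+// octuple_to_str<octuple_from_str<"MF2">.ret>.ret round-trips to "MF2".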
+
+// Output pattern for X0 used to represent VLMAX in the pseudo instructions.
+def VLMax : OutPatFrag<(ops), (XLenVT X0)>;
+
+// List of EEW.
+defvar EEWList = [8, 16, 32, 64];
+
+class SegRegClass<LMULInfo m, int nf> {
+ VReg RC = !cast<VReg>("VRN" # nf # !cond(!eq(m.value, V_MF8.value): V_M1.MX,
+ !eq(m.value, V_MF4.value): V_M1.MX,
+ !eq(m.value, V_MF2.value): V_M1.MX,
+ true: m.MX));
+}
+
+//===----------------------------------------------------------------------===//
+// Vector register and vector group type information.
+//===----------------------------------------------------------------------===//
+
+class VTypeInfo<ValueType Vec, ValueType Mas, int Sew, VReg Reg, LMULInfo M,
+ ValueType Scal = XLenVT, RegisterClass ScalarReg = GPR>
+{
+ ValueType Vector = Vec;
+ ValueType Mask = Mas;
+ int SEW = Sew;
+ VReg RegClass = Reg;
+ LMULInfo LMul = M;
+ ValueType Scalar = Scal;
+ RegisterClass ScalarRegClass = ScalarReg;
+ // The pattern fragment which produces the AVL operand, representing the
+ // "natural" vector length for this type. For scalable vectors this is VLMax.
+ OutPatFrag AVL = VLMax;
+
+ string ScalarSuffix = !cond(!eq(Scal, XLenVT) : "X",
+ !eq(Scal, f16) : "F16",
+ !eq(Scal, f32) : "F32",
+ !eq(Scal, f64) : "F64");
+}
+
+class GroupVTypeInfo<ValueType Vec, ValueType VecM1, ValueType Mas, int Sew,
+ VReg Reg, LMULInfo M, ValueType Scal = XLenVT,
+ RegisterClass ScalarReg = GPR>
+ : VTypeInfo<Vec, Mas, Sew, Reg, M, Scal, ScalarReg>
+{
+ ValueType VectorM1 = VecM1;
+}
+
+defset list<VTypeInfo> AllVectors = {
+ defset list<VTypeInfo> AllIntegerVectors = {
+ defset list<VTypeInfo> NoGroupIntegerVectors = {
+ def VI8MF8: VTypeInfo<vint8mf8_t, vbool64_t, 8, VR, V_MF8>;
+ def VI8MF4: VTypeInfo<vint8mf4_t, vbool32_t, 8, VR, V_MF4>;
+ def VI8MF2: VTypeInfo<vint8mf2_t, vbool16_t, 8, VR, V_MF2>;
+ def VI8M1: VTypeInfo<vint8m1_t, vbool8_t, 8, VR, V_M1>;
+ def VI16MF4: VTypeInfo<vint16mf4_t, vbool64_t, 16, VR, V_MF4>;
+ def VI16MF2: VTypeInfo<vint16mf2_t, vbool32_t, 16, VR, V_MF2>;
+ def VI16M1: VTypeInfo<vint16m1_t, vbool16_t, 16, VR, V_M1>;
+ def VI32MF2: VTypeInfo<vint32mf2_t, vbool64_t, 32, VR, V_MF2>;
+ def VI32M1: VTypeInfo<vint32m1_t, vbool32_t, 32, VR, V_M1>;
+ def VI64M1: VTypeInfo<vint64m1_t, vbool64_t, 64, VR, V_M1>;
+ }
+ defset list<GroupVTypeInfo> GroupIntegerVectors = {
+ def VI8M2: GroupVTypeInfo<vint8m2_t, vint8m1_t, vbool4_t, 8, VRM2, V_M2>;
+ def VI8M4: GroupVTypeInfo<vint8m4_t, vint8m1_t, vbool2_t, 8, VRM4, V_M4>;
+ def VI8M8: GroupVTypeInfo<vint8m8_t, vint8m1_t, vbool1_t, 8, VRM8, V_M8>;
+
+ def VI16M2: GroupVTypeInfo<vint16m2_t,vint16m1_t,vbool8_t, 16,VRM2, V_M2>;
+ def VI16M4: GroupVTypeInfo<vint16m4_t,vint16m1_t,vbool4_t, 16,VRM4, V_M4>;
+ def VI16M8: GroupVTypeInfo<vint16m8_t,vint16m1_t,vbool2_t, 16,VRM8, V_M8>;
+
+ def VI32M2: GroupVTypeInfo<vint32m2_t,vint32m1_t,vbool16_t,32,VRM2, V_M2>;
+ def VI32M4: GroupVTypeInfo<vint32m4_t,vint32m1_t,vbool8_t, 32,VRM4, V_M4>;
+ def VI32M8: GroupVTypeInfo<vint32m8_t,vint32m1_t,vbool4_t, 32,VRM8, V_M8>;
+
+ def VI64M2: GroupVTypeInfo<vint64m2_t,vint64m1_t,vbool32_t,64,VRM2, V_M2>;
+ def VI64M4: GroupVTypeInfo<vint64m4_t,vint64m1_t,vbool16_t,64,VRM4, V_M4>;
+ def VI64M8: GroupVTypeInfo<vint64m8_t,vint64m1_t,vbool8_t, 64,VRM8, V_M8>;
+ }
+ }
+
+ defset list<VTypeInfo> AllFloatVectors = {
+ defset list<VTypeInfo> NoGroupFloatVectors = {
+ def VF16MF4: VTypeInfo<vfloat16mf4_t, vbool64_t, 16, VR, V_MF4, f16, FPR16>;
+ def VF16MF2: VTypeInfo<vfloat16mf2_t, vbool32_t, 16, VR, V_MF2, f16, FPR16>;
+ def VF16M1: VTypeInfo<vfloat16m1_t, vbool16_t, 16, VR, V_M1, f16, FPR16>;
+
+ def VF32MF2: VTypeInfo<vfloat32mf2_t,vbool64_t, 32, VR, V_MF2, f32, FPR32>;
+ def VF32M1: VTypeInfo<vfloat32m1_t, vbool32_t, 32, VR, V_M1, f32, FPR32>;
+
+ def VF64M1: VTypeInfo<vfloat64m1_t, vbool64_t, 64, VR, V_M1, f64, FPR64>;
+ }
+
+ defset list<GroupVTypeInfo> GroupFloatVectors = {
+ def VF16M2: GroupVTypeInfo<vfloat16m2_t, vfloat16m1_t, vbool8_t, 16,
+ VRM2, V_M2, f16, FPR16>;
+ def VF16M4: GroupVTypeInfo<vfloat16m4_t, vfloat16m1_t, vbool4_t, 16,
+ VRM4, V_M4, f16, FPR16>;
+ def VF16M8: GroupVTypeInfo<vfloat16m8_t, vfloat16m1_t, vbool2_t, 16,
+ VRM8, V_M8, f16, FPR16>;
+
+ def VF32M2: GroupVTypeInfo<vfloat32m2_t, vfloat32m1_t, vbool16_t, 32,
+ VRM2, V_M2, f32, FPR32>;
+ def VF32M4: GroupVTypeInfo<vfloat32m4_t, vfloat32m1_t, vbool8_t, 32,
+ VRM4, V_M4, f32, FPR32>;
+ def VF32M8: GroupVTypeInfo<vfloat32m8_t, vfloat32m1_t, vbool4_t, 32,
+ VRM8, V_M8, f32, FPR32>;
+
+ def VF64M2: GroupVTypeInfo<vfloat64m2_t, vfloat64m1_t, vbool32_t, 64,
+ VRM2, V_M2, f64, FPR64>;
+ def VF64M4: GroupVTypeInfo<vfloat64m4_t, vfloat64m1_t, vbool16_t, 64,
+ VRM4, V_M4, f64, FPR64>;
+ def VF64M8: GroupVTypeInfo<vfloat64m8_t, vfloat64m1_t, vbool8_t, 64,
+ VRM8, V_M8, f64, FPR64>;
+ }
+ }
+}
+
+// This functor is used to obtain the integer vector type that has the same
+// SEW and LMUL as the input parameter type.
+class GetIntVTypeInfo<VTypeInfo vti>
+{
+ // Equivalent integer vector type. Eg.
+ // VI8M1 → VI8M1 (identity)
+ // VF64M4 → VI64M4
+ VTypeInfo Vti = !cast<VTypeInfo>(!subst("VF", "VI", !cast<string>(vti)));
+}
+
+class MTypeInfo<ValueType Mas, LMULInfo M, string Bx> {
+ ValueType Mask = Mas;
+ // {SEW, VLMul} values set a valid VType to deal with this mask type.
+  // We assume SEW=8 and set the corresponding LMUL.
+ int SEW = 8;
+ LMULInfo LMul = M;
+ string BX = Bx; // Appendix of mask operations.
+ // The pattern fragment which produces the AVL operand, representing the
+ // "natural" vector length for this mask type. For scalable masks this is
+ // VLMax.
+ OutPatFrag AVL = VLMax;
+}
+
+defset list<MTypeInfo> AllMasks = {
+  // vbool<n>_t, <n> = SEW/LMUL; we assume SEW=8 and the corresponding LMUL.
+ def : MTypeInfo<vbool64_t, V_MF8, "B1">;
+ def : MTypeInfo<vbool32_t, V_MF4, "B2">;
+ def : MTypeInfo<vbool16_t, V_MF2, "B4">;
+ def : MTypeInfo<vbool8_t, V_M1, "B8">;
+ def : MTypeInfo<vbool4_t, V_M2, "B16">;
+ def : MTypeInfo<vbool2_t, V_M4, "B32">;
+ def : MTypeInfo<vbool1_t, V_M8, "B64">;
+}
+
+class VTypeInfoToWide<VTypeInfo vti, VTypeInfo wti>
+{
+ VTypeInfo Vti = vti;
+ VTypeInfo Wti = wti;
+}
+
+class VTypeInfoToFraction<VTypeInfo vti, VTypeInfo fti>
+{
+ VTypeInfo Vti = vti;
+ VTypeInfo Fti = fti;
+}
+
+defset list<VTypeInfoToWide> AllWidenableIntVectors = {
+ def : VTypeInfoToWide<VI8MF8, VI16MF4>;
+ def : VTypeInfoToWide<VI8MF4, VI16MF2>;
+ def : VTypeInfoToWide<VI8MF2, VI16M1>;
+ def : VTypeInfoToWide<VI8M1, VI16M2>;
+ def : VTypeInfoToWide<VI8M2, VI16M4>;
+ def : VTypeInfoToWide<VI8M4, VI16M8>;
+
+ def : VTypeInfoToWide<VI16MF4, VI32MF2>;
+ def : VTypeInfoToWide<VI16MF2, VI32M1>;
+ def : VTypeInfoToWide<VI16M1, VI32M2>;
+ def : VTypeInfoToWide<VI16M2, VI32M4>;
+ def : VTypeInfoToWide<VI16M4, VI32M8>;
+
+ def : VTypeInfoToWide<VI32MF2, VI64M1>;
+ def : VTypeInfoToWide<VI32M1, VI64M2>;
+ def : VTypeInfoToWide<VI32M2, VI64M4>;
+ def : VTypeInfoToWide<VI32M4, VI64M8>;
+}
+
+defset list<VTypeInfoToWide> AllWidenableFloatVectors = {
+ def : VTypeInfoToWide<VF16MF4, VF32MF2>;
+ def : VTypeInfoToWide<VF16MF2, VF32M1>;
+ def : VTypeInfoToWide<VF16M1, VF32M2>;
+ def : VTypeInfoToWide<VF16M2, VF32M4>;
+ def : VTypeInfoToWide<VF16M4, VF32M8>;
+
+ def : VTypeInfoToWide<VF32MF2, VF64M1>;
+ def : VTypeInfoToWide<VF32M1, VF64M2>;
+ def : VTypeInfoToWide<VF32M2, VF64M4>;
+ def : VTypeInfoToWide<VF32M4, VF64M8>;
+}
+
+defset list<VTypeInfoToFraction> AllFractionableVF2IntVectors = {
+ def : VTypeInfoToFraction<VI16MF4, VI8MF8>;
+ def : VTypeInfoToFraction<VI16MF2, VI8MF4>;
+ def : VTypeInfoToFraction<VI16M1, VI8MF2>;
+ def : VTypeInfoToFraction<VI16M2, VI8M1>;
+ def : VTypeInfoToFraction<VI16M4, VI8M2>;
+ def : VTypeInfoToFraction<VI16M8, VI8M4>;
+ def : VTypeInfoToFraction<VI32MF2, VI16MF4>;
+ def : VTypeInfoToFraction<VI32M1, VI16MF2>;
+ def : VTypeInfoToFraction<VI32M2, VI16M1>;
+ def : VTypeInfoToFraction<VI32M4, VI16M2>;
+ def : VTypeInfoToFraction<VI32M8, VI16M4>;
+ def : VTypeInfoToFraction<VI64M1, VI32MF2>;
+ def : VTypeInfoToFraction<VI64M2, VI32M1>;
+ def : VTypeInfoToFraction<VI64M4, VI32M2>;
+ def : VTypeInfoToFraction<VI64M8, VI32M4>;
+}
+
+defset list<VTypeInfoToFraction> AllFractionableVF4IntVectors = {
+ def : VTypeInfoToFraction<VI32MF2, VI8MF8>;
+ def : VTypeInfoToFraction<VI32M1, VI8MF4>;
+ def : VTypeInfoToFraction<VI32M2, VI8MF2>;
+ def : VTypeInfoToFraction<VI32M4, VI8M1>;
+ def : VTypeInfoToFraction<VI32M8, VI8M2>;
+ def : VTypeInfoToFraction<VI64M1, VI16MF4>;
+ def : VTypeInfoToFraction<VI64M2, VI16MF2>;
+ def : VTypeInfoToFraction<VI64M4, VI16M1>;
+ def : VTypeInfoToFraction<VI64M8, VI16M2>;
+}
+
+defset list<VTypeInfoToFraction> AllFractionableVF8IntVectors = {
+ def : VTypeInfoToFraction<VI64M1, VI8MF8>;
+ def : VTypeInfoToFraction<VI64M2, VI8MF4>;
+ def : VTypeInfoToFraction<VI64M4, VI8MF2>;
+ def : VTypeInfoToFraction<VI64M8, VI8M1>;
+}
+
+defset list<VTypeInfoToWide> AllWidenableIntToFloatVectors = {
+ def : VTypeInfoToWide<VI8MF8, VF16MF4>;
+ def : VTypeInfoToWide<VI8MF4, VF16MF2>;
+ def : VTypeInfoToWide<VI8MF2, VF16M1>;
+ def : VTypeInfoToWide<VI8M1, VF16M2>;
+ def : VTypeInfoToWide<VI8M2, VF16M4>;
+ def : VTypeInfoToWide<VI8M4, VF16M8>;
+
+ def : VTypeInfoToWide<VI16MF4, VF32MF2>;
+ def : VTypeInfoToWide<VI16MF2, VF32M1>;
+ def : VTypeInfoToWide<VI16M1, VF32M2>;
+ def : VTypeInfoToWide<VI16M2, VF32M4>;
+ def : VTypeInfoToWide<VI16M4, VF32M8>;
+
+ def : VTypeInfoToWide<VI32MF2, VF64M1>;
+ def : VTypeInfoToWide<VI32M1, VF64M2>;
+ def : VTypeInfoToWide<VI32M2, VF64M4>;
+ def : VTypeInfoToWide<VI32M4, VF64M8>;
+}
+
+// This class holds the record of the RISCVVPseudosTable below.
+// This represents the information we need in codegen for each pseudo.
+// The definition should be consistent with `struct PseudoInfo` in
+// RISCVBaseInfo.h.
+class CONST8b<bits<8> val> {
+ bits<8> V = val;
+}
+def InvalidIndex : CONST8b<0x80>;
+class RISCVVPseudo {
+ Pseudo Pseudo = !cast<Pseudo>(NAME); // Used as a key.
+ Instruction BaseInstr;
+}
+
+// The actual table.
+def RISCVVPseudosTable : GenericTable {
+ let FilterClass = "RISCVVPseudo";
+ let CppTypeName = "PseudoInfo";
+ let Fields = [ "Pseudo", "BaseInstr" ];
+ let PrimaryKey = [ "Pseudo" ];
+ let PrimaryKeyName = "getPseudoInfo";
+}
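+
+// Note: TableGen's searchable-tables backend is expected to emit, alongside
+// each GenericTable, a lookup function named after PrimaryKeyName, i.e. a
+// getPseudoInfo(Pseudo) query returning the matching PseudoInfo entry here.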
+
+def RISCVVIntrinsicsTable : GenericTable {
+ let FilterClass = "RISCVVIntrinsic";
+ let CppTypeName = "RISCVVIntrinsicInfo";
+ let Fields = ["IntrinsicID", "ExtendOperand"];
+ let PrimaryKey = ["IntrinsicID"];
+ let PrimaryKeyName = "getRISCVVIntrinsicInfo";
+}
+
+class RISCVZvlsseg<string IntrName, bits<11> S, bits<3> L, bits<3> IL = V_M1.value> {
+ Intrinsic IntrinsicID = !cast<Intrinsic>(IntrName);
+ bits<11> SEW = S;
+ bits<3> LMUL = L;
+ bits<3> IndexLMUL = IL;
+ Pseudo Pseudo = !cast<Pseudo>(NAME);
+}
+
+def RISCVZvlssegTable : GenericTable {
+ let FilterClass = "RISCVZvlsseg";
+ let Fields = ["IntrinsicID", "SEW", "LMUL", "IndexLMUL", "Pseudo"];
+ let PrimaryKey = ["IntrinsicID", "SEW", "LMUL", "IndexLMUL"];
+ let PrimaryKeyName = "getPseudo";
+}
+
+//===----------------------------------------------------------------------===//
+// Helpers to define the different pseudo instructions.
+//===----------------------------------------------------------------------===//
+
+class PseudoToVInst<string PseudoInst> {
+ string VInst = !subst("_M8", "",
+ !subst("_M4", "",
+ !subst("_M2", "",
+ !subst("_M1", "",
+ !subst("_MF2", "",
+ !subst("_MF4", "",
+ !subst("_MF8", "",
+ !subst("_B1", "",
+ !subst("_B2", "",
+ !subst("_B4", "",
+ !subst("_B8", "",
+ !subst("_B16", "",
+ !subst("_B32", "",
+ !subst("_B64", "",
+ !subst("_MASK", "",
+ !subst("F16", "F",
+ !subst("F32", "F",
+ !subst("F64", "F",
+ !subst("Pseudo", "", PseudoInst)))))))))))))))))));
+}
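+// For example, these substitutions map PseudoVADD_VV_M2_MASK back to the
+// base instruction name VADD_VV.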
+
+class ToLowerCase<string Upper> {
+ string L = !subst("FF", "ff",
+ !subst("VLSEG", "vlseg",
+ !subst("VLSSEG", "vlsseg",
+ !subst("VSSEG", "vsseg",
+ !subst("VSSSEG", "vssseg",
+ !subst("VLOXSEG", "vloxseg",
+ !subst("VLUXSEG", "vluxseg",
+ !subst("VSOXSEG", "vsoxseg",
+ !subst("VSUXSEG", "vsuxseg", Upper)))))))));
+}
+
+// Example: PseudoVLSEG2E32_V_M2 -> int_riscv_vlseg2
+// Example: PseudoVLSEG2E32_V_M2_MASK -> int_riscv_vlseg2_mask
+class PseudoToIntrinsic<string PseudoInst, bit IsMasked> {
+ string Intrinsic = !strconcat("int_riscv_",
+ ToLowerCase<
+ !subst("E8", "",
+ !subst("E16", "",
+ !subst("E32", "",
+ !subst("E64", "",
+ !subst("EI8", "",
+ !subst("EI16", "",
+ !subst("EI32", "",
+ !subst("EI64", "",
+ !subst("_V", "", PseudoToVInst<PseudoInst>.VInst)))))))))>.L,
+ !if(IsMasked, "_mask", ""));
+}
+
+// The destination vector register group for a masked vector instruction cannot
+// overlap the source mask register (v0), unless the destination vector register
+// is being written with a mask value (e.g., comparisons) or the scalar result
+// of a reduction.
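+// e.g. GetVRegNoV0<VRM2>.R is VRM2NoV0; any other register class is returned
+// unchanged.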
+class GetVRegNoV0<VReg VRegClass> {
+ VReg R = !cond(!eq(VRegClass, VR) : VRNoV0,
+ !eq(VRegClass, VRM2) : VRM2NoV0,
+ !eq(VRegClass, VRM4) : VRM4NoV0,
+ !eq(VRegClass, VRM8) : VRM8NoV0,
+ !eq(1, 1) : VRegClass);
+}
+
+// Join the strings in the list using the separator, ignoring empty elements.
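+// e.g. Join<["@earlyclobber $rd", "$rd = $merge"], ",">.ret yields
+// "@earlyclobber $rd,$rd = $merge", while an empty Constraint contributes
+// nothing: Join<["", "$rd = $merge"], ",">.ret is just "$rd = $merge".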
+class Join<list<string> strings, string separator> {
+ string ret = !foldl(!head(strings), !tail(strings), a, b,
+ !cond(
+ !and(!empty(a), !empty(b)) : "",
+ !empty(a) : b,
+ !empty(b) : a,
+ 1 : a#separator#b));
+}
+
+class VPseudo<Instruction instr, LMULInfo m, dag outs, dag ins> :
+ Pseudo<outs, ins, []>, RISCVVPseudo {
+ let BaseInstr = instr;
+ let VLMul = m.value;
+}
+
+class VPseudoUSLoadNoMask<VReg RetClass>:
+ Pseudo<(outs RetClass:$rd),
+ (ins GPR:$rs1, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo {
+ let mayLoad = 1;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasDummyMask = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoUSLoadMask<VReg RetClass>:
+ Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$merge,
+ GPR:$rs1,
+ VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo {
+ let mayLoad = 1;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Constraints = "$rd = $merge";
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasMergeOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoSLoadNoMask<VReg RetClass>:
+ Pseudo<(outs RetClass:$rd),
+ (ins GPR:$rs1, GPR:$rs2, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo {
+ let mayLoad = 1;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasDummyMask = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoSLoadMask<VReg RetClass>:
+ Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$merge,
+ GPR:$rs1, GPR:$rs2,
+ VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo {
+ let mayLoad = 1;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Constraints = "$rd = $merge";
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasMergeOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoILoadNoMask<VReg RetClass, VReg IdxClass>:
+ Pseudo<(outs RetClass:$rd),
+ (ins GPR:$rs1, IdxClass:$rs2, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo {
+ let mayLoad = 1;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasDummyMask = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoILoadMask<VReg RetClass, VReg IdxClass>:
+ Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$merge,
+ GPR:$rs1, IdxClass:$rs2,
+ VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo {
+ let mayLoad = 1;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Constraints = "$rd = $merge";
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasMergeOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoUSStoreNoMask<VReg StClass>:
+ Pseudo<(outs),
+ (ins StClass:$rd, GPR:$rs1, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 1;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasDummyMask = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoUSStoreMask<VReg StClass>:
+ Pseudo<(outs),
+ (ins StClass:$rd, GPR:$rs1, VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 1;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoSStoreNoMask<VReg StClass>:
+ Pseudo<(outs),
+ (ins StClass:$rd, GPR:$rs1, GPR:$rs2, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 1;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasDummyMask = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoSStoreMask<VReg StClass>:
+ Pseudo<(outs),
+ (ins StClass:$rd, GPR:$rs1, GPR:$rs2, VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 1;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+// Unary instruction that is never masked, so HasDummyMask=0.
+class VPseudoUnaryNoDummyMask<VReg RetClass,
+ DAGOperand Op2Class> :
+ Pseudo<(outs RetClass:$rd),
+ (ins Op2Class:$rs1, GPR:$vl, ixlenimm:$sew), []>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoNullaryNoMask<VReg RegClass>:
+ Pseudo<(outs RegClass:$rd),
+ (ins GPR:$vl, ixlenimm:$sew),
+ []>, RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasDummyMask = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoNullaryMask<VReg RegClass>:
+ Pseudo<(outs GetVRegNoV0<RegClass>.R:$rd),
+ (ins GetVRegNoV0<RegClass>.R:$merge, VMaskOp:$vm, GPR:$vl,
+ ixlenimm:$sew), []>, RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Constraints = "$rd = $merge";
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasMergeOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+// Nullary pseudo instructions. They are expanded in the
+// RISCVExpandPseudoInsts pass.
+class VPseudoNullaryPseudoM<string BaseInst>
+ : Pseudo<(outs VR:$rd), (ins GPR:$vl, ixlenimm:$sew), []>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ // BaseInstr is not used in the RISCVExpandPseudoInsts pass.
+ // Just fill in a corresponding real v-inst to pass the tablegen check.
+ let BaseInstr = !cast<Instruction>(BaseInst);
+}
+
+// RetClass could be GPR or VReg.
+class VPseudoUnaryNoMask<DAGOperand RetClass, VReg OpClass, string Constraint = ""> :
+ Pseudo<(outs RetClass:$rd),
+ (ins OpClass:$rs2, GPR:$vl, ixlenimm:$sew), []>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Constraints = Constraint;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasDummyMask = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoUnaryMask<VReg RetClass, VReg OpClass, string Constraint = ""> :
+ Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$merge, OpClass:$rs2,
+ VMaskOp:$vm, GPR:$vl, ixlenimm:$sew), []>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Constraints = Join<[Constraint, "$rd = $merge"], ",">.ret;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasMergeOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+// Mask unary operation without a maskedoff operand.
+class VPseudoMaskUnarySOutMask:
+ Pseudo<(outs GPR:$rd),
+ (ins VR:$rs1, VMaskOp:$vm, GPR:$vl, ixlenimm:$sew), []>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+// Masked mask operations have no $rd = $merge constraint.
+class VPseudoUnaryMOutMask:
+ Pseudo<(outs VR:$rd),
+ (ins VR:$merge, VR:$rs1, VMaskOp:$vm, GPR:$vl, ixlenimm:$sew), []>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Constraints = "$rd = $merge";
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasMergeOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+// The mask operand can be any vector register (v0-v31).
+class VPseudoUnaryAnyMask<VReg RetClass,
+ VReg Op1Class> :
+ Pseudo<(outs RetClass:$rd),
+ (ins RetClass:$merge,
+ Op1Class:$rs2,
+ VR:$vm, GPR:$vl, ixlenimm:$sew),
+ []>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Constraints = "@earlyclobber $rd, $rd = $merge";
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasMergeOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoBinaryNoMask<VReg RetClass,
+ VReg Op1Class,
+ DAGOperand Op2Class,
+ string Constraint> :
+ Pseudo<(outs RetClass:$rd),
+ (ins Op1Class:$rs2, Op2Class:$rs1, GPR:$vl, ixlenimm:$sew), []>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Constraints = Constraint;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasDummyMask = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoIStoreNoMask<VReg StClass, VReg IdxClass>:
+ Pseudo<(outs),
+ (ins StClass:$rd, GPR:$rs1, IdxClass:$rs2, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 1;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasDummyMask = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoIStoreMask<VReg StClass, VReg IdxClass>:
+ Pseudo<(outs),
+ (ins StClass:$rd, GPR:$rs1, IdxClass:$rs2, VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 1;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoBinaryMask<VReg RetClass,
+ VReg Op1Class,
+ DAGOperand Op2Class,
+ string Constraint> :
+ Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$merge,
+ Op1Class:$rs2, Op2Class:$rs1,
+ VMaskOp:$vm, GPR:$vl, ixlenimm:$sew), []>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Constraints = Join<[Constraint, "$rd = $merge"], ",">.ret;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasMergeOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoBinaryCarryIn<VReg RetClass,
+ VReg Op1Class,
+ DAGOperand Op2Class,
+ LMULInfo MInfo,
+ bit CarryIn,
+ string Constraint> :
+ Pseudo<(outs RetClass:$rd),
+ !if(CarryIn,
+ (ins Op1Class:$rs2, Op2Class:$rs1, VMV0:$carry, GPR:$vl,
+ ixlenimm:$sew),
+ (ins Op1Class:$rs2, Op2Class:$rs1, GPR:$vl, ixlenimm:$sew)), []>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Constraints = Constraint;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasMergeOp = 0;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+ let VLMul = MInfo.value;
+}
+
+class VPseudoTernaryNoMask<VReg RetClass,
+ VReg Op1Class,
+ DAGOperand Op2Class,
+ string Constraint> :
+ Pseudo<(outs RetClass:$rd),
+ (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
+ GPR:$vl, ixlenimm:$sew),
+ []>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Constraints = Join<[Constraint, "$rd = $rs3"], ",">.ret;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasMergeOp = 1;
+ let HasDummyMask = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoAMOWDNoMask<VReg RetClass,
+ VReg Op1Class> :
+ Pseudo<(outs GetVRegNoV0<RetClass>.R:$vd_wd),
+ (ins GPR:$rs1,
+ Op1Class:$vs2,
+ GetVRegNoV0<RetClass>.R:$vd,
+ GPR:$vl, ixlenimm:$sew), []>,
+ RISCVVPseudo {
+ let mayLoad = 1;
+ let mayStore = 1;
+ let hasSideEffects = 1;
+ let usesCustomInserter = 1;
+ let Constraints = "$vd_wd = $vd";
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasDummyMask = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoAMOWDMask<VReg RetClass,
+ VReg Op1Class> :
+ Pseudo<(outs GetVRegNoV0<RetClass>.R:$vd_wd),
+ (ins GPR:$rs1,
+ Op1Class:$vs2,
+ GetVRegNoV0<RetClass>.R:$vd,
+ VMaskOp:$vm, GPR:$vl, ixlenimm:$sew), []>,
+ RISCVVPseudo {
+ let mayLoad = 1;
+ let mayStore = 1;
+ let hasSideEffects = 1;
+ let usesCustomInserter = 1;
+ let Constraints = "$vd_wd = $vd";
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+multiclass VPseudoAMOEI<int eew> {
+ // Standard scalar AMOs support 32, 64, and 128 memory data bits, and in the
+ // base vector "V" extension, only SEWs up to ELEN = max(XLEN, FLEN) are
+ // required to be supported.
+ // Therefore only SEWs of 32 and 64 are allowed here.
+ foreach sew = [32, 64] in {
+ foreach lmul = MxSet<sew>.m in {
+ defvar octuple_lmul = octuple_from_str<lmul.MX>.ret;
+ // Calculate emul = eew * lmul / sew
+ defvar octuple_emul = !srl(!mul(eew, octuple_lmul), shift_amount<sew>.val);
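+ // e.g. eew=32, lmul=M1 (octuple 8), sew=64 gives
+ // octuple_emul = (32*8) >> 6 = 4, i.e. EMUL=MF2.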
+ if !and(!ge(octuple_emul, 1), !le(octuple_emul, 64)) then {
+ defvar emulMX = octuple_to_str<octuple_emul>.ret;
+ defvar lmulMX = octuple_to_str<octuple_lmul>.ret;
+ defvar emul = !cast<LMULInfo>("V_" # emulMX);
+ defvar lmul = !cast<LMULInfo>("V_" # lmulMX);
+ let VLMul = lmul.value in {
+ def "_WD_" # lmulMX # "_" # emulMX : VPseudoAMOWDNoMask<lmul.vrclass, emul.vrclass>;
+ def "_WD_" # lmulMX # "_" # emulMX # "_MASK" : VPseudoAMOWDMask<lmul.vrclass, emul.vrclass>;
+ }
+ }
+ }
+ }
+}
+
+multiclass VPseudoAMO {
+ foreach eew = EEWList in
+ defm "EI" # eew : VPseudoAMOEI<eew>;
+}
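+// An instantiation of the multiclass above such as "defm PseudoVAMOADD :
+// VPseudoAMO;" would, for illustration, yield pseudos named along the lines of
+// PseudoVAMOADDEI32_WD_M1_M1 and PseudoVAMOADDEI32_WD_M1_M1_MASK, one pair per
+// legal LMUL/EMUL combination.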
+
+class VPseudoUSSegLoadNoMask<VReg RetClass, bits<11> EEW>:
+ Pseudo<(outs RetClass:$rd),
+ (ins GPR:$rs1, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVZvlsseg<PseudoToIntrinsic<NAME, false>.Intrinsic, EEW, VLMul> {
+ let mayLoad = 1;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasDummyMask = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoUSSegLoadMask<VReg RetClass, bits<11> EEW>:
+ Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$merge, GPR:$rs1,
+ VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVZvlsseg<PseudoToIntrinsic<NAME, true>.Intrinsic, EEW, VLMul> {
+ let mayLoad = 1;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Constraints = "$rd = $merge";
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasMergeOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoSSegLoadNoMask<VReg RetClass, bits<11> EEW>:
+ Pseudo<(outs RetClass:$rd),
+ (ins GPR:$rs1, GPR:$offset, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVZvlsseg<PseudoToIntrinsic<NAME, false>.Intrinsic, EEW, VLMul> {
+ let mayLoad = 1;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasDummyMask = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoSSegLoadMask<VReg RetClass, bits<11> EEW>:
+ Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$merge, GPR:$rs1,
+ GPR:$offset, VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVZvlsseg<PseudoToIntrinsic<NAME, true>.Intrinsic, EEW, VLMul> {
+ let mayLoad = 1;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Constraints = "$rd = $merge";
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasMergeOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoISegLoadNoMask<VReg RetClass, VReg IdxClass, bits<11> EEW, bits<3> LMUL>:
+ Pseudo<(outs RetClass:$rd),
+ (ins GPR:$rs1, IdxClass:$offset, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVZvlsseg<PseudoToIntrinsic<NAME, false>.Intrinsic, EEW, VLMul, LMUL> {
+ let mayLoad = 1;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ // For vector indexed segment loads, the destination vector register groups
+ // cannot overlap the source vector register group.
+ let Constraints = "@earlyclobber $rd";
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasDummyMask = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoISegLoadMask<VReg RetClass, VReg IdxClass, bits<11> EEW, bits<3> LMUL>:
+ Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$merge, GPR:$rs1,
+ IdxClass:$offset, VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVZvlsseg<PseudoToIntrinsic<NAME, true>.Intrinsic, EEW, VLMul, LMUL> {
+ let mayLoad = 1;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ // For vector indexed segment loads, the destination vector register groups
+ // cannot overlap the source vector register group.
+ let Constraints = "@earlyclobber $rd, $rd = $merge";
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasMergeOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoUSSegStoreNoMask<VReg ValClass, bits<11> EEW>:
+ Pseudo<(outs),
+ (ins ValClass:$rd, GPR:$rs1, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVZvlsseg<PseudoToIntrinsic<NAME, false>.Intrinsic, EEW, VLMul> {
+ let mayLoad = 0;
+ let mayStore = 1;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasDummyMask = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoUSSegStoreMask<VReg ValClass, bits<11> EEW>:
+ Pseudo<(outs),
+ (ins ValClass:$rd, GPR:$rs1,
+ VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVZvlsseg<PseudoToIntrinsic<NAME, true>.Intrinsic, EEW, VLMul> {
+ let mayLoad = 0;
+ let mayStore = 1;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoSSegStoreNoMask<VReg ValClass, bits<11> EEW>:
+ Pseudo<(outs),
+ (ins ValClass:$rd, GPR:$rs1, GPR:$offset, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVZvlsseg<PseudoToIntrinsic<NAME, false>.Intrinsic, EEW, VLMul> {
+ let mayLoad = 0;
+ let mayStore = 1;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasDummyMask = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoSSegStoreMask<VReg ValClass, bits<11> EEW>:
+ Pseudo<(outs),
+ (ins ValClass:$rd, GPR:$rs1, GPR:$offset,
+ VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVZvlsseg<PseudoToIntrinsic<NAME, true>.Intrinsic, EEW, VLMul> {
+ let mayLoad = 0;
+ let mayStore = 1;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoISegStoreNoMask<VReg ValClass, VReg IdxClass, bits<11> EEW, bits<3> LMUL>:
+ Pseudo<(outs),
+ (ins ValClass:$rd, GPR:$rs1, IdxClass:$index,
+ GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVZvlsseg<PseudoToIntrinsic<NAME, false>.Intrinsic, EEW, VLMul, LMUL> {
+ let mayLoad = 0;
+ let mayStore = 1;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasDummyMask = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+class VPseudoISegStoreMask<VReg ValClass, VReg IdxClass, bits<11> EEW, bits<3> LMUL>:
+ Pseudo<(outs),
+ (ins ValClass:$rd, GPR:$rs1, IdxClass:$index,
+ VMaskOp:$vm, GPR:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVZvlsseg<PseudoToIntrinsic<NAME, true>.Intrinsic, EEW, VLMul, LMUL> {
+ let mayLoad = 0;
+ let mayStore = 1;
+ let hasSideEffects = 0;
+ let usesCustomInserter = 1;
+ let Uses = [VL, VTYPE];
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+multiclass VPseudoUSLoad {
+ foreach lmul = MxList.m in {
+ defvar LInfo = lmul.MX;
+ defvar vreg = lmul.vrclass;
+ let VLMul = lmul.value in {
+ def "_V_" # LInfo : VPseudoUSLoadNoMask<vreg>;
+ def "_V_" # LInfo # "_MASK" : VPseudoUSLoadMask<vreg>;
+ }
+ }
+}
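+// An instantiation of the multiclass above such as "defm PseudoVLE32 :
+// VPseudoUSLoad;" produces, for illustration, one PseudoVLE32_V_<LMUL> /
+// PseudoVLE32_V_<LMUL>_MASK pair for every LMUL in MxList.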
+
+multiclass VPseudoSLoad {
+ foreach lmul = MxList.m in {
+ defvar LInfo = lmul.MX;
+ defvar vreg = lmul.vrclass;
+ let VLMul = lmul.value in {
+ def "_V_" # LInfo : VPseudoSLoadNoMask<vreg>;
+ def "_V_" # LInfo # "_MASK" : VPseudoSLoadMask<vreg>;
+ }
+ }
+}
+
+multiclass VPseudoILoad {
+ foreach lmul = MxList.m in
+ foreach idx_lmul = MxList.m in {
+ defvar LInfo = lmul.MX;
+ defvar Vreg = lmul.vrclass;
+ defvar IdxLInfo = idx_lmul.MX;
+ defvar IdxVreg = idx_lmul.vrclass;
+ let VLMul = lmul.value in {
+ def "_V_" # IdxLInfo # "_" # LInfo : VPseudoILoadNoMask<Vreg, IdxVreg>;
+ def "_V_" # IdxLInfo # "_" # LInfo # "_MASK" : VPseudoILoadMask<Vreg, IdxVreg>;
+ }
+ }
+}
+
+multiclass VPseudoUSStore {
+ foreach lmul = MxList.m in {
+ defvar LInfo = lmul.MX;
+ defvar vreg = lmul.vrclass;
+ let VLMul = lmul.value in {
+ def "_V_" # LInfo : VPseudoUSStoreNoMask<vreg>;
+ def "_V_" # LInfo # "_MASK" : VPseudoUSStoreMask<vreg>;
+ }
+ }
+}
+
+multiclass VPseudoSStore {
+ foreach lmul = MxList.m in {
+ defvar LInfo = lmul.MX;
+ defvar vreg = lmul.vrclass;
+ let VLMul = lmul.value in {
+ def "_V_" # LInfo : VPseudoSStoreNoMask<vreg>;
+ def "_V_" # LInfo # "_MASK" : VPseudoSStoreMask<vreg>;
+ }
+ }
+}
+
+multiclass VPseudoIStore {
+ foreach lmul = MxList.m in
+ foreach idx_lmul = MxList.m in {
+ defvar LInfo = lmul.MX;
+ defvar Vreg = lmul.vrclass;
+ defvar IdxLInfo = idx_lmul.MX;
+ defvar IdxVreg = idx_lmul.vrclass;
+ let VLMul = lmul.value in {
+ def "_V_" # IdxLInfo # "_" # LInfo : VPseudoIStoreNoMask<Vreg, IdxVreg>;
+ def "_V_" # IdxLInfo # "_" # LInfo # "_MASK" : VPseudoIStoreMask<Vreg, IdxVreg>;
+ }
+ }
+}
+
+multiclass VPseudoUnaryS_M {
+ foreach mti = AllMasks in
+ {
+ let VLMul = mti.LMul.value in {
+ def "_M_" # mti.BX : VPseudoUnaryNoMask<GPR, VR>;
+ def "_M_" # mti.BX # "_MASK" : VPseudoMaskUnarySOutMask;
+ }
+ }
+}
+
+multiclass VPseudoUnaryM_M {
+ defvar constraint = "@earlyclobber $rd";
+ foreach mti = AllMasks in
+ {
+ let VLMul = mti.LMul.value in {
+ def "_M_" # mti.BX : VPseudoUnaryNoMask<VR, VR, constraint>;
+ def "_M_" # mti.BX # "_MASK" : VPseudoUnaryMask<VR, VR, constraint>;
+ }
+ }
+}
+
+multiclass VPseudoMaskNullaryV {
+ foreach m = MxList.m in {
+ let VLMul = m.value in {
+ def "_V_" # m.MX : VPseudoNullaryNoMask<m.vrclass>;
+ def "_V_" # m.MX # "_MASK" : VPseudoNullaryMask<m.vrclass>;
+ }
+ }
+}
+
+multiclass VPseudoNullaryPseudoM <string BaseInst> {
+ foreach mti = AllMasks in {
+ let VLMul = mti.LMul.value in {
+ def "_M_" # mti.BX : VPseudoNullaryPseudoM<BaseInst # "_MM">;
+ }
+ }
+}
+
+multiclass VPseudoUnaryV_M {
+ defvar constraint = "@earlyclobber $rd";
+ foreach m = MxList.m in {
+ let VLMul = m.value in {
+ def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, VR, constraint>;
+ def "_" # m.MX # "_MASK" : VPseudoUnaryMask<m.vrclass, VR, constraint>;
+ }
+ }
+}
+
+multiclass VPseudoUnaryV_V_AnyMask {
+ foreach m = MxList.m in {
+ let VLMul = m.value in
+ def _VM # "_" # m.MX : VPseudoUnaryAnyMask<m.vrclass, m.vrclass>;
+ }
+}
+
+multiclass VPseudoBinary<VReg RetClass,
+ VReg Op1Class,
+ DAGOperand Op2Class,
+ LMULInfo MInfo,
+ string Constraint = ""> {
+ let VLMul = MInfo.value in {
+ def "_" # MInfo.MX : VPseudoBinaryNoMask<RetClass, Op1Class, Op2Class,
+ Constraint>;
+ def "_" # MInfo.MX # "_MASK" : VPseudoBinaryMask<RetClass, Op1Class, Op2Class,
+ Constraint>;
+ }
+}
+
+multiclass VPseudoBinaryEmul<VReg RetClass,
+ VReg Op1Class,
+ DAGOperand Op2Class,
+ LMULInfo lmul,
+ LMULInfo emul,
+ string Constraint = ""> {
+ let VLMul = lmul.value in {
+ def "_" # lmul.MX # "_" # emul.MX : VPseudoBinaryNoMask<RetClass, Op1Class, Op2Class,
+ Constraint>;
+ def "_" # lmul.MX # "_" # emul.MX # "_MASK" : VPseudoBinaryMask<RetClass, Op1Class, Op2Class,
+ Constraint>;
+ }
+}
+
+multiclass VPseudoBinaryV_VV<string Constraint = ""> {
+ foreach m = MxList.m in
+ defm _VV : VPseudoBinary<m.vrclass, m.vrclass, m.vrclass, m, Constraint>;
+}
+
+multiclass VPseudoBinaryV_VV_EEW<int eew, string Constraint = ""> {
+ foreach m = MxList.m in {
+ foreach sew = EEWList in {
+ defvar octuple_lmul = octuple_from_str<m.MX>.ret;
+ // emul = lmul * eew / sew
+ defvar octuple_emul = !srl(!mul(octuple_lmul, eew), shift_amount<sew>.val);
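+ // e.g. eew=16, sew=8, lmul=M1 (octuple 8) gives
+ // octuple_emul = (8*16) >> 3 = 16, i.e. EMUL=M2.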
+ if !and(!ge(octuple_emul, 1), !le(octuple_emul, 64)) then {
+ defvar emulMX = octuple_to_str<octuple_emul>.ret;
+ defvar emul = !cast<LMULInfo>("V_" # emulMX);
+ defm _VV : VPseudoBinaryEmul<m.vrclass, m.vrclass, emul.vrclass, m, emul, Constraint>;
+ }
+ }
+ }
+}
+
+multiclass VPseudoBinaryV_VX<string Constraint = ""> {
+ foreach m = MxList.m in
+ defm "_VX" : VPseudoBinary<m.vrclass, m.vrclass, GPR, m, Constraint>;
+}
+
+multiclass VPseudoBinaryV_VF<string Constraint = ""> {
+ foreach m = MxList.m in
+ foreach f = FPList.fpinfo in
+ defm "_V" # f.FX : VPseudoBinary<m.vrclass, m.vrclass,
+ f.fprclass, m, Constraint>;
+}
+
+multiclass VPseudoBinaryV_VI<Operand ImmType = simm5, string Constraint = ""> {
+ foreach m = MxList.m in
+ defm _VI : VPseudoBinary<m.vrclass, m.vrclass, ImmType, m, Constraint>;
+}
+
+multiclass VPseudoBinaryM_MM {
+ foreach m = MxList.m in
+ let VLMul = m.value in {
+ def "_MM_" # m.MX : VPseudoBinaryNoMask<VR, VR, VR, "">;
+ }
+}
+
+// We use @earlyclobber here because destination/source overlap is legal only
+// in two cases:
+// * The destination EEW is smaller than the source EEW and the overlap is
+//   in the lowest-numbered part of the source register group.
+// * The destination EEW is greater than the source EEW, the source EMUL is
+//   at least 1, and the overlap is in the highest-numbered part of the
+//   destination register group.
+// Any other overlap is illegal. Rather than model these partial-overlap
+// exceptions, the pseudos conservatively forbid any overlap via @earlyclobber.
+multiclass VPseudoBinaryW_VV {
+ foreach m = MxList.m[0-5] in
+ defm _VV : VPseudoBinary<m.wvrclass, m.vrclass, m.vrclass, m,
+ "@earlyclobber $rd">;
+}
+
+multiclass VPseudoBinaryW_VX {
+ foreach m = MxList.m[0-5] in
+ defm "_VX" : VPseudoBinary<m.wvrclass, m.vrclass, GPR, m,
+ "@earlyclobber $rd">;
+}
+
+multiclass VPseudoBinaryW_VF {
+ foreach m = MxList.m[0-5] in
+ foreach f = FPList.fpinfo[0-1] in
+ defm "_V" # f.FX : VPseudoBinary<m.wvrclass, m.vrclass,
+ f.fprclass, m,
+ "@earlyclobber $rd">;
+}
+
+multiclass VPseudoBinaryW_WV {
+ foreach m = MxList.m[0-5] in
+ defm _WV : VPseudoBinary<m.wvrclass, m.wvrclass, m.vrclass, m,
+ "@earlyclobber $rd">;
+}
+
+multiclass VPseudoBinaryW_WX {
+ foreach m = MxList.m[0-5] in
+ defm "_WX" : VPseudoBinary<m.wvrclass, m.wvrclass, GPR, m,
+ "@earlyclobber $rd">;
+}
+
+multiclass VPseudoBinaryW_WF {
+ foreach m = MxList.m[0-5] in
+ foreach f = FPList.fpinfo[0-1] in
+ defm "_W" # f.FX : VPseudoBinary<m.wvrclass, m.wvrclass,
+ f.fprclass, m,
+ "@earlyclobber $rd">;
+}
+
+multiclass VPseudoBinaryV_WV {
+ foreach m = MxList.m[0-5] in
+ defm _WV : VPseudoBinary<m.vrclass, m.wvrclass, m.vrclass, m,
+ "@earlyclobber $rd">;
+}
+
+multiclass VPseudoBinaryV_WX {
+ foreach m = MxList.m[0-5] in
+ defm _WX : VPseudoBinary<m.vrclass, m.wvrclass, GPR, m,
+ "@earlyclobber $rd">;
+}
+
+multiclass VPseudoBinaryV_WI {
+ foreach m = MxList.m[0-5] in
+ defm _WI : VPseudoBinary<m.vrclass, m.wvrclass, uimm5, m,
+ "@earlyclobber $rd">;
+}
+
+// For vadc and vsbc, the instruction encoding is reserved if the destination
+// vector register is v0, which is why the destination uses a GetVRegNoV0
+// register class below.
+// For vadc and vsbc, CarryIn == 1 and CarryOut == 0.
+multiclass VPseudoBinaryV_VM<bit CarryOut = 0, bit CarryIn = 1,
+ string Constraint = ""> {
+ foreach m = MxList.m in
+ def "_VV" # !if(CarryIn, "M", "") # "_" # m.MX :
+ VPseudoBinaryCarryIn<!if(CarryOut, VR,
+ !if(!and(CarryIn, !not(CarryOut)),
+ GetVRegNoV0<m.vrclass>.R, m.vrclass)),
+ m.vrclass, m.vrclass, m, CarryIn, Constraint>;
+}
+
+multiclass VPseudoBinaryV_XM<bit CarryOut = 0, bit CarryIn = 1,
+ string Constraint = ""> {
+ foreach m = MxList.m in
+ def "_VX" # !if(CarryIn, "M", "") # "_" # m.MX :
+ VPseudoBinaryCarryIn<!if(CarryOut, VR,
+ !if(!and(CarryIn, !not(CarryOut)),
+ GetVRegNoV0<m.vrclass>.R, m.vrclass)),
+ m.vrclass, GPR, m, CarryIn, Constraint>;
+}
+
+multiclass VPseudoBinaryV_FM {
+ foreach m = MxList.m in
+ foreach f = FPList.fpinfo in
+ def "_V" # f.FX # "M_" # m.MX :
+ VPseudoBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
+ m.vrclass, f.fprclass, m, /*CarryIn=*/1, "">;
+}
+
+multiclass VPseudoBinaryV_IM<bit CarryOut = 0, bit CarryIn = 1,
+ string Constraint = ""> {
+ foreach m = MxList.m in
+ def "_VI" # !if(CarryIn, "M", "") # "_" # m.MX :
+ VPseudoBinaryCarryIn<!if(CarryOut, VR,
+ !if(!and(CarryIn, !not(CarryOut)),
+ GetVRegNoV0<m.vrclass>.R, m.vrclass)),
+ m.vrclass, simm5, m, CarryIn, Constraint>;
+}
+
+multiclass VPseudoUnaryV_V_X_I_NoDummyMask {
+ foreach m = MxList.m in {
+ let VLMul = m.value in {
+ def "_V_" # m.MX : VPseudoUnaryNoDummyMask<m.vrclass, m.vrclass>;
+ def "_X_" # m.MX : VPseudoUnaryNoDummyMask<m.vrclass, GPR>;
+ def "_I_" # m.MX : VPseudoUnaryNoDummyMask<m.vrclass, simm5>;
+ }
+ }
+}
+
+multiclass VPseudoUnaryV_F_NoDummyMask {
+ foreach m = MxList.m in {
+ foreach f = FPList.fpinfo in {
+ let VLMul = m.value in {
+ def "_" # f.FX # "_" # m.MX : VPseudoUnaryNoDummyMask<m.vrclass, f.fprclass>;
+ }
+ }
+ }
+}
+
+multiclass VPseudoUnaryV_V {
+ foreach m = MxList.m in {
+ let VLMul = m.value in {
+ def "_V_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.vrclass>;
+ def "_V_" # m.MX # "_MASK" : VPseudoUnaryMask<m.vrclass, m.vrclass>;
+ }
+ }
+}
+
+multiclass PseudoUnaryV_VF2 {
+ defvar constraints = "@earlyclobber $rd";
+ foreach m = MxList.m[1-6] in
+ {
+ let VLMul = m.value in {
+ def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f2vrclass, constraints>;
+ def "_" # m.MX # "_MASK" : VPseudoUnaryMask<m.vrclass, m.f2vrclass,
+ constraints>;
+ }
+ }
+}
+
+multiclass PseudoUnaryV_VF4 {
+ defvar constraints = "@earlyclobber $rd";
+ foreach m = MxList.m[2-6] in
+ {
+ let VLMul = m.value in {
+ def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f4vrclass, constraints>;
+ def "_" # m.MX # "_MASK" : VPseudoUnaryMask<m.vrclass, m.f4vrclass,
+ constraints>;
+ }
+ }
+}
+
+multiclass PseudoUnaryV_VF8 {
+ defvar constraints = "@earlyclobber $rd";
+ foreach m = MxList.m[3-6] in
+ {
+ let VLMul = m.value in {
+ def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f8vrclass, constraints>;
+ def "_" # m.MX # "_MASK" : VPseudoUnaryMask<m.vrclass, m.f8vrclass,
+ constraints>;
+ }
+ }
+}
+
+// The destination EEW is 1 (a mask register).
+// The source EEW is 8, 16, 32, or 64.
+// When the destination EEW differs from the source EEW, we need to use
+// @earlyclobber to avoid overlap between the destination and source registers.
+multiclass VPseudoBinaryM_VV {
+ foreach m = MxList.m in
+ defm _VV : VPseudoBinary<VR, m.vrclass, m.vrclass, m, "@earlyclobber $rd">;
+}
+
+multiclass VPseudoBinaryM_VX {
+ foreach m = MxList.m in
+ defm "_VX" :
+ VPseudoBinary<VR, m.vrclass, GPR, m, "@earlyclobber $rd">;
+}
+
+multiclass VPseudoBinaryM_VF {
+ foreach m = MxList.m in
+ foreach f = FPList.fpinfo in
+ defm "_V" # f.FX :
+ VPseudoBinary<VR, m.vrclass, f.fprclass, m, "@earlyclobber $rd">;
+}
+
+multiclass VPseudoBinaryM_VI {
+ foreach m = MxList.m in
+ defm _VI : VPseudoBinary<VR, m.vrclass, simm5, m, "@earlyclobber $rd">;
+}
+
+multiclass VPseudoBinaryV_VV_VX_VI<Operand ImmType = simm5, string Constraint = ""> {
+ defm "" : VPseudoBinaryV_VV<Constraint>;
+ defm "" : VPseudoBinaryV_VX<Constraint>;
+ defm "" : VPseudoBinaryV_VI<ImmType, Constraint>;
+}
+
+multiclass VPseudoBinaryV_VV_VX {
+ defm "" : VPseudoBinaryV_VV;
+ defm "" : VPseudoBinaryV_VX;
+}
+
+multiclass VPseudoBinaryV_VV_VF {
+ defm "" : VPseudoBinaryV_VV;
+ defm "" : VPseudoBinaryV_VF;
+}
+
+multiclass VPseudoBinaryV_VX_VI<Operand ImmType = simm5> {
+ defm "" : VPseudoBinaryV_VX;
+ defm "" : VPseudoBinaryV_VI<ImmType>;
+}
+
+multiclass VPseudoBinaryW_VV_VX {
+ defm "" : VPseudoBinaryW_VV;
+ defm "" : VPseudoBinaryW_VX;
+}
+
+multiclass VPseudoBinaryW_VV_VF {
+ defm "" : VPseudoBinaryW_VV;
+ defm "" : VPseudoBinaryW_VF;
+}
+
+multiclass VPseudoBinaryW_WV_WX {
+ defm "" : VPseudoBinaryW_WV;
+ defm "" : VPseudoBinaryW_WX;
+}
+
+multiclass VPseudoBinaryW_WV_WF {
+ defm "" : VPseudoBinaryW_WV;
+ defm "" : VPseudoBinaryW_WF;
+}
+
+multiclass VPseudoBinaryV_VM_XM_IM {
+ defm "" : VPseudoBinaryV_VM;
+ defm "" : VPseudoBinaryV_XM;
+ defm "" : VPseudoBinaryV_IM;
+}
+
+multiclass VPseudoBinaryV_VM_XM {
+ defm "" : VPseudoBinaryV_VM;
+ defm "" : VPseudoBinaryV_XM;
+}
+
+multiclass VPseudoBinaryM_VM_XM_IM<string Constraint> {
+ defm "" : VPseudoBinaryV_VM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>;
+ defm "" : VPseudoBinaryV_XM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>;
+ defm "" : VPseudoBinaryV_IM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>;
+}
+
+multiclass VPseudoBinaryM_VM_XM<string Constraint> {
+ defm "" : VPseudoBinaryV_VM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>;
+ defm "" : VPseudoBinaryV_XM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>;
+}
+
+multiclass VPseudoBinaryM_V_X_I<string Constraint> {
+ defm "" : VPseudoBinaryV_VM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>;
+ defm "" : VPseudoBinaryV_XM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>;
+ defm "" : VPseudoBinaryV_IM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>;
+}
+
+multiclass VPseudoBinaryM_V_X<string Constraint> {
+ defm "" : VPseudoBinaryV_VM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>;
+ defm "" : VPseudoBinaryV_XM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>;
+}
+
+multiclass VPseudoBinaryV_WV_WX_WI {
+ defm "" : VPseudoBinaryV_WV;
+ defm "" : VPseudoBinaryV_WX;
+ defm "" : VPseudoBinaryV_WI;
+}
+
+multiclass VPseudoTernary<VReg RetClass,
+ VReg Op1Class,
+ RegisterClass Op2Class,
+ LMULInfo MInfo,
+ string Constraint = ""> {
+ let VLMul = MInfo.value in {
+ def "_" # MInfo.MX : VPseudoTernaryNoMask<RetClass, Op1Class, Op2Class, Constraint>;
+ def "_" # MInfo.MX # "_MASK" : VPseudoBinaryMask<RetClass, Op1Class, Op2Class, Constraint>;
+ }
+}
+
+multiclass VPseudoTernaryV_VV<string Constraint = ""> {
+ foreach m = MxList.m in
+ defm _VV : VPseudoTernary<m.vrclass, m.vrclass, m.vrclass, m, Constraint>;
+}
+
+multiclass VPseudoTernaryV_VX<string Constraint = ""> {
+ foreach m = MxList.m in
+ defm _VX : VPseudoTernary<m.vrclass, m.vrclass, GPR, m, Constraint>;
+}
+
+multiclass VPseudoTernaryV_VX_AAXA<string Constraint = ""> {
+ foreach m = MxList.m in
+ defm "_VX" : VPseudoTernary<m.vrclass, GPR, m.vrclass, m, Constraint>;
+}
+
+multiclass VPseudoTernaryV_VF_AAXA<string Constraint = ""> {
+ foreach m = MxList.m in
+ foreach f = FPList.fpinfo in
+ defm "_V" # f.FX : VPseudoTernary<m.vrclass, f.fprclass, m.vrclass,
+ m, Constraint>;
+}
+
+multiclass VPseudoTernaryW_VV {
+ defvar constraint = "@earlyclobber $rd";
+ foreach m = MxList.m[0-5] in
+ defm _VV : VPseudoTernary<m.wvrclass, m.vrclass, m.vrclass, m, constraint>;
+}
+
+multiclass VPseudoTernaryW_VX {
+ defvar constraint = "@earlyclobber $rd";
+ foreach m = MxList.m[0-5] in
+ defm "_VX" : VPseudoTernary<m.wvrclass, GPR, m.vrclass, m, constraint>;
+}
+
+multiclass VPseudoTernaryW_VF {
+ defvar constraint = "@earlyclobber $rd";
+ foreach m = MxList.m[0-5] in
+ foreach f = FPList.fpinfo[0-1] in
+ defm "_V" # f.FX : VPseudoTernary<m.wvrclass, f.fprclass, m.vrclass, m,
+ constraint>;
+}
+
+multiclass VPseudoTernaryV_VI<Operand ImmType = simm5, string Constraint = ""> {
+ foreach m = MxList.m in
+ defm _VI : VPseudoTernary<m.vrclass, m.vrclass, ImmType, m, Constraint>;
+}
+
+multiclass VPseudoTernaryV_VV_VX_AAXA<string Constraint = ""> {
+ defm "" : VPseudoTernaryV_VV<Constraint>;
+ defm "" : VPseudoTernaryV_VX_AAXA<Constraint>;
+}
+
+multiclass VPseudoTernaryV_VV_VF_AAXA<string Constraint = ""> {
+ defm "" : VPseudoTernaryV_VV<Constraint>;
+ defm "" : VPseudoTernaryV_VF_AAXA<Constraint>;
+}
+
+multiclass VPseudoTernaryV_VX_VI<Operand ImmType = simm5, string Constraint = ""> {
+ defm "" : VPseudoTernaryV_VX<Constraint>;
+ defm "" : VPseudoTernaryV_VI<ImmType, Constraint>;
+}
+
+multiclass VPseudoTernaryW_VV_VX {
+ defm "" : VPseudoTernaryW_VV;
+ defm "" : VPseudoTernaryW_VX;
+}
+
+multiclass VPseudoTernaryW_VV_VF {
+ defm "" : VPseudoTernaryW_VV;
+ defm "" : VPseudoTernaryW_VF;
+}
+
+multiclass VPseudoBinaryM_VV_VX_VI {
+ defm "" : VPseudoBinaryM_VV;
+ defm "" : VPseudoBinaryM_VX;
+ defm "" : VPseudoBinaryM_VI;
+}
+
+multiclass VPseudoBinaryM_VV_VX {
+ defm "" : VPseudoBinaryM_VV;
+ defm "" : VPseudoBinaryM_VX;
+}
+
+multiclass VPseudoBinaryM_VV_VF {
+ defm "" : VPseudoBinaryM_VV;
+ defm "" : VPseudoBinaryM_VF;
+}
+
+multiclass VPseudoBinaryM_VX_VI {
+ defm "" : VPseudoBinaryM_VX;
+ defm "" : VPseudoBinaryM_VI;
+}
+
+multiclass VPseudoReductionV_VS {
+ foreach m = MxList.m in {
+ let WritesElement0 = 1 in
+ defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>;
+ }
+}
+
+multiclass VPseudoConversion<VReg RetClass,
+ VReg Op1Class,
+ LMULInfo MInfo,
+ string Constraint = ""> {
+ let VLMul = MInfo.value in {
+ def "_" # MInfo.MX : VPseudoUnaryNoMask<RetClass, Op1Class, Constraint>;
+ def "_" # MInfo.MX # "_MASK" : VPseudoUnaryMask<RetClass, Op1Class,
+ Constraint>;
+ }
+}
+
+multiclass VPseudoConversionV_V {
+ foreach m = MxList.m in
+ defm _V : VPseudoConversion<m.vrclass, m.vrclass, m>;
+}
+
+multiclass VPseudoConversionW_V {
+ defvar constraint = "@earlyclobber $rd";
+ foreach m = MxList.m[0-5] in
+ defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint>;
+}
+
+multiclass VPseudoConversionV_W {
+ defvar constraint = "@earlyclobber $rd";
+ foreach m = MxList.m[0-5] in
+ defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, constraint>;
+}
+
+multiclass VPseudoUSSegLoad<bit isFF> {
+ foreach eew = EEWList in {
+ foreach lmul = MxSet<eew>.m in {
+ defvar LInfo = lmul.MX;
+ let VLMul = lmul.value in {
+ foreach nf = NFSet<lmul>.L in {
+ defvar vreg = SegRegClass<lmul, nf>.RC;
+ defvar FFStr = !if(isFF, "FF", "");
+ def nf # "E" # eew # FFStr # "_V_" # LInfo :
+ VPseudoUSSegLoadNoMask<vreg, eew>;
+ def nf # "E" # eew # FFStr # "_V_" # LInfo # "_MASK" :
+ VPseudoUSSegLoadMask<vreg, eew>;
+ }
+ }
+ }
+ }
+}
+
+multiclass VPseudoSSegLoad {
+ foreach eew = EEWList in {
+ foreach lmul = MxSet<eew>.m in {
+ defvar LInfo = lmul.MX;
+ let VLMul = lmul.value in {
+ foreach nf = NFSet<lmul>.L in {
+ defvar vreg = SegRegClass<lmul, nf>.RC;
+ def nf # "E" # eew # "_V_" # LInfo : VPseudoSSegLoadNoMask<vreg, eew>;
+ def nf # "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoSSegLoadMask<vreg, eew>;
+ }
+ }
+ }
+ }
+}
+
+multiclass VPseudoISegLoad {
+ foreach idx_eew = EEWList in { // EEW for index argument.
+ foreach idx_lmul = MxSet<idx_eew>.m in { // LMUL for index argument.
+ foreach val_lmul = MxList.m in { // LMUL for the value.
+ defvar IdxLInfo = idx_lmul.MX;
+ defvar IdxVreg = idx_lmul.vrclass;
+ defvar ValLInfo = val_lmul.MX;
+ let VLMul = val_lmul.value in {
+ foreach nf = NFSet<val_lmul>.L in {
+ defvar ValVreg = SegRegClass<val_lmul, nf>.RC;
+ def nf # "EI" # idx_eew # "_V_" # IdxLInfo # "_" # ValLInfo :
+ VPseudoISegLoadNoMask<ValVreg, IdxVreg, idx_eew, idx_lmul.value>;
+ def nf # "EI" # idx_eew # "_V_" # IdxLInfo # "_" # ValLInfo # "_MASK" :
+ VPseudoISegLoadMask<ValVreg, IdxVreg, idx_eew, idx_lmul.value>;
+ }
+ }
+ }
+ }
+ }
+}
+
+multiclass VPseudoUSSegStore {
+ foreach eew = EEWList in {
+ foreach lmul = MxSet<eew>.m in {
+ defvar LInfo = lmul.MX;
+ let VLMul = lmul.value in {
+ foreach nf = NFSet<lmul>.L in {
+ defvar vreg = SegRegClass<lmul, nf>.RC;
+ def nf # "E" # eew # "_V_" # LInfo : VPseudoUSSegStoreNoMask<vreg, eew>;
+ def nf # "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoUSSegStoreMask<vreg, eew>;
+ }
+ }
+ }
+ }
+}
+
+multiclass VPseudoSSegStore {
+ foreach eew = EEWList in {
+ foreach lmul = MxSet<eew>.m in {
+ defvar LInfo = lmul.MX;
+ let VLMul = lmul.value in {
+ foreach nf = NFSet<lmul>.L in {
+ defvar vreg = SegRegClass<lmul, nf>.RC;
+ def nf # "E" # eew # "_V_" # LInfo : VPseudoSSegStoreNoMask<vreg, eew>;
+ def nf # "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoSSegStoreMask<vreg, eew>;
+ }
+ }
+ }
+ }
+}
+
+multiclass VPseudoISegStore {
+ foreach idx_eew = EEWList in { // EEW for index argument.
+ foreach idx_lmul = MxSet<idx_eew>.m in { // LMUL for index argument.
+ foreach val_lmul = MxList.m in { // LMUL for the value.
+ defvar IdxLInfo = idx_lmul.MX;
+ defvar IdxVreg = idx_lmul.vrclass;
+ defvar ValLInfo = val_lmul.MX;
+ let VLMul = val_lmul.value in {
+ foreach nf = NFSet<val_lmul>.L in {
+ defvar ValVreg = SegRegClass<val_lmul, nf>.RC;
+ def nf # "EI" # idx_eew # "_V_" # IdxLInfo # "_" # ValLInfo :
+ VPseudoISegStoreNoMask<ValVreg, IdxVreg, idx_eew, idx_lmul.value>;
+ def nf # "EI" # idx_eew # "_V_" # IdxLInfo # "_" # ValLInfo # "_MASK" :
+ VPseudoISegStoreMask<ValVreg, IdxVreg, idx_eew, idx_lmul.value>;
+ }
+ }
+ }
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Helpers to define the intrinsic patterns.
+//===----------------------------------------------------------------------===//
+
+class VPatUnaryNoMask<string intrinsic_name,
+ string inst,
+ string kind,
+ ValueType result_type,
+ ValueType op2_type,
+ int sew,
+ LMULInfo vlmul,
+ VReg op2_reg_class> :
+ Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
+ (op2_type op2_reg_class:$rs2),
+ (XLenVT GPR:$vl))),
+ (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX)
+ (op2_type op2_reg_class:$rs2),
+ (NoX0 GPR:$vl), sew)>;
+
+class VPatUnaryMask<string intrinsic_name,
+ string inst,
+ string kind,
+ ValueType result_type,
+ ValueType op2_type,
+ ValueType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ VReg result_reg_class,
+ VReg op2_reg_class> :
+ Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
+ (result_type result_reg_class:$merge),
+ (op2_type op2_reg_class:$rs2),
+ (mask_type V0),
+ (XLenVT GPR:$vl))),
+ (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX#"_MASK")
+ (result_type result_reg_class:$merge),
+ (op2_type op2_reg_class:$rs2),
+ (mask_type V0), (NoX0 GPR:$vl), sew)>;
+
+class VPatMaskUnaryNoMask<string intrinsic_name,
+ string inst,
+ MTypeInfo mti> :
+ Pat<(mti.Mask (!cast<Intrinsic>(intrinsic_name)
+ (mti.Mask VR:$rs2),
+ (XLenVT GPR:$vl))),
+ (!cast<Instruction>(inst#"_M_"#mti.BX)
+ (mti.Mask VR:$rs2),
+ (NoX0 GPR:$vl), mti.SEW)>;
+
+class VPatMaskUnaryMask<string intrinsic_name,
+ string inst,
+ MTypeInfo mti> :
+ Pat<(mti.Mask (!cast<Intrinsic>(intrinsic_name#"_mask")
+ (mti.Mask VR:$merge),
+ (mti.Mask VR:$rs2),
+ (mti.Mask V0),
+ (XLenVT GPR:$vl))),
+ (!cast<Instruction>(inst#"_M_"#mti.BX#"_MASK")
+ (mti.Mask VR:$merge),
+ (mti.Mask VR:$rs2),
+ (mti.Mask V0), (NoX0 GPR:$vl), mti.SEW)>;
+
+class VPatUnaryAnyMask<string intrinsic,
+ string inst,
+ string kind,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ VReg result_reg_class,
+ VReg op1_reg_class> :
+ Pat<(result_type (!cast<Intrinsic>(intrinsic)
+ (result_type result_reg_class:$merge),
+ (op1_type op1_reg_class:$rs1),
+ (mask_type VR:$rs2),
+ (XLenVT GPR:$vl))),
+ (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX)
+ (result_type result_reg_class:$merge),
+ (op1_type op1_reg_class:$rs1),
+ (mask_type VR:$rs2),
+ (NoX0 GPR:$vl), sew)>;
+
+class VPatBinaryNoMask<string intrinsic_name,
+ string inst,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType op2_type,
+ int sew,
+ VReg op1_reg_class,
+ DAGOperand op2_kind> :
+ Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
+ (op1_type op1_reg_class:$rs1),
+ (op2_type op2_kind:$rs2),
+ (XLenVT GPR:$vl))),
+ (!cast<Instruction>(inst)
+ (op1_type op1_reg_class:$rs1),
+ (op2_type op2_kind:$rs2),
+ (NoX0 GPR:$vl), sew)>;
+
+class VPatBinaryMask<string intrinsic_name,
+ string inst,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType op2_type,
+ ValueType mask_type,
+ int sew,
+ VReg result_reg_class,
+ VReg op1_reg_class,
+ DAGOperand op2_kind> :
+ Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
+ (result_type result_reg_class:$merge),
+ (op1_type op1_reg_class:$rs1),
+ (op2_type op2_kind:$rs2),
+ (mask_type V0),
+ (XLenVT GPR:$vl))),
+ (!cast<Instruction>(inst#"_MASK")
+ (result_type result_reg_class:$merge),
+ (op1_type op1_reg_class:$rs1),
+ (op2_type op2_kind:$rs2),
+ (mask_type V0), (NoX0 GPR:$vl), sew)>;
+
+class VPatTernaryNoMask<string intrinsic,
+ string inst,
+ string kind,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType op2_type,
+ ValueType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ VReg result_reg_class,
+ RegisterClass op1_reg_class,
+ DAGOperand op2_kind> :
+ Pat<(result_type (!cast<Intrinsic>(intrinsic)
+ (result_type result_reg_class:$rs3),
+ (op1_type op1_reg_class:$rs1),
+ (op2_type op2_kind:$rs2),
+ (XLenVT GPR:$vl))),
+ (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX)
+ result_reg_class:$rs3,
+ (op1_type op1_reg_class:$rs1),
+ op2_kind:$rs2,
+ (NoX0 GPR:$vl), sew)>;
+
+class VPatTernaryMask<string intrinsic,
+ string inst,
+ string kind,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType op2_type,
+ ValueType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ VReg result_reg_class,
+ RegisterClass op1_reg_class,
+ DAGOperand op2_kind> :
+ Pat<(result_type (!cast<Intrinsic>(intrinsic#"_mask")
+ (result_type result_reg_class:$rs3),
+ (op1_type op1_reg_class:$rs1),
+ (op2_type op2_kind:$rs2),
+ (mask_type V0),
+ (XLenVT GPR:$vl))),
+ (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX # "_MASK")
+ result_reg_class:$rs3,
+ (op1_type op1_reg_class:$rs1),
+ op2_kind:$rs2,
+ (mask_type V0),
+ (NoX0 GPR:$vl), sew)>;
+
+class VPatAMOWDNoMask<string intrinsic_name,
+ string inst,
+ ValueType result_type,
+ ValueType op1_type,
+ int sew,
+ LMULInfo vlmul,
+ LMULInfo emul,
+ VReg op1_reg_class> :
+ Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
+ GPR:$rs1,
+ (op1_type op1_reg_class:$vs2),
+ (result_type vlmul.vrclass:$vd),
+ (XLenVT GPR:$vl))),
+ (!cast<Instruction>(inst # "_WD_" # vlmul.MX # "_" # emul.MX)
+ $rs1, $vs2, $vd,
+ (NoX0 GPR:$vl), sew)>;
+
+class VPatAMOWDMask<string intrinsic_name,
+ string inst,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ LMULInfo emul,
+ VReg op1_reg_class> :
+ Pat<(result_type (!cast<Intrinsic>(intrinsic_name # "_mask")
+ GPR:$rs1,
+ (op1_type op1_reg_class:$vs2),
+ (result_type vlmul.vrclass:$vd),
+ (mask_type V0),
+ (XLenVT GPR:$vl))),
+ (!cast<Instruction>(inst # "_WD_" # vlmul.MX # "_" # emul.MX # "_MASK")
+ $rs1, $vs2, $vd,
+ (mask_type V0), (NoX0 GPR:$vl), sew)>;
+
+multiclass VPatUSLoad<string intrinsic,
+ string inst,
+ LLVMType type,
+ LLVMType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ VReg reg_class>
+{
+ defvar Intr = !cast<Intrinsic>(intrinsic);
+ defvar Pseudo = !cast<Instruction>(inst#"_V_"#vlmul.MX);
+ def : Pat<(type (Intr GPR:$rs1, GPR:$vl)),
+ (Pseudo $rs1, (NoX0 GPR:$vl), sew)>;
+ defvar IntrMask = !cast<Intrinsic>(intrinsic # "_mask");
+ defvar PseudoMask = !cast<Instruction>(inst#"_V_"#vlmul.MX#"_MASK");
+ def : Pat<(type (IntrMask (type GetVRegNoV0<reg_class>.R:$merge),
+ GPR:$rs1, (mask_type V0), GPR:$vl)),
+ (PseudoMask $merge,
+ $rs1, (mask_type V0), (NoX0 GPR:$vl), sew)>;
+}
+
+multiclass VPatUSLoadFF<string inst,
+ LLVMType type,
+ LLVMType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ VReg reg_class>
+{
+ defvar Pseudo = !cast<Instruction>(inst#"_V_"#vlmul.MX);
+ def : Pat<(type (riscv_vleff GPR:$rs1, GPR:$vl)),
+ (Pseudo $rs1, (NoX0 GPR:$vl), sew)>;
+ defvar PseudoMask = !cast<Instruction>(inst#"_V_"#vlmul.MX#"_MASK");
+ def : Pat<(type (riscv_vleff_mask (type GetVRegNoV0<reg_class>.R:$merge),
+ GPR:$rs1, (mask_type V0), GPR:$vl)),
+ (PseudoMask $merge,
+ $rs1, (mask_type V0), (NoX0 GPR:$vl), sew)>;
+}
+
+multiclass VPatSLoad<string intrinsic,
+ string inst,
+ LLVMType type,
+ LLVMType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ VReg reg_class>
+{
+ defvar Intr = !cast<Intrinsic>(intrinsic);
+ defvar Pseudo = !cast<Instruction>(inst#"_V_"#vlmul.MX);
+ def : Pat<(type (Intr GPR:$rs1, GPR:$rs2, GPR:$vl)),
+ (Pseudo $rs1, $rs2, (NoX0 GPR:$vl), sew)>;
+ defvar IntrMask = !cast<Intrinsic>(intrinsic # "_mask");
+ defvar PseudoMask = !cast<Instruction>(inst#"_V_"#vlmul.MX#"_MASK");
+ def : Pat<(type (IntrMask (type GetVRegNoV0<reg_class>.R:$merge),
+ GPR:$rs1, GPR:$rs2, (mask_type V0), GPR:$vl)),
+ (PseudoMask $merge,
+ $rs1, $rs2, (mask_type V0), (NoX0 GPR:$vl), sew)>;
+}
+
+multiclass VPatILoad<string intrinsic,
+ string inst,
+ LLVMType type,
+ LLVMType idx_type,
+ LLVMType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ LMULInfo idx_vlmul,
+ VReg reg_class,
+ VReg idx_reg_class>
+{
+ defvar Intr = !cast<Intrinsic>(intrinsic);
+ defvar Pseudo = !cast<Instruction>(inst#"_V_"#idx_vlmul.MX#"_"#vlmul.MX);
+ def : Pat<(type (Intr GPR:$rs1, (idx_type idx_reg_class:$rs2), GPR:$vl)),
+ (Pseudo $rs1, $rs2, (NoX0 GPR:$vl), sew)>;
+
+ defvar IntrMask = !cast<Intrinsic>(intrinsic # "_mask");
+ defvar PseudoMask = !cast<Instruction>(inst#"_V_"#idx_vlmul.MX#"_"#vlmul.MX#"_MASK");
+ def : Pat<(type (IntrMask (type GetVRegNoV0<reg_class>.R:$merge),
+ GPR:$rs1, (idx_type idx_reg_class:$rs2),
+ (mask_type V0), GPR:$vl)),
+ (PseudoMask $merge,
+ $rs1, $rs2, (mask_type V0), (NoX0 GPR:$vl), sew)>;
+}
+
+multiclass VPatUSStore<string intrinsic,
+ string inst,
+ LLVMType type,
+ LLVMType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ VReg reg_class>
+{
+ defvar Intr = !cast<Intrinsic>(intrinsic);
+ defvar Pseudo = !cast<Instruction>(inst#"_V_"#vlmul.MX);
+ def : Pat<(Intr (type reg_class:$rs3), GPR:$rs1, GPR:$vl),
+ (Pseudo $rs3, $rs1, (NoX0 GPR:$vl), sew)>;
+ defvar IntrMask = !cast<Intrinsic>(intrinsic # "_mask");
+ defvar PseudoMask = !cast<Instruction>(inst#"_V_"#vlmul.MX#"_MASK");
+ def : Pat<(IntrMask (type reg_class:$rs3), GPR:$rs1, (mask_type V0), GPR:$vl),
+ (PseudoMask $rs3, $rs1, (mask_type V0), (NoX0 GPR:$vl), sew)>;
+}
+
+multiclass VPatSStore<string intrinsic,
+ string inst,
+ LLVMType type,
+ LLVMType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ VReg reg_class>
+{
+ defvar Intr = !cast<Intrinsic>(intrinsic);
+ defvar Pseudo = !cast<Instruction>(inst#"_V_"#vlmul.MX);
+ def : Pat<(Intr (type reg_class:$rs3), GPR:$rs1, GPR:$rs2, GPR:$vl),
+ (Pseudo $rs3, $rs1, $rs2, (NoX0 GPR:$vl), sew)>;
+ defvar IntrMask = !cast<Intrinsic>(intrinsic # "_mask");
+ defvar PseudoMask = !cast<Instruction>(inst#"_V_"#vlmul.MX#"_MASK");
+ def : Pat<(IntrMask (type reg_class:$rs3), GPR:$rs1, GPR:$rs2, (mask_type V0), GPR:$vl),
+ (PseudoMask $rs3, $rs1, $rs2, (mask_type V0), (NoX0 GPR:$vl), sew)>;
+}
+
+multiclass VPatIStore<string intrinsic,
+ string inst,
+ LLVMType type,
+ LLVMType idx_type,
+ LLVMType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ LMULInfo idx_vlmul,
+ VReg reg_class,
+ VReg idx_reg_class>
+{
+ defvar Intr = !cast<Intrinsic>(intrinsic);
+ defvar Pseudo = !cast<Instruction>(inst#"_V_"#idx_vlmul.MX#"_"#vlmul.MX);
+ def : Pat<(Intr (type reg_class:$rs3), GPR:$rs1,
+ (idx_type idx_reg_class:$rs2), GPR:$vl),
+ (Pseudo $rs3, $rs1, $rs2, (NoX0 GPR:$vl), sew)>;
+ defvar IntrMask = !cast<Intrinsic>(intrinsic # "_mask");
+ defvar PseudoMask = !cast<Instruction>(inst#"_V_"#idx_vlmul.MX#"_"#vlmul.MX#"_MASK");
+ def : Pat<(IntrMask (type reg_class:$rs3), GPR:$rs1,
+ (idx_type idx_reg_class:$rs2), (mask_type V0), GPR:$vl),
+ (PseudoMask $rs3, $rs1, $rs2, (mask_type V0), (NoX0 GPR:$vl), sew)>;
+}
+
+multiclass VPatUnaryS_M<string intrinsic_name,
+ string inst>
+{
+ foreach mti = AllMasks in {
+ def : Pat<(XLenVT (!cast<Intrinsic>(intrinsic_name)
+ (mti.Mask VR:$rs1), GPR:$vl)),
+ (!cast<Instruction>(inst#"_M_"#mti.BX) $rs1,
+ (NoX0 GPR:$vl), mti.SEW)>;
+ def : Pat<(XLenVT (!cast<Intrinsic>(intrinsic_name # "_mask")
+ (mti.Mask VR:$rs1), (mti.Mask V0), GPR:$vl)),
+ (!cast<Instruction>(inst#"_M_"#mti.BX#"_MASK") $rs1,
+ (mti.Mask V0), (NoX0 GPR:$vl), mti.SEW)>;
+ }
+}
+
+multiclass VPatUnaryV_V_AnyMask<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist> {
+ foreach vti = vtilist in {
+ def : VPatUnaryAnyMask<intrinsic, instruction, "VM",
+ vti.Vector, vti.Vector, vti.Mask,
+ vti.SEW, vti.LMul, vti.RegClass,
+ vti.RegClass>;
+ }
+}
+
+multiclass VPatUnaryM_M<string intrinsic,
+ string inst>
+{
+ foreach mti = AllMasks in {
+ def : VPatMaskUnaryNoMask<intrinsic, inst, mti>;
+ def : VPatMaskUnaryMask<intrinsic, inst, mti>;
+ }
+}
+
+multiclass VPatUnaryV_M<string intrinsic, string instruction>
+{
+ foreach vti = AllIntegerVectors in {
+ def : VPatUnaryNoMask<intrinsic, instruction, "M", vti.Vector, vti.Mask,
+ vti.SEW, vti.LMul, VR>;
+ def : VPatUnaryMask<intrinsic, instruction, "M", vti.Vector, vti.Mask,
+ vti.Mask, vti.SEW, vti.LMul, vti.RegClass, VR>;
+ }
+}
+
+multiclass VPatUnaryV_VF<string intrinsic, string instruction, string suffix,
+ list<VTypeInfoToFraction> fractionList>
+{
+ foreach vtiTofti = fractionList in
+ {
+ defvar vti = vtiTofti.Vti;
+ defvar fti = vtiTofti.Fti;
+ def : VPatUnaryNoMask<intrinsic, instruction, suffix,
+ vti.Vector, fti.Vector,
+ vti.SEW, vti.LMul, fti.RegClass>;
+ def : VPatUnaryMask<intrinsic, instruction, suffix,
+ vti.Vector, fti.Vector, vti.Mask,
+ vti.SEW, vti.LMul, vti.RegClass, fti.RegClass>;
+ }
+}
+
+multiclass VPatUnaryV_V<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist> {
+ foreach vti = vtilist in {
+ def : VPatUnaryNoMask<intrinsic, instruction, "V",
+ vti.Vector, vti.Vector,
+ vti.SEW, vti.LMul, vti.RegClass>;
+ def : VPatUnaryMask<intrinsic, instruction, "V",
+ vti.Vector, vti.Vector, vti.Mask,
+ vti.SEW, vti.LMul, vti.RegClass, vti.RegClass>;
+ }
+}
+
+multiclass VPatNullaryV<string intrinsic, string instruction>
+{
+ foreach vti = AllIntegerVectors in {
+ def : Pat<(vti.Vector (!cast<Intrinsic>(intrinsic)
+ (XLenVT GPR:$vl))),
+ (!cast<Instruction>(instruction#"_V_" # vti.LMul.MX)
+ (NoX0 GPR:$vl), vti.SEW)>;
+ def : Pat<(vti.Vector (!cast<Intrinsic>(intrinsic # "_mask")
+ (vti.Vector vti.RegClass:$merge),
+ (vti.Mask V0), (XLenVT GPR:$vl))),
+ (!cast<Instruction>(instruction#"_V_" # vti.LMul.MX # "_MASK")
+ vti.RegClass:$merge, (vti.Mask V0),
+ (NoX0 GPR:$vl), vti.SEW)>;
+ }
+}
+
+multiclass VPatNullaryM<string intrinsic, string inst> {
+ foreach mti = AllMasks in
+ def : Pat<(mti.Mask (!cast<Intrinsic>(intrinsic)
+ (XLenVT GPR:$vl))),
+ (!cast<Instruction>(inst#"_M_"#mti.BX)
+ (NoX0 GPR:$vl), mti.SEW)>;
+}
+
+multiclass VPatBinary<string intrinsic,
+ string inst,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType op2_type,
+ ValueType mask_type,
+ int sew,
+ VReg result_reg_class,
+ VReg op1_reg_class,
+ DAGOperand op2_kind>
+{
+ def : VPatBinaryNoMask<intrinsic, inst, result_type, op1_type, op2_type,
+ sew, op1_reg_class, op2_kind>;
+ def : VPatBinaryMask<intrinsic, inst, result_type, op1_type, op2_type,
+ mask_type, sew, result_reg_class, op1_reg_class,
+ op2_kind>;
+}
+
+multiclass VPatBinaryCarryIn<string intrinsic,
+ string inst,
+ string kind,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType op2_type,
+ ValueType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ VReg op1_reg_class,
+ DAGOperand op2_kind>
+{
+ def : Pat<(result_type (!cast<Intrinsic>(intrinsic)
+ (op1_type op1_reg_class:$rs1),
+ (op2_type op2_kind:$rs2),
+ (mask_type V0),
+ (XLenVT GPR:$vl))),
+ (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX)
+ (op1_type op1_reg_class:$rs1),
+ (op2_type op2_kind:$rs2),
+ (mask_type V0), (NoX0 GPR:$vl), sew)>;
+}
+
+multiclass VPatBinaryMaskOut<string intrinsic,
+ string inst,
+ string kind,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType op2_type,
+ int sew,
+ LMULInfo vlmul,
+ VReg op1_reg_class,
+ DAGOperand op2_kind>
+{
+ def : Pat<(result_type (!cast<Intrinsic>(intrinsic)
+ (op1_type op1_reg_class:$rs1),
+ (op2_type op2_kind:$rs2),
+ (XLenVT GPR:$vl))),
+ (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX)
+ (op1_type op1_reg_class:$rs1),
+ (op2_type op2_kind:$rs2),
+ (NoX0 GPR:$vl), sew)>;
+}
+
+multiclass VPatConversion<string intrinsic,
+ string inst,
+ string kind,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ VReg result_reg_class,
+ VReg op1_reg_class>
+{
+ def : VPatUnaryNoMask<intrinsic, inst, kind, result_type, op1_type,
+ sew, vlmul, op1_reg_class>;
+ def : VPatUnaryMask<intrinsic, inst, kind, result_type, op1_type,
+ mask_type, sew, vlmul, result_reg_class, op1_reg_class>;
+}
+
+multiclass VPatBinaryV_VV<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist> {
+ foreach vti = vtilist in
+ defm : VPatBinary<intrinsic, instruction # "_VV_" # vti.LMul.MX,
+ vti.Vector, vti.Vector, vti.Vector,vti.Mask,
+ vti.SEW, vti.RegClass,
+ vti.RegClass, vti.RegClass>;
+}
+
+multiclass VPatBinaryV_VV_INT<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist> {
+ foreach vti = vtilist in {
+ defvar ivti = GetIntVTypeInfo<vti>.Vti;
+ defm : VPatBinary<intrinsic, instruction # "_VV_" # vti.LMul.MX,
+ vti.Vector, vti.Vector, ivti.Vector, vti.Mask,
+ vti.SEW, vti.RegClass,
+ vti.RegClass, vti.RegClass>;
+ }
+}
+
+multiclass VPatBinaryV_VV_INT_EEW<string intrinsic, string instruction,
+ int eew, list<VTypeInfo> vtilist> {
+ foreach vti = vtilist in {
+ // emul = lmul * eew / sew
+ defvar vlmul = vti.LMul;
+ defvar octuple_lmul = octuple_from_str<vlmul.MX>.ret;
+ defvar octuple_emul = !srl(!mul(octuple_lmul, eew), shift_amount<vti.SEW>.val);
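+    // Illustrative example (values not from this patch): for SEW=8, LMUL=M1
+    // (octuple 8) and EEW=16, octuple_emul = (8 * 16) >> 3 = 16, i.e. EMUL=M2,
+    // so ivti resolves to VI16M2 and the index operand uses its register class.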
+ if !and(!ge(octuple_emul, 1), !le(octuple_emul, 64)) then {
+ defvar emul_str = octuple_to_str<octuple_emul>.ret;
+ defvar ivti = !cast<VTypeInfo>("VI" # eew # emul_str);
+ defvar inst = instruction # "_VV_" # vti.LMul.MX # "_" # emul_str;
+ defm : VPatBinary<intrinsic, inst,
+ vti.Vector, vti.Vector, ivti.Vector, vti.Mask,
+ vti.SEW, vti.RegClass,
+ vti.RegClass, ivti.RegClass>;
+ }
+ }
+}
+
+multiclass VPatBinaryV_VX<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist> {
+ foreach vti = vtilist in {
+ defvar kind = "V"#vti.ScalarSuffix;
+ defm : VPatBinary<intrinsic, instruction#"_"#kind#"_"#vti.LMul.MX,
+ vti.Vector, vti.Vector, vti.Scalar, vti.Mask,
+ vti.SEW, vti.RegClass,
+ vti.RegClass, vti.ScalarRegClass>;
+ }
+}
+
+multiclass VPatBinaryV_VX_INT<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist> {
+ foreach vti = vtilist in
+ defm : VPatBinary<intrinsic, instruction # "_VX_" # vti.LMul.MX,
+ vti.Vector, vti.Vector, XLenVT, vti.Mask,
+ vti.SEW, vti.RegClass,
+ vti.RegClass, GPR>;
+}
+
+multiclass VPatBinaryV_VI<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist, Operand imm_type> {
+ foreach vti = vtilist in
+ defm : VPatBinary<intrinsic, instruction # "_VI_" # vti.LMul.MX,
+ vti.Vector, vti.Vector, XLenVT, vti.Mask,
+ vti.SEW, vti.RegClass,
+ vti.RegClass, imm_type>;
+}
+
+multiclass VPatBinaryM_MM<string intrinsic, string instruction> {
+ foreach mti = AllMasks in
+ def : VPatBinaryNoMask<intrinsic, instruction # "_MM_" # mti.LMul.MX,
+ mti.Mask, mti.Mask, mti.Mask,
+ mti.SEW, VR, VR>;
+}
+
+multiclass VPatBinaryW_VV<string intrinsic, string instruction,
+ list<VTypeInfoToWide> vtilist> {
+ foreach VtiToWti = vtilist in {
+ defvar Vti = VtiToWti.Vti;
+ defvar Wti = VtiToWti.Wti;
+ defm : VPatBinary<intrinsic, instruction # "_VV_" # Vti.LMul.MX,
+ Wti.Vector, Vti.Vector, Vti.Vector, Vti.Mask,
+ Vti.SEW, Wti.RegClass,
+ Vti.RegClass, Vti.RegClass>;
+ }
+}
+
+multiclass VPatBinaryW_VX<string intrinsic, string instruction,
+ list<VTypeInfoToWide> vtilist> {
+ foreach VtiToWti = vtilist in {
+ defvar Vti = VtiToWti.Vti;
+ defvar Wti = VtiToWti.Wti;
+ defvar kind = "V"#Vti.ScalarSuffix;
+ defm : VPatBinary<intrinsic, instruction#"_"#kind#"_"#Vti.LMul.MX,
+ Wti.Vector, Vti.Vector, Vti.Scalar, Vti.Mask,
+ Vti.SEW, Wti.RegClass,
+ Vti.RegClass, Vti.ScalarRegClass>;
+ }
+}
+
+multiclass VPatBinaryW_WV<string intrinsic, string instruction,
+ list<VTypeInfoToWide> vtilist> {
+ foreach VtiToWti = vtilist in {
+ defvar Vti = VtiToWti.Vti;
+ defvar Wti = VtiToWti.Wti;
+ defm : VPatBinary<intrinsic, instruction # "_WV_" # Vti.LMul.MX,
+ Wti.Vector, Wti.Vector, Vti.Vector, Vti.Mask,
+ Vti.SEW, Wti.RegClass,
+ Wti.RegClass, Vti.RegClass>;
+ }
+}
+
+multiclass VPatBinaryW_WX<string intrinsic, string instruction,
+ list<VTypeInfoToWide> vtilist> {
+ foreach VtiToWti = vtilist in {
+ defvar Vti = VtiToWti.Vti;
+ defvar Wti = VtiToWti.Wti;
+ defvar kind = "W"#Vti.ScalarSuffix;
+ defm : VPatBinary<intrinsic, instruction#"_"#kind#"_"#Vti.LMul.MX,
+ Wti.Vector, Wti.Vector, Vti.Scalar, Vti.Mask,
+ Vti.SEW, Wti.RegClass,
+ Wti.RegClass, Vti.ScalarRegClass>;
+ }
+}
+
+multiclass VPatBinaryV_WV<string intrinsic, string instruction,
+ list<VTypeInfoToWide> vtilist> {
+ foreach VtiToWti = vtilist in {
+ defvar Vti = VtiToWti.Vti;
+ defvar Wti = VtiToWti.Wti;
+ defm : VPatBinary<intrinsic, instruction # "_WV_" # Vti.LMul.MX,
+ Vti.Vector, Wti.Vector, Vti.Vector, Vti.Mask,
+ Vti.SEW, Vti.RegClass,
+ Wti.RegClass, Vti.RegClass>;
+ }
+}
+
+multiclass VPatBinaryV_WX<string intrinsic, string instruction,
+ list<VTypeInfoToWide> vtilist> {
+ foreach VtiToWti = vtilist in {
+ defvar Vti = VtiToWti.Vti;
+ defvar Wti = VtiToWti.Wti;
+ defvar kind = "W"#Vti.ScalarSuffix;
+ defm : VPatBinary<intrinsic, instruction#"_"#kind#"_"#Vti.LMul.MX,
+ Vti.Vector, Wti.Vector, Vti.Scalar, Vti.Mask,
+ Vti.SEW, Vti.RegClass,
+ Wti.RegClass, Vti.ScalarRegClass>;
+ }
+}
+
+multiclass VPatBinaryV_WI<string intrinsic, string instruction,
+ list<VTypeInfoToWide> vtilist> {
+ foreach VtiToWti = vtilist in {
+ defvar Vti = VtiToWti.Vti;
+ defvar Wti = VtiToWti.Wti;
+ defm : VPatBinary<intrinsic, instruction # "_WI_" # Vti.LMul.MX,
+ Vti.Vector, Wti.Vector, XLenVT, Vti.Mask,
+ Vti.SEW, Vti.RegClass,
+ Wti.RegClass, uimm5>;
+ }
+}
+
+multiclass VPatBinaryV_VM<string intrinsic, string instruction,
+ bit CarryOut = 0,
+ list<VTypeInfo> vtilist = AllIntegerVectors> {
+ foreach vti = vtilist in
+ defm : VPatBinaryCarryIn<intrinsic, instruction, "VVM",
+ !if(CarryOut, vti.Mask, vti.Vector),
+ vti.Vector, vti.Vector, vti.Mask,
+ vti.SEW, vti.LMul,
+ vti.RegClass, vti.RegClass>;
+}
+
+multiclass VPatBinaryV_XM<string intrinsic, string instruction,
+ bit CarryOut = 0,
+ list<VTypeInfo> vtilist = AllIntegerVectors> {
+ foreach vti = vtilist in
+ defm : VPatBinaryCarryIn<intrinsic, instruction,
+ "V"#vti.ScalarSuffix#"M",
+ !if(CarryOut, vti.Mask, vti.Vector),
+ vti.Vector, vti.Scalar, vti.Mask,
+ vti.SEW, vti.LMul,
+ vti.RegClass, vti.ScalarRegClass>;
+}
+
+multiclass VPatBinaryV_IM<string intrinsic, string instruction,
+ bit CarryOut = 0> {
+ foreach vti = AllIntegerVectors in
+ defm : VPatBinaryCarryIn<intrinsic, instruction, "VIM",
+ !if(CarryOut, vti.Mask, vti.Vector),
+ vti.Vector, XLenVT, vti.Mask,
+ vti.SEW, vti.LMul,
+ vti.RegClass, simm5>;
+}
+
+multiclass VPatBinaryV_V<string intrinsic, string instruction> {
+ foreach vti = AllIntegerVectors in
+ defm : VPatBinaryMaskOut<intrinsic, instruction, "VV",
+ vti.Mask, vti.Vector, vti.Vector,
+ vti.SEW, vti.LMul,
+ vti.RegClass, vti.RegClass>;
+}
+
+multiclass VPatBinaryV_X<string intrinsic, string instruction> {
+ foreach vti = AllIntegerVectors in
+ defm : VPatBinaryMaskOut<intrinsic, instruction, "VX",
+ vti.Mask, vti.Vector, XLenVT,
+ vti.SEW, vti.LMul,
+ vti.RegClass, GPR>;
+}
+
+multiclass VPatBinaryV_I<string intrinsic, string instruction> {
+ foreach vti = AllIntegerVectors in
+ defm : VPatBinaryMaskOut<intrinsic, instruction, "VI",
+ vti.Mask, vti.Vector, XLenVT,
+ vti.SEW, vti.LMul,
+ vti.RegClass, simm5>;
+}
+
+multiclass VPatBinaryM_VV<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist> {
+ foreach vti = vtilist in
+ defm : VPatBinary<intrinsic, instruction # "_VV_" # vti.LMul.MX,
+ vti.Mask, vti.Vector, vti.Vector, vti.Mask,
+ vti.SEW, VR,
+ vti.RegClass, vti.RegClass>;
+}
+
+multiclass VPatBinaryM_VX<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist> {
+ foreach vti = vtilist in {
+ defvar kind = "V"#vti.ScalarSuffix;
+ defm : VPatBinary<intrinsic, instruction#"_"#kind#"_"#vti.LMul.MX,
+ vti.Mask, vti.Vector, vti.Scalar, vti.Mask,
+ vti.SEW, VR,
+ vti.RegClass, vti.ScalarRegClass>;
+ }
+}
+
+multiclass VPatBinaryM_VI<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist> {
+ foreach vti = vtilist in
+ defm : VPatBinary<intrinsic, instruction # "_VI_" # vti.LMul.MX,
+ vti.Mask, vti.Vector, XLenVT, vti.Mask,
+ vti.SEW, VR,
+ vti.RegClass, simm5>;
+}
+
+multiclass VPatBinaryV_VV_VX_VI<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist, Operand ImmType = simm5>
+{
+ defm "" : VPatBinaryV_VV<intrinsic, instruction, vtilist>;
+ defm "" : VPatBinaryV_VX<intrinsic, instruction, vtilist>;
+ defm "" : VPatBinaryV_VI<intrinsic, instruction, vtilist, ImmType>;
+}
+
+multiclass VPatBinaryV_VV_VX<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist>
+{
+ defm "" : VPatBinaryV_VV<intrinsic, instruction, vtilist>;
+ defm "" : VPatBinaryV_VX<intrinsic, instruction, vtilist>;
+}
+
+multiclass VPatBinaryV_VX_VI<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist>
+{
+ defm "" : VPatBinaryV_VX<intrinsic, instruction, vtilist>;
+ defm "" : VPatBinaryV_VI<intrinsic, instruction, vtilist, simm5>;
+}
+
+multiclass VPatBinaryW_VV_VX<string intrinsic, string instruction,
+ list<VTypeInfoToWide> vtilist>
+{
+ defm "" : VPatBinaryW_VV<intrinsic, instruction, vtilist>;
+ defm "" : VPatBinaryW_VX<intrinsic, instruction, vtilist>;
+}
+
+multiclass VPatBinaryW_WV_WX<string intrinsic, string instruction,
+ list<VTypeInfoToWide> vtilist>
+{
+ defm "" : VPatBinaryW_WV<intrinsic, instruction, vtilist>;
+ defm "" : VPatBinaryW_WX<intrinsic, instruction, vtilist>;
+}
+
+multiclass VPatBinaryV_WV_WX_WI<string intrinsic, string instruction,
+ list<VTypeInfoToWide> vtilist>
+{
+ defm "" : VPatBinaryV_WV<intrinsic, instruction, vtilist>;
+ defm "" : VPatBinaryV_WX<intrinsic, instruction, vtilist>;
+ defm "" : VPatBinaryV_WI<intrinsic, instruction, vtilist>;
+}
+
+multiclass VPatBinaryV_VM_XM_IM<string intrinsic, string instruction>
+{
+ defm "" : VPatBinaryV_VM<intrinsic, instruction>;
+ defm "" : VPatBinaryV_XM<intrinsic, instruction>;
+ defm "" : VPatBinaryV_IM<intrinsic, instruction>;
+}
+
+multiclass VPatBinaryM_VM_XM_IM<string intrinsic, string instruction>
+{
+ defm "" : VPatBinaryV_VM<intrinsic, instruction, /*CarryOut=*/1>;
+ defm "" : VPatBinaryV_XM<intrinsic, instruction, /*CarryOut=*/1>;
+ defm "" : VPatBinaryV_IM<intrinsic, instruction, /*CarryOut=*/1>;
+}
+
+multiclass VPatBinaryM_V_X_I<string intrinsic, string instruction>
+{
+ defm "" : VPatBinaryV_V<intrinsic, instruction>;
+ defm "" : VPatBinaryV_X<intrinsic, instruction>;
+ defm "" : VPatBinaryV_I<intrinsic, instruction>;
+}
+
+multiclass VPatBinaryV_VM_XM<string intrinsic, string instruction>
+{
+ defm "" : VPatBinaryV_VM<intrinsic, instruction>;
+ defm "" : VPatBinaryV_XM<intrinsic, instruction>;
+}
+
+multiclass VPatBinaryM_VM_XM<string intrinsic, string instruction>
+{
+ defm "" : VPatBinaryV_VM<intrinsic, instruction, /*CarryOut=*/1>;
+ defm "" : VPatBinaryV_XM<intrinsic, instruction, /*CarryOut=*/1>;
+}
+
+multiclass VPatBinaryM_V_X<string intrinsic, string instruction>
+{
+ defm "" : VPatBinaryV_V<intrinsic, instruction>;
+ defm "" : VPatBinaryV_X<intrinsic, instruction>;
+}
+
+multiclass VPatTernary<string intrinsic,
+ string inst,
+ string kind,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType op2_type,
+ ValueType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ VReg result_reg_class,
+ RegisterClass op1_reg_class,
+ DAGOperand op2_kind> {
+ def : VPatTernaryNoMask<intrinsic, inst, kind, result_type, op1_type, op2_type,
+ mask_type, sew, vlmul, result_reg_class, op1_reg_class,
+ op2_kind>;
+ def : VPatTernaryMask<intrinsic, inst, kind, result_type, op1_type, op2_type,
+ mask_type, sew, vlmul, result_reg_class, op1_reg_class,
+ op2_kind>;
+}
+
+multiclass VPatTernaryV_VV<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist> {
+ foreach vti = vtilist in
+ defm : VPatTernary<intrinsic, instruction, "VV",
+ vti.Vector, vti.Vector, vti.Vector, vti.Mask,
+ vti.SEW, vti.LMul, vti.RegClass,
+ vti.RegClass, vti.RegClass>;
+}
+
+multiclass VPatTernaryV_VX<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist> {
+ foreach vti = vtilist in
+ defm : VPatTernary<intrinsic, instruction, "VX",
+ vti.Vector, vti.Vector, XLenVT, vti.Mask,
+ vti.SEW, vti.LMul, vti.RegClass,
+ vti.RegClass, GPR>;
+}
+
+multiclass VPatTernaryV_VX_AAXA<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist> {
+ foreach vti = vtilist in
+ defm : VPatTernary<intrinsic, instruction,
+ "V"#vti.ScalarSuffix,
+ vti.Vector, vti.Scalar, vti.Vector, vti.Mask,
+ vti.SEW, vti.LMul, vti.RegClass,
+ vti.ScalarRegClass, vti.RegClass>;
+}
+
+multiclass VPatTernaryV_VI<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist, Operand Imm_type> {
+ foreach vti = vtilist in
+ defm : VPatTernary<intrinsic, instruction, "VI",
+ vti.Vector, vti.Vector, XLenVT, vti.Mask,
+ vti.SEW, vti.LMul, vti.RegClass,
+ vti.RegClass, Imm_type>;
+}
+
+multiclass VPatTernaryW_VV<string intrinsic, string instruction,
+ list<VTypeInfoToWide> vtilist> {
+ foreach vtiToWti = vtilist in {
+ defvar vti = vtiToWti.Vti;
+ defvar wti = vtiToWti.Wti;
+ defm : VPatTernary<intrinsic, instruction, "VV",
+ wti.Vector, vti.Vector, vti.Vector,
+ vti.Mask, vti.SEW, vti.LMul,
+ wti.RegClass, vti.RegClass, vti.RegClass>;
+ }
+}
+
+multiclass VPatTernaryW_VX<string intrinsic, string instruction,
+ list<VTypeInfoToWide> vtilist> {
+ foreach vtiToWti = vtilist in {
+ defvar vti = vtiToWti.Vti;
+ defvar wti = vtiToWti.Wti;
+ defm : VPatTernary<intrinsic, instruction,
+ "V"#vti.ScalarSuffix,
+ wti.Vector, vti.Scalar, vti.Vector,
+ vti.Mask, vti.SEW, vti.LMul,
+ wti.RegClass, vti.ScalarRegClass, vti.RegClass>;
+ }
+}
+
+multiclass VPatTernaryV_VV_VX_AAXA<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist> {
+ defm "" : VPatTernaryV_VV<intrinsic, instruction, vtilist>;
+ defm "" : VPatTernaryV_VX_AAXA<intrinsic, instruction, vtilist>;
+}
+
+multiclass VPatTernaryV_VX_VI<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist, Operand Imm_type = simm5> {
+ defm "" : VPatTernaryV_VX<intrinsic, instruction, vtilist>;
+ defm "" : VPatTernaryV_VI<intrinsic, instruction, vtilist, Imm_type>;
+}
+
+multiclass VPatBinaryM_VV_VX_VI<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist>
+{
+ defm "" : VPatBinaryM_VV<intrinsic, instruction, vtilist>;
+ defm "" : VPatBinaryM_VX<intrinsic, instruction, vtilist>;
+ defm "" : VPatBinaryM_VI<intrinsic, instruction, vtilist>;
+}
+
+multiclass VPatTernaryW_VV_VX<string intrinsic, string instruction,
+ list<VTypeInfoToWide> vtilist> {
+ defm "" : VPatTernaryW_VV<intrinsic, instruction, vtilist>;
+ defm "" : VPatTernaryW_VX<intrinsic, instruction, vtilist>;
+}
+
+multiclass VPatBinaryM_VV_VX<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist>
+{
+ defm "" : VPatBinaryM_VV<intrinsic, instruction, vtilist>;
+ defm "" : VPatBinaryM_VX<intrinsic, instruction, vtilist>;
+}
+
+multiclass VPatBinaryM_VX_VI<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist>
+{
+ defm "" : VPatBinaryM_VX<intrinsic, instruction, vtilist>;
+ defm "" : VPatBinaryM_VI<intrinsic, instruction, vtilist>;
+}
+
+multiclass VPatBinaryV_VV_VX_VI_INT<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist, Operand ImmType = simm5>
+{
+ defm "" : VPatBinaryV_VV_INT<intrinsic, instruction, vtilist>;
+ defm "" : VPatBinaryV_VX_INT<intrinsic, instruction, vtilist>;
+ defm "" : VPatBinaryV_VI<intrinsic, instruction, vtilist, ImmType>;
+}
+
+multiclass VPatReductionV_VS<string intrinsic, string instruction, bit IsFloat = 0> {
+ foreach vti = !if(IsFloat, NoGroupFloatVectors, NoGroupIntegerVectors) in
+ {
+ defvar vectorM1 = !cast<VTypeInfo>(!if(IsFloat, "VF", "VI") # vti.SEW # "M1");
+ defm : VPatTernary<intrinsic, instruction, "VS",
+ vectorM1.Vector, vti.Vector,
+ vectorM1.Vector, vti.Mask,
+ vti.SEW, vti.LMul,
+ VR, vti.RegClass, VR>;
+ }
+ foreach gvti = !if(IsFloat, GroupFloatVectors, GroupIntegerVectors) in
+ {
+ defm : VPatTernary<intrinsic, instruction, "VS",
+ gvti.VectorM1, gvti.Vector,
+ gvti.VectorM1, gvti.Mask,
+ gvti.SEW, gvti.LMul,
+ VR, gvti.RegClass, VR>;
+ }
+}
+
+multiclass VPatReductionW_VS<string intrinsic, string instruction, bit IsFloat = 0> {
+ foreach vti = !if(IsFloat, AllFloatVectors, AllIntegerVectors) in
+ {
+ defvar wtiSEW = !mul(vti.SEW, 2);
+ if !le(wtiSEW, 64) then {
+ defvar wtiM1 = !cast<VTypeInfo>(!if(IsFloat, "VF", "VI") # wtiSEW # "M1");
+ defm : VPatTernary<intrinsic, instruction, "VS",
+ wtiM1.Vector, vti.Vector,
+ wtiM1.Vector, vti.Mask,
+ vti.SEW, vti.LMul,
+ wtiM1.RegClass, vti.RegClass,
+ wtiM1.RegClass>;
+ }
+ }
+}
+
+multiclass VPatConversionVI_VF<string intrinsic,
+ string instruction>
+{
+ foreach fvti = AllFloatVectors in
+ {
+ defvar ivti = GetIntVTypeInfo<fvti>.Vti;
+
+ defm : VPatConversion<intrinsic, instruction, "V",
+ ivti.Vector, fvti.Vector, ivti.Mask, fvti.SEW,
+ fvti.LMul, ivti.RegClass, fvti.RegClass>;
+ }
+}
+
+multiclass VPatConversionVF_VI<string intrinsic,
+ string instruction>
+{
+ foreach fvti = AllFloatVectors in
+ {
+ defvar ivti = GetIntVTypeInfo<fvti>.Vti;
+
+ defm : VPatConversion<intrinsic, instruction, "V",
+ fvti.Vector, ivti.Vector, fvti.Mask, ivti.SEW,
+ ivti.LMul, fvti.RegClass, ivti.RegClass>;
+ }
+}
+
+multiclass VPatConversionWI_VF<string intrinsic, string instruction> {
+ foreach fvtiToFWti = AllWidenableFloatVectors in
+ {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar iwti = GetIntVTypeInfo<fvtiToFWti.Wti>.Vti;
+
+ defm : VPatConversion<intrinsic, instruction, "V",
+ iwti.Vector, fvti.Vector, iwti.Mask, fvti.SEW,
+ fvti.LMul, iwti.RegClass, fvti.RegClass>;
+ }
+}
+
+multiclass VPatConversionWF_VI<string intrinsic, string instruction> {
+ foreach vtiToWti = AllWidenableIntToFloatVectors in
+ {
+ defvar vti = vtiToWti.Vti;
+ defvar fwti = vtiToWti.Wti;
+
+ defm : VPatConversion<intrinsic, instruction, "V",
+ fwti.Vector, vti.Vector, fwti.Mask, vti.SEW,
+ vti.LMul, fwti.RegClass, vti.RegClass>;
+ }
+}
+
+multiclass VPatConversionWF_VF <string intrinsic, string instruction> {
+ foreach fvtiToFWti = AllWidenableFloatVectors in
+ {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar fwti = fvtiToFWti.Wti;
+
+ defm : VPatConversion<intrinsic, instruction, "V",
+ fwti.Vector, fvti.Vector, fwti.Mask, fvti.SEW,
+ fvti.LMul, fwti.RegClass, fvti.RegClass>;
+ }
+}
+
+multiclass VPatConversionVI_WF <string intrinsic, string instruction> {
+ foreach vtiToWti = AllWidenableIntToFloatVectors in
+ {
+ defvar vti = vtiToWti.Vti;
+ defvar fwti = vtiToWti.Wti;
+
+ defm : VPatConversion<intrinsic, instruction, "W",
+ vti.Vector, fwti.Vector, vti.Mask, vti.SEW,
+ vti.LMul, vti.RegClass, fwti.RegClass>;
+ }
+}
+
+multiclass VPatConversionVF_WI <string intrinsic, string instruction> {
+ foreach fvtiToFWti = AllWidenableFloatVectors in
+ {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar iwti = GetIntVTypeInfo<fvtiToFWti.Wti>.Vti;
+
+ defm : VPatConversion<intrinsic, instruction, "W",
+ fvti.Vector, iwti.Vector, fvti.Mask, fvti.SEW,
+ fvti.LMul, fvti.RegClass, iwti.RegClass>;
+ }
+}
+
+multiclass VPatConversionVF_WF <string intrinsic, string instruction> {
+ foreach fvtiToFWti = AllWidenableFloatVectors in
+ {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar fwti = fvtiToFWti.Wti;
+
+ defm : VPatConversion<intrinsic, instruction, "W",
+ fvti.Vector, fwti.Vector, fvti.Mask, fvti.SEW,
+ fvti.LMul, fvti.RegClass, fwti.RegClass>;
+ }
+}
+
+multiclass VPatAMOWD<string intrinsic,
+ string inst,
+ ValueType result_type,
+ ValueType offset_type,
+ ValueType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ LMULInfo emul,
+ VReg op1_reg_class>
+{
+ def : VPatAMOWDNoMask<intrinsic, inst, result_type, offset_type,
+ sew, vlmul, emul, op1_reg_class>;
+ def : VPatAMOWDMask<intrinsic, inst, result_type, offset_type,
+ mask_type, sew, vlmul, emul, op1_reg_class>;
+}
+
+multiclass VPatAMOV_WD<string intrinsic,
+ string inst,
+ list<VTypeInfo> vtilist> {
+ foreach eew = EEWList in {
+ foreach vti = vtilist in {
+ if !or(!eq(vti.SEW, 32), !eq(vti.SEW, 64)) then {
+ defvar octuple_lmul = octuple_from_str<vti.LMul.MX>.ret;
+ // Calculate emul = eew * lmul / sew
+ defvar octuple_emul = !srl(!mul(eew, octuple_lmul), shift_amount<vti.SEW>.val);
+ if !and(!ge(octuple_emul, 1), !le(octuple_emul, 64)) then {
+ defvar emulMX = octuple_to_str<octuple_emul>.ret;
+ defvar offsetVti = !cast<VTypeInfo>("VI" # eew # emulMX);
+ defvar inst_ei = inst # "EI" # eew;
+ defm : VPatAMOWD<intrinsic, inst_ei,
+ vti.Vector, offsetVti.Vector,
+ vti.Mask, vti.SEW, vti.LMul, offsetVti.LMul, offsetVti.RegClass>;
+ }
+ }
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtV] in {
+
+//===----------------------------------------------------------------------===//
+// Pseudo Instructions for CodeGen
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+ def PseudoVMV1R_V : VPseudo<VMV1R_V, V_M1, (outs VR:$vd), (ins VR:$vs2)>;
+ def PseudoVMV2R_V : VPseudo<VMV2R_V, V_M2, (outs VRM2:$vd), (ins VRM2:$vs2)>;
+ def PseudoVMV4R_V : VPseudo<VMV4R_V, V_M4, (outs VRM4:$vd), (ins VRM4:$vs2)>;
+ def PseudoVMV8R_V : VPseudo<VMV8R_V, V_M8, (outs VRM8:$vd), (ins VRM8:$vs2)>;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 1 in {
+ def PseudoReadVLENB : Pseudo<(outs GPR:$rd), (ins),
+ [(set GPR:$rd, (riscv_read_vlenb))]>;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 1,
+ Uses = [VL] in
+def PseudoReadVL : Pseudo<(outs GPR:$rd), (ins),
+ [(set GPR:$rd, (riscv_read_vl))]>;
+
+//===----------------------------------------------------------------------===//
+// 6. Configuration-Setting Instructions
+//===----------------------------------------------------------------------===//
+
+// Pseudos.
+let hasSideEffects = 1, mayLoad = 0, mayStore = 0, Defs = [VL, VTYPE] in {
+def PseudoVSETVLI : Pseudo<(outs GPR:$rd), (ins GPR:$rs1, VTypeIOp:$vtypei), []>;
+
+}
+
+//===----------------------------------------------------------------------===//
+// 7. Vector Loads and Stores
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// 7.4 Vector Unit-Stride Instructions
+//===----------------------------------------------------------------------===//
+
+// Pseudos for Unit-Stride Loads and Stores
+foreach eew = EEWList in {
+ defm PseudoVLE # eew : VPseudoUSLoad;
+ defm PseudoVSE # eew : VPseudoUSStore;
+}
+
+//===----------------------------------------------------------------------===//
+// 7.5 Vector Strided Instructions
+//===----------------------------------------------------------------------===//
+
+// Vector Strided Loads and Stores
+foreach eew = EEWList in {
+ defm PseudoVLSE # eew : VPseudoSLoad;
+ defm PseudoVSSE # eew : VPseudoSStore;
+}
+
+//===----------------------------------------------------------------------===//
+// 7.6 Vector Indexed Instructions
+//===----------------------------------------------------------------------===//
+
+// Vector Indexed Loads and Stores
+foreach eew = EEWList in {
+ defm PseudoVLUXEI # eew : VPseudoILoad;
+ defm PseudoVLOXEI # eew : VPseudoILoad;
+ defm PseudoVSOXEI # eew : VPseudoIStore;
+ defm PseudoVSUXEI # eew : VPseudoIStore;
+}
+
+//===----------------------------------------------------------------------===//
+// 7.7. Unit-stride Fault-Only-First Loads
+//===----------------------------------------------------------------------===//
+
+// vleff may update the VL register
+let hasSideEffects = 1, Defs = [VL] in
+foreach eew = EEWList in {
+ defm PseudoVLE # eew # FF : VPseudoUSLoad;
+}
+
+//===----------------------------------------------------------------------===//
+// 7.8. Vector Load/Store Segment Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVLSEG : VPseudoUSSegLoad</*fault-only-first*/false>;
+defm PseudoVLSSEG : VPseudoSSegLoad;
+defm PseudoVLOXSEG : VPseudoISegLoad;
+defm PseudoVLUXSEG : VPseudoISegLoad;
+defm PseudoVSSEG : VPseudoUSSegStore;
+defm PseudoVSSSEG : VPseudoSSegStore;
+defm PseudoVSOXSEG : VPseudoISegStore;
+defm PseudoVSUXSEG : VPseudoISegStore;
+
+// vlseg<nf>e<eew>ff.v may update the VL register
+let hasSideEffects = 1, Defs = [VL] in
+defm PseudoVLSEG : VPseudoUSSegLoad</*fault-only-first*/true>;
+
+//===----------------------------------------------------------------------===//
+// 8. Vector AMO Operations
+//===----------------------------------------------------------------------===//
+defm PseudoVAMOSWAP : VPseudoAMO;
+defm PseudoVAMOADD : VPseudoAMO;
+defm PseudoVAMOXOR : VPseudoAMO;
+defm PseudoVAMOAND : VPseudoAMO;
+defm PseudoVAMOOR : VPseudoAMO;
+defm PseudoVAMOMIN : VPseudoAMO;
+defm PseudoVAMOMAX : VPseudoAMO;
+defm PseudoVAMOMINU : VPseudoAMO;
+defm PseudoVAMOMAXU : VPseudoAMO;
+
+//===----------------------------------------------------------------------===//
+// 12. Vector Integer Arithmetic Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// 12.1. Vector Single-Width Integer Add and Subtract
+//===----------------------------------------------------------------------===//
+defm PseudoVADD : VPseudoBinaryV_VV_VX_VI;
+defm PseudoVSUB : VPseudoBinaryV_VV_VX;
+defm PseudoVRSUB : VPseudoBinaryV_VX_VI;
+
+//===----------------------------------------------------------------------===//
+// 12.2. Vector Widening Integer Add/Subtract
+//===----------------------------------------------------------------------===//
+defm PseudoVWADDU : VPseudoBinaryW_VV_VX;
+defm PseudoVWSUBU : VPseudoBinaryW_VV_VX;
+defm PseudoVWADD : VPseudoBinaryW_VV_VX;
+defm PseudoVWSUB : VPseudoBinaryW_VV_VX;
+defm PseudoVWADDU : VPseudoBinaryW_WV_WX;
+defm PseudoVWSUBU : VPseudoBinaryW_WV_WX;
+defm PseudoVWADD : VPseudoBinaryW_WV_WX;
+defm PseudoVWSUB : VPseudoBinaryW_WV_WX;
+
+//===----------------------------------------------------------------------===//
+// 12.3. Vector Integer Extension
+//===----------------------------------------------------------------------===//
+defm PseudoVZEXT_VF2 : PseudoUnaryV_VF2;
+defm PseudoVZEXT_VF4 : PseudoUnaryV_VF4;
+defm PseudoVZEXT_VF8 : PseudoUnaryV_VF8;
+defm PseudoVSEXT_VF2 : PseudoUnaryV_VF2;
+defm PseudoVSEXT_VF4 : PseudoUnaryV_VF4;
+defm PseudoVSEXT_VF8 : PseudoUnaryV_VF8;
+
+//===----------------------------------------------------------------------===//
+// 12.4. Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVADC : VPseudoBinaryV_VM_XM_IM;
+defm PseudoVMADC : VPseudoBinaryM_VM_XM_IM<"@earlyclobber $rd">;
+defm PseudoVMADC : VPseudoBinaryM_V_X_I<"@earlyclobber $rd">;
+
+defm PseudoVSBC : VPseudoBinaryV_VM_XM;
+defm PseudoVMSBC : VPseudoBinaryM_VM_XM<"@earlyclobber $rd">;
+defm PseudoVMSBC : VPseudoBinaryM_V_X<"@earlyclobber $rd">;
+
+//===----------------------------------------------------------------------===//
+// 12.5. Vector Bitwise Logical Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVAND : VPseudoBinaryV_VV_VX_VI;
+defm PseudoVOR : VPseudoBinaryV_VV_VX_VI;
+defm PseudoVXOR : VPseudoBinaryV_VV_VX_VI;
+
+//===----------------------------------------------------------------------===//
+// 12.6. Vector Single-Width Bit Shift Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVSLL : VPseudoBinaryV_VV_VX_VI<uimm5>;
+defm PseudoVSRL : VPseudoBinaryV_VV_VX_VI<uimm5>;
+defm PseudoVSRA : VPseudoBinaryV_VV_VX_VI<uimm5>;
+
+//===----------------------------------------------------------------------===//
+// 12.7. Vector Narrowing Integer Right Shift Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVNSRL : VPseudoBinaryV_WV_WX_WI;
+defm PseudoVNSRA : VPseudoBinaryV_WV_WX_WI;
+
+//===----------------------------------------------------------------------===//
+// 12.8. Vector Integer Comparison Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVMSEQ : VPseudoBinaryM_VV_VX_VI;
+defm PseudoVMSNE : VPseudoBinaryM_VV_VX_VI;
+defm PseudoVMSLTU : VPseudoBinaryM_VV_VX;
+defm PseudoVMSLT : VPseudoBinaryM_VV_VX;
+defm PseudoVMSLEU : VPseudoBinaryM_VV_VX_VI;
+defm PseudoVMSLE : VPseudoBinaryM_VV_VX_VI;
+defm PseudoVMSGTU : VPseudoBinaryM_VX_VI;
+defm PseudoVMSGT : VPseudoBinaryM_VX_VI;
+
+//===----------------------------------------------------------------------===//
+// 12.9. Vector Integer Min/Max Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVMINU : VPseudoBinaryV_VV_VX;
+defm PseudoVMIN : VPseudoBinaryV_VV_VX;
+defm PseudoVMAXU : VPseudoBinaryV_VV_VX;
+defm PseudoVMAX : VPseudoBinaryV_VV_VX;
+
+//===----------------------------------------------------------------------===//
+// 12.10. Vector Single-Width Integer Multiply Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVMUL : VPseudoBinaryV_VV_VX;
+defm PseudoVMULH : VPseudoBinaryV_VV_VX;
+defm PseudoVMULHU : VPseudoBinaryV_VV_VX;
+defm PseudoVMULHSU : VPseudoBinaryV_VV_VX;
+
+//===----------------------------------------------------------------------===//
+// 12.11. Vector Integer Divide Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVDIVU : VPseudoBinaryV_VV_VX;
+defm PseudoVDIV : VPseudoBinaryV_VV_VX;
+defm PseudoVREMU : VPseudoBinaryV_VV_VX;
+defm PseudoVREM : VPseudoBinaryV_VV_VX;
+
+//===----------------------------------------------------------------------===//
+// 12.12. Vector Widening Integer Multiply Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVWMUL : VPseudoBinaryW_VV_VX;
+defm PseudoVWMULU : VPseudoBinaryW_VV_VX;
+defm PseudoVWMULSU : VPseudoBinaryW_VV_VX;
+
+//===----------------------------------------------------------------------===//
+// 12.13. Vector Single-Width Integer Multiply-Add Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVMACC : VPseudoTernaryV_VV_VX_AAXA;
+defm PseudoVNMSAC : VPseudoTernaryV_VV_VX_AAXA;
+defm PseudoVMADD : VPseudoTernaryV_VV_VX_AAXA;
+defm PseudoVNMSUB : VPseudoTernaryV_VV_VX_AAXA;
+
+//===----------------------------------------------------------------------===//
+// 12.14. Vector Widening Integer Multiply-Add Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVWMACCU : VPseudoTernaryW_VV_VX;
+defm PseudoVWMACC : VPseudoTernaryW_VV_VX;
+defm PseudoVWMACCSU : VPseudoTernaryW_VV_VX;
+defm PseudoVWMACCUS : VPseudoTernaryW_VX;
+
+//===----------------------------------------------------------------------===//
+// 12.16. Vector Integer Merge Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVMERGE : VPseudoBinaryV_VM_XM_IM;
+
+//===----------------------------------------------------------------------===//
+// 12.17. Vector Integer Move Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVMV_V : VPseudoUnaryV_V_X_I_NoDummyMask;
+
+//===----------------------------------------------------------------------===//
+// 13.1. Vector Single-Width Saturating Add and Subtract
+//===----------------------------------------------------------------------===//
+let Defs = [VXSAT], hasSideEffects = 1 in {
+ defm PseudoVSADDU : VPseudoBinaryV_VV_VX_VI;
+ defm PseudoVSADD : VPseudoBinaryV_VV_VX_VI;
+ defm PseudoVSSUBU : VPseudoBinaryV_VV_VX;
+ defm PseudoVSSUB : VPseudoBinaryV_VV_VX;
+}
+
+//===----------------------------------------------------------------------===//
+// 13.2. Vector Single-Width Averaging Add and Subtract
+//===----------------------------------------------------------------------===//
+let Uses = [VL, VTYPE, VXRM], hasSideEffects = 1 in {
+ defm PseudoVAADDU : VPseudoBinaryV_VV_VX;
+ defm PseudoVAADD : VPseudoBinaryV_VV_VX;
+ defm PseudoVASUBU : VPseudoBinaryV_VV_VX;
+ defm PseudoVASUB : VPseudoBinaryV_VV_VX;
+}
+
+//===----------------------------------------------------------------------===//
+// 13.3. Vector Single-Width Fractional Multiply with Rounding and Saturation
+//===----------------------------------------------------------------------===//
+let Uses = [VL, VTYPE, VXRM], Defs = [VXSAT], hasSideEffects = 1 in {
+ defm PseudoVSMUL : VPseudoBinaryV_VV_VX;
+}
+
+//===----------------------------------------------------------------------===//
+// 13.4. Vector Single-Width Scaling Shift Instructions
+//===----------------------------------------------------------------------===//
+let Uses = [VL, VTYPE, VXRM], hasSideEffects = 1 in {
+ defm PseudoVSSRL : VPseudoBinaryV_VV_VX_VI<uimm5>;
+ defm PseudoVSSRA : VPseudoBinaryV_VV_VX_VI<uimm5>;
+}
+
+//===----------------------------------------------------------------------===//
+// 13.5. Vector Narrowing Fixed-Point Clip Instructions
+//===----------------------------------------------------------------------===//
+let Uses = [VL, VTYPE, VXRM], Defs = [VXSAT], hasSideEffects = 1 in {
+ defm PseudoVNCLIP : VPseudoBinaryV_WV_WX_WI;
+ defm PseudoVNCLIPU : VPseudoBinaryV_WV_WX_WI;
+}
+
+} // Predicates = [HasStdExtV]
+
+let Predicates = [HasStdExtV, HasStdExtF] in {
+//===----------------------------------------------------------------------===//
+// 14.2. Vector Single-Width Floating-Point Add/Subtract Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVFADD : VPseudoBinaryV_VV_VF;
+defm PseudoVFSUB : VPseudoBinaryV_VV_VF;
+defm PseudoVFRSUB : VPseudoBinaryV_VF;
+
+//===----------------------------------------------------------------------===//
+// 14.3. Vector Widening Floating-Point Add/Subtract Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVFWADD : VPseudoBinaryW_VV_VF;
+defm PseudoVFWSUB : VPseudoBinaryW_VV_VF;
+defm PseudoVFWADD : VPseudoBinaryW_WV_WF;
+defm PseudoVFWSUB : VPseudoBinaryW_WV_WF;
+
+//===----------------------------------------------------------------------===//
+// 14.4. Vector Single-Width Floating-Point Multiply/Divide Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVFMUL : VPseudoBinaryV_VV_VF;
+defm PseudoVFDIV : VPseudoBinaryV_VV_VF;
+defm PseudoVFRDIV : VPseudoBinaryV_VF;
+
+//===----------------------------------------------------------------------===//
+// 14.5. Vector Widening Floating-Point Multiply
+//===----------------------------------------------------------------------===//
+defm PseudoVFWMUL : VPseudoBinaryW_VV_VF;
+
+//===----------------------------------------------------------------------===//
+// 14.6. Vector Single-Width Floating-Point Fused Multiply-Add Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVFMACC : VPseudoTernaryV_VV_VF_AAXA;
+defm PseudoVFNMACC : VPseudoTernaryV_VV_VF_AAXA;
+defm PseudoVFMSAC : VPseudoTernaryV_VV_VF_AAXA;
+defm PseudoVFNMSAC : VPseudoTernaryV_VV_VF_AAXA;
+defm PseudoVFMADD : VPseudoTernaryV_VV_VF_AAXA;
+defm PseudoVFNMADD : VPseudoTernaryV_VV_VF_AAXA;
+defm PseudoVFMSUB : VPseudoTernaryV_VV_VF_AAXA;
+defm PseudoVFNMSUB : VPseudoTernaryV_VV_VF_AAXA;
+
+//===----------------------------------------------------------------------===//
+// 14.7. Vector Widening Floating-Point Fused Multiply-Add Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVFWMACC : VPseudoTernaryW_VV_VF;
+defm PseudoVFWNMACC : VPseudoTernaryW_VV_VF;
+defm PseudoVFWMSAC : VPseudoTernaryW_VV_VF;
+defm PseudoVFWNMSAC : VPseudoTernaryW_VV_VF;
+
+//===----------------------------------------------------------------------===//
+// 14.8. Vector Floating-Point Square-Root Instruction
+//===----------------------------------------------------------------------===//
+defm PseudoVFSQRT : VPseudoUnaryV_V;
+
+//===----------------------------------------------------------------------===//
+// 14.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction
+//===----------------------------------------------------------------------===//
+defm PseudoVFRSQRTE7 : VPseudoUnaryV_V;
+
+//===----------------------------------------------------------------------===//
+// 14.10. Vector Floating-Point Reciprocal Estimate Instruction
+//===----------------------------------------------------------------------===//
+defm PseudoVFRECE7 : VPseudoUnaryV_V;
+
+//===----------------------------------------------------------------------===//
+// 14.11. Vector Floating-Point Min/Max Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVFMIN : VPseudoBinaryV_VV_VF;
+defm PseudoVFMAX : VPseudoBinaryV_VV_VF;
+
+//===----------------------------------------------------------------------===//
+// 14.12. Vector Floating-Point Sign-Injection Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVFSGNJ : VPseudoBinaryV_VV_VF;
+defm PseudoVFSGNJN : VPseudoBinaryV_VV_VF;
+defm PseudoVFSGNJX : VPseudoBinaryV_VV_VF;
+
+//===----------------------------------------------------------------------===//
+// 14.13. Vector Floating-Point Compare Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVMFEQ : VPseudoBinaryM_VV_VF;
+defm PseudoVMFNE : VPseudoBinaryM_VV_VF;
+defm PseudoVMFLT : VPseudoBinaryM_VV_VF;
+defm PseudoVMFLE : VPseudoBinaryM_VV_VF;
+defm PseudoVMFGT : VPseudoBinaryM_VF;
+defm PseudoVMFGE : VPseudoBinaryM_VF;
+
+//===----------------------------------------------------------------------===//
+// 14.14. Vector Floating-Point Classify Instruction
+//===----------------------------------------------------------------------===//
+defm PseudoVFCLASS : VPseudoUnaryV_V;
+
+//===----------------------------------------------------------------------===//
+// 14.15. Vector Floating-Point Merge Instruction
+//===----------------------------------------------------------------------===//
+defm PseudoVFMERGE : VPseudoBinaryV_FM;
+
+//===----------------------------------------------------------------------===//
+// 14.16. Vector Floating-Point Move Instruction
+//===----------------------------------------------------------------------===//
+defm PseudoVFMV_V : VPseudoUnaryV_F_NoDummyMask;
+
+//===----------------------------------------------------------------------===//
+// 14.17. Single-Width Floating-Point/Integer Type-Convert Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVFCVT_XU_F : VPseudoConversionV_V;
+defm PseudoVFCVT_X_F : VPseudoConversionV_V;
+defm PseudoVFCVT_RTZ_XU_F : VPseudoConversionV_V;
+defm PseudoVFCVT_RTZ_X_F : VPseudoConversionV_V;
+defm PseudoVFCVT_F_XU : VPseudoConversionV_V;
+defm PseudoVFCVT_F_X : VPseudoConversionV_V;
+
+//===----------------------------------------------------------------------===//
+// 14.18. Widening Floating-Point/Integer Type-Convert Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVFWCVT_XU_F : VPseudoConversionW_V;
+defm PseudoVFWCVT_X_F : VPseudoConversionW_V;
+defm PseudoVFWCVT_RTZ_XU_F : VPseudoConversionW_V;
+defm PseudoVFWCVT_RTZ_X_F : VPseudoConversionW_V;
+defm PseudoVFWCVT_F_XU : VPseudoConversionW_V;
+defm PseudoVFWCVT_F_X : VPseudoConversionW_V;
+defm PseudoVFWCVT_F_F : VPseudoConversionW_V;
+
+//===----------------------------------------------------------------------===//
+// 14.19. Narrowing Floating-Point/Integer Type-Convert Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVFNCVT_XU_F : VPseudoConversionV_W;
+defm PseudoVFNCVT_X_F : VPseudoConversionV_W;
+defm PseudoVFNCVT_RTZ_XU_F : VPseudoConversionV_W;
+defm PseudoVFNCVT_RTZ_X_F : VPseudoConversionV_W;
+defm PseudoVFNCVT_F_XU : VPseudoConversionV_W;
+defm PseudoVFNCVT_F_X : VPseudoConversionV_W;
+defm PseudoVFNCVT_F_F : VPseudoConversionV_W;
+defm PseudoVFNCVT_ROD_F_F : VPseudoConversionV_W;
+} // Predicates = [HasStdExtV, HasStdExtF]
+
+let Predicates = [HasStdExtV] in {
+//===----------------------------------------------------------------------===//
+// 15.1. Vector Single-Width Integer Reduction Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVREDSUM : VPseudoReductionV_VS;
+defm PseudoVREDAND : VPseudoReductionV_VS;
+defm PseudoVREDOR : VPseudoReductionV_VS;
+defm PseudoVREDXOR : VPseudoReductionV_VS;
+defm PseudoVREDMINU : VPseudoReductionV_VS;
+defm PseudoVREDMIN : VPseudoReductionV_VS;
+defm PseudoVREDMAXU : VPseudoReductionV_VS;
+defm PseudoVREDMAX : VPseudoReductionV_VS;
+
+//===----------------------------------------------------------------------===//
+// 15.2. Vector Widening Integer Reduction Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVWREDSUMU : VPseudoReductionV_VS;
+defm PseudoVWREDSUM : VPseudoReductionV_VS;
+} // Predicates = [HasStdExtV]
+
+let Predicates = [HasStdExtV, HasStdExtF] in {
+//===----------------------------------------------------------------------===//
+// 15.3. Vector Single-Width Floating-Point Reduction Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVFREDOSUM : VPseudoReductionV_VS;
+defm PseudoVFREDSUM : VPseudoReductionV_VS;
+defm PseudoVFREDMIN : VPseudoReductionV_VS;
+defm PseudoVFREDMAX : VPseudoReductionV_VS;
+
+//===----------------------------------------------------------------------===//
+// 15.4. Vector Widening Floating-Point Reduction Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVFWREDSUM : VPseudoReductionV_VS;
+defm PseudoVFWREDOSUM : VPseudoReductionV_VS;
+
+} // Predicates = [HasStdExtV, HasStdExtF]
+
+//===----------------------------------------------------------------------===//
+// 16. Vector Mask Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// 16.1 Vector Mask-Register Logical Instructions
+//===----------------------------------------------------------------------===//
+
+defm PseudoVMAND: VPseudoBinaryM_MM;
+defm PseudoVMNAND: VPseudoBinaryM_MM;
+defm PseudoVMANDNOT: VPseudoBinaryM_MM;
+defm PseudoVMXOR: VPseudoBinaryM_MM;
+defm PseudoVMOR: VPseudoBinaryM_MM;
+defm PseudoVMNOR: VPseudoBinaryM_MM;
+defm PseudoVMORNOT: VPseudoBinaryM_MM;
+defm PseudoVMXNOR: VPseudoBinaryM_MM;
+
+// Pseudo instructions
+defm PseudoVMCLR : VPseudoNullaryPseudoM<"VMXOR">;
+defm PseudoVMSET : VPseudoNullaryPseudoM<"VMXNOR">;
+
+//===----------------------------------------------------------------------===//
+// 16.2. Vector mask population count vpopc
+//===----------------------------------------------------------------------===//
+
+defm PseudoVPOPC: VPseudoUnaryS_M;
+
+//===----------------------------------------------------------------------===//
+// 16.3. vfirst find-first-set mask bit
+//===----------------------------------------------------------------------===//
+
+defm PseudoVFIRST: VPseudoUnaryS_M;
+
+//===----------------------------------------------------------------------===//
+// 16.4. vmsbf.m set-before-first mask bit
+//===----------------------------------------------------------------------===//
+defm PseudoVMSBF: VPseudoUnaryM_M;
+
+//===----------------------------------------------------------------------===//
+// 16.5. vmsif.m set-including-first mask bit
+//===----------------------------------------------------------------------===//
+defm PseudoVMSIF: VPseudoUnaryM_M;
+
+//===----------------------------------------------------------------------===//
+// 16.6. vmsof.m set-only-first mask bit
+//===----------------------------------------------------------------------===//
+defm PseudoVMSOF: VPseudoUnaryM_M;
+
+//===----------------------------------------------------------------------===//
+// 16.8. Vector Iota Instruction
+//===----------------------------------------------------------------------===//
+defm PseudoVIOTA_M: VPseudoUnaryV_M;
+
+//===----------------------------------------------------------------------===//
+// 16.9. Vector Element Index Instruction
+//===----------------------------------------------------------------------===//
+defm PseudoVID : VPseudoMaskNullaryV;
+
+//===----------------------------------------------------------------------===//
+// 17. Vector Permutation Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// 17.1. Integer Scalar Move Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtV] in {
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0, usesCustomInserter = 1,
+ Uses = [VL, VTYPE] in {
+ foreach m = MxList.m in {
+ let VLMul = m.value in {
+ let HasSEWOp = 1, BaseInstr = VMV_X_S in
+ def PseudoVMV_X_S # "_" # m.MX: Pseudo<(outs GPR:$rd),
+ (ins m.vrclass:$rs2, ixlenimm:$sew),
+ []>, RISCVVPseudo;
+ let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X, WritesElement0 = 1,
+ Constraints = "$rd = $rs1" in
+ def PseudoVMV_S_X # "_" # m.MX: Pseudo<(outs m.vrclass:$rd),
+ (ins m.vrclass:$rs1, GPR:$rs2,
+ GPR:$vl, ixlenimm:$sew),
+ []>, RISCVVPseudo;
+ }
+ }
+}
+} // Predicates = [HasStdExtV]
+
+//===----------------------------------------------------------------------===//
+// 17.2. Floating-Point Scalar Move Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtV, HasStdExtF] in {
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0, usesCustomInserter = 1,
+ Uses = [VL, VTYPE] in {
+ foreach m = MxList.m in {
+ foreach f = FPList.fpinfo in {
+ let VLMul = m.value in {
+ let HasSEWOp = 1, BaseInstr = VFMV_F_S in
+ def "PseudoVFMV_" # f.FX # "_S_" # m.MX :
+ Pseudo<(outs f.fprclass:$rd),
+ (ins m.vrclass:$rs2,
+ ixlenimm:$sew),
+ []>, RISCVVPseudo;
+ let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F, WritesElement0 = 1,
+ Constraints = "$rd = $rs1" in
+ def "PseudoVFMV_S_" # f.FX # "_" # m.MX :
+ Pseudo<(outs m.vrclass:$rd),
+ (ins m.vrclass:$rs1, f.fprclass:$rs2,
+ GPR:$vl, ixlenimm:$sew),
+ []>, RISCVVPseudo;
+ }
+ }
+ }
+}
+} // Predicates = [HasStdExtV, HasStdExtF]
+
+//===----------------------------------------------------------------------===//
+// 17.3. Vector Slide Instructions
+//===----------------------------------------------------------------------===//
+let Predicates = [HasStdExtV] in {
+ defm PseudoVSLIDEUP : VPseudoTernaryV_VX_VI<uimm5, "@earlyclobber $rd">;
+ defm PseudoVSLIDEDOWN : VPseudoTernaryV_VX_VI<uimm5>;
+ defm PseudoVSLIDE1UP : VPseudoBinaryV_VX<"@earlyclobber $rd">;
+ defm PseudoVSLIDE1DOWN : VPseudoBinaryV_VX;
+} // Predicates = [HasStdExtV]
+
+let Predicates = [HasStdExtV, HasStdExtF] in {
+ defm PseudoVFSLIDE1UP : VPseudoBinaryV_VF<"@earlyclobber $rd">;
+ defm PseudoVFSLIDE1DOWN : VPseudoBinaryV_VF;
+} // Predicates = [HasStdExtV, HasStdExtF]
+
+//===----------------------------------------------------------------------===//
+// 17.4. Vector Register Gather Instructions
+//===----------------------------------------------------------------------===//
+defm PseudoVRGATHER : VPseudoBinaryV_VV_VX_VI<uimm5, "@earlyclobber $rd">;
+defm PseudoVRGATHEREI16 : VPseudoBinaryV_VV_EEW</* eew */ 16, "@earlyclobber $rd">;
+
+//===----------------------------------------------------------------------===//
+// 17.5. Vector Compress Instruction
+//===----------------------------------------------------------------------===//
+defm PseudoVCOMPRESS : VPseudoUnaryV_V_AnyMask;
+
+//===----------------------------------------------------------------------===//
+// Patterns.
+//===----------------------------------------------------------------------===//
+let Predicates = [HasStdExtV] in {
+
+//===----------------------------------------------------------------------===//
+// 7. Vector Loads and Stores
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// 7.4 Vector Unit-Stride Instructions
+//===----------------------------------------------------------------------===//
+
+foreach vti = AllVectors in
+{
+ defm : VPatUSLoad<"int_riscv_vle",
+ "PseudoVLE" # vti.SEW,
+ vti.Vector, vti.Mask, vti.SEW, vti.LMul, vti.RegClass>;
+ defm : VPatUSLoadFF<"PseudoVLE" # vti.SEW # "FF",
+ vti.Vector, vti.Mask, vti.SEW, vti.LMul, vti.RegClass>;
+ defm : VPatUSStore<"int_riscv_vse",
+ "PseudoVSE" # vti.SEW,
+ vti.Vector, vti.Mask, vti.SEW, vti.LMul, vti.RegClass>;
+}
+
+//===----------------------------------------------------------------------===//
+// 7.5 Vector Strided Instructions
+//===----------------------------------------------------------------------===//
+
+foreach vti = AllVectors in
+{
+ defm : VPatSLoad<"int_riscv_vlse",
+ "PseudoVLSE" # vti.SEW,
+ vti.Vector, vti.Mask, vti.SEW, vti.LMul, vti.RegClass>;
+ defm : VPatSStore<"int_riscv_vsse",
+ "PseudoVSSE" # vti.SEW,
+ vti.Vector, vti.Mask, vti.SEW, vti.LMul, vti.RegClass>;
+}
+
+//===----------------------------------------------------------------------===//
+// 7.6 Vector Indexed Instructions
+//===----------------------------------------------------------------------===//
+
+foreach vti = AllVectors in
+foreach eew = EEWList in {
+ defvar vlmul = vti.LMul;
+ defvar octuple_lmul = octuple_from_str<vti.LMul.MX>.ret;
+ defvar log_sew = shift_amount<vti.SEW>.val;
+  // The data vector register group has EEW=SEW and EMUL=LMUL, while the offset
+  // vector register group has its EEW encoded in the instruction and
+  // EMUL=(EEW/SEW)*LMUL.
+  // Calculate the octuple ELMUL, which is (eew * octuple_lmul) >> log_sew.
+ defvar octuple_elmul = !srl(!mul(eew, octuple_lmul), log_sew);
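+  // Illustrative example (values not from this patch): for SEW=32, LMUL=M1
+  // (octuple 8) and EEW=64, octuple_elmul = (64 * 8) >> 5 = 16, i.e. ELMUL=M2,
+  // so the index vector would be drawn from VI64M2.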
+  // A legal octuple ELMUL must be greater than 0 and less than or equal to 64.
+ if !gt(octuple_elmul, 0) then {
+ if !le(octuple_elmul, 64) then {
+ defvar elmul_str = octuple_to_str<octuple_elmul>.ret;
+      defvar elmul = !cast<LMULInfo>("V_" # elmul_str);
+ defvar idx_vti = !cast<VTypeInfo>("VI" # eew # elmul_str);
+
+ defm : VPatILoad<"int_riscv_vluxei",
+ "PseudoVLUXEI"#eew,
+ vti.Vector, idx_vti.Vector, vti.Mask, vti.SEW,
+ vlmul, elmul, vti.RegClass, idx_vti.RegClass>;
+ defm : VPatILoad<"int_riscv_vloxei",
+ "PseudoVLOXEI"#eew,
+ vti.Vector, idx_vti.Vector, vti.Mask, vti.SEW,
+ vlmul, elmul, vti.RegClass, idx_vti.RegClass>;
+ defm : VPatIStore<"int_riscv_vsoxei",
+ "PseudoVSOXEI"#eew,
+ vti.Vector, idx_vti.Vector, vti.Mask, vti.SEW,
+ vlmul, elmul, vti.RegClass, idx_vti.RegClass>;
+ defm : VPatIStore<"int_riscv_vsuxei",
+ "PseudoVSUXEI"#eew,
+ vti.Vector, idx_vti.Vector, vti.Mask, vti.SEW,
+ vlmul, elmul, vti.RegClass, idx_vti.RegClass>;
+ }
+ }
+}
+} // Predicates = [HasStdExtV]
+
+//===----------------------------------------------------------------------===//
+// 8. Vector AMO Operations
+//===----------------------------------------------------------------------===//
+let Predicates = [HasStdExtZvamo] in {
+ defm "" : VPatAMOV_WD<"int_riscv_vamoswap", "PseudoVAMOSWAP", AllIntegerVectors>;
+ defm "" : VPatAMOV_WD<"int_riscv_vamoadd", "PseudoVAMOADD", AllIntegerVectors>;
+ defm "" : VPatAMOV_WD<"int_riscv_vamoxor", "PseudoVAMOXOR", AllIntegerVectors>;
+ defm "" : VPatAMOV_WD<"int_riscv_vamoand", "PseudoVAMOAND", AllIntegerVectors>;
+ defm "" : VPatAMOV_WD<"int_riscv_vamoor", "PseudoVAMOOR", AllIntegerVectors>;
+ defm "" : VPatAMOV_WD<"int_riscv_vamomin", "PseudoVAMOMIN", AllIntegerVectors>;
+ defm "" : VPatAMOV_WD<"int_riscv_vamomax", "PseudoVAMOMAX", AllIntegerVectors>;
+ defm "" : VPatAMOV_WD<"int_riscv_vamominu", "PseudoVAMOMINU", AllIntegerVectors>;
+ defm "" : VPatAMOV_WD<"int_riscv_vamomaxu", "PseudoVAMOMAXU", AllIntegerVectors>;
+} // Predicates = [HasStdExtZvamo]
+
+let Predicates = [HasStdExtZvamo, HasStdExtF] in {
+ defm "" : VPatAMOV_WD<"int_riscv_vamoswap", "PseudoVAMOSWAP", AllFloatVectors>;
+} // Predicates = [HasStdExtZvamo, HasStdExtF]
+
+//===----------------------------------------------------------------------===//
+// 12. Vector Integer Arithmetic Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtV] in {
+//===----------------------------------------------------------------------===//
+// 12.1. Vector Single-Width Integer Add and Subtract
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryV_VV_VX_VI<"int_riscv_vadd", "PseudoVADD", AllIntegerVectors>;
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vsub", "PseudoVSUB", AllIntegerVectors>;
+defm "" : VPatBinaryV_VX_VI<"int_riscv_vrsub", "PseudoVRSUB", AllIntegerVectors>;
+
+//===----------------------------------------------------------------------===//
+// 12.2. Vector Widening Integer Add/Subtract
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryW_VV_VX<"int_riscv_vwaddu", "PseudoVWADDU", AllWidenableIntVectors>;
+defm "" : VPatBinaryW_VV_VX<"int_riscv_vwsubu", "PseudoVWSUBU", AllWidenableIntVectors>;
+defm "" : VPatBinaryW_VV_VX<"int_riscv_vwadd", "PseudoVWADD", AllWidenableIntVectors>;
+defm "" : VPatBinaryW_VV_VX<"int_riscv_vwsub", "PseudoVWSUB", AllWidenableIntVectors>;
+defm "" : VPatBinaryW_WV_WX<"int_riscv_vwaddu_w", "PseudoVWADDU", AllWidenableIntVectors>;
+defm "" : VPatBinaryW_WV_WX<"int_riscv_vwsubu_w", "PseudoVWSUBU", AllWidenableIntVectors>;
+defm "" : VPatBinaryW_WV_WX<"int_riscv_vwadd_w", "PseudoVWADD", AllWidenableIntVectors>;
+defm "" : VPatBinaryW_WV_WX<"int_riscv_vwsub_w", "PseudoVWSUB", AllWidenableIntVectors>;
+
+//===----------------------------------------------------------------------===//
+// 12.3. Vector Integer Extension
+//===----------------------------------------------------------------------===//
+defm "" : VPatUnaryV_VF<"int_riscv_vzext", "PseudoVZEXT", "VF2",
+ AllFractionableVF2IntVectors>;
+defm "" : VPatUnaryV_VF<"int_riscv_vzext", "PseudoVZEXT", "VF4",
+ AllFractionableVF4IntVectors>;
+defm "" : VPatUnaryV_VF<"int_riscv_vzext", "PseudoVZEXT", "VF8",
+ AllFractionableVF8IntVectors>;
+defm "" : VPatUnaryV_VF<"int_riscv_vsext", "PseudoVSEXT", "VF2",
+ AllFractionableVF2IntVectors>;
+defm "" : VPatUnaryV_VF<"int_riscv_vsext", "PseudoVSEXT", "VF4",
+ AllFractionableVF4IntVectors>;
+defm "" : VPatUnaryV_VF<"int_riscv_vsext", "PseudoVSEXT", "VF8",
+ AllFractionableVF8IntVectors>;
+
+//===----------------------------------------------------------------------===//
+// 12.4. Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryV_VM_XM_IM<"int_riscv_vadc", "PseudoVADC">;
+defm "" : VPatBinaryM_VM_XM_IM<"int_riscv_vmadc_carry_in", "PseudoVMADC">;
+defm "" : VPatBinaryM_V_X_I<"int_riscv_vmadc", "PseudoVMADC">;
+
+defm "" : VPatBinaryV_VM_XM<"int_riscv_vsbc", "PseudoVSBC">;
+defm "" : VPatBinaryM_VM_XM<"int_riscv_vmsbc_borrow_in", "PseudoVMSBC">;
+defm "" : VPatBinaryM_V_X<"int_riscv_vmsbc", "PseudoVMSBC">;
+
+//===----------------------------------------------------------------------===//
+// 12.5. Vector Bitwise Logical Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryV_VV_VX_VI<"int_riscv_vand", "PseudoVAND", AllIntegerVectors>;
+defm "" : VPatBinaryV_VV_VX_VI<"int_riscv_vor", "PseudoVOR", AllIntegerVectors>;
+defm "" : VPatBinaryV_VV_VX_VI<"int_riscv_vxor", "PseudoVXOR", AllIntegerVectors>;
+
+//===----------------------------------------------------------------------===//
+// 12.6. Vector Single-Width Bit Shift Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryV_VV_VX_VI<"int_riscv_vsll", "PseudoVSLL", AllIntegerVectors,
+ uimm5>;
+defm "" : VPatBinaryV_VV_VX_VI<"int_riscv_vsrl", "PseudoVSRL", AllIntegerVectors,
+ uimm5>;
+defm "" : VPatBinaryV_VV_VX_VI<"int_riscv_vsra", "PseudoVSRA", AllIntegerVectors,
+ uimm5>;
+
+//===----------------------------------------------------------------------===//
+// 12.7. Vector Narrowing Integer Right Shift Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryV_WV_WX_WI<"int_riscv_vnsrl", "PseudoVNSRL", AllWidenableIntVectors>;
+defm "" : VPatBinaryV_WV_WX_WI<"int_riscv_vnsra", "PseudoVNSRA", AllWidenableIntVectors>;
+
+//===----------------------------------------------------------------------===//
+// 12.8. Vector Integer Comparison Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryM_VV_VX_VI<"int_riscv_vmseq", "PseudoVMSEQ", AllIntegerVectors>;
+defm "" : VPatBinaryM_VV_VX_VI<"int_riscv_vmsne", "PseudoVMSNE", AllIntegerVectors>;
+defm "" : VPatBinaryM_VV_VX<"int_riscv_vmsltu", "PseudoVMSLTU", AllIntegerVectors>;
+defm "" : VPatBinaryM_VV_VX<"int_riscv_vmslt", "PseudoVMSLT", AllIntegerVectors>;
+defm "" : VPatBinaryM_VV_VX_VI<"int_riscv_vmsleu", "PseudoVMSLEU", AllIntegerVectors>;
+defm "" : VPatBinaryM_VV_VX_VI<"int_riscv_vmsle", "PseudoVMSLE", AllIntegerVectors>;
+
+defm "" : VPatBinaryM_VX_VI<"int_riscv_vmsgtu", "PseudoVMSGTU", AllIntegerVectors>;
+defm "" : VPatBinaryM_VX_VI<"int_riscv_vmsgt", "PseudoVMSGT", AllIntegerVectors>;
+
+// Match vmslt(u).vx intrinsics to vmsle(u).vi if the scalar is -15 to 16. This
+// avoids the user needing to know that there is no vmslt(u).vi instruction.
+// This is limited to vmslt(u).vx as there is no vmsge().vx intrinsic or
+// instruction.
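+// For example, int_riscv_vmslt with scalar 7 is selected as PseudoVMSLE_VI with
+// the immediate decremented (via DecImm) to 6, i.e. "x < 7" becomes "x <= 6".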
+foreach vti = AllIntegerVectors in {
+ def : Pat<(vti.Mask (int_riscv_vmslt (vti.Vector vti.RegClass:$rs1),
+ (vti.Scalar simm5_plus1:$rs2), GPR:$vl)),
+ (!cast<Instruction>("PseudoVMSLE_VI_"#vti.LMul.MX) vti.RegClass:$rs1,
+ (DecImm simm5_plus1:$rs2),
+ (NoX0 GPR:$vl),
+ vti.SEW)>;
+ def : Pat<(vti.Mask (int_riscv_vmslt_mask (vti.Mask V0),
+ (vti.Vector vti.RegClass:$rs1),
+ (vti.Scalar simm5_plus1:$rs2),
+ (vti.Mask VR:$merge),
+ GPR:$vl)),
+ (!cast<Instruction>("PseudoVMSLE_VI_"#vti.LMul.MX#"_MASK")
+ VR:$merge,
+ vti.RegClass:$rs1,
+ (DecImm simm5_plus1:$rs2),
+ (vti.Mask V0),
+ (NoX0 GPR:$vl),
+ vti.SEW)>;
+
+ def : Pat<(vti.Mask (int_riscv_vmsltu (vti.Vector vti.RegClass:$rs1),
+ (vti.Scalar simm5_plus1:$rs2), GPR:$vl)),
+ (!cast<Instruction>("PseudoVMSLEU_VI_"#vti.LMul.MX) vti.RegClass:$rs1,
+ (DecImm simm5_plus1:$rs2),
+ (NoX0 GPR:$vl),
+ vti.SEW)>;
+ def : Pat<(vti.Mask (int_riscv_vmsltu_mask (vti.Mask V0),
+ (vti.Vector vti.RegClass:$rs1),
+ (vti.Scalar simm5_plus1:$rs2),
+ (vti.Mask VR:$merge),
+ GPR:$vl)),
+ (!cast<Instruction>("PseudoVMSLEU_VI_"#vti.LMul.MX#"_MASK")
+ VR:$merge,
+ vti.RegClass:$rs1,
+ (DecImm simm5_plus1:$rs2),
+ (vti.Mask V0),
+ (NoX0 GPR:$vl),
+ vti.SEW)>;
+
+ // Special cases to avoid matching vmsltu.vi 0 (always false) to
+ // vmsleu.vi -1 (always true). Instead match to vmsne.vv.
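+ // (vmsne.vv vs1, vs1 compares a register with itself and so produces an
+ // all-false mask, matching the always-false semantics of "x <u 0".)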
+ def : Pat<(vti.Mask (int_riscv_vmsltu (vti.Vector vti.RegClass:$rs1),
+ (vti.Scalar 0), GPR:$vl)),
+ (!cast<Instruction>("PseudoVMSNE_VV_"#vti.LMul.MX) vti.RegClass:$rs1,
+ vti.RegClass:$rs1,
+ (NoX0 GPR:$vl),
+ vti.SEW)>;
+ def : Pat<(vti.Mask (int_riscv_vmsltu_mask (vti.Mask V0),
+ (vti.Vector vti.RegClass:$rs1),
+ (vti.Scalar 0),
+ (vti.Mask VR:$merge),
+ GPR:$vl)),
+ (!cast<Instruction>("PseudoVMSNE_VV_"#vti.LMul.MX#"_MASK")
+ VR:$merge,
+ vti.RegClass:$rs1,
+ vti.RegClass:$rs1,
+ (vti.Mask V0),
+ (NoX0 GPR:$vl),
+ vti.SEW)>;
+}
+
+//===----------------------------------------------------------------------===//
+// 12.9. Vector Integer Min/Max Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vminu", "PseudoVMINU", AllIntegerVectors>;
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vmin", "PseudoVMIN", AllIntegerVectors>;
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vmaxu", "PseudoVMAXU", AllIntegerVectors>;
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vmax", "PseudoVMAX", AllIntegerVectors>;
+
+//===----------------------------------------------------------------------===//
+// 12.10. Vector Single-Width Integer Multiply Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vmul", "PseudoVMUL", AllIntegerVectors>;
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vmulh", "PseudoVMULH", AllIntegerVectors>;
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vmulhu", "PseudoVMULHU", AllIntegerVectors>;
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vmulhsu", "PseudoVMULHSU", AllIntegerVectors>;
+
+//===----------------------------------------------------------------------===//
+// 12.11. Vector Integer Divide Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vdivu", "PseudoVDIVU", AllIntegerVectors>;
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vdiv", "PseudoVDIV", AllIntegerVectors>;
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vremu", "PseudoVREMU", AllIntegerVectors>;
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vrem", "PseudoVREM", AllIntegerVectors>;
+
+//===----------------------------------------------------------------------===//
+// 12.12. Vector Widening Integer Multiply Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryW_VV_VX<"int_riscv_vwmul", "PseudoVWMUL", AllWidenableIntVectors>;
+defm "" : VPatBinaryW_VV_VX<"int_riscv_vwmulu", "PseudoVWMULU", AllWidenableIntVectors>;
+defm "" : VPatBinaryW_VV_VX<"int_riscv_vwmulsu", "PseudoVWMULSU", AllWidenableIntVectors>;
+
+//===----------------------------------------------------------------------===//
+// 12.13. Vector Single-Width Integer Multiply-Add Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vmadd", "PseudoVMADD", AllIntegerVectors>;
+defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vnmsub", "PseudoVNMSUB", AllIntegerVectors>;
+defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vmacc", "PseudoVMACC", AllIntegerVectors>;
+defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vnmsac", "PseudoVNMSAC", AllIntegerVectors>;
+
+//===----------------------------------------------------------------------===//
+// 12.14. Vector Widening Integer Multiply-Add Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatTernaryW_VV_VX<"int_riscv_vwmaccu", "PseudoVWMACCU", AllWidenableIntVectors>;
+defm "" : VPatTernaryW_VV_VX<"int_riscv_vwmacc", "PseudoVWMACC", AllWidenableIntVectors>;
+defm "" : VPatTernaryW_VV_VX<"int_riscv_vwmaccsu", "PseudoVWMACCSU", AllWidenableIntVectors>;
+defm "" : VPatTernaryW_VX<"int_riscv_vwmaccus", "PseudoVWMACCUS", AllWidenableIntVectors>;
+
+//===----------------------------------------------------------------------===//
+// 12.16. Vector Integer Merge Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryV_VM_XM_IM<"int_riscv_vmerge", "PseudoVMERGE">;
+
+//===----------------------------------------------------------------------===//
+// 12.17. Vector Integer Move Instructions
+//===----------------------------------------------------------------------===//
+foreach vti = AllVectors in {
+ def : Pat<(vti.Vector (int_riscv_vmv_v_v (vti.Vector vti.RegClass:$rs1),
+ GPR:$vl)),
+ (!cast<Instruction>("PseudoVMV_V_V_"#vti.LMul.MX)
+ $rs1, (NoX0 GPR:$vl), vti.SEW)>;
+}
+
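+// int_riscv_vmv_v_x is selected as vmv.v.x, or as vmv.v.i when the scalar
+// operand is a 5-bit signed immediate.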
+foreach vti = AllIntegerVectors in {
+ def : Pat<(vti.Vector (int_riscv_vmv_v_x GPR:$rs2, GPR:$vl)),
+ (!cast<Instruction>("PseudoVMV_V_X_"#vti.LMul.MX)
+ $rs2, (NoX0 GPR:$vl), vti.SEW)>;
+ def : Pat<(vti.Vector (int_riscv_vmv_v_x simm5:$imm5, GPR:$vl)),
+ (!cast<Instruction>("PseudoVMV_V_I_"#vti.LMul.MX)
+ simm5:$imm5, (NoX0 GPR:$vl), vti.SEW)>;
+}
+
+//===----------------------------------------------------------------------===//
+// 13.1. Vector Single-Width Saturating Add and Subtract
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryV_VV_VX_VI<"int_riscv_vsaddu", "PseudoVSADDU", AllIntegerVectors>;
+defm "" : VPatBinaryV_VV_VX_VI<"int_riscv_vsadd", "PseudoVSADD", AllIntegerVectors>;
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vssubu", "PseudoVSSUBU", AllIntegerVectors>;
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vssub", "PseudoVSSUB", AllIntegerVectors>;
+
+//===----------------------------------------------------------------------===//
+// 13.2. Vector Single-Width Averaging Add and Subtract
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vaaddu", "PseudoVAADDU", AllIntegerVectors>;
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vaadd", "PseudoVAADD", AllIntegerVectors>;
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vasubu", "PseudoVASUBU", AllIntegerVectors>;
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vasub", "PseudoVASUB", AllIntegerVectors>;
+
+//===----------------------------------------------------------------------===//
+// 13.3. Vector Single-Width Fractional Multiply with Rounding and Saturation
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vsmul", "PseudoVSMUL", AllIntegerVectors>;
+
+//===----------------------------------------------------------------------===//
+// 13.4. Vector Single-Width Scaling Shift Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryV_VV_VX_VI<"int_riscv_vssrl", "PseudoVSSRL", AllIntegerVectors,
+ uimm5>;
+defm "" : VPatBinaryV_VV_VX_VI<"int_riscv_vssra", "PseudoVSSRA", AllIntegerVectors,
+ uimm5>;
+
+//===----------------------------------------------------------------------===//
+// 13.5. Vector Narrowing Fixed-Point Clip Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryV_WV_WX_WI<"int_riscv_vnclipu", "PseudoVNCLIPU", AllWidenableIntVectors>;
+defm "" : VPatBinaryV_WV_WX_WI<"int_riscv_vnclip", "PseudoVNCLIP", AllWidenableIntVectors>;
+
+} // Predicates = [HasStdExtV]
+
+let Predicates = [HasStdExtV, HasStdExtF] in {
+//===----------------------------------------------------------------------===//
+// 14.2. Vector Single-Width Floating-Point Add/Subtract Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vfadd", "PseudoVFADD", AllFloatVectors>;
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vfsub", "PseudoVFSUB", AllFloatVectors>;
+defm "" : VPatBinaryV_VX<"int_riscv_vfrsub", "PseudoVFRSUB", AllFloatVectors>;
+
+//===----------------------------------------------------------------------===//
+// 14.3. Vector Widening Floating-Point Add/Subtract Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryW_VV_VX<"int_riscv_vfwadd", "PseudoVFWADD", AllWidenableFloatVectors>;
+defm "" : VPatBinaryW_VV_VX<"int_riscv_vfwsub", "PseudoVFWSUB", AllWidenableFloatVectors>;
+defm "" : VPatBinaryW_WV_WX<"int_riscv_vfwadd_w", "PseudoVFWADD", AllWidenableFloatVectors>;
+defm "" : VPatBinaryW_WV_WX<"int_riscv_vfwsub_w", "PseudoVFWSUB", AllWidenableFloatVectors>;
+
+//===----------------------------------------------------------------------===//
+// 14.4. Vector Single-Width Floating-Point Multiply/Divide Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vfmul", "PseudoVFMUL", AllFloatVectors>;
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vfdiv", "PseudoVFDIV", AllFloatVectors>;
+defm "" : VPatBinaryV_VX<"int_riscv_vfrdiv", "PseudoVFRDIV", AllFloatVectors>;
+
+//===----------------------------------------------------------------------===//
+// 14.5. Vector Widening Floating-Point Multiply
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryW_VV_VX<"int_riscv_vfwmul", "PseudoVFWMUL", AllWidenableFloatVectors>;
+
+//===----------------------------------------------------------------------===//
+// 14.6. Vector Single-Width Floating-Point Fused Multiply-Add Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfmacc", "PseudoVFMACC", AllFloatVectors>;
+defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfnmacc", "PseudoVFNMACC", AllFloatVectors>;
+defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfmsac", "PseudoVFMSAC", AllFloatVectors>;
+defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfnmsac", "PseudoVFNMSAC", AllFloatVectors>;
+defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfmadd", "PseudoVFMADD", AllFloatVectors>;
+defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfnmadd", "PseudoVFNMADD", AllFloatVectors>;
+defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfmsub", "PseudoVFMSUB", AllFloatVectors>;
+defm "" : VPatTernaryV_VV_VX_AAXA<"int_riscv_vfnmsub", "PseudoVFNMSUB", AllFloatVectors>;
+
+//===----------------------------------------------------------------------===//
+// 14.7. Vector Widening Floating-Point Fused Multiply-Add Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatTernaryW_VV_VX<"int_riscv_vfwmacc", "PseudoVFWMACC", AllWidenableFloatVectors>;
+defm "" : VPatTernaryW_VV_VX<"int_riscv_vfwnmacc", "PseudoVFWNMACC", AllWidenableFloatVectors>;
+defm "" : VPatTernaryW_VV_VX<"int_riscv_vfwmsac", "PseudoVFWMSAC", AllWidenableFloatVectors>;
+defm "" : VPatTernaryW_VV_VX<"int_riscv_vfwnmsac", "PseudoVFWNMSAC", AllWidenableFloatVectors>;
+
+//===----------------------------------------------------------------------===//
+// 14.8. Vector Floating-Point Square-Root Instruction
+//===----------------------------------------------------------------------===//
+defm "" : VPatUnaryV_V<"int_riscv_vfsqrt", "PseudoVFSQRT", AllFloatVectors>;
+
+//===----------------------------------------------------------------------===//
+// 14.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction
+//===----------------------------------------------------------------------===//
+defm "" : VPatUnaryV_V<"int_riscv_vfrsqrte7", "PseudoVFRSQRTE7", AllFloatVectors>;
+
+//===----------------------------------------------------------------------===//
+// 14.10. Vector Floating-Point Reciprocal Estimate Instruction
+//===----------------------------------------------------------------------===//
+defm "" : VPatUnaryV_V<"int_riscv_vfrece7", "PseudoVFRECE7", AllFloatVectors>;
+
+//===----------------------------------------------------------------------===//
+// 14.11. Vector Floating-Point Min/Max Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vfmin", "PseudoVFMIN", AllFloatVectors>;
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vfmax", "PseudoVFMAX", AllFloatVectors>;
+
+//===----------------------------------------------------------------------===//
+// 14.12. Vector Floating-Point Sign-Injection Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vfsgnj", "PseudoVFSGNJ", AllFloatVectors>;
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vfsgnjn", "PseudoVFSGNJN", AllFloatVectors>;
+defm "" : VPatBinaryV_VV_VX<"int_riscv_vfsgnjx", "PseudoVFSGNJX", AllFloatVectors>;
+
+//===----------------------------------------------------------------------===//
+// 14.13. Vector Floating-Point Compare Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryM_VV_VX<"int_riscv_vmfeq", "PseudoVMFEQ", AllFloatVectors>;
+defm "" : VPatBinaryM_VV_VX<"int_riscv_vmfle", "PseudoVMFLE", AllFloatVectors>;
+defm "" : VPatBinaryM_VV_VX<"int_riscv_vmflt", "PseudoVMFLT", AllFloatVectors>;
+defm "" : VPatBinaryM_VV_VX<"int_riscv_vmfne", "PseudoVMFNE", AllFloatVectors>;
+defm "" : VPatBinaryM_VX<"int_riscv_vmfgt", "PseudoVMFGT", AllFloatVectors>;
+defm "" : VPatBinaryM_VX<"int_riscv_vmfge", "PseudoVMFGE", AllFloatVectors>;
+
+//===----------------------------------------------------------------------===//
+// 14.14. Vector Floating-Point Classify Instruction
+//===----------------------------------------------------------------------===//
+defm "" : VPatConversionVI_VF<"int_riscv_vfclass", "PseudoVFCLASS">;
+
+//===----------------------------------------------------------------------===//
+// 14.15. Vector Floating-Point Merge Instruction
+//===----------------------------------------------------------------------===//
+// We can use vmerge.vvm to support vector-vector vfmerge.
+defm "" : VPatBinaryV_VM<"int_riscv_vfmerge", "PseudoVMERGE",
+ /*CarryOut = */0, /*vtilist=*/AllFloatVectors>;
+defm "" : VPatBinaryV_XM<"int_riscv_vfmerge", "PseudoVFMERGE",
+ /*CarryOut = */0, /*vtilist=*/AllFloatVectors>;
+
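+// If the scalar operand of vfmerge is +0.0, use vmerge.vim with immediate 0,
+// which writes the same all-zero bit pattern without needing an FP register.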
+foreach fvti = AllFloatVectors in {
+ defvar instr = !cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX);
+ def : Pat<(fvti.Vector (int_riscv_vfmerge (fvti.Vector fvti.RegClass:$rs2),
+ (fvti.Scalar (fpimm0)),
+ (fvti.Mask V0), (XLenVT GPR:$vl))),
+ (instr fvti.RegClass:$rs2, 0, (fvti.Mask V0), (NoX0 GPR:$vl), fvti.SEW)>;
+}
+
+//===----------------------------------------------------------------------===//
+// 14.16. Vector Floating-Point Move Instruction
+//===----------------------------------------------------------------------===//
+foreach fvti = AllFloatVectors in {
+ // If we're splatting fpimm0, use vmv.v.i vd, 0.
+ def : Pat<(fvti.Vector (int_riscv_vfmv_v_f
+ (fvti.Scalar (fpimm0)), GPR:$vl)),
+ (!cast<Instruction>("PseudoVMV_V_I_"#fvti.LMul.MX)
+ 0, (NoX0 GPR:$vl), fvti.SEW)>;
+
+ def : Pat<(fvti.Vector (int_riscv_vfmv_v_f
+ (fvti.Scalar fvti.ScalarRegClass:$rs2), GPR:$vl)),
+ (!cast<Instruction>("PseudoVFMV_V_" # fvti.ScalarSuffix # "_" #
+ fvti.LMul.MX)
+ (fvti.Scalar fvti.ScalarRegClass:$rs2),
+ (NoX0 GPR:$vl), fvti.SEW)>;
+}
+
+//===----------------------------------------------------------------------===//
+// 14.17. Single-Width Floating-Point/Integer Type-Convert Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatConversionVI_VF<"int_riscv_vfcvt_xu_f_v", "PseudoVFCVT_XU_F">;
+defm "" : VPatConversionVI_VF<"int_riscv_vfcvt_rtz_xu_f_v", "PseudoVFCVT_RTZ_XU_F">;
+defm "" : VPatConversionVI_VF<"int_riscv_vfcvt_x_f_v", "PseudoVFCVT_X_F">;
+defm "" : VPatConversionVI_VF<"int_riscv_vfcvt_rtz_x_f_v", "PseudoVFCVT_RTZ_X_F">;
+defm "" : VPatConversionVF_VI<"int_riscv_vfcvt_f_x_v", "PseudoVFCVT_F_X">;
+defm "" : VPatConversionVF_VI<"int_riscv_vfcvt_f_xu_v", "PseudoVFCVT_F_XU">;
+
+//===----------------------------------------------------------------------===//
+// 14.18. Widening Floating-Point/Integer Type-Convert Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatConversionWI_VF<"int_riscv_vfwcvt_xu_f_v", "PseudoVFWCVT_XU_F">;
+defm "" : VPatConversionWI_VF<"int_riscv_vfwcvt_x_f_v", "PseudoVFWCVT_X_F">;
+defm "" : VPatConversionWI_VF<"int_riscv_vfwcvt_rtz_xu_f_v", "PseudoVFWCVT_RTZ_XU_F">;
+defm "" : VPatConversionWI_VF<"int_riscv_vfwcvt_rtz_x_f_v", "PseudoVFWCVT_RTZ_X_F">;
+defm "" : VPatConversionWF_VI<"int_riscv_vfwcvt_f_xu_v", "PseudoVFWCVT_F_XU">;
+defm "" : VPatConversionWF_VI<"int_riscv_vfwcvt_f_x_v", "PseudoVFWCVT_F_X">;
+defm "" : VPatConversionWF_VF<"int_riscv_vfwcvt_f_f_v", "PseudoVFWCVT_F_F">;
+
+//===----------------------------------------------------------------------===//
+// 14.19. Narrowing Floating-Point/Integer Type-Convert Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatConversionVI_WF<"int_riscv_vfncvt_xu_f_w", "PseudoVFNCVT_XU_F">;
+defm "" : VPatConversionVI_WF<"int_riscv_vfncvt_x_f_w", "PseudoVFNCVT_X_F">;
+defm "" : VPatConversionVI_WF<"int_riscv_vfncvt_rtz_xu_f_w", "PseudoVFNCVT_RTZ_XU_F">;
+defm "" : VPatConversionVI_WF<"int_riscv_vfncvt_rtz_x_f_w", "PseudoVFNCVT_RTZ_X_F">;
+defm "" : VPatConversionVF_WI <"int_riscv_vfncvt_f_xu_w", "PseudoVFNCVT_F_XU">;
+defm "" : VPatConversionVF_WI <"int_riscv_vfncvt_f_x_w", "PseudoVFNCVT_F_X">;
+defm "" : VPatConversionVF_WF<"int_riscv_vfncvt_f_f_w", "PseudoVFNCVT_F_F">;
+defm "" : VPatConversionVF_WF<"int_riscv_vfncvt_rod_f_f_w", "PseudoVFNCVT_ROD_F_F">;
+} // Predicates = [HasStdExtV, HasStdExtF]
+
+let Predicates = [HasStdExtV] in {
+//===----------------------------------------------------------------------===//
+// 15.1. Vector Single-Width Integer Reduction Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatReductionV_VS<"int_riscv_vredsum", "PseudoVREDSUM">;
+defm "" : VPatReductionV_VS<"int_riscv_vredand", "PseudoVREDAND">;
+defm "" : VPatReductionV_VS<"int_riscv_vredor", "PseudoVREDOR">;
+defm "" : VPatReductionV_VS<"int_riscv_vredxor", "PseudoVREDXOR">;
+defm "" : VPatReductionV_VS<"int_riscv_vredminu", "PseudoVREDMINU">;
+defm "" : VPatReductionV_VS<"int_riscv_vredmin", "PseudoVREDMIN">;
+defm "" : VPatReductionV_VS<"int_riscv_vredmaxu", "PseudoVREDMAXU">;
+defm "" : VPatReductionV_VS<"int_riscv_vredmax", "PseudoVREDMAX">;
+
+//===----------------------------------------------------------------------===//
+// 15.2. Vector Widening Integer Reduction Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatReductionW_VS<"int_riscv_vwredsumu", "PseudoVWREDSUMU">;
+defm "" : VPatReductionW_VS<"int_riscv_vwredsum", "PseudoVWREDSUM">;
+} // Predicates = [HasStdExtV]
+
+let Predicates = [HasStdExtV, HasStdExtF] in {
+//===----------------------------------------------------------------------===//
+// 15.3. Vector Single-Width Floating-Point Reduction Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatReductionV_VS<"int_riscv_vfredosum", "PseudoVFREDOSUM", /*IsFloat=*/1>;
+defm "" : VPatReductionV_VS<"int_riscv_vfredsum", "PseudoVFREDSUM", /*IsFloat=*/1>;
+defm "" : VPatReductionV_VS<"int_riscv_vfredmin", "PseudoVFREDMIN", /*IsFloat=*/1>;
+defm "" : VPatReductionV_VS<"int_riscv_vfredmax", "PseudoVFREDMAX", /*IsFloat=*/1>;
+
+//===----------------------------------------------------------------------===//
+// 15.4. Vector Widening Floating-Point Reduction Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatReductionW_VS<"int_riscv_vfwredsum", "PseudoVFWREDSUM", /*IsFloat=*/1>;
+defm "" : VPatReductionW_VS<"int_riscv_vfwredosum", "PseudoVFWREDOSUM", /*IsFloat=*/1>;
+
+} // Predicates = [HasStdExtV, HasStdExtF]
+
+//===----------------------------------------------------------------------===//
+// 16. Vector Mask Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtV] in {
+//===----------------------------------------------------------------------===//
+// 16.1. Vector Mask-Register Logical Instructions
+//===----------------------------------------------------------------------===//
+defm "" : VPatBinaryM_MM<"int_riscv_vmand", "PseudoVMAND">;
+defm "" : VPatBinaryM_MM<"int_riscv_vmnand", "PseudoVMNAND">;
+defm "" : VPatBinaryM_MM<"int_riscv_vmandnot", "PseudoVMANDNOT">;
+defm "" : VPatBinaryM_MM<"int_riscv_vmxor", "PseudoVMXOR">;
+defm "" : VPatBinaryM_MM<"int_riscv_vmor", "PseudoVMOR">;
+defm "" : VPatBinaryM_MM<"int_riscv_vmnor", "PseudoVMNOR">;
+defm "" : VPatBinaryM_MM<"int_riscv_vmornot", "PseudoVMORNOT">;
+defm "" : VPatBinaryM_MM<"int_riscv_vmxnor", "PseudoVMXNOR">;
+
+// Pseudoinstructions (vmclr.m and vmset.m)
+defm "" : VPatNullaryM<"int_riscv_vmclr", "PseudoVMCLR">;
+defm "" : VPatNullaryM<"int_riscv_vmset", "PseudoVMSET">;
+
+//===----------------------------------------------------------------------===//
+// 16.2. Vector mask population count vpopc
+//===----------------------------------------------------------------------===//
+defm "" : VPatUnaryS_M<"int_riscv_vpopc", "PseudoVPOPC">;
+
+//===----------------------------------------------------------------------===//
+// 16.3. vfirst find-first-set mask bit
+//===----------------------------------------------------------------------===//
+defm "" : VPatUnaryS_M<"int_riscv_vfirst", "PseudoVFIRST">;
+
+//===----------------------------------------------------------------------===//
+// 16.4. vmsbf.m set-before-first mask bit
+//===----------------------------------------------------------------------===//
+defm "" : VPatUnaryM_M<"int_riscv_vmsbf", "PseudoVMSBF">;
+
+//===----------------------------------------------------------------------===//
+// 16.5. vmsif.m set-including-first mask bit
+//===----------------------------------------------------------------------===//
+defm "" : VPatUnaryM_M<"int_riscv_vmsif", "PseudoVMSIF">;
+
+//===----------------------------------------------------------------------===//
+// 16.6. vmsof.m set-only-first mask bit
+//===----------------------------------------------------------------------===//
+defm "" : VPatUnaryM_M<"int_riscv_vmsof", "PseudoVMSOF">;
+
+//===----------------------------------------------------------------------===//
+// 16.8. Vector Iota Instruction
+//===----------------------------------------------------------------------===//
+defm "" : VPatUnaryV_M<"int_riscv_viota", "PseudoVIOTA">;
+
+//===----------------------------------------------------------------------===//
+// 16.9. Vector Element Index Instruction
+//===----------------------------------------------------------------------===//
+defm "" : VPatNullaryV<"int_riscv_vid", "PseudoVID">;
+
+} // Predicates = [HasStdExtV]
+
+//===----------------------------------------------------------------------===//
+// 17. Vector Permutation Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// 17.1. Integer Scalar Move Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtV] in {
+foreach vti = AllIntegerVectors in {
+ def : Pat<(riscv_vmv_x_s (vti.Vector vti.RegClass:$rs2)),
+ (!cast<Instruction>("PseudoVMV_X_S_" # vti.LMul.MX) $rs2, vti.SEW)>;
+ def : Pat<(vti.Vector (int_riscv_vmv_s_x (vti.Vector vti.RegClass:$rs1),
+ GPR:$rs2, GPR:$vl)),
+ (!cast<Instruction>("PseudoVMV_S_X_" # vti.LMul.MX)
+ (vti.Vector $rs1), $rs2, (NoX0 GPR:$vl), vti.SEW)>;
+}
+} // Predicates = [HasStdExtV]
+
+//===----------------------------------------------------------------------===//
+// 17.2. Floating-Point Scalar Move Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtV, HasStdExtF] in {
+foreach fvti = AllFloatVectors in {
+ defvar instr = !cast<Instruction>("PseudoVFMV_"#fvti.ScalarSuffix#"_S_" #
+ fvti.LMul.MX);
+ def : Pat<(fvti.Scalar (int_riscv_vfmv_f_s (fvti.Vector fvti.RegClass:$rs2))),
+ (instr $rs2, fvti.SEW)>;
+
+ def : Pat<(fvti.Vector (int_riscv_vfmv_s_f (fvti.Vector fvti.RegClass:$rs1),
+ (fvti.Scalar fvti.ScalarRegClass:$rs2), GPR:$vl)),
+ (!cast<Instruction>("PseudoVFMV_S_"#fvti.ScalarSuffix#"_" #
+ fvti.LMul.MX)
+ (fvti.Vector $rs1),
+ (fvti.Scalar fvti.ScalarRegClass:$rs2),
+ (NoX0 GPR:$vl), fvti.SEW)>;
+}
+} // Predicates = [HasStdExtV, HasStdExtF]
+
+//===----------------------------------------------------------------------===//
+// 17.3. Vector Slide Instructions
+//===----------------------------------------------------------------------===//
+let Predicates = [HasStdExtV] in {
+ defm "" : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllIntegerVectors, uimm5>;
+ defm "" : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllIntegerVectors, uimm5>;
+ defm "" : VPatBinaryV_VX<"int_riscv_vslide1up", "PseudoVSLIDE1UP", AllIntegerVectors>;
+ defm "" : VPatBinaryV_VX<"int_riscv_vslide1down", "PseudoVSLIDE1DOWN", AllIntegerVectors>;
+} // Predicates = [HasStdExtV]
+
+let Predicates = [HasStdExtV, HasStdExtF] in {
+ defm "" : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllFloatVectors, uimm5>;
+ defm "" : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllFloatVectors, uimm5>;
+ defm "" : VPatBinaryV_VX<"int_riscv_vfslide1up", "PseudoVFSLIDE1UP", AllFloatVectors>;
+ defm "" : VPatBinaryV_VX<"int_riscv_vfslide1down", "PseudoVFSLIDE1DOWN", AllFloatVectors>;
+} // Predicates = [HasStdExtV, HasStdExtF]
+
+//===----------------------------------------------------------------------===//
+// 17.4. Vector Register Gather Instructions
+//===----------------------------------------------------------------------===//
+let Predicates = [HasStdExtV] in {
+ defm "" : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
+ AllIntegerVectors, uimm5>;
+ defm "" : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16", "PseudoVRGATHEREI16",
+ /* eew */ 16, AllIntegerVectors>;
+} // Predicates = [HasStdExtV]
+
+let Predicates = [HasStdExtV, HasStdExtF] in {
+ defm "" : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
+ AllFloatVectors, uimm5>;
+ defm "" : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16", "PseudoVRGATHEREI16",
+ /* eew */ 16, AllFloatVectors>;
+} // Predicates = [HasStdExtV, HasStdExtF]
+
+//===----------------------------------------------------------------------===//
+// 17.5. Vector Compress Instruction
+//===----------------------------------------------------------------------===//
+let Predicates = [HasStdExtV] in {
+ defm "" : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllIntegerVectors>;
+} // Predicates = [HasStdExtV]
+
+let Predicates = [HasStdExtV, HasStdExtF] in {
+ defm "" : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllFloatVectors>;
+} // Predicates = [HasStdExtV, HasStdExtF]
+
+// Include the non-intrinsic ISel patterns
+include "RISCVInstrInfoVSDPatterns.td"
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
new file mode 100644
index 000000000000..aea3d0e17ccc
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -0,0 +1,643 @@
+//===- RISCVInstrInfoVSDPatterns.td - RVV SDNode patterns --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file contains the required infrastructure and SDNode patterns to
+/// support code generation for the standard 'V' (Vector) extension, version
+/// 0.9. This version is still experimental as the 'V' extension hasn't been
+/// ratified yet.
+///
+/// This file is included from and depends upon RISCVInstrInfoVPseudos.td
+///
+/// Note: the patterns for RVV intrinsics are found in
+/// RISCVInstrInfoVPseudos.td.
+///
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Helpers to define the SDNode patterns.
+//===----------------------------------------------------------------------===//
+
+def SDTSplatI64 : SDTypeProfile<1, 1, [
+ SDTCVecEltisVT<0, i64>, SDTCisVT<1, i32>
+]>;
+
+def rv32_splat_i64 : SDNode<"RISCVISD::SPLAT_VECTOR_I64", SDTSplatI64>;
+
+def riscv_trunc_vector : SDNode<"RISCVISD::TRUNCATE_VECTOR",
+ SDTypeProfile<1, 1,
+ [SDTCisVec<0>, SDTCisVec<1>]>>;
+
+// Penalize the generic form with Complexity=1 to give the simm5/uimm5 variants
+// precedence
+def SplatPat : ComplexPattern<vAny, 1, "selectVSplat", [], [], 1>;
+
+def SplatPat_simm5 : ComplexPattern<vAny, 1, "selectVSplatSimm5", []>;
+def SplatPat_uimm5 : ComplexPattern<vAny, 1, "selectVSplatUimm5", []>;
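+// For example, a splat of the constant 3 should be selected through
+// SplatPat_simm5 (producing a .vi form) rather than through the generic
+// SplatPat (which would produce a .vx form).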
+
+class SwapHelper<dag Prefix, dag A, dag B, dag Suffix, bit swap> {
+ dag Value = !con(Prefix, !if(swap, B, A), !if(swap, A, B), Suffix);
+}
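+// For example, SwapHelper<(inst), (inst A), (inst B), (inst avl, sew), 1>.Value
+// is (inst B, A, avl, sew): the two operand dags are emitted in swapped order
+// when 'swap' is set, and in the given order otherwise.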
+
+multiclass VPatUSLoadStoreSDNode<LLVMType type,
+ LLVMType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ OutPatFrag avl,
+ RegisterClass reg_rs1,
+ VReg reg_class>
+{
+ defvar load_instr = !cast<Instruction>("PseudoVLE"#sew#"_V_"#vlmul.MX);
+ defvar store_instr = !cast<Instruction>("PseudoVSE"#sew#"_V_"#vlmul.MX);
+ // Load
+ def : Pat<(type (load reg_rs1:$rs1)),
+ (load_instr reg_rs1:$rs1, avl, sew)>;
+ // Store
+ def : Pat<(store type:$rs2, reg_rs1:$rs1),
+ (store_instr reg_class:$rs2, reg_rs1:$rs1, avl, sew)>;
+}
+
+multiclass VPatUSLoadStoreSDNodes<RegisterClass reg_rs1> {
+ foreach vti = AllVectors in
+ defm "" : VPatUSLoadStoreSDNode<vti.Vector, vti.Mask, vti.SEW, vti.LMul,
+ vti.AVL, reg_rs1, vti.RegClass>;
+}
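+// For example (a sketch, assuming vint8m1_t/nxv8i8 maps to SEW=8, LMUL=1): the
+// multiclass above matches (load GPR:$rs1) of that type to PseudoVLE8_V_M1 and
+// the corresponding store to PseudoVSE8_V_M1, using the type's default AVL.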
+
+class VPatBinarySDNode_VV<SDNode vop,
+ string instruction_name,
+ ValueType result_type,
+ ValueType op_type,
+ ValueType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ OutPatFrag avl,
+ VReg RetClass,
+ VReg op_reg_class> :
+ Pat<(result_type (vop
+ (op_type op_reg_class:$rs1),
+ (op_type op_reg_class:$rs2))),
+ (!cast<Instruction>(instruction_name#"_VV_"# vlmul.MX)
+ op_reg_class:$rs1,
+ op_reg_class:$rs2,
+ avl, sew)>;
+
+class VPatBinarySDNode_XI<SDNode vop,
+ string instruction_name,
+ string suffix,
+ ValueType result_type,
+ ValueType vop_type,
+ ValueType xop_type,
+ ValueType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ OutPatFrag avl,
+ VReg RetClass,
+ VReg vop_reg_class,
+ ComplexPattern SplatPatKind,
+ DAGOperand xop_kind> :
+ Pat<(result_type (vop
+ (vop_type vop_reg_class:$rs1),
+ (vop_type (SplatPatKind xop_kind:$rs2)))),
+ (!cast<Instruction>(instruction_name#_#suffix#_# vlmul.MX)
+ vop_reg_class:$rs1,
+ xop_kind:$rs2,
+ avl, sew)>;
+
+multiclass VPatBinarySDNode_VV_VX<SDNode vop, string instruction_name>
+{
+ foreach vti = AllIntegerVectors in {
+ def : VPatBinarySDNode_VV<vop, instruction_name,
+ vti.Vector, vti.Vector, vti.Mask, vti.SEW,
+ vti.LMul, vti.AVL, vti.RegClass, vti.RegClass>;
+ def : VPatBinarySDNode_XI<vop, instruction_name, "VX",
+ vti.Vector, vti.Vector, XLenVT, vti.Mask, vti.SEW,
+ vti.LMul, vti.AVL, vti.RegClass, vti.RegClass,
+ SplatPat, GPR>;
+ }
+}
+
+multiclass VPatBinarySDNode_VV_VX_VI<SDNode vop, string instruction_name,
+ Operand ImmType = simm5>
+{
+ foreach vti = AllIntegerVectors in {
+ def : VPatBinarySDNode_VV<vop, instruction_name,
+ vti.Vector, vti.Vector, vti.Mask, vti.SEW,
+ vti.LMul, vti.AVL, vti.RegClass, vti.RegClass>;
+ def : VPatBinarySDNode_XI<vop, instruction_name, "VX",
+ vti.Vector, vti.Vector, XLenVT, vti.Mask, vti.SEW,
+ vti.LMul, vti.AVL, vti.RegClass, vti.RegClass,
+ SplatPat, GPR>;
+ def : VPatBinarySDNode_XI<vop, instruction_name, "VI",
+ vti.Vector, vti.Vector, XLenVT, vti.Mask, vti.SEW,
+ vti.LMul, vti.AVL, vti.RegClass, vti.RegClass,
+ !cast<ComplexPattern>(SplatPat#_#ImmType),
+ ImmType>;
+ }
+}
+
+class VPatBinarySDNode_VF<SDNode vop,
+ string instruction_name,
+ ValueType result_type,
+ ValueType vop_type,
+ ValueType xop_type,
+ ValueType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ OutPatFrag avl,
+ VReg RetClass,
+ VReg vop_reg_class,
+ DAGOperand xop_kind> :
+ Pat<(result_type (vop (vop_type vop_reg_class:$rs1),
+ (vop_type (splat_vector xop_kind:$rs2)))),
+ (!cast<Instruction>(instruction_name#"_"#vlmul.MX)
+ vop_reg_class:$rs1,
+ (xop_type xop_kind:$rs2),
+ avl, sew)>;
+
+multiclass VPatBinaryFPSDNode_VV_VF<SDNode vop, string instruction_name> {
+ foreach vti = AllFloatVectors in {
+ def : VPatBinarySDNode_VV<vop, instruction_name,
+ vti.Vector, vti.Vector, vti.Mask, vti.SEW,
+ vti.LMul, vti.AVL, vti.RegClass, vti.RegClass>;
+ def : VPatBinarySDNode_VF<vop, instruction_name#"_V"#vti.ScalarSuffix,
+ vti.Vector, vti.Vector, vti.Scalar, vti.Mask,
+ vti.SEW, vti.LMul, vti.AVL, vti.RegClass, vti.RegClass,
+ vti.ScalarRegClass>;
+ }
+}
+
+multiclass VPatBinaryFPSDNode_R_VF<SDNode vop, string instruction_name> {
+ foreach fvti = AllFloatVectors in
+ def : Pat<(fvti.Vector (vop (fvti.Vector (splat_vector fvti.Scalar:$rs2)),
+ (fvti.Vector fvti.RegClass:$rs1))),
+ (!cast<Instruction>(instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX)
+ fvti.RegClass:$rs1,
+ (fvti.Scalar fvti.ScalarRegClass:$rs2),
+ fvti.AVL, fvti.SEW)>;
+}
+
+multiclass VPatIntegerSetCCSDNode_VV<CondCode cc,
+ string instruction_name,
+ bit swap = 0> {
+ foreach vti = AllIntegerVectors in {
+ defvar instruction = !cast<Instruction>(instruction_name#"_VV_"#vti.LMul.MX);
+ def : Pat<(vti.Mask (setcc (vti.Vector vti.RegClass:$rs1),
+ (vti.Vector vti.RegClass:$rs2), cc)),
+ SwapHelper<(instruction),
+ (instruction vti.RegClass:$rs1),
+ (instruction vti.RegClass:$rs2),
+ (instruction vti.AVL, vti.SEW),
+ swap>.Value>;
+ }
+}
+
+multiclass VPatIntegerSetCCSDNode_XI<CondCode cc,
+ string instruction_name,
+ string kind,
+ ComplexPattern SplatPatKind,
+ DAGOperand xop_kind,
+ bit swap = 0> {
+ foreach vti = AllIntegerVectors in {
+ defvar instruction = !cast<Instruction>(instruction_name#_#kind#_#vti.LMul.MX);
+ def : Pat<(vti.Mask (setcc (vti.Vector vti.RegClass:$rs1),
+ (vti.Vector (SplatPatKind xop_kind:$rs2)), cc)),
+ SwapHelper<(instruction),
+ (instruction vti.RegClass:$rs1),
+ (instruction xop_kind:$rs2),
+ (instruction vti.AVL, vti.SEW),
+ swap>.Value>;
+ }
+}
+
+multiclass VPatIntegerSetCCSDNode_VV_VX_VI<CondCode cc,
+ string instruction_name,
+ bit swap = 0> {
+ defm : VPatIntegerSetCCSDNode_VV<cc, instruction_name, swap>;
+ defm : VPatIntegerSetCCSDNode_XI<cc, instruction_name, "VX",
+ SplatPat, GPR, swap>;
+ defm : VPatIntegerSetCCSDNode_XI<cc, instruction_name, "VI",
+ SplatPat_simm5, simm5, swap>;
+}
+
+multiclass VPatIntegerSetCCSDNode_VV_VX<CondCode cc,
+ string instruction_name,
+ bit swap = 0> {
+ defm : VPatIntegerSetCCSDNode_VV<cc, instruction_name, swap>;
+ defm : VPatIntegerSetCCSDNode_XI<cc, instruction_name, "VX",
+ SplatPat, GPR, swap>;
+}
+
+multiclass VPatIntegerSetCCSDNode_VX_VI<CondCode cc,
+ string instruction_name,
+ bit swap = 0> {
+ defm : VPatIntegerSetCCSDNode_XI<cc, instruction_name, "VX",
+ SplatPat, GPR, swap>;
+ defm : VPatIntegerSetCCSDNode_XI<cc, instruction_name, "VI",
+ SplatPat_simm5, simm5, swap>;
+}
+
+multiclass VPatFPSetCCSDNode_VV<CondCode cc, string instruction_name> {
+ foreach fvti = AllFloatVectors in
+ def : Pat<(fvti.Mask (setcc (fvti.Vector fvti.RegClass:$rs1),
+ (fvti.Vector fvti.RegClass:$rs2),
+ cc)),
+ (!cast<Instruction>(instruction_name#"_VV_"#fvti.LMul.MX)
+ fvti.RegClass:$rs1, fvti.RegClass:$rs2, fvti.AVL, fvti.SEW)>;
+}
+
+multiclass VPatFPSetCCSDNode_VF<CondCode cc, string instruction_name> {
+ foreach fvti = AllFloatVectors in
+ def : Pat<(fvti.Mask (setcc (fvti.Vector fvti.RegClass:$rs1),
+ (fvti.Vector (splat_vector fvti.ScalarRegClass:$rs2)),
+ cc)),
+ (!cast<Instruction>(instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX)
+ fvti.RegClass:$rs1,
+ (fvti.Scalar fvti.ScalarRegClass:$rs2),
+ fvti.AVL, fvti.SEW)>;
+}
+
+multiclass VPatFPSetCCSDNode_FV<CondCode cc, string swapped_op_instruction_name> {
+ foreach fvti = AllFloatVectors in
+ def : Pat<(fvti.Mask (setcc (fvti.Vector (splat_vector fvti.ScalarRegClass:$rs2)),
+ (fvti.Vector fvti.RegClass:$rs1),
+ cc)),
+ (!cast<Instruction>(swapped_op_instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX)
+ fvti.RegClass:$rs1,
+ (fvti.Scalar fvti.ScalarRegClass:$rs2),
+ fvti.AVL, fvti.SEW)>;
+}
+
+multiclass VPatFPSetCCSDNode_VV_VF_FV<CondCode cc,
+ string inst_name,
+ string swapped_op_inst_name> {
+ defm : VPatFPSetCCSDNode_VV<cc, inst_name>;
+ defm : VPatFPSetCCSDNode_VF<cc, inst_name>;
+ defm : VPatFPSetCCSDNode_FV<cc, swapped_op_inst_name>;
+}
+
+multiclass VPatExtendSDNode_V<list<SDNode> ops, string inst_name, string suffix,
+ list <VTypeInfoToFraction> fraction_list> {
+ foreach vtiTofti = fraction_list in {
+ defvar vti = vtiTofti.Vti;
+ defvar fti = vtiTofti.Fti;
+ foreach op = ops in
+ def : Pat<(vti.Vector (op (fti.Vector fti.RegClass:$rs2))),
+ (!cast<Instruction>(inst_name#"_"#suffix#"_"#vti.LMul.MX)
+ fti.RegClass:$rs2, fti.AVL, vti.SEW)>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Patterns.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtV] in {
+
+// 7.4. Vector Unit-Stride Instructions
+defm "" : VPatUSLoadStoreSDNodes<GPR>;
+defm "" : VPatUSLoadStoreSDNodes<AddrFI>;
+
+// 12.1. Vector Single-Width Integer Add and Subtract
+defm "" : VPatBinarySDNode_VV_VX_VI<add, "PseudoVADD">;
+defm "" : VPatBinarySDNode_VV_VX<sub, "PseudoVSUB">;
+// Handle VRSUB specially since it's the only integer binary op with reversed
+// pattern operands
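+// For example, (sub (splat X), v) is selected as vrsub.vx (or vrsub.vi for a
+// 5-bit immediate), which computes X - v[i] for each element.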
+foreach vti = AllIntegerVectors in {
+ def : Pat<(sub (vti.Vector (SplatPat XLenVT:$rs2)),
+ (vti.Vector vti.RegClass:$rs1)),
+ (!cast<Instruction>("PseudoVRSUB_VX_"# vti.LMul.MX)
+ vti.RegClass:$rs1, GPR:$rs2, vti.AVL, vti.SEW)>;
+ def : Pat<(sub (vti.Vector (SplatPat_simm5 XLenVT:$rs2)),
+ (vti.Vector vti.RegClass:$rs1)),
+ (!cast<Instruction>("PseudoVRSUB_VI_"# vti.LMul.MX)
+ vti.RegClass:$rs1, simm5:$rs2, vti.AVL, vti.SEW)>;
+}
+
+// 12.3. Vector Integer Extension
+defm "" : VPatExtendSDNode_V<[zext, anyext], "PseudoVZEXT", "VF2",
+ AllFractionableVF2IntVectors>;
+defm "" : VPatExtendSDNode_V<[sext], "PseudoVSEXT", "VF2",
+ AllFractionableVF2IntVectors>;
+defm "" : VPatExtendSDNode_V<[zext, anyext], "PseudoVZEXT", "VF4",
+ AllFractionableVF4IntVectors>;
+defm "" : VPatExtendSDNode_V<[sext], "PseudoVSEXT", "VF4",
+ AllFractionableVF4IntVectors>;
+defm "" : VPatExtendSDNode_V<[zext, anyext], "PseudoVZEXT", "VF8",
+ AllFractionableVF8IntVectors>;
+defm "" : VPatExtendSDNode_V<[sext], "PseudoVSEXT", "VF8",
+ AllFractionableVF8IntVectors>;
+
+// 12.5. Vector Bitwise Logical Instructions
+defm "" : VPatBinarySDNode_VV_VX_VI<and, "PseudoVAND">;
+defm "" : VPatBinarySDNode_VV_VX_VI<or, "PseudoVOR">;
+defm "" : VPatBinarySDNode_VV_VX_VI<xor, "PseudoVXOR">;
+
+// 12.6. Vector Single-Width Bit Shift Instructions
+defm "" : VPatBinarySDNode_VV_VX_VI<shl, "PseudoVSLL", uimm5>;
+defm "" : VPatBinarySDNode_VV_VX_VI<srl, "PseudoVSRL", uimm5>;
+defm "" : VPatBinarySDNode_VV_VX_VI<sra, "PseudoVSRA", uimm5>;
+
+// 12.7. Vector Narrowing Integer Right Shift Instructions
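+// A truncate of a widened vector is selected as a narrowing logical shift
+// right by zero, i.e. vnsrl.wi vd, vs2, 0.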
+foreach vtiTofti = AllFractionableVF2IntVectors in {
+ defvar vti = vtiTofti.Vti;
+ defvar fti = vtiTofti.Fti;
+ def : Pat<(fti.Vector (riscv_trunc_vector (vti.Vector vti.RegClass:$rs1))),
+ (!cast<Instruction>("PseudoVNSRL_WI_"#fti.LMul.MX)
+ vti.RegClass:$rs1, 0, fti.AVL, fti.SEW)>;
+}
+
+// 12.8. Vector Integer Comparison Instructions
+defm "" : VPatIntegerSetCCSDNode_VV_VX_VI<SETEQ, "PseudoVMSEQ">;
+defm "" : VPatIntegerSetCCSDNode_VV_VX_VI<SETNE, "PseudoVMSNE">;
+
+// FIXME: Support immediate forms of these by choosing SLE and decrementing the
+// immediate
+defm "" : VPatIntegerSetCCSDNode_VV_VX<SETLT, "PseudoVMSLT">;
+defm "" : VPatIntegerSetCCSDNode_VV_VX<SETULT, "PseudoVMSLTU">;
+
+defm "" : VPatIntegerSetCCSDNode_VV<SETGT, "PseudoVMSLT", /*swap*/1>;
+defm "" : VPatIntegerSetCCSDNode_VV<SETUGT, "PseudoVMSLTU", /*swap*/1>;
+defm "" : VPatIntegerSetCCSDNode_VX_VI<SETGT, "PseudoVMSGT">;
+defm "" : VPatIntegerSetCCSDNode_VX_VI<SETUGT, "PseudoVMSGTU">;
+
+defm "" : VPatIntegerSetCCSDNode_VV_VX_VI<SETLE, "PseudoVMSLE">;
+defm "" : VPatIntegerSetCCSDNode_VV_VX_VI<SETULE, "PseudoVMSLEU">;
+
+// FIXME: Support immediate forms of these by choosing SGT and decrementing the
+// immediate
+defm "" : VPatIntegerSetCCSDNode_VV<SETGE, "PseudoVMSLE", /*swap*/1>;
+defm "" : VPatIntegerSetCCSDNode_VV<SETUGE, "PseudoVMSLEU", /*swap*/1>;
+
+// 12.9. Vector Integer Min/Max Instructions
+defm "" : VPatBinarySDNode_VV_VX<umin, "PseudoVMINU">;
+defm "" : VPatBinarySDNode_VV_VX<smin, "PseudoVMIN">;
+defm "" : VPatBinarySDNode_VV_VX<umax, "PseudoVMAXU">;
+defm "" : VPatBinarySDNode_VV_VX<smax, "PseudoVMAX">;
+
+// 12.10. Vector Single-Width Integer Multiply Instructions
+defm "" : VPatBinarySDNode_VV_VX<mul, "PseudoVMUL">;
+defm "" : VPatBinarySDNode_VV_VX<mulhs, "PseudoVMULH">;
+defm "" : VPatBinarySDNode_VV_VX<mulhu, "PseudoVMULHU">;
+
+// 12.11. Vector Integer Divide Instructions
+defm "" : VPatBinarySDNode_VV_VX<sdiv, "PseudoVDIV">;
+defm "" : VPatBinarySDNode_VV_VX<udiv, "PseudoVDIVU">;
+defm "" : VPatBinarySDNode_VV_VX<urem, "PseudoVREMU">;
+defm "" : VPatBinarySDNode_VV_VX<srem, "PseudoVREM">;
+
+// 12.16. Vector Integer Merge Instructions
+foreach vti = AllIntegerVectors in {
+ def : Pat<(vti.Vector (vselect (vti.Mask VMV0:$vm), vti.RegClass:$rs1,
+ vti.RegClass:$rs2)),
+ (!cast<Instruction>("PseudoVMERGE_VVM_"#vti.LMul.MX)
+ vti.RegClass:$rs2, vti.RegClass:$rs1, VMV0:$vm,
+ vti.AVL, vti.SEW)>;
+
+ def : Pat<(vti.Vector (vselect (vti.Mask VMV0:$vm), (SplatPat XLenVT:$rs1),
+ vti.RegClass:$rs2)),
+ (!cast<Instruction>("PseudoVMERGE_VXM_"#vti.LMul.MX)
+ vti.RegClass:$rs2, GPR:$rs1, VMV0:$vm, vti.AVL, vti.SEW)>;
+
+ def : Pat<(vti.Vector (vselect (vti.Mask VMV0:$vm), (SplatPat_simm5 simm5:$rs1),
+ vti.RegClass:$rs2)),
+ (!cast<Instruction>("PseudoVMERGE_VIM_"#vti.LMul.MX)
+ vti.RegClass:$rs2, simm5:$rs1, VMV0:$vm, vti.AVL, vti.SEW)>;
+}
+
+// 16.1. Vector Mask-Register Logical Instructions
+foreach mti = AllMasks in {
+ def : Pat<(mti.Mask (and VR:$rs1, VR:$rs2)),
+ (!cast<Instruction>("PseudoVMAND_MM_"#mti.LMul.MX)
+ VR:$rs1, VR:$rs2, mti.AVL, mti.SEW)>;
+ def : Pat<(mti.Mask (or VR:$rs1, VR:$rs2)),
+ (!cast<Instruction>("PseudoVMOR_MM_"#mti.LMul.MX)
+ VR:$rs1, VR:$rs2, mti.AVL, mti.SEW)>;
+ def : Pat<(mti.Mask (xor VR:$rs1, VR:$rs2)),
+ (!cast<Instruction>("PseudoVMXOR_MM_"#mti.LMul.MX)
+ VR:$rs1, VR:$rs2, mti.AVL, mti.SEW)>;
+
+ def : Pat<(mti.Mask (vnot (and VR:$rs1, VR:$rs2))),
+ (!cast<Instruction>("PseudoVMNAND_MM_"#mti.LMul.MX)
+ VR:$rs1, VR:$rs2, mti.AVL, mti.SEW)>;
+ def : Pat<(mti.Mask (vnot (or VR:$rs1, VR:$rs2))),
+ (!cast<Instruction>("PseudoVMNOR_MM_"#mti.LMul.MX)
+ VR:$rs1, VR:$rs2, mti.AVL, mti.SEW)>;
+ def : Pat<(mti.Mask (vnot (xor VR:$rs1, VR:$rs2))),
+ (!cast<Instruction>("PseudoVMXNOR_MM_"#mti.LMul.MX)
+ VR:$rs1, VR:$rs2, mti.AVL, mti.SEW)>;
+
+ def : Pat<(mti.Mask (and VR:$rs1, (vnot VR:$rs2))),
+ (!cast<Instruction>("PseudoVMANDNOT_MM_"#mti.LMul.MX)
+ VR:$rs1, VR:$rs2, mti.AVL, mti.SEW)>;
+ def : Pat<(mti.Mask (or VR:$rs1, (vnot VR:$rs2))),
+ (!cast<Instruction>("PseudoVMORNOT_MM_"#mti.LMul.MX)
+ VR:$rs1, VR:$rs2, mti.AVL, mti.SEW)>;
+}
+
+} // Predicates = [HasStdExtV]
+
+let Predicates = [HasStdExtV, HasStdExtF] in {
+
+// 14.2. Vector Single-Width Floating-Point Add/Subtract Instructions
+defm "" : VPatBinaryFPSDNode_VV_VF<fadd, "PseudoVFADD">;
+defm "" : VPatBinaryFPSDNode_VV_VF<fsub, "PseudoVFSUB">;
+defm "" : VPatBinaryFPSDNode_R_VF<fsub, "PseudoVFRSUB">;
+
+// 14.4. Vector Single-Width Floating-Point Multiply/Divide Instructions
+defm "" : VPatBinaryFPSDNode_VV_VF<fmul, "PseudoVFMUL">;
+defm "" : VPatBinaryFPSDNode_VV_VF<fdiv, "PseudoVFDIV">;
+defm "" : VPatBinaryFPSDNode_R_VF<fdiv, "PseudoVFRDIV">;
+
+// 14.11. Vector Floating-Point Compare Instructions
+defm "" : VPatFPSetCCSDNode_VV_VF_FV<SETEQ, "PseudoVMFEQ", "PseudoVMFEQ">;
+defm "" : VPatFPSetCCSDNode_VV_VF_FV<SETOEQ, "PseudoVMFEQ", "PseudoVMFEQ">;
+
+defm "" : VPatFPSetCCSDNode_VV_VF_FV<SETNE, "PseudoVMFNE", "PseudoVMFNE">;
+defm "" : VPatFPSetCCSDNode_VV_VF_FV<SETUNE, "PseudoVMFNE", "PseudoVMFNE">;
+
+defm "" : VPatFPSetCCSDNode_VV_VF_FV<SETLT, "PseudoVMFLT", "PseudoVMFGT">;
+defm "" : VPatFPSetCCSDNode_VV_VF_FV<SETOLT, "PseudoVMFLT", "PseudoVMFGT">;
+
+defm "" : VPatFPSetCCSDNode_VV_VF_FV<SETLE, "PseudoVMFLE", "PseudoVMFGE">;
+defm "" : VPatFPSetCCSDNode_VV_VF_FV<SETOLE, "PseudoVMFLE", "PseudoVMFGE">;
+
+// Floating-point vselects:
+// 12.16. Vector Integer Merge Instructions
+// 14.13. Vector Floating-Point Merge Instruction
+foreach fvti = AllFloatVectors in {
+ def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), fvti.RegClass:$rs1,
+ fvti.RegClass:$rs2)),
+ (!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX)
+ fvti.RegClass:$rs2, fvti.RegClass:$rs1, VMV0:$vm,
+ fvti.AVL, fvti.SEW)>;
+
+ def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm),
+ (splat_vector fvti.ScalarRegClass:$rs1),
+ fvti.RegClass:$rs2)),
+ (!cast<Instruction>("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX)
+ fvti.RegClass:$rs2,
+ (fvti.Scalar fvti.ScalarRegClass:$rs1),
+ VMV0:$vm, fvti.AVL, fvti.SEW)>;
+
+ def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm),
+ (splat_vector (fvti.Scalar fpimm0)),
+ fvti.RegClass:$rs2)),
+ (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX)
+ fvti.RegClass:$rs2, 0, VMV0:$vm, fvti.AVL, fvti.SEW)>;
+}
+} // Predicates = [HasStdExtV, HasStdExtF]
+
+//===----------------------------------------------------------------------===//
+// Vector Splats
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtV] in {
+foreach vti = AllIntegerVectors in {
+ def : Pat<(vti.Vector (splat_vector GPR:$rs1)),
+ (!cast<Instruction>("PseudoVMV_V_X_" # vti.LMul.MX)
+ GPR:$rs1, vti.AVL, vti.SEW)>;
+ def : Pat<(vti.Vector (splat_vector simm5:$rs1)),
+ (!cast<Instruction>("PseudoVMV_V_I_" # vti.LMul.MX)
+ simm5:$rs1, vti.AVL, vti.SEW)>;
+}
+
+foreach mti = AllMasks in {
+ def : Pat<(mti.Mask immAllOnesV),
+ (!cast<Instruction>("PseudoVMSET_M_"#mti.BX) mti.AVL, mti.SEW)>;
+ def : Pat<(mti.Mask immAllZerosV),
+ (!cast<Instruction>("PseudoVMCLR_M_"#mti.BX) mti.AVL, mti.SEW)>;
+}
+} // Predicates = [HasStdExtV]
+
+let Predicates = [HasStdExtV, IsRV32] in {
+foreach vti = AllIntegerVectors in {
+ if !eq(vti.SEW, 64) then {
+ def : Pat<(vti.Vector (rv32_splat_i64 GPR:$rs1)),
+ (!cast<Instruction>("PseudoVMV_V_X_" # vti.LMul.MX)
+ GPR:$rs1, vti.AVL, vti.SEW)>;
+ def : Pat<(vti.Vector (rv32_splat_i64 simm5:$rs1)),
+ (!cast<Instruction>("PseudoVMV_V_I_" # vti.LMul.MX)
+ simm5:$rs1, vti.AVL, vti.SEW)>;
+ }
+}
+} // Predicates = [HasStdExtV, IsRV32]
+
+let Predicates = [HasStdExtV, HasStdExtF] in {
+foreach fvti = AllFloatVectors in {
+ def : Pat<(fvti.Vector (splat_vector fvti.ScalarRegClass:$rs1)),
+ (!cast<Instruction>("PseudoVFMV_V_"#fvti.ScalarSuffix#"_"#fvti.LMul.MX)
+ (fvti.Scalar fvti.ScalarRegClass:$rs1),
+ fvti.AVL, fvti.SEW)>;
+
+ def : Pat<(fvti.Vector (splat_vector (fvti.Scalar fpimm0))),
+ (!cast<Instruction>("PseudoVMV_V_I_"#fvti.LMul.MX)
+ 0, fvti.AVL, fvti.SEW)>;
+}
+} // Predicates = [HasStdExtV, HasStdExtF]
+
+//===----------------------------------------------------------------------===//
+// Vector Element Inserts/Extracts
+//===----------------------------------------------------------------------===//
+
+// The built-in TableGen 'extractelt' and 'insertelt' nodes must return the
+// same type as the vector element type. On RISC-V, XLenVT is the only legal
+// integer type, so for integer inserts/extracts we use a custom node which
+// returns XLenVT.
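+// For example, extracting an element from an i8 vector yields an XLenVT
+// (i32 on RV32, i64 on RV64) scalar through this node.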
+def riscv_insert_vector_elt
+ : SDNode<"ISD::INSERT_VECTOR_ELT",
+ SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisVT<2, XLenVT>,
+ SDTCisPtrTy<3>]>, []>;
+def riscv_extract_vector_elt
+ : SDNode<"ISD::EXTRACT_VECTOR_ELT",
+ SDTypeProfile<1, 2, [SDTCisVT<0, XLenVT>, SDTCisPtrTy<2>]>, []>;
+
+multiclass VPatInsertExtractElt_XI_Idx<bit IsFloat> {
+ defvar vtilist = !if(IsFloat, AllFloatVectors, AllIntegerVectors);
+ defvar insertelt_node = !if(IsFloat, insertelt, riscv_insert_vector_elt);
+ defvar extractelt_node = !if(IsFloat, extractelt, riscv_extract_vector_elt);
+ foreach vti = vtilist in {
+ defvar MX = vti.LMul.MX;
+ defvar vmv_xf_s_inst = !cast<Instruction>(!strconcat("PseudoV",
+ !if(IsFloat, "F", ""),
+ "MV_",
+ vti.ScalarSuffix,
+ "_S_", MX));
+ defvar vmv_s_xf_inst = !cast<Instruction>(!strconcat("PseudoV",
+ !if(IsFloat, "F", ""),
+ "MV_S_",
+ vti.ScalarSuffix,
+ "_", MX));
+ // Only pattern-match insert/extract-element operations where the index is
+ // 0. Any other index will have been custom-lowered to slide the vector
+ // correctly into place (and, in the case of insert, slide it back again
+ // afterwards).
+ def : Pat<(vti.Scalar (extractelt_node (vti.Vector vti.RegClass:$rs2), 0)),
+ (vmv_xf_s_inst vti.RegClass:$rs2, vti.SEW)>;
+
+ def : Pat<(vti.Vector (insertelt_node (vti.Vector vti.RegClass:$merge),
+ vti.ScalarRegClass:$rs1, 0)),
+ (vmv_s_xf_inst vti.RegClass:$merge,
+ (vti.Scalar vti.ScalarRegClass:$rs1),
+ vti.AVL, vti.SEW)>;
+ }
+}
+
+let Predicates = [HasStdExtV] in
+defm "" : VPatInsertExtractElt_XI_Idx</*IsFloat*/0>;
+let Predicates = [HasStdExtV, HasStdExtF] in
+defm "" : VPatInsertExtractElt_XI_Idx</*IsFloat*/1>;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous RISCVISD SDNodes
+//===----------------------------------------------------------------------===//
+
+def riscv_vid
+ : SDNode<"RISCVISD::VID", SDTypeProfile<1, 0, [SDTCisVec<0>]>, []>;
+
+def SDTRVVSlide : SDTypeProfile<1, 3, [
+ SDTCisVec<0>, SDTCisSameAs<1, 0>, SDTCisSameAs<2, 0>, SDTCisVT<3, XLenVT>
+]>;
+
+def riscv_slideup : SDNode<"RISCVISD::VSLIDEUP", SDTRVVSlide, []>;
+def riscv_slidedown : SDNode<"RISCVISD::VSLIDEDOWN", SDTRVVSlide, []>;
+
+let Predicates = [HasStdExtV] in {
+
+foreach vti = AllIntegerVectors in
+ def : Pat<(vti.Vector riscv_vid),
+ (!cast<Instruction>("PseudoVID_V_"#vti.LMul.MX) vti.AVL, vti.SEW)>;
+
+foreach vti = !listconcat(AllIntegerVectors, AllFloatVectors) in {
+ def : Pat<(vti.Vector (riscv_slideup (vti.Vector vti.RegClass:$rs3),
+ (vti.Vector vti.RegClass:$rs1),
+ uimm5:$rs2)),
+ (!cast<Instruction>("PseudoVSLIDEUP_VI_"#vti.LMul.MX)
+ vti.RegClass:$rs3, vti.RegClass:$rs1, uimm5:$rs2,
+ vti.AVL, vti.SEW)>;
+
+ def : Pat<(vti.Vector (riscv_slideup (vti.Vector vti.RegClass:$rs3),
+ (vti.Vector vti.RegClass:$rs1),
+ GPR:$rs2)),
+ (!cast<Instruction>("PseudoVSLIDEUP_VX_"#vti.LMul.MX)
+ vti.RegClass:$rs3, vti.RegClass:$rs1, GPR:$rs2,
+ vti.AVL, vti.SEW)>;
+
+ def : Pat<(vti.Vector (riscv_slidedown (vti.Vector vti.RegClass:$rs3),
+ (vti.Vector vti.RegClass:$rs1),
+ uimm5:$rs2)),
+ (!cast<Instruction>("PseudoVSLIDEDOWN_VI_"#vti.LMul.MX)
+ vti.RegClass:$rs3, vti.RegClass:$rs1, uimm5:$rs2,
+ vti.AVL, vti.SEW)>;
+
+ def : Pat<(vti.Vector (riscv_slidedown (vti.Vector vti.RegClass:$rs3),
+ (vti.Vector vti.RegClass:$rs1),
+ GPR:$rs2)),
+ (!cast<Instruction>("PseudoVSLIDEDOWN_VX_"#vti.LMul.MX)
+ vti.RegClass:$rs3, vti.RegClass:$rs1, GPR:$rs2,
+ vti.AVL, vti.SEW)>;
+}
+} // Predicates = [HasStdExtV]
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
new file mode 100644
index 000000000000..85ebe054499e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
@@ -0,0 +1,371 @@
+//===-- RISCVInstrInfoZfh.td - RISC-V 'Zfh' instructions ---*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the RISC-V instructions from the standard 'Zfh'
+// half-precision floating-point extension, version 0.1.
+// This version is still experimental as the 'Zfh' extension hasn't been
+// ratified yet.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// RISC-V specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDT_RISCVFMV_H_X
+ : SDTypeProfile<1, 1, [SDTCisVT<0, f16>, SDTCisVT<1, XLenVT>]>;
+def SDT_RISCVFMV_X_ANYEXTH
+ : SDTypeProfile<1, 1, [SDTCisVT<0, XLenVT>, SDTCisVT<1, f16>]>;
+
+def riscv_fmv_h_x
+ : SDNode<"RISCVISD::FMV_H_X", SDT_RISCVFMV_H_X>;
+def riscv_fmv_x_anyexth
+ : SDNode<"RISCVISD::FMV_X_ANYEXTH", SDT_RISCVFMV_X_ANYEXTH>;
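+// These correspond to the Zfh fmv.h.x and fmv.x.h instructions, which move the
+// raw 16-bit value between an integer register and a half-precision FP
+// register; the GPR result leaves the upper bits unspecified (any-extended).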
+
+//===----------------------------------------------------------------------===//
+// Instruction class templates
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class FPFMAH_rrr_frm<RISCVOpcode opcode, string opcodestr>
+ : RVInstR4<0b10, opcode, (outs FPR16:$rd),
+ (ins FPR16:$rs1, FPR16:$rs2, FPR16:$rs3, frmarg:$funct3),
+ opcodestr, "$rd, $rs1, $rs2, $rs3, $funct3">;
+
+class FPFMAHDynFrmAlias<FPFMAH_rrr_frm Inst, string OpcodeStr>
+ : InstAlias<OpcodeStr#" $rd, $rs1, $rs2, $rs3",
+ (Inst FPR16:$rd, FPR16:$rs1, FPR16:$rs2, FPR16:$rs3, 0b111)>;
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class FPALUH_rr<bits<7> funct7, bits<3> funct3, string opcodestr>
+ : RVInstR<funct7, funct3, OPC_OP_FP, (outs FPR16:$rd),
+ (ins FPR16:$rs1, FPR16:$rs2), opcodestr, "$rd, $rs1, $rs2">;
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class FPALUH_rr_frm<bits<7> funct7, string opcodestr>
+ : RVInstRFrm<funct7, OPC_OP_FP, (outs FPR16:$rd),
+ (ins FPR16:$rs1, FPR16:$rs2, frmarg:$funct3), opcodestr,
+ "$rd, $rs1, $rs2, $funct3">;
+
+class FPALUHDynFrmAlias<FPALUH_rr_frm Inst, string OpcodeStr>
+ : InstAlias<OpcodeStr#" $rd, $rs1, $rs2",
+ (Inst FPR16:$rd, FPR16:$rs1, FPR16:$rs2, 0b111)>;
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class FPCmpH_rr<bits<3> funct3, string opcodestr>
+ : RVInstR<0b1010010, funct3, OPC_OP_FP, (outs GPR:$rd),
+ (ins FPR16:$rs1, FPR16:$rs2), opcodestr, "$rd, $rs1, $rs2">,
+ Sched<[]>;
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtZfh] in {
+let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
+def FLH : RVInstI<0b001, OPC_LOAD_FP, (outs FPR16:$rd),
+ (ins GPR:$rs1, simm12:$imm12),
+ "flh", "$rd, ${imm12}(${rs1})">,
+ Sched<[]>;
+
+// Operands for stores are in the order srcreg, base, offset rather than
+// reflecting the order these fields are specified in the instruction
+// encoding.
+let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
+def FSH : RVInstS<0b001, OPC_STORE_FP, (outs),
+ (ins FPR16:$rs2, GPR:$rs1, simm12:$imm12),
+ "fsh", "$rs2, ${imm12}(${rs1})">,
+ Sched<[]>;
+
+def FMADD_H : FPFMAH_rrr_frm<OPC_MADD, "fmadd.h">,
+ Sched<[]>;
+def : FPFMAHDynFrmAlias<FMADD_H, "fmadd.h">;
+def FMSUB_H : FPFMAH_rrr_frm<OPC_MSUB, "fmsub.h">,
+ Sched<[]>;
+def : FPFMAHDynFrmAlias<FMSUB_H, "fmsub.h">;
+def FNMSUB_H : FPFMAH_rrr_frm<OPC_NMSUB, "fnmsub.h">,
+ Sched<[]>;
+def : FPFMAHDynFrmAlias<FNMSUB_H, "fnmsub.h">;
+def FNMADD_H : FPFMAH_rrr_frm<OPC_NMADD, "fnmadd.h">,
+ Sched<[]>;
+def : FPFMAHDynFrmAlias<FNMADD_H, "fnmadd.h">;
+
+def FADD_H : FPALUH_rr_frm<0b0000010, "fadd.h">,
+ Sched<[]>;
+def : FPALUHDynFrmAlias<FADD_H, "fadd.h">;
+def FSUB_H : FPALUH_rr_frm<0b0000110, "fsub.h">,
+ Sched<[]>;
+def : FPALUHDynFrmAlias<FSUB_H, "fsub.h">;
+def FMUL_H : FPALUH_rr_frm<0b0001010, "fmul.h">,
+ Sched<[]>;
+def : FPALUHDynFrmAlias<FMUL_H, "fmul.h">;
+def FDIV_H : FPALUH_rr_frm<0b0001110, "fdiv.h">,
+ Sched<[]>;
+def : FPALUHDynFrmAlias<FDIV_H, "fdiv.h">;
+
+def FSQRT_H : FPUnaryOp_r_frm<0b0101110, FPR16, FPR16, "fsqrt.h">,
+ Sched<[]> {
+ let rs2 = 0b00000;
+}
+def : FPUnaryOpDynFrmAlias<FSQRT_H, "fsqrt.h", FPR16, FPR16>;
+
+def FSGNJ_H : FPALUH_rr<0b0010010, 0b000, "fsgnj.h">,
+ Sched<[]>;
+def FSGNJN_H : FPALUH_rr<0b0010010, 0b001, "fsgnjn.h">,
+ Sched<[]>;
+def FSGNJX_H : FPALUH_rr<0b0010010, 0b010, "fsgnjx.h">,
+ Sched<[]>;
+
+def FMIN_H : FPALUH_rr<0b0010110, 0b000, "fmin.h">,
+ Sched<[]>;
+def FMAX_H : FPALUH_rr<0b0010110, 0b001, "fmax.h">,
+ Sched<[]>;
+
+def FCVT_W_H : FPUnaryOp_r_frm<0b1100010, GPR, FPR16, "fcvt.w.h">,
+ Sched<[]> {
+ let rs2 = 0b00000;
+}
+def : FPUnaryOpDynFrmAlias<FCVT_W_H, "fcvt.w.h", GPR, FPR16>;
+
+def FCVT_WU_H : FPUnaryOp_r_frm<0b1100010, GPR, FPR16, "fcvt.wu.h">,
+ Sched<[]> {
+ let rs2 = 0b00001;
+}
+def : FPUnaryOpDynFrmAlias<FCVT_WU_H, "fcvt.wu.h", GPR, FPR16>;
+
+def FCVT_H_W : FPUnaryOp_r_frm<0b1101010, FPR16, GPR, "fcvt.h.w">,
+ Sched<[]> {
+ let rs2 = 0b00000;
+}
+def : FPUnaryOpDynFrmAlias<FCVT_H_W, "fcvt.h.w", FPR16, GPR>;
+
+def FCVT_H_WU : FPUnaryOp_r_frm<0b1101010, FPR16, GPR, "fcvt.h.wu">,
+ Sched<[]> {
+ let rs2 = 0b00001;
+}
+def : FPUnaryOpDynFrmAlias<FCVT_H_WU, "fcvt.h.wu", FPR16, GPR>;
+
+def FCVT_H_S : FPUnaryOp_r_frm<0b0100010, FPR16, FPR32, "fcvt.h.s">,
+ Sched<[]> {
+ let rs2 = 0b00000;
+}
+def : FPUnaryOpDynFrmAlias<FCVT_H_S, "fcvt.h.s", FPR16, FPR32>;
+
+def FCVT_S_H : FPUnaryOp_r<0b0100000, 0b000, FPR32, FPR16, "fcvt.s.h">,
+ Sched<[]> {
+ let rs2 = 0b00010;
+}
+
+def FMV_X_H : FPUnaryOp_r<0b1110010, 0b000, GPR, FPR16, "fmv.x.h">,
+ Sched<[]> {
+ let rs2 = 0b00000;
+}
+
+def FMV_H_X : FPUnaryOp_r<0b1111010, 0b000, FPR16, GPR, "fmv.h.x">,
+ Sched<[]> {
+ let rs2 = 0b00000;
+}
+
+def FEQ_H : FPCmpH_rr<0b010, "feq.h">;
+def FLT_H : FPCmpH_rr<0b001, "flt.h">;
+def FLE_H : FPCmpH_rr<0b000, "fle.h">;
+
+def FCLASS_H : FPUnaryOp_r<0b1110010, 0b001, GPR, FPR16, "fclass.h">,
+ Sched<[]> {
+ let rs2 = 0b00000;
+}
+} // Predicates = [HasStdExtZfh]
+
+let Predicates = [HasStdExtZfh, IsRV64] in {
+def FCVT_L_H : FPUnaryOp_r_frm<0b1100010, GPR, FPR16, "fcvt.l.h">,
+ Sched<[]> {
+ let rs2 = 0b00010;
+}
+def : FPUnaryOpDynFrmAlias<FCVT_L_H, "fcvt.l.h", GPR, FPR16>;
+
+def FCVT_LU_H : FPUnaryOp_r_frm<0b1100010, GPR, FPR16, "fcvt.lu.h">,
+ Sched<[]> {
+ let rs2 = 0b00011;
+}
+def : FPUnaryOpDynFrmAlias<FCVT_LU_H, "fcvt.lu.h", GPR, FPR16>;
+
+def FCVT_H_L : FPUnaryOp_r_frm<0b1101010, FPR16, GPR, "fcvt.h.l">,
+ Sched<[]> {
+ let rs2 = 0b00010;
+}
+def : FPUnaryOpDynFrmAlias<FCVT_H_L, "fcvt.h.l", FPR16, GPR>;
+
+def FCVT_H_LU : FPUnaryOp_r_frm<0b1101010, FPR16, GPR, "fcvt.h.lu">,
+ Sched<[]> {
+ let rs2 = 0b00011;
+}
+def : FPUnaryOpDynFrmAlias<FCVT_H_LU, "fcvt.h.lu", FPR16, GPR>;
+} // Predicates = [HasStdExtZfh, IsRV64]
+
+let Predicates = [HasStdExtZfh, HasStdExtD] in {
+def FCVT_H_D : FPUnaryOp_r_frm<0b0100010, FPR16, FPR64, "fcvt.h.d">,
+ Sched<[]> {
+ let rs2 = 0b00001;
+}
+def : FPUnaryOpDynFrmAlias<FCVT_H_D, "fcvt.h.d", FPR16, FPR64>;
+
+def FCVT_D_H : FPUnaryOp_r<0b0100001, 0b000, FPR64, FPR16, "fcvt.d.h">,
+ Sched<[]> {
+ let rs2 = 0b00010;
+}
+} // Predicates = [HasStdExtZfh, HasStdExtD]
+
+//===----------------------------------------------------------------------===//
+// Assembler Pseudo Instructions (User-Level ISA, Version 2.2, Chapter 20)
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtZfh] in {
+def : InstAlias<"flh $rd, (${rs1})", (FLH FPR16:$rd, GPR:$rs1, 0), 0>;
+def : InstAlias<"fsh $rs2, (${rs1})", (FSH FPR16:$rs2, GPR:$rs1, 0), 0>;
+
+def : InstAlias<"fmv.h $rd, $rs", (FSGNJ_H FPR16:$rd, FPR16:$rs, FPR16:$rs)>;
+def : InstAlias<"fabs.h $rd, $rs", (FSGNJX_H FPR16:$rd, FPR16:$rs, FPR16:$rs)>;
+def : InstAlias<"fneg.h $rd, $rs", (FSGNJN_H FPR16:$rd, FPR16:$rs, FPR16:$rs)>;
+
+// fgt.h/fge.h are recognised by the GNU assembler but the canonical
+// flt.h/fle.h forms will always be printed. Therefore, set a zero weight.
+def : InstAlias<"fgt.h $rd, $rs, $rt",
+ (FLT_H GPR:$rd, FPR16:$rt, FPR16:$rs), 0>;
+def : InstAlias<"fge.h $rd, $rs, $rt",
+ (FLE_H GPR:$rd, FPR16:$rt, FPR16:$rs), 0>;
+
+def PseudoFLH : PseudoFloatLoad<"flh", FPR16>;
+def PseudoFSH : PseudoStore<"fsh", FPR16>;
+} // Predicates = [HasStdExtZfh]
+
+//===----------------------------------------------------------------------===//
+// Pseudo-instructions and codegen patterns
+//===----------------------------------------------------------------------===//
+
+/// Generic pattern classes
+class PatFpr16Fpr16<SDPatternOperator OpNode, RVInstR Inst>
+ : Pat<(OpNode FPR16:$rs1, FPR16:$rs2), (Inst $rs1, $rs2)>;
+
+class PatFpr16Fpr16DynFrm<SDPatternOperator OpNode, RVInstRFrm Inst>
+ : Pat<(OpNode FPR16:$rs1, FPR16:$rs2), (Inst $rs1, $rs2, 0b111)>;
+
+let Predicates = [HasStdExtZfh] in {
+
+/// Float constants
+def : Pat<(f16 (fpimm0)), (FMV_H_X X0)>;
+
+/// Float conversion operations
+
+// [u]int32<->float conversion patterns must be gated on IsRV32 or IsRV64, so
+// are defined later.
+
+/// Float arithmetic operations
+
+def : PatFpr16Fpr16DynFrm<fadd, FADD_H>;
+def : PatFpr16Fpr16DynFrm<fsub, FSUB_H>;
+def : PatFpr16Fpr16DynFrm<fmul, FMUL_H>;
+def : PatFpr16Fpr16DynFrm<fdiv, FDIV_H>;
+
+def : Pat<(fsqrt FPR16:$rs1), (FSQRT_H FPR16:$rs1, 0b111)>;
+
+def : Pat<(fneg FPR16:$rs1), (FSGNJN_H $rs1, $rs1)>;
+def : Pat<(fabs FPR16:$rs1), (FSGNJX_H $rs1, $rs1)>;
+
+def : PatFpr16Fpr16<fcopysign, FSGNJ_H>;
+def : Pat<(fcopysign FPR16:$rs1, (fneg FPR16:$rs2)), (FSGNJN_H $rs1, $rs2)>;
+def : Pat<(fcopysign FPR16:$rs1, FPR32:$rs2),
+ (FSGNJ_H $rs1, (FCVT_H_S $rs2, 0b111))>;
+def : Pat<(fcopysign FPR16:$rs1, FPR64:$rs2),
+ (FSGNJ_H $rs1, (FCVT_H_D $rs2, 0b111))>;
+def : Pat<(fcopysign FPR32:$rs1, FPR16:$rs2), (FSGNJ_S $rs1, (FCVT_S_H $rs2))>;
+def : Pat<(fcopysign FPR64:$rs1, FPR16:$rs2), (FSGNJ_D $rs1, (FCVT_D_H $rs2))>;
+
+// fmadd: rs1 * rs2 + rs3
+def : Pat<(fma FPR16:$rs1, FPR16:$rs2, FPR16:$rs3),
+ (FMADD_H $rs1, $rs2, $rs3, 0b111)>;
+
+// fmsub: rs1 * rs2 - rs3
+def : Pat<(fma FPR16:$rs1, FPR16:$rs2, (fneg FPR16:$rs3)),
+ (FMSUB_H FPR16:$rs1, FPR16:$rs2, FPR16:$rs3, 0b111)>;
+
+// fnmsub: -rs1 * rs2 + rs3
+def : Pat<(fma (fneg FPR16:$rs1), FPR16:$rs2, FPR16:$rs3),
+ (FNMSUB_H FPR16:$rs1, FPR16:$rs2, FPR16:$rs3, 0b111)>;
+
+// fnmadd: -rs1 * rs2 - rs3
+def : Pat<(fma (fneg FPR16:$rs1), FPR16:$rs2, (fneg FPR16:$rs3)),
+ (FNMADD_H FPR16:$rs1, FPR16:$rs2, FPR16:$rs3, 0b111)>;
+
+def : PatFpr16Fpr16<fminnum, FMIN_H>;
+def : PatFpr16Fpr16<fmaxnum, FMAX_H>;
+
+/// Setcc
+
+def : PatFpr16Fpr16<seteq, FEQ_H>;
+def : PatFpr16Fpr16<setoeq, FEQ_H>;
+def : PatFpr16Fpr16<setlt, FLT_H>;
+def : PatFpr16Fpr16<setolt, FLT_H>;
+def : PatFpr16Fpr16<setle, FLE_H>;
+def : PatFpr16Fpr16<setole, FLE_H>;
+
+def Select_FPR16_Using_CC_GPR : SelectCC_rrirr<FPR16, GPR>;
+
+/// Loads
+
+defm : LdPat<load, FLH>;
+
+/// Stores
+
+defm : StPat<store, FSH, FPR16>;
+
+/// Float conversion operations
+
+// f32 -> f16, f16 -> f32
+def : Pat<(fpround FPR32:$rs1), (FCVT_H_S FPR32:$rs1, 0b111)>;
+def : Pat<(fpextend FPR16:$rs1), (FCVT_S_H FPR16:$rs1)>;
+
+// Moves (no conversion)
+def : Pat<(riscv_fmv_h_x GPR:$src), (FMV_H_X GPR:$src)>;
+def : Pat<(riscv_fmv_x_anyexth FPR16:$src), (FMV_X_H FPR16:$src)>;
+} // Predicates = [HasStdExtZfh]
+
+let Predicates = [HasStdExtZfh, IsRV32] in {
+// float->[u]int. Round-to-zero must be used.
+def : Pat<(fp_to_sint FPR16:$rs1), (FCVT_W_H $rs1, 0b001)>;
+def : Pat<(fp_to_uint FPR16:$rs1), (FCVT_WU_H $rs1, 0b001)>;
+
+// [u]int->float. Match GCC and default to using dynamic rounding mode.
+def : Pat<(sint_to_fp GPR:$rs1), (FCVT_H_W $rs1, 0b111)>;
+def : Pat<(uint_to_fp GPR:$rs1), (FCVT_H_WU $rs1, 0b111)>;
+} // Predicates = [HasStdExtZfh, IsRV32]
+
+let Predicates = [HasStdExtZfh, IsRV64] in {
+// FP->[u]int32 is mostly handled by the FP->[u]int64 patterns. This is safe
+// because fpto[u|s]i produces poison if the value can't fit into the target.
+// We match the single case below because fcvt.wu.h sign-extends its result so
+// is cheaper than fcvt.lu.h+sext.w.
+def : Pat<(sext_inreg (assertzexti32 (fp_to_uint FPR16:$rs1)), i32),
+ (FCVT_WU_H $rs1, 0b001)>;
+
+// FP->[u]int64
+def : Pat<(fp_to_sint FPR16:$rs1), (FCVT_L_H $rs1, 0b001)>;
+def : Pat<(fp_to_uint FPR16:$rs1), (FCVT_LU_H $rs1, 0b001)>;
+
+// [u]int->fp. Match GCC and default to using dynamic rounding mode.
+def : Pat<(sint_to_fp (sexti32 GPR:$rs1)), (FCVT_H_W $rs1, 0b111)>;
+def : Pat<(uint_to_fp (zexti32 GPR:$rs1)), (FCVT_H_WU $rs1, 0b111)>;
+def : Pat<(sint_to_fp GPR:$rs1), (FCVT_H_L $rs1, 0b111)>;
+def : Pat<(uint_to_fp GPR:$rs1), (FCVT_H_LU $rs1, 0b111)>;
+} // Predicates = [HasStdExtZfh, IsRV64]
+
+let Predicates = [HasStdExtZfh, HasStdExtD] in {
+/// Float conversion operations
+// f64 -> f16, f16 -> f64
+def : Pat<(fpround FPR64:$rs1), (FCVT_H_D FPR64:$rs1, 0b111)>;
+def : Pat<(fpextend FPR16:$rs1), (FCVT_D_H FPR16:$rs1)>;
+}
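
Editor's note: the fixed third operand in the Zfh patterns above is the RISC-V rounding-mode (frm) field; the FP-to-integer conversions pass 0b001 (round towards zero) while the arithmetic and integer-to-FP patterns pass 0b111 (dynamic, taken from the frm CSR). The following is a minimal standalone sketch of those encodings for reference; the enum and names are illustrative only and not part of the LLVM sources.

#include <cstdint>
#include <cstdio>

// RISC-V frm encodings from the F extension; RTZ and DYN are the two values
// hard-coded by the Zfh codegen patterns above.
enum class RoundingMode : std::uint8_t {
  RNE = 0b000, // round to nearest, ties to even
  RTZ = 0b001, // round towards zero (used by the fp_to_[su]int patterns)
  RDN = 0b010, // round down (towards -infinity)
  RUP = 0b011, // round up (towards +infinity)
  RMM = 0b100, // round to nearest, ties to max magnitude
  DYN = 0b111  // dynamic: use the rounding mode held in the frm CSR
};

int main() {
  std::printf("fcvt.w.h frm operand: %u (RTZ)\n",
              static_cast<unsigned>(RoundingMode::RTZ));
  std::printf("fadd.h   frm operand: %u (DYN)\n",
              static_cast<unsigned>(RoundingMode::DYN));
  return 0;
}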
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp
index b1dbcfa7f738..3c38dd1bf64d 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "RISCV.h"
+#include "RISCVSubtarget.h"
#include "MCTargetDesc/RISCVMCExpr.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -121,12 +122,93 @@ bool llvm::LowerRISCVMachineOperandToMCOperand(const MachineOperand &MO,
case MachineOperand::MO_ConstantPoolIndex:
MCOp = lowerSymbolOperand(MO, AP.GetCPISymbol(MO.getIndex()), AP);
break;
+ case MachineOperand::MO_JumpTableIndex:
+ MCOp = lowerSymbolOperand(MO, AP.GetJTISymbol(MO.getIndex()), AP);
+ break;
+ }
+ return true;
+}
+
+static bool lowerRISCVVMachineInstrToMCInst(const MachineInstr *MI,
+ MCInst &OutMI) {
+ const RISCVVPseudosTable::PseudoInfo *RVV =
+ RISCVVPseudosTable::getPseudoInfo(MI->getOpcode());
+ if (!RVV)
+ return false;
+
+ OutMI.setOpcode(RVV->BaseInstr);
+
+ const MachineBasicBlock *MBB = MI->getParent();
+ assert(MBB && "MI expected to be in a basic block");
+ const MachineFunction *MF = MBB->getParent();
+ assert(MF && "MBB expected to be in a machine function");
+
+ const TargetRegisterInfo *TRI =
+ MF->getSubtarget<RISCVSubtarget>().getRegisterInfo();
+ assert(TRI && "TargetRegisterInfo expected");
+
+ uint64_t TSFlags = MI->getDesc().TSFlags;
+ int NumOps = MI->getNumExplicitOperands();
+
+ for (const MachineOperand &MO : MI->explicit_operands()) {
+ int OpNo = (int)MI->getOperandNo(&MO);
+ assert(OpNo >= 0 && "Operand number doesn't fit in an 'int' type");
+
+ // Skip VL and SEW operands which are the last two operands if present.
+ if ((TSFlags & RISCVII::HasVLOpMask) && OpNo == (NumOps - 2))
+ continue;
+ if ((TSFlags & RISCVII::HasSEWOpMask) && OpNo == (NumOps - 1))
+ continue;
+
+ // Skip merge op. It should be the first operand after the result.
+ if ((TSFlags & RISCVII::HasMergeOpMask) && OpNo == 1) {
+ assert(MI->getNumExplicitDefs() == 1);
+ continue;
+ }
+
+ MCOperand MCOp;
+ switch (MO.getType()) {
+ default:
+ llvm_unreachable("Unknown operand type");
+ case MachineOperand::MO_Register: {
+ unsigned Reg = MO.getReg();
+
+ if (RISCV::VRM2RegClass.contains(Reg) ||
+ RISCV::VRM4RegClass.contains(Reg) ||
+ RISCV::VRM8RegClass.contains(Reg)) {
+ Reg = TRI->getSubReg(Reg, RISCV::sub_vrm1_0);
+ assert(Reg && "Subregister does not exist");
+ } else if (RISCV::FPR16RegClass.contains(Reg)) {
+ Reg = TRI->getMatchingSuperReg(Reg, RISCV::sub_16, &RISCV::FPR32RegClass);
+        assert(Reg && "Superregister does not exist");
+ } else if (RISCV::FPR64RegClass.contains(Reg)) {
+ Reg = TRI->getSubReg(Reg, RISCV::sub_32);
+        assert(Reg && "Subregister does not exist");
+ }
+
+ MCOp = MCOperand::createReg(Reg);
+ break;
+ }
+ case MachineOperand::MO_Immediate:
+ MCOp = MCOperand::createImm(MO.getImm());
+ break;
+ }
+ OutMI.addOperand(MCOp);
}
+
+  // Unmasked pseudo instructions need a dummy mask operand appended because
+  // all V instructions are modeled as their masked versions.
+ if (TSFlags & RISCVII::HasDummyMaskOpMask)
+ OutMI.addOperand(MCOperand::createReg(RISCV::NoRegister));
+
return true;
}
void llvm::LowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
const AsmPrinter &AP) {
+ if (lowerRISCVVMachineInstrToMCInst(MI, OutMI))
+ return;
+
OutMI.setOpcode(MI->getOpcode());
for (const MachineOperand &MO : MI->operands()) {
@@ -134,4 +216,20 @@ void llvm::LowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
if (LowerRISCVMachineOperandToMCOperand(MO, MCOp, AP))
OutMI.addOperand(MCOp);
}
+
+ if (OutMI.getOpcode() == RISCV::PseudoReadVLENB) {
+ OutMI.setOpcode(RISCV::CSRRS);
+ OutMI.addOperand(MCOperand::createImm(
+ RISCVSysReg::lookupSysRegByName("VLENB")->Encoding));
+ OutMI.addOperand(MCOperand::createReg(RISCV::X0));
+ return;
+ }
+
+ if (OutMI.getOpcode() == RISCV::PseudoReadVL) {
+ OutMI.setOpcode(RISCV::CSRRS);
+ OutMI.addOperand(MCOperand::createImm(
+ RISCVSysReg::lookupSysRegByName("VL")->Encoding));
+ OutMI.addOperand(MCOperand::createReg(RISCV::X0));
+ return;
+ }
}
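
Editor's note: the new lowerRISCVVMachineInstrToMCInst above essentially strips the bookkeeping operands that exist only on the RVV pseudos (VL, SEW and the merge operand) and appends a dummy mask register for unmasked forms. The self-contained sketch below mirrors that filtering over a toy operand list; the struct, function name, and example operand shape are hypothetical and not taken from the LLVM sources.

#include <cstdio>
#include <string>
#include <vector>

struct Operand { std::string Text; };

// Mimics the operand filtering done when an RVV pseudo is lowered to its base
// MCInst: drop trailing SEW and VL, drop the merge operand after the single
// result, then append a placeholder mask operand for unmasked pseudos.
static std::vector<Operand> lowerPseudoOperands(std::vector<Operand> Ops,
                                                bool HasVLOp, bool HasSEWOp,
                                                bool HasMergeOp,
                                                bool HasDummyMask) {
  if (HasSEWOp && !Ops.empty())
    Ops.pop_back();               // SEW is the last explicit operand.
  if (HasVLOp && !Ops.empty())
    Ops.pop_back();               // VL sits immediately before SEW.
  if (HasMergeOp && Ops.size() > 1)
    Ops.erase(Ops.begin() + 1);   // Merge operand follows the result.
  if (HasDummyMask)
    Ops.push_back({"<no-mask>"}); // NoRegister in the real lowering.
  return Ops;
}

int main() {
  // Illustrative operand shape only: result, sources, then VL and SEW.
  std::vector<Operand> Out = lowerPseudoOperands(
      {{"$vd"}, {"$vs2"}, {"$vs1"}, {"$vl"}, {"$sew"}},
      /*HasVLOp=*/true, /*HasSEWOp=*/true,
      /*HasMergeOp=*/false, /*HasDummyMask=*/true);
  for (const Operand &Op : Out)
    std::printf("%s ", Op.Text.c_str());
  std::printf("\n");
  return 0;
}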
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
index 4c9013aa1e23..87586023caa4 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
@@ -64,7 +64,7 @@ private:
} // end anonymous namespace
char RISCVMergeBaseOffsetOpt::ID = 0;
-INITIALIZE_PASS(RISCVMergeBaseOffsetOpt, "riscv-merge-base-offset",
+INITIALIZE_PASS(RISCVMergeBaseOffsetOpt, DEBUG_TYPE,
RISCV_MERGE_BASE_OFFSET_NAME, false, false)
// Detect the pattern:
@@ -216,12 +216,14 @@ bool RISCVMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &HiLUI,
case RISCV::LHU:
case RISCV::LWU:
case RISCV::LD:
+ case RISCV::FLH:
case RISCV::FLW:
case RISCV::FLD:
case RISCV::SB:
case RISCV::SH:
case RISCV::SW:
case RISCV::SD:
+ case RISCV::FSH:
case RISCV::FSW:
case RISCV::FSD: {
// Transforms the sequence: Into:
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index cb7d55eb0f0c..631077ef83f5 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -29,6 +29,9 @@ using namespace llvm;
static_assert(RISCV::X1 == RISCV::X0 + 1, "Register list not consecutive");
static_assert(RISCV::X31 == RISCV::X0 + 31, "Register list not consecutive");
+static_assert(RISCV::F1_H == RISCV::F0_H + 1, "Register list not consecutive");
+static_assert(RISCV::F31_H == RISCV::F0_H + 31,
+ "Register list not consecutive");
static_assert(RISCV::F1_F == RISCV::F0_F + 1, "Register list not consecutive");
static_assert(RISCV::F31_F == RISCV::F0_F + 31,
"Register list not consecutive");
@@ -45,6 +48,8 @@ RISCVRegisterInfo::RISCVRegisterInfo(unsigned HwMode)
const MCPhysReg *
RISCVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
auto &Subtarget = MF->getSubtarget<RISCVSubtarget>();
+ if (MF->getFunction().getCallingConv() == CallingConv::GHC)
+ return CSR_NoRegs_SaveList;
if (MF->getFunction().hasFnAttribute("interrupt")) {
if (Subtarget.hasStdExtD())
return CSR_XLEN_F64_Interrupt_SaveList;
@@ -89,6 +94,13 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// variable-sized objects at runtime.
if (TFI->hasBP(MF))
markSuperRegs(Reserved, RISCVABI::getBPReg()); // bp
+
+ // V registers for code generation. We handle them manually.
+ markSuperRegs(Reserved, RISCV::VL);
+ markSuperRegs(Reserved, RISCV::VTYPE);
+ markSuperRegs(Reserved, RISCV::VXSAT);
+ markSuperRegs(Reserved, RISCV::VXRM);
+
assert(checkAllSuperRegsMarked(Reserved));
return Reserved;
}
@@ -152,9 +164,10 @@ void RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
Register FrameReg;
- int Offset =
- getFrameLowering(MF)->getFrameIndexReference(MF, FrameIndex, FrameReg) +
- MI.getOperand(FIOperandNum + 1).getImm();
+ int Offset = getFrameLowering(MF)
+ ->getFrameIndexReference(MF, FrameIndex, FrameReg)
+ .getFixed() +
+ MI.getOperand(FIOperandNum + 1).getImm();
if (!isInt<32>(Offset)) {
report_fatal_error(
@@ -190,9 +203,11 @@ Register RISCVRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const uint32_t *
RISCVRegisterInfo::getCallPreservedMask(const MachineFunction & MF,
- CallingConv::ID /*CC*/) const {
+ CallingConv::ID CC) const {
auto &Subtarget = MF.getSubtarget<RISCVSubtarget>();
+ if (CC == CallingConv::GHC)
+ return CSR_NoRegs_RegMask;
switch (Subtarget.getTargetABI()) {
default:
llvm_unreachable("Unrecognized ABI");
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index 7544b4b3b845..e1a11fd9389f 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -16,14 +16,23 @@ class RISCVReg<bits<5> Enc, string n, list<string> alt = []> : Register<n> {
let AltNames = alt;
}
-class RISCVReg32<bits<5> Enc, string n, list<string> alt = []> : Register<n> {
+class RISCVReg16<bits<5> Enc, string n, list<string> alt = []> : Register<n> {
let HWEncoding{4-0} = Enc;
let AltNames = alt;
}
+def sub_16 : SubRegIndex<16>;
+class RISCVReg32<RISCVReg16 subreg> : Register<""> {
+ let HWEncoding{4-0} = subreg.HWEncoding{4-0};
+ let SubRegs = [subreg];
+ let SubRegIndices = [sub_16];
+ let AsmName = subreg.AsmName;
+ let AltNames = subreg.AltNames;
+}
+
// Because RISCVReg64 register have AsmName and AltNames that alias with their
-// 32-bit sub-register, RISCVAsmParser will need to coerce a register number
-// from a RISCVReg32 to the equivalent RISCVReg64 when appropriate.
+// 16/32-bit sub-register, RISCVAsmParser will need to coerce a register number
+// from a RISCVReg16/RISCVReg32 to the equivalent RISCVReg64 when appropriate.
def sub_32 : SubRegIndex<32>;
class RISCVReg64<RISCVReg32 subreg> : Register<""> {
let HWEncoding{4-0} = subreg.HWEncoding{4-0};
@@ -42,12 +51,21 @@ class RISCVRegWithSubRegs<bits<5> Enc, string n, list<Register> subregs,
def ABIRegAltName : RegAltNameIndex;
-def sub_vrm2 : SubRegIndex<64, -1>;
-def sub_vrm2_hi : SubRegIndex<64, -1>;
-def sub_vrm4 : SubRegIndex<128, -1>;
-def sub_vrm4_hi : SubRegIndex<128, -1>;
-def sub_vrm8 : SubRegIndex<256, -1>;
-def sub_vrm8_hi : SubRegIndex<256, -1>;
+def sub_vrm1_0 : SubRegIndex<64, -1>;
+def sub_vrm1_1 : SubRegIndex<64, -1>;
+def sub_vrm1_2 : SubRegIndex<64, -1>;
+def sub_vrm1_3 : SubRegIndex<64, -1>;
+def sub_vrm1_4 : SubRegIndex<64, -1>;
+def sub_vrm1_5 : SubRegIndex<64, -1>;
+def sub_vrm1_6 : SubRegIndex<64, -1>;
+def sub_vrm1_7 : SubRegIndex<64, -1>;
+def sub_vrm2_0 : SubRegIndex<128, -1>;
+def sub_vrm2_1 : SubRegIndex<128, -1>;
+def sub_vrm2_2 : SubRegIndex<128, -1>;
+def sub_vrm2_3 : SubRegIndex<128, -1>;
+def sub_vrm4_0 : SubRegIndex<256, -1>;
+def sub_vrm4_1 : SubRegIndex<256, -1>;
+
} // Namespace = "RISCV"
// Integer registers
@@ -97,8 +115,8 @@ let RegAltNameIndices = [ABIRegAltName] in {
}
}
-def XLenVT : ValueTypeByHwMode<[RV32, RV64, DefaultMode],
- [i32, i64, i32]>;
+def XLenVT : ValueTypeByHwMode<[RV32, RV64],
+ [i32, i64]>;
// The order of registers represents the preferred allocation sequence.
// Registers are listed in the order caller-save, callee-save, specials.
@@ -111,14 +129,14 @@ def GPR : RegisterClass<"RISCV", [XLenVT], 32, (add
(sequence "X%u", 0, 4)
)> {
let RegInfos = RegInfoByHwMode<
- [RV32, RV64, DefaultMode],
- [RegInfo<32,32,32>, RegInfo<64,64,64>, RegInfo<32,32,32>]>;
+ [RV32, RV64],
+ [RegInfo<32,32,32>, RegInfo<64,64,64>]>;
}
def GPRX0 : RegisterClass<"RISCV", [XLenVT], 32, (add X0)> {
let RegInfos = RegInfoByHwMode<
- [RV32, RV64, DefaultMode],
- [RegInfo<32,32,32>, RegInfo<64,64,64>, RegInfo<32,32,32>]>;
+ [RV32, RV64],
+ [RegInfo<32,32,32>, RegInfo<64,64,64>]>;
}
// The order of registers represents the preferred allocation sequence.
@@ -132,8 +150,8 @@ def GPRNoX0 : RegisterClass<"RISCV", [XLenVT], 32, (add
(sequence "X%u", 1, 4)
)> {
let RegInfos = RegInfoByHwMode<
- [RV32, RV64, DefaultMode],
- [RegInfo<32,32,32>, RegInfo<64,64,64>, RegInfo<32,32,32>]>;
+ [RV32, RV64],
+ [RegInfo<32,32,32>, RegInfo<64,64,64>]>;
}
def GPRNoX0X2 : RegisterClass<"RISCV", [XLenVT], 32, (add
@@ -145,8 +163,8 @@ def GPRNoX0X2 : RegisterClass<"RISCV", [XLenVT], 32, (add
X1, X3, X4
)> {
let RegInfos = RegInfoByHwMode<
- [RV32, RV64, DefaultMode],
- [RegInfo<32,32,32>, RegInfo<64,64,64>, RegInfo<32,32,32>]>;
+ [RV32, RV64],
+ [RegInfo<32,32,32>, RegInfo<64,64,64>]>;
}
def GPRC : RegisterClass<"RISCV", [XLenVT], 32, (add
@@ -154,8 +172,8 @@ def GPRC : RegisterClass<"RISCV", [XLenVT], 32, (add
(sequence "X%u", 8, 9)
)> {
let RegInfos = RegInfoByHwMode<
- [RV32, RV64, DefaultMode],
- [RegInfo<32,32,32>, RegInfo<64,64,64>, RegInfo<32,32,32>]>;
+ [RV32, RV64],
+ [RegInfo<32,32,32>, RegInfo<64,64,64>]>;
}
// For indirect tail calls, we can't use callee-saved registers, as they are
@@ -167,50 +185,55 @@ def GPRTC : RegisterClass<"RISCV", [XLenVT], 32, (add
(sequence "X%u", 28, 31)
)> {
let RegInfos = RegInfoByHwMode<
- [RV32, RV64, DefaultMode],
- [RegInfo<32,32,32>, RegInfo<64,64,64>, RegInfo<32,32,32>]>;
+ [RV32, RV64],
+ [RegInfo<32,32,32>, RegInfo<64,64,64>]>;
}
def SP : RegisterClass<"RISCV", [XLenVT], 32, (add X2)> {
let RegInfos = RegInfoByHwMode<
- [RV32, RV64, DefaultMode],
- [RegInfo<32,32,32>, RegInfo<64,64,64>, RegInfo<32,32,32>]>;
+ [RV32, RV64],
+ [RegInfo<32,32,32>, RegInfo<64,64,64>]>;
}
// Floating point registers
let RegAltNameIndices = [ABIRegAltName] in {
- def F0_F : RISCVReg32<0, "f0", ["ft0"]>, DwarfRegNum<[32]>;
- def F1_F : RISCVReg32<1, "f1", ["ft1"]>, DwarfRegNum<[33]>;
- def F2_F : RISCVReg32<2, "f2", ["ft2"]>, DwarfRegNum<[34]>;
- def F3_F : RISCVReg32<3, "f3", ["ft3"]>, DwarfRegNum<[35]>;
- def F4_F : RISCVReg32<4, "f4", ["ft4"]>, DwarfRegNum<[36]>;
- def F5_F : RISCVReg32<5, "f5", ["ft5"]>, DwarfRegNum<[37]>;
- def F6_F : RISCVReg32<6, "f6", ["ft6"]>, DwarfRegNum<[38]>;
- def F7_F : RISCVReg32<7, "f7", ["ft7"]>, DwarfRegNum<[39]>;
- def F8_F : RISCVReg32<8, "f8", ["fs0"]>, DwarfRegNum<[40]>;
- def F9_F : RISCVReg32<9, "f9", ["fs1"]>, DwarfRegNum<[41]>;
- def F10_F : RISCVReg32<10,"f10", ["fa0"]>, DwarfRegNum<[42]>;
- def F11_F : RISCVReg32<11,"f11", ["fa1"]>, DwarfRegNum<[43]>;
- def F12_F : RISCVReg32<12,"f12", ["fa2"]>, DwarfRegNum<[44]>;
- def F13_F : RISCVReg32<13,"f13", ["fa3"]>, DwarfRegNum<[45]>;
- def F14_F : RISCVReg32<14,"f14", ["fa4"]>, DwarfRegNum<[46]>;
- def F15_F : RISCVReg32<15,"f15", ["fa5"]>, DwarfRegNum<[47]>;
- def F16_F : RISCVReg32<16,"f16", ["fa6"]>, DwarfRegNum<[48]>;
- def F17_F : RISCVReg32<17,"f17", ["fa7"]>, DwarfRegNum<[49]>;
- def F18_F : RISCVReg32<18,"f18", ["fs2"]>, DwarfRegNum<[50]>;
- def F19_F : RISCVReg32<19,"f19", ["fs3"]>, DwarfRegNum<[51]>;
- def F20_F : RISCVReg32<20,"f20", ["fs4"]>, DwarfRegNum<[52]>;
- def F21_F : RISCVReg32<21,"f21", ["fs5"]>, DwarfRegNum<[53]>;
- def F22_F : RISCVReg32<22,"f22", ["fs6"]>, DwarfRegNum<[54]>;
- def F23_F : RISCVReg32<23,"f23", ["fs7"]>, DwarfRegNum<[55]>;
- def F24_F : RISCVReg32<24,"f24", ["fs8"]>, DwarfRegNum<[56]>;
- def F25_F : RISCVReg32<25,"f25", ["fs9"]>, DwarfRegNum<[57]>;
- def F26_F : RISCVReg32<26,"f26", ["fs10"]>, DwarfRegNum<[58]>;
- def F27_F : RISCVReg32<27,"f27", ["fs11"]>, DwarfRegNum<[59]>;
- def F28_F : RISCVReg32<28,"f28", ["ft8"]>, DwarfRegNum<[60]>;
- def F29_F : RISCVReg32<29,"f29", ["ft9"]>, DwarfRegNum<[61]>;
- def F30_F : RISCVReg32<30,"f30", ["ft10"]>, DwarfRegNum<[62]>;
- def F31_F : RISCVReg32<31,"f31", ["ft11"]>, DwarfRegNum<[63]>;
+ def F0_H : RISCVReg16<0, "f0", ["ft0"]>, DwarfRegNum<[32]>;
+ def F1_H : RISCVReg16<1, "f1", ["ft1"]>, DwarfRegNum<[33]>;
+ def F2_H : RISCVReg16<2, "f2", ["ft2"]>, DwarfRegNum<[34]>;
+ def F3_H : RISCVReg16<3, "f3", ["ft3"]>, DwarfRegNum<[35]>;
+ def F4_H : RISCVReg16<4, "f4", ["ft4"]>, DwarfRegNum<[36]>;
+ def F5_H : RISCVReg16<5, "f5", ["ft5"]>, DwarfRegNum<[37]>;
+ def F6_H : RISCVReg16<6, "f6", ["ft6"]>, DwarfRegNum<[38]>;
+ def F7_H : RISCVReg16<7, "f7", ["ft7"]>, DwarfRegNum<[39]>;
+ def F8_H : RISCVReg16<8, "f8", ["fs0"]>, DwarfRegNum<[40]>;
+ def F9_H : RISCVReg16<9, "f9", ["fs1"]>, DwarfRegNum<[41]>;
+ def F10_H : RISCVReg16<10,"f10", ["fa0"]>, DwarfRegNum<[42]>;
+ def F11_H : RISCVReg16<11,"f11", ["fa1"]>, DwarfRegNum<[43]>;
+ def F12_H : RISCVReg16<12,"f12", ["fa2"]>, DwarfRegNum<[44]>;
+ def F13_H : RISCVReg16<13,"f13", ["fa3"]>, DwarfRegNum<[45]>;
+ def F14_H : RISCVReg16<14,"f14", ["fa4"]>, DwarfRegNum<[46]>;
+ def F15_H : RISCVReg16<15,"f15", ["fa5"]>, DwarfRegNum<[47]>;
+ def F16_H : RISCVReg16<16,"f16", ["fa6"]>, DwarfRegNum<[48]>;
+ def F17_H : RISCVReg16<17,"f17", ["fa7"]>, DwarfRegNum<[49]>;
+ def F18_H : RISCVReg16<18,"f18", ["fs2"]>, DwarfRegNum<[50]>;
+ def F19_H : RISCVReg16<19,"f19", ["fs3"]>, DwarfRegNum<[51]>;
+ def F20_H : RISCVReg16<20,"f20", ["fs4"]>, DwarfRegNum<[52]>;
+ def F21_H : RISCVReg16<21,"f21", ["fs5"]>, DwarfRegNum<[53]>;
+ def F22_H : RISCVReg16<22,"f22", ["fs6"]>, DwarfRegNum<[54]>;
+ def F23_H : RISCVReg16<23,"f23", ["fs7"]>, DwarfRegNum<[55]>;
+ def F24_H : RISCVReg16<24,"f24", ["fs8"]>, DwarfRegNum<[56]>;
+ def F25_H : RISCVReg16<25,"f25", ["fs9"]>, DwarfRegNum<[57]>;
+ def F26_H : RISCVReg16<26,"f26", ["fs10"]>, DwarfRegNum<[58]>;
+ def F27_H : RISCVReg16<27,"f27", ["fs11"]>, DwarfRegNum<[59]>;
+ def F28_H : RISCVReg16<28,"f28", ["ft8"]>, DwarfRegNum<[60]>;
+ def F29_H : RISCVReg16<29,"f29", ["ft9"]>, DwarfRegNum<[61]>;
+ def F30_H : RISCVReg16<30,"f30", ["ft10"]>, DwarfRegNum<[62]>;
+ def F31_H : RISCVReg16<31,"f31", ["ft11"]>, DwarfRegNum<[63]>;
+
+ foreach Index = 0-31 in {
+ def F#Index#_F : RISCVReg32<!cast<RISCVReg16>("F"#Index#"_H")>,
+ DwarfRegNum<[!add(Index, 32)]>;
+ }
foreach Index = 0-31 in {
def F#Index#_D : RISCVReg64<!cast<RISCVReg32>("F"#Index#"_F")>,
@@ -220,6 +243,14 @@ let RegAltNameIndices = [ABIRegAltName] in {
// The order of registers represents the preferred allocation sequence,
// meaning caller-save regs are listed before callee-save.
+def FPR16 : RegisterClass<"RISCV", [f16], 16, (add
+ (sequence "F%u_H", 0, 7),
+ (sequence "F%u_H", 10, 17),
+ (sequence "F%u_H", 28, 31),
+ (sequence "F%u_H", 8, 9),
+ (sequence "F%u_H", 18, 27)
+)>;
+
def FPR32 : RegisterClass<"RISCV", [f32], 32, (add
(sequence "F%u_F", 0, 7),
(sequence "F%u_F", 10, 17),
@@ -248,10 +279,139 @@ def FPR64C : RegisterClass<"RISCV", [f64], 64, (add
(sequence "F%u_D", 8, 9)
)>;
+// Vector type mapping to LLVM types.
+//
+// Though the V extension allows that VLEN be as small as 8,
+// this approach assumes that VLEN>=64.
+// Additionally, the only supported ELEN values are 32 and 64,
+// thus `vscale` can be defined as VLEN/64,
+// allowing the same types with either ELEN value.
+//
+// MF8 MF4 MF2 M1 M2 M4 M8
+// i64* N/A N/A N/A nxv1i64 nxv2i64 nxv4i64 nxv8i64
+// i32 N/A N/A nxv1i32 nxv2i32 nxv4i32 nxv8i32 nxv16i32
+// i16 N/A nxv1i16 nxv2i16 nxv4i16 nxv8i16 nxv16i16 nxv32i16
+// i8 nxv1i8 nxv2i8 nxv4i8 nxv8i8 nxv16i8 nxv32i8 nxv64i8
+// double* N/A N/A N/A nxv1f64 nxv2f64 nxv4f64 nxv8f64
+// float N/A N/A nxv1f32 nxv2f32 nxv4f32 nxv8f32 nxv16f32
+// half N/A nxv1f16 nxv2f16 nxv4f16 nxv8f16 nxv16f16 nxv32f16
+// * ELEN=64
+
+defvar vint8mf8_t = nxv1i8;
+defvar vint8mf4_t = nxv2i8;
+defvar vint8mf2_t = nxv4i8;
+defvar vint8m1_t = nxv8i8;
+defvar vint8m2_t = nxv16i8;
+defvar vint8m4_t = nxv32i8;
+defvar vint8m8_t = nxv64i8;
+
+defvar vint16mf4_t = nxv1i16;
+defvar vint16mf2_t = nxv2i16;
+defvar vint16m1_t = nxv4i16;
+defvar vint16m2_t = nxv8i16;
+defvar vint16m4_t = nxv16i16;
+defvar vint16m8_t = nxv32i16;
+
+defvar vint32mf2_t = nxv1i32;
+defvar vint32m1_t = nxv2i32;
+defvar vint32m2_t = nxv4i32;
+defvar vint32m4_t = nxv8i32;
+defvar vint32m8_t = nxv16i32;
+
+defvar vint64m1_t = nxv1i64;
+defvar vint64m2_t = nxv2i64;
+defvar vint64m4_t = nxv4i64;
+defvar vint64m8_t = nxv8i64;
+
+defvar vfloat16mf4_t = nxv1f16;
+defvar vfloat16mf2_t = nxv2f16;
+defvar vfloat16m1_t = nxv4f16;
+defvar vfloat16m2_t = nxv8f16;
+defvar vfloat16m4_t = nxv16f16;
+defvar vfloat16m8_t = nxv32f16;
+
+defvar vfloat32mf2_t = nxv1f32;
+defvar vfloat32m1_t = nxv2f32;
+defvar vfloat32m2_t = nxv4f32;
+defvar vfloat32m4_t = nxv8f32;
+defvar vfloat32m8_t = nxv16f32;
+
+defvar vfloat64m1_t = nxv1f64;
+defvar vfloat64m2_t = nxv2f64;
+defvar vfloat64m4_t = nxv4f64;
+defvar vfloat64m8_t = nxv8f64;
+
+defvar vbool1_t = nxv64i1;
+defvar vbool2_t = nxv32i1;
+defvar vbool4_t = nxv16i1;
+defvar vbool8_t = nxv8i1;
+defvar vbool16_t = nxv4i1;
+defvar vbool32_t = nxv2i1;
+defvar vbool64_t = nxv1i1;
+
+// There is no need to define register classes for fractional LMUL.
+def LMULList {
+ list<int> m = [1, 2, 4, 8];
+}
+
+//===----------------------------------------------------------------------===//
+// Utility classes for segment load/store.
+//===----------------------------------------------------------------------===//
+// The set of legal NF for LMUL = lmul.
+// LMUL == 1, NF = 2, 3, 4, 5, 6, 7, 8
+// LMUL == 2, NF = 2, 3, 4
+// LMUL == 4, NF = 2
+class NFList<int lmul> {
+ list<int> L = !cond(!eq(lmul, 1): [2, 3, 4, 5, 6, 7, 8],
+ !eq(lmul, 2): [2, 3, 4],
+ !eq(lmul, 4): [2],
+ !eq(lmul, 8): []);
+}
+
+// Generate [start, end) SubRegIndex list.
+class SubRegSet<list<SubRegIndex> LIn, int start, int nf, int lmul> {
+ list<SubRegIndex> L = !foldl([]<SubRegIndex>,
+ [0, 1, 2, 3, 4, 5, 6, 7],
+ AccList, i,
+ !listconcat(AccList,
+ !if(!lt(i, nf),
+ [!cast<SubRegIndex>("sub_vrm" # lmul # "_" # i)],
+ [])));
+}
+
+class IndexSet<int index, int nf, int lmul> {
+ list<int> R =
+ !foldl([]<int>,
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+ 23, 24, 25, 26, 27, 28, 29, 30, 31],
+ L, i,
+ !listconcat(L,
+ !if(!and(
+ !le(!mul(index, lmul), !mul(i, lmul)),
+ !le(!mul(i, lmul),
+ !sub(!add(32, !mul(index, lmul)), !mul(nf, lmul)))
+ ), [!mul(i, lmul)], [])));
+}
+
+class VRegList<list<dag> LIn, int start, int nf, int lmul> {
+ list<dag> L =
+ !if(!ge(start, nf),
+ LIn,
+ !listconcat(
+ [!dag(add,
+ !foreach(i, IndexSet<start, nf, lmul>.R,
+ !cast<Register>("V" # i # !cond(!eq(lmul, 2): "M2",
+ !eq(lmul, 4): "M4",
+ true: ""))),
+ !listsplat("", !size(IndexSet<start, nf, lmul>.R)))],
+ VRegList<LIn, !add(start, 1), nf, lmul>.L));
+}
+
// Vector registers
let RegAltNameIndices = [ABIRegAltName] in {
foreach Index = 0-31 in {
- def V#Index : RISCVReg<Index, "v"#Index, ["v"#Index]>, DwarfRegNum<[!add(Index, 64)]>;
+ def V#Index : RISCVReg<Index, "v"#Index, ["v"#Index]>, DwarfRegNum<[!add(Index, 96)]>;
}
foreach Index = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22,
@@ -261,7 +421,7 @@ let RegAltNameIndices = [ABIRegAltName] in {
!cast<Register>("V"#!add(Index, 1))],
["v"#Index]>,
DwarfRegAlias<!cast<Register>("V"#Index)> {
- let SubRegIndices = [sub_vrm2, sub_vrm2_hi];
+ let SubRegIndices = [sub_vrm1_0, sub_vrm1_1];
}
}
@@ -271,7 +431,7 @@ let RegAltNameIndices = [ABIRegAltName] in {
!cast<Register>("V"#!add(Index, 2)#"M2")],
["v"#Index]>,
DwarfRegAlias<!cast<Register>("V"#Index)> {
- let SubRegIndices = [sub_vrm4, sub_vrm4_hi];
+ let SubRegIndices = [sub_vrm2_0, sub_vrm2_1];
}
}
@@ -281,54 +441,91 @@ let RegAltNameIndices = [ABIRegAltName] in {
!cast<Register>("V"#!add(Index, 4)#"M4")],
["v"#Index]>,
DwarfRegAlias<!cast<Register>("V"#Index)> {
- let SubRegIndices = [sub_vrm8, sub_vrm8_hi];
+ let SubRegIndices = [sub_vrm4_0, sub_vrm4_1];
}
}
def VTYPE : RISCVReg<0, "vtype", ["vtype"]>;
def VL : RISCVReg<0, "vl", ["vl"]>;
+ def VXSAT : RISCVReg<0, "vxsat", ["vxsat"]>;
+ def VXRM : RISCVReg<0, "vxrm", ["vxrm"]>;
}
-class RegisterTypes<list<ValueType> reg_types> {
- list<ValueType> types = reg_types;
-}
-
-// The order of registers represents the preferred allocation sequence,
-// meaning caller-save regs are listed before callee-save.
-def VR : RegisterClass<"RISCV", [nxv8i8, nxv4i16, nxv2i32, nxv1i64],
- 64, (add
- (sequence "V%u", 25, 31),
- (sequence "V%u", 8, 24),
- (sequence "V%u", 0, 7)
- )> {
- let Size = 64;
-}
-
-def VRM2 : RegisterClass<"RISCV", [nxv16i8, nxv8i16, nxv4i32, nxv2i64], 64,
- (add V26M2, V28M2, V30M2, V8M2, V10M2, V12M2, V14M2, V16M2,
- V18M2, V20M2, V22M2, V24M2, V0M2, V2M2, V4M2, V6M2)> {
- let Size = 128;
-}
-
-def VRM4 : RegisterClass<"RISCV", [nxv32i8, nxv16i16, nxv8i32, nxv4i64], 64,
- (add V28M4, V8M4, V12M4, V16M4, V20M4, V24M4, V0M4, V4M4)> {
- let Size = 256;
+foreach m = [1, 2, 4] in {
+ foreach n = NFList<m>.L in {
+ def "VN" # n # "M" # m: RegisterTuples<SubRegSet<[], 0, n, m>.L,
+ VRegList<[], 0, n, m>.L>;
+ }
}
-def VRM8 : RegisterClass<"RISCV", [nxv32i16, nxv16i32, nxv8i64], 64,
- (add V8M8, V16M8, V24M8, V0M8)> {
- let Size = 512;
+class VReg<list<ValueType> regTypes, dag regList, int Vlmul>
+ : RegisterClass<"RISCV",
+ regTypes,
+ 64, // The maximum supported ELEN is 64.
+ regList> {
+ int VLMul = Vlmul;
+ int Size = !mul(Vlmul, 64);
}
-def VMaskVT : RegisterTypes<[nxv1i1, nxv2i1, nxv4i1, nxv8i1, nxv16i1, nxv32i1]>;
-
-def VM : RegisterClass<"RISCV", VMaskVT.types, 64, (add
- (sequence "V%u", 25, 31),
- (sequence "V%u", 8, 24),
- (sequence "V%u", 0, 7))> {
+def VR : VReg<[vint8mf2_t, vint8mf4_t, vint8mf8_t,
+ vint16mf2_t, vint16mf4_t, vint32mf2_t,
+ vint8m1_t, vint16m1_t, vint32m1_t, vint64m1_t,
+ vfloat16mf4_t, vfloat16mf2_t, vfloat16m1_t,
+ vfloat32mf2_t, vfloat32m1_t, vfloat64m1_t,
+ vbool64_t, vbool32_t, vbool16_t, vbool8_t, vbool4_t,
+ vbool2_t, vbool1_t],
+ (add (sequence "V%u", 25, 31),
+ (sequence "V%u", 8, 24),
+ (sequence "V%u", 0, 7)), 1>;
+
+def VRNoV0 : VReg<[vint8mf2_t, vint8mf4_t, vint8mf8_t,
+ vint16mf2_t, vint16mf4_t, vint32mf2_t,
+ vint8m1_t, vint16m1_t, vint32m1_t, vint64m1_t,
+ vfloat16mf4_t, vfloat16mf2_t, vfloat16m1_t,
+ vfloat32mf2_t, vfloat32m1_t, vfloat64m1_t,
+ vbool64_t, vbool32_t, vbool16_t, vbool8_t, vbool4_t,
+ vbool2_t, vbool1_t],
+ (add (sequence "V%u", 25, 31),
+ (sequence "V%u", 8, 24),
+ (sequence "V%u", 1, 7)), 1>;
+
+def VRM2 : VReg<[vint8m2_t, vint16m2_t, vint32m2_t, vint64m2_t,
+ vfloat16m2_t, vfloat32m2_t, vfloat64m2_t],
+ (add V26M2, V28M2, V30M2, V8M2, V10M2, V12M2, V14M2, V16M2,
+ V18M2, V20M2, V22M2, V24M2, V0M2, V2M2, V4M2, V6M2), 2>;
+
+def VRM2NoV0 : VReg<[vint8m2_t, vint16m2_t, vint32m2_t, vint64m2_t,
+ vfloat16m2_t, vfloat32m2_t, vfloat64m2_t],
+ (add V26M2, V28M2, V30M2, V8M2, V10M2, V12M2, V14M2, V16M2,
+ V18M2, V20M2, V22M2, V24M2, V2M2, V4M2, V6M2), 2>;
+
+def VRM4 : VReg<[vint8m4_t, vint16m4_t, vint32m4_t, vint64m4_t,
+ vfloat16m4_t, vfloat32m4_t, vfloat64m4_t],
+ (add V28M4, V8M4, V12M4, V16M4, V20M4, V24M4, V0M4, V4M4), 4>;
+
+def VRM4NoV0 : VReg<[vint8m4_t, vint16m4_t, vint32m4_t, vint64m4_t,
+ vfloat16m4_t, vfloat32m4_t, vfloat64m4_t],
+ (add V28M4, V8M4, V12M4, V16M4, V20M4, V24M4, V4M4), 4>;
+
+def VRM8 : VReg<[vint8m8_t, vint16m8_t, vint32m8_t, vint64m8_t,
+ vfloat16m8_t, vfloat32m8_t, vfloat64m8_t],
+ (add V8M8, V16M8, V24M8, V0M8), 8>;
+
+def VRM8NoV0 : VReg<[vint8m8_t, vint16m8_t, vint32m8_t, vint64m8_t,
+ vfloat16m8_t, vfloat32m8_t, vfloat64m8_t],
+ (add V8M8, V16M8, V24M8), 8>;
+
+defvar VMaskVTs = [vbool64_t, vbool32_t, vbool16_t, vbool8_t,
+ vbool4_t, vbool2_t, vbool1_t];
+
+def VMV0 : RegisterClass<"RISCV", VMaskVTs, 64, (add V0)> {
let Size = 64;
}
-def VMV0 : RegisterClass<"RISCV", VMaskVT.types, 64, (add V0)> {
- let Size = 64;
+foreach m = LMULList.m in {
+ foreach nf = NFList<m>.L in {
+ def "VRN" # nf # "M" # m : VReg<[untyped],
+ (add !cast<RegisterTuples>("VN" # nf # "M" # m)),
+ !mul(nf, m)>;
+ }
}
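
Editor's note: as a worked check of the NFList/IndexSet arithmetic used above for the segment load/store tuple classes, a tuple of NF fields at a given LMUL may start at any register group whose last field still fits below v31, i.e. at bases 0, LMUL, ..., 32 - NF*LMUL. The small program below is a hypothetical re-derivation of that bound, not LLVM code.

#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // Legal NF values per LMUL, mirroring the NFList class above.
  const std::vector<std::pair<int, std::vector<int>>> NFList = {
      {1, {2, 3, 4, 5, 6, 7, 8}}, {2, {2, 3, 4}}, {4, {2}}};
  for (const auto &Entry : NFList) {
    int LMul = Entry.first;
    for (int NF : Entry.second) {
      std::printf("LMUL=%d NF=%d tuple bases:", LMul, NF);
      // Same bound as IndexSet<0, nf, lmul>: base <= 32 - NF*LMUL.
      for (int Base = 0; Base + NF * LMul <= 32; Base += LMul)
        std::printf(" v%d", Base);
      std::printf("\n");
    }
  }
  return 0;
}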
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedRocket.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedRocket.td
new file mode 100644
index 000000000000..de2cdf512e87
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedRocket.td
@@ -0,0 +1,233 @@
+//==- RISCVSchedRocket.td - Rocket Scheduling Definitions ----*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler. See MCSchedule.h for details.
+
+// Rocket machine model for scheduling and other instruction cost heuristics.
+def RocketModel : SchedMachineModel {
+ let MicroOpBufferSize = 0; // Rocket is in-order.
+ let IssueWidth = 1; // 1 micro-op is dispatched per cycle.
+ let LoadLatency = 3;
+ let MispredictPenalty = 3;
+ let UnsupportedFeatures = [HasStdExtV, HasStdExtZvamo, HasStdExtZvlsseg];
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+
+// Model each pipeline as a ProcResource with BufferSize = 0, since Rocket
+// is in-order.
+
+let BufferSize = 0 in {
+def RocketUnitALU : ProcResource<1>; // Int ALU
+def RocketUnitIMul : ProcResource<1>; // Int Multiply
+def RocketUnitMem : ProcResource<1>; // Load/Store
+def RocketUnitB : ProcResource<1>; // Branch
+
+def RocketUnitFPALU : ProcResource<1>; // FP ALU
+}
+
+let BufferSize = 1 in {
+def RocketUnitIDiv : ProcResource<1>; // Int Division
+def RocketUnitFPDivSqrt : ProcResource<1>; // FP Divide/Sqrt
+}
+
+//===----------------------------------------------------------------------===//
+
+let SchedModel = RocketModel in {
+
+// Branching
+def : WriteRes<WriteJmp, [RocketUnitB]>;
+def : WriteRes<WriteJal, [RocketUnitB]>;
+def : WriteRes<WriteJalr, [RocketUnitB]>;
+def : WriteRes<WriteJmpReg, [RocketUnitB]>;
+
+// Integer arithmetic and logic
+def : WriteRes<WriteIALU32, [RocketUnitALU]>;
+def : WriteRes<WriteIALU, [RocketUnitALU]>;
+def : WriteRes<WriteShift32, [RocketUnitALU]>;
+def : WriteRes<WriteShift, [RocketUnitALU]>;
+
+// Integer multiplication
+let Latency = 4 in {
+def : WriteRes<WriteIMul, [RocketUnitIMul]>;
+def : WriteRes<WriteIMul32, [RocketUnitIMul]>;
+}
+
+// Integer division
+// Worst case latency is used.
+def : WriteRes<WriteIDiv32, [RocketUnitIDiv]> {
+ let Latency = 34;
+ let ResourceCycles = [34];
+}
+def : WriteRes<WriteIDiv, [RocketUnitIDiv]> {
+ let Latency = 33;
+ let ResourceCycles = [33];
+}
+
+// Memory
+def : WriteRes<WriteSTB, [RocketUnitMem]>;
+def : WriteRes<WriteSTH, [RocketUnitMem]>;
+def : WriteRes<WriteSTW, [RocketUnitMem]>;
+def : WriteRes<WriteSTD, [RocketUnitMem]>;
+def : WriteRes<WriteFST32, [RocketUnitMem]>;
+def : WriteRes<WriteFST64, [RocketUnitMem]>;
+
+let Latency = 3 in {
+def : WriteRes<WriteLDB, [RocketUnitMem]>;
+def : WriteRes<WriteLDH, [RocketUnitMem]>;
+}
+
+let Latency = 2 in {
+def : WriteRes<WriteLDW, [RocketUnitMem]>;
+def : WriteRes<WriteLDWU, [RocketUnitMem]>;
+def : WriteRes<WriteLDD, [RocketUnitMem]>;
+def : WriteRes<WriteFLD32, [RocketUnitMem]>;
+def : WriteRes<WriteFLD64, [RocketUnitMem]>;
+
+// Atomic memory
+def : WriteRes<WriteAtomicW, [RocketUnitMem]>;
+def : WriteRes<WriteAtomicD, [RocketUnitMem]>;
+
+def : WriteRes<WriteAtomicLDW, [RocketUnitMem]>;
+def : WriteRes<WriteAtomicLDD, [RocketUnitMem]>;
+}
+
+def : WriteRes<WriteAtomicSTW, [RocketUnitMem]>;
+def : WriteRes<WriteAtomicSTD, [RocketUnitMem]>;
+
+// Single precision.
+let Latency = 4 in {
+def : WriteRes<WriteFALU32, [RocketUnitFPALU]>;
+def : WriteRes<WriteFSGNJ32, [RocketUnitFPALU]>;
+def : WriteRes<WriteFMinMax32, [RocketUnitFPALU]>;
+}
+
+// Double precision
+let Latency = 6 in {
+def : WriteRes<WriteFALU64, [RocketUnitFPALU]>;
+def : WriteRes<WriteFSGNJ64, [RocketUnitFPALU]>;
+def : WriteRes<WriteFMinMax64, [RocketUnitFPALU]>;
+}
+
+// Conversions
+let Latency = 2 in {
+def : WriteRes<WriteFCvtI32ToF32, [RocketUnitFPALU]>;
+def : WriteRes<WriteFCvtI32ToF64, [RocketUnitFPALU]>;
+def : WriteRes<WriteFCvtI64ToF32, [RocketUnitFPALU]>;
+def : WriteRes<WriteFCvtI64ToF64, [RocketUnitFPALU]>;
+def : WriteRes<WriteFCvtF32ToI32, [RocketUnitFPALU]>;
+def : WriteRes<WriteFCvtF32ToI64, [RocketUnitFPALU]>;
+def : WriteRes<WriteFCvtF64ToI32, [RocketUnitFPALU]>;
+def : WriteRes<WriteFCvtF64ToI64, [RocketUnitFPALU]>;
+def : WriteRes<WriteFCvtF32ToF64, [RocketUnitFPALU]>;
+def : WriteRes<WriteFCvtF64ToF32, [RocketUnitFPALU]>;
+
+def : WriteRes<WriteFClass32, [RocketUnitFPALU]>;
+def : WriteRes<WriteFClass64, [RocketUnitFPALU]>;
+def : WriteRes<WriteFCmp32, [RocketUnitFPALU]>;
+def : WriteRes<WriteFCmp64, [RocketUnitFPALU]>;
+def : WriteRes<WriteFMovF32ToI32, [RocketUnitFPALU]>;
+def : WriteRes<WriteFMovI32ToF32, [RocketUnitFPALU]>;
+def : WriteRes<WriteFMovF64ToI64, [RocketUnitFPALU]>;
+def : WriteRes<WriteFMovI64ToF64, [RocketUnitFPALU]>;
+}
+
+// FP multiplication
+let Latency = 5 in {
+def : WriteRes<WriteFMul32, [RocketUnitFPALU]>;
+def : WriteRes<WriteFMulAdd32, [RocketUnitFPALU]>;
+def : WriteRes<WriteFMulSub32, [RocketUnitFPALU]>;
+}
+
+let Latency = 7 in {
+def : WriteRes<WriteFMul64, [RocketUnitFPALU]>;
+def : WriteRes<WriteFMulAdd64, [RocketUnitFPALU]>;
+def : WriteRes<WriteFMulSub64, [RocketUnitFPALU]>;
+}
+
+// FP division
+// FP division unit on Rocket is not pipelined, so set resource cycles to latency.
+let Latency = 20, ResourceCycles = [20] in {
+def : WriteRes<WriteFDiv32, [RocketUnitFPDivSqrt]>;
+def : WriteRes<WriteFDiv64, [RocketUnitFPDivSqrt]>;
+}
+
+// FP square root unit on Rocket is not pipelined, so set resource cycles to latency.
+def : WriteRes<WriteFSqrt32, [RocketUnitFPDivSqrt]> { let Latency = 20;
+ let ResourceCycles = [20]; }
+def : WriteRes<WriteFSqrt64, [RocketUnitFPDivSqrt]> { let Latency = 25;
+ let ResourceCycles = [25]; }
+
+// Others
+def : WriteRes<WriteCSR, []>;
+def : WriteRes<WriteNop, []>;
+
+def : InstRW<[WriteIALU], (instrs COPY)>;
+
+//===----------------------------------------------------------------------===//
+// Bypass and advance
+def : ReadAdvance<ReadJmp, 0>;
+def : ReadAdvance<ReadJalr, 0>;
+def : ReadAdvance<ReadCSR, 0>;
+def : ReadAdvance<ReadStoreData, 0>;
+def : ReadAdvance<ReadMemBase, 0>;
+def : ReadAdvance<ReadIALU, 0>;
+def : ReadAdvance<ReadIALU32, 0>;
+def : ReadAdvance<ReadShift, 0>;
+def : ReadAdvance<ReadShift32, 0>;
+def : ReadAdvance<ReadIDiv, 0>;
+def : ReadAdvance<ReadIDiv32, 0>;
+def : ReadAdvance<ReadIMul, 0>;
+def : ReadAdvance<ReadIMul32, 0>;
+def : ReadAdvance<ReadAtomicWA, 0>;
+def : ReadAdvance<ReadAtomicWD, 0>;
+def : ReadAdvance<ReadAtomicDA, 0>;
+def : ReadAdvance<ReadAtomicDD, 0>;
+def : ReadAdvance<ReadAtomicLDW, 0>;
+def : ReadAdvance<ReadAtomicLDD, 0>;
+def : ReadAdvance<ReadAtomicSTW, 0>;
+def : ReadAdvance<ReadAtomicSTD, 0>;
+def : ReadAdvance<ReadFMemBase, 0>;
+def : ReadAdvance<ReadFALU32, 0>;
+def : ReadAdvance<ReadFALU64, 0>;
+def : ReadAdvance<ReadFMul32, 0>;
+def : ReadAdvance<ReadFMulAdd32, 0>;
+def : ReadAdvance<ReadFMulSub32, 0>;
+def : ReadAdvance<ReadFMul64, 0>;
+def : ReadAdvance<ReadFMulAdd64, 0>;
+def : ReadAdvance<ReadFMulSub64, 0>;
+def : ReadAdvance<ReadFDiv32, 0>;
+def : ReadAdvance<ReadFDiv64, 0>;
+def : ReadAdvance<ReadFSqrt32, 0>;
+def : ReadAdvance<ReadFSqrt64, 0>;
+def : ReadAdvance<ReadFCmp32, 0>;
+def : ReadAdvance<ReadFCmp64, 0>;
+def : ReadAdvance<ReadFSGNJ32, 0>;
+def : ReadAdvance<ReadFSGNJ64, 0>;
+def : ReadAdvance<ReadFMinMax32, 0>;
+def : ReadAdvance<ReadFMinMax64, 0>;
+def : ReadAdvance<ReadFCvtF32ToI32, 0>;
+def : ReadAdvance<ReadFCvtF32ToI64, 0>;
+def : ReadAdvance<ReadFCvtF64ToI32, 0>;
+def : ReadAdvance<ReadFCvtF64ToI64, 0>;
+def : ReadAdvance<ReadFCvtI32ToF32, 0>;
+def : ReadAdvance<ReadFCvtI32ToF64, 0>;
+def : ReadAdvance<ReadFCvtI64ToF32, 0>;
+def : ReadAdvance<ReadFCvtI64ToF64, 0>;
+def : ReadAdvance<ReadFCvtF32ToF64, 0>;
+def : ReadAdvance<ReadFCvtF64ToF32, 0>;
+def : ReadAdvance<ReadFMovF32ToI32, 0>;
+def : ReadAdvance<ReadFMovI32ToF32, 0>;
+def : ReadAdvance<ReadFMovF64ToI64, 0>;
+def : ReadAdvance<ReadFMovI64ToF64, 0>;
+def : ReadAdvance<ReadFClass32, 0>;
+def : ReadAdvance<ReadFClass64, 0>;
+}
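
Editor's note: one practical consequence of the divider modelling above, assuming the usual MCSchedModel interpretation where a write's reciprocal throughput is its ResourceCycles divided by the number of units, is that a single non-pipelined RocketUnitFPDivSqrt with ResourceCycles = [20] can accept a new single-precision divide at most once every 20 cycles. A trivial sketch of that arithmetic:

#include <cstdio>

int main() {
  const int NumUnits = 1;        // RocketUnitFPDivSqrt is ProcResource<1>.
  const int ResourceCycles = 20; // Cycles the divider is occupied per fdiv.s.
  const int Latency = 20;        // WriteFDiv32 latency in the model above.
  std::printf("reciprocal throughput: %d cycles/op (latency %d)\n",
              ResourceCycles / NumUnits, Latency);
  return 0;
}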
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedRocket32.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedRocket32.td
deleted file mode 100644
index 305e2b9b5927..000000000000
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedRocket32.td
+++ /dev/null
@@ -1,227 +0,0 @@
-//==- RISCVSchedRocket32.td - Rocket Scheduling Definitions -*- tablegen -*-=//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// ===---------------------------------------------------------------------===//
-// The following definitions describe the simpler per-operand machine model.
-// This works with MachineScheduler. See MCSchedule.h for details.
-
-// Rocket machine model for scheduling and other instruction cost heuristics.
-def Rocket32Model : SchedMachineModel {
- let MicroOpBufferSize = 0; // Explicitly set to zero since Rocket is in-order.
- let IssueWidth = 1; // 1 micro-ops are dispatched per cycle.
- let LoadLatency = 3;
- let MispredictPenalty = 3;
- let CompleteModel = 1;
- let UnsupportedFeatures = [HasStdExtV];
-}
-
-//===----------------------------------------------------------------------===//
-// Define each kind of processor resource and number available.
-
-// Modeling each pipeline as a ProcResource using the BufferSize = 0 since
-// Rocket is in-order.
-
-let BufferSize = 0 in {
-def Rocket32UnitALU : ProcResource<1>; // Int ALU
-def Rocket32UnitIMul : ProcResource<1>; // Int Multiply
-def Rocket32UnitMem : ProcResource<1>; // Load/Store
-def Rocket32UnitB : ProcResource<1>; // Branch
-
-def Rocket32UnitFPALU : ProcResource<1>; // FP ALU
-}
-
-let BufferSize = 1 in {
-def Rocket32UnitIDiv : ProcResource<1>; // Int Division
-def Rocket32UnitFPDivSqrt : ProcResource<1>; // FP Divide/Sqrt'
-}
-
-//===----------------------------------------------------------------------===//
-// Subtarget-specific SchedWrite types which both map the ProcResources and
-// set the latency.
-
-let SchedModel = Rocket32Model in {
-
-def : WriteRes<WriteJmp, [Rocket32UnitB]>;
-def : WriteRes<WriteJal, [Rocket32UnitB]>;
-def : WriteRes<WriteJalr, [Rocket32UnitB]>;
-def : WriteRes<WriteJmpReg, [Rocket32UnitB]>;
-
-def : WriteRes<WriteIALU, [Rocket32UnitALU]>;
-def : WriteRes<WriteShift, [Rocket32UnitALU]>;
-
-// Multiplies on Rocket differ by implementation; placeholder until
-// we can determine how to read from command line
-def : WriteRes<WriteIMul, [Rocket32UnitIMul]> { let Latency = 4; }
-
-// 32-bit divides have worse case latency of 34 cycle
-def : WriteRes<WriteIDiv, [Rocket32UnitIDiv]> {
- let Latency = 34;
- let ResourceCycles = [34];
-}
-
-// Memory
-def : WriteRes<WriteSTB, [Rocket32UnitMem]>;
-def : WriteRes<WriteSTH, [Rocket32UnitMem]>;
-def : WriteRes<WriteSTW, [Rocket32UnitMem]>;
-def : WriteRes<WriteFST32, [Rocket32UnitMem]>;
-def : WriteRes<WriteFST64, [Rocket32UnitMem]>;
-
-let Latency = 3 in {
-def : WriteRes<WriteLDB, [Rocket32UnitMem]>;
-def : WriteRes<WriteLDH, [Rocket32UnitMem]>;
-def : WriteRes<WriteCSR, [Rocket32UnitALU]>;
-}
-
-let Latency = 2 in {
-def : WriteRes<WriteLDW, [Rocket32UnitMem]>;
-def : WriteRes<WriteFLD32, [Rocket32UnitMem]>;
-def : WriteRes<WriteFLD64, [Rocket32UnitMem]>;
-
-def : WriteRes<WriteAtomicW, [Rocket32UnitMem]>;
-def : WriteRes<WriteAtomicLDW, [Rocket32UnitMem]>;
-}
-
-def : WriteRes<WriteAtomicSTW, [Rocket32UnitMem]>;
-
-// Most FP single precision operations are 4 cycles
-let Latency = 4 in {
-def : WriteRes<WriteFALU32, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFSGNJ32, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFMinMax32, [Rocket32UnitFPALU]>;
-}
-
-// Most FP double precision operations are 6 cycles
-let Latency = 6 in {
-def : WriteRes<WriteFALU64, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFSGNJ64, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFMinMax64, [Rocket32UnitFPALU]>;
-}
-
-let Latency = 2 in {
-def : WriteRes<WriteFCvtI32ToF32, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFCvtI32ToF64, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFCvtF32ToI32, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFCvtF64ToI32, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFCvtF32ToF64, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFCvtF64ToF32, [Rocket32UnitFPALU]>;
-
-def : WriteRes<WriteFClass32, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFClass64, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFCmp32, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFCmp64, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFMovF32ToI32, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFMovI32ToF32, [Rocket32UnitFPALU]>;
-}
-
-let Latency = 5 in {
-def : WriteRes<WriteFMul32, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFMulAdd32, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFMulSub32, [Rocket32UnitFPALU]>;
-}
-
-let Latency = 7 in {
-def : WriteRes<WriteFMul64, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFMulAdd64, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFMulSub64, [Rocket32UnitFPALU]>;
-}
-
-// FP Divide unit on Rocket is not pipelined, so set resource cycles to latency
-let Latency = 20, ResourceCycles = [20] in {
-def : WriteRes<WriteFDiv32, [Rocket32UnitFPDivSqrt]>;
-def : WriteRes<WriteFDiv64, [Rocket32UnitFPDivSqrt]>;
-}
-
-// FP Sqrt unit on Rocket is not pipelined, so set resource cycles to latency
-def : WriteRes<WriteFSqrt32, [Rocket32UnitFPDivSqrt]> { let Latency = 20;
- let ResourceCycles = [20];}
-def : WriteRes<WriteFSqrt64, [Rocket32UnitFPDivSqrt]> { let Latency = 25;
- let ResourceCycles = [25];}
-
-def : WriteRes<WriteNop, []>;
-
-def : InstRW<[WriteIALU], (instrs COPY)>;
-
-let Unsupported = 1 in {
-def : WriteRes<WriteIALU32, []>;
-def : WriteRes<WriteShift32, []>;
-def : WriteRes<WriteIMul32, []>;
-def : WriteRes<WriteIDiv32, []>;
-def : WriteRes<WriteSTD, []>;
-def : WriteRes<WriteLDWU, []>;
-def : WriteRes<WriteLDD, []>;
-def : WriteRes<WriteAtomicD, []>;
-def : WriteRes<WriteAtomicLDD, []>;
-def : WriteRes<WriteAtomicSTD, []>;
-def : WriteRes<WriteFCvtI64ToF32, []>;
-def : WriteRes<WriteFCvtI64ToF64, []>;
-def : WriteRes<WriteFCvtF64ToI64, []>;
-def : WriteRes<WriteFCvtF32ToI64, []>;
-def : WriteRes<WriteFMovI64ToF64, []>;
-def : WriteRes<WriteFMovF64ToI64, []>;
-}
-
-//===----------------------------------------------------------------------===//
-// Subtarget-specific SchedRead types with cycles.
-// Dummy definitions for RocketCore.
-def : ReadAdvance<ReadJmp, 0>;
-def : ReadAdvance<ReadJalr, 0>;
-def : ReadAdvance<ReadCSR, 0>;
-def : ReadAdvance<ReadStoreData, 0>;
-def : ReadAdvance<ReadMemBase, 0>;
-def : ReadAdvance<ReadIALU, 0>;
-def : ReadAdvance<ReadIALU32, 0>;
-def : ReadAdvance<ReadShift, 0>;
-def : ReadAdvance<ReadShift32, 0>;
-def : ReadAdvance<ReadIDiv, 0>;
-def : ReadAdvance<ReadIDiv32, 0>;
-def : ReadAdvance<ReadIMul, 0>;
-def : ReadAdvance<ReadIMul32, 0>;
-def : ReadAdvance<ReadAtomicWA, 0>;
-def : ReadAdvance<ReadAtomicWD, 0>;
-def : ReadAdvance<ReadAtomicDA, 0>;
-def : ReadAdvance<ReadAtomicDD, 0>;
-def : ReadAdvance<ReadAtomicLDW, 0>;
-def : ReadAdvance<ReadAtomicLDD, 0>;
-def : ReadAdvance<ReadAtomicSTW, 0>;
-def : ReadAdvance<ReadAtomicSTD, 0>;
-def : ReadAdvance<ReadFMemBase, 0>;
-def : ReadAdvance<ReadFALU32, 0>;
-def : ReadAdvance<ReadFALU64, 0>;
-def : ReadAdvance<ReadFMul32, 0>;
-def : ReadAdvance<ReadFMulAdd32, 0>;
-def : ReadAdvance<ReadFMulSub32, 0>;
-def : ReadAdvance<ReadFMul64, 0>;
-def : ReadAdvance<ReadFMulAdd64, 0>;
-def : ReadAdvance<ReadFMulSub64, 0>;
-def : ReadAdvance<ReadFDiv32, 0>;
-def : ReadAdvance<ReadFDiv64, 0>;
-def : ReadAdvance<ReadFSqrt32, 0>;
-def : ReadAdvance<ReadFSqrt64, 0>;
-def : ReadAdvance<ReadFCmp32, 0>;
-def : ReadAdvance<ReadFCmp64, 0>;
-def : ReadAdvance<ReadFSGNJ32, 0>;
-def : ReadAdvance<ReadFSGNJ64, 0>;
-def : ReadAdvance<ReadFMinMax32, 0>;
-def : ReadAdvance<ReadFMinMax64, 0>;
-def : ReadAdvance<ReadFCvtF32ToI32, 0>;
-def : ReadAdvance<ReadFCvtF32ToI64, 0>;
-def : ReadAdvance<ReadFCvtF64ToI32, 0>;
-def : ReadAdvance<ReadFCvtF64ToI64, 0>;
-def : ReadAdvance<ReadFCvtI32ToF32, 0>;
-def : ReadAdvance<ReadFCvtI32ToF64, 0>;
-def : ReadAdvance<ReadFCvtI64ToF32, 0>;
-def : ReadAdvance<ReadFCvtI64ToF64, 0>;
-def : ReadAdvance<ReadFCvtF32ToF64, 0>;
-def : ReadAdvance<ReadFCvtF64ToF32, 0>;
-def : ReadAdvance<ReadFMovF32ToI32, 0>;
-def : ReadAdvance<ReadFMovI32ToF32, 0>;
-def : ReadAdvance<ReadFMovF64ToI64, 0>;
-def : ReadAdvance<ReadFMovI64ToF64, 0>;
-def : ReadAdvance<ReadFClass32, 0>;
-def : ReadAdvance<ReadFClass64, 0>;
-}
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedRocket64.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedRocket64.td
deleted file mode 100644
index e8514a275c45..000000000000
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedRocket64.td
+++ /dev/null
@@ -1,228 +0,0 @@
-//==- RISCVSchedRocket64.td - Rocket Scheduling Definitions -*- tablegen -*-=//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// ===---------------------------------------------------------------------===//
-// The following definitions describe the simpler per-operand machine model.
-// This works with MachineScheduler. See MCSchedule.h for details.
-
-// Rocket machine model for scheduling and other instruction cost heuristics.
-def Rocket64Model : SchedMachineModel {
- let MicroOpBufferSize = 0; // Explicitly set to zero since Rocket is in-order.
- let IssueWidth = 1; // 1 micro-ops are dispatched per cycle.
- let LoadLatency = 3;
- let MispredictPenalty = 3;
- let UnsupportedFeatures = [HasStdExtV];
-}
-
-//===----------------------------------------------------------------------===//
-// Define each kind of processor resource and number available.
-
-// Modeling each pipeline as a ProcResource using the BufferSize = 0 since
-// Rocket is in-order.
-
-let BufferSize = 0 in {
-def Rocket64UnitALU : ProcResource<1>; // Int ALU
-def Rocket64UnitIMul : ProcResource<1>; // Int Multiply
-def Rocket64UnitMem : ProcResource<1>; // Load/Store
-def Rocket64UnitB : ProcResource<1>; // Branch
-
-def Rocket64UnitFPALU : ProcResource<1>; // FP ALU
-}
-
-let BufferSize = 1 in {
-def Rocket64UnitIDiv : ProcResource<1>; // Int Division
-def Rocket64UnitFPDivSqrt : ProcResource<1>; // FP Divide/Sqrt
-}
-
-//===----------------------------------------------------------------------===//
-// Subtarget-specific SchedWrite types which both map the ProcResources and
-// set the latency.
-
-let SchedModel = Rocket64Model in {
-
-def : WriteRes<WriteJmp, [Rocket64UnitB]>;
-def : WriteRes<WriteJal, [Rocket64UnitB]>;
-def : WriteRes<WriteJalr, [Rocket64UnitB]>;
-def : WriteRes<WriteJmpReg, [Rocket64UnitB]>;
-
-def : WriteRes<WriteIALU32, [Rocket64UnitALU]>;
-def : WriteRes<WriteIALU, [Rocket64UnitALU]>;
-def : WriteRes<WriteShift32, [Rocket64UnitALU]>;
-def : WriteRes<WriteShift, [Rocket64UnitALU]>;
-
-let Latency = 4 in {
-def : WriteRes<WriteIMul, [Rocket64UnitIMul]>;
-def : WriteRes<WriteIMul32, [Rocket64UnitIMul]>;
-}
-
-// Integer divide varies based on operand magnitude and sign; worst case latency is 34.
-def : WriteRes<WriteIDiv32, [Rocket64UnitIDiv]> {
- let Latency = 34;
- let ResourceCycles = [34];
-}
-def : WriteRes<WriteIDiv, [Rocket64UnitIDiv]> {
- let Latency = 33;
- let ResourceCycles = [33];
-}
-
-// Memory
-def : WriteRes<WriteSTB, [Rocket64UnitMem]>;
-def : WriteRes<WriteSTH, [Rocket64UnitMem]>;
-def : WriteRes<WriteSTW, [Rocket64UnitMem]>;
-def : WriteRes<WriteSTD, [Rocket64UnitMem]>;
-def : WriteRes<WriteFST32, [Rocket64UnitMem]>;
-def : WriteRes<WriteFST64, [Rocket64UnitMem]>;
-
-let Latency = 3 in {
-def : WriteRes<WriteLDB, [Rocket64UnitMem]>;
-def : WriteRes<WriteLDH, [Rocket64UnitMem]>;
-def : WriteRes<WriteCSR, [Rocket64UnitALU]>;
-}
-
-let Latency = 2 in {
-def : WriteRes<WriteLDW, [Rocket64UnitMem]>;
-def : WriteRes<WriteLDWU, [Rocket64UnitMem]>;
-def : WriteRes<WriteLDD, [Rocket64UnitMem]>;
-def : WriteRes<WriteFLD32, [Rocket64UnitMem]>;
-def : WriteRes<WriteFLD64, [Rocket64UnitMem]>;
-
-def : WriteRes<WriteAtomicW, [Rocket64UnitMem]>;
-def : WriteRes<WriteAtomicD, [Rocket64UnitMem]>;
-
-def : WriteRes<WriteAtomicLDW, [Rocket64UnitMem]>;
-def : WriteRes<WriteAtomicLDD, [Rocket64UnitMem]>;
-}
-
-def : WriteRes<WriteAtomicSTW, [Rocket64UnitMem]>;
-def : WriteRes<WriteAtomicSTD, [Rocket64UnitMem]>;
-
-// Most FP single precision operations are 4 cycles
-let Latency = 4 in {
-def : WriteRes<WriteFALU32, [Rocket64UnitFPALU]>;
-def : WriteRes<WriteFSGNJ32, [Rocket64UnitFPALU]>;
-def : WriteRes<WriteFMinMax32, [Rocket64UnitFPALU]>;
-}
-
-let Latency = 6 in {
-// Most FP double precision operations are 6 cycles
-def : WriteRes<WriteFALU64, [Rocket64UnitFPALU]>;
-def : WriteRes<WriteFSGNJ64, [Rocket64UnitFPALU]>;
-def : WriteRes<WriteFMinMax64, [Rocket64UnitFPALU]>;
-}
-
-// Conversion instructions
-let Latency = 2 in {
-def : WriteRes<WriteFCvtI32ToF32, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFCvtI32ToF64, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFCvtI64ToF32, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFCvtI64ToF64, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFCvtF32ToI32, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFCvtF32ToI64, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFCvtF64ToI32, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFCvtF64ToI64, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFCvtF32ToF64, [Rocket32UnitFPALU]>;
-def : WriteRes<WriteFCvtF64ToF32, [Rocket32UnitFPALU]>;
-
-def : WriteRes<WriteFClass32, [Rocket64UnitFPALU]>;
-def : WriteRes<WriteFClass64, [Rocket64UnitFPALU]>;
-def : WriteRes<WriteFCmp32, [Rocket64UnitFPALU]>;
-def : WriteRes<WriteFCmp64, [Rocket64UnitFPALU]>;
-def : WriteRes<WriteFMovF32ToI32, [Rocket64UnitFPALU]>;
-def : WriteRes<WriteFMovI32ToF32, [Rocket64UnitFPALU]>;
-def : WriteRes<WriteFMovF64ToI64, [Rocket64UnitFPALU]>;
-def : WriteRes<WriteFMovI64ToF64, [Rocket64UnitFPALU]>;
-}
-
-let Latency = 5 in {
-def : WriteRes<WriteFMul32, [Rocket64UnitFPALU]>;
-def : WriteRes<WriteFMulAdd32, [Rocket64UnitFPALU]>;
-def : WriteRes<WriteFMulSub32, [Rocket64UnitFPALU]>;
-}
-
-let Latency = 7 in {
-def : WriteRes<WriteFMul64, [Rocket64UnitFPALU]>;
-def : WriteRes<WriteFMulAdd64, [Rocket64UnitFPALU]>;
-def : WriteRes<WriteFMulSub64, [Rocket64UnitFPALU]>;
-}
-
-// FP Divide unit on Rocket is not pipelined, so set resource cycles to latency
-let Latency = 20, ResourceCycles = [20] in {
-def : WriteRes<WriteFDiv32, [Rocket64UnitFPDivSqrt]>;
-def : WriteRes<WriteFDiv64, [Rocket64UnitFPDivSqrt]>;
-}
-
-// FP Sqrt unit on Rocket is not pipelined, so set resource cycles to latency
-def : WriteRes<WriteFSqrt32, [Rocket64UnitFPDivSqrt]> { let Latency = 20;
- let ResourceCycles = [20]; }
-def : WriteRes<WriteFSqrt64, [Rocket64UnitFPDivSqrt]> { let Latency = 25;
- let ResourceCycles = [25]; }
-
-def : WriteRes<WriteNop, []>;
-
-def : InstRW<[WriteIALU], (instrs COPY)>;
-
-//===----------------------------------------------------------------------===//
-// Subtarget-specific SchedRead types with cycles.
-// Dummy definitions for RocketCore.
-def : ReadAdvance<ReadJmp, 0>;
-def : ReadAdvance<ReadJalr, 0>;
-def : ReadAdvance<ReadCSR, 0>;
-def : ReadAdvance<ReadStoreData, 0>;
-def : ReadAdvance<ReadMemBase, 0>;
-def : ReadAdvance<ReadIALU, 0>;
-def : ReadAdvance<ReadIALU32, 0>;
-def : ReadAdvance<ReadShift, 0>;
-def : ReadAdvance<ReadShift32, 0>;
-def : ReadAdvance<ReadIDiv, 0>;
-def : ReadAdvance<ReadIDiv32, 0>;
-def : ReadAdvance<ReadIMul, 0>;
-def : ReadAdvance<ReadIMul32, 0>;
-def : ReadAdvance<ReadAtomicWA, 0>;
-def : ReadAdvance<ReadAtomicWD, 0>;
-def : ReadAdvance<ReadAtomicDA, 0>;
-def : ReadAdvance<ReadAtomicDD, 0>;
-def : ReadAdvance<ReadAtomicLDW, 0>;
-def : ReadAdvance<ReadAtomicLDD, 0>;
-def : ReadAdvance<ReadAtomicSTW, 0>;
-def : ReadAdvance<ReadAtomicSTD, 0>;
-def : ReadAdvance<ReadFMemBase, 0>;
-def : ReadAdvance<ReadFALU32, 0>;
-def : ReadAdvance<ReadFALU64, 0>;
-def : ReadAdvance<ReadFMul32, 0>;
-def : ReadAdvance<ReadFMulAdd32, 0>;
-def : ReadAdvance<ReadFMulSub32, 0>;
-def : ReadAdvance<ReadFMul64, 0>;
-def : ReadAdvance<ReadFMulAdd64, 0>;
-def : ReadAdvance<ReadFMulSub64, 0>;
-def : ReadAdvance<ReadFDiv32, 0>;
-def : ReadAdvance<ReadFDiv64, 0>;
-def : ReadAdvance<ReadFSqrt32, 0>;
-def : ReadAdvance<ReadFSqrt64, 0>;
-def : ReadAdvance<ReadFCmp32, 0>;
-def : ReadAdvance<ReadFCmp64, 0>;
-def : ReadAdvance<ReadFSGNJ32, 0>;
-def : ReadAdvance<ReadFSGNJ64, 0>;
-def : ReadAdvance<ReadFMinMax32, 0>;
-def : ReadAdvance<ReadFMinMax64, 0>;
-def : ReadAdvance<ReadFCvtF32ToI32, 0>;
-def : ReadAdvance<ReadFCvtF32ToI64, 0>;
-def : ReadAdvance<ReadFCvtF64ToI32, 0>;
-def : ReadAdvance<ReadFCvtF64ToI64, 0>;
-def : ReadAdvance<ReadFCvtI32ToF32, 0>;
-def : ReadAdvance<ReadFCvtI32ToF64, 0>;
-def : ReadAdvance<ReadFCvtI64ToF32, 0>;
-def : ReadAdvance<ReadFCvtI64ToF64, 0>;
-def : ReadAdvance<ReadFCvtF32ToF64, 0>;
-def : ReadAdvance<ReadFCvtF64ToF32, 0>;
-def : ReadAdvance<ReadFMovF32ToI32, 0>;
-def : ReadAdvance<ReadFMovI32ToF32, 0>;
-def : ReadAdvance<ReadFMovF64ToI64, 0>;
-def : ReadAdvance<ReadFMovI64ToF64, 0>;
-def : ReadAdvance<ReadFClass32, 0>;
-def : ReadAdvance<ReadFClass64, 0>;
-}
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
new file mode 100644
index 000000000000..e57ba4f61b98
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -0,0 +1,222 @@
+//==- RISCVSchedSiFive7.td - SiFive7 Scheduling Definitions --*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+
+// SiFive7 machine model for scheduling and other instruction cost heuristics.
+def SiFive7Model : SchedMachineModel {
+ let MicroOpBufferSize = 0; // Explicitly set to zero since SiFive7 is in-order.
+ let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+ let LoadLatency = 3;
+ let MispredictPenalty = 3;
+ let CompleteModel = 0;
+ let UnsupportedFeatures = [HasStdExtV, HasStdExtZvamo, HasStdExtZvlsseg];
+}
+
+// The SiFive7 microarchitecture has two pipelines: A and B.
+// Pipe A can handle memory, integer ALU and vector operations.
+// Pipe B can handle integer ALU, control flow, integer multiply and divide,
+// and floating point computation.
+let SchedModel = SiFive7Model in {
+let BufferSize = 0 in {
+def SiFive7PipeA : ProcResource<1>;
+def SiFive7PipeB : ProcResource<1>;
+}
+
+let BufferSize = 1 in {
+def SiFive7IDiv : ProcResource<1> { let Super = SiFive7PipeB; } // Int Division
+def SiFive7FDiv : ProcResource<1> { let Super = SiFive7PipeB; } // FP Division/Sqrt
+}
+
+def SiFive7PipeAB : ProcResGroup<[SiFive7PipeA, SiFive7PipeB]>;
+
+// Branching
+def : WriteRes<WriteJmp, [SiFive7PipeB]>;
+def : WriteRes<WriteJal, [SiFive7PipeB]>;
+def : WriteRes<WriteJalr, [SiFive7PipeB]>;
+def : WriteRes<WriteJmpReg, [SiFive7PipeB]>;
+
+// Integer arithmetic and logic
+let Latency = 3 in {
+def : WriteRes<WriteIALU, [SiFive7PipeAB]>;
+def : WriteRes<WriteIALU32, [SiFive7PipeAB]>;
+def : WriteRes<WriteShift, [SiFive7PipeAB]>;
+def : WriteRes<WriteShift32, [SiFive7PipeAB]>;
+}
+
+// Integer multiplication
+let Latency = 3 in {
+def : WriteRes<WriteIMul, [SiFive7PipeB]>;
+def : WriteRes<WriteIMul32, [SiFive7PipeB]>;
+}
+
+// Integer division
+def : WriteRes<WriteIDiv, [SiFive7PipeB, SiFive7IDiv]> {
+ let Latency = 16;
+ let ResourceCycles = [1, 15];
+}
+def : WriteRes<WriteIDiv32, [SiFive7PipeB, SiFive7IDiv]> {
+ let Latency = 16;
+ let ResourceCycles = [1, 15];
+}
+
+// Memory
+def : WriteRes<WriteSTB, [SiFive7PipeA]>;
+def : WriteRes<WriteSTH, [SiFive7PipeA]>;
+def : WriteRes<WriteSTW, [SiFive7PipeA]>;
+def : WriteRes<WriteSTD, [SiFive7PipeA]>;
+def : WriteRes<WriteFST32, [SiFive7PipeA]>;
+def : WriteRes<WriteFST64, [SiFive7PipeA]>;
+
+let Latency = 3 in {
+def : WriteRes<WriteLDB, [SiFive7PipeA]>;
+def : WriteRes<WriteLDH, [SiFive7PipeA]>;
+def : WriteRes<WriteLDW, [SiFive7PipeA]>;
+def : WriteRes<WriteLDWU, [SiFive7PipeA]>;
+def : WriteRes<WriteLDD, [SiFive7PipeA]>;
+}
+
+let Latency = 2 in {
+def : WriteRes<WriteFLD32, [SiFive7PipeA]>;
+def : WriteRes<WriteFLD64, [SiFive7PipeA]>;
+}
+
+// Atomic memory
+def : WriteRes<WriteAtomicSTW, [SiFive7PipeA]>;
+def : WriteRes<WriteAtomicSTD, [SiFive7PipeA]>;
+
+let Latency = 3 in {
+def : WriteRes<WriteAtomicW, [SiFive7PipeA]>;
+def : WriteRes<WriteAtomicD, [SiFive7PipeA]>;
+def : WriteRes<WriteAtomicLDW, [SiFive7PipeA]>;
+def : WriteRes<WriteAtomicLDD, [SiFive7PipeA]>;
+}
+
+// Single precision.
+let Latency = 5 in {
+def : WriteRes<WriteFALU32, [SiFive7PipeB]>;
+def : WriteRes<WriteFMul32, [SiFive7PipeB]>;
+def : WriteRes<WriteFMulAdd32, [SiFive7PipeB]>;
+def : WriteRes<WriteFMulSub32, [SiFive7PipeB]>;
+}
+let Latency = 3 in {
+def : WriteRes<WriteFSGNJ32, [SiFive7PipeB]>;
+def : WriteRes<WriteFMinMax32, [SiFive7PipeB]>;
+}
+
+def : WriteRes<WriteFDiv32, [SiFive7PipeB, SiFive7FDiv]> { let Latency = 27;
+ let ResourceCycles = [1, 26]; }
+def : WriteRes<WriteFSqrt32, [SiFive7PipeB, SiFive7FDiv]> { let Latency = 27;
+ let ResourceCycles = [1, 26]; }
+
+// Double precision
+let Latency = 7 in {
+def : WriteRes<WriteFALU64, [SiFive7PipeB]>;
+def : WriteRes<WriteFMul64, [SiFive7PipeB]>;
+def : WriteRes<WriteFMulAdd64, [SiFive7PipeB]>;
+def : WriteRes<WriteFMulSub64, [SiFive7PipeB]>;
+}
+let Latency = 3 in {
+def : WriteRes<WriteFSGNJ64, [SiFive7PipeB]>;
+def : WriteRes<WriteFMinMax64, [SiFive7PipeB]>;
+}
+
+def : WriteRes<WriteFDiv64, [SiFive7PipeB, SiFive7FDiv]> { let Latency = 56;
+ let ResourceCycles = [1, 55]; }
+def : WriteRes<WriteFSqrt64, [SiFive7PipeB, SiFive7FDiv]> { let Latency = 56;
+ let ResourceCycles = [1, 55]; }
+
+// Conversions
+let Latency = 3 in {
+def : WriteRes<WriteFCvtI32ToF32, [SiFive7PipeB]>;
+def : WriteRes<WriteFCvtI32ToF64, [SiFive7PipeB]>;
+def : WriteRes<WriteFCvtI64ToF32, [SiFive7PipeB]>;
+def : WriteRes<WriteFCvtI64ToF64, [SiFive7PipeB]>;
+def : WriteRes<WriteFCvtF32ToI32, [SiFive7PipeB]>;
+def : WriteRes<WriteFCvtF32ToI64, [SiFive7PipeB]>;
+def : WriteRes<WriteFCvtF32ToF64, [SiFive7PipeB]>;
+def : WriteRes<WriteFCvtF64ToI32, [SiFive7PipeB]>;
+def : WriteRes<WriteFCvtF64ToI64, [SiFive7PipeB]>;
+def : WriteRes<WriteFCvtF64ToF32, [SiFive7PipeB]>;
+
+def : WriteRes<WriteFClass32, [SiFive7PipeB]>;
+def : WriteRes<WriteFClass64, [SiFive7PipeB]>;
+def : WriteRes<WriteFCmp32, [SiFive7PipeB]>;
+def : WriteRes<WriteFCmp64, [SiFive7PipeB]>;
+def : WriteRes<WriteFMovI32ToF32, [SiFive7PipeB]>;
+def : WriteRes<WriteFMovF32ToI32, [SiFive7PipeB]>;
+def : WriteRes<WriteFMovI64ToF64, [SiFive7PipeB]>;
+def : WriteRes<WriteFMovF64ToI64, [SiFive7PipeB]>;
+}
+
+// Others
+def : WriteRes<WriteCSR, [SiFive7PipeB]>;
+def : WriteRes<WriteNop, []>;
+
+def : InstRW<[WriteIALU], (instrs COPY)>;
+
+
+//===----------------------------------------------------------------------===//
+// Bypass and advance
+def : ReadAdvance<ReadJmp, 0>;
+def : ReadAdvance<ReadJalr, 0>;
+def : ReadAdvance<ReadCSR, 0>;
+def : ReadAdvance<ReadStoreData, 0>;
+def : ReadAdvance<ReadMemBase, 0>;
+def : ReadAdvance<ReadIALU, 0>;
+def : ReadAdvance<ReadIALU32, 0>;
+def : ReadAdvance<ReadShift, 0>;
+def : ReadAdvance<ReadShift32, 0>;
+def : ReadAdvance<ReadIDiv, 0>;
+def : ReadAdvance<ReadIDiv32, 0>;
+def : ReadAdvance<ReadIMul, 0>;
+def : ReadAdvance<ReadIMul32, 0>;
+def : ReadAdvance<ReadAtomicWA, 0>;
+def : ReadAdvance<ReadAtomicWD, 0>;
+def : ReadAdvance<ReadAtomicDA, 0>;
+def : ReadAdvance<ReadAtomicDD, 0>;
+def : ReadAdvance<ReadAtomicLDW, 0>;
+def : ReadAdvance<ReadAtomicLDD, 0>;
+def : ReadAdvance<ReadAtomicSTW, 0>;
+def : ReadAdvance<ReadAtomicSTD, 0>;
+def : ReadAdvance<ReadFMemBase, 0>;
+def : ReadAdvance<ReadFALU32, 0>;
+def : ReadAdvance<ReadFALU64, 0>;
+def : ReadAdvance<ReadFMul32, 0>;
+def : ReadAdvance<ReadFMulAdd32, 0>;
+def : ReadAdvance<ReadFMulSub32, 0>;
+def : ReadAdvance<ReadFMul64, 0>;
+def : ReadAdvance<ReadFMulAdd64, 0>;
+def : ReadAdvance<ReadFMulSub64, 0>;
+def : ReadAdvance<ReadFDiv32, 0>;
+def : ReadAdvance<ReadFDiv64, 0>;
+def : ReadAdvance<ReadFSqrt32, 0>;
+def : ReadAdvance<ReadFSqrt64, 0>;
+def : ReadAdvance<ReadFCmp32, 0>;
+def : ReadAdvance<ReadFCmp64, 0>;
+def : ReadAdvance<ReadFSGNJ32, 0>;
+def : ReadAdvance<ReadFSGNJ64, 0>;
+def : ReadAdvance<ReadFMinMax32, 0>;
+def : ReadAdvance<ReadFMinMax64, 0>;
+def : ReadAdvance<ReadFCvtF32ToI32, 0>;
+def : ReadAdvance<ReadFCvtF32ToI64, 0>;
+def : ReadAdvance<ReadFCvtF64ToI32, 0>;
+def : ReadAdvance<ReadFCvtF64ToI64, 0>;
+def : ReadAdvance<ReadFCvtI32ToF32, 0>;
+def : ReadAdvance<ReadFCvtI32ToF64, 0>;
+def : ReadAdvance<ReadFCvtI64ToF32, 0>;
+def : ReadAdvance<ReadFCvtI64ToF64, 0>;
+def : ReadAdvance<ReadFCvtF32ToF64, 0>;
+def : ReadAdvance<ReadFCvtF64ToF32, 0>;
+def : ReadAdvance<ReadFMovF32ToI32, 0>;
+def : ReadAdvance<ReadFMovI32ToF32, 0>;
+def : ReadAdvance<ReadFMovF64ToI64, 0>;
+def : ReadAdvance<ReadFMovI64ToF64, 0>;
+def : ReadAdvance<ReadFClass32, 0>;
+def : ReadAdvance<ReadFClass64, 0>;
+}
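The new SiFive7 model above follows the standard MachineScheduler pattern: each WriteRes binds a SchedWrite class to the pipe(s) it occupies plus a latency, and the ReadAdvance<..., 0> entries mean no operand bypassing is modelled. As an illustrative reading of one entry (explanatory comment added here, not part of the patch):

// WriteRes<WriteIDiv, [SiFive7PipeB, SiFive7IDiv]> with Latency = 16 and
// ResourceCycles = [1, 15]: a divide holds PipeB for one cycle, then the
// single-entry (BufferSize = 1) divider for fifteen more, and its result
// becomes available 16 cycles after issue.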
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedule.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedule.td
index bbcd03d46236..0806be8a8d87 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedule.td
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSchedule.td
@@ -1,4 +1,4 @@
-//===-- RISCVSchedule.td - RISCV Scheduling Definitions -------*- tablegen -*-===//
+//===-- RISCVSchedule.td - RISCV Scheduling Definitions ----*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
index 47a48c820a29..df11d237a16c 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -30,13 +30,16 @@ using namespace llvm;
void RISCVSubtarget::anchor() {}
RISCVSubtarget &RISCVSubtarget::initializeSubtargetDependencies(
- const Triple &TT, StringRef CPU, StringRef FS, StringRef ABIName) {
+ const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS, StringRef ABIName) {
// Determine default and user-specified characteristics
bool Is64Bit = TT.isArch64Bit();
std::string CPUName = std::string(CPU);
+ std::string TuneCPUName = std::string(TuneCPU);
if (CPUName.empty())
CPUName = Is64Bit ? "generic-rv64" : "generic-rv32";
- ParseSubtargetFeatures(CPUName, FS);
+ if (TuneCPUName.empty())
+ TuneCPUName = CPUName;
+ ParseSubtargetFeatures(CPUName, TuneCPUName, FS);
if (Is64Bit) {
XLenVT = MVT::i64;
XLen = 64;
@@ -47,11 +50,12 @@ RISCVSubtarget &RISCVSubtarget::initializeSubtargetDependencies(
return *this;
}
-RISCVSubtarget::RISCVSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
+RISCVSubtarget::RISCVSubtarget(const Triple &TT, StringRef CPU,
+ StringRef TuneCPU, StringRef FS,
StringRef ABIName, const TargetMachine &TM)
- : RISCVGenSubtargetInfo(TT, CPU, FS),
+ : RISCVGenSubtargetInfo(TT, CPU, TuneCPU, FS),
UserReservedRegister(RISCV::NUM_TARGET_REGS),
- FrameLowering(initializeSubtargetDependencies(TT, CPU, FS, ABIName)),
+ FrameLowering(initializeSubtargetDependencies(TT, CPU, TuneCPU, FS, ABIName)),
InstrInfo(*this), RegInfo(getHwMode()), TLInfo(TM, *this) {
CallLoweringInfo.reset(new RISCVCallLowering(*getTargetLowering()));
Legalizer.reset(new RISCVLegalizerInfo(*this));
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSubtarget.h b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSubtarget.h
index fe1285f23b15..561b04cc0b44 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -13,10 +13,10 @@
#ifndef LLVM_LIB_TARGET_RISCV_RISCVSUBTARGET_H
#define LLVM_LIB_TARGET_RISCV_RISCVSUBTARGET_H
+#include "MCTargetDesc/RISCVBaseInfo.h"
#include "RISCVFrameLowering.h"
#include "RISCVISelLowering.h"
#include "RISCVInstrInfo.h"
-#include "Utils/RISCVBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
@@ -40,6 +40,7 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
bool HasStdExtD = false;
bool HasStdExtC = false;
bool HasStdExtB = false;
+ bool HasStdExtZba = false;
bool HasStdExtZbb = false;
bool HasStdExtZbc = false;
bool HasStdExtZbe = false;
@@ -51,6 +52,9 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
bool HasStdExtZbt = false;
bool HasStdExtZbproposedc = false;
bool HasStdExtV = false;
+ bool HasStdExtZvlsseg = false;
+ bool HasStdExtZvamo = false;
+ bool HasStdExtZfh = false;
bool HasRV64 = false;
bool IsRV32E = false;
bool EnableLinkerRelax = false;
@@ -69,17 +73,19 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
/// Initializes using the passed in CPU and feature strings so that we can
/// use initializer lists for subtarget initialization.
RISCVSubtarget &initializeSubtargetDependencies(const Triple &TT,
- StringRef CPU, StringRef FS,
+ StringRef CPU,
+ StringRef TuneCPU,
+ StringRef FS,
StringRef ABIName);
public:
// Initializes the data members to match that of the specified triple.
- RISCVSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
- StringRef ABIName, const TargetMachine &TM);
+ RISCVSubtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU,
+ StringRef FS, StringRef ABIName, const TargetMachine &TM);
// Parses features string setting specified subtarget options. The
// definition of this function is auto-generated by tblgen.
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
const RISCVFrameLowering *getFrameLowering() const override {
return &FrameLowering;
@@ -101,6 +107,7 @@ public:
bool hasStdExtD() const { return HasStdExtD; }
bool hasStdExtC() const { return HasStdExtC; }
bool hasStdExtB() const { return HasStdExtB; }
+ bool hasStdExtZba() const { return HasStdExtZba; }
bool hasStdExtZbb() const { return HasStdExtZbb; }
bool hasStdExtZbc() const { return HasStdExtZbc; }
bool hasStdExtZbe() const { return HasStdExtZbe; }
@@ -112,6 +119,9 @@ public:
bool hasStdExtZbt() const { return HasStdExtZbt; }
bool hasStdExtZbproposedc() const { return HasStdExtZbproposedc; }
bool hasStdExtV() const { return HasStdExtV; }
+ bool hasStdExtZvlsseg() const { return HasStdExtZvlsseg; }
+ bool hasStdExtZvamo() const { return HasStdExtZvamo; }
+ bool hasStdExtZfh() const { return HasStdExtZfh; }
bool is64Bit() const { return HasRV64; }
bool isRV32E() const { return IsRV32E; }
bool enableLinkerRelax() const { return EnableLinkerRelax; }
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSystemOperands.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSystemOperands.td
index 8e75647bd4a9..16399fea150e 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSystemOperands.td
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSystemOperands.td
@@ -310,7 +310,9 @@ def: SysReg<"mhpmcounter31h", 0xB9F>;
//===--------------------------
// Machine Counter Setup
//===--------------------------
+let AltName = "mucounteren" in // Privileged spec v1.9.1 Name
def : SysReg<"mcountinhibit", 0x320>;
+
def : SysReg<"mhpmevent3", 0x323>;
def : SysReg<"mhpmevent4", 0x324>;
def : SysReg<"mhpmevent5", 0x325>;
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 75683e2fd8e9..32fb7cd6753c 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -11,11 +11,11 @@
//===----------------------------------------------------------------------===//
#include "RISCVTargetMachine.h"
+#include "MCTargetDesc/RISCVBaseInfo.h"
#include "RISCV.h"
#include "RISCVTargetObjectFile.h"
#include "RISCVTargetTransformInfo.h"
#include "TargetInfo/RISCVTargetInfo.h"
-#include "Utils/RISCVBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
@@ -35,18 +35,18 @@ using namespace llvm;
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
- auto PR = PassRegistry::getPassRegistry();
+ auto *PR = PassRegistry::getPassRegistry();
initializeGlobalISel(*PR);
+ initializeRISCVMergeBaseOffsetOptPass(*PR);
initializeRISCVExpandPseudoPass(*PR);
+ initializeRISCVCleanupVSETVLIPass(*PR);
}
static StringRef computeDataLayout(const Triple &TT) {
- if (TT.isArch64Bit()) {
+ if (TT.isArch64Bit())
return "e-m:e-p:64:64-i64:64-i128:128-n64-S128";
- } else {
- assert(TT.isArch32Bit() && "only RV32 and RV64 are currently supported");
- return "e-m:e-p:32:32-i64:64-n32-S128";
- }
+ assert(TT.isArch32Bit() && "only RV32 and RV64 are currently supported");
+ return "e-m:e-p:32:32-i64:64-n32-S128";
}
static Reloc::Model getEffectiveRelocModel(const Triple &TT,
@@ -75,15 +75,16 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT,
const RISCVSubtarget *
RISCVTargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
+ Attribute TuneAttr = F.getFnAttribute("tune-cpu");
Attribute FSAttr = F.getFnAttribute("target-features");
- std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
- ? CPUAttr.getValueAsString().str()
- : TargetCPU;
- std::string FS = !FSAttr.hasAttribute(Attribute::None)
- ? FSAttr.getValueAsString().str()
- : TargetFS;
- std::string Key = CPU + FS;
+ std::string CPU =
+ CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU;
+ std::string TuneCPU =
+ TuneAttr.isValid() ? TuneAttr.getValueAsString().str() : CPU;
+ std::string FS =
+ FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS;
+ std::string Key = CPU + TuneCPU + FS;
auto &I = SubtargetMap[Key];
if (!I) {
// This needs to be done before we create a new subtarget since any
@@ -100,7 +101,7 @@ RISCVTargetMachine::getSubtargetImpl(const Function &F) const {
}
ABIName = ModuleTargetABI->getString();
}
- I = std::make_unique<RISCVSubtarget>(TargetTriple, CPU, FS, ABIName, *this);
+ I = std::make_unique<RISCVSubtarget>(TargetTriple, CPU, TuneCPU, FS, ABIName, *this);
}
return I.get();
}
@@ -110,6 +111,15 @@ RISCVTargetMachine::getTargetTransformInfo(const Function &F) {
return TargetTransformInfo(RISCVTTIImpl(this, F));
}
+// A RISC-V hart has a single byte-addressable address space of 2^XLEN bytes
+// for all memory accesses, so it is reasonable to assume that an
+// implementation has no-op address space casts. If an implementation makes a
+// change to this, they can override it here.
+bool RISCVTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
+ unsigned DstAS) const {
+ return true;
+}
+
namespace {
class RISCVPassConfig : public TargetPassConfig {
public:
@@ -131,7 +141,7 @@ public:
void addPreSched2() override;
void addPreRegAlloc() override;
};
-}
+} // namespace
TargetPassConfig *RISCVTargetMachine::createPassConfig(PassManagerBase &PM) {
return new RISCVPassConfig(*this, PM);
@@ -149,7 +159,7 @@ bool RISCVPassConfig::addInstSelector() {
}
bool RISCVPassConfig::addIRTranslator() {
- addPass(new IRTranslator());
+ addPass(new IRTranslator(getOptLevel()));
return false;
}
@@ -181,5 +191,8 @@ void RISCVPassConfig::addPreEmitPass2() {
}
void RISCVPassConfig::addPreRegAlloc() {
- addPass(createRISCVMergeBaseOffsetOptPass());
+ if (TM->getOptLevel() != CodeGenOpt::None) {
+ addPass(createRISCVMergeBaseOffsetOptPass());
+ addPass(createRISCVCleanupVSETVLIPass());
+ }
}
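The getSubtargetImpl() change above keys the subtarget cache on CPU + TuneCPU + feature string, with the tune CPU falling back to the target CPU when a function carries no "tune-cpu" attribute. A minimal sketch of that fallback pattern, using a hypothetical attrOrDefault() helper (the committed code inlines the ternaries instead):

#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Attributes.h"
#include <string>

// Hypothetical helper: return the attribute's string value, or Default when
// the function carries no such attribute -- the same fallback used above for
// "target-cpu", "tune-cpu" (which defaults to the target CPU) and
// "target-features".
static std::string attrOrDefault(const llvm::Attribute &A,
                                 llvm::StringRef Default) {
  return A.isValid() ? A.getValueAsString().str() : Default.str();
}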
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetMachine.h
index a4476fa40a7d..3156333f7ee1 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetMachine.h
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetMachine.h
@@ -43,7 +43,10 @@ public:
}
TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+
+ virtual bool isNoopAddrSpaceCast(unsigned SrcAS,
+ unsigned DstAS) const override;
};
-}
+} // namespace llvm
#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index bd78f801c59a..27714cffc989 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "RISCVTargetTransformInfo.h"
-#include "Utils/RISCVMatInt.h"
+#include "MCTargetDesc/RISCVMatInt.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -30,8 +30,10 @@ int RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
getST()->is64Bit());
}
-int RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty, TTI::TargetCostKind CostKind) {
+int RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst) {
assert(Ty->isIntegerTy() &&
"getIntImmCost can only estimate cost of materialising integers");
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 392700707760..8d077e946305 100644
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -42,8 +42,9 @@ public:
TLI(ST->getTargetLowering()) {}
int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind);
- int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind);
+ int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty, TTI::TargetCostKind CostKind,
+ Instruction *Inst = nullptr);
int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
Type *Ty, TTI::TargetCostKind CostKind);
};
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.h b/contrib/llvm-project/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.h
deleted file mode 100644
index 4e6cdd8606b1..000000000000
--- a/contrib/llvm-project/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.h
+++ /dev/null
@@ -1,223 +0,0 @@
-//===-- RISCVBaseInfo.h - Top level definitions for RISCV MC ----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains small standalone enum definitions for the RISCV target
-// useful for the compiler back-end and the MC libraries.
-//
-//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVBASEINFO_H
-#define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVBASEINFO_H
-
-#include "RISCVRegisterInfo.h"
-#include "MCTargetDesc/RISCVMCTargetDesc.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/MC/SubtargetFeature.h"
-
-namespace llvm {
-
-// RISCVII - This namespace holds all of the target specific flags that
-// instruction info tracks. All definitions must match RISCVInstrFormats.td.
-namespace RISCVII {
-enum {
- InstFormatPseudo = 0,
- InstFormatR = 1,
- InstFormatR4 = 2,
- InstFormatI = 3,
- InstFormatS = 4,
- InstFormatB = 5,
- InstFormatU = 6,
- InstFormatJ = 7,
- InstFormatCR = 8,
- InstFormatCI = 9,
- InstFormatCSS = 10,
- InstFormatCIW = 11,
- InstFormatCL = 12,
- InstFormatCS = 13,
- InstFormatCA = 14,
- InstFormatCB = 15,
- InstFormatCJ = 16,
- InstFormatOther = 17,
-
- InstFormatMask = 31,
-};
-
-// RISC-V Specific Machine Operand Flags
-enum {
- MO_None = 0,
- MO_CALL = 1,
- MO_PLT = 2,
- MO_LO = 3,
- MO_HI = 4,
- MO_PCREL_LO = 5,
- MO_PCREL_HI = 6,
- MO_GOT_HI = 7,
- MO_TPREL_LO = 8,
- MO_TPREL_HI = 9,
- MO_TPREL_ADD = 10,
- MO_TLS_GOT_HI = 11,
- MO_TLS_GD_HI = 12,
-
- // Used to differentiate between target-specific "direct" flags and "bitmask"
- // flags. A machine operand can only have one "direct" flag, but can have
- // multiple "bitmask" flags.
- MO_DIRECT_FLAG_MASK = 15
-};
-} // namespace RISCVII
-
-namespace RISCVOp {
-enum OperandType : unsigned {
- OPERAND_FIRST_RISCV_IMM = MCOI::OPERAND_FIRST_TARGET,
- OPERAND_UIMM4 = OPERAND_FIRST_RISCV_IMM,
- OPERAND_UIMM5,
- OPERAND_UIMM12,
- OPERAND_SIMM12,
- OPERAND_SIMM13_LSB0,
- OPERAND_UIMM20,
- OPERAND_SIMM21_LSB0,
- OPERAND_UIMMLOG2XLEN,
- OPERAND_LAST_RISCV_IMM = OPERAND_UIMMLOG2XLEN
-};
-} // namespace RISCVOp
-
-// Describes the predecessor/successor bits used in the FENCE instruction.
-namespace RISCVFenceField {
-enum FenceField {
- I = 8,
- O = 4,
- R = 2,
- W = 1
-};
-}
-
-// Describes the supported floating point rounding mode encodings.
-namespace RISCVFPRndMode {
-enum RoundingMode {
- RNE = 0,
- RTZ = 1,
- RDN = 2,
- RUP = 3,
- RMM = 4,
- DYN = 7,
- Invalid
-};
-
-inline static StringRef roundingModeToString(RoundingMode RndMode) {
- switch (RndMode) {
- default:
- llvm_unreachable("Unknown floating point rounding mode");
- case RISCVFPRndMode::RNE:
- return "rne";
- case RISCVFPRndMode::RTZ:
- return "rtz";
- case RISCVFPRndMode::RDN:
- return "rdn";
- case RISCVFPRndMode::RUP:
- return "rup";
- case RISCVFPRndMode::RMM:
- return "rmm";
- case RISCVFPRndMode::DYN:
- return "dyn";
- }
-}
-
-inline static RoundingMode stringToRoundingMode(StringRef Str) {
- return StringSwitch<RoundingMode>(Str)
- .Case("rne", RISCVFPRndMode::RNE)
- .Case("rtz", RISCVFPRndMode::RTZ)
- .Case("rdn", RISCVFPRndMode::RDN)
- .Case("rup", RISCVFPRndMode::RUP)
- .Case("rmm", RISCVFPRndMode::RMM)
- .Case("dyn", RISCVFPRndMode::DYN)
- .Default(RISCVFPRndMode::Invalid);
-}
-
-inline static bool isValidRoundingMode(unsigned Mode) {
- switch (Mode) {
- default:
- return false;
- case RISCVFPRndMode::RNE:
- case RISCVFPRndMode::RTZ:
- case RISCVFPRndMode::RDN:
- case RISCVFPRndMode::RUP:
- case RISCVFPRndMode::RMM:
- case RISCVFPRndMode::DYN:
- return true;
- }
-}
-} // namespace RISCVFPRndMode
-
-namespace RISCVSysReg {
-struct SysReg {
- const char *Name;
- unsigned Encoding;
- const char *AltName;
- // FIXME: add these additional fields when needed.
- // Privilege Access: Read, Write, Read-Only.
- // unsigned ReadWrite;
- // Privilege Mode: User, System or Machine.
- // unsigned Mode;
- // Check field name.
- // unsigned Extra;
- // Register number without the privilege bits.
- // unsigned Number;
- FeatureBitset FeaturesRequired;
- bool isRV32Only;
-
- bool haveRequiredFeatures(FeatureBitset ActiveFeatures) const {
- // Not in 32-bit mode.
- if (isRV32Only && ActiveFeatures[RISCV::Feature64Bit])
- return false;
- // No required feature associated with the system register.
- if (FeaturesRequired.none())
- return true;
- return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
- }
-};
-
-#define GET_SysRegsList_DECL
-#include "RISCVGenSystemOperands.inc"
-} // end namespace RISCVSysReg
-
-namespace RISCVABI {
-
-enum ABI {
- ABI_ILP32,
- ABI_ILP32F,
- ABI_ILP32D,
- ABI_ILP32E,
- ABI_LP64,
- ABI_LP64F,
- ABI_LP64D,
- ABI_Unknown
-};
-
-// Returns the target ABI, or else a StringError if the requested ABIName is
-// not supported for the given TT and FeatureBits combination.
-ABI computeTargetABI(const Triple &TT, FeatureBitset FeatureBits,
- StringRef ABIName);
-
-ABI getTargetABI(StringRef ABIName);
-
-// Returns the register used to hold the stack pointer after realignment.
-Register getBPReg();
-
-} // namespace RISCVABI
-
-namespace RISCVFeatures {
-
-// Validates if the given combination of features are valid for the target
-// triple. Exits with report_fatal_error if not.
-void validate(const Triple &TT, const FeatureBitset &FeatureBits);
-
-} // namespace RISCVFeatures
-
-} // namespace llvm
-
-#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 16e159621672..5f1bf316e871 100644
--- a/contrib/llvm-project/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -82,6 +82,11 @@ class SparcAsmParser : public MCTargetAsmParser {
OperandMatchResultTy parseMembarTag(OperandVector &Operands);
+ template <unsigned N>
+ OperandMatchResultTy parseShiftAmtImm(OperandVector &Operands);
+
+ OperandMatchResultTy parseCallTarget(OperandVector &Operands);
+
OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Name);
OperandMatchResultTy
@@ -262,6 +267,36 @@ public:
bool isMEMri() const { return Kind == k_MemoryImm; }
bool isMembarTag() const { return Kind == k_Immediate; }
+ bool isCallTarget() const {
+ if (!isImm())
+ return false;
+
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val))
+ return CE->getValue() % 4 == 0;
+
+ return true;
+ }
+
+ bool isShiftAmtImm5() const {
+ if (!isImm())
+ return false;
+
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val))
+ return isUInt<5>(CE->getValue());
+
+ return false;
+ }
+
+ bool isShiftAmtImm6() const {
+ if (!isImm())
+ return false;
+
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val))
+ return isUInt<6>(CE->getValue());
+
+ return false;
+ }
+
bool isIntReg() const {
return (Kind == k_Register && Reg.Kind == rk_IntReg);
}
@@ -343,6 +378,15 @@ public:
addExpr(Inst, Expr);
}
+ void addShiftAmtImm5Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+ void addShiftAmtImm6Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
void addExpr(MCInst &Inst, const MCExpr *Expr) const{
// Add as immediate when possible. Null MCExpr = 0.
if (!Expr)
@@ -377,6 +421,11 @@ public:
addExpr(Inst, Expr);
}
+ void addCallTargetOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
static std::unique_ptr<SparcOperand> CreateToken(StringRef Str, SMLoc S) {
auto Op = std::make_unique<SparcOperand>(k_Token);
Op->Tok.Data = Str.data();
@@ -645,7 +694,7 @@ OperandMatchResultTy SparcAsmParser::tryParseRegister(unsigned &RegNo,
EndLoc = Tok.getEndLoc();
RegNo = 0;
if (getLexer().getKind() != AsmToken::Percent)
- return MatchOperand_Success;
+ return MatchOperand_NoMatch;
Parser.Lex();
unsigned regKind = SparcOperand::rk_None;
if (matchRegisterName(Tok, RegNo, regKind)) {
@@ -729,37 +778,74 @@ ParseDirective(AsmToken DirectiveID)
OperandMatchResultTy
SparcAsmParser::parseMEMOperand(OperandVector &Operands) {
SMLoc S, E;
- unsigned BaseReg = 0;
- if (ParseRegister(BaseReg, S, E)) {
+ std::unique_ptr<SparcOperand> LHS;
+ if (parseSparcAsmOperand(LHS) != MatchOperand_Success)
return MatchOperand_NoMatch;
+
+ // Single immediate operand
+ if (LHS->isImm()) {
+ Operands.push_back(SparcOperand::MorphToMEMri(Sparc::G0, std::move(LHS)));
+ return MatchOperand_Success;
}
- switch (getLexer().getKind()) {
- default: return MatchOperand_NoMatch;
+ if (!LHS->isIntReg()) {
+ Error(LHS->getStartLoc(), "invalid register kind for this operand");
+ return MatchOperand_ParseFail;
+ }
- case AsmToken::Comma:
- case AsmToken::RBrac:
- case AsmToken::EndOfStatement:
- Operands.push_back(SparcOperand::CreateMEMr(BaseReg, S, E));
- return MatchOperand_Success;
+ AsmToken Tok = getLexer().getTok();
+ // The plus token may be followed by a register or an immediate value; a
+ // minus token is always interpreted as the sign of the immediate value.
+ if (Tok.is(AsmToken::Plus) || Tok.is(AsmToken::Minus)) {
+ (void)Parser.parseOptionalToken(AsmToken::Plus);
- case AsmToken:: Plus:
- Parser.Lex(); // Eat the '+'
- break;
- case AsmToken::Minus:
- break;
+ std::unique_ptr<SparcOperand> RHS;
+ if (parseSparcAsmOperand(RHS) != MatchOperand_Success)
+ return MatchOperand_NoMatch;
+
+ if (RHS->isReg() && !RHS->isIntReg()) {
+ Error(RHS->getStartLoc(), "invalid register kind for this operand");
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(
+ RHS->isImm()
+ ? SparcOperand::MorphToMEMri(LHS->getReg(), std::move(RHS))
+ : SparcOperand::MorphToMEMrr(LHS->getReg(), std::move(RHS)));
+
+ return MatchOperand_Success;
}
- std::unique_ptr<SparcOperand> Offset;
- OperandMatchResultTy ResTy = parseSparcAsmOperand(Offset);
- if (ResTy != MatchOperand_Success || !Offset)
+ Operands.push_back(SparcOperand::CreateMEMr(LHS->getReg(), S, E));
+ return MatchOperand_Success;
+}
+
+template <unsigned N>
+OperandMatchResultTy SparcAsmParser::parseShiftAmtImm(OperandVector &Operands) {
+ SMLoc S = Parser.getTok().getLoc();
+ SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1);
+
+ // This is a register, not an immediate
+ if (getLexer().getKind() == AsmToken::Percent)
return MatchOperand_NoMatch;
- Operands.push_back(
- Offset->isImm() ? SparcOperand::MorphToMEMri(BaseReg, std::move(Offset))
- : SparcOperand::MorphToMEMrr(BaseReg, std::move(Offset)));
+ const MCExpr *Expr;
+ if (getParser().parseExpression(Expr))
+ return MatchOperand_ParseFail;
+
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+ if (!CE) {
+ Error(S, "constant expression expected");
+ return MatchOperand_ParseFail;
+ }
+
+ if (!isUInt<N>(CE->getValue())) {
+ Error(S, "immediate shift value out of range");
+ return MatchOperand_ParseFail;
+ }
+ Operands.push_back(SparcOperand::CreateImm(Expr, S, E));
return MatchOperand_Success;
}
@@ -809,6 +895,33 @@ OperandMatchResultTy SparcAsmParser::parseMembarTag(OperandVector &Operands) {
return MatchOperand_Success;
}
+OperandMatchResultTy SparcAsmParser::parseCallTarget(OperandVector &Operands) {
+ SMLoc S = Parser.getTok().getLoc();
+ SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1);
+
+ switch (getLexer().getKind()) {
+ default:
+ return MatchOperand_NoMatch;
+ case AsmToken::LParen:
+ case AsmToken::Integer:
+ case AsmToken::Identifier:
+ case AsmToken::Dot:
+ break;
+ }
+
+ const MCExpr *DestValue;
+ if (getParser().parseExpression(DestValue))
+ return MatchOperand_NoMatch;
+
+ bool IsPic = getContext().getObjectFileInfo()->isPositionIndependent();
+ SparcMCExpr::VariantKind Kind =
+ IsPic ? SparcMCExpr::VK_Sparc_WPLT30 : SparcMCExpr::VK_Sparc_WDISP30;
+
+ const MCExpr *DestExpr = SparcMCExpr::create(Kind, DestValue, getContext());
+ Operands.push_back(SparcOperand::CreateImm(DestExpr, S, E));
+ return MatchOperand_Success;
+}
+
OperandMatchResultTy
SparcAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
@@ -936,6 +1049,7 @@ SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op,
}
break;
+ case AsmToken::Plus:
case AsmToken::Minus:
case AsmToken::Integer:
case AsmToken::LParen:
@@ -1272,7 +1386,7 @@ const SparcMCExpr *
SparcAsmParser::adjustPICRelocation(SparcMCExpr::VariantKind VK,
const MCExpr *subExpr) {
// When in PIC mode, "%lo(...)" and "%hi(...)" behave differently.
- // If the expression refers contains _GLOBAL_OFFSETE_TABLE, it is
+ // If the expression contains _GLOBAL_OFFSET_TABLE, it is
// actually a %pc10 or %pc22 relocation. Otherwise, they are interpreted
// as %got10 or %got22 relocation.
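For reference, the parseShiftAmtImm<N> parser added above accepts only constant expressions that fit in N bits (5 for 32-bit shifts, 6 for 64-bit shifts). A tiny illustration of the isUInt<N> bound it relies on (example only, not part of the patch):

#include "llvm/Support/MathExtras.h"

// 5-bit shift amounts cover 0..31; 63 only fits the 6-bit (64-bit) encoding.
static_assert(llvm::isUInt<5>(31), "largest 5-bit shift amount");
static_assert(!llvm::isUInt<5>(63) && llvm::isUInt<6>(63),
              "63 needs the 6-bit shift form");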
diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/LeonPasses.cpp b/contrib/llvm-project/llvm/lib/Target/Sparc/LeonPasses.cpp
index e9d3aaeb9cfe..6ad6940c6b51 100644
--- a/contrib/llvm-project/llvm/lib/Target/Sparc/LeonPasses.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Sparc/LeonPasses.cpp
@@ -10,14 +10,13 @@
//===----------------------------------------------------------------------===//
#include "LeonPasses.h"
-#include "llvm/CodeGen/ISDOpcodes.h"
+#include "SparcSubtarget.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/raw_ostream.h"
+
using namespace llvm;
LEONMachineFunctionPass::LEONMachineFunctionPass(char &ID)
diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/LeonPasses.h b/contrib/llvm-project/llvm/lib/Target/Sparc/LeonPasses.h
index b165bc93780f..9bc4569a1298 100644
--- a/contrib/llvm-project/llvm/lib/Target/Sparc/LeonPasses.h
+++ b/contrib/llvm-project/llvm/lib/Target/Sparc/LeonPasses.h
@@ -12,14 +12,11 @@
#ifndef LLVM_LIB_TARGET_SPARC_LEON_PASSES_H
#define LLVM_LIB_TARGET_SPARC_LEON_PASSES_H
-#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/Passes.h"
-
-#include "Sparc.h"
-#include "SparcSubtarget.h"
namespace llvm {
+class SparcSubtarget;
+
class LLVM_LIBRARY_VISIBILITY LEONMachineFunctionPass
: public MachineFunctionPass {
protected:
@@ -33,13 +30,11 @@ protected:
protected:
LEONMachineFunctionPass(char &ID);
- int GetRegIndexForOperand(MachineInstr &MI, int OperandIndex);
void clearUsedRegisterList() { UsedRegisters.clear(); }
void markRegisterUsed(int registerIndex) {
UsedRegisters.push_back(registerIndex);
}
- int getUnusedFPRegister(MachineRegisterInfo &MRI);
};
class LLVM_LIBRARY_VISIBILITY InsertNOPLoad : public LEONMachineFunctionPass {
diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
index 83c44e0682ce..5a9ecfe74ecc 100644
--- a/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -255,12 +255,6 @@ namespace {
}
}
- bool mayNeedRelaxation(const MCInst &Inst,
- const MCSubtargetInfo &STI) const override {
- // FIXME.
- return false;
- }
-
/// fixupNeedsRelaxation - Target specific predicate for whether a given
/// fixup requires the associated instruction to be relaxed.
bool fixupNeedsRelaxation(const MCFixup &Fixup,
diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp
index f6728a070736..8e4621946008 100644
--- a/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp
@@ -141,24 +141,34 @@ void SparcInstPrinter::printOperand(const MCInst *MI, int opNum,
void SparcInstPrinter::printMemOperand(const MCInst *MI, int opNum,
const MCSubtargetInfo &STI,
raw_ostream &O, const char *Modifier) {
- printOperand(MI, opNum, STI, O);
-
// If this is an ADD operand, emit it like normal operands.
if (Modifier && !strcmp(Modifier, "arith")) {
+ printOperand(MI, opNum, STI, O);
O << ", ";
- printOperand(MI, opNum+1, STI, O);
+ printOperand(MI, opNum + 1, STI, O);
return;
}
- const MCOperand &MO = MI->getOperand(opNum+1);
- if (MO.isReg() && MO.getReg() == SP::G0)
- return; // don't print "+%g0"
- if (MO.isImm() && MO.getImm() == 0)
- return; // don't print "+0"
+ const MCOperand &Op1 = MI->getOperand(opNum);
+ const MCOperand &Op2 = MI->getOperand(opNum + 1);
+
+ bool PrintedFirstOperand = false;
+ if (Op1.isReg() && Op1.getReg() != SP::G0) {
+ printOperand(MI, opNum, STI, O);
+ PrintedFirstOperand = true;
+ }
- O << "+";
+ // Skip the second operand iff it adds nothing (literal 0 or %g0) and we've
+ // already printed the first one
+ const bool SkipSecondOperand =
+ PrintedFirstOperand && ((Op2.isReg() && Op2.getReg() == SP::G0) ||
+ (Op2.isImm() && Op2.getImm() == 0));
- printOperand(MI, opNum+1, STI, O);
+ if (!SkipSecondOperand) {
+ if (PrintedFirstOperand)
+ O << '+';
+ printOperand(MI, opNum + 1, STI, O);
+ }
}
void SparcInstPrinter::printCCOperand(const MCInst *MI, int opNum,
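The rewritten printMemOperand above drops redundant address components rather than always printing both. Illustrative outputs, derived from the logic above and not present in the patch:

//   base %i0, offset 4    ->  "%i0+4"
//   base %i0, offset %g0  ->  "%i0"   (the "+%g0" is skipped)
//   base %i0, offset 0    ->  "%i0"   (the "+0" is skipped)
//   base %g0, offset 16   ->  "16"    (no leading '+')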
diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h
index 11587f165ef2..91b78bd03fc3 100644
--- a/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h
@@ -31,6 +31,7 @@ public:
bool isV9(const MCSubtargetInfo &STI) const;
// Autogenerated by tblgen.
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
void printInstruction(const MCInst *MI, uint64_t Address,
const MCSubtargetInfo &STI, raw_ostream &O);
bool printAliasInstr(const MCInst *MI, uint64_t Address,
diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
index 7e908011bd50..9f8522541332 100644
--- a/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -22,6 +22,7 @@
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
@@ -68,13 +69,15 @@ public:
unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
-
unsigned getCallTargetOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
unsigned getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ unsigned getSImm13OpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getBranchPredTargetOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
@@ -146,20 +149,50 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
return 0;
}
+unsigned
+SparcMCCodeEmitter::getSImm13OpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ if (MO.isImm())
+ return MO.getImm();
+
+ assert(MO.isExpr() &&
+ "getSImm13OpValue expects only expressions or an immediate");
+
+ const MCExpr *Expr = MO.getExpr();
+
+ // Constant value, no fixup is needed
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+ return CE->getValue();
+
+ MCFixupKind Kind;
+ if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Expr)) {
+ Kind = MCFixupKind(SExpr->getFixupKind());
+ } else {
+ bool IsPic = Ctx.getObjectFileInfo()->isPositionIndependent();
+ Kind = IsPic ? MCFixupKind(Sparc::fixup_sparc_got13)
+ : MCFixupKind(Sparc::fixup_sparc_13);
+ }
+
+ Fixups.push_back(MCFixup::create(0, Expr, Kind));
+ return 0;
+}
+
unsigned SparcMCCodeEmitter::
getCallTargetOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpNo);
- if (MO.isReg() || MO.isImm())
- return getMachineOpValue(MI, MO, Fixups, STI);
+ const MCExpr *Expr = MO.getExpr();
+ const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Expr);
if (MI.getOpcode() == SP::TLS_CALL) {
// No fixups for __tls_get_addr. Fixups for tls_symbol are emitted in
// encodeInstruction.
#ifndef NDEBUG
// Verify that the callee is actually __tls_get_addr.
- const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(MO.getExpr());
assert(SExpr && SExpr->getSubExpr()->getKind() == MCExpr::SymbolRef &&
"Unexpected expression in TLS_CALL");
const MCSymbolRefExpr *SymExpr = cast<MCSymbolRefExpr>(SExpr->getSubExpr());
@@ -169,15 +202,8 @@ getCallTargetOpValue(const MCInst &MI, unsigned OpNo,
return 0;
}
- MCFixupKind fixupKind = (MCFixupKind)Sparc::fixup_sparc_call30;
-
- if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(MO.getExpr())) {
- if (SExpr->getKind() == SparcMCExpr::VK_Sparc_WPLT30)
- fixupKind = (MCFixupKind)Sparc::fixup_sparc_wplt30;
- }
-
- Fixups.push_back(MCFixup::create(0, MO.getExpr(), fixupKind));
-
+ MCFixupKind Kind = MCFixupKind(SExpr->getFixupKind());
+ Fixups.push_back(MCFixup::create(0, Expr, Kind));
return 0;
}
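In summary, the new getSImm13OpValue() above resolves the 13-bit immediate field in this order (illustrative summary, not part of the patch):

//   1. MCOperand is a plain immediate  -> encode the value directly.
//   2. MCConstantExpr                  -> encode its constant; no fixup.
//   3. SparcMCExpr (e.g. %lo(sym))     -> use that expression's own fixup kind.
//   4. Any other expression            -> fixup_sparc_got13 under PIC,
//                                         fixup_sparc_13 otherwise.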
diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index 00f319fc37e1..b84ecf074455 100644
--- a/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -41,46 +41,46 @@ void SparcMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
bool SparcMCExpr::printVariantKind(raw_ostream &OS, VariantKind Kind)
{
- bool closeParen = true;
switch (Kind) {
- case VK_Sparc_None: closeParen = false; break;
- case VK_Sparc_LO: OS << "%lo("; break;
- case VK_Sparc_HI: OS << "%hi("; break;
- case VK_Sparc_H44: OS << "%h44("; break;
- case VK_Sparc_M44: OS << "%m44("; break;
- case VK_Sparc_L44: OS << "%l44("; break;
- case VK_Sparc_HH: OS << "%hh("; break;
- case VK_Sparc_HM: OS << "%hm("; break;
+ case VK_Sparc_None: return false;
+ case VK_Sparc_LO: OS << "%lo("; return true;
+ case VK_Sparc_HI: OS << "%hi("; return true;
+ case VK_Sparc_H44: OS << "%h44("; return true;
+ case VK_Sparc_M44: OS << "%m44("; return true;
+ case VK_Sparc_L44: OS << "%l44("; return true;
+ case VK_Sparc_HH: OS << "%hh("; return true;
+ case VK_Sparc_HM: OS << "%hm("; return true;
// FIXME: use %pc22/%pc10, if system assembler supports them.
- case VK_Sparc_PC22: OS << "%hi("; break;
- case VK_Sparc_PC10: OS << "%lo("; break;
+ case VK_Sparc_PC22: OS << "%hi("; return true;
+ case VK_Sparc_PC10: OS << "%lo("; return true;
// FIXME: use %got22/%got10, if system assembler supports them.
- case VK_Sparc_GOT22: OS << "%hi("; break;
- case VK_Sparc_GOT10: OS << "%lo("; break;
- case VK_Sparc_GOT13: closeParen = false; break;
- case VK_Sparc_13: closeParen = false; break;
- case VK_Sparc_WPLT30: closeParen = false; break;
- case VK_Sparc_R_DISP32: OS << "%r_disp32("; break;
- case VK_Sparc_TLS_GD_HI22: OS << "%tgd_hi22("; break;
- case VK_Sparc_TLS_GD_LO10: OS << "%tgd_lo10("; break;
- case VK_Sparc_TLS_GD_ADD: OS << "%tgd_add("; break;
- case VK_Sparc_TLS_GD_CALL: OS << "%tgd_call("; break;
- case VK_Sparc_TLS_LDM_HI22: OS << "%tldm_hi22("; break;
- case VK_Sparc_TLS_LDM_LO10: OS << "%tldm_lo10("; break;
- case VK_Sparc_TLS_LDM_ADD: OS << "%tldm_add("; break;
- case VK_Sparc_TLS_LDM_CALL: OS << "%tldm_call("; break;
- case VK_Sparc_TLS_LDO_HIX22: OS << "%tldo_hix22("; break;
- case VK_Sparc_TLS_LDO_LOX10: OS << "%tldo_lox10("; break;
- case VK_Sparc_TLS_LDO_ADD: OS << "%tldo_add("; break;
- case VK_Sparc_TLS_IE_HI22: OS << "%tie_hi22("; break;
- case VK_Sparc_TLS_IE_LO10: OS << "%tie_lo10("; break;
- case VK_Sparc_TLS_IE_LD: OS << "%tie_ld("; break;
- case VK_Sparc_TLS_IE_LDX: OS << "%tie_ldx("; break;
- case VK_Sparc_TLS_IE_ADD: OS << "%tie_add("; break;
- case VK_Sparc_TLS_LE_HIX22: OS << "%tle_hix22("; break;
- case VK_Sparc_TLS_LE_LOX10: OS << "%tle_lox10("; break;
+ case VK_Sparc_GOT22: OS << "%hi("; return true;
+ case VK_Sparc_GOT10: OS << "%lo("; return true;
+ case VK_Sparc_GOT13: return false;
+ case VK_Sparc_13: return false;
+ case VK_Sparc_WDISP30: return false;
+ case VK_Sparc_WPLT30: return false;
+ case VK_Sparc_R_DISP32: OS << "%r_disp32("; return true;
+ case VK_Sparc_TLS_GD_HI22: OS << "%tgd_hi22("; return true;
+ case VK_Sparc_TLS_GD_LO10: OS << "%tgd_lo10("; return true;
+ case VK_Sparc_TLS_GD_ADD: OS << "%tgd_add("; return true;
+ case VK_Sparc_TLS_GD_CALL: OS << "%tgd_call("; return true;
+ case VK_Sparc_TLS_LDM_HI22: OS << "%tldm_hi22("; return true;
+ case VK_Sparc_TLS_LDM_LO10: OS << "%tldm_lo10("; return true;
+ case VK_Sparc_TLS_LDM_ADD: OS << "%tldm_add("; return true;
+ case VK_Sparc_TLS_LDM_CALL: OS << "%tldm_call("; return true;
+ case VK_Sparc_TLS_LDO_HIX22: OS << "%tldo_hix22("; return true;
+ case VK_Sparc_TLS_LDO_LOX10: OS << "%tldo_lox10("; return true;
+ case VK_Sparc_TLS_LDO_ADD: OS << "%tldo_add("; return true;
+ case VK_Sparc_TLS_IE_HI22: OS << "%tie_hi22("; return true;
+ case VK_Sparc_TLS_IE_LO10: OS << "%tie_lo10("; return true;
+ case VK_Sparc_TLS_IE_LD: OS << "%tie_ld("; return true;
+ case VK_Sparc_TLS_IE_LDX: OS << "%tie_ldx("; return true;
+ case VK_Sparc_TLS_IE_ADD: OS << "%tie_add("; return true;
+ case VK_Sparc_TLS_LE_HIX22: OS << "%tle_hix22("; return true;
+ case VK_Sparc_TLS_LE_LOX10: OS << "%tle_lox10("; return true;
}
- return closeParen;
+ llvm_unreachable("Unhandled SparcMCExpr::VariantKind");
}
SparcMCExpr::VariantKind SparcMCExpr::parseVariantKind(StringRef name)
@@ -137,6 +137,7 @@ Sparc::Fixups SparcMCExpr::getFixupKind(SparcMCExpr::VariantKind Kind) {
case VK_Sparc_GOT13: return Sparc::fixup_sparc_got13;
case VK_Sparc_13: return Sparc::fixup_sparc_13;
case VK_Sparc_WPLT30: return Sparc::fixup_sparc_wplt30;
+ case VK_Sparc_WDISP30: return Sparc::fixup_sparc_call30;
case VK_Sparc_TLS_GD_HI22: return Sparc::fixup_sparc_tls_gd_hi22;
case VK_Sparc_TLS_GD_LO10: return Sparc::fixup_sparc_tls_gd_lo10;
case VK_Sparc_TLS_GD_ADD: return Sparc::fixup_sparc_tls_gd_add;
@@ -205,10 +206,8 @@ void SparcMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
MCSymbol *Symbol = Asm.getContext().getOrCreateSymbol("__tls_get_addr");
Asm.registerSymbol(*Symbol);
auto ELFSymbol = cast<MCSymbolELF>(Symbol);
- if (!ELFSymbol->isBindingSet()) {
+ if (!ELFSymbol->isBindingSet())
ELFSymbol->setBinding(ELF::STB_GLOBAL);
- ELFSymbol->setExternal(true);
- }
LLVM_FALLTHROUGH;
}
case VK_Sparc_TLS_GD_HI22:
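For reference, the rewritten printVariantKind above prints an optional relocation prefix and now reports directly whether the caller must emit a closing parenthesis, with llvm_unreachable guarding unhandled kinds. A minimal standalone sketch of that pattern (illustrative names only, not the LLVM sources themselves):

#include <cassert>
#include <iostream>

enum class VariantKind { None, Lo, Hi, Got13 };

// Print the operand prefix, if any, and report whether the caller
// must emit a matching ")" after the expression.
static bool printVariantKindPrefix(std::ostream &OS, VariantKind Kind) {
  switch (Kind) {
  case VariantKind::None:  return false;         // no wrapper at all
  case VariantKind::Got13: return false;         // bare relocation, no paren
  case VariantKind::Lo:    OS << "%lo(";  return true;
  case VariantKind::Hi:    OS << "%hi(";  return true;
  }
  assert(false && "Unhandled VariantKind");      // mirrors llvm_unreachable
  return false;
}

int main() {
  bool CloseParen = printVariantKindPrefix(std::cout, VariantKind::Hi);
  std::cout << "sym";
  if (CloseParen)
    std::cout << ')';
  std::cout << '\n';                             // prints: %hi(sym)
}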
diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
index c2467faca257..76603530e521 100644
--- a/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
+++ b/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
@@ -38,6 +38,7 @@ public:
VK_Sparc_GOT13,
VK_Sparc_13,
VK_Sparc_WPLT30,
+ VK_Sparc_WDISP30,
VK_Sparc_R_DISP32,
VK_Sparc_TLS_GD_HI22,
VK_Sparc_TLS_GD_LO10,
diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
index fb2bcdc6c91b..9531e3105fe2 100644
--- a/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
@@ -68,7 +68,7 @@ static MCSubtargetInfo *
createSparcMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
if (CPU.empty())
CPU = (TT.getArch() == Triple::sparcv9) ? "v9" : "v8";
- return createSparcMCSubtargetInfoImpl(TT, CPU, FS);
+ return createSparcMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS);
}
static MCTargetStreamer *
diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
index 7845a18b14c1..ee0b85292cfd 100644
--- a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -80,7 +80,7 @@ static MCOperand createSparcMCOperand(SparcMCExpr::VariantKind Kind,
}
static MCOperand createPCXCallOP(MCSymbol *Label,
MCContext &OutContext) {
- return createSparcMCOperand(SparcMCExpr::VK_Sparc_None, Label, OutContext);
+ return createSparcMCOperand(SparcMCExpr::VK_Sparc_WDISP30, Label, OutContext);
}
static MCOperand createPCXRelExprOp(SparcMCExpr::VariantKind Kind,
diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
index 8d8424641cd9..63187fdce999 100644
--- a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -257,9 +257,9 @@ bool SparcFrameLowering::hasFP(const MachineFunction &MF) const {
MFI.isFrameAddressTaken();
}
-int SparcFrameLowering::getFrameIndexReference(const MachineFunction &MF,
- int FI,
- Register &FrameReg) const {
+StackOffset
+SparcFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const {
const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>();
const MachineFrameInfo &MFI = MF.getFrameInfo();
const SparcRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
@@ -295,10 +295,10 @@ int SparcFrameLowering::getFrameIndexReference(const MachineFunction &MF,
if (UseFP) {
FrameReg = RegInfo->getFrameRegister(MF);
- return FrameOffset;
+ return StackOffset::getFixed(FrameOffset);
} else {
FrameReg = SP::O6; // %sp
- return FrameOffset + MF.getFrameInfo().getStackSize();
+ return StackOffset::getFixed(FrameOffset + MF.getFrameInfo().getStackSize());
}
}
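The Sparc frame-index hook above (and the matching SystemZ change further down) now returns llvm::StackOffset instead of a raw int; callers that only deal with fixed-size offsets unwrap it with getFixed(). A small sketch of that wrapping and unwrapping, assuming the LLVM support headers are available (the helper names here are illustrative, not part of the patch):

#include "llvm/Support/TypeSize.h"
#include <cstdint>

using llvm::StackOffset;

// Wrap a byte offset the way the updated getFrameIndexReference does.
static StackOffset makeFrameOffset(int64_t FrameOffset, int64_t StackSize,
                                   bool UseFP) {
  // FP-relative offsets stay as-is; SP-relative ones are biased by the
  // current stack size, exactly as in the hunk above.
  return UseFP ? StackOffset::getFixed(FrameOffset)
               : StackOffset::getFixed(FrameOffset + StackSize);
}

// Callers such as eliminateFrameIndex only need the fixed component.
static int64_t materializeOffset(StackOffset Offset, int64_t ExtraImm) {
  return Offset.getFixed() + ExtraImm;
}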
diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcFrameLowering.h b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcFrameLowering.h
index 3ec9dc8b85dd..ab0ceb6591c6 100644
--- a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcFrameLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcFrameLowering.h
@@ -15,6 +15,7 @@
#include "Sparc.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/Support/TypeSize.h"
namespace llvm {
@@ -38,8 +39,8 @@ public:
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS = nullptr) const override;
- int getFrameIndexReference(const MachineFunction &MF, int FI,
- Register &FrameReg) const override;
+ StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const override;
/// targetHandlesStackFrameRounding - Returns true if the target is
/// responsible for rounding up the stack frame (probably at emitPrologue
diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index 116352e08382..e5c7794b7d2f 100644
--- a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -939,7 +939,8 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
// If the callee is a GlobalAddress node (quite common, every direct call is)
// turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
// Likewise ExternalSymbol -> TargetExternalSymbol.
- unsigned TF = isPositionIndependent() ? SparcMCExpr::VK_Sparc_WPLT30 : 0;
+ unsigned TF = isPositionIndependent() ? SparcMCExpr::VK_Sparc_WPLT30
+ : SparcMCExpr::VK_Sparc_WDISP30;
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i32, 0, TF);
else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
@@ -1242,7 +1243,8 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
// Likewise ExternalSymbol -> TargetExternalSymbol.
SDValue Callee = CLI.Callee;
bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CB);
- unsigned TF = isPositionIndependent() ? SparcMCExpr::VK_Sparc_WPLT30 : 0;
+ unsigned TF = isPositionIndependent() ? SparcMCExpr::VK_Sparc_WPLT30
+ : SparcMCExpr::VK_Sparc_WDISP30;
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT, 0, TF);
else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
@@ -1877,8 +1879,7 @@ void SparcTargetLowering::computeKnownBitsForTargetNode
Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
// Only known if known in both the LHS and RHS.
- Known.One &= Known2.One;
- Known.Zero &= Known2.Zero;
+ Known = KnownBits::commonBits(Known, Known2);
break;
}
}
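The computeKnownBits change above (and the matching SystemZ hunk later in this diff) replaces the manual `Known.One &= ...; Known.Zero &= ...` pair with KnownBits::commonBits, which keeps only the bit facts both operands agree on. A small sketch of the equivalence, assuming LLVM's Support headers; the concrete values are chosen purely for illustration:

#include "llvm/Support/KnownBits.h"
#include <cassert>

using llvm::APInt;
using llvm::KnownBits;

int main() {
  // LHS: bit 0 known one, bit 1 known zero.  RHS: only bit 0 known one.
  KnownBits LHS(8), RHS(8);
  LHS.One  = APInt(8, 0x01);
  LHS.Zero = APInt(8, 0x02);
  RHS.One  = APInt(8, 0x01);
  RHS.Zero = APInt(8, 0x00);

  // commonBits intersects the known-one and known-zero sets, which is
  // exactly what the removed &= lines computed by hand.
  KnownBits Common = KnownBits::commonBits(LHS, RHS);
  assert(Common.One  == (LHS.One  & RHS.One));
  assert(Common.Zero == (LHS.Zero & RHS.Zero));
  return 0;
}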
@@ -2139,7 +2140,7 @@ SDValue SparcTargetLowering::LowerF128_LibCallArg(SDValue Chain,
int FI = MFI.CreateStackObject(16, Align(8), false);
SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
Chain = DAG.getStore(Chain, DL, Entry.Node, FIPtr, MachinePointerInfo(),
- /* Alignment = */ 8);
+ Align(8));
Entry.Node = FIPtr;
Entry.Ty = PointerType::getUnqual(ArgTy);
@@ -2198,7 +2199,7 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG,
// Load RetPtr to get the return value.
return DAG.getLoad(Op.getValueType(), SDLoc(Op), Chain, RetPtr,
- MachinePointerInfo(), /* Alignment = */ 8);
+ MachinePointerInfo(), Align(8));
}
SDValue SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS,
@@ -2541,8 +2542,9 @@ static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
MachinePointerInfo(SV));
// Load the actual argument out of the pointer VAList.
// We can't count on greater alignment than the word size.
- return DAG.getLoad(VT, DL, InChain, VAList, MachinePointerInfo(),
- std::min(PtrVT.getSizeInBits(), VT.getSizeInBits()) / 8);
+ return DAG.getLoad(
+ VT, DL, InChain, VAList, MachinePointerInfo(),
+ std::min(PtrVT.getFixedSizeInBits(), VT.getFixedSizeInBits()) / 8);
}
static SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
@@ -2731,23 +2733,21 @@ static SDValue LowerF64Op(SDValue SrcReg64, const SDLoc &dl, SelectionDAG &DAG,
static SDValue LowerF128Load(SDValue Op, SelectionDAG &DAG)
{
SDLoc dl(Op);
- LoadSDNode *LdNode = dyn_cast<LoadSDNode>(Op.getNode());
- assert(LdNode && LdNode->getOffset().isUndef()
- && "Unexpected node type");
+ LoadSDNode *LdNode = cast<LoadSDNode>(Op.getNode());
+ assert(LdNode->getOffset().isUndef() && "Unexpected node type");
- unsigned alignment = LdNode->getAlignment();
- if (alignment > 8)
- alignment = 8;
+ Align Alignment = commonAlignment(LdNode->getOriginalAlign(), 8);
SDValue Hi64 =
DAG.getLoad(MVT::f64, dl, LdNode->getChain(), LdNode->getBasePtr(),
- LdNode->getPointerInfo(), alignment);
+ LdNode->getPointerInfo(), Alignment);
EVT addrVT = LdNode->getBasePtr().getValueType();
SDValue LoPtr = DAG.getNode(ISD::ADD, dl, addrVT,
LdNode->getBasePtr(),
DAG.getConstant(8, dl, addrVT));
SDValue Lo64 = DAG.getLoad(MVT::f64, dl, LdNode->getChain(), LoPtr,
- LdNode->getPointerInfo(), alignment);
+ LdNode->getPointerInfo().getWithOffset(8),
+ Alignment);
SDValue SubRegEven = DAG.getTargetConstant(SP::sub_even64, dl, MVT::i32);
SDValue SubRegOdd = DAG.getTargetConstant(SP::sub_odd64, dl, MVT::i32);
@@ -2785,9 +2785,9 @@ static SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG)
// Lower a f128 store into two f64 stores.
static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
- StoreSDNode *StNode = dyn_cast<StoreSDNode>(Op.getNode());
- assert(StNode && StNode->getOffset().isUndef()
- && "Unexpected node type");
+ StoreSDNode *StNode = cast<StoreSDNode>(Op.getNode());
+ assert(StNode->getOffset().isUndef() && "Unexpected node type");
+
SDValue SubRegEven = DAG.getTargetConstant(SP::sub_even64, dl, MVT::i32);
SDValue SubRegOdd = DAG.getTargetConstant(SP::sub_odd64, dl, MVT::i32);
@@ -2802,20 +2802,20 @@ static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) {
StNode->getValue(),
SubRegOdd);
- unsigned alignment = StNode->getAlignment();
- if (alignment > 8)
- alignment = 8;
+ Align Alignment = commonAlignment(StNode->getOriginalAlign(), 8);
SDValue OutChains[2];
OutChains[0] =
DAG.getStore(StNode->getChain(), dl, SDValue(Hi64, 0),
- StNode->getBasePtr(), MachinePointerInfo(), alignment);
+ StNode->getBasePtr(), StNode->getPointerInfo(),
+ Alignment);
EVT addrVT = StNode->getBasePtr().getValueType();
SDValue LoPtr = DAG.getNode(ISD::ADD, dl, addrVT,
StNode->getBasePtr(),
DAG.getConstant(8, dl, addrVT));
OutChains[1] = DAG.getStore(StNode->getChain(), dl, SDValue(Lo64, 0), LoPtr,
- MachinePointerInfo(), alignment);
+ StNode->getPointerInfo().getWithOffset(8),
+ Alignment);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
@@ -2834,7 +2834,8 @@ static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG)
SDValue Val = DAG.getNode(ISD::BITCAST, dl, MVT::v2i32, St->getValue());
SDValue Chain = DAG.getStore(
St->getChain(), dl, Val, St->getBasePtr(), St->getPointerInfo(),
- St->getAlignment(), St->getMemOperand()->getFlags(), St->getAAInfo());
+ St->getOriginalAlign(), St->getMemOperand()->getFlags(),
+ St->getAAInfo());
return Chain;
}
@@ -3400,8 +3401,9 @@ void SparcTargetLowering::ReplaceNodeResults(SDNode *N,
SDLoc dl(N);
SDValue LoadRes = DAG.getExtLoad(
Ld->getExtensionType(), dl, MVT::v2i32, Ld->getChain(),
- Ld->getBasePtr(), Ld->getPointerInfo(), MVT::v2i32, Ld->getAlignment(),
- Ld->getMemOperand()->getFlags(), Ld->getAAInfo());
+ Ld->getBasePtr(), Ld->getPointerInfo(), MVT::v2i32,
+ Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags(),
+ Ld->getAAInfo());
SDValue Res = DAG.getNode(ISD::BITCAST, dl, MVT::i64, LoadRes);
Results.push_back(Res);
diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcInstr64Bit.td b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcInstr64Bit.td
index 9a200a36cd3e..df65c5457c1d 100644
--- a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcInstr64Bit.td
+++ b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcInstr64Bit.td
@@ -42,9 +42,9 @@ def : Pat<(i64 (sext i32:$val)), (SRAri $val, 0)>;
def : Pat<(i64 (and i64:$val, 0xffffffff)), (SRLri $val, 0)>;
def : Pat<(i64 (sext_inreg i64:$val, i32)), (SRAri $val, 0)>;
-defm SLLX : F3_S<"sllx", 0b100101, 1, shl, i64, I64Regs>;
-defm SRLX : F3_S<"srlx", 0b100110, 1, srl, i64, I64Regs>;
-defm SRAX : F3_S<"srax", 0b100111, 1, sra, i64, I64Regs>;
+defm SLLX : F3_S<"sllx", 0b100101, 1, shl, i64, shift_imm6, I64Regs>;
+defm SRLX : F3_S<"srlx", 0b100110, 1, srl, i64, shift_imm6, I64Regs>;
+defm SRAX : F3_S<"srax", 0b100111, 1, sra, i64, shift_imm6, I64Regs>;
} // Predicates = [Is64Bit]
diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcInstrFormats.td b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcInstrFormats.td
index 2d8f063f7ed1..da53307bcb1c 100644
--- a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcInstrFormats.td
+++ b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcInstrFormats.td
@@ -224,13 +224,13 @@ class F3_Si<bits<2> opVal, bits<6> op3val, bit xVal, dag outs, dag ins,
// Define rr and ri shift instructions with patterns.
multiclass F3_S<string OpcStr, bits<6> Op3Val, bit XVal, SDNode OpNode,
- ValueType VT, RegisterClass RC,
+ ValueType VT, ValueType SIT, RegisterClass RC,
InstrItinClass itin = IIC_iu_instr> {
def rr : F3_Sr<2, Op3Val, XVal, (outs RC:$rd), (ins RC:$rs1, IntRegs:$rs2),
!strconcat(OpcStr, " $rs1, $rs2, $rd"),
[(set VT:$rd, (OpNode VT:$rs1, i32:$rs2))],
itin>;
- def ri : F3_Si<2, Op3Val, XVal, (outs RC:$rd), (ins RC:$rs1, i32imm:$shcnt),
+ def ri : F3_Si<2, Op3Val, XVal, (outs RC:$rd), (ins RC:$rs1, SIT:$shcnt),
!strconcat(OpcStr, " $rs1, $shcnt, $rd"),
[(set VT:$rd, (OpNode VT:$rs1, (i32 imm:$shcnt)))],
itin>;
diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcInstrInfo.td
index 8b01313c7911..d1190ae03d2c 100644
--- a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcInstrInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcInstrInfo.td
@@ -113,6 +113,18 @@ def SETHIimm_not : PatLeaf<(i32 imm), [{
def ADDRrr : ComplexPattern<iPTR, 2, "SelectADDRrr", [], []>;
def ADDRri : ComplexPattern<iPTR, 2, "SelectADDRri", [frameindex], []>;
+// Constrained operands for the shift operations.
+class ShiftAmtImmAsmOperand<int Bits> : AsmOperandClass {
+ let Name = "ShiftAmtImm" # Bits;
+ let ParserMethod = "parseShiftAmtImm<" # Bits # ">";
+}
+def shift_imm5 : Operand<i32> {
+ let ParserMatchClass = ShiftAmtImmAsmOperand<5>;
+}
+def shift_imm6 : Operand<i32> {
+ let ParserMatchClass = ShiftAmtImmAsmOperand<6>;
+}
+
// Address operands
def SparcMEMrrAsmOperand : AsmOperandClass {
let Name = "MEMrr";
@@ -160,13 +172,20 @@ def bprtarget16 : Operand<OtherVT> {
let EncoderMethod = "getBranchOnRegTargetOpValue";
}
+def SparcCallTargetAsmOperand : AsmOperandClass {
+ let Name = "CallTarget";
+ let ParserMethod = "parseCallTarget";
+}
+
def calltarget : Operand<i32> {
let EncoderMethod = "getCallTargetOpValue";
let DecoderMethod = "DecodeCall";
+ let ParserMatchClass = SparcCallTargetAsmOperand;
}
def simm13Op : Operand<i32> {
let DecoderMethod = "DecodeSIMM13";
+ let EncoderMethod = "getSImm13OpValue";
}
// Operand for printing out a condition code.
@@ -691,9 +710,9 @@ let Defs = [ICC] in {
}
// Section B.12 - Shift Instructions, p. 107
-defm SLL : F3_12<"sll", 0b100101, shl, IntRegs, i32, simm13Op>;
-defm SRL : F3_12<"srl", 0b100110, srl, IntRegs, i32, simm13Op>;
-defm SRA : F3_12<"sra", 0b100111, sra, IntRegs, i32, simm13Op>;
+defm SLL : F3_S<"sll", 0b100101, 0, shl, i32, shift_imm5, IntRegs>;
+defm SRL : F3_S<"srl", 0b100110, 0, srl, i32, shift_imm5, IntRegs>;
+defm SRA : F3_S<"sra", 0b100111, 0, sra, i32, shift_imm5, IntRegs>;
// Section B.13 - Add Instructions, p. 108
defm ADD : F3_12<"add", 0b000000, add, IntRegs, i32, simm13Op>;
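The shift instructions above now take dedicated shift_imm5/shift_imm6 operands, so the 32-bit sll/srl/sra forms only accept amounts 0-31 while the 64-bit sllx/srlx/srax forms accept 0-63, with parsing routed through the parseShiftAmtImm<Bits> hook named in the operand class. A standalone sketch of the range check such an operand class implies (not the actual parser code):

#include <cassert>
#include <cstdint>

// Returns true when Amt is representable as an unsigned immediate of
// Bits bits, i.e. a legal shift amount for that operand class.
template <unsigned Bits> static bool isLegalShiftAmt(int64_t Amt) {
  return Amt >= 0 && Amt < (int64_t(1) << Bits);
}

int main() {
  assert(isLegalShiftAmt<5>(31));   // "sll %o0, 31, %o1" is accepted
  assert(!isLegalShiftAmt<5>(40));  // rejected for the 32-bit forms
  assert(isLegalShiftAmt<6>(40));   // but legal for sllx/srlx/srax
  return 0;
}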
diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp
index 990dbe23e7ac..21dced23210c 100644
--- a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -175,7 +175,7 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
Register FrameReg;
int Offset;
- Offset = TFI->getFrameIndexReference(MF, FrameIndex, FrameReg);
+ Offset = TFI->getFrameIndexReference(MF, FrameIndex, FrameReg).getFixed();
Offset += MI.getOperand(FIOperandNum + 1).getImm();
diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcSubtarget.cpp
index dbc6cf8e5b86..abc47ef51563 100644
--- a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcSubtarget.cpp
@@ -55,7 +55,7 @@ SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(StringRef CPU,
CPUName = (Is64Bit) ? "v9" : "v8";
// Parse features string.
- ParseSubtargetFeatures(CPUName, FS);
+ ParseSubtargetFeatures(CPUName, /*TuneCPU*/ CPUName, FS);
// Popc is a v9-only instruction.
if (!IsV9)
@@ -67,9 +67,9 @@ SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(StringRef CPU,
SparcSubtarget::SparcSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const TargetMachine &TM,
bool is64Bit)
- : SparcGenSubtargetInfo(TT, CPU, FS), TargetTriple(TT), Is64Bit(is64Bit),
- InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
- FrameLowering(*this) {}
+ : SparcGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), TargetTriple(TT),
+ Is64Bit(is64Bit), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
+ TLInfo(TM, *this), FrameLowering(*this) {}
int SparcSubtarget::getAdjustedFrameSize(int frameSize) const {
diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcSubtarget.h b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcSubtarget.h
index db19f99e3c9c..82a4aa510355 100644
--- a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcSubtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcSubtarget.h
@@ -16,8 +16,8 @@
#include "SparcFrameLowering.h"
#include "SparcISelLowering.h"
#include "SparcInstrInfo.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
-#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
#include <string>
@@ -101,7 +101,7 @@ public:
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
SparcSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
bool is64Bit() const { return Is64Bit; }
diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
index d48d94e2faf1..ae5228db5827 100644
--- a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -55,9 +55,7 @@ static std::string computeDataLayout(const Triple &T, bool is64Bit) {
}
static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
- if (!RM.hasValue())
- return Reloc::Static;
- return *RM;
+ return RM.getValueOr(Reloc::Static);
}
// Code models. Some only make sense for 64-bit code.
@@ -111,12 +109,10 @@ SparcTargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
Attribute FSAttr = F.getFnAttribute("target-features");
- std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
- ? CPUAttr.getValueAsString().str()
- : TargetCPU;
- std::string FS = !FSAttr.hasAttribute(Attribute::None)
- ? FSAttr.getValueAsString().str()
- : TargetFS;
+ std::string CPU =
+ CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU;
+ std::string FS =
+ FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS;
// FIXME: This is related to the code below to reset the target options,
// we need to know whether or not the soft float flag is set on the
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index d5a3a19446c7..2b815a366ccd 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -565,7 +565,7 @@ struct InsnMatchEntry {
StringRef Format;
uint64_t Opcode;
int32_t NumOperands;
- MatchClassKind OperandKinds[5];
+ MatchClassKind OperandKinds[7];
};
// For equal_range comparison.
@@ -633,7 +633,20 @@ static struct InsnMatchEntry InsnMatchTable[] = {
{ "sse", SystemZ::InsnSSE, 3,
{ MCK_U48Imm, MCK_BDAddr64Disp12, MCK_BDAddr64Disp12 } },
{ "ssf", SystemZ::InsnSSF, 4,
- { MCK_U48Imm, MCK_BDAddr64Disp12, MCK_BDAddr64Disp12, MCK_AnyReg } }
+ { MCK_U48Imm, MCK_BDAddr64Disp12, MCK_BDAddr64Disp12, MCK_AnyReg } },
+ { "vri", SystemZ::InsnVRI, 6,
+ { MCK_U48Imm, MCK_VR128, MCK_VR128, MCK_U12Imm, MCK_U4Imm, MCK_U4Imm } },
+ { "vrr", SystemZ::InsnVRR, 7,
+ { MCK_U48Imm, MCK_VR128, MCK_VR128, MCK_VR128, MCK_U4Imm, MCK_U4Imm,
+ MCK_U4Imm } },
+ { "vrs", SystemZ::InsnVRS, 5,
+ { MCK_U48Imm, MCK_AnyReg, MCK_VR128, MCK_BDAddr64Disp12, MCK_U4Imm } },
+ { "vrv", SystemZ::InsnVRV, 4,
+ { MCK_U48Imm, MCK_VR128, MCK_BDVAddr64Disp12, MCK_U4Imm } },
+ { "vrx", SystemZ::InsnVRX, 4,
+ { MCK_U48Imm, MCK_VR128, MCK_BDXAddr64Disp12, MCK_U4Imm } },
+ { "vsi", SystemZ::InsnVSI, 4,
+ { MCK_U48Imm, MCK_VR128, MCK_BDAddr64Disp12, MCK_U8Imm } }
};
static void printMCExpr(const MCExpr *E, raw_ostream &OS) {
@@ -838,10 +851,11 @@ SystemZAsmParser::parseRegister(OperandVector &Operands, RegisterKind Kind) {
// Parse any type of register (including integers) and add it to Operands.
OperandMatchResultTy
SystemZAsmParser::parseAnyRegister(OperandVector &Operands) {
+ SMLoc StartLoc = Parser.getTok().getLoc();
+
// Handle integer values.
if (Parser.getTok().is(AsmToken::Integer)) {
const MCExpr *Register;
- SMLoc StartLoc = Parser.getTok().getLoc();
if (Parser.parseExpression(Register))
return MatchOperand_ParseFail;
@@ -863,6 +877,11 @@ SystemZAsmParser::parseAnyRegister(OperandVector &Operands) {
if (parseRegister(Reg))
return MatchOperand_ParseFail;
+ if (Reg.Num > 15) {
+ Error(StartLoc, "invalid register");
+ return MatchOperand_ParseFail;
+ }
+
// Map to the correct register kind.
RegisterKind Kind;
unsigned RegNo;
@@ -1195,10 +1214,14 @@ bool SystemZAsmParser::ParseDirectiveInsn(SMLoc L) {
OperandMatchResultTy ResTy;
if (Kind == MCK_AnyReg)
ResTy = parseAnyReg(Operands);
+ else if (Kind == MCK_VR128)
+ ResTy = parseVR128(Operands);
else if (Kind == MCK_BDXAddr64Disp12 || Kind == MCK_BDXAddr64Disp20)
ResTy = parseBDXAddr64(Operands);
else if (Kind == MCK_BDAddr64Disp12 || Kind == MCK_BDAddr64Disp20)
ResTy = parseBDAddr64(Operands);
+ else if (Kind == MCK_BDVAddr64Disp12)
+ ResTy = parseBDVAddr64(Operands);
else if (Kind == MCK_PCRel32)
ResTy = parsePCRel32(Operands);
else if (Kind == MCK_PCRel16)
@@ -1243,6 +1266,8 @@ bool SystemZAsmParser::ParseDirectiveInsn(SMLoc L) {
ZOperand.addBDAddrOperands(Inst, 2);
else if (ZOperand.isMem(BDXMem))
ZOperand.addBDXAddrOperands(Inst, 3);
+ else if (ZOperand.isMem(BDVMem))
+ ZOperand.addBDVAddrOperands(Inst, 3);
else if (ZOperand.isImm())
ZOperand.addImmOperands(Inst, 1);
else
@@ -1297,6 +1322,11 @@ OperandMatchResultTy SystemZAsmParser::tryParseRegister(unsigned &RegNo,
bool SystemZAsmParser::ParseInstruction(ParseInstructionInfo &Info,
StringRef Name, SMLoc NameLoc,
OperandVector &Operands) {
+
+ // Apply mnemonic aliases first, before doing anything else, in
+  // case the target uses them.
+ applyMnemonicAliases(Name, getAvailableFeatures(), 0 /*VariantID*/);
+
Operands.push_back(SystemZOperand::createToken(Name, NameLoc));
// Read the remaining operands.
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
index e42aa14fe589..e81db1030c01 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
@@ -468,8 +468,10 @@ DecodeStatus SystemZDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
// Read any remaining bytes.
- if (Bytes.size() < Size)
+ if (Bytes.size() < Size) {
+ Size = Bytes.size();
return MCDisassembler::Fail;
+ }
// Construct the instruction.
uint64_t Inst = 0;
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h
index cfe1bd89c3eb..0db7279a06c1 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h
@@ -27,6 +27,7 @@ public:
: MCInstPrinter(MAI, MII, MRI) {}
// Automatically generated by tblgen.
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
index e62f5040898f..5f276f793578 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -54,10 +54,6 @@ public:
const MCValue &Target, MutableArrayRef<char> Data,
uint64_t Value, bool IsResolved,
const MCSubtargetInfo *STI) const override;
- bool mayNeedRelaxation(const MCInst &Inst,
- const MCSubtargetInfo &STI) const override {
- return false;
- }
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *Fragment,
const MCAsmLayout &Layout) const override {
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
index e540ff4e4811..76df8cf0f3b2 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -17,6 +17,8 @@ SystemZMCAsmInfo::SystemZMCAsmInfo(const Triple &TT) {
CalleeSaveStackSlotSize = 8;
IsLittleEndian = false;
+ MaxInstLength = 6;
+
CommentString = "#";
ZeroDirective = "\t.space\t";
Data64bitsDirective = "\t.quad\t";
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index f2ef1ad6c698..5c191d17ebc5 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -171,7 +171,7 @@ static MCRegisterInfo *createSystemZMCRegisterInfo(const Triple &TT) {
static MCSubtargetInfo *
createSystemZMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
- return createSystemZMCSubtargetInfoImpl(TT, CPU, FS);
+ return createSystemZMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS);
}
static MCInstPrinter *createSystemZMCInstPrinter(const Triple &T,
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index 4109bfc11337..584737e1d940 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -236,14 +236,15 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
break;
case SystemZ::CallBR:
- LoweredMI = MCInstBuilder(SystemZ::BR).addReg(SystemZ::R1D);
+ LoweredMI = MCInstBuilder(SystemZ::BR)
+ .addReg(MI->getOperand(0).getReg());
break;
case SystemZ::CallBCR:
LoweredMI = MCInstBuilder(SystemZ::BCR)
.addImm(MI->getOperand(0).getImm())
.addImm(MI->getOperand(1).getImm())
- .addReg(SystemZ::R1D);
+ .addReg(MI->getOperand(2).getReg());
break;
case SystemZ::CRBCall:
@@ -251,7 +252,7 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
.addReg(MI->getOperand(0).getReg())
.addReg(MI->getOperand(1).getReg())
.addImm(MI->getOperand(2).getImm())
- .addReg(SystemZ::R1D)
+ .addReg(MI->getOperand(3).getReg())
.addImm(0);
break;
@@ -260,7 +261,7 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
.addReg(MI->getOperand(0).getReg())
.addReg(MI->getOperand(1).getReg())
.addImm(MI->getOperand(2).getImm())
- .addReg(SystemZ::R1D)
+ .addReg(MI->getOperand(3).getReg())
.addImm(0);
break;
@@ -269,7 +270,7 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
.addReg(MI->getOperand(0).getReg())
.addImm(MI->getOperand(1).getImm())
.addImm(MI->getOperand(2).getImm())
- .addReg(SystemZ::R1D)
+ .addReg(MI->getOperand(3).getReg())
.addImm(0);
break;
@@ -278,7 +279,7 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
.addReg(MI->getOperand(0).getReg())
.addImm(MI->getOperand(1).getImm())
.addImm(MI->getOperand(2).getImm())
- .addReg(SystemZ::R1D)
+ .addReg(MI->getOperand(3).getReg())
.addImm(0);
break;
@@ -287,7 +288,7 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
.addReg(MI->getOperand(0).getReg())
.addReg(MI->getOperand(1).getReg())
.addImm(MI->getOperand(2).getImm())
- .addReg(SystemZ::R1D)
+ .addReg(MI->getOperand(3).getReg())
.addImm(0);
break;
@@ -296,7 +297,7 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
.addReg(MI->getOperand(0).getReg())
.addReg(MI->getOperand(1).getReg())
.addImm(MI->getOperand(2).getImm())
- .addReg(SystemZ::R1D)
+ .addReg(MI->getOperand(3).getReg())
.addImm(0);
break;
@@ -305,7 +306,7 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
.addReg(MI->getOperand(0).getReg())
.addImm(MI->getOperand(1).getImm())
.addImm(MI->getOperand(2).getImm())
- .addReg(SystemZ::R1D)
+ .addReg(MI->getOperand(3).getReg())
.addImm(0);
break;
@@ -314,7 +315,7 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
.addReg(MI->getOperand(0).getReg())
.addImm(MI->getOperand(1).getImm())
.addImm(MI->getOperand(2).getImm())
- .addReg(SystemZ::R1D)
+ .addReg(MI->getOperand(3).getReg())
.addImm(0);
break;
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
index 2f0cf0317029..19b703bbb226 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -640,18 +640,22 @@ bool SystemZElimCompare::fuseCompareOperations(
MachineOperand CCMask(MBBI->getOperand(1));
assert((CCMask.getImm() & ~SystemZ::CCMASK_ICMP) == 0 &&
"Invalid condition-code mask for integer comparison");
- // This is only valid for CompareAndBranch.
+ // This is only valid for CompareAndBranch and CompareAndSibcall.
MachineOperand Target(MBBI->getOperand(
- Type == SystemZII::CompareAndBranch ? 2 : 0));
+ (Type == SystemZII::CompareAndBranch ||
+ Type == SystemZII::CompareAndSibcall) ? 2 : 0));
const uint32_t *RegMask;
if (Type == SystemZII::CompareAndSibcall)
- RegMask = MBBI->getOperand(2).getRegMask();
+ RegMask = MBBI->getOperand(3).getRegMask();
// Clear out all current operands.
int CCUse = MBBI->findRegisterUseOperandIdx(SystemZ::CC, false, TRI);
assert(CCUse >= 0 && "BRC/BCR must use CC");
Branch->RemoveOperand(CCUse);
- // Remove target (branch) or regmask (sibcall).
+ // Remove regmask (sibcall).
+ if (Type == SystemZII::CompareAndSibcall)
+ Branch->RemoveOperand(3);
+ // Remove target (branch or sibcall).
if (Type == SystemZII::CompareAndBranch ||
Type == SystemZII::CompareAndSibcall)
Branch->RemoveOperand(2);
@@ -678,8 +682,10 @@ bool SystemZElimCompare::fuseCompareOperations(
RegState::ImplicitDefine | RegState::Dead);
}
- if (Type == SystemZII::CompareAndSibcall)
+ if (Type == SystemZII::CompareAndSibcall) {
+ MIB.add(Target);
MIB.addRegMask(RegMask);
+ }
// Clear any intervening kills of SrcReg and SrcReg2.
MBBI = Compare;
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFeatures.td b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFeatures.td
index 28f58cb310af..b1706a4a899a 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFeatures.td
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFeatures.td
@@ -196,7 +196,7 @@ def Arch11NewFeatures : SystemZFeatureList<[
//===----------------------------------------------------------------------===//
//
-// New features added in the Twelvth Edition of the z/Architecture
+// New features added in the Twelfth Edition of the z/Architecture
//
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
index 985722fdcab4..994f471b75b1 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -316,6 +316,8 @@ void SystemZFrameLowering::
processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS) const {
MachineFrameInfo &MFFrame = MF.getFrameInfo();
+ SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
bool BackChain = MF.getFunction().hasFnAttribute("backchain");
if (!usePackedStack(MF) || BackChain)
@@ -344,6 +346,14 @@ processFunctionBeforeFrameFinalized(MachineFunction &MF,
RS->addScavengingFrameIndex(MFFrame.CreateStackObject(8, Align(8), false));
RS->addScavengingFrameIndex(MFFrame.CreateStackObject(8, Align(8), false));
}
+
+  // If R6 is used as an argument register it is still callee saved. If it
+  // is not clobbered (and restored) in this case, it should never be marked
+  // as killed.
+ if (MF.front().isLiveIn(SystemZ::R6D) &&
+ ZFI->getRestoreGPRRegs().LowGPR != SystemZ::R6D)
+ for (auto &MO : MRI->use_nodbg_operands(SystemZ::R6D))
+ MO.setIsKill(false);
}
// Emit instructions before MBBI (in MBB) to add NumBytes to Reg.
@@ -478,15 +488,6 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
MFFrame.setStackSize(StackSize);
if (StackSize) {
- // Determine if we want to store a backchain.
- bool StoreBackchain = MF.getFunction().hasFnAttribute("backchain");
-
- // If we need backchain, save current stack pointer. R1 is free at this
- // point.
- if (StoreBackchain)
- BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::LGR))
- .addReg(SystemZ::R1D, RegState::Define).addReg(SystemZ::R15D);
-
// Allocate StackSize bytes.
int64_t Delta = -int64_t(StackSize);
const unsigned ProbeSize = TLI.getStackProbeSize(MF);
@@ -502,18 +503,20 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
.addImm(StackSize);
}
else {
+ bool StoreBackchain = MF.getFunction().hasFnAttribute("backchain");
+ // If we need backchain, save current stack pointer. R1 is free at
+ // this point.
+ if (StoreBackchain)
+ BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::LGR))
+ .addReg(SystemZ::R1D, RegState::Define).addReg(SystemZ::R15D);
emitIncrement(MBB, MBBI, DL, SystemZ::R15D, Delta, ZII);
buildCFAOffs(MBB, MBBI, DL, SPOffsetFromCFA + Delta, ZII);
+ if (StoreBackchain)
+ BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::STG))
+ .addReg(SystemZ::R1D, RegState::Kill).addReg(SystemZ::R15D)
+ .addImm(getBackchainOffset(MF)).addReg(0);
}
SPOffsetFromCFA += Delta;
-
- if (StoreBackchain) {
- // The back chain is stored topmost with packed-stack.
- int Offset = usePackedStack(MF) ? SystemZMC::CallFrameSize - 8 : 0;
- BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::STG))
- .addReg(SystemZ::R1D, RegState::Kill).addReg(SystemZ::R15D)
- .addImm(Offset).addReg(0);
- }
}
if (HasFP) {
@@ -555,7 +558,8 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
Register IgnoredFrameReg;
int64_t Offset =
- getFrameIndexReference(MF, Save.getFrameIdx(), IgnoredFrameReg);
+ getFrameIndexReference(MF, Save.getFrameIdx(), IgnoredFrameReg)
+ .getFixed();
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
nullptr, DwarfReg, SPOffsetFromCFA + Offset));
@@ -657,6 +661,13 @@ void SystemZFrameLowering::inlineStackProbe(MachineFunction &MF,
.addMemOperand(MMO);
};
+ bool StoreBackchain = MF.getFunction().hasFnAttribute("backchain");
+ if (StoreBackchain)
+ BuildMI(*MBB, MBBI, DL, ZII->get(SystemZ::LGR))
+ .addReg(SystemZ::R1D, RegState::Define).addReg(SystemZ::R15D);
+
+ MachineBasicBlock *DoneMBB = nullptr;
+ MachineBasicBlock *LoopMBB = nullptr;
if (NumFullBlocks < 3) {
// Emit unrolled probe statements.
for (unsigned int i = 0; i < NumFullBlocks; i++)
@@ -666,15 +677,16 @@ void SystemZFrameLowering::inlineStackProbe(MachineFunction &MF,
uint64_t LoopAlloc = ProbeSize * NumFullBlocks;
SPOffsetFromCFA -= LoopAlloc;
- BuildMI(*MBB, MBBI, DL, ZII->get(SystemZ::LGR), SystemZ::R1D)
+ // Use R0D to hold the exit value.
+ BuildMI(*MBB, MBBI, DL, ZII->get(SystemZ::LGR), SystemZ::R0D)
.addReg(SystemZ::R15D);
- buildDefCFAReg(*MBB, MBBI, DL, SystemZ::R1D, ZII);
- emitIncrement(*MBB, MBBI, DL, SystemZ::R1D, -int64_t(LoopAlloc), ZII);
+ buildDefCFAReg(*MBB, MBBI, DL, SystemZ::R0D, ZII);
+ emitIncrement(*MBB, MBBI, DL, SystemZ::R0D, -int64_t(LoopAlloc), ZII);
buildCFAOffs(*MBB, MBBI, DL, -int64_t(SystemZMC::CallFrameSize + LoopAlloc),
ZII);
- MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MBBI, MBB);
- MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(MBB);
+ DoneMBB = SystemZ::splitBlockBefore(MBBI, MBB);
+ LoopMBB = SystemZ::emitBlockAfter(MBB);
MBB->addSuccessor(LoopMBB);
LoopMBB->addSuccessor(LoopMBB);
LoopMBB->addSuccessor(DoneMBB);
@@ -682,22 +694,29 @@ void SystemZFrameLowering::inlineStackProbe(MachineFunction &MF,
MBB = LoopMBB;
allocateAndProbe(*MBB, MBB->end(), ProbeSize, false/*EmitCFI*/);
BuildMI(*MBB, MBB->end(), DL, ZII->get(SystemZ::CLGR))
- .addReg(SystemZ::R15D).addReg(SystemZ::R1D);
+ .addReg(SystemZ::R15D).addReg(SystemZ::R0D);
BuildMI(*MBB, MBB->end(), DL, ZII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_GT).addMBB(MBB);
MBB = DoneMBB;
MBBI = DoneMBB->begin();
buildDefCFAReg(*MBB, MBBI, DL, SystemZ::R15D, ZII);
-
- recomputeLiveIns(*DoneMBB);
- recomputeLiveIns(*LoopMBB);
}
if (Residual)
allocateAndProbe(*MBB, MBBI, Residual, true/*EmitCFI*/);
+ if (StoreBackchain)
+ BuildMI(*MBB, MBBI, DL, ZII->get(SystemZ::STG))
+ .addReg(SystemZ::R1D, RegState::Kill).addReg(SystemZ::R15D)
+ .addImm(getBackchainOffset(MF)).addReg(0);
+
StackAllocMI->eraseFromParent();
+ if (DoneMBB != nullptr) {
+ // Compute the live-in lists for the new blocks.
+ recomputeLiveIns(*DoneMBB);
+ recomputeLiveIns(*LoopMBB);
+ }
}
bool SystemZFrameLowering::hasFP(const MachineFunction &MF) const {
@@ -715,14 +734,14 @@ SystemZFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
return true;
}
-int SystemZFrameLowering::getFrameIndexReference(const MachineFunction &MF,
- int FI,
- Register &FrameReg) const {
+StackOffset
+SystemZFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const {
// Our incoming SP is actually SystemZMC::CallFrameSize below the CFA, so
// add that difference here.
- int64_t Offset =
- TargetFrameLowering::getFrameIndexReference(MF, FI, FrameReg);
- return Offset + SystemZMC::CallFrameSize;
+ StackOffset Offset =
+ TargetFrameLowering::getFrameIndexReference(MF, FI, FrameReg);
+ return Offset + StackOffset::getFixed(SystemZMC::CallFrameSize);
}
MachineBasicBlock::iterator SystemZFrameLowering::
@@ -765,8 +784,7 @@ getOrCreateFramePointerSaveIndex(MachineFunction &MF) const {
int FI = ZFI->getFramePointerSaveIndex();
if (!FI) {
MachineFrameInfo &MFFrame = MF.getFrameInfo();
- // The back chain is stored topmost with packed-stack.
- int Offset = usePackedStack(MF) ? -8 : -SystemZMC::CallFrameSize;
+ int Offset = getBackchainOffset(MF) - SystemZMC::CallFrameSize;
FI = MFFrame.CreateFixedObject(8, Offset, false);
ZFI->setFramePointerSaveIndex(FI);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
index 8752acc7e5ae..085c31ca0f18 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -9,8 +9,10 @@
#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZFRAMELOWERING_H
#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZFRAMELOWERING_H
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
#include "llvm/ADT/IndexedMap.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/Support/TypeSize.h"
namespace llvm {
class SystemZTargetMachine;
@@ -47,8 +49,8 @@ public:
MachineBasicBlock &PrologMBB) const override;
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
- int getFrameIndexReference(const MachineFunction &MF, int FI,
- Register &FrameReg) const override;
+ StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const override;
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;
@@ -62,6 +64,12 @@ public:
int getOrCreateFramePointerSaveIndex(MachineFunction &MF) const;
bool usePackedStack(MachineFunction &MF) const;
+
+ // Return the offset of the backchain.
+ unsigned getBackchainOffset(MachineFunction &MF) const {
+ // The back chain is stored topmost with packed-stack.
+ return usePackedStack(MF) ? SystemZMC::CallFrameSize - 8 : 0;
+ }
};
} // end namespace llvm
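getBackchainOffset above centralizes where the backchain slot lives: topmost in the register save area (CallFrameSize - 8) when packed-stack is in use, and at offset 0 otherwise. The getBackchainAddress change in SystemZISelLowering.cpp later in this diff adds that offset to the stack pointer before loading or storing the backchain. A small standalone sketch of the address computation (constants and names are illustrative; CallFrameSize here merely mirrors SystemZMC::CallFrameSize for the sketch):

#include <cassert>
#include <cstdint>

constexpr uint64_t CallFrameSize = 160;

// Offset of the backchain slot relative to the stack pointer.
static uint64_t backchainOffset(bool UsePackedStack) {
  // With packed-stack the backchain is stored topmost in the frame.
  return UsePackedStack ? CallFrameSize - 8 : 0;
}

// Address used when loading/storing the backchain, as in getBackchainAddress.
static uint64_t backchainAddress(uint64_t SP, bool UsePackedStack) {
  return SP + backchainOffset(UsePackedStack);
}

int main() {
  assert(backchainAddress(0x1000, /*UsePackedStack=*/false) == 0x1000);
  assert(backchainAddress(0x1000, /*UsePackedStack=*/true) == 0x1000 + 152);
  return 0;
}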
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
index e2af02227999..c0a173df7ba2 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
@@ -74,8 +74,8 @@ unsigned SystemZHazardRecognizer::getCurrCycleIdx(SUnit *SU) const {
}
ScheduleHazardRecognizer::HazardType SystemZHazardRecognizer::
-getHazardType(SUnit *m, int Stalls) {
- return (fitsIntoCurrentGroup(m) ? NoHazard : Hazard);
+getHazardType(SUnit *SU, int Stalls) {
+ return (fitsIntoCurrentGroup(SU) ? NoHazard : Hazard);
}
void SystemZHazardRecognizer::Reset() {
@@ -179,7 +179,7 @@ void SystemZHazardRecognizer::dumpSU(SUnit *SU, raw_ostream &OS) const {
*SchedModel->getProcResource(PI->ProcResourceIdx);
std::string FU(PRD.Name);
// trim e.g. Z13_FXaUnit -> FXa
- FU = FU.substr(FU.find("_") + 1);
+ FU = FU.substr(FU.find('_') + 1);
size_t Pos = FU.find("Unit");
if (Pos != std::string::npos)
FU.resize(Pos);
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h
index 38bf41ebe96a..b2ee64a1bb4a 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h
@@ -113,7 +113,7 @@ public:
Reset();
}
- HazardType getHazardType(SUnit *m, int Stalls = 0) override;
+ HazardType getHazardType(SUnit *SU, int Stalls = 0) override;
void Reset() override;
void EmitInstruction(SUnit *SU) override;
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 37328684399b..9d90a4940cba 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -338,6 +338,10 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
// to X.
bool storeLoadCanUseBlockBinary(SDNode *N, unsigned I) const;
+  // Return true if N (a load or a store) fulfills the alignment
+ // requirements for a PC-relative access.
+ bool storeLoadIsAligned(SDNode *N) const;
+
// Try to expand a boolean SELECT_CCMASK using an IPM sequence.
SDValue expandSelectBoolean(SDNode *Node);
@@ -1460,6 +1464,46 @@ bool SystemZDAGToDAGISel::storeLoadCanUseBlockBinary(SDNode *N,
canUseBlockOperation(StoreA, LoadB);
}
+bool SystemZDAGToDAGISel::storeLoadIsAligned(SDNode *N) const {
+
+ auto *MemAccess = cast<LSBaseSDNode>(N);
+ TypeSize StoreSize = MemAccess->getMemoryVT().getStoreSize();
+ SDValue BasePtr = MemAccess->getBasePtr();
+ MachineMemOperand *MMO = MemAccess->getMemOperand();
+ assert(MMO && "Expected a memory operand.");
+
+ // The memory access must have a proper alignment and no index register.
+ if (MemAccess->getAlignment() < StoreSize ||
+ !MemAccess->getOffset().isUndef())
+ return false;
+
+ // The MMO must not have an unaligned offset.
+ if (MMO->getOffset() % StoreSize != 0)
+ return false;
+
+ // An access to GOT or the Constant Pool is aligned.
+ if (const PseudoSourceValue *PSV = MMO->getPseudoValue())
+ if ((PSV->isGOT() || PSV->isConstantPool()))
+ return true;
+
+ // Check the alignment of a Global Address.
+ if (BasePtr.getNumOperands())
+ if (GlobalAddressSDNode *GA =
+ dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0))) {
+ // The immediate offset must be aligned.
+ if (GA->getOffset() % StoreSize != 0)
+ return false;
+
+ // The alignment of the symbol itself must be at least the store size.
+ const GlobalValue *GV = GA->getGlobal();
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+ if (GV->getPointerAlignment(DL).value() < StoreSize)
+ return false;
+ }
+
+ return true;
+}
+
void SystemZDAGToDAGISel::Select(SDNode *Node) {
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index eb1e51341ec4..603446755aaf 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -164,6 +164,8 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
++I) {
MVT VT = MVT::SimpleValueType(I);
if (isTypeLegal(VT)) {
+ setOperationAction(ISD::ABS, VT, Legal);
+
// Expand individual DIV and REMs into DIVREMs.
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
@@ -358,6 +360,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SUB, VT, Legal);
if (VT != MVT::v2i64)
setOperationAction(ISD::MUL, VT, Legal);
+ setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::AND, VT, Legal);
setOperationAction(ISD::OR, VT, Legal);
setOperationAction(ISD::XOR, VT, Legal);
@@ -784,10 +787,11 @@ bool SystemZVectorConstantInfo::isVectorConstantLegal(
SystemZVectorConstantInfo::SystemZVectorConstantInfo(APFloat FPImm) {
IntBits = FPImm.bitcastToAPInt().zextOrSelf(128);
isFP128 = (&FPImm.getSemantics() == &APFloat::IEEEquad());
-
- // Find the smallest splat.
SplatBits = FPImm.bitcastToAPInt();
unsigned Width = SplatBits.getBitWidth();
+ IntBits <<= (SystemZ::VectorBits - Width);
+
+ // Find the smallest splat.
while (Width > 8) {
unsigned HalfSize = Width / 2;
APInt HighValue = SplatBits.lshr(HalfSize).trunc(HalfSize);
@@ -981,16 +985,16 @@ bool SystemZTargetLowering::isLegalAddressingMode(const DataLayout &DL,
bool SystemZTargetLowering::isTruncateFree(Type *FromType, Type *ToType) const {
if (!FromType->isIntegerTy() || !ToType->isIntegerTy())
return false;
- unsigned FromBits = FromType->getPrimitiveSizeInBits();
- unsigned ToBits = ToType->getPrimitiveSizeInBits();
+ unsigned FromBits = FromType->getPrimitiveSizeInBits().getFixedSize();
+ unsigned ToBits = ToType->getPrimitiveSizeInBits().getFixedSize();
return FromBits > ToBits;
}
bool SystemZTargetLowering::isTruncateFree(EVT FromVT, EVT ToVT) const {
if (!FromVT.isInteger() || !ToVT.isInteger())
return false;
- unsigned FromBits = FromVT.getSizeInBits();
- unsigned ToBits = ToVT.getSizeInBits();
+ unsigned FromBits = FromVT.getFixedSizeInBits();
+ unsigned ToBits = ToVT.getFixedSizeInBits();
return FromBits > ToBits;
}
@@ -2285,7 +2289,8 @@ static void adjustICmpTruncate(SelectionDAG &DAG, const SDLoc &DL,
C.Op1.getOpcode() == ISD::Constant &&
cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
auto *L = cast<LoadSDNode>(C.Op0.getOperand(0));
- if (L->getMemoryVT().getStoreSizeInBits() <= C.Op0.getValueSizeInBits()) {
+ if (L->getMemoryVT().getStoreSizeInBits().getFixedSize() <=
+ C.Op0.getValueSizeInBits().getFixedSize()) {
unsigned Type = L->getExtensionType();
if ((Type == ISD::ZEXTLOAD && C.ICmpType != SystemZICMP::SignedOnly) ||
(Type == ISD::SEXTLOAD && C.ICmpType != SystemZICMP::UnsignedOnly)) {
@@ -2958,7 +2963,7 @@ static bool isAbsolute(SDValue CmpOp, SDValue Pos, SDValue Neg) {
// Return the absolute or negative absolute of Op; IsNegative decides which.
static SDValue getAbsolute(SelectionDAG &DAG, const SDLoc &DL, SDValue Op,
bool IsNegative) {
- Op = DAG.getNode(SystemZISD::IABS, DL, Op.getValueType(), Op);
+ Op = DAG.getNode(ISD::ABS, DL, Op.getValueType(), Op);
if (IsNegative)
Op = DAG.getNode(ISD::SUB, DL, Op.getValueType(),
DAG.getConstant(0, DL, Op.getValueType()), Op);
@@ -3414,14 +3419,14 @@ lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
// If user has set the no alignment function attribute, ignore
// alloca alignments.
- uint64_t AlignVal = (RealignOpt ?
- dyn_cast<ConstantSDNode>(Align)->getZExtValue() : 0);
+ uint64_t AlignVal =
+ (RealignOpt ? cast<ConstantSDNode>(Align)->getZExtValue() : 0);
uint64_t StackAlign = TFI->getStackAlignment();
uint64_t RequiredAlign = std::max(AlignVal, StackAlign);
uint64_t ExtraAlignSpace = RequiredAlign - StackAlign;
- unsigned SPReg = getStackPointerRegisterToSaveRestore();
+ Register SPReg = getStackPointerRegisterToSaveRestore();
SDValue NeededSpace = Size;
// Get a reference to the stack pointer.
@@ -3430,7 +3435,8 @@ lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
// If we need a backchain, save it now.
SDValue Backchain;
if (StoreBackchain)
- Backchain = DAG.getLoad(MVT::i64, DL, Chain, OldSP, MachinePointerInfo());
+ Backchain = DAG.getLoad(MVT::i64, DL, Chain, getBackchainAddress(OldSP, DAG),
+ MachinePointerInfo());
// Add extra space for alignment if needed.
if (ExtraAlignSpace)
@@ -3467,7 +3473,8 @@ lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
}
if (StoreBackchain)
- Chain = DAG.getStore(Chain, DL, Backchain, NewSP, MachinePointerInfo());
+ Chain = DAG.getStore(Chain, DL, Backchain, getBackchainAddress(NewSP, DAG),
+ MachinePointerInfo());
SDValue Ops[2] = { Result, Chain };
return DAG.getMergeValues(Ops, DL);
@@ -4090,13 +4097,15 @@ SDValue SystemZTargetLowering::lowerSTACKRESTORE(SDValue Op,
if (StoreBackchain) {
SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SystemZ::R15D, MVT::i64);
- Backchain = DAG.getLoad(MVT::i64, DL, Chain, OldSP, MachinePointerInfo());
+ Backchain = DAG.getLoad(MVT::i64, DL, Chain, getBackchainAddress(OldSP, DAG),
+ MachinePointerInfo());
}
Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R15D, NewSP);
if (StoreBackchain)
- Chain = DAG.getStore(Chain, DL, Backchain, NewSP, MachinePointerInfo());
+ Chain = DAG.getStore(Chain, DL, Backchain, getBackchainAddress(NewSP, DAG),
+ MachinePointerInfo());
return Chain;
}
@@ -5557,7 +5566,6 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(TLS_LDCALL);
OPCODE(PCREL_WRAPPER);
OPCODE(PCREL_OFFSET);
- OPCODE(IABS);
OPCODE(ICMP);
OPCODE(FCMP);
OPCODE(STRICT_FCMP);
@@ -6815,8 +6823,7 @@ static void computeKnownBitsBinOp(const SDValue Op, KnownBits &Known,
DAG.computeKnownBits(Op.getOperand(OpNo), Src0DemE, Depth + 1);
KnownBits RHSKnown =
DAG.computeKnownBits(Op.getOperand(OpNo + 1), Src1DemE, Depth + 1);
- Known.Zero = LHSKnown.Zero & RHSKnown.Zero;
- Known.One = LHSKnown.One & RHSKnown.One;
+ Known = KnownBits::commonBits(LHSKnown, RHSKnown);
}
void
@@ -7246,6 +7253,15 @@ MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI,
StoreOpcode = TII->getOpcodeForOffset(StoreOpcode, Disp);
+ // ISel pattern matching also adds a load memory operand of the same
+ // address, so take special care to find the storing memory operand.
+ MachineMemOperand *MMO = nullptr;
+ for (auto *I : MI.memoperands())
+ if (I->isStore()) {
+ MMO = I;
+ break;
+ }
+
// Use STOCOpcode if possible. We could use different store patterns in
// order to avoid matching the index register, but the performance trade-offs
// might be more complicated in that case.
@@ -7253,15 +7269,6 @@ MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI,
if (Invert)
CCMask ^= CCValid;
- // ISel pattern matching also adds a load memory operand of the same
- // address, so take special care to find the storing memory operand.
- MachineMemOperand *MMO = nullptr;
- for (auto *I : MI.memoperands())
- if (I->isStore()) {
- MMO = I;
- break;
- }
-
BuildMI(*MBB, MI, DL, TII->get(STOCOpcode))
.addReg(SrcReg)
.add(Base)
@@ -7306,7 +7313,8 @@ MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI,
.addReg(SrcReg)
.add(Base)
.addImm(Disp)
- .addReg(IndexReg);
+ .addReg(IndexReg)
+ .addMemOperand(MMO);
MBB->addSuccessor(JoinMBB);
MI.eraseFromParent();
@@ -8140,6 +8148,16 @@ MachineBasicBlock *SystemZTargetLowering::emitProbedAlloca(
return DoneMBB;
}
+SDValue SystemZTargetLowering::
+getBackchainAddress(SDValue SP, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ auto *TFL =
+ static_cast<const SystemZFrameLowering *>(Subtarget.getFrameLowering());
+ SDLoc DL(SP);
+ return DAG.getNode(ISD::ADD, DL, MVT::i64, SP,
+ DAG.getIntPtrConstant(TFL->getBackchainOffset(MF), DL));
+}
+
MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *MBB) const {
switch (MI.getOpcode()) {
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 27637762296a..955587da626f 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -50,9 +50,6 @@ enum NodeType : unsigned {
// as a register base.
PCREL_OFFSET,
- // Integer absolute.
- IABS,
-
// Integer comparisons. There are three operands: the two values
// to compare, and an integer of type SystemZICMP.
ICMP,
@@ -701,6 +698,8 @@ private:
MachineBasicBlock *emitProbedAlloca(MachineInstr &MI,
MachineBasicBlock *MBB) const;
+ SDValue getBackchainAddress(SDValue SP, SelectionDAG &DAG) const;
+
MachineMemOperand::Flags
getTargetMMOFlags(const Instruction &I) const override;
const TargetRegisterClass *getRepRegClassFor(MVT VT) const override;
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
index 50f1e09c6ee5..95e94c4c8e1c 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -1764,6 +1764,55 @@ class DirectiveInsnSSF<dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{35-32} = enc{35-32};
}
+class DirectiveInsnVRI<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstVRIe<0, outs, ins, asmstr, pattern> {
+ bits<48> enc;
+
+ let Inst{47-40} = enc{47-40};
+ let Inst{7-0} = enc{7-0};
+}
+
+class DirectiveInsnVRR<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstVRRc<0, outs, ins, asmstr, pattern> {
+ bits<48> enc;
+
+ let Inst{47-40} = enc{47-40};
+ let Inst{7-0} = enc{7-0};
+}
+
+class DirectiveInsnVRS<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstVRSc<0, outs, ins, asmstr, pattern> {
+ bits<48> enc;
+
+ let Inst{47-40} = enc{47-40};
+ let Inst{7-0} = enc{7-0};
+}
+
+class DirectiveInsnVRV<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstVRV<0, outs, ins, asmstr, pattern> {
+ bits<48> enc;
+
+ let Inst{47-40} = enc{47-40};
+ let Inst{7-0} = enc{7-0};
+}
+
+class DirectiveInsnVRX<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstVRX<0, outs, ins, asmstr, pattern> {
+ bits<48> enc;
+
+ let Inst{47-40} = enc{47-40};
+ let Inst{7-0} = enc{7-0};
+}
+
+class DirectiveInsnVSI<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstVSI<0, outs, ins, asmstr, pattern> {
+ bits<48> enc;
+
+ let Inst{47-40} = enc{47-40};
+ let Inst{7-0} = enc{7-0};
+}
+
+
//===----------------------------------------------------------------------===//
// Variants of instructions with condition mask
//===----------------------------------------------------------------------===//
@@ -1862,6 +1911,11 @@ class ICV<string name>
!cast<CondVariant>("IntCondVariant"#name).suffix,
!cast<CondVariant>("IntCondVariant"#name).alternate>;
+// Defines a class that makes it easier to define
+// a MnemonicAlias when CondVariants are involved.
+class MnemonicCondBranchAlias<CondVariant V, string from, string to>
+ : MnemonicAlias<!subst("#", V.suffix, from), !subst("#", V.suffix, to)>;
+
//===----------------------------------------------------------------------===//
// Instruction definitions with semantics
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 223cfcba2fac..bf01c262afe1 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -752,11 +752,14 @@ bool SystemZInstrInfo::PredicateInstruction(
return true;
}
if (Opcode == SystemZ::CallBR) {
- const uint32_t *RegMask = MI.getOperand(0).getRegMask();
+ MachineOperand Target = MI.getOperand(0);
+ const uint32_t *RegMask = MI.getOperand(1).getRegMask();
+ MI.RemoveOperand(1);
MI.RemoveOperand(0);
MI.setDesc(get(SystemZ::CallBCR));
MachineInstrBuilder(*MI.getParent()->getParent(), MI)
.addImm(CCValid).addImm(CCMask)
+ .add(Target)
.addRegMask(RegMask)
.addReg(SystemZ::CC, RegState::Implicit);
return true;
@@ -999,7 +1002,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
unsigned Opcode = MI.getOpcode();
// Check CC liveness if new instruction introduces a dead def of CC.
- MCRegUnitIterator CCUnit(SystemZ::CC, TRI);
+ MCRegUnitIterator CCUnit(MCRegister::from(SystemZ::CC), TRI);
SlotIndex MISlot = SlotIndex();
LiveRange *CCLiveRange = nullptr;
bool CCLiveAtMI = true;
@@ -1196,7 +1199,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
if (RC == &SystemZ::VR32BitRegClass || RC == &SystemZ::VR64BitRegClass) {
Register Reg = MI.getOperand(I).getReg();
Register PhysReg = Register::isVirtualRegister(Reg)
- ? (VRM ? VRM->getPhys(Reg) : Register())
+ ? (VRM ? Register(VRM->getPhys(Reg)) : Register())
: Reg;
if (!PhysReg ||
!(SystemZ::FP32BitRegClass.contains(PhysReg) ||
@@ -1242,7 +1245,8 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
else {
Register DstReg = MI.getOperand(0).getReg();
Register DstPhys =
- (Register::isVirtualRegister(DstReg) ? VRM->getPhys(DstReg) : DstReg);
+ (Register::isVirtualRegister(DstReg) ? Register(VRM->getPhys(DstReg))
+ : DstReg);
Register SrcReg = (OpNum == 2 ? MI.getOperand(1).getReg()
: ((OpNum == 1 && MI.isCommutable())
? MI.getOperand(2).getReg()
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
index d5d56ecf6e47..6e4f9e7f4922 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -101,10 +101,20 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
}
}
-// NOPs. These are again variants of the conditional branches,
-// with the condition mask set to "never".
+// NOPs. These are again variants of the conditional branches, with the
+// condition mask set to "never". NOP_bare can't be an InstAlias, since it
+// would need R0D hard-coded, which is not part of ADDR64BitRegClass.
def NOP : InstAlias<"nop\t$XBD", (BCAsm 0, bdxaddr12only:$XBD), 0>;
+let isAsmParserOnly = 1, hasNoSchedulingInfo = 1, M1 = 0, XBD2 = 0 in
+ def NOP_bare : InstRXb<0x47,(outs), (ins), "nop", []>;
def NOPR : InstAlias<"nopr\t$R", (BCRAsm 0, GR64:$R), 0>;
+def NOPR_bare : InstAlias<"nopr", (BCRAsm 0, R0D), 0>;
+
+// An alias of BRC 0, label
+def JNOP : InstAlias<"jnop\t$RI2", (BRCAsm 0, brtarget16:$RI2), 0>;
+
+// An alias of BRCL 0, label
+def JGNOP : InstAlias<"jgnop\t$RI2", (BRCLAsm 0, brtarget32:$RI2), 0>;
// Fused compare-and-branch instructions.
//
@@ -280,33 +290,32 @@ let isCall = 1, Defs = [R14D, CC] in {
[(z_tls_ldcall tglobaltlsaddr:$I2)]>;
}
-// Sibling calls. Indirect sibling calls must be via R1, since R2 upwards
-// are argument registers and since branching to R0 is a no-op.
+// Sibling calls.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
def CallJG : Alias<6, (outs), (ins pcrel32:$I2),
[(z_sibcall pcrel32:$I2)]>;
- let Uses = [R1D] in
- def CallBR : Alias<2, (outs), (ins), [(z_sibcall R1D)]>;
+ def CallBR : Alias<2, (outs), (ins ADDR64:$R2),
+ [(z_sibcall ADDR64:$R2)]>;
}
// Conditional sibling calls.
let CCMaskFirst = 1, isCall = 1, isTerminator = 1, isReturn = 1 in {
def CallBRCL : Alias<6, (outs), (ins cond4:$valid, cond4:$R1,
pcrel32:$I2), []>;
- let Uses = [R1D] in
- def CallBCR : Alias<2, (outs), (ins cond4:$valid, cond4:$R1), []>;
+ def CallBCR : Alias<2, (outs), (ins cond4:$valid, cond4:$R1,
+ ADDR64:$R2), []>;
}
// Fused compare and conditional sibling calls.
-let isCall = 1, isTerminator = 1, isReturn = 1, Uses = [R1D] in {
- def CRBCall : Alias<6, (outs), (ins GR32:$R1, GR32:$R2, cond4:$M3), []>;
- def CGRBCall : Alias<6, (outs), (ins GR64:$R1, GR64:$R2, cond4:$M3), []>;
- def CIBCall : Alias<6, (outs), (ins GR32:$R1, imm32sx8:$I2, cond4:$M3), []>;
- def CGIBCall : Alias<6, (outs), (ins GR64:$R1, imm64sx8:$I2, cond4:$M3), []>;
- def CLRBCall : Alias<6, (outs), (ins GR32:$R1, GR32:$R2, cond4:$M3), []>;
- def CLGRBCall : Alias<6, (outs), (ins GR64:$R1, GR64:$R2, cond4:$M3), []>;
- def CLIBCall : Alias<6, (outs), (ins GR32:$R1, imm32zx8:$I2, cond4:$M3), []>;
- def CLGIBCall : Alias<6, (outs), (ins GR64:$R1, imm64zx8:$I2, cond4:$M3), []>;
+let isCall = 1, isTerminator = 1, isReturn = 1 in {
+ def CRBCall : Alias<6, (outs), (ins GR32:$R1, GR32:$R2, cond4:$M3, ADDR64:$R4), []>;
+ def CGRBCall : Alias<6, (outs), (ins GR64:$R1, GR64:$R2, cond4:$M3, ADDR64:$R4), []>;
+ def CIBCall : Alias<6, (outs), (ins GR32:$R1, imm32sx8:$I2, cond4:$M3, ADDR64:$R4), []>;
+ def CGIBCall : Alias<6, (outs), (ins GR64:$R1, imm64sx8:$I2, cond4:$M3, ADDR64:$R4), []>;
+ def CLRBCall : Alias<6, (outs), (ins GR32:$R1, GR32:$R2, cond4:$M3, ADDR64:$R4), []>;
+ def CLGRBCall : Alias<6, (outs), (ins GR64:$R1, GR64:$R2, cond4:$M3, ADDR64:$R4), []>;
+ def CLIBCall : Alias<6, (outs), (ins GR32:$R1, imm32zx8:$I2, cond4:$M3, ADDR64:$R4), []>;
+ def CLGIBCall : Alias<6, (outs), (ins GR64:$R1, imm64zx8:$I2, cond4:$M3, ADDR64:$R4), []>;
}
// A return instruction (br %r14).
@@ -828,16 +837,13 @@ def GOT : Alias<6, (outs GR64:$R1), (ins),
let Defs = [CC] in {
let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
- def LPR : UnaryRR <"lpr", 0x10, z_iabs, GR32, GR32>;
- def LPGR : UnaryRRE<"lpgr", 0xB900, z_iabs, GR64, GR64>;
+ def LPR : UnaryRR <"lpr", 0x10, abs, GR32, GR32>;
+ def LPGR : UnaryRRE<"lpgr", 0xB900, abs, GR64, GR64>;
}
let CCValues = 0xE, CompareZeroCCMask = 0xE in
def LPGFR : UnaryRRE<"lpgfr", 0xB910, null_frag, GR64, GR32>;
}
-def : Pat<(z_iabs32 GR32:$src), (LPR GR32:$src)>;
-def : Pat<(z_iabs64 GR64:$src), (LPGR GR64:$src)>;
-defm : SXU<z_iabs, LPGFR>;
-defm : SXU<z_iabs64, LPGFR>;
+defm : SXU<abs, LPGFR>;
let Defs = [CC] in {
let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
@@ -847,10 +853,7 @@ let Defs = [CC] in {
let CCValues = 0xE, CompareZeroCCMask = 0xE in
def LNGFR : UnaryRRE<"lngfr", 0xB911, null_frag, GR64, GR32>;
}
-def : Pat<(z_inegabs32 GR32:$src), (LNR GR32:$src)>;
-def : Pat<(z_inegabs64 GR64:$src), (LNGR GR64:$src)>;
-defm : SXU<z_inegabs, LNGFR>;
-defm : SXU<z_inegabs64, LNGFR>;
+defm : SXU<z_inegabs, LNGFR>;
let Defs = [CC] in {
let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
@@ -2242,6 +2245,31 @@ let isCodeGenOnly = 1, hasSideEffects = 1 in {
(ins imm64zx48:$enc, bdaddr12only:$BD1,
bdaddr12only:$BD2, AnyReg:$R3),
".insn ssf,$enc,$BD1,$BD2,$R3", []>;
+ def InsnVRI : DirectiveInsnVRI<(outs),
+ (ins imm64zx48:$enc, VR128:$V1, VR128:$V2,
+ imm32zx12:$I3, imm32zx4:$M4, imm32zx4:$M5),
+ ".insn vri,$enc,$V1,$V2,$I3,$M4,$M5", []>;
+ def InsnVRR : DirectiveInsnVRR<(outs),
+ (ins imm64zx48:$enc, VR128:$V1, VR128:$V2,
+ VR128:$V3, imm32zx4:$M4, imm32zx4:$M5,
+ imm32zx4:$M6),
+ ".insn vrr,$enc,$V1,$V2,$V3,$M4,$M5,$M6", []>;
+ def InsnVRS : DirectiveInsnVRS<(outs),
+ (ins imm64zx48:$enc, AnyReg:$R1, VR128:$V3,
+ bdaddr12only:$BD2, imm32zx4:$M4),
+ ".insn vrs,$enc,$BD2,$M4", []>;
+ def InsnVRV : DirectiveInsnVRV<(outs),
+ (ins imm64zx48:$enc, VR128:$V1,
+ bdvaddr12only:$VBD2, imm32zx4:$M3),
+ ".insn vrv,$enc,$V1,$VBD2,$M3", []>;
+ def InsnVRX : DirectiveInsnVRX<(outs),
+ (ins imm64zx48:$enc, VR128:$V1,
+ bdxaddr12only:$XBD2, imm32zx4:$M3),
+ ".insn vrx,$enc,$V1,$XBD2,$M3", []>;
+ def InsnVSI : DirectiveInsnVSI<(outs),
+ (ins imm64zx48:$enc, VR128:$V1,
+ bdaddr12only:$BD2, imm32zx8:$I3),
+ ".insn vsi,$enc,$V1,$BD2,$I3", []>;
}
//===----------------------------------------------------------------------===//
@@ -2315,3 +2343,25 @@ defm : BlockLoadStore<anyextloadi32, i64, MVCSequence, NCSequence, OCSequence,
XCSequence, 4>;
defm : BlockLoadStore<load, i64, MVCSequence, NCSequence, OCSequence,
XCSequence, 8>;
+
+//===----------------------------------------------------------------------===//
+// Mnemonic Aliases
+//===----------------------------------------------------------------------===//
+
+def JCT : MnemonicAlias<"jct", "brct">;
+def JCTG : MnemonicAlias<"jctg", "brctg">;
+def JAS : MnemonicAlias<"jas", "bras">;
+def JASL : MnemonicAlias<"jasl", "brasl">;
+def JXH : MnemonicAlias<"jxh", "brxh">;
+def JXLE : MnemonicAlias<"jxle", "brxle">;
+def JXHG : MnemonicAlias<"jxhg", "brxhg">;
+def JXLEG : MnemonicAlias<"jxleg", "brxlg">;
+
+def BRU : MnemonicAlias<"bru", "j">;
+def BRUL : MnemonicAlias<"brul", "jg">;
+
+foreach V = [ "E", "NE", "H", "NH", "L", "NL", "HE", "NHE", "LE", "NLE",
+ "Z", "NZ", "P", "NP", "M", "NM", "LH", "NLH", "O", "NO" ] in {
+ def BRUAsm#V : MnemonicCondBranchAlias <CV<V>, "br#", "j#">;
+ def BRULAsm#V : MnemonicCondBranchAlias <CV<V>, "br#l", "jg#">;
+}
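
The foreach above stamps out a pair of plain string aliases for every condition-code variant by substituting the variant's suffix for the "#" placeholder. As a rough illustration only, here is a minimal standalone C++ sketch of that substitution, assuming the "E" variant's suffix is the lowercase "e" used elsewhere by the SystemZ assembler (the sketch is not part of the patch):

#include <cstdio>
#include <string>

// Mimics the !subst("#", V.suffix, ...) expansion done by
// MnemonicCondBranchAlias for one condition variant.
static std::string substHash(std::string Pattern, const std::string &Suffix) {
  std::string::size_type Pos = Pattern.find('#');
  if (Pos != std::string::npos)
    Pattern.replace(Pos, 1, Suffix);
  return Pattern;
}

int main() {
  // BRUAsmE:  "br#"  -> "bre",  aliased to "j#"  -> "je"
  // BRULAsmE: "br#l" -> "brel", aliased to "jg#" -> "jge"
  std::printf("%s -> %s\n", substHash("br#", "e").c_str(),
              substHash("j#", "e").c_str());
  std::printf("%s -> %s\n", substHash("br#l", "e").c_str(),
              substHash("jg#", "e").c_str());
  return 0;
}
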
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index e73f1e429c3c..a85eb1623e1c 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -571,10 +571,10 @@ let Predicates = [FeatureVector] in {
// Load positive.
def VLP : UnaryVRRaGeneric<"vlp", 0xE7DF>;
- def VLPB : UnaryVRRa<"vlpb", 0xE7DF, z_viabs8, v128b, v128b, 0>;
- def VLPH : UnaryVRRa<"vlph", 0xE7DF, z_viabs16, v128h, v128h, 1>;
- def VLPF : UnaryVRRa<"vlpf", 0xE7DF, z_viabs32, v128f, v128f, 2>;
- def VLPG : UnaryVRRa<"vlpg", 0xE7DF, z_viabs64, v128g, v128g, 3>;
+ def VLPB : UnaryVRRa<"vlpb", 0xE7DF, abs, v128b, v128b, 0>;
+ def VLPH : UnaryVRRa<"vlph", 0xE7DF, abs, v128h, v128h, 1>;
+ def VLPF : UnaryVRRa<"vlpf", 0xE7DF, abs, v128f, v128f, 2>;
+ def VLPG : UnaryVRRa<"vlpg", 0xE7DF, abs, v128g, v128g, 3>;
let isCommutable = 1 in {
// Maximum.
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
index 3fc25034dded..9bee5e8d1864 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
@@ -72,6 +72,7 @@ advanceTo(MachineBasicBlock::iterator NextBegin) {
}
void SystemZPostRASchedStrategy::initialize(ScheduleDAGMI *dag) {
+ Available.clear(); // -misched-cutoff.
LLVM_DEBUG(HazardRec->dumpState(););
}
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZOperators.td b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZOperators.td
index 81af5fd854db..992b1512a077 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -259,7 +259,6 @@ def z_tls_ldcall : SDNode<"SystemZISD::TLS_LDCALL", SDT_ZCall,
def z_pcrel_wrapper : SDNode<"SystemZISD::PCREL_WRAPPER", SDT_ZWrapPtr, []>;
def z_pcrel_offset : SDNode<"SystemZISD::PCREL_OFFSET",
SDT_ZWrapOffset, []>;
-def z_iabs : SDNode<"SystemZISD::IABS", SDTIntUnaryOp, []>;
def z_icmp : SDNode<"SystemZISD::ICMP", SDT_ZICmp>;
def z_fcmp : SDNode<"SystemZISD::FCMP", SDT_ZCmp>;
def z_strict_fcmp : SDNode<"SystemZISD::STRICT_FCMP", SDT_ZCmp,
@@ -572,10 +571,8 @@ def anyextloadi32 : PatFrag<(ops node:$ptr), (anyextload node:$ptr), [{
// Aligned loads.
class AlignedLoad<SDPatternOperator load>
- : PatFrag<(ops node:$addr), (load node:$addr), [{
- auto *Load = cast<LoadSDNode>(N);
- return Load->getAlignment() >= Load->getMemoryVT().getStoreSize();
-}]>;
+ : PatFrag<(ops node:$addr), (load node:$addr),
+ [{ return storeLoadIsAligned(N); }]>;
def aligned_load : AlignedLoad<load>;
def aligned_asextloadi16 : AlignedLoad<asextloadi16>;
def aligned_asextloadi32 : AlignedLoad<asextloadi32>;
@@ -584,10 +581,8 @@ def aligned_azextloadi32 : AlignedLoad<azextloadi32>;
// Aligned stores.
class AlignedStore<SDPatternOperator store>
- : PatFrag<(ops node:$src, node:$addr), (store node:$src, node:$addr), [{
- auto *Store = cast<StoreSDNode>(N);
- return Store->getAlignment() >= Store->getMemoryVT().getStoreSize();
-}]>;
+ : PatFrag<(ops node:$src, node:$addr), (store node:$src, node:$addr),
+ [{ return storeLoadIsAligned(N); }]>;
def aligned_store : AlignedStore<store>;
def aligned_truncstorei16 : AlignedStore<truncstorei16>;
def aligned_truncstorei32 : AlignedStore<truncstorei32>;
@@ -671,17 +666,7 @@ def or_as_revinserti8 : PatFrag<(ops node:$src1, node:$src2),
}]>;
// Negative integer absolute.
-def z_inegabs : PatFrag<(ops node:$src), (ineg (z_iabs node:$src))>;
-
-// Integer absolute, matching the canonical form generated by DAGCombiner.
-def z_iabs32 : PatFrag<(ops node:$src),
- (xor (add node:$src, (sra node:$src, (i32 31))),
- (sra node:$src, (i32 31)))>;
-def z_iabs64 : PatFrag<(ops node:$src),
- (xor (add node:$src, (sra node:$src, (i32 63))),
- (sra node:$src, (i32 63)))>;
-def z_inegabs32 : PatFrag<(ops node:$src), (ineg (z_iabs32 node:$src))>;
-def z_inegabs64 : PatFrag<(ops node:$src), (ineg (z_iabs64 node:$src))>;
+def z_inegabs : PatFrag<(ops node:$src), (ineg (abs node:$src))>;
// Integer multiply-and-add
def z_muladd : PatFrag<(ops node:$src1, node:$src2, node:$src3),
@@ -898,16 +883,6 @@ def z_vicmph_zero : PatFrag<(ops node:$x), (z_vicmph node:$x, immAllZerosV)>;
// Signed "integer less than zero" on vectors.
def z_vicmpl_zero : PatFrag<(ops node:$x), (z_vicmph immAllZerosV, node:$x)>;
-// Integer absolute on vectors.
-class z_viabs<int shift>
- : PatFrag<(ops node:$src),
- (xor (add node:$src, (z_vsra_by_scalar node:$src, (i32 shift))),
- (z_vsra_by_scalar node:$src, (i32 shift)))>;
-def z_viabs8 : z_viabs<7>;
-def z_viabs16 : z_viabs<15>;
-def z_viabs32 : z_viabs<31>;
-def z_viabs64 : z_viabs<63>;
-
// Sign-extend the i64 elements of a vector.
class z_vse<int shift>
: PatFrag<(ops node:$src),
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index fe2aaca8429a..5139cc39d2af 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -109,8 +109,9 @@ bool SystemZRegisterInfo::getRegAllocationHints(
auto tryAddHint = [&](const MachineOperand *MO) -> void {
Register Reg = MO->getReg();
- Register PhysReg =
- Register::isPhysicalRegister(Reg) ? Reg : VRM->getPhys(Reg);
+ Register PhysReg = Register::isPhysicalRegister(Reg)
+ ? Reg
+ : Register(VRM->getPhys(Reg));
if (PhysReg) {
if (MO->getSubReg())
PhysReg = getSubReg(PhysReg, MO->getSubReg());
@@ -265,8 +266,9 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// Decompose the frame index into a base and offset.
int FrameIndex = MI->getOperand(FIOperandNum).getIndex();
Register BasePtr;
- int64_t Offset = (TFI->getFrameIndexReference(MF, FrameIndex, BasePtr) +
- MI->getOperand(FIOperandNum + 1).getImm());
+ int64_t Offset =
+ (TFI->getFrameIndexReference(MF, FrameIndex, BasePtr).getFixed() +
+ MI->getOperand(FIOperandNum + 1).getImm());
// Special handling of dbg_value instructions.
if (MI->isDebugValue()) {
@@ -321,8 +323,8 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// Load the high offset into the scratch register and use it as
// an index.
TII->loadImmediate(MBB, MI, ScratchReg, HighOffset);
- BuildMI(MBB, MI, DL, TII->get(SystemZ::AGR),ScratchReg)
- .addReg(ScratchReg, RegState::Kill).addReg(BasePtr);
+ BuildMI(MBB, MI, DL, TII->get(SystemZ::LA), ScratchReg)
+ .addReg(BasePtr, RegState::Kill).addImm(0).addReg(ScratchReg);
}
// Use the scratch register as the base. It then dies here.
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
index b3266051da4e..de49106a5a60 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -204,7 +204,7 @@ def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIL(F|H|L)$")>;
def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(F|H)I$")>;
def : InstRW<[WLat1, FXa, NormalGr], (instregex "LHI(Mux)?$")>;
-def : InstRW<[WLat1, FXa, NormalGr], (instregex "LR(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LR$")>;
// Load and zero rightmost byte
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LZR(F|G)$")>;
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
index df7282a2961b..5ea269cb891d 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
@@ -205,7 +205,7 @@ def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIL(F|H|L)$")>;
def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(F|H)I$")>;
def : InstRW<[WLat1, FXa, NormalGr], (instregex "LHI(Mux)?$")>;
-def : InstRW<[WLat1, FXa, NormalGr], (instregex "LR(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LR$")>;
// Load and zero rightmost byte
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LZR(F|G)$")>;
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
index 56ceb88f35d4..6a28aec6f846 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
@@ -206,7 +206,7 @@ def : InstRW<[WLat1, FXa, NormalGr], (instregex "LLIL(F|H|L)$")>;
def : InstRW<[WLat1, FXa, NormalGr], (instregex "LG(F|H)I$")>;
def : InstRW<[WLat1, FXa, NormalGr], (instregex "LHI(Mux)?$")>;
-def : InstRW<[WLat1, FXa, NormalGr], (instregex "LR(Mux)?$")>;
+def : InstRW<[WLat1, FXa, NormalGr], (instregex "LR$")>;
// Load and zero rightmost byte
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "LZR(F|G)$")>;
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
index ca714ef1a702..9a306591a34f 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
@@ -182,7 +182,7 @@ def : InstRW<[WLat1, FXU, NormalGr], (instregex "LLIL(F|H|L)$")>;
def : InstRW<[WLat1, FXU, NormalGr], (instregex "LG(F|H)I$")>;
def : InstRW<[WLat1, FXU, NormalGr], (instregex "LHI(Mux)?$")>;
-def : InstRW<[WLat1, FXU, NormalGr], (instregex "LR(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LR$")>;
// Load and test
def : InstRW<[WLat1LSU, WLat1LSU, LSU, FXU, NormalGr], (instregex "LT(G)?$")>;
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
index fb226be678da..f3ff1dfaba75 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
@@ -187,7 +187,7 @@ def : InstRW<[WLat1, FXU, NormalGr], (instregex "LLIL(F|H|L)$")>;
def : InstRW<[WLat1, FXU, NormalGr], (instregex "LG(F|H)I$")>;
def : InstRW<[WLat1, FXU, NormalGr], (instregex "LHI(Mux)?$")>;
-def : InstRW<[WLat1, FXU, NormalGr], (instregex "LR(Mux)?$")>;
+def : InstRW<[WLat1, FXU, NormalGr], (instregex "LR$")>;
// Load and trap
def : InstRW<[WLat1LSU, FXU, LSU, NormalGr], (instregex "L(FH|G)?AT$")>;
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index 6b4f35e5ba2b..ca5ca7257bab 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -117,9 +117,8 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
return Chain1;
SDValue Dst2 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
DAG.getConstant(1, DL, PtrVT));
- SDValue Chain2 =
- DAG.getStore(Chain, DL, Byte, Dst2, DstPtrInfo.getWithOffset(1),
- /* Alignment = */ 1);
+ SDValue Chain2 = DAG.getStore(Chain, DL, Byte, Dst2,
+ DstPtrInfo.getWithOffset(1), Align(1));
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chain1, Chain2);
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
index 68e0b7ae66a4..d24e264b03a5 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -33,24 +33,32 @@ SystemZSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
if (CPUName.empty())
CPUName = "generic";
// Parse features string.
- ParseSubtargetFeatures(CPUName, FS);
+ ParseSubtargetFeatures(CPUName, /*TuneCPU*/ CPUName, FS);
// -msoft-float implies -mno-vx.
if (HasSoftFloat)
HasVector = false;
+ // -mno-vx implicitly disables all vector-related features.
+ if (!HasVector) {
+ HasVectorEnhancements1 = false;
+ HasVectorEnhancements2 = false;
+ HasVectorPackedDecimal = false;
+ HasVectorPackedDecimalEnhancement = false;
+ }
+
return *this;
}
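
As a standalone sketch of the normalization added above (illustrative struct and function names only, not the actual SystemZSubtarget code): soft-float turns off the vector facility, and turning off the vector facility turns off every feature that depends on it.

#include <cstdio>

struct SketchFeatures {
  bool HasSoftFloat = false;
  bool HasVector = true;
  bool HasVectorEnhancements1 = true;
  bool HasVectorEnhancements2 = true;
  bool HasVectorPackedDecimal = true;
  bool HasVectorPackedDecimalEnhancement = true;
};

static void normalize(SketchFeatures &F) {
  if (F.HasSoftFloat)
    F.HasVector = false;                 // -msoft-float implies -mno-vx
  if (!F.HasVector) {                    // -mno-vx disables dependent features
    F.HasVectorEnhancements1 = false;
    F.HasVectorEnhancements2 = false;
    F.HasVectorPackedDecimal = false;
    F.HasVectorPackedDecimalEnhancement = false;
  }
}

int main() {
  SketchFeatures F;
  F.HasSoftFloat = true;
  normalize(F);
  std::printf("vector=%d venh2=%d\n", F.HasVector, F.HasVectorEnhancements2); // 0 0
  return 0;
}
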
SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const TargetMachine &TM)
- : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false),
- HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false),
- HasPopulationCount(false), HasMessageSecurityAssist3(false),
- HasMessageSecurityAssist4(false), HasResetReferenceBitsMultiple(false),
- HasFastSerialization(false), HasInterlockedAccess1(false),
- HasMiscellaneousExtensions(false),
+ : SystemZGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
+ HasDistinctOps(false), HasLoadStoreOnCond(false), HasHighWord(false),
+ HasFPExtension(false), HasPopulationCount(false),
+ HasMessageSecurityAssist3(false), HasMessageSecurityAssist4(false),
+ HasResetReferenceBitsMultiple(false), HasFastSerialization(false),
+ HasInterlockedAccess1(false), HasMiscellaneousExtensions(false),
HasExecutionHint(false), HasLoadAndTrap(false),
HasTransactionalExecution(false), HasProcessorAssist(false),
HasDFPZonedConversion(false), HasEnhancedDAT2(false),
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZSubtarget.h b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZSubtarget.h
index 4b49c37fe4e6..3841063d2f61 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZSubtarget.h
@@ -112,7 +112,7 @@ public:
bool enableSubRegLiveness() const override;
// Automatically generated by tblgen.
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
// Return true if the target has the distinct-operands facility.
bool hasDistinctOps() const { return HasDistinctOps; }
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 3f467b200852..7b78dc4ad13a 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -171,12 +171,10 @@ SystemZTargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
Attribute FSAttr = F.getFnAttribute("target-features");
- std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
- ? CPUAttr.getValueAsString().str()
- : TargetCPU;
- std::string FS = !FSAttr.hasAttribute(Attribute::None)
- ? FSAttr.getValueAsString().str()
- : TargetFS;
+ std::string CPU =
+ CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU;
+ std::string FS =
+ FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS;
// FIXME: This is related to the code below to reset the target options,
// we need to know whether or not the soft float flag is set on the
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 864200e5f71c..e7ac2391512f 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -64,8 +64,9 @@ int SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
}
int SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind) {
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -340,8 +341,8 @@ unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
// Emit prefetch instructions for smaller strides in cases where we think
// the hardware prefetcher might not be able to keep up.
- if (NumStridedMemAccesses > 32 &&
- NumStridedMemAccesses == NumMemAccesses && !HasCall)
+ if (NumStridedMemAccesses > 32 && !HasCall &&
+ (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses)
return 1;
return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
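
The rewritten condition no longer requires every memory access in the loop to be strided; it tolerates up to one non-strided access per 32 strided ones. A standalone sketch of the arithmetic (illustrative only, not the TTI hook itself):

#include <cstdio>

// Emit prefetches for small strides when there are many strided accesses,
// no calls, and non-strided accesses are at most 1/32 of the strided count.
static bool prefetchSmallStrides(unsigned NumMemAccesses,
                                 unsigned NumStridedMemAccesses, bool HasCall) {
  return NumStridedMemAccesses > 32 && !HasCall &&
         (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses;
}

int main() {
  std::printf("%d\n", prefetchSmallStrides(64, 64, false)); // 1: all strided
  std::printf("%d\n", prefetchSmallStrides(66, 64, false)); // 1: 2 * 32 <= 64
  std::printf("%d\n", prefetchSmallStrides(70, 64, false)); // 0: 6 * 32 >  64
  return 0;
}
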
@@ -592,8 +593,9 @@ static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
unsigned SystemZTTIImpl::
getVectorTruncCost(Type *SrcTy, Type *DstTy) {
assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
- assert (SrcTy->getPrimitiveSizeInBits() > DstTy->getPrimitiveSizeInBits() &&
- "Packing must reduce size of vector type.");
+ assert(SrcTy->getPrimitiveSizeInBits().getFixedSize() >
+ DstTy->getPrimitiveSizeInBits().getFixedSize() &&
+ "Packing must reduce size of vector type.");
assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
cast<FixedVectorType>(DstTy)->getNumElements() &&
"Packing should not change number of elements.");
@@ -699,11 +701,12 @@ getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
}
int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::CastContextHint CCH,
TTI::TargetCostKind CostKind,
const Instruction *I) {
// FIXME: Can the logic below also be used for these cost kinds?
if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) {
- int BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I);
+ int BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
return BaseCost == 0 ? BaseCost : 1;
}
@@ -786,8 +789,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
// Return the cost of multiple scalar invocation plus the cost of
// inserting and extracting the values. Base implementation does not
// realize float->int gets scalarized.
- unsigned ScalarCost = getCastInstrCost(Opcode, Dst->getScalarType(),
- Src->getScalarType(), CostKind);
+ unsigned ScalarCost = getCastInstrCost(
+ Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind);
unsigned TotCost = VF * ScalarCost;
bool NeedsInserts = true, NeedsExtracts = true;
// FP128 registers do not get inserted or extracted.
@@ -828,7 +831,7 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
}
}
- return BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I);
+ return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}
// Scalar i8 / i16 operations will typically be made after first extending
@@ -844,11 +847,11 @@ static unsigned getOperandsExtensionCost(const Instruction *I) {
}
int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy,
+ Type *CondTy, CmpInst::Predicate VecPred,
TTI::TargetCostKind CostKind,
const Instruction *I) {
if (CostKind != TTI::TCK_RecipThroughput)
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
if (!ValTy->isVectorTy()) {
switch (Opcode) {
@@ -860,7 +863,7 @@ int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
- C->getZExtValue() == 0)
+ C->isZero())
return 0;
unsigned Cost = 1;
@@ -924,7 +927,7 @@ int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
}
}
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
}
int SystemZTTIImpl::
@@ -1019,7 +1022,7 @@ isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
// Comparison between memory and immediate.
if (UserI->getOpcode() == Instruction::ICmp)
if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
- if (isUInt<16>(CI->getZExtValue()))
+ if (CI->getValue().isIntN(16))
return true;
return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
break;
diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 7f8f7f6f923f..c97e099f9943 100644
--- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -41,7 +41,8 @@ public:
int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind);
int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty, TTI::TargetCostKind CostKind);
+ Type *Ty, TTI::TargetCostKind CostKind,
+ Instruction *Inst = nullptr);
int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
Type *Ty, TTI::TargetCostKind CostKind);
@@ -93,9 +94,10 @@ public:
unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
const Instruction *I);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
- TTI::TargetCostKind CostKind,
+ TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ CmpInst::Predicate VecPred,
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
diff --git a/contrib/llvm-project/llvm/lib/Target/TargetLoweringObjectFile.cpp b/contrib/llvm-project/llvm/lib/Target/TargetLoweringObjectFile.cpp
index eea0aeea2c45..81af4eead6d2 100644
--- a/contrib/llvm-project/llvm/lib/Target/TargetLoweringObjectFile.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/TargetLoweringObjectFile.cpp
@@ -20,6 +20,7 @@
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
@@ -49,12 +50,23 @@ void TargetLoweringObjectFile::Initialize(MCContext &ctx,
// Reset various EH DWARF encodings.
PersonalityEncoding = LSDAEncoding = TTypeEncoding = dwarf::DW_EH_PE_absptr;
CallSiteEncoding = dwarf::DW_EH_PE_uleb128;
+
+ this->TM = &TM;
}
TargetLoweringObjectFile::~TargetLoweringObjectFile() {
delete Mang;
}
+unsigned TargetLoweringObjectFile::getCallSiteEncoding() const {
+ // If the target does not have LEB128 directives, the call-site encoding
+ // needs to be udata4 so that the fallback path that avoids LEB128
+ // directives still works.
+ if (!getContext().getAsmInfo()->hasLEB128Directives())
+ return dwarf::DW_EH_PE_udata4;
+ return CallSiteEncoding;
+}
+
static bool isNullOrUndef(const Constant *C) {
// Check that the constant isn't all zeros or undefs.
if (C->isNullValue() || isa<UndefValue>(C))
@@ -136,6 +148,52 @@ void TargetLoweringObjectFile::emitPersonalityValue(MCStreamer &Streamer,
const MCSymbol *Sym) const {
}
+void TargetLoweringObjectFile::emitCGProfileMetadata(MCStreamer &Streamer,
+ Module &M) const {
+ MCContext &C = getContext();
+ SmallVector<Module::ModuleFlagEntry, 8> ModuleFlags;
+ M.getModuleFlagsMetadata(ModuleFlags);
+
+ MDNode *CFGProfile = nullptr;
+
+ for (const auto &MFE : ModuleFlags) {
+ StringRef Key = MFE.Key->getString();
+ if (Key == "CG Profile") {
+ CFGProfile = cast<MDNode>(MFE.Val);
+ break;
+ }
+ }
+
+ if (!CFGProfile)
+ return;
+
+ auto GetSym = [this](const MDOperand &MDO) -> MCSymbol * {
+ if (!MDO)
+ return nullptr;
+ auto *V = cast<ValueAsMetadata>(MDO);
+ const Function *F = cast<Function>(V->getValue()->stripPointerCasts());
+ if (F->hasDLLImportStorageClass())
+ return nullptr;
+ return TM->getSymbol(F);
+ };
+
+ for (const auto &Edge : CFGProfile->operands()) {
+ MDNode *E = cast<MDNode>(Edge);
+ const MCSymbol *From = GetSym(E->getOperand(0));
+ const MCSymbol *To = GetSym(E->getOperand(1));
+ // Skip null functions. This can happen if functions are dead stripped after
+ // the CGProfile pass has been run.
+ if (!From || !To)
+ continue;
+ uint64_t Count = cast<ConstantAsMetadata>(E->getOperand(2))
+ ->getValue()
+ ->getUniqueInteger()
+ .getZExtValue();
+ Streamer.emitCGProfileEntry(
+ MCSymbolRefExpr::create(From, MCSymbolRefExpr::VK_None, C),
+ MCSymbolRefExpr::create(To, MCSymbolRefExpr::VK_None, C), Count);
+ }
+}
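
The hook above walks the module's "CG Profile" metadata and forwards each (caller, callee, count) edge to the streamer, dropping edges whose endpoints no longer resolve to a symbol. A standalone sketch of that filtering follows; the ".cg_profile" lines it prints are only indicative of what the ELF streamer emits, and the names are illustrative:

#include <cstdint>
#include <cstdio>
#include <vector>

// Each edge carries two symbols and a call count; edges with a missing
// endpoint (dead-stripped or dllimport functions) are skipped.
struct CGEdge {
  const char *From;
  const char *To;
  uint64_t Count;
};

static void emitCGProfile(const std::vector<CGEdge> &Edges) {
  for (const CGEdge &E : Edges) {
    if (!E.From || !E.To)
      continue;
    std::printf("\t.cg_profile %s, %s, %llu\n", E.From, E.To,
                (unsigned long long)E.Count);
  }
}

int main() {
  emitCGProfile({{"caller", "callee", 400}, {"caller", nullptr, 7}});
  return 0;
}
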
/// getKindForGlobal - This is a top-level target-independent classifier for
/// a global object. Given a global variable and information from the TM, this
diff --git a/contrib/llvm-project/llvm/lib/Target/TargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/TargetMachine.cpp
index 074e9fde79e6..2aee0e5c3fb8 100644
--- a/contrib/llvm-project/llvm/lib/Target/TargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/TargetMachine.cpp
@@ -93,34 +93,30 @@ static TLSModel::Model getSelectedTLSModel(const GlobalValue *GV) {
bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
const GlobalValue *GV) const {
- // If the IR producer requested that this GV be treated as dso local, obey.
- if (GV && GV->isDSOLocal())
- return true;
-
- // If we are not supossed to use a PLT, we cannot assume that intrinsics are
- // local since the linker can convert some direct access to access via plt.
- if (M.getRtLibUseGOT() && !GV)
- return false;
+ const Triple &TT = getTargetTriple();
+ Reloc::Model RM = getRelocationModel();
// According to the llvm language reference, we should be able to
// just return false in here if we have a GV, as we know it is
// dso_preemptable. At this point in time, the various IR producers
// have not been transitioned to always produce a dso_local when it
// is possible to do so.
- // In the case of intrinsics, GV is null and there is nowhere to put
- // dso_local. Returning false for those will produce worse code in some
- // architectures. For example, on x86 the caller has to set ebx before calling
- // a plt.
+ // In the case of ExternalSymbolSDNode, GV is null and we should just return
+ // false. However, COFF currently relies on this being true.
+ //
// As a result we still have some logic in here to improve the quality of the
// generated code.
// FIXME: Add a module level metadata for whether intrinsics should be assumed
// local.
+ if (!GV)
+ return TT.isOSBinFormatCOFF();
- Reloc::Model RM = getRelocationModel();
- const Triple &TT = getTargetTriple();
+ // If the IR producer requested that this GV be treated as dso local, obey.
+ if (GV->isDSOLocal())
+ return true;
// DLLImport explicitly marks the GV as external.
- if (GV && GV->hasDLLImportStorageClass())
+ if (GV->hasDLLImportStorageClass())
return false;
// On MinGW, variables that haven't been declared with DLLImport may still
@@ -128,14 +124,14 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
// don't assume the variables to be DSO local unless we actually know
// that for sure. This only has to be done for variables; for functions
// the linker can insert thunks for calling functions from another DLL.
- if (TT.isWindowsGNUEnvironment() && TT.isOSBinFormatCOFF() && GV &&
+ if (TT.isWindowsGNUEnvironment() && TT.isOSBinFormatCOFF() &&
GV->isDeclarationForLinker() && isa<GlobalVariable>(GV))
return false;
// On COFF, don't mark 'extern_weak' symbols as DSO local. If these symbols
// remain unresolved in the link, they can be resolved to zero, which is
// outside the current DSO.
- if (TT.isOSBinFormatCOFF() && GV && GV->hasExternalWeakLinkage())
+ if (TT.isOSBinFormatCOFF() && GV->hasExternalWeakLinkage())
return false;
// Every other GV is local on COFF.
@@ -147,20 +143,10 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
if (TT.isOSBinFormatCOFF() || TT.isOSWindows())
return true;
- // Most PIC code sequences that assume that a symbol is local cannot
- // produce a 0 if it turns out the symbol is undefined. While this
- // is ABI and relocation depended, it seems worth it to handle it
- // here.
- if (GV && isPositionIndependent() && GV->hasExternalWeakLinkage())
- return false;
-
- if (GV && !GV->hasDefaultVisibility())
- return true;
-
if (TT.isOSBinFormatMachO()) {
if (RM == Reloc::Static)
return true;
- return GV && GV->isStrongDefinitionForLinker();
+ return GV->isStrongDefinitionForLinker();
}
// Due to the AIX linkage model, any global with default visibility is
@@ -170,40 +156,6 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
assert(TT.isOSBinFormatELF() || TT.isOSBinFormatWasm());
assert(RM != Reloc::DynamicNoPIC);
-
- bool IsExecutable =
- RM == Reloc::Static || M.getPIELevel() != PIELevel::Default;
- if (IsExecutable) {
- // If the symbol is defined, it cannot be preempted.
- if (GV && !GV->isDeclarationForLinker())
- return true;
-
- // A symbol marked nonlazybind should not be accessed with a plt. If the
- // symbol turns out to be external, the linker will convert a direct
- // access to an access via the plt, so don't assume it is local.
- const Function *F = dyn_cast_or_null<Function>(GV);
- if (F && F->hasFnAttribute(Attribute::NonLazyBind))
- return false;
- Triple::ArchType Arch = TT.getArch();
-
- // PowerPC prefers avoiding copy relocations.
- if (Arch == Triple::ppc || TT.isPPC64())
- return false;
-
- // Check if we can use copy relocations.
- if (!(GV && GV->isThreadLocal()) && RM == Reloc::Static)
- return true;
- } else if (TT.isOSBinFormatELF()) {
- // If dso_local allows AsmPrinter::getSymbolPreferLocal to use a local
- // alias, set the flag. We cannot set dso_local for other global values,
- // because otherwise direct accesses to a probably interposable symbol (even
- // if the codegen assumes not) will be rejected by the linker.
- if (!GV || !GV->canBenefitFromLocalAlias())
- return false;
- return TT.isX86() && M.noSemanticInterposition();
- }
-
- // ELF & wasm support preemption of other symbols.
return false;
}
@@ -281,3 +233,12 @@ TargetIRAnalysis TargetMachine::getTargetIRAnalysis() {
return TargetIRAnalysis(
[this](const Function &F) { return this->getTargetTransformInfo(F); });
}
+
+std::pair<int, int> TargetMachine::parseBinutilsVersion(StringRef Version) {
+ if (Version == "none")
+ return {INT_MAX, INT_MAX}; // Make binutilsIsAtLeast() return true.
+ std::pair<int, int> Ret;
+ if (!Version.consumeInteger(10, Ret.first) && Version.consume_front("."))
+ Version.consumeInteger(10, Ret.second);
+ return Ret;
+}
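
parseBinutilsVersion above accepts either "none" or a "major[.minor]" string, with "none" comparing newer than any real version. A standalone sketch of the same behaviour without StringRef (a missing minor component is treated as 0 here, matching the value-initialized pair above):

#include <climits>
#include <cstdio>
#include <string>
#include <utility>

static std::pair<int, int> parseBinutilsVersionSketch(const std::string &V) {
  if (V == "none")
    return {INT_MAX, INT_MAX};           // always "at least" any version
  int Major = 0, Minor = 0;
  std::sscanf(V.c_str(), "%d.%d", &Major, &Minor);
  return {Major, Minor};
}

int main() {
  auto A = parseBinutilsVersionSketch("2.35"); // {2, 35}
  auto B = parseBinutilsVersionSketch("2");    // {2, 0}
  auto C = parseBinutilsVersionSketch("none"); // {INT_MAX, INT_MAX}
  std::printf("%d.%d %d.%d %d.%d\n", A.first, A.second, B.first, B.second,
              C.first, C.second);
  return 0;
}
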
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
index 7a899b4b38e2..a3309a68c76d 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
@@ -125,6 +125,9 @@ static const MCPhysReg F128Regs[32] = {
VE::Q16, VE::Q17, VE::Q18, VE::Q19, VE::Q20, VE::Q21, VE::Q22, VE::Q23,
VE::Q24, VE::Q25, VE::Q26, VE::Q27, VE::Q28, VE::Q29, VE::Q30, VE::Q31};
+static const MCPhysReg VM512Regs[8] = {VE::VMP0, VE::VMP1, VE::VMP2, VE::VMP3,
+ VE::VMP4, VE::VMP5, VE::VMP6, VE::VMP7};
+
static const MCPhysReg MISCRegs[31] = {
VE::USRCC, VE::PSW, VE::SAR, VE::NoRegister,
VE::NoRegister, VE::NoRegister, VE::NoRegister, VE::PMMR,
@@ -277,6 +280,17 @@ public:
}
return false;
}
+ bool isUImm4() {
+ if (!isImm())
+ return false;
+
+ // Constant case
+ if (const auto *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Val)) {
+ int64_t Value = ConstExpr->getValue();
+ return isUInt<4>(Value);
+ }
+ return false;
+ }
bool isUImm6() {
if (!isImm())
return false;
@@ -476,6 +490,10 @@ public:
addImmOperands(Inst, N);
}
+ void addUImm4Operands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+
void addUImm6Operands(MCInst &Inst, unsigned N) const {
addImmOperands(Inst, N);
}
@@ -648,6 +666,15 @@ public:
return true;
}
+ static bool MorphToVM512Reg(VEOperand &Op) {
+ unsigned Reg = Op.getReg();
+ unsigned regIdx = Reg - VE::VM0;
+ if (regIdx % 2 || regIdx > 15)
+ return false;
+ Op.Reg.RegNum = VM512Regs[regIdx / 2];
+ return true;
+ }
+
static bool MorphToMISCReg(VEOperand &Op) {
const auto *ConstExpr = dyn_cast<MCConstantExpr>(Op.getImm());
if (!ConstExpr)
@@ -902,6 +929,24 @@ StringRef VEAsmParser::splitMnemonic(StringRef Name, SMLoc NameLoc,
Mnemonic = parseRD(Name, 10, NameLoc, Operands);
} else if (Name.startswith("cvt.l.d")) {
Mnemonic = parseRD(Name, 7, NameLoc, Operands);
+ } else if (Name.startswith("vcvt.w.d.sx") || Name.startswith("vcvt.w.d.zx") ||
+ Name.startswith("vcvt.w.s.sx") || Name.startswith("vcvt.w.s.zx")) {
+ Mnemonic = parseRD(Name, 11, NameLoc, Operands);
+ } else if (Name.startswith("vcvt.l.d")) {
+ Mnemonic = parseRD(Name, 8, NameLoc, Operands);
+ } else if (Name.startswith("pvcvt.w.s.lo") ||
+ Name.startswith("pvcvt.w.s.up")) {
+ Mnemonic = parseRD(Name, 12, NameLoc, Operands);
+ } else if (Name.startswith("pvcvt.w.s")) {
+ Mnemonic = parseRD(Name, 9, NameLoc, Operands);
+ } else if (Name.startswith("vfmk.l.") || Name.startswith("vfmk.w.") ||
+ Name.startswith("vfmk.d.") || Name.startswith("vfmk.s.")) {
+ bool ICC = Name[5] == 'l' || Name[5] == 'w' ? true : false;
+ Mnemonic = parseCC(Name, 7, Name.size(), ICC, true, NameLoc, Operands);
+ } else if (Name.startswith("pvfmk.w.lo.") || Name.startswith("pvfmk.w.up.") ||
+ Name.startswith("pvfmk.s.lo.") || Name.startswith("pvfmk.s.up.")) {
+ bool ICC = Name[6] == 'l' || Name[6] == 'w' ? true : false;
+ Mnemonic = parseCC(Name, 11, Name.size(), ICC, true, NameLoc, Operands);
} else {
Operands->push_back(VEOperand::CreateToken(Mnemonic, NameLoc));
}
@@ -1362,9 +1407,38 @@ OperandMatchResultTy VEAsmParser::parseOperand(OperandVector &Operands,
return ResTy;
switch (getLexer().getKind()) {
- case AsmToken::LParen:
- // FIXME: Parsing "(" + %vreg + ", " + %vreg + ")"
- // FALLTHROUGH
+ case AsmToken::LParen: {
+ // Parsing "(" + %vreg + ", " + %vreg + ")"
+ const AsmToken Tok1 = Parser.getTok();
+ Parser.Lex(); // Eat the '('.
+
+ unsigned RegNo1;
+ SMLoc S1, E1;
+ if (tryParseRegister(RegNo1, S1, E1) != MatchOperand_Success) {
+ getLexer().UnLex(Tok1);
+ return MatchOperand_NoMatch;
+ }
+
+ if (!Parser.getTok().is(AsmToken::Comma))
+ return MatchOperand_ParseFail;
+ Parser.Lex(); // Eat the ','.
+
+ unsigned RegNo2;
+ SMLoc S2, E2;
+ if (tryParseRegister(RegNo2, S2, E2) != MatchOperand_Success)
+ return MatchOperand_ParseFail;
+
+ if (!Parser.getTok().is(AsmToken::RParen))
+ return MatchOperand_ParseFail;
+
+ Operands.push_back(VEOperand::CreateToken(Tok1.getString(), Tok1.getLoc()));
+ Operands.push_back(VEOperand::CreateReg(RegNo1, S1, E1));
+ Operands.push_back(VEOperand::CreateReg(RegNo2, S2, E2));
+ Operands.push_back(VEOperand::CreateToken(Parser.getTok().getString(),
+ Parser.getTok().getLoc()));
+ Parser.Lex(); // Eat the ')'.
+ break;
+ }
default: {
std::unique_ptr<VEOperand> Op;
ResTy = parseVEAsmOperand(Op);
@@ -1377,7 +1451,24 @@ OperandMatchResultTy VEAsmParser::parseOperand(OperandVector &Operands,
if (!Parser.getTok().is(AsmToken::LParen))
break;
- // FIXME: Parsing %vec-reg + "(" + %sclar-reg/number + ")"
+ // Parsing %vec-reg + "(" + %scalar-reg/number + ")"
+ std::unique_ptr<VEOperand> Op1 = VEOperand::CreateToken(
+ Parser.getTok().getString(), Parser.getTok().getLoc());
+ Parser.Lex(); // Eat the '('.
+
+ std::unique_ptr<VEOperand> Op2;
+ ResTy = parseVEAsmOperand(Op2);
+ if (ResTy != MatchOperand_Success || !Op2)
+ return MatchOperand_ParseFail;
+
+ if (!Parser.getTok().is(AsmToken::RParen))
+ return MatchOperand_ParseFail;
+
+ Operands.push_back(std::move(Op1));
+ Operands.push_back(std::move(Op2));
+ Operands.push_back(VEOperand::CreateToken(Parser.getTok().getString(),
+ Parser.getTok().getLoc()));
+ Parser.Lex(); // Eat the ')'.
break;
}
}
@@ -1445,6 +1536,10 @@ unsigned VEAsmParser::validateTargetOperandClass(MCParsedAsmOperand &GOp,
if (Op.isReg() && VEOperand::MorphToF128Reg(Op))
return MCTargetAsmParser::Match_Success;
break;
+ case MCK_VM512:
+ if (Op.isReg() && VEOperand::MorphToVM512Reg(Op))
+ return MCTargetAsmParser::Match_Success;
+ break;
case MCK_MISC:
if (Op.isImm() && VEOperand::MorphToMISCReg(Op))
return MCTargetAsmParser::Match_Success;
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp b/contrib/llvm-project/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
index 35885a4e3cae..20d609bc6b32 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
@@ -47,7 +47,7 @@ static MCDisassembler *createVEDisassembler(const Target &T,
return new VEDisassembler(STI, Ctx);
}
-extern "C" void LLVMInitializeVEDisassembler() {
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVEDisassembler() {
// Register the disassembler.
TargetRegistry::RegisterMCDisassembler(getTheVETarget(),
createVEDisassembler);
@@ -95,6 +95,25 @@ static const unsigned F128RegDecoderTable[] = {
VE::Q16, VE::Q17, VE::Q18, VE::Q19, VE::Q20, VE::Q21, VE::Q22, VE::Q23,
VE::Q24, VE::Q25, VE::Q26, VE::Q27, VE::Q28, VE::Q29, VE::Q30, VE::Q31};
+static const unsigned V64RegDecoderTable[] = {
+ VE::V0, VE::V1, VE::V2, VE::V3, VE::V4, VE::V5, VE::V6, VE::V7,
+ VE::V8, VE::V9, VE::V10, VE::V11, VE::V12, VE::V13, VE::V14, VE::V15,
+ VE::V16, VE::V17, VE::V18, VE::V19, VE::V20, VE::V21, VE::V22, VE::V23,
+ VE::V24, VE::V25, VE::V26, VE::V27, VE::V28, VE::V29, VE::V30, VE::V31,
+ VE::V32, VE::V33, VE::V34, VE::V35, VE::V36, VE::V37, VE::V38, VE::V39,
+ VE::V40, VE::V41, VE::V42, VE::V43, VE::V44, VE::V45, VE::V46, VE::V47,
+ VE::V48, VE::V49, VE::V50, VE::V51, VE::V52, VE::V53, VE::V54, VE::V55,
+ VE::V56, VE::V57, VE::V58, VE::V59, VE::V60, VE::V61, VE::V62, VE::V63};
+
+static const unsigned VMRegDecoderTable[] = {
+ VE::VM0, VE::VM1, VE::VM2, VE::VM3, VE::VM4, VE::VM5,
+ VE::VM6, VE::VM7, VE::VM8, VE::VM9, VE::VM10, VE::VM11,
+ VE::VM12, VE::VM13, VE::VM14, VE::VM15};
+
+static const unsigned VM512RegDecoderTable[] = {VE::VMP0, VE::VMP1, VE::VMP2,
+ VE::VMP3, VE::VMP4, VE::VMP5,
+ VE::VMP6, VE::VMP7};
+
static const unsigned MiscRegDecoderTable[] = {
VE::USRCC, VE::PSW, VE::SAR, VE::NoRegister,
VE::NoRegister, VE::NoRegister, VE::NoRegister, VE::PMMR,
@@ -145,6 +164,40 @@ static DecodeStatus DecodeF128RegisterClass(MCInst &Inst, unsigned RegNo,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeV64RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ unsigned Reg = VE::NoRegister;
+ if (RegNo == 255)
+ Reg = VE::VIX;
+ else if (RegNo > 63)
+ return MCDisassembler::Fail;
+ else
+ Reg = V64RegDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeVMRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 15)
+ return MCDisassembler::Fail;
+ unsigned Reg = VMRegDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeVM512RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo % 2 || RegNo > 15)
+ return MCDisassembler::Fail;
+ unsigned Reg = VM512RegDecoderTable[RegNo / 2];
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
+
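
Each 512-bit mask register VMPn pairs up the even/odd registers VM(2n) and VM(2n+1), so only even raw encodings up to 14 are decodable; the same check appears in MorphToVM512Reg on the assembler side. A standalone sketch of the index check (illustrative only):

#include <cstdio>

// Odd or out-of-range encodings fail (MCDisassembler::Fail in the real
// decoder); even ones map to the VMP register at half the index.
static int decodeVM512Index(unsigned RegNo) {
  if (RegNo % 2 || RegNo > 15)
    return -1;
  return RegNo / 2;            // index into {VMP0, ..., VMP7}
}

int main() {
  std::printf("%d %d %d\n", decodeVM512Index(4), decodeVM512Index(5),
              decodeVM512Index(16)); // 2 -1 -1
  return 0;
}
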
static DecodeStatus DecodeMISCRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder) {
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/LVLGen.cpp b/contrib/llvm-project/llvm/lib/Target/VE/LVLGen.cpp
new file mode 100644
index 000000000000..c4588926af9e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/VE/LVLGen.cpp
@@ -0,0 +1,137 @@
+//===-- LVLGen.cpp - LVL instruction generator ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "VE.h"
+#include "VESubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lvl-gen"
+
+namespace {
+struct LVLGen : public MachineFunctionPass {
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+
+ static char ID;
+ LVLGen() : MachineFunctionPass(ID) {}
+ bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+ unsigned getVL(const MachineInstr &MI);
+ int getVLIndex(unsigned Opcode);
+};
+char LVLGen::ID = 0;
+
+} // end of anonymous namespace
+
+FunctionPass *llvm::createLVLGenPass() { return new LVLGen; }
+
+int LVLGen::getVLIndex(unsigned Opcode) {
+ const MCInstrDesc &MCID = TII->get(Opcode);
+
+ // If an instruction has VLIndex information, return it.
+ if (HAS_VLINDEX(MCID.TSFlags))
+ return GET_VLINDEX(MCID.TSFlags);
+
+ return -1;
+}
+
+// Returns the register holding the vector length. NoRegister is returned
+// when this MI does not have a vector-length operand.
+unsigned LVLGen::getVL(const MachineInstr &MI) {
+ int Index = getVLIndex(MI.getOpcode());
+ if (Index >= 0)
+ return MI.getOperand(Index).getReg();
+
+ return VE::NoRegister;
+}
+
+bool LVLGen::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+#define RegName(no) \
+ (MBB.getParent()->getSubtarget<VESubtarget>().getRegisterInfo()->getName(no))
+
+ bool Changed = false;
+ bool HasRegForVL = false;
+ unsigned RegForVL;
+
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end();) {
+ MachineBasicBlock::iterator MI = I;
+
+ // Check whether MI uses a vector-length operand. If so, we prepare the VL
+ // register. We would like to reuse the VL register as much as possible and
+ // to keep the number of LEA instructions as few as possible. Therefore, we
+ // use a regular scalar register to hold the immediate value loaded into the
+ // VL register, and we try to reuse identical scalar registers to avoid
+ // emitting new LVLr instructions.
+ unsigned Reg = getVL(*MI);
+ if (Reg != VE::NoRegister) {
+ LLVM_DEBUG(dbgs() << "Vector instruction found: ");
+ LLVM_DEBUG(MI->dump());
+ LLVM_DEBUG(dbgs() << "Vector length is " << RegName(Reg) << ". ");
+ LLVM_DEBUG(dbgs() << "Current VL is "
+ << (HasRegForVL ? RegName(RegForVL) : "unknown")
+ << ". ");
+
+ if (!HasRegForVL || RegForVL != Reg) {
+ // The VL comes from a different scalar register (or no VL is currently
+ // loaded), so generate a new LVL instruction just before this instruction.
+ LLVM_DEBUG(dbgs() << "Generate a LVL instruction to load "
+ << RegName(Reg) << ".\n");
+ BuildMI(MBB, I, MI->getDebugLoc(), TII->get(VE::LVLr)).addReg(Reg);
+ HasRegForVL = true;
+ RegForVL = Reg;
+ Changed = true;
+ } else {
+ LLVM_DEBUG(dbgs() << "Reuse current VL.\n");
+ }
+ }
+ // Check the update of a given scalar register holding an immediate value
+ // for VL register. Also, a call doesn't preserve VL register.
+ if (HasRegForVL) {
+ if (MI->definesRegister(RegForVL, TRI) ||
+ MI->modifiesRegister(RegForVL, TRI) ||
+ MI->killsRegister(RegForVL, TRI) || MI->isCall()) {
+ // The cached VL must be reloaded before the next use, so clear HasRegForVL.
+ LLVM_DEBUG(dbgs() << RegName(RegForVL) << " is needed to be updated: ");
+ LLVM_DEBUG(MI->dump());
+ HasRegForVL = false;
+ }
+ }
+
+ ++I;
+ }
+ return Changed;
+}
+
+bool LVLGen::runOnMachineFunction(MachineFunction &F) {
+ LLVM_DEBUG(dbgs() << "********** Begin LVLGen **********\n");
+ LLVM_DEBUG(dbgs() << "********** Function: " << F.getName() << '\n');
+ LLVM_DEBUG(F.dump());
+
+ bool Changed = false;
+
+ const VESubtarget &Subtarget = F.getSubtarget<VESubtarget>();
+ TII = Subtarget.getInstrInfo();
+ TRI = Subtarget.getRegisterInfo();
+
+ for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
+ Changed |= runOnMachineBasicBlock(*FI);
+
+ if (Changed) {
+ LLVM_DEBUG(dbgs() << "\n");
+ LLVM_DEBUG(F.dump());
+ }
+ LLVM_DEBUG(dbgs() << "********** End LVLGen **********\n");
+ return Changed;
+}
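
In effect the pass is a small per-block state machine: remember which scalar register the VL was last loaded from, insert an LVLr only when that changes, and forget the cached value whenever that register is clobbered or a call intervenes. A standalone sketch of that bookkeeping (illustrative, not the MachineFunction pass itself):

#include <cstdio>
#include <vector>

// Each entry models one instruction: VLReg is the scalar register carrying
// the vector length (-1 if the instruction has none), ClobbersVL is true if
// it redefines that register or is a call.
struct SketchInst {
  int VLReg;
  bool ClobbersVL;
};

static int countLVLInsertions(const std::vector<SketchInst> &Block) {
  int Inserted = 0;
  int CurVL = -1;                    // no known VL at block entry
  for (const SketchInst &I : Block) {
    if (I.VLReg != -1 && I.VLReg != CurVL) {
      ++Inserted;                    // emit LVLr just before this instruction
      CurVL = I.VLReg;
    }
    if (I.ClobbersVL)
      CurVL = -1;                    // cached VL can no longer be trusted
  }
  return Inserted;
}

int main() {
  // Two vector ops using %s1, a call, then another vector op on %s1: the
  // call invalidates the cached VL, so two LVLr insertions are needed.
  std::vector<SketchInst> Block = {{1, false}, {1, false}, {-1, true}, {1, false}};
  std::printf("%d\n", countLVLInsertions(Block)); // 2
  return 0;
}
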
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.h
index 657cc513b3c5..6995007c6dc6 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.h
@@ -29,6 +29,7 @@ public:
const MCSubtargetInfo &STI, raw_ostream &OS) override;
// Autogenerated by tblgen.
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
bool printAliasInstr(const MCInst *, uint64_t Address,
const MCSubtargetInfo &, raw_ostream &);
void printInstruction(const MCInst *, uint64_t, const MCSubtargetInfo &,
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
index a39cffc8f4a6..4c480c050274 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
@@ -56,8 +56,8 @@ static MCRegisterInfo *createVEMCRegisterInfo(const Triple &TT) {
static MCSubtargetInfo *createVEMCSubtargetInfo(const Triple &TT, StringRef CPU,
StringRef FS) {
if (CPU.empty())
- CPU = "ve";
- return createVEMCSubtargetInfoImpl(TT, CPU, FS);
+ CPU = "generic";
+ return createVEMCSubtargetInfoImpl(TT, CPU, /*TuneCPU=*/CPU, FS);
}
static MCTargetStreamer *
@@ -80,7 +80,7 @@ static MCInstPrinter *createVEMCInstPrinter(const Triple &T,
return new VEInstPrinter(MAI, MII, MRI);
}
-extern "C" void LLVMInitializeVETargetMC() {
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETargetMC() {
// Register the MC asm info.
RegisterMCAsmInfoFn X(getTheVETarget(), createVEMCAsmInfo);
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp b/contrib/llvm-project/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp
index 65bd142fe0db..a95a299def88 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp
@@ -16,7 +16,7 @@ Target &llvm::getTheVETarget() {
return TheVETarget;
}
-extern "C" void LLVMInitializeVETargetInfo() {
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETargetInfo() {
RegisterTarget<Triple::ve, /*HasJIT=*/false> X(getTheVETarget(), "ve",
"VE", "VE");
}
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VE.h b/contrib/llvm-project/llvm/lib/Target/VE/VE.h
index 7ed7797cbb83..a404f7ced70a 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/VE.h
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VE.h
@@ -29,6 +29,7 @@ class MachineInstr;
FunctionPass *createVEISelDag(VETargetMachine &TM);
FunctionPass *createVEPromoteToI1Pass();
+FunctionPass *createLVLGenPass();
void LowerVEMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
AsmPrinter &AP);
@@ -349,6 +350,24 @@ inline static bool isMImm32Val(uint32_t Val) {
return (Val & (1 << 31)) && isShiftedMask_32(Val);
}
+/// val2MImm - Convert an integer immediate value to target MImm immediate.
+inline static uint64_t val2MImm(uint64_t Val) {
+ if (Val == 0)
+ return 0; // (0)1
+ if (Val & (1UL << 63))
+ return countLeadingOnes(Val); // (m)1
+ return countLeadingZeros(Val) | 0x40; // (m)0
+}
+
+/// mimm2Val - Convert a target MImm immediate to an integer immediate value.
+inline static uint64_t mimm2Val(uint64_t Val) {
+ if (Val == 0)
+ return 0; // (0)1
+ if ((Val & 0x40) == 0)
+ return (uint64_t)((1L << 63) >> (Val & 0x3f)); // (m)1
+ return ((uint64_t)(-1L) >> (Val & 0x3f)); // (m)0
+}
+
inline unsigned M0(unsigned Val) { return Val + 64; }
inline unsigned M1(unsigned Val) { return Val; }
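The val2MImm/mimm2Val helpers above are terse, so a small standalone sketch of the encoding may help: (m)1 denotes m leading one bits followed by zeros, (m)0 denotes m leading zero bits followed by ones, and the 7-bit encoding is m for (m)1 and m|0x40 for (m)0. This is plain C++ with no LLVM dependencies; the helper names (leadingOnes, classifyMImm) are made up for illustration, and the sketch adds a validity check that val2MImm itself leaves to its callers.

#include <cstdint>
#include <cstdio>

// Count consecutive 1 bits starting from the MSB.
static int leadingOnes(uint64_t V) {
  int N = 0;
  while (N < 64 && (V >> 63)) {
    V <<= 1;
    ++N;
  }
  return N;
}

// Classify V as a VE M-immediate. Returns the 7-bit encoding (m for "(m)1",
// m | 0x40 for "(m)0"), or -1 if V is not representable as an MImm.
static int classifyMImm(uint64_t V) {
  if (V == 0)
    return 0;                          // (0)1: all zero bits
  int Ones = leadingOnes(V);
  int Zeros = leadingOnes(~V);
  if (Ones > 0 && (Ones == 64 || (V << Ones) == 0))
    return Ones;                       // (m)1: m ones, then zeros
  if (Zeros > 0 && (~V << Zeros) == 0)
    return Zeros | 0x40;               // (m)0: m zeros, then ones
  return -1;                           // mixed bit patterns are not MImm values
}

int main() {
  // (32)0 = 0x00000000FFFFFFFF, the mask used by "and %got, %got, (32)0" above.
  std::printf("%d\n", classifyMImm(UINT64_C(0x00000000FFFFFFFF))); // 96 (= 32|0x40)
  // (60)1 = 0xFFFFFFFFFFFFFFF0, a mask that clears the low 4 bits.
  std::printf("%d\n", classifyMImm(UINT64_C(0xFFFFFFFFFFFFFFF0))); // 60
  std::printf("%d\n", classifyMImm(UINT64_C(0x0123)));             // -1
}

For values that are valid M-immediates, these are the same encodings val2MImm computes; the instruction printer then renders them in assembly as "(32)0" or "(60)1".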
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VE.td b/contrib/llvm-project/llvm/lib/Target/VE/VE.td
index 617a6ea458b6..9e8adcd42077 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/VE.td
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VE.td
@@ -18,6 +18,9 @@ include "llvm/Target/Target.td"
//===----------------------------------------------------------------------===//
// VE Subtarget features.
//
+def FeatureEnableVPU
+ : SubtargetFeature<"vpu", "EnableVPU", "true",
+ "Enable the VPU">;
//===----------------------------------------------------------------------===//
// Register File, Calling Conv, Instruction Descriptions
@@ -43,7 +46,7 @@ def VEAsmParser : AsmParser {
class Proc<string Name, list<SubtargetFeature> Features>
: Processor<Name, NoItineraries, Features>;
-def : Proc<"ve", []>;
+def : Proc<"generic", []>;
//===----------------------------------------------------------------------===//
// Declare the target which we are implementing
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VEAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/VE/VEAsmPrinter.cpp
index 86e3aa3d3fa1..08a75b6b8c55 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/VEAsmPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VEAsmPrinter.cpp
@@ -60,6 +60,9 @@ public:
static const char *getRegisterName(unsigned RegNo) {
return VEInstPrinter::getRegisterName(RegNo);
}
+ void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &OS);
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *ExtraCode, raw_ostream &O) override;
};
} // end of anonymous namespace
@@ -203,7 +206,7 @@ void VEAsmPrinter::lowerGETGOTAndEmitMCInsts(const MachineInstr *MI,
// lea %got, _GLOBAL_OFFSET_TABLE_@PC_LO(-24)
// and %got, %got, (32)0
// sic %plt
- // lea.sl %got, _GLOBAL_OFFSET_TABLE_@PC_HI(%got, %plt)
+ // lea.sl %got, _GLOBAL_OFFSET_TABLE_@PC_HI(%plt, %got)
MCOperand cim24 = MCOperand::createImm(-24);
MCOperand loImm =
createGOTRelExprOp(VEMCExpr::VK_VE_PC_LO32, GOTLabel, OutContext);
@@ -248,10 +251,10 @@ void VEAsmPrinter::lowerGETFunPLTAndEmitMCInsts(const MachineInstr *MI,
MCOperand RegPLT = MCOperand::createReg(VE::SX16); // PLT
- // lea %dst, %plt_lo(func)(-24)
+ // lea %dst, func@plt_lo(-24)
// and %dst, %dst, (32)0
// sic %plt ; FIXME: is it safe to use %plt here?
- // lea.sl %dst, %plt_hi(func)(%dst, %plt)
+ // lea.sl %dst, func@plt_hi(%plt, %dst)
MCOperand cim24 = MCOperand::createImm(-24);
MCOperand loImm =
createGOTRelExprOp(VEMCExpr::VK_VE_PLT_LO32, AddrSym, OutContext);
@@ -295,7 +298,7 @@ void VEAsmPrinter::lowerGETTLSAddrAndEmitMCInsts(const MachineInstr *MI,
// lea %s0, sym@tls_gd_lo(-24)
// and %s0, %s0, (32)0
// sic %lr
- // lea.sl %s0, sym@tls_gd_hi(%s0, %lr)
+ // lea.sl %s0, sym@tls_gd_hi(%lr, %s0)
// lea %s12, __tls_get_addr@plt_lo(8)
// and %s12, %s12, (32)0
// lea.sl %s12, __tls_get_addr@plt_hi(%s12, %lr)
@@ -349,7 +352,42 @@ void VEAsmPrinter::emitInstruction(const MachineInstr *MI) {
} while ((++I != E) && I->isInsideBundle()); // Delay slot check.
}
+void VEAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
+ raw_ostream &O) {
+ const MachineOperand &MO = MI->getOperand(OpNum);
+
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ O << "%" << StringRef(getRegisterName(MO.getReg())).lower();
+ break;
+ default:
+ llvm_unreachable("<unknown operand type>");
+ }
+}
+
+// PrintAsmOperand - Print out an operand for an inline asm expression.
+bool VEAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *ExtraCode, raw_ostream &O) {
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0)
+ return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default:
+ // See if this is a generic print operand
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O);
+ case 'r':
+ case 'v':
+ break;
+ }
+ }
+
+ printOperand(MI, OpNo, O);
+
+ return false;
+}
+
// Force static initialization.
-extern "C" void LLVMInitializeVEAsmPrinter() {
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVEAsmPrinter() {
RegisterAsmPrinter<VEAsmPrinter> X(getTheVETarget());
}
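For context on the printOperand/PrintAsmOperand additions above: they let GNU-style inline-assembly operands with the default, 'r', or 'v' modifiers be printed as lower-case VE register names. A rough, hypothetical user-level example follows (it assumes a VE cross-compiler; the adds.l form mirrors comments elsewhere in this patch and is not taken from the patch itself):

// Hypothetical C++ compiled for VE. The "r" constraints route through
// VEAsmPrinter::PrintAsmOperand, which falls back to printOperand and emits
// the scalar register name, e.g. "%s0", into the assembly string.
long add_five(long X) {
  long R;
  asm("adds.l %0, 5, %1" : "=r"(R) : "r"(X));
  return R;
}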
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VECallingConv.td b/contrib/llvm-project/llvm/lib/Target/VE/VECallingConv.td
index 4f04dae884ab..93899c2cae3d 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/VECallingConv.td
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VECallingConv.td
@@ -14,71 +14,133 @@
// Aurora VE
//===----------------------------------------------------------------------===//
def CC_VE_C_Stack: CallingConv<[
- // float --> need special handling like below.
- // 0 4
- // +------+------+
- // | empty| float|
- // +------+------+
- CCIfType<[f32], CCCustom<"allocateFloat">>,
+ // F128 values are assigned to the stack in 16-byte aligned units
+ CCIfType<[f128], CCAssignToStackWithShadow<16, 16, [SX7]>>,
// All of the rest are assigned to the stack in 8-byte aligned units.
CCAssignToStack<0, 8>
]>;
-def CC_VE : CallingConv<[
+///// C Calling Convention (VE ABI v2.1) /////
+//
+// Reference: https://www.nec.com/en/global/prod/hpc/aurora/document/VE-ABI_v2.1.pdf
+//
+def CC_VE_C : CallingConv<[
// All arguments get passed in generic registers if there is space.
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
-
- // bool, char, int, enum, long --> generic integer 32 bit registers
- CCIfType<[i32], CCAssignToRegWithShadow<
- [SW0, SW1, SW2, SW3, SW4, SW5, SW6, SW7],
- [SX0, SX1, SX2, SX3, SX4, SX5, SX6, SX7]>>,
+ // Promote i1/i8/i16/i32 arguments to i64.
+ CCIfType<[i1, i8, i16, i32], CCPromoteToType<i64>>,
- // float --> generic floating point 32 bit registers
- CCIfType<[f32], CCAssignToRegWithShadow<
- [SF0, SF1, SF2, SF3, SF4, SF5, SF6, SF7],
- [SX0, SX1, SX2, SX3, SX4, SX5, SX6, SX7]>>,
+ // Convert float arguments to i64 with padding.
+ // 63 31 0
+ // +------+------+
+ // | float| 0 |
+ // +------+------+
+ CCIfType<[f32], CCBitConvertToType<i64>>,
- // long long/double --> generic 64 bit registers
+ // bool, char, int, enum, long, long long, float, double
+ // --> generic 64 bit registers
CCIfType<[i64, f64],
CCAssignToReg<[SX0, SX1, SX2, SX3, SX4, SX5, SX6, SX7]>>,
+ // long double --> pair of generic 64 bit registers
+ //
+ // NOTE: If Q1 is allocated while SX1 is free, llvm tries to allocate SX1 for
+ // following operands, this masks SX1 to avoid such behavior.
+ CCIfType<[f128],
+ CCAssignToRegWithShadow<[Q0, Q1, Q2, Q3],
+ [SX0, SX1, SX3, SX5]>>,
+
// Alternatively, they are assigned to the stack in 8-byte aligned units.
CCDelegateTo<CC_VE_C_Stack>
]>;
+///// Standard vararg C Calling Convention (VE ABI v2.1) /////
// All arguments get passed in stack for varargs function or non-prototyped
// function.
def CC_VE2 : CallingConv<[
- // float --> need special handling like below.
- // 0 4
+ // Promote i1/i8/i16/i32 arguments to i64.
+ CCIfType<[i1, i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Convert float arguments to i64 with padding.
+ // 63 31 0
// +------+------+
- // | empty| float|
+ // | float| 0 |
// +------+------+
- CCIfType<[f32], CCCustom<"allocateFloat">>,
+ CCIfType<[f32], CCBitConvertToType<i64>>,
+
+ // F128 values are assigned to the stack in 16-byte aligned units
+ CCIfType<[f128], CCAssignToStack<16, 16>>,
CCAssignToStack<0, 8>
]>;
-def RetCC_VE : CallingConv<[
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+def RetCC_VE_C : CallingConv<[
+ // Promote i1/i8/i16/i32 return values to i64.
+ CCIfType<[i1, i8, i16, i32], CCPromoteToType<i64>>,
- // bool, char, int, enum, long --> generic integer 32 bit registers
- CCIfType<[i32], CCAssignToRegWithShadow<
- [SW0, SW1, SW2, SW3, SW4, SW5, SW6, SW7],
- [SX0, SX1, SX2, SX3, SX4, SX5, SX6, SX7]>>,
-
- // float --> generic floating point 32 bit registers
- CCIfType<[f32], CCAssignToRegWithShadow<
- [SF0, SF1, SF2, SF3, SF4, SF5, SF6, SF7],
- [SX0, SX1, SX2, SX3, SX4, SX5, SX6, SX7]>>,
+ // Convert float return values to i64 with padding.
+ // 63 31 0
+ // +------+------+
+ // | float| 0 |
+ // +------+------+
+ CCIfType<[f32], CCBitConvertToType<i64>>,
- // long long/double --> generic 64 bit registers
+ // bool, char, int, enum, long, long long, float, double
+ // --> generic 64 bit registers
CCIfType<[i64, f64],
CCAssignToReg<[SX0, SX1, SX2, SX3, SX4, SX5, SX6, SX7]>>,
+
+ // long double --> pair of generic 64 bit registers
+ CCIfType<[f128],
+ CCAssignToRegWithShadow<[Q0, Q1, Q2, Q3],
+ [SX0, SX1, SX3, SX5]>>,
+]>;
+
+///// Custom fastcc /////
+//
+// This passes vector params and return values in registers. Scalar values are
+// handled according to the standard CC.
+def CC_VE_Fast : CallingConv<[
+ // vector --> generic vector registers
+ CCIfType<[v256i32, v256f32, v256i64, v256f64],
+ CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>,
+ // TODO: make this conditional on packed mode
+ CCIfType<[v512i32, v512f32],
+ CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>,
+
+ // vector mask --> generic vector mask registers
+ CCIfType<[v256i1],
+ CCAssignToReg<[VM1, VM2, VM3, VM4, VM5, VM6, VM7]>>,
+
+ // pair of vector mask --> generic vector mask registers
+ CCIfType<[v512i1],
+ CCAssignToRegWithShadow<[VMP1, VMP2, VMP3],
+ [VM1, VM3, VM5]>>,
+
+ // Follow the standard C CC for scalars.
+ CCDelegateTo<CC_VE_C>
+]>;
+
+def RetCC_VE_Fast : CallingConv<[
+ // vector --> generic vector registers
+ CCIfType<[v256i32, v256f32, v256i64, v256f64],
+ CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>,
+ // TODO: make this conditional on packed mode
+ CCIfType<[v512i32, v512f32],
+ CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>,
+
+ // vector mask --> generic vector mask registers
+ CCIfType<[v256i1],
+ CCAssignToReg<[VM1, VM2, VM3, VM4, VM5, VM6, VM7]>>,
+
+ // pair of vector mask --> generic vector mask registers
+ CCIfType<[v512i1],
+ CCAssignToRegWithShadow<[VMP1, VMP2, VMP3],
+ [VM1, VM3, VM5]>>,
+
+ // Follow the standard C CC for scalars.
+ CCDelegateTo<RetCC_VE_C>
]>;
// Callee-saved registers
@@ -86,4 +148,6 @@ def CSR : CalleeSavedRegs<(add (sequence "SX%u", 18, 33))>;
def CSR_NoRegs : CalleeSavedRegs<(add)>;
// PreserveAll (clobbers s62,s63) - used for ve_grow_stack
-def CSR_preserve_all : CalleeSavedRegs<(add (sequence "SX%u", 0, 61))>;
+def CSR_preserve_all : CalleeSavedRegs<(add (sequence "SX%u", 0, 61),
+ (sequence "V%u", 0, 63),
+ (sequence "VM%u", 1, 15))>;
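The CCBitConvertToType<i64> rules above implement the diagrammed scheme in which an f32 value travels in the upper 32 bits of a 64-bit location. A minimal standalone sketch of that layout (plain C++, independent of LLVM; packF32/unpackF32 are illustrative names, not part of the patch):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Place the float's bit pattern in bits 63..32 of an i64 location:
//   63      31      0
//   +-------+-------+
//   | float |   0   |
//   +-------+-------+
static uint64_t packF32(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return static_cast<uint64_t>(Bits) << 32;
}

static float unpackF32(uint64_t Loc) {
  uint32_t Bits = static_cast<uint32_t>(Loc >> 32);
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

int main() {
  uint64_t Loc = packF32(1.0f);
  std::printf("loc = 0x%016llx back = %f\n",
              static_cast<unsigned long long>(Loc), unpackF32(Loc));
  // Prints: loc = 0x3f80000000000000 back = 1.000000
}

Stored to the stack in VE's little-endian byte order, those upper 32 bits land at byte offset 4 of the 8-byte slot, which is the same "| empty | float |" picture and the +4 offset adjustment that appear later in LowerFormalArguments.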
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VEFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/VE/VEFrameLowering.cpp
index 8b10e6466123..9e97d0eca833 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/VEFrameLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VEFrameLowering.cpp
@@ -8,6 +8,105 @@
//
// This file contains the VE implementation of TargetFrameLowering class.
//
+// On VE, stack frames are structured as follows:
+//
+// The stack grows downward.
+//
+// All of the individual frame areas on the frame below are optional, i.e. it's
+// possible to create a function so that the particular area isn't present
+// in the frame.
+//
+// At function entry, the "frame" looks as follows:
+//
+// | | Higher address
+// |----------------------------------------------|
+// | Parameter area for this function |
+// |----------------------------------------------|
+// | Register save area (RSA) for this function |
+// |----------------------------------------------|
+// | Return address for this function |
+// |----------------------------------------------|
+// | Frame pointer for this function |
+// |----------------------------------------------| <- sp
+// | | Lower address
+//
+// VE doesn't use on-demand stack allocation, so user code generated by LLVM
+// needs to call VEOS to allocate the stack frame. The VE ABI wants to reduce
+// the number of VEOS calls, so it requires the caller's prologue to allocate
+// not only the RSA (in general, the CSR, callee-saved register, area) but
+// also the call frame.
+//
+// After the prologue has run, the frame has the following general structure.
+// Note that technically the last frame area (VLAs) doesn't get created until
+// the main function body, after the prologue is run. However, it's depicted
+// here for completeness.
+//
+// | | Higher address
+// |----------------------------------------------|
+// | Parameter area for this function |
+// |----------------------------------------------|
+// | Register save area (RSA) for this function |
+// |----------------------------------------------|
+// | Return address for this function |
+// |----------------------------------------------|
+// | Frame pointer for this function |
+// |----------------------------------------------| <- fp(=old sp)
+// |.empty.space.to.make.part.below.aligned.in....|
+// |.case.it.needs.more.than.the.standard.16-byte.| (size of this area is
+// |.alignment....................................| unknown at compile time)
+// |----------------------------------------------|
+// | Local variables of fixed size including spill|
+// | slots |
+// |----------------------------------------------| <- bp(not defined by ABI,
+// |.variable-sized.local.variables.(VLAs)........| LLVM chooses SX17)
+// |..............................................| (size of this area is
+// |..............................................| unknown at compile time)
+// |----------------------------------------------| <- stack top (returned by
+// | Parameter area for callee | alloca)
+// |----------------------------------------------|
+// | Register save area (RSA) for callee |
+// |----------------------------------------------|
+// | Return address for callee |
+// |----------------------------------------------|
+// | Frame pointer for callee |
+// |----------------------------------------------| <- sp
+// | | Lower address
+//
+// To access data in a frame, a constant offset from one of the pointers
+// (fp, bp, sp) must be computable at compile time. The size of the areas
+// with a dotted background cannot be computed at compile time if they are
+// present, so all three of fp, bp, and sp must be set up in order to access
+// all contents of the frame areas, assuming all of the frame areas are
+// non-empty.
+//
+// For most functions, some of the frame areas are empty. For those functions,
+// it may not be necessary to set up fp or bp:
+// * A base pointer is definitely needed when there are both VLAs and local
+// variables with more-than-default alignment requirements.
+// * A frame pointer is definitely needed when there are local variables with
+// more-than-default alignment requirements.
+//
+// In addition, the VE ABI defines the layout of the RSA, return address, and
+// frame pointer as follows:
+//
+// |----------------------------------------------| <- sp+176
+// | %s18...%s33 |
+// |----------------------------------------------| <- sp+48
+// | Linkage area register (%s17) |
+// |----------------------------------------------| <- sp+40
+// | Procedure linkage table register (%plt=%s16) |
+// |----------------------------------------------| <- sp+32
+// | Global offset table register (%got=%s15) |
+// |----------------------------------------------| <- sp+24
+// | Thread pointer register (%tp=%s14) |
+// |----------------------------------------------| <- sp+16
+// | Return address |
+// |----------------------------------------------| <- sp+8
+// | Frame pointer |
+// |----------------------------------------------| <- sp+0
+//
+// NOTE: This description is based on the VE ABI and on the frame layout
+// description in AArch64FrameLowering.cpp.
//===----------------------------------------------------------------------===//
#include "VEFrameLowering.h"
@@ -38,48 +137,47 @@ void VEFrameLowering::emitPrologueInsns(MachineFunction &MF,
MachineBasicBlock::iterator MBBI,
uint64_t NumBytes,
bool RequireFPUpdate) const {
+ const VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
+ DebugLoc DL;
+ const VEInstrInfo &TII = *STI.getInstrInfo();
- DebugLoc dl;
- const VEInstrInfo &TII =
- *static_cast<const VEInstrInfo *>(MF.getSubtarget().getInstrInfo());
// Insert following codes here as prologue
//
- // st %fp, 0(,%sp)
- // st %lr, 8(,%sp)
- // st %got, 24(,%sp)
- // st %plt, 32(,%sp)
- // st %s17, 40(,%sp) iff this function is using s17 as BP
- // or %fp, 0, %sp
-
- BuildMI(MBB, MBBI, dl, TII.get(VE::STrii))
- .addReg(VE::SX11)
- .addImm(0)
- .addImm(0)
- .addReg(VE::SX9);
- BuildMI(MBB, MBBI, dl, TII.get(VE::STrii))
- .addReg(VE::SX11)
- .addImm(0)
- .addImm(8)
- .addReg(VE::SX10);
- BuildMI(MBB, MBBI, dl, TII.get(VE::STrii))
- .addReg(VE::SX11)
- .addImm(0)
- .addImm(24)
- .addReg(VE::SX15);
- BuildMI(MBB, MBBI, dl, TII.get(VE::STrii))
- .addReg(VE::SX11)
- .addImm(0)
- .addImm(32)
- .addReg(VE::SX16);
+ // st %fp, 0(, %sp) iff !isLeafProc
+ // st %lr, 8(, %sp) iff !isLeafProc
+ // st %got, 24(, %sp) iff hasGOT
+ // st %plt, 32(, %sp) iff hasGOT
+ // st %s17, 40(, %sp) iff hasBP
+ if (!FuncInfo->isLeafProc()) {
+ BuildMI(MBB, MBBI, DL, TII.get(VE::STrii))
+ .addReg(VE::SX11)
+ .addImm(0)
+ .addImm(0)
+ .addReg(VE::SX9);
+ BuildMI(MBB, MBBI, DL, TII.get(VE::STrii))
+ .addReg(VE::SX11)
+ .addImm(0)
+ .addImm(8)
+ .addReg(VE::SX10);
+ }
+ if (hasGOT(MF)) {
+ BuildMI(MBB, MBBI, DL, TII.get(VE::STrii))
+ .addReg(VE::SX11)
+ .addImm(0)
+ .addImm(24)
+ .addReg(VE::SX15);
+ BuildMI(MBB, MBBI, DL, TII.get(VE::STrii))
+ .addReg(VE::SX11)
+ .addImm(0)
+ .addImm(32)
+ .addReg(VE::SX16);
+ }
if (hasBP(MF))
- BuildMI(MBB, MBBI, dl, TII.get(VE::STrii))
+ BuildMI(MBB, MBBI, DL, TII.get(VE::STrii))
.addReg(VE::SX11)
.addImm(0)
.addImm(40)
.addReg(VE::SX17);
- BuildMI(MBB, MBBI, dl, TII.get(VE::ORri), VE::SX9)
- .addReg(VE::SX11)
- .addImm(0);
}
void VEFrameLowering::emitEpilogueInsns(MachineFunction &MF,
@@ -87,43 +185,42 @@ void VEFrameLowering::emitEpilogueInsns(MachineFunction &MF,
MachineBasicBlock::iterator MBBI,
uint64_t NumBytes,
bool RequireFPUpdate) const {
+ const VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
+ DebugLoc DL;
+ const VEInstrInfo &TII = *STI.getInstrInfo();
- DebugLoc dl;
- const VEInstrInfo &TII =
- *static_cast<const VEInstrInfo *>(MF.getSubtarget().getInstrInfo());
// Insert following codes here as epilogue
//
- // or %sp, 0, %fp
- // ld %s17, 40(,%sp) iff this function is using s17 as BP
- // ld %got, 32(,%sp)
- // ld %plt, 24(,%sp)
- // ld %lr, 8(,%sp)
- // ld %fp, 0(,%sp)
-
- BuildMI(MBB, MBBI, dl, TII.get(VE::ORri), VE::SX11)
- .addReg(VE::SX9)
- .addImm(0);
+ // ld %s17, 40(, %sp) iff hasBP
+ // ld %plt, 32(, %sp) iff hasGOT
+ // ld %got, 24(, %sp) iff hasGOT
+ // ld %lr, 8(, %sp) iff !isLeafProc
+ // ld %fp, 0(, %sp) iff !isLeafProc
if (hasBP(MF))
- BuildMI(MBB, MBBI, dl, TII.get(VE::LDrii), VE::SX17)
+ BuildMI(MBB, MBBI, DL, TII.get(VE::LDrii), VE::SX17)
.addReg(VE::SX11)
.addImm(0)
.addImm(40);
- BuildMI(MBB, MBBI, dl, TII.get(VE::LDrii), VE::SX16)
- .addReg(VE::SX11)
- .addImm(0)
- .addImm(32);
- BuildMI(MBB, MBBI, dl, TII.get(VE::LDrii), VE::SX15)
- .addReg(VE::SX11)
- .addImm(0)
- .addImm(24);
- BuildMI(MBB, MBBI, dl, TII.get(VE::LDrii), VE::SX10)
- .addReg(VE::SX11)
- .addImm(0)
- .addImm(8);
- BuildMI(MBB, MBBI, dl, TII.get(VE::LDrii), VE::SX9)
- .addReg(VE::SX11)
- .addImm(0)
- .addImm(0);
+ if (hasGOT(MF)) {
+ BuildMI(MBB, MBBI, DL, TII.get(VE::LDrii), VE::SX16)
+ .addReg(VE::SX11)
+ .addImm(0)
+ .addImm(32);
+ BuildMI(MBB, MBBI, DL, TII.get(VE::LDrii), VE::SX15)
+ .addReg(VE::SX11)
+ .addImm(0)
+ .addImm(24);
+ }
+ if (!FuncInfo->isLeafProc()) {
+ BuildMI(MBB, MBBI, DL, TII.get(VE::LDrii), VE::SX10)
+ .addReg(VE::SX11)
+ .addImm(0)
+ .addImm(8);
+ BuildMI(MBB, MBBI, DL, TII.get(VE::LDrii), VE::SX9)
+ .addReg(VE::SX11)
+ .addImm(0)
+ .addImm(0);
+ }
}
void VEFrameLowering::emitSPAdjustment(MachineFunction &MF,
@@ -131,37 +228,44 @@ void VEFrameLowering::emitSPAdjustment(MachineFunction &MF,
MachineBasicBlock::iterator MBBI,
int64_t NumBytes,
MaybeAlign MaybeAlign) const {
- DebugLoc dl;
- const VEInstrInfo &TII =
- *static_cast<const VEInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ DebugLoc DL;
+ const VEInstrInfo &TII = *STI.getInstrInfo();
- if (NumBytes >= -64 && NumBytes < 63) {
- BuildMI(MBB, MBBI, dl, TII.get(VE::ADDSLri), VE::SX11)
+ if (NumBytes == 0) {
+ // Nothing to do here.
+ } else if (isInt<7>(NumBytes)) {
+ // adds.l %s11, NumBytes@lo, %s11
+ BuildMI(MBB, MBBI, DL, TII.get(VE::ADDSLri), VE::SX11)
.addReg(VE::SX11)
.addImm(NumBytes);
- return;
+ } else if (isInt<32>(NumBytes)) {
+ // lea %s11, NumBytes@lo(, %s11)
+ BuildMI(MBB, MBBI, DL, TII.get(VE::LEArii), VE::SX11)
+ .addReg(VE::SX11)
+ .addImm(0)
+ .addImm(Lo_32(NumBytes));
+ } else {
+ // Emit following codes. This clobbers SX13 which we always know is
+ // available here.
+ // lea %s13, NumBytes@lo
+ // and %s13, %s13, (32)0
+ // lea.sl %sp, NumBytes@hi(%s13, %sp)
+ BuildMI(MBB, MBBI, DL, TII.get(VE::LEAzii), VE::SX13)
+ .addImm(0)
+ .addImm(0)
+ .addImm(Lo_32(NumBytes));
+ BuildMI(MBB, MBBI, DL, TII.get(VE::ANDrm), VE::SX13)
+ .addReg(VE::SX13)
+ .addImm(M0(32));
+ BuildMI(MBB, MBBI, DL, TII.get(VE::LEASLrri), VE::SX11)
+ .addReg(VE::SX11)
+ .addReg(VE::SX13)
+ .addImm(Hi_32(NumBytes));
}
- // Emit following codes. This clobbers SX13 which we always know is
- // available here.
- // lea %s13,%lo(NumBytes)
- // and %s13,%s13,(32)0
- // lea.sl %sp,%hi(NumBytes)(%sp, %s13)
- BuildMI(MBB, MBBI, dl, TII.get(VE::LEAzii), VE::SX13)
- .addImm(0)
- .addImm(0)
- .addImm(Lo_32(NumBytes));
- BuildMI(MBB, MBBI, dl, TII.get(VE::ANDrm), VE::SX13)
- .addReg(VE::SX13)
- .addImm(M0(32));
- BuildMI(MBB, MBBI, dl, TII.get(VE::LEASLrri), VE::SX11)
- .addReg(VE::SX11)
- .addReg(VE::SX13)
- .addImm(Hi_32(NumBytes));
-
if (MaybeAlign) {
// and %sp, %sp, Align-1
- BuildMI(MBB, MBBI, dl, TII.get(VE::ANDrm), VE::SX11)
+ BuildMI(MBB, MBBI, DL, TII.get(VE::ANDrm), VE::SX11)
.addReg(VE::SX11)
.addImm(M1(64 - Log2_64(MaybeAlign.valueOrOne().value())));
}
@@ -169,9 +273,8 @@ void VEFrameLowering::emitSPAdjustment(MachineFunction &MF,
void VEFrameLowering::emitSPExtend(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) const {
- DebugLoc dl;
- const VEInstrInfo &TII =
- *static_cast<const VEInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ DebugLoc DL;
+ const VEInstrInfo &TII = *STI.getInstrInfo();
// Emit following codes. It is not possible to insert multiple
// BasicBlocks in PEI pass, so we emit two pseudo instructions here.
@@ -198,22 +301,23 @@ void VEFrameLowering::emitSPExtend(MachineFunction &MF, MachineBasicBlock &MBB,
// EXTEND_STACK_GUARD pseudo will be simply eliminated by ExpandPostRA
// pass. This pseudo is required to be at the next of EXTEND_STACK
// pseudo in order to protect iteration loop in ExpandPostRA.
-
- BuildMI(MBB, MBBI, dl, TII.get(VE::EXTEND_STACK));
- BuildMI(MBB, MBBI, dl, TII.get(VE::EXTEND_STACK_GUARD));
+ BuildMI(MBB, MBBI, DL, TII.get(VE::EXTEND_STACK));
+ BuildMI(MBB, MBBI, DL, TII.get(VE::EXTEND_STACK_GUARD));
}
void VEFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
+ const VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
MachineFrameInfo &MFI = MF.getFrameInfo();
const VEInstrInfo &TII = *STI.getInstrInfo();
const VERegisterInfo &RegInfo = *STI.getRegisterInfo();
MachineBasicBlock::iterator MBBI = MBB.begin();
+ bool NeedsStackRealignment = RegInfo.needsStackRealignment(MF);
+
// Debug location must be unknown since the first debug location is used
// to determine the end of the prologue.
- DebugLoc dl;
- bool NeedsStackRealignment = RegInfo.needsStackRealignment(MF);
+ DebugLoc DL;
// FIXME: unfortunately, returning false from canRealignStack
// actually just causes needsStackRealignment to return false,
@@ -226,12 +330,17 @@ void VEFrameLowering::emitPrologue(MachineFunction &MF,
"stack re-alignment, but LLVM couldn't handle it "
"(probably because it has a dynamic alloca).");
- // Get the number of bytes to allocate from the FrameInfo
+ // Get the number of bytes to allocate from the FrameInfo.
+ // This number of bytes is already aligned to ABI stack alignment.
uint64_t NumBytes = MFI.getStackSize();
- // The VE ABI requires a reserved 176 bytes area at the top
- // of stack as described in VESubtarget.cpp. So, we adjust it here.
- NumBytes = STI.getAdjustedFrameSize(NumBytes);
+ // Adjust the stack size if this function is not a leaf function, since the
+ // VE ABI requires a reserved area at the top of the stack as described in
+ // VEFrameLowering.cpp.
+ if (!FuncInfo->isLeafProc()) {
+ // NOTE: The number is aligned to ABI stack alignment after adjustment.
+ NumBytes = STI.getAdjustedFrameSize(NumBytes);
+ }
// Finally, ensure that the size is sufficiently aligned for the
// data on the stack.
@@ -240,36 +349,34 @@ void VEFrameLowering::emitPrologue(MachineFunction &MF,
// Update stack size with corrected value.
MFI.setStackSize(NumBytes);
- // Emit Prologue instructions to save %lr
+ // Emit Prologue instructions to save multiple registers.
emitPrologueInsns(MF, MBB, MBBI, NumBytes, true);
+ // Emit instructions to save SP in FP as follows if this is not a leaf
+ // function:
+ // or %fp, 0, %sp
+ if (!FuncInfo->isLeafProc())
+ BuildMI(MBB, MBBI, DL, TII.get(VE::ORri), VE::SX9)
+ .addReg(VE::SX11)
+ .addImm(0);
+
// Emit stack adjust instructions
MaybeAlign RuntimeAlign =
NeedsStackRealignment ? MaybeAlign(MFI.getMaxAlign()) : None;
+ assert((RuntimeAlign == None || !FuncInfo->isLeafProc()) &&
+ "SP has to be saved in order to align variable sized stack object!");
emitSPAdjustment(MF, MBB, MBBI, -(int64_t)NumBytes, RuntimeAlign);
if (hasBP(MF)) {
// Copy SP to BP.
- BuildMI(MBB, MBBI, dl, TII.get(VE::ORri), VE::SX17)
+ BuildMI(MBB, MBBI, DL, TII.get(VE::ORri), VE::SX17)
.addReg(VE::SX11)
.addImm(0);
}
// Emit stack extend instructions
- emitSPExtend(MF, MBB, MBBI);
-
- Register RegFP = RegInfo.getDwarfRegNum(VE::SX9, true);
-
- // Emit ".cfi_def_cfa_register 30".
- unsigned CFIIndex =
- MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, RegFP));
- BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
-
- // Emit ".cfi_window_save".
- CFIIndex = MF.addFrameInst(MCCFIInstruction::createWindowSave(nullptr));
- BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
+ if (NumBytes != 0)
+ emitSPExtend(MF, MBB, MBBI);
}
MachineBasicBlock::iterator VEFrameLowering::eliminateCallFramePseudoInstr(
@@ -289,21 +396,33 @@ MachineBasicBlock::iterator VEFrameLowering::eliminateCallFramePseudoInstr(
void VEFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
+ const VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
+ DebugLoc DL;
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
- DebugLoc dl = MBBI->getDebugLoc();
MachineFrameInfo &MFI = MF.getFrameInfo();
+ const VEInstrInfo &TII = *STI.getInstrInfo();
uint64_t NumBytes = MFI.getStackSize();
- // Emit Epilogue instructions to restore %lr
+ // Emit instructions to retrieve the original SP.
+ if (!FuncInfo->isLeafProc()) {
+ // If SP is saved in FP, retrieve it as follows:
+ // or %sp, 0, %fp iff !isLeafProc
+ BuildMI(MBB, MBBI, DL, TII.get(VE::ORri), VE::SX11)
+ .addReg(VE::SX9)
+ .addImm(0);
+ } else {
+ // Emit stack adjust instructions.
+ emitSPAdjustment(MF, MBB, MBBI, NumBytes, None);
+ }
+
+ // Emit Epilogue instructions to restore multiple registers.
emitEpilogueInsns(MF, MBB, MBBI, NumBytes, true);
}
// hasFP - Return true if the specified function should have a dedicated frame
// pointer register. This is true if the function has variable sized allocas
-// or if frame pointer elimination is disabled. For the case of VE, we don't
-// implement FP eliminator yet, but we returns false from this function to
-// not refer fp from generated code.
+// or if frame pointer elimination is disabled.
bool VEFrameLowering::hasFP(const MachineFunction &MF) const {
const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
@@ -320,34 +439,41 @@ bool VEFrameLowering::hasBP(const MachineFunction &MF) const {
return MFI.hasVarSizedObjects() && TRI->needsStackRealignment(MF);
}
-int VEFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
- Register &FrameReg) const {
+bool VEFrameLowering::hasGOT(const MachineFunction &MF) const {
+ const VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
+
+ // If a global base register is assigned (!= 0), GOT is used.
+ return FuncInfo->getGlobalBaseReg() != 0;
+}
+
+StackOffset VEFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+ int FI,
+ Register &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const VERegisterInfo *RegInfo = STI.getRegisterInfo();
- const VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
bool isFixed = MFI.isFixedObjectIndex(FI);
int64_t FrameOffset = MF.getFrameInfo().getObjectOffset(FI);
- if (FuncInfo->isLeafProc()) {
- // If there's a leaf proc, all offsets need to be %sp-based,
- // because we haven't caused %fp to actually point to our frame.
+ if (!hasFP(MF)) {
+ // If FP is not used, frame indices are based on the %sp register.
FrameReg = VE::SX11; // %sp
- return FrameOffset + MF.getFrameInfo().getStackSize();
+ return StackOffset::getFixed(FrameOffset +
+ MF.getFrameInfo().getStackSize());
}
if (RegInfo->needsStackRealignment(MF) && !isFixed) {
- // If there is dynamic stack realignment, all local object
- // references need to be via %sp or %s17 (bp), to take account
- // of the re-alignment.
+ // If data on the stack requires realignment, frame indices are based on the
+ // %sp or %s17 (bp) register. If there is a variable-sized object, bp is used.
if (hasBP(MF))
FrameReg = VE::SX17; // %bp
else
FrameReg = VE::SX11; // %sp
- return FrameOffset + MF.getFrameInfo().getStackSize();
+ return StackOffset::getFixed(FrameOffset +
+ MF.getFrameInfo().getStackSize());
}
- // Finally, default to using %fp.
+ // Use %fp by default.
FrameReg = RegInfo->getFrameRegister(MF);
- return FrameOffset;
+ return StackOffset::getFixed(FrameOffset);
}
bool VEFrameLowering::isLeafProc(MachineFunction &MF) const {
@@ -367,8 +493,10 @@ void VEFrameLowering::determineCalleeSaves(MachineFunction &MF,
RegScavenger *RS) const {
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
- if (isLeafProc(MF)) {
- VEMachineFunctionInfo *MFI = MF.getInfo<VEMachineFunctionInfo>();
- MFI->setLeafProc(true);
+ // Functions that use a BP need to emit a prologue and epilogue to allocate a
+ // local buffer on the stack even if they are leaf functions.
+ if (isLeafProc(MF) && !hasBP(MF)) {
+ VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
+ FuncInfo->setLeafProc(true);
}
}
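The reworked emitSPAdjustment above now dispatches on the size of the adjustment: nothing for zero, a single adds.l for a simm7, a single lea for a simm32, and the three-instruction lea/and/lea.sl sequence otherwise. A standalone sketch of that dispatch (plain C++; it only prints the rough assembly shape, the operand formatting is simplified, and fitsSigned/sketchSPAdjustment are illustrative names):

#include <cstdint>
#include <cstdio>

// True if V fits in an N-bit signed immediate (mirrors llvm::isInt<N>).
static bool fitsSigned(unsigned N, int64_t V) {
  return V >= -(INT64_C(1) << (N - 1)) && V < (INT64_C(1) << (N - 1));
}

static void sketchSPAdjustment(int64_t NumBytes) {
  if (NumBytes == 0) {
    std::printf("; nothing to emit\n");
  } else if (fitsSigned(7, NumBytes)) {
    std::printf("adds.l %%s11, %lld, %%s11\n", (long long)NumBytes);
  } else if (fitsSigned(32, NumBytes)) {
    std::printf("lea %%s11, %lld(, %%s11)\n", (long long)NumBytes);
  } else {
    uint32_t Lo = (uint32_t)((uint64_t)NumBytes & 0xffffffffu); // Lo_32
    uint32_t Hi = (uint32_t)((uint64_t)NumBytes >> 32);         // Hi_32
    std::printf("lea %%s13, 0x%x\n", Lo);
    std::printf("and %%s13, %%s13, (32)0\n");
    std::printf("lea.sl %%s11, 0x%x(%%s13, %%s11)\n", Hi);
  }
}

int main() {
  sketchSPAdjustment(-48);                 // simm7  -> single adds.l
  sketchSPAdjustment(-131072);             // simm32 -> single lea
  sketchSPAdjustment(-(INT64_C(1) << 36)); // needs lea/and/lea.sl
}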
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VEFrameLowering.h b/contrib/llvm-project/llvm/lib/Target/VE/VEFrameLowering.h
index b548d663c504..99eb41189b25 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/VEFrameLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VEFrameLowering.h
@@ -15,6 +15,7 @@
#include "VE.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/Support/TypeSize.h"
namespace llvm {
@@ -38,8 +39,10 @@ public:
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const override;
- bool hasBP(const MachineFunction &MF) const;
bool hasFP(const MachineFunction &MF) const override;
+ bool hasBP(const MachineFunction &MF) const;
+ bool hasGOT(const MachineFunction &MF) const;
+
// VE reserves argument space always for call sites in the function
// immediately on entry of the current function.
bool hasReservedCallFrame(const MachineFunction &MF) const override {
@@ -48,8 +51,8 @@ public:
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS = nullptr) const override;
- int getFrameIndexReference(const MachineFunction &MF, int FI,
- Register &FrameReg) const override;
+ StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const override;
const SpillSlot *
getCalleeSavedSpillSlots(unsigned &NumEntries) const override {
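getFrameIndexReference (updated above to return a StackOffset) now chooses the base register in this order: %sp when there is no frame pointer, %s17 (bp) or %sp for non-fixed objects under stack realignment, and %fp otherwise, with %sp/%bp-relative offsets biased by the static stack size. A standalone sketch of that decision (plain C++; FrameQuery and resolveFrameIndex are made-up names standing in for the MachineFrameInfo/VERegisterInfo queries):

#include <cstdint>
#include <cstdio>

enum BaseReg { SP, BP, FP };

// Inputs that the real getFrameIndexReference pulls from MachineFrameInfo,
// VERegisterInfo, and the frame lowering itself.
struct FrameQuery {
  bool HasFP;            // VEFrameLowering::hasFP(MF)
  bool HasBP;            // VEFrameLowering::hasBP(MF)
  bool NeedsRealignment; // RegInfo->needsStackRealignment(MF)
  bool IsFixedObject;    // MFI.isFixedObjectIndex(FI)
  int64_t ObjectOffset;  // MFI.getObjectOffset(FI)
  int64_t StackSize;     // MFI.getStackSize()
};

static BaseReg resolveFrameIndex(const FrameQuery &Q, int64_t &Offset) {
  if (!Q.HasFP) {                          // everything is %sp-relative
    Offset = Q.ObjectOffset + Q.StackSize;
    return SP;
  }
  if (Q.NeedsRealignment && !Q.IsFixedObject) {
    Offset = Q.ObjectOffset + Q.StackSize; // locals follow the realigned %sp
    return Q.HasBP ? BP : SP;
  }
  Offset = Q.ObjectOffset;                 // default: %fp-relative
  return FP;
}

int main() {
  int64_t Off;
  BaseReg R = resolveFrameIndex({false, false, false, false, -8, 240}, Off);
  std::printf("reg=%d off=%lld\n", R, (long long)Off); // reg=0 (SP), off=232
}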
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VEISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/VE/VEISelDAGToDAG.cpp
index f3d067d55fdb..761baa79b4ab 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/VEISelDAGToDAG.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VEISelDAGToDAG.cpp
@@ -113,15 +113,6 @@ inline static uint64_t getFpImmVal(const ConstantFPSDNode *N) {
return Val;
}
-/// convMImmVal - Convert a mimm integer immediate value to target immediate.
-inline static uint64_t convMImmVal(uint64_t Val) {
- if (Val == 0)
- return 0; // (0)1
- if (Val & (1UL << 63))
- return countLeadingOnes(Val); // (m)1
- return countLeadingZeros(Val) | 0x40; // (m)0
-}
-
//===--------------------------------------------------------------------===//
/// VEDAGToDAGISel - VE specific code to select VE machine
/// instructions for SelectionDAG operations.
@@ -148,6 +139,7 @@ public:
bool selectADDRzri(SDValue N, SDValue &Base, SDValue &Index, SDValue &Offset);
bool selectADDRzii(SDValue N, SDValue &Base, SDValue &Index, SDValue &Offset);
bool selectADDRri(SDValue N, SDValue &Base, SDValue &Offset);
+ bool selectADDRzi(SDValue N, SDValue &Base, SDValue &Offset);
StringRef getPassName() const override {
return "VE DAG->DAG Pattern Instruction Selection";
@@ -183,6 +175,14 @@ bool VEDAGToDAGISel::selectADDRrri(SDValue Addr, SDValue &Base, SDValue &Index,
return false;
}
if (matchADDRrr(Addr, LHS, RHS)) {
+ // If the input is a pair of a frame index and a register, move the
+ // frame index to LHS. This generates an MI with the following operands.
+ // %dest, #FI, %reg, offset
+ // In eliminateFrameIndex, the MI above is converted to the following.
+ // %dest, %fp, %reg, fi_offset + offset
+ if (dyn_cast<FrameIndexSDNode>(RHS))
+ std::swap(LHS, RHS);
+
if (matchADDRri(RHS, Index, Offset)) {
Base = LHS;
return true;
@@ -228,7 +228,7 @@ bool VEDAGToDAGISel::selectADDRzii(SDValue Addr, SDValue &Base, SDValue &Index,
Addr.getOpcode() == ISD::TargetGlobalTLSAddress)
return false; // direct calls.
- if (ConstantSDNode *CN = cast<ConstantSDNode>(Addr)) {
+ if (auto *CN = dyn_cast<ConstantSDNode>(Addr)) {
if (isInt<32>(CN->getSExtValue())) {
Base = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
Index = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
@@ -250,6 +250,26 @@ bool VEDAGToDAGISel::selectADDRri(SDValue Addr, SDValue &Base,
return true;
}
+bool VEDAGToDAGISel::selectADDRzi(SDValue Addr, SDValue &Base,
+ SDValue &Offset) {
+ if (dyn_cast<FrameIndexSDNode>(Addr))
+ return false;
+ if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+ Addr.getOpcode() == ISD::TargetGlobalAddress ||
+ Addr.getOpcode() == ISD::TargetGlobalTLSAddress)
+ return false; // direct calls.
+
+ if (auto *CN = dyn_cast<ConstantSDNode>(Addr)) {
+ if (isInt<32>(CN->getSExtValue())) {
+ Base = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+ Offset =
+ CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(Addr), MVT::i32);
+ return true;
+ }
+ }
+ return false;
+}
+
bool VEDAGToDAGISel::matchADDRrr(SDValue Addr, SDValue &Base, SDValue &Index) {
if (dyn_cast<FrameIndexSDNode>(Addr))
return false;
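The new selectADDRzi above matches a plain constant address into a zero base plus a 32-bit immediate offset, so a load from an absolute address can use the "zi" addressing form. A standalone sketch of the match (plain C++; the SDValue/ConstantSDNode machinery is replaced by a tiny struct, and AddrNode/ZIForm/matchADDRzi are illustrative names only):

#include <cstdint>
#include <cstdio>
#include <optional>

struct AddrNode {
  bool IsConstant = false; // stands in for dyn_cast<ConstantSDNode>
  int64_t Value = 0;       // the sign-extended constant, when IsConstant
};

struct ZIForm {
  int32_t Base;   // always 0 for this addressing form
  int32_t Offset; // signed 32-bit displacement
};

static std::optional<ZIForm> matchADDRzi(const AddrNode &Addr) {
  if (!Addr.IsConstant)
    return std::nullopt;          // frame indexes, symbols, etc. don't match
  if (Addr.Value < INT32_MIN || Addr.Value > INT32_MAX)
    return std::nullopt;          // must fit in a signed 32-bit immediate
  return ZIForm{0, static_cast<int32_t>(Addr.Value)};
}

int main() {
  if (auto F = matchADDRzi({true, 0x1000}))
    std::printf("base=%d offset=%d\n", F->Base, F->Offset); // base=0 offset=4096
  if (!matchADDRzi({true, INT64_C(1) << 40}))
    std::printf("too large; other ADDR patterns are tried instead\n");
}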
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VEISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/VE/VEISelLowering.cpp
index ab720545dd83..d377f8e27cfd 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -13,6 +13,7 @@
#include "VEISelLowering.h"
#include "MCTargetDesc/VEMCExpr.h"
+#include "VEInstrBuilder.h"
#include "VEMachineFunctionInfo.h"
#include "VERegisterInfo.h"
#include "VETargetMachine.h"
@@ -21,6 +22,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
@@ -38,39 +40,280 @@ using namespace llvm;
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
-static bool allocateFloat(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- switch (LocVT.SimpleTy) {
- case MVT::f32: {
- // Allocate stack like below
- // 0 4
- // +------+------+
- // | empty| float|
- // +------+------+
- // Use align=8 for dummy area to align the beginning of these 2 area.
- State.AllocateStack(4, Align(8)); // for empty area
- // Use align=4 for value to place it at just after the dummy area.
- unsigned Offset = State.AllocateStack(4, Align(4)); // for float value area
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return true;
- }
+#include "VEGenCallingConv.inc"
+
+CCAssignFn *getReturnCC(CallingConv::ID CallConv) {
+ switch (CallConv) {
default:
- return false;
+ return RetCC_VE_C;
+ case CallingConv::Fast:
+ return RetCC_VE_Fast;
}
}
-#include "VEGenCallingConv.inc"
+CCAssignFn *getParamCC(CallingConv::ID CallConv, bool IsVarArg) {
+ if (IsVarArg)
+ return CC_VE2;
+ switch (CallConv) {
+ default:
+ return CC_VE_C;
+ case CallingConv::Fast:
+ return CC_VE_Fast;
+ }
+}
bool VETargetLowering::CanLowerReturn(
CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
- CCAssignFn *RetCC = RetCC_VE;
+ CCAssignFn *RetCC = getReturnCC(CallConv);
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
return CCInfo.CheckReturn(Outs, RetCC);
}
+static const MVT AllVectorVTs[] = {MVT::v256i32, MVT::v512i32, MVT::v256i64,
+ MVT::v256f32, MVT::v512f32, MVT::v256f64};
+
+static const MVT AllPackedVTs[] = {MVT::v512i32, MVT::v512f32};
+
+void VETargetLowering::initRegisterClasses() {
+ // Set up the register classes.
+ addRegisterClass(MVT::i32, &VE::I32RegClass);
+ addRegisterClass(MVT::i64, &VE::I64RegClass);
+ addRegisterClass(MVT::f32, &VE::F32RegClass);
+ addRegisterClass(MVT::f64, &VE::I64RegClass);
+ addRegisterClass(MVT::f128, &VE::F128RegClass);
+
+ if (Subtarget->enableVPU()) {
+ for (MVT VecVT : AllVectorVTs)
+ addRegisterClass(VecVT, &VE::V64RegClass);
+ addRegisterClass(MVT::v256i1, &VE::VMRegClass);
+ addRegisterClass(MVT::v512i1, &VE::VM512RegClass);
+ }
+}
+
+void VETargetLowering::initSPUActions() {
+ const auto &TM = getTargetMachine();
+ /// Load & Store {
+
+ // VE doesn't have i1 sign extending load.
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ setTruncStoreAction(VT, MVT::i1, Expand);
+ }
+
+ // VE doesn't have floating point extload/truncstore, so expand them.
+ for (MVT FPVT : MVT::fp_valuetypes()) {
+ for (MVT OtherFPVT : MVT::fp_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, FPVT, OtherFPVT, Expand);
+ setTruncStoreAction(FPVT, OtherFPVT, Expand);
+ }
+ }
+
+ // VE doesn't have fp128 load/store, so expand them in custom lower.
+ setOperationAction(ISD::LOAD, MVT::f128, Custom);
+ setOperationAction(ISD::STORE, MVT::f128, Custom);
+
+ /// } Load & Store
+
+ // Custom legalize address nodes into LO/HI parts.
+ MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
+ setOperationAction(ISD::BlockAddress, PtrVT, Custom);
+ setOperationAction(ISD::GlobalAddress, PtrVT, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
+ setOperationAction(ISD::ConstantPool, PtrVT, Custom);
+ setOperationAction(ISD::JumpTable, PtrVT, Custom);
+
+ /// VAARG handling {
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+ // VAARG needs to be lowered to an access with 8-byte alignment.
+ setOperationAction(ISD::VAARG, MVT::Other, Custom);
+ // Use the default implementation.
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+ /// } VAARG handling
+
+ /// Stack {
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
+
+ // Use the default implementation.
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+ /// } Stack
+
+ /// Branch {
+
+ // VE doesn't have BRCOND
+ setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+
+ // BR_JT is not implemented yet.
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+
+ /// } Branch
+
+ /// Int Ops {
+ for (MVT IntVT : {MVT::i32, MVT::i64}) {
+ // VE has no REM or DIVREM operations.
+ setOperationAction(ISD::UREM, IntVT, Expand);
+ setOperationAction(ISD::SREM, IntVT, Expand);
+ setOperationAction(ISD::SDIVREM, IntVT, Expand);
+ setOperationAction(ISD::UDIVREM, IntVT, Expand);
+
+ // VE has no SHL_PARTS/SRA_PARTS/SRL_PARTS operations.
+ setOperationAction(ISD::SHL_PARTS, IntVT, Expand);
+ setOperationAction(ISD::SRA_PARTS, IntVT, Expand);
+ setOperationAction(ISD::SRL_PARTS, IntVT, Expand);
+
+ // VE has no MULHU/S or U/SMUL_LOHI operations.
+ // TODO: Use MPD instruction to implement SMUL_LOHI for i32 type.
+ setOperationAction(ISD::MULHU, IntVT, Expand);
+ setOperationAction(ISD::MULHS, IntVT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, IntVT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, IntVT, Expand);
+
+ // VE has no CTTZ, ROTL, ROTR operations.
+ setOperationAction(ISD::CTTZ, IntVT, Expand);
+ setOperationAction(ISD::ROTL, IntVT, Expand);
+ setOperationAction(ISD::ROTR, IntVT, Expand);
+
+ // VE has a 64-bit instruction which implements the i64 BSWAP operation.
+ // The same instruction also handles the i32 BSWAP operation with an
+ // additional parameter. Use isel patterns to lower BSWAP.
+ setOperationAction(ISD::BSWAP, IntVT, Legal);
+
+ // VE has only 64-bit instructions for the i64 BITREVERSE/CTLZ/CTPOP
+ // operations. Use isel patterns for i64, promote for i32.
+ LegalizeAction Act = (IntVT == MVT::i32) ? Promote : Legal;
+ setOperationAction(ISD::BITREVERSE, IntVT, Act);
+ setOperationAction(ISD::CTLZ, IntVT, Act);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, IntVT, Act);
+ setOperationAction(ISD::CTPOP, IntVT, Act);
+
+ // VE has only 64-bit instructions for the i64 AND/OR/XOR operations.
+ // Use isel patterns for i64, promote for i32.
+ setOperationAction(ISD::AND, IntVT, Act);
+ setOperationAction(ISD::OR, IntVT, Act);
+ setOperationAction(ISD::XOR, IntVT, Act);
+ }
+ /// } Int Ops
+
+ /// Conversion {
+ // VE doesn't have instructions for fp<->uint, so let llvm expand them
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote); // use i64
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote); // use i64
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
+
+ // fp16 not supported
+ for (MVT FPVT : MVT::fp_valuetypes()) {
+ setOperationAction(ISD::FP16_TO_FP, FPVT, Expand);
+ setOperationAction(ISD::FP_TO_FP16, FPVT, Expand);
+ }
+ /// } Conversion
+
+ /// Floating-point Ops {
+ /// Note: Floating-point operations are fneg, fadd, fsub, fmul, fdiv, frem,
+ /// and fcmp.
+
+ // VE doesn't have the following floating-point operations.
+ for (MVT VT : MVT::fp_valuetypes()) {
+ setOperationAction(ISD::FNEG, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
+ }
+
+ // VE doesn't have fdiv of f128.
+ setOperationAction(ISD::FDIV, MVT::f128, Expand);
+
+ for (MVT FPVT : {MVT::f32, MVT::f64}) {
+ // f32 and f64 uses ConstantFP. f128 uses ConstantPool.
+ setOperationAction(ISD::ConstantFP, FPVT, Legal);
+ }
+ /// } Floating-point Ops
+
+ /// Floating-point math functions {
+
+ // VE doesn't have the following floating-point math functions.
+ for (MVT VT : MVT::fp_valuetypes()) {
+ setOperationAction(ISD::FABS, VT, Expand);
+ setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FSQRT, VT, Expand);
+ }
+
+ /// } Floating-point math functions
+
+ /// Atomic instructions {
+
+ setMaxAtomicSizeInBitsSupported(64);
+ setMinCmpXchgSizeInBits(32);
+ setSupportsUnalignedAtomics(false);
+
+ // Use custom inserter for ATOMIC_FENCE.
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+
+ // Other atomic instructions.
+ for (MVT VT : MVT::integer_valuetypes()) {
+ // Support i8/i16 atomic swap.
+ setOperationAction(ISD::ATOMIC_SWAP, VT, Custom);
+
+ // FIXME: Support "atmam" instructions.
+ setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Expand);
+
+ // VE doesn't have the following instructions.
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_CLR, VT, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_NAND, VT, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_MIN, VT, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_MAX, VT, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_UMIN, VT, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_UMAX, VT, Expand);
+ }
+
+ /// } Atomic instructions
+
+ /// SJLJ instructions {
+ setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+ setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+ setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
+ if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
+ setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
+ /// } SJLJ instructions
+
+ // Intrinsic instructions
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+}
+
+void VETargetLowering::initVPUActions() {
+ for (MVT LegalVecVT : AllVectorVTs) {
+ setOperationAction(ISD::BUILD_VECTOR, LegalVecVT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, LegalVecVT, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalVecVT, Legal);
+ // Translate all vector instructions with legal element types to VVP_*
+ // nodes.
+ // TODO: We will custom-widen into VVP_* nodes in the future. While we are
+ // building the infrastructure for this, we only do this for legal vector
+ // VTs.
+#define HANDLE_VP_TO_VVP(VP_OPC, VVP_NAME) \
+ setOperationAction(ISD::VP_OPC, LegalVecVT, Custom);
+#define ADD_VVP_OP(VVP_NAME, ISD_NAME) \
+ setOperationAction(ISD::ISD_NAME, LegalVecVT, Custom);
+#include "VVPNodes.def"
+ }
+
+ for (MVT LegalPackedVT : AllPackedVTs) {
+ setOperationAction(ISD::INSERT_VECTOR_ELT, LegalPackedVT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalPackedVT, Custom);
+ }
+}
+
SDValue
VETargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool IsVarArg,
@@ -85,7 +328,7 @@ VETargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
*DAG.getContext());
// Analyze return values.
- CCInfo.AnalyzeReturn(Outs, RetCC_VE);
+ CCInfo.AnalyzeReturn(Outs, getReturnCC(CallConv));
SDValue Flag;
SmallVector<SDValue, 4> RetOps(1, Chain);
@@ -94,6 +337,7 @@ VETargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
+ assert(!VA.needsCustom() && "Unexpected custom lowering");
SDValue OutVal = OutVals[i];
// Integer return values must be sign or zero extended by the callee.
@@ -109,12 +353,26 @@ VETargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
case CCValAssign::AExt:
OutVal = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), OutVal);
break;
+ case CCValAssign::BCvt: {
+ // Convert a float return value to i64 with padding.
+ // 63 31 0
+ // +------+------+
+ // | float| 0 |
+ // +------+------+
+ assert(VA.getLocVT() == MVT::i64);
+ assert(VA.getValVT() == MVT::f32);
+ SDValue Undef = SDValue(
+ DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0);
+ SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
+ OutVal = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
+ MVT::i64, Undef, OutVal, Sub_f32),
+ 0);
+ break;
+ }
default:
llvm_unreachable("Unknown loc info!");
}
- assert(!VA.needsCustom() && "Unexpected custom lowering");
-
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVal, Flag);
// Guarantee that all emitted copies are stuck together with flags.
@@ -138,7 +396,7 @@ SDValue VETargetLowering::LowerFormalArguments(
MachineFunction &MF = DAG.getMachineFunction();
// Get the base offset of the incoming arguments stack space.
- unsigned ArgsBaseOffset = 176;
+ unsigned ArgsBaseOffset = Subtarget->getRsaSize();
// Get the size of the preserved arguments area
unsigned ArgsPreserved = 64;
@@ -150,10 +408,11 @@ SDValue VETargetLowering::LowerFormalArguments(
CCInfo.AllocateStack(ArgsPreserved, Align(8));
// We already allocated the preserved area, so the stack offset computed
// by CC_VE would be correct now.
- CCInfo.AnalyzeFormalArguments(Ins, CC_VE);
+ CCInfo.AnalyzeFormalArguments(Ins, getParamCC(CallConv, false));
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
+ assert(!VA.needsCustom() && "Unexpected custom lowering");
if (VA.isRegLoc()) {
// This argument is passed in a register.
// All integer register arguments are promoted by the caller to i64.
@@ -163,11 +422,6 @@ SDValue VETargetLowering::LowerFormalArguments(
MF.addLiveIn(VA.getLocReg(), getRegClassFor(VA.getLocVT()));
SDValue Arg = DAG.getCopyFromReg(Chain, DL, VReg, VA.getLocVT());
- // Get the high bits for i32 struct elements.
- if (VA.getValVT() == MVT::i32 && VA.needsCustom())
- Arg = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Arg,
- DAG.getConstant(32, DL, MVT::i32));
-
// The caller promoted the argument, so insert an Assert?ext SDNode so we
// won't promote the value again in this function.
switch (VA.getLocInfo()) {
@@ -179,6 +433,20 @@ SDValue VETargetLowering::LowerFormalArguments(
Arg = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Arg,
DAG.getValueType(VA.getValVT()));
break;
+ case CCValAssign::BCvt: {
+ // Extract a float argument from i64 with padding.
+ // 63 31 0
+ // +------+------+
+ // | float| 0 |
+ // +------+------+
+ assert(VA.getLocVT() == MVT::i64);
+ assert(VA.getValVT() == MVT::f32);
+ SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
+ Arg = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+ MVT::f32, Arg, Sub_f32),
+ 0);
+ break;
+ }
default:
break;
}
@@ -194,9 +462,23 @@ SDValue VETargetLowering::LowerFormalArguments(
// The registers are exhausted. This argument was passed on the stack.
assert(VA.isMemLoc());
// The CC_VE_Full/Half functions compute stack offsets relative to the
- // beginning of the arguments area at %fp+176.
+ // beginning of the arguments area at %fp + the size of the reserved area.
unsigned Offset = VA.getLocMemOffset() + ArgsBaseOffset;
unsigned ValSize = VA.getValVT().getSizeInBits() / 8;
+
+ // Adjust the offset for a float argument by adding 4, since the argument
+ // is stored in an 8-byte buffer with the layout below. LLVM generates a
+ // 4-byte load instruction, so the offset needs to be adjusted here. This
+ // adjustment is required only in LowerFormalArguments. In LowerCall,
+ // a float argument is converted to i64 first and stored as 8 bytes of
+ // data, as required by the ABI, so no adjustment is needed there.
+ // 0 4
+ // +------+------+
+ // | empty| float|
+ // +------+------+
+ if (VA.getValVT() == MVT::f32)
+ Offset += 4;
+
int FI = MF.getFrameInfo().CreateFixedObject(ValSize, Offset, true);
InVals.push_back(
DAG.getLoad(VA.getValVT(), DL, Chain,
@@ -215,7 +497,7 @@ SDValue VETargetLowering::LowerFormalArguments(
// TODO: need to calculate offset correctly once we support f128.
unsigned ArgOffset = ArgLocs.size() * 8;
VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
- // Skip the 176 bytes of register save area.
+ // Skip the reserved area at the top of the stack.
FuncInfo->setVarArgsFrameOffset(ArgOffset + ArgsBaseOffset);
return Chain;
@@ -258,7 +540,7 @@ SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CLI.IsTailCall = false;
// Get the base offset of the outgoing arguments stack space.
- unsigned ArgsBaseOffset = 176;
+ unsigned ArgsBaseOffset = Subtarget->getRsaSize();
// Get the size of the preserved arguments area
unsigned ArgsPreserved = 8 * 8u;
@@ -270,7 +552,7 @@ SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CCInfo.AllocateStack(ArgsPreserved, Align(8));
// We already allocated the preserved area, so the stack offset computed
// by CC_VE would be correct now.
- CCInfo.AnalyzeCallOperands(CLI.Outs, CC_VE);
+ CCInfo.AnalyzeCallOperands(CLI.Outs, getParamCC(CLI.CallConv, false));
// VE requires to use both register and stack for varargs or no-prototyped
// functions.
@@ -281,7 +563,7 @@ SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CCState CCInfo2(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
ArgLocs2, *DAG.getContext());
if (UseBoth)
- CCInfo2.AnalyzeCallOperands(CLI.Outs, CC_VE2);
+ CCInfo2.AnalyzeCallOperands(CLI.Outs, getParamCC(CLI.CallConv, true));
// Get the size of the outgoing arguments stack space requirement.
unsigned ArgsSize = CCInfo.getNextStackOffset();
@@ -371,6 +653,22 @@ SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
case CCValAssign::AExt:
Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
break;
+ case CCValAssign::BCvt: {
+ // Convert a float argument to i64 with padding.
+ // 63 31 0
+ // +------+------+
+ // | float| 0 |
+ // +------+------+
+ assert(VA.getLocVT() == MVT::i64);
+ assert(VA.getValVT() == MVT::f32);
+ SDValue Undef = SDValue(
+ DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0);
+ SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
+ Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
+ MVT::i64, Undef, Arg, Sub_f32),
+ 0);
+ break;
+ }
}
if (VA.isRegLoc()) {
@@ -384,8 +682,7 @@ SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Create a store off the stack pointer for this argument.
SDValue StackPtr = DAG.getRegister(VE::SX11, PtrVT);
- // The argument area starts at %fp+176 in the callee frame,
- // %sp+176 in ours.
+ // The argument area starts at %fp/%sp + the size of the reserved area.
SDValue PtrOff =
DAG.getIntPtrConstant(VA.getLocMemOffset() + ArgsBaseOffset, DL);
PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
@@ -450,11 +747,12 @@ SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && !CLI.CB)
CLI.Ins[0].Flags.setInReg();
- RVInfo.AnalyzeCallResult(CLI.Ins, RetCC_VE);
+ RVInfo.AnalyzeCallResult(CLI.Ins, getReturnCC(CLI.CallConv));
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign &VA = RVLocs[i];
+ assert(!VA.needsCustom() && "Unexpected custom lowering");
unsigned Reg = VA.getLocReg();
// When returning 'inreg {i32, i32 }', two consecutive i32 arguments can
@@ -472,11 +770,6 @@ SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
InGlue = Chain.getValue(2);
}
- // Get the high bits for i32 struct elements.
- if (VA.getValVT() == MVT::i32 && VA.needsCustom())
- RV = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), RV,
- DAG.getConstant(32, DL, MVT::i32));
-
// The callee promoted the return value, so insert an Assert?ext SDNode so
// we won't promote the value again in this function.
switch (VA.getLocInfo()) {
@@ -488,6 +781,20 @@ SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
RV = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), RV,
DAG.getValueType(VA.getValVT()));
break;
+ case CCValAssign::BCvt: {
+ // Extract a float return value from i64 with padding.
+ // 63 31 0
+ // +------+------+
+ // | float| 0 |
+ // +------+------+
+ assert(VA.getLocVT() == MVT::i64);
+ assert(VA.getValVT() == MVT::f32);
+ SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
+ RV = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+ MVT::f32, RV, Sub_f32),
+ 0);
+ break;
+ }
default:
break;
}
@@ -502,6 +809,15 @@ SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
return Chain;
}
+bool VETargetLowering::isOffsetFoldingLegal(
+ const GlobalAddressSDNode *GA) const {
+  // VE uses 64-bit addressing, so we need multiple instructions to generate
+  // an address. Folding an offset into the address increases the number of
+  // instructions, so we disable it here. Offsets are folded later in the
+  // DAG combine if it is worth doing.
+ return false;
+}
+
/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
@@ -531,30 +847,6 @@ bool VETargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
return true;
}
-bool VETargetLowering::hasAndNot(SDValue Y) const {
- EVT VT = Y.getValueType();
-
- // VE doesn't have vector and not instruction.
- if (VT.isVector())
- return false;
-
- // VE allows different immediate values for X and Y where ~X & Y.
- // Only simm7 works for X, and only mimm works for Y on VE. However, this
- // function is used to check whether an immediate value is OK for and-not
- // instruction as both X and Y. Generating additional instruction to
- // retrieve an immediate value is no good since the purpose of this
- // function is to convert a series of 3 instructions to another series of
- // 3 instructions with better parallelism. Therefore, we return false
- // for all immediate values now.
- // FIXME: Change hasAndNot function to have two operands to make it work
- // correctly with Aurora VE.
- if (isa<ConstantSDNode>(Y))
- return false;
-
- // It's ok for generic registers.
- return true;
-}
-
VETargetLowering::VETargetLowering(const TargetMachine &TM,
const VESubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
@@ -566,91 +858,15 @@ VETargetLowering::VETargetLowering(const TargetMachine &TM,
setBooleanContents(ZeroOrOneBooleanContent);
setBooleanVectorContents(ZeroOrOneBooleanContent);
- // Set up the register classes.
- addRegisterClass(MVT::i32, &VE::I32RegClass);
- addRegisterClass(MVT::i64, &VE::I64RegClass);
- addRegisterClass(MVT::f32, &VE::F32RegClass);
- addRegisterClass(MVT::f64, &VE::I64RegClass);
-
- /// Load & Store {
- for (MVT FPVT : MVT::fp_valuetypes()) {
- for (MVT OtherFPVT : MVT::fp_valuetypes()) {
- // Turn FP extload into load/fpextend
- setLoadExtAction(ISD::EXTLOAD, FPVT, OtherFPVT, Expand);
-
- // Turn FP truncstore into trunc + store.
- setTruncStoreAction(FPVT, OtherFPVT, Expand);
- }
- }
-
- // VE doesn't have i1 sign extending load
- for (MVT VT : MVT::integer_valuetypes()) {
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
- setTruncStoreAction(VT, MVT::i1, Expand);
- }
- /// } Load & Store
-
- // Custom legalize address nodes into LO/HI parts.
- MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
- setOperationAction(ISD::BlockAddress, PtrVT, Custom);
- setOperationAction(ISD::GlobalAddress, PtrVT, Custom);
- setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
-
- /// VAARG handling {
- setOperationAction(ISD::VASTART, MVT::Other, Custom);
- // VAARG needs to be lowered to access with 8 bytes alignment.
- setOperationAction(ISD::VAARG, MVT::Other, Custom);
- // Use the default implementation.
- setOperationAction(ISD::VACOPY, MVT::Other, Expand);
- setOperationAction(ISD::VAEND, MVT::Other, Expand);
- /// } VAARG handling
-
- /// Stack {
- setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
- setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
- /// } Stack
-
- /// Int Ops {
- for (MVT IntVT : {MVT::i32, MVT::i64}) {
- // VE has no REM or DIVREM operations.
- setOperationAction(ISD::UREM, IntVT, Expand);
- setOperationAction(ISD::SREM, IntVT, Expand);
- setOperationAction(ISD::SDIVREM, IntVT, Expand);
- setOperationAction(ISD::UDIVREM, IntVT, Expand);
-
- setOperationAction(ISD::CTTZ, IntVT, Expand);
- setOperationAction(ISD::ROTL, IntVT, Expand);
- setOperationAction(ISD::ROTR, IntVT, Expand);
-
- // Use isel patterns for i32 and i64
- setOperationAction(ISD::BSWAP, IntVT, Legal);
- setOperationAction(ISD::CTLZ, IntVT, Legal);
- setOperationAction(ISD::CTPOP, IntVT, Legal);
-
- // Use isel patterns for i64, Promote i32
- LegalizeAction Act = (IntVT == MVT::i32) ? Promote : Legal;
- setOperationAction(ISD::BITREVERSE, IntVT, Act);
- }
- /// } Int Ops
-
- /// Conversion {
- // VE doesn't have instructions for fp<->uint, so expand them by llvm
- setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote); // use i64
- setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote); // use i64
- setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
- setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
-
- // fp16 not supported
- for (MVT FPVT : MVT::fp_valuetypes()) {
- setOperationAction(ISD::FP16_TO_FP, FPVT, Expand);
- setOperationAction(ISD::FP_TO_FP16, FPVT, Expand);
- }
- /// } Conversion
+ initRegisterClasses();
+ initSPUActions();
+ initVPUActions();
setStackPointerRegisterToSaveRestore(VE::SX11);
+ // We have target-specific dag combine patterns for the following nodes:
+ setTargetDAGCombine(ISD::TRUNCATE);
+
// Set function alignment to 16 bytes
setMinFunctionAlignment(Align(16));
@@ -667,14 +883,24 @@ const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((VEISD::NodeType)Opcode) {
case VEISD::FIRST_NUMBER:
break;
- TARGET_NODE_CASE(Lo)
- TARGET_NODE_CASE(Hi)
+ TARGET_NODE_CASE(CALL)
+ TARGET_NODE_CASE(EH_SJLJ_LONGJMP)
+ TARGET_NODE_CASE(EH_SJLJ_SETJMP)
+ TARGET_NODE_CASE(EH_SJLJ_SETUP_DISPATCH)
TARGET_NODE_CASE(GETFUNPLT)
TARGET_NODE_CASE(GETSTACKTOP)
TARGET_NODE_CASE(GETTLSADDR)
- TARGET_NODE_CASE(CALL)
- TARGET_NODE_CASE(RET_FLAG)
TARGET_NODE_CASE(GLOBAL_BASE_REG)
+ TARGET_NODE_CASE(Hi)
+ TARGET_NODE_CASE(Lo)
+ TARGET_NODE_CASE(MEMBARRIER)
+ TARGET_NODE_CASE(RET_FLAG)
+ TARGET_NODE_CASE(TS1AM)
+ TARGET_NODE_CASE(VEC_BROADCAST)
+
+ // Register the VVP_* SDNodes.
+#define ADD_VVP_OP(VVP_NAME, ...) TARGET_NODE_CASE(VVP_NAME)
+#include "VVPNodes.def"
}
#undef TARGET_NODE_CASE
return nullptr;
@@ -696,10 +922,17 @@ SDValue VETargetLowering::withTargetFlags(SDValue Op, unsigned TF,
return DAG.getTargetBlockAddress(BA->getBlockAddress(), Op.getValueType(),
0, TF);
+ if (const ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op))
+ return DAG.getTargetConstantPool(CP->getConstVal(), CP->getValueType(0),
+ CP->getAlign(), CP->getOffset(), TF);
+
if (const ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Op))
return DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0),
TF);
+ if (const JumpTableSDNode *JT = dyn_cast<JumpTableSDNode>(Op))
+ return DAG.getTargetJumpTable(JT->getIndex(), JT->getValueType(0), TF);
+
llvm_unreachable("Unhandled address SDNode");
}
@@ -722,32 +955,24 @@ SDValue VETargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
// Handle PIC mode first. VE needs a got load for every variable!
if (isPositionIndependent()) {
- // GLOBAL_BASE_REG codegen'ed with call. Inform MFI that this
- // function has calls.
- MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
- MFI.setHasCalls(true);
auto GlobalN = dyn_cast<GlobalAddressSDNode>(Op);
- if (isa<ConstantPoolSDNode>(Op) ||
+ if (isa<ConstantPoolSDNode>(Op) || isa<JumpTableSDNode>(Op) ||
(GlobalN && GlobalN->getGlobal()->hasLocalLinkage())) {
// Create following instructions for local linkage PIC code.
- // lea %s35, %gotoff_lo(.LCPI0_0)
- // and %s35, %s35, (32)0
- // lea.sl %s35, %gotoff_hi(.LCPI0_0)(%s35)
- // adds.l %s35, %s15, %s35 ; %s15 is GOT
- // FIXME: use lea.sl %s35, %gotoff_hi(.LCPI0_0)(%s35, %s15)
+ // lea %reg, label@gotoff_lo
+ // and %reg, %reg, (32)0
+ // lea.sl %reg, label@gotoff_hi(%reg, %got)
SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOTOFF_HI32,
VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo);
}
// Create the following instructions for non-local linkage PIC code.
- // lea %s35, %got_lo(.LCPI0_0)
- // and %s35, %s35, (32)0
- // lea.sl %s35, %got_hi(.LCPI0_0)(%s35)
- // adds.l %s35, %s15, %s35 ; %s15 is GOT
- // ld %s35, (,%s35)
- // FIXME: use lea.sl %s35, %gotoff_hi(.LCPI0_0)(%s35, %s15)
+ // lea %reg, label@got_lo
+ // and %reg, %reg, (32)0
+ // lea.sl %reg, label@got_hi(%reg)
+ // ld %reg, (%reg, %got)
SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOT_HI32,
VEMCExpr::VK_VE_GOT_LO32, DAG);
SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
@@ -770,20 +995,222 @@ SDValue VETargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
/// Custom Lower {
-SDValue VETargetLowering::LowerGlobalAddress(SDValue Op,
+// The mappings for emitLeading/TrailingFence for VE are designed following
+// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
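+//
+// In summary, the fences emitted around an atomic operation by the two hooks
+// below are:
+//   acquire : op; fence acquire
+//   release : fence release; op
+//   acq_rel : fence release; op; fence acquire
+//   seq_cst : fence seq_cst; op; fence seq_cst
+//             (the leading seq_cst fence is emitted only for operations that
+//              include an atomic store)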
+Instruction *VETargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
+ switch (Ord) {
+ case AtomicOrdering::NotAtomic:
+ case AtomicOrdering::Unordered:
+ llvm_unreachable("Invalid fence: unordered/non-atomic");
+ case AtomicOrdering::Monotonic:
+ case AtomicOrdering::Acquire:
+ return nullptr; // Nothing to do
+ case AtomicOrdering::Release:
+ case AtomicOrdering::AcquireRelease:
+ return Builder.CreateFence(AtomicOrdering::Release);
+ case AtomicOrdering::SequentiallyConsistent:
+ if (!Inst->hasAtomicStore())
+ return nullptr; // Nothing to do
+ return Builder.CreateFence(AtomicOrdering::SequentiallyConsistent);
+ }
+ llvm_unreachable("Unknown fence ordering in emitLeadingFence");
+}
+
+Instruction *VETargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
+ switch (Ord) {
+ case AtomicOrdering::NotAtomic:
+ case AtomicOrdering::Unordered:
+ llvm_unreachable("Invalid fence: unordered/not-atomic");
+ case AtomicOrdering::Monotonic:
+ case AtomicOrdering::Release:
+ return nullptr; // Nothing to do
+ case AtomicOrdering::Acquire:
+ case AtomicOrdering::AcquireRelease:
+ return Builder.CreateFence(AtomicOrdering::Acquire);
+ case AtomicOrdering::SequentiallyConsistent:
+ return Builder.CreateFence(AtomicOrdering::SequentiallyConsistent);
+ }
+ llvm_unreachable("Unknown fence ordering in emitTrailingFence");
+}
+
+SDValue VETargetLowering::lowerATOMIC_FENCE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
+ cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
+ SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
+ cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+
+  // VE uses Release consistency, so we need a fence instruction if this is a
+  // cross-thread fence.
+ if (FenceSSID == SyncScope::System) {
+ switch (FenceOrdering) {
+ case AtomicOrdering::NotAtomic:
+ case AtomicOrdering::Unordered:
+ case AtomicOrdering::Monotonic:
+ // No need to generate fencem instruction here.
+ break;
+ case AtomicOrdering::Acquire:
+ // Generate "fencem 2" as acquire fence.
+ return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
+ DAG.getTargetConstant(2, DL, MVT::i32),
+ Op.getOperand(0)),
+ 0);
+ case AtomicOrdering::Release:
+ // Generate "fencem 1" as release fence.
+ return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
+ DAG.getTargetConstant(1, DL, MVT::i32),
+ Op.getOperand(0)),
+ 0);
+ case AtomicOrdering::AcquireRelease:
+ case AtomicOrdering::SequentiallyConsistent:
+ // Generate "fencem 3" as acq_rel and seq_cst fence.
+      // FIXME: "fencem 3" doesn't wait for PCIe device accesses, so seq_cst
+      //        may require more instructions for them.
+ return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
+ DAG.getTargetConstant(3, DL, MVT::i32),
+ Op.getOperand(0)),
+ 0);
+ }
+ }
+
+ // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+ return DAG.getNode(VEISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
+}
+
+TargetLowering::AtomicExpansionKind
+VETargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+ // We have TS1AM implementation for i8/i16/i32/i64, so use it.
+ if (AI->getOperation() == AtomicRMWInst::Xchg) {
+ return AtomicExpansionKind::None;
+ }
+ // FIXME: Support "ATMAM" instruction for LOAD_ADD/SUB/AND/OR.
+
+  // Otherwise, expand it using a compare-and-exchange instruction so that we
+  // don't call the __sync_fetch_and_* functions.
+ return AtomicExpansionKind::CmpXChg;
+}
+
+static SDValue prepareTS1AM(SDValue Op, SelectionDAG &DAG, SDValue &Flag,
+ SDValue &Bits) {
+ SDLoc DL(Op);
+ AtomicSDNode *N = cast<AtomicSDNode>(Op);
+ SDValue Ptr = N->getOperand(1);
+ SDValue Val = N->getOperand(2);
+ EVT PtrVT = Ptr.getValueType();
+ bool Byte = N->getMemoryVT() == MVT::i8;
+ // Remainder = AND Ptr, 3
+ // Flag = 1 << Remainder ; If Byte is true (1 byte swap flag)
+ // Flag = 3 << Remainder ; If Byte is false (2 bytes swap flag)
+ // Bits = Remainder << 3
+ // NewVal = Val << Bits
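+  //
+  // For example, a 2-byte access at an address with Remainder == 2 gives
+  // Flag = 3 << 2 = 0xc (byte lanes 2 and 3 of the aligned 4-byte word) and
+  // Bits = 2 << 3 = 16, so Val is shifted into bits [31:16].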
+ SDValue Const3 = DAG.getConstant(3, DL, PtrVT);
+ SDValue Remainder = DAG.getNode(ISD::AND, DL, PtrVT, {Ptr, Const3});
+ SDValue Mask = Byte ? DAG.getConstant(1, DL, MVT::i32)
+ : DAG.getConstant(3, DL, MVT::i32);
+ Flag = DAG.getNode(ISD::SHL, DL, MVT::i32, {Mask, Remainder});
+ Bits = DAG.getNode(ISD::SHL, DL, PtrVT, {Remainder, Const3});
+ return DAG.getNode(ISD::SHL, DL, Val.getValueType(), {Val, Bits});
+}
+
+static SDValue finalizeTS1AM(SDValue Op, SelectionDAG &DAG, SDValue Data,
+ SDValue Bits) {
+ SDLoc DL(Op);
+ EVT VT = Data.getValueType();
+ bool Byte = cast<AtomicSDNode>(Op)->getMemoryVT() == MVT::i8;
+ // NewData = Data >> Bits
+ // Result = NewData & 0xff ; If Byte is true (1 byte)
+ // Result = NewData & 0xffff ; If Byte is false (2 bytes)
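+  //
+  // Continuing the example above (a 2-byte access with Bits == 16), the old
+  // memory contents returned by TS1AM are shifted right by 16 and masked
+  // with 0xffff to recover the original 16-bit value.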
+
+ SDValue NewData = DAG.getNode(ISD::SRL, DL, VT, Data, Bits);
+ return DAG.getNode(ISD::AND, DL, VT,
+ {NewData, DAG.getConstant(Byte ? 0xff : 0xffff, DL, VT)});
+}
+
+SDValue VETargetLowering::lowerATOMIC_SWAP(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ AtomicSDNode *N = cast<AtomicSDNode>(Op);
+
+ if (N->getMemoryVT() == MVT::i8) {
+ // For i8, use "ts1am"
+ // Input:
+ // ATOMIC_SWAP Ptr, Val, Order
+ //
+ // Output:
+ // Remainder = AND Ptr, 3
+ // Flag = 1 << Remainder ; 1 byte swap flag for TS1AM inst.
+ // Bits = Remainder << 3
+ // NewVal = Val << Bits
+ //
+ // Aligned = AND Ptr, -4
+ // Data = TS1AM Aligned, Flag, NewVal
+ //
+ // NewData = Data >> Bits
+ // Result = NewData & 0xff ; 1 byte result
+ SDValue Flag;
+ SDValue Bits;
+ SDValue NewVal = prepareTS1AM(Op, DAG, Flag, Bits);
+
+ SDValue Ptr = N->getOperand(1);
+ SDValue Aligned = DAG.getNode(ISD::AND, DL, Ptr.getValueType(),
+ {Ptr, DAG.getConstant(-4, DL, MVT::i64)});
+ SDValue TS1AM = DAG.getAtomic(VEISD::TS1AM, DL, N->getMemoryVT(),
+ DAG.getVTList(Op.getNode()->getValueType(0),
+ Op.getNode()->getValueType(1)),
+ {N->getChain(), Aligned, Flag, NewVal},
+ N->getMemOperand());
+
+ SDValue Result = finalizeTS1AM(Op, DAG, TS1AM, Bits);
+ SDValue Chain = TS1AM.getValue(1);
+ return DAG.getMergeValues({Result, Chain}, DL);
+ }
+ if (N->getMemoryVT() == MVT::i16) {
+ // For i16, use "ts1am"
+ SDValue Flag;
+ SDValue Bits;
+ SDValue NewVal = prepareTS1AM(Op, DAG, Flag, Bits);
+
+ SDValue Ptr = N->getOperand(1);
+ SDValue Aligned = DAG.getNode(ISD::AND, DL, Ptr.getValueType(),
+ {Ptr, DAG.getConstant(-4, DL, MVT::i64)});
+ SDValue TS1AM = DAG.getAtomic(VEISD::TS1AM, DL, N->getMemoryVT(),
+ DAG.getVTList(Op.getNode()->getValueType(0),
+ Op.getNode()->getValueType(1)),
+ {N->getChain(), Aligned, Flag, NewVal},
+ N->getMemOperand());
+
+ SDValue Result = finalizeTS1AM(Op, DAG, TS1AM, Bits);
+ SDValue Chain = TS1AM.getValue(1);
+ return DAG.getMergeValues({Result, Chain}, DL);
+ }
+ // Otherwise, let llvm legalize it.
+ return Op;
+}
+
+SDValue VETargetLowering::lowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
return makeAddress(Op, DAG);
}
-SDValue VETargetLowering::LowerBlockAddress(SDValue Op,
+SDValue VETargetLowering::lowerBlockAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ return makeAddress(Op, DAG);
+}
+
+SDValue VETargetLowering::lowerConstantPool(SDValue Op,
SelectionDAG &DAG) const {
return makeAddress(Op, DAG);
}
SDValue
-VETargetLowering::LowerToTLSGeneralDynamicModel(SDValue Op,
+VETargetLowering::lowerToTLSGeneralDynamicModel(SDValue Op,
SelectionDAG &DAG) const {
- SDLoc dl(Op);
+ SDLoc DL(Op);
// Generate the following code:
// t1: ch,glue = callseq_start t0, 0, 0
@@ -799,13 +1226,13 @@ VETargetLowering::LowerToTLSGeneralDynamicModel(SDValue Op,
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
const uint32_t *Mask = Subtarget->getRegisterInfo()->getCallPreservedMask(
DAG.getMachineFunction(), CallingConv::C);
- Chain = DAG.getCALLSEQ_START(Chain, 64, 0, dl);
+ Chain = DAG.getCALLSEQ_START(Chain, 64, 0, DL);
SDValue Args[] = {Chain, Label, DAG.getRegisterMask(Mask), Chain.getValue(1)};
- Chain = DAG.getNode(VEISD::GETTLSADDR, dl, NodeTys, Args);
- Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(64, dl, true),
- DAG.getIntPtrConstant(0, dl, true),
- Chain.getValue(1), dl);
- Chain = DAG.getCopyFromReg(Chain, dl, VE::SX0, PtrVT, Chain.getValue(1));
+ Chain = DAG.getNode(VEISD::GETTLSADDR, DL, NodeTys, Args);
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(64, DL, true),
+ DAG.getIntPtrConstant(0, DL, true),
+ Chain.getValue(1), DL);
+ Chain = DAG.getCopyFromReg(Chain, DL, VE::SX0, PtrVT, Chain.getValue(1));
// GETTLSADDR will be codegen'ed as call. Inform MFI that function has calls.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
@@ -820,17 +1247,133 @@ VETargetLowering::LowerToTLSGeneralDynamicModel(SDValue Op,
return Chain;
}
-SDValue VETargetLowering::LowerGlobalTLSAddress(SDValue Op,
+SDValue VETargetLowering::lowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
// The current implementation of nld (2.26) doesn't allow local exec model
// code described in VE-tls_v1.1.pdf (*1) as its input. Instead, we always
// generate the general dynamic model code sequence.
//
// *1: https://www.nec.com/en/global/prod/hpc/aurora/document/VE-tls_v1.1.pdf
- return LowerToTLSGeneralDynamicModel(Op, DAG);
+ return lowerToTLSGeneralDynamicModel(Op, DAG);
+}
+
+SDValue VETargetLowering::lowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+ return makeAddress(Op, DAG);
+}
+
+// Lower a f128 load into two f64 loads.
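+// The low half is read from 0(addr) and the high half from 8(addr); the two
+// halves are then reassembled into an f128 register pair with INSERT_SUBREG
+// (high half into sub_even, low half into sub_odd).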
+static SDValue lowerLoadF128(SDValue Op, SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ LoadSDNode *LdNode = dyn_cast<LoadSDNode>(Op.getNode());
+ assert(LdNode && LdNode->getOffset().isUndef() && "Unexpected node type");
+ unsigned Alignment = LdNode->getAlign().value();
+ if (Alignment > 8)
+ Alignment = 8;
+
+ SDValue Lo64 =
+ DAG.getLoad(MVT::f64, DL, LdNode->getChain(), LdNode->getBasePtr(),
+ LdNode->getPointerInfo(), Alignment,
+ LdNode->isVolatile() ? MachineMemOperand::MOVolatile
+ : MachineMemOperand::MONone);
+ EVT AddrVT = LdNode->getBasePtr().getValueType();
+ SDValue HiPtr = DAG.getNode(ISD::ADD, DL, AddrVT, LdNode->getBasePtr(),
+ DAG.getConstant(8, DL, AddrVT));
+ SDValue Hi64 =
+ DAG.getLoad(MVT::f64, DL, LdNode->getChain(), HiPtr,
+ LdNode->getPointerInfo(), Alignment,
+ LdNode->isVolatile() ? MachineMemOperand::MOVolatile
+ : MachineMemOperand::MONone);
+
+ SDValue SubRegEven = DAG.getTargetConstant(VE::sub_even, DL, MVT::i32);
+ SDValue SubRegOdd = DAG.getTargetConstant(VE::sub_odd, DL, MVT::i32);
+
+ // VE stores Hi64 to 8(addr) and Lo64 to 0(addr)
+ SDNode *InFP128 =
+ DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f128);
+ InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f128,
+ SDValue(InFP128, 0), Hi64, SubRegEven);
+ InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f128,
+ SDValue(InFP128, 0), Lo64, SubRegOdd);
+ SDValue OutChains[2] = {SDValue(Lo64.getNode(), 1),
+ SDValue(Hi64.getNode(), 1)};
+ SDValue OutChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
+ SDValue Ops[2] = {SDValue(InFP128, 0), OutChain};
+ return DAG.getMergeValues(Ops, DL);
}
-SDValue VETargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+SDValue VETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ LoadSDNode *LdNode = cast<LoadSDNode>(Op.getNode());
+
+ SDValue BasePtr = LdNode->getBasePtr();
+ if (isa<FrameIndexSDNode>(BasePtr.getNode())) {
+    // Do not expand a load instruction with a frame index here because of
+ // dependency problems. We expand it later in eliminateFrameIndex().
+ return Op;
+ }
+
+ EVT MemVT = LdNode->getMemoryVT();
+ if (MemVT == MVT::f128)
+ return lowerLoadF128(Op, DAG);
+
+ return Op;
+}
+
+// Lower a f128 store into two f64 stores.
+static SDValue lowerStoreF128(SDValue Op, SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ StoreSDNode *StNode = dyn_cast<StoreSDNode>(Op.getNode());
+ assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type");
+
+ SDValue SubRegEven = DAG.getTargetConstant(VE::sub_even, DL, MVT::i32);
+ SDValue SubRegOdd = DAG.getTargetConstant(VE::sub_odd, DL, MVT::i32);
+
+ SDNode *Hi64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i64,
+ StNode->getValue(), SubRegEven);
+ SDNode *Lo64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i64,
+ StNode->getValue(), SubRegOdd);
+
+ unsigned Alignment = StNode->getAlign().value();
+ if (Alignment > 8)
+ Alignment = 8;
+
+ // VE stores Hi64 to 8(addr) and Lo64 to 0(addr)
+ SDValue OutChains[2];
+ OutChains[0] =
+ DAG.getStore(StNode->getChain(), DL, SDValue(Lo64, 0),
+ StNode->getBasePtr(), MachinePointerInfo(), Alignment,
+ StNode->isVolatile() ? MachineMemOperand::MOVolatile
+ : MachineMemOperand::MONone);
+ EVT AddrVT = StNode->getBasePtr().getValueType();
+ SDValue HiPtr = DAG.getNode(ISD::ADD, DL, AddrVT, StNode->getBasePtr(),
+ DAG.getConstant(8, DL, AddrVT));
+ OutChains[1] =
+ DAG.getStore(StNode->getChain(), DL, SDValue(Hi64, 0), HiPtr,
+ MachinePointerInfo(), Alignment,
+ StNode->isVolatile() ? MachineMemOperand::MOVolatile
+ : MachineMemOperand::MONone);
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
+}
+
+SDValue VETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+ StoreSDNode *StNode = cast<StoreSDNode>(Op.getNode());
+ assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type");
+
+ SDValue BasePtr = StNode->getBasePtr();
+ if (isa<FrameIndexSDNode>(BasePtr.getNode())) {
+ // Do not expand store instruction with frame index here because of
+ // dependency problems. We expand it later in eliminateFrameIndex().
+ return Op;
+ }
+
+ EVT MemVT = StNode->getMemoryVT();
+ if (MemVT == MVT::f128)
+ return lowerStoreF128(Op, DAG);
+
+ // Otherwise, ask llvm to expand it.
+ return SDValue();
+}
+
+SDValue VETargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
auto PtrVT = getPointerTy(DAG.getDataLayout());
@@ -849,7 +1392,7 @@ SDValue VETargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachinePointerInfo(SV));
}
-SDValue VETargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
+SDValue VETargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDNode *Node = Op.getNode();
EVT VT = Node->getValueType(0);
SDValue InChain = Node->getOperand(0);
@@ -862,7 +1405,19 @@ SDValue VETargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = VAList.getValue(1);
SDValue NextPtr;
- if (VT == MVT::f32) {
+ if (VT == MVT::f128) {
+    // VE f128 values must be stored with 16-byte alignment. We don't know
+    // the actual alignment of VAList, so we align the pointer dynamically.
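+    // For example, a VAList that is 8 bytes past a 16-byte boundary is
+    // rounded up by 8 to the next boundary before the 16-byte f128 value is
+    // read, and NextPtr then advances a further 16 bytes.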
+ int Align = 16;
+ VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
+ DAG.getConstant(Align - 1, DL, PtrVT));
+ VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
+ DAG.getConstant(-Align, DL, PtrVT));
+ // Increment the pointer, VAList, by 16 to the next vaarg.
+ NextPtr =
+ DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(16, DL));
+ } else if (VT == MVT::f32) {
// float --> need special handling like below.
// 0 4
// +------+------+
@@ -955,22 +1510,1325 @@ SDValue VETargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op,
return DAG.getMergeValues(Ops, DL);
}
+SDValue VETargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ return DAG.getNode(VEISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0),
+ Op.getOperand(1));
+}
+
+SDValue VETargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ return DAG.getNode(VEISD::EH_SJLJ_SETJMP, DL,
+ DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
+ Op.getOperand(1));
+}
+
+SDValue VETargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ return DAG.getNode(VEISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
+ Op.getOperand(0));
+}
+
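+// Lower FRAMEADDR. Depth 0 returns the frame register itself; each additional
+// level of depth follows the pointer stored at offset 0 of the current frame
+// (the saved frame pointer). lowerRETURNADDR below builds on this and reads
+// the return address from offset 8 of the selected frame.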
+static SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG,
+ const VETargetLowering &TLI,
+ const VESubtarget *Subtarget) {
+ SDLoc DL(Op);
+ MachineFunction &MF = DAG.getMachineFunction();
+ EVT PtrVT = TLI.getPointerTy(MF.getDataLayout());
+
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MFI.setFrameAddressIsTaken(true);
+
+ unsigned Depth = Op.getConstantOperandVal(0);
+ const VERegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ unsigned FrameReg = RegInfo->getFrameRegister(MF);
+ SDValue FrameAddr =
+ DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, PtrVT);
+ while (Depth--)
+ FrameAddr = DAG.getLoad(Op.getValueType(), DL, DAG.getEntryNode(),
+ FrameAddr, MachinePointerInfo());
+ return FrameAddr;
+}
+
+static SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
+ const VETargetLowering &TLI,
+ const VESubtarget *Subtarget) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MFI.setReturnAddressIsTaken(true);
+
+ if (TLI.verifyReturnAddressArgumentIsConstant(Op, DAG))
+ return SDValue();
+
+ SDValue FrameAddr = lowerFRAMEADDR(Op, DAG, TLI, Subtarget);
+
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue Offset = DAG.getConstant(8, DL, VT);
+ return DAG.getLoad(VT, DL, DAG.getEntryNode(),
+ DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
+ MachinePointerInfo());
+}
+
+SDValue VETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ switch (IntNo) {
+ default: // Don't custom lower most intrinsics.
+ return SDValue();
+ case Intrinsic::eh_sjlj_lsda: {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MVT VT = Op.getSimpleValueType();
+ const VETargetMachine *TM =
+ static_cast<const VETargetMachine *>(&DAG.getTarget());
+
+ // Create GCC_except_tableXX string. The real symbol for that will be
+ // generated in EHStreamer::emitExceptionTable() later. So, we just
+    // borrow its name here.
+ TM->getStrList()->push_back(std::string(
+ (Twine("GCC_except_table") + Twine(MF.getFunctionNumber())).str()));
+ SDValue Addr =
+ DAG.getTargetExternalSymbol(TM->getStrList()->back().c_str(), VT, 0);
+ if (isPositionIndependent()) {
+ Addr = makeHiLoPair(Addr, VEMCExpr::VK_VE_GOTOFF_HI32,
+ VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
+ SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, VT);
+ return DAG.getNode(ISD::ADD, DL, VT, GlobalBase, Addr);
+ }
+ return makeHiLoPair(Addr, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
+ }
+ }
+}
+
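+// Return true when exactly one operand of the given BUILD_VECTOR node is not
+// undef and set UniqueIdx to its index. For example,
+// (build_vector undef, undef, %x, undef) returns true with UniqueIdx == 2.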
+static bool getUniqueInsertion(SDNode *N, unsigned &UniqueIdx) {
+ if (!isa<BuildVectorSDNode>(N))
+ return false;
+ const auto *BVN = cast<BuildVectorSDNode>(N);
+
+ // Find first non-undef insertion.
+ unsigned Idx;
+ for (Idx = 0; Idx < BVN->getNumOperands(); ++Idx) {
+ auto ElemV = BVN->getOperand(Idx);
+ if (!ElemV->isUndef())
+ break;
+ }
+ // Catch the (hypothetical) all-undef case.
+ if (Idx == BVN->getNumOperands())
+ return false;
+ // Remember insertion.
+ UniqueIdx = Idx++;
+ // Verify that all other insertions are undef.
+ for (; Idx < BVN->getNumOperands(); ++Idx) {
+ auto ElemV = BVN->getOperand(Idx);
+ if (!ElemV->isUndef())
+ return false;
+ }
+ return true;
+}
+
+static SDValue getSplatValue(SDNode *N) {
+ if (auto *BuildVec = dyn_cast<BuildVectorSDNode>(N)) {
+ return BuildVec->getSplatValue();
+ }
+ return SDValue();
+}
+
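+// Lower BUILD_VECTOR. A vector with exactly one non-undef element becomes an
+// INSERT_VECTOR_ELT into an undef vector, and a splat becomes VEC_BROADCAST
+// with the element count as AVL; for example, a 256-element splat of %x
+// lowers to (VEC_BROADCAST %x, 256). Anything else falls back to the default
+// expansion.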
+SDValue VETargetLowering::lowerBUILD_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ unsigned NumEls = Op.getValueType().getVectorNumElements();
+ MVT ElemVT = Op.getSimpleValueType().getVectorElementType();
+
+  // If there is just one non-undef element, expand to INSERT_VECTOR_ELT.
+ unsigned UniqueIdx;
+ if (getUniqueInsertion(Op.getNode(), UniqueIdx)) {
+ SDValue AccuV = DAG.getUNDEF(Op.getValueType());
+ auto ElemV = Op->getOperand(UniqueIdx);
+ SDValue IdxV = DAG.getConstant(UniqueIdx, DL, MVT::i64);
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), AccuV,
+ ElemV, IdxV);
+ }
+
+ // Else emit a broadcast.
+ if (SDValue ScalarV = getSplatValue(Op.getNode())) {
+ // lower to VEC_BROADCAST
+ MVT LegalResVT = MVT::getVectorVT(ElemVT, 256);
+
+ auto AVL = DAG.getConstant(NumEls, DL, MVT::i32);
+ return DAG.getNode(VEISD::VEC_BROADCAST, DL, LegalResVT, Op.getOperand(0),
+ AVL);
+ }
+
+ // Expand
+ return SDValue();
+}
+
SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
- switch (Op.getOpcode()) {
+ unsigned Opcode = Op.getOpcode();
+ if (ISD::isVPOpcode(Opcode))
+ return lowerToVVP(Op, DAG);
+
+ switch (Opcode) {
default:
llvm_unreachable("Should not custom lower this!");
+ case ISD::ATOMIC_FENCE:
+ return lowerATOMIC_FENCE(Op, DAG);
+ case ISD::ATOMIC_SWAP:
+ return lowerATOMIC_SWAP(Op, DAG);
case ISD::BlockAddress:
- return LowerBlockAddress(Op, DAG);
+ return lowerBlockAddress(Op, DAG);
+ case ISD::ConstantPool:
+ return lowerConstantPool(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
return lowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::EH_SJLJ_LONGJMP:
+ return lowerEH_SJLJ_LONGJMP(Op, DAG);
+ case ISD::EH_SJLJ_SETJMP:
+ return lowerEH_SJLJ_SETJMP(Op, DAG);
+ case ISD::EH_SJLJ_SETUP_DISPATCH:
+ return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
+ case ISD::FRAMEADDR:
+ return lowerFRAMEADDR(Op, DAG, *this, Subtarget);
case ISD::GlobalAddress:
- return LowerGlobalAddress(Op, DAG);
+ return lowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress:
- return LowerGlobalTLSAddress(Op, DAG);
+ return lowerGlobalTLSAddress(Op, DAG);
+ case ISD::INTRINSIC_WO_CHAIN:
+ return lowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::JumpTable:
+ return lowerJumpTable(Op, DAG);
+ case ISD::LOAD:
+ return lowerLOAD(Op, DAG);
+ case ISD::RETURNADDR:
+ return lowerRETURNADDR(Op, DAG, *this, Subtarget);
+ case ISD::BUILD_VECTOR:
+ return lowerBUILD_VECTOR(Op, DAG);
+ case ISD::STORE:
+ return lowerSTORE(Op, DAG);
case ISD::VASTART:
- return LowerVASTART(Op, DAG);
+ return lowerVASTART(Op, DAG);
case ISD::VAARG:
- return LowerVAARG(Op, DAG);
+ return lowerVAARG(Op, DAG);
+
+ case ISD::INSERT_VECTOR_ELT:
+ return lowerINSERT_VECTOR_ELT(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT:
+ return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+
+#define ADD_BINARY_VVP_OP(VVP_NAME, ISD_NAME) case ISD::ISD_NAME:
+#include "VVPNodes.def"
+ return lowerToVVP(Op, DAG);
}
}
/// } Custom Lower
+
+void VETargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ switch (N->getOpcode()) {
+ case ISD::ATOMIC_SWAP:
+ // Let LLVM expand atomic swap instruction through LowerOperation.
+ return;
+ default:
+ LLVM_DEBUG(N->dumpr(&DAG));
+ llvm_unreachable("Do not know how to custom type legalize this operation!");
+ }
+}
+
+/// JumpTable for VE.
+///
+/// VE cannot generate relocatable symbols in jump tables. In particular, it
+/// cannot generate expressions that use symbols from both the text segment
+/// and the data segment, like below.
+///      .4byte  .LBB0_2-.LJTI0_0
+/// So, as a custom label, we generate an offset from the top of the function
+/// like below instead.
+///      .4byte  .LBB0_2-<function name>
+
+unsigned VETargetLowering::getJumpTableEncoding() const {
+ // Use custom label for PIC.
+ if (isPositionIndependent())
+ return MachineJumpTableInfo::EK_Custom32;
+
+ // Otherwise, use the normal jump table encoding heuristics.
+ return TargetLowering::getJumpTableEncoding();
+}
+
+const MCExpr *VETargetLowering::LowerCustomJumpTableEntry(
+ const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB,
+ unsigned Uid, MCContext &Ctx) const {
+ assert(isPositionIndependent());
+
+  // Generate a custom label for PIC like below.
+  //    .4byte  .LBB0_2-<function name>
+ const auto *Value = MCSymbolRefExpr::create(MBB->getSymbol(), Ctx);
+ MCSymbol *Sym = Ctx.getOrCreateSymbol(MBB->getParent()->getName().data());
+ const auto *Base = MCSymbolRefExpr::create(Sym, Ctx);
+ return MCBinaryExpr::createSub(Value, Base, Ctx);
+}
+
+SDValue VETargetLowering::getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const {
+ assert(isPositionIndependent());
+ SDLoc DL(Table);
+ Function *Function = &DAG.getMachineFunction().getFunction();
+ assert(Function != nullptr);
+ auto PtrTy = getPointerTy(DAG.getDataLayout(), Function->getAddressSpace());
+
+  // In the jump table, we have the following values in PIC mode.
+  //    .4byte  .LBB0_2-<function name>
+  // We need to add this value and the address of this function to generate
+  // the .LBB0_2 label correctly under PIC mode. So, we want to generate the
+  // following instructions:
+ // lea %reg, fun@gotoff_lo
+ // and %reg, %reg, (32)0
+ // lea.sl %reg, fun@gotoff_hi(%reg, %got)
+  // In order to do so, we need to generate a correctly marked DAG node using
+  // makeHiLoPair.
+ SDValue Op = DAG.getGlobalAddress(Function, DL, PtrTy);
+ SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOTOFF_HI32,
+ VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
+ SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrTy);
+ return DAG.getNode(ISD::ADD, DL, PtrTy, GlobalBase, HiLo);
+}
+
+Register VETargetLowering::prepareMBB(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock *TargetBB,
+ const DebugLoc &DL) const {
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const VEInstrInfo *TII = Subtarget->getInstrInfo();
+
+ const TargetRegisterClass *RC = &VE::I64RegClass;
+ Register Tmp1 = MRI.createVirtualRegister(RC);
+ Register Tmp2 = MRI.createVirtualRegister(RC);
+ Register Result = MRI.createVirtualRegister(RC);
+
+ if (isPositionIndependent()) {
+ // Create following instructions for local linkage PIC code.
+ // lea %Tmp1, TargetBB@gotoff_lo
+ // and %Tmp2, %Tmp1, (32)0
+ // lea.sl %Result, TargetBB@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
+ BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
+ .addImm(0)
+ .addImm(0)
+ .addMBB(TargetBB, VEMCExpr::VK_VE_GOTOFF_LO32);
+ BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
+ .addReg(Tmp1, getKillRegState(true))
+ .addImm(M0(32));
+ BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Result)
+ .addReg(VE::SX15)
+ .addReg(Tmp2, getKillRegState(true))
+ .addMBB(TargetBB, VEMCExpr::VK_VE_GOTOFF_HI32);
+ } else {
+ // Create following instructions for non-PIC code.
+ // lea %Tmp1, TargetBB@lo
+ // and %Tmp2, %Tmp1, (32)0
+ // lea.sl %Result, TargetBB@hi(%Tmp2)
+ BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
+ .addImm(0)
+ .addImm(0)
+ .addMBB(TargetBB, VEMCExpr::VK_VE_LO32);
+ BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
+ .addReg(Tmp1, getKillRegState(true))
+ .addImm(M0(32));
+ BuildMI(MBB, I, DL, TII->get(VE::LEASLrii), Result)
+ .addReg(Tmp2, getKillRegState(true))
+ .addImm(0)
+ .addMBB(TargetBB, VEMCExpr::VK_VE_HI32);
+ }
+ return Result;
+}
+
+Register VETargetLowering::prepareSymbol(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ StringRef Symbol, const DebugLoc &DL,
+ bool IsLocal = false,
+ bool IsCall = false) const {
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const VEInstrInfo *TII = Subtarget->getInstrInfo();
+
+ const TargetRegisterClass *RC = &VE::I64RegClass;
+ Register Result = MRI.createVirtualRegister(RC);
+
+ if (isPositionIndependent()) {
+ if (IsCall && !IsLocal) {
+      // Create the following instructions for non-local linkage PIC code
+      // function calls. These instructions use IC and the magic number -24,
+      // so we expand them in VEAsmPrinter.cpp from the GETFUNPLT pseudo
+      // instruction.
+ // lea %Reg, Symbol@plt_lo(-24)
+ // and %Reg, %Reg, (32)0
+ // sic %s16
+ // lea.sl %Result, Symbol@plt_hi(%Reg, %s16) ; %s16 is PLT
+ BuildMI(MBB, I, DL, TII->get(VE::GETFUNPLT), Result)
+ .addExternalSymbol("abort");
+ } else if (IsLocal) {
+ Register Tmp1 = MRI.createVirtualRegister(RC);
+ Register Tmp2 = MRI.createVirtualRegister(RC);
+ // Create following instructions for local linkage PIC code.
+ // lea %Tmp1, Symbol@gotoff_lo
+ // and %Tmp2, %Tmp1, (32)0
+ // lea.sl %Result, Symbol@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
+ BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
+ .addImm(0)
+ .addImm(0)
+ .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOTOFF_LO32);
+ BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
+ .addReg(Tmp1, getKillRegState(true))
+ .addImm(M0(32));
+ BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Result)
+ .addReg(VE::SX15)
+ .addReg(Tmp2, getKillRegState(true))
+ .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOTOFF_HI32);
+ } else {
+ Register Tmp1 = MRI.createVirtualRegister(RC);
+ Register Tmp2 = MRI.createVirtualRegister(RC);
+      // Create the following instructions for non-local linkage PIC code.
+      // lea %Tmp1, Symbol@got_lo
+      // and %Tmp2, %Tmp1, (32)0
+      // lea.sl %Tmp3, Symbol@got_hi(%Tmp2, %s15) ; %s15 is GOT
+ // ld %Result, 0(%Tmp3)
+ Register Tmp3 = MRI.createVirtualRegister(RC);
+ BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
+ .addImm(0)
+ .addImm(0)
+ .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOT_LO32);
+ BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
+ .addReg(Tmp1, getKillRegState(true))
+ .addImm(M0(32));
+ BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Tmp3)
+ .addReg(VE::SX15)
+ .addReg(Tmp2, getKillRegState(true))
+ .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOT_HI32);
+ BuildMI(MBB, I, DL, TII->get(VE::LDrii), Result)
+ .addReg(Tmp3, getKillRegState(true))
+ .addImm(0)
+ .addImm(0);
+ }
+ } else {
+ Register Tmp1 = MRI.createVirtualRegister(RC);
+ Register Tmp2 = MRI.createVirtualRegister(RC);
+ // Create following instructions for non-PIC code.
+ // lea %Tmp1, Symbol@lo
+ // and %Tmp2, %Tmp1, (32)0
+ // lea.sl %Result, Symbol@hi(%Tmp2)
+ BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
+ .addImm(0)
+ .addImm(0)
+ .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_LO32);
+ BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
+ .addReg(Tmp1, getKillRegState(true))
+ .addImm(M0(32));
+ BuildMI(MBB, I, DL, TII->get(VE::LEASLrii), Result)
+ .addReg(Tmp2, getKillRegState(true))
+ .addImm(0)
+ .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_HI32);
+ }
+ return Result;
+}
+
+void VETargetLowering::setupEntryBlockForSjLj(MachineInstr &MI,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock *DispatchBB,
+ int FI, int Offset) const {
+ DebugLoc DL = MI.getDebugLoc();
+ const VEInstrInfo *TII = Subtarget->getInstrInfo();
+
+ Register LabelReg =
+ prepareMBB(*MBB, MachineBasicBlock::iterator(MI), DispatchBB, DL);
+
+  // Store the address of DispatchBB into the given jmpbuf[1], which holds
+  // the next IC referenced by longjmp (throw) later.
+ MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
+ addFrameReference(MIB, FI, Offset); // jmpbuf[1]
+ MIB.addReg(LabelReg, getKillRegState(true));
+}
+
+MachineBasicBlock *
+VETargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator I = ++MBB->getIterator();
+
+ // Memory Reference.
+ SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
+ MI.memoperands_end());
+ Register BufReg = MI.getOperand(1).getReg();
+
+ Register DstReg;
+
+ DstReg = MI.getOperand(0).getReg();
+ const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+ assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
+ (void)TRI;
+ Register MainDestReg = MRI.createVirtualRegister(RC);
+ Register RestoreDestReg = MRI.createVirtualRegister(RC);
+
+ // For `v = call @llvm.eh.sjlj.setjmp(buf)`, we generate following
+ // instructions. SP/FP must be saved in jmpbuf before `llvm.eh.sjlj.setjmp`.
+ //
+ // ThisMBB:
+ // buf[3] = %s17 iff %s17 is used as BP
+ // buf[1] = RestoreMBB as IC after longjmp
+ // # SjLjSetup RestoreMBB
+ //
+ // MainMBB:
+ // v_main = 0
+ //
+ // SinkMBB:
+ // v = phi(v_main, MainMBB, v_restore, RestoreMBB)
+ // ...
+ //
+ // RestoreMBB:
+  //  %s17 = buf[3] iff %s17 is used as BP
+ // v_restore = 1
+ // goto SinkMBB
+
+ MachineBasicBlock *ThisMBB = MBB;
+ MachineBasicBlock *MainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *SinkMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *RestoreMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, MainMBB);
+ MF->insert(I, SinkMBB);
+ MF->push_back(RestoreMBB);
+ RestoreMBB->setHasAddressTaken();
+
+ // Transfer the remainder of BB and its successor edges to SinkMBB.
+ SinkMBB->splice(SinkMBB->begin(), MBB,
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // ThisMBB:
+ Register LabelReg =
+ prepareMBB(*MBB, MachineBasicBlock::iterator(MI), RestoreMBB, DL);
+
+ // Store BP in buf[3] iff this function is using BP.
+ const VEFrameLowering *TFI = Subtarget->getFrameLowering();
+ if (TFI->hasBP(*MF)) {
+ MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
+ MIB.addReg(BufReg);
+ MIB.addImm(0);
+ MIB.addImm(24);
+ MIB.addReg(VE::SX17);
+ MIB.setMemRefs(MMOs);
+ }
+
+ // Store IP in buf[1].
+ MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
+ MIB.add(MI.getOperand(1)); // we can preserve the kill flags here.
+ MIB.addImm(0);
+ MIB.addImm(8);
+ MIB.addReg(LabelReg, getKillRegState(true));
+ MIB.setMemRefs(MMOs);
+
+ // SP/FP are already stored in jmpbuf before `llvm.eh.sjlj.setjmp`.
+
+ // Insert setup.
+ MIB =
+ BuildMI(*ThisMBB, MI, DL, TII->get(VE::EH_SjLj_Setup)).addMBB(RestoreMBB);
+
+ const VERegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ MIB.addRegMask(RegInfo->getNoPreservedMask());
+ ThisMBB->addSuccessor(MainMBB);
+ ThisMBB->addSuccessor(RestoreMBB);
+
+ // MainMBB:
+ BuildMI(MainMBB, DL, TII->get(VE::LEAzii), MainDestReg)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0);
+ MainMBB->addSuccessor(SinkMBB);
+
+ // SinkMBB:
+ BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(VE::PHI), DstReg)
+ .addReg(MainDestReg)
+ .addMBB(MainMBB)
+ .addReg(RestoreDestReg)
+ .addMBB(RestoreMBB);
+
+ // RestoreMBB:
+ // Restore BP from buf[3] iff this function is using BP. The address of
+ // buf is in SX10.
+ // FIXME: Better to not use SX10 here
+ if (TFI->hasBP(*MF)) {
+ MachineInstrBuilder MIB =
+ BuildMI(RestoreMBB, DL, TII->get(VE::LDrii), VE::SX17);
+ MIB.addReg(VE::SX10);
+ MIB.addImm(0);
+ MIB.addImm(24);
+ MIB.setMemRefs(MMOs);
+ }
+ BuildMI(RestoreMBB, DL, TII->get(VE::LEAzii), RestoreDestReg)
+ .addImm(0)
+ .addImm(0)
+ .addImm(1);
+ BuildMI(RestoreMBB, DL, TII->get(VE::BRCFLa_t)).addMBB(SinkMBB);
+ RestoreMBB->addSuccessor(SinkMBB);
+
+ MI.eraseFromParent();
+ return SinkMBB;
+}
+
+MachineBasicBlock *
+VETargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ // Memory Reference.
+ SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
+ MI.memoperands_end());
+ Register BufReg = MI.getOperand(0).getReg();
+
+ Register Tmp = MRI.createVirtualRegister(&VE::I64RegClass);
+ // Since FP is only updated here but NOT referenced, it's treated as GPR.
+ Register FP = VE::SX9;
+ Register SP = VE::SX11;
+
+ MachineInstrBuilder MIB;
+
+ MachineBasicBlock *ThisMBB = MBB;
+
+ // For `call @llvm.eh.sjlj.longjmp(buf)`, we generate following instructions.
+ //
+ // ThisMBB:
+ // %fp = load buf[0]
+ // %jmp = load buf[1]
+ // %s10 = buf ; Store an address of buf to SX10 for RestoreMBB
+ // %sp = load buf[2] ; generated by llvm.eh.sjlj.setjmp.
+ // jmp %jmp
+
+ // Reload FP.
+ MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), FP);
+ MIB.addReg(BufReg);
+ MIB.addImm(0);
+ MIB.addImm(0);
+ MIB.setMemRefs(MMOs);
+
+ // Reload IP.
+ MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), Tmp);
+ MIB.addReg(BufReg);
+ MIB.addImm(0);
+ MIB.addImm(8);
+ MIB.setMemRefs(MMOs);
+
+ // Copy BufReg to SX10 for later use in setjmp.
+ // FIXME: Better to not use SX10 here
+ BuildMI(*ThisMBB, MI, DL, TII->get(VE::ORri), VE::SX10)
+ .addReg(BufReg)
+ .addImm(0);
+
+ // Reload SP.
+ MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), SP);
+ MIB.add(MI.getOperand(0)); // we can preserve the kill flags here.
+ MIB.addImm(0);
+ MIB.addImm(16);
+ MIB.setMemRefs(MMOs);
+
+ // Jump.
+ BuildMI(*ThisMBB, MI, DL, TII->get(VE::BCFLari_t))
+ .addReg(Tmp, getKillRegState(true))
+ .addImm(0);
+
+ MI.eraseFromParent();
+ return ThisMBB;
+}
+
+MachineBasicBlock *
+VETargetLowering::emitSjLjDispatchBlock(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = BB->getParent();
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const VEInstrInfo *TII = Subtarget->getInstrInfo();
+ int FI = MFI.getFunctionContextIndex();
+
+ // Get a mapping of the call site numbers to all of the landing pads they're
+ // associated with.
+ DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
+ unsigned MaxCSNum = 0;
+ for (auto &MBB : *MF) {
+ if (!MBB.isEHPad())
+ continue;
+
+ MCSymbol *Sym = nullptr;
+ for (const auto &MI : MBB) {
+ if (MI.isDebugInstr())
+ continue;
+
+ assert(MI.isEHLabel() && "expected EH_LABEL");
+ Sym = MI.getOperand(0).getMCSymbol();
+ break;
+ }
+
+ if (!MF->hasCallSiteLandingPad(Sym))
+ continue;
+
+ for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
+ CallSiteNumToLPad[CSI].push_back(&MBB);
+ MaxCSNum = std::max(MaxCSNum, CSI);
+ }
+ }
+
+ // Get an ordered list of the machine basic blocks for the jump table.
+ std::vector<MachineBasicBlock *> LPadList;
+ SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
+ LPadList.reserve(CallSiteNumToLPad.size());
+
+ for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
+ for (auto &LP : CallSiteNumToLPad[CSI]) {
+ LPadList.push_back(LP);
+ InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
+ }
+ }
+
+ assert(!LPadList.empty() &&
+ "No landing pad destinations for the dispatch jump table!");
+
+ // The %fn_context is allocated like below (from --print-after=sjljehprepare):
+ // %fn_context = alloca { i8*, i64, [4 x i64], i8*, i8*, [5 x i8*] }
+ //
+ // This `[5 x i8*]` is jmpbuf, so jmpbuf[1] is FI+72.
+ // First `i64` is callsite, so callsite is FI+8.
+ static const int OffsetIC = 72;
+ static const int OffsetCS = 8;
+
+ // Create the MBBs for the dispatch code like following:
+ //
+ // ThisMBB:
+ // Prepare DispatchBB address and store it to buf[1].
+ // ...
+ //
+ // DispatchBB:
+ // %s15 = GETGOT iff isPositionIndependent
+ // %callsite = load callsite
+ // brgt.l.t #size of callsites, %callsite, DispContBB
+ //
+ // TrapBB:
+ // Call abort.
+ //
+ // DispContBB:
+ // %breg = address of jump table
+ // %pc = load and calculate next pc from %breg and %callsite
+ // jmp %pc
+
+ // Shove the dispatch's address into the return slot in the function context.
+ MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
+ DispatchBB->setIsEHPad(true);
+
+  // TrapBB causes a trap like `assert(0)`.
+ MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
+ DispatchBB->addSuccessor(TrapBB);
+
+ MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
+ DispatchBB->addSuccessor(DispContBB);
+
+ // Insert MBBs.
+ MF->push_back(DispatchBB);
+ MF->push_back(DispContBB);
+ MF->push_back(TrapBB);
+
+ // Insert code to call abort in the TrapBB.
+ Register Abort = prepareSymbol(*TrapBB, TrapBB->end(), "abort", DL,
+ /* Local */ false, /* Call */ true);
+ BuildMI(TrapBB, DL, TII->get(VE::BSICrii), VE::SX10)
+ .addReg(Abort, getKillRegState(true))
+ .addImm(0)
+ .addImm(0);
+
+ // Insert code into the entry block that creates and registers the function
+ // context.
+ setupEntryBlockForSjLj(MI, BB, DispatchBB, FI, OffsetIC);
+
+ // Create the jump table and associated information
+ unsigned JTE = getJumpTableEncoding();
+ MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
+ unsigned MJTI = JTI->createJumpTableIndex(LPadList);
+
+ const VERegisterInfo &RI = TII->getRegisterInfo();
+ // Add a register mask with no preserved registers. This results in all
+ // registers being marked as clobbered.
+ BuildMI(DispatchBB, DL, TII->get(VE::NOP))
+ .addRegMask(RI.getNoPreservedMask());
+
+ if (isPositionIndependent()) {
+    // Force generation of GETGOT, since the current implementation doesn't
+    // preserve the GOT register.
+ BuildMI(DispatchBB, DL, TII->get(VE::GETGOT), VE::SX15);
+ }
+
+ // IReg is used as an index in a memory operand and therefore can't be SP
+ const TargetRegisterClass *RC = &VE::I64RegClass;
+ Register IReg = MRI.createVirtualRegister(RC);
+ addFrameReference(BuildMI(DispatchBB, DL, TII->get(VE::LDLZXrii), IReg), FI,
+ OffsetCS);
+ if (LPadList.size() < 64) {
+ BuildMI(DispatchBB, DL, TII->get(VE::BRCFLir_t))
+ .addImm(VECC::CC_ILE)
+ .addImm(LPadList.size())
+ .addReg(IReg)
+ .addMBB(TrapBB);
+ } else {
+ assert(LPadList.size() <= 0x7FFFFFFF && "Too large Landing Pad!");
+ Register TmpReg = MRI.createVirtualRegister(RC);
+ BuildMI(DispatchBB, DL, TII->get(VE::LEAzii), TmpReg)
+ .addImm(0)
+ .addImm(0)
+ .addImm(LPadList.size());
+ BuildMI(DispatchBB, DL, TII->get(VE::BRCFLrr_t))
+ .addImm(VECC::CC_ILE)
+ .addReg(TmpReg, getKillRegState(true))
+ .addReg(IReg)
+ .addMBB(TrapBB);
+ }
+
+ Register BReg = MRI.createVirtualRegister(RC);
+ Register Tmp1 = MRI.createVirtualRegister(RC);
+ Register Tmp2 = MRI.createVirtualRegister(RC);
+
+ if (isPositionIndependent()) {
+ // Create following instructions for local linkage PIC code.
+ // lea %Tmp1, .LJTI0_0@gotoff_lo
+ // and %Tmp2, %Tmp1, (32)0
+ // lea.sl %BReg, .LJTI0_0@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
+ BuildMI(DispContBB, DL, TII->get(VE::LEAzii), Tmp1)
+ .addImm(0)
+ .addImm(0)
+ .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_GOTOFF_LO32);
+ BuildMI(DispContBB, DL, TII->get(VE::ANDrm), Tmp2)
+ .addReg(Tmp1, getKillRegState(true))
+ .addImm(M0(32));
+ BuildMI(DispContBB, DL, TII->get(VE::LEASLrri), BReg)
+ .addReg(VE::SX15)
+ .addReg(Tmp2, getKillRegState(true))
+ .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_GOTOFF_HI32);
+ } else {
+ // Create following instructions for non-PIC code.
+ // lea %Tmp1, .LJTI0_0@lo
+ // and %Tmp2, %Tmp1, (32)0
+ // lea.sl %BReg, .LJTI0_0@hi(%Tmp2)
+ BuildMI(DispContBB, DL, TII->get(VE::LEAzii), Tmp1)
+ .addImm(0)
+ .addImm(0)
+ .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_LO32);
+ BuildMI(DispContBB, DL, TII->get(VE::ANDrm), Tmp2)
+ .addReg(Tmp1, getKillRegState(true))
+ .addImm(M0(32));
+ BuildMI(DispContBB, DL, TII->get(VE::LEASLrii), BReg)
+ .addReg(Tmp2, getKillRegState(true))
+ .addImm(0)
+ .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_HI32);
+ }
+
+ switch (JTE) {
+ case MachineJumpTableInfo::EK_BlockAddress: {
+    // Generate simple block address code for the non-PIC model.
+ // sll %Tmp1, %IReg, 3
+ // lds %TReg, 0(%Tmp1, %BReg)
+ // bcfla %TReg
+
+ Register TReg = MRI.createVirtualRegister(RC);
+ Register Tmp1 = MRI.createVirtualRegister(RC);
+
+ BuildMI(DispContBB, DL, TII->get(VE::SLLri), Tmp1)
+ .addReg(IReg, getKillRegState(true))
+ .addImm(3);
+ BuildMI(DispContBB, DL, TII->get(VE::LDrri), TReg)
+ .addReg(BReg, getKillRegState(true))
+ .addReg(Tmp1, getKillRegState(true))
+ .addImm(0);
+ BuildMI(DispContBB, DL, TII->get(VE::BCFLari_t))
+ .addReg(TReg, getKillRegState(true))
+ .addImm(0);
+ break;
+ }
+ case MachineJumpTableInfo::EK_Custom32: {
+ // Generate block address code using differences from the function pointer
+ // for PIC model.
+ // sll %Tmp1, %IReg, 2
+ // ldl.zx %OReg, 0(%Tmp1, %BReg)
+ // Prepare function address in BReg2.
+ // adds.l %TReg, %BReg2, %OReg
+ // bcfla %TReg
+
+ assert(isPositionIndependent());
+ Register OReg = MRI.createVirtualRegister(RC);
+ Register TReg = MRI.createVirtualRegister(RC);
+ Register Tmp1 = MRI.createVirtualRegister(RC);
+
+ BuildMI(DispContBB, DL, TII->get(VE::SLLri), Tmp1)
+ .addReg(IReg, getKillRegState(true))
+ .addImm(2);
+ BuildMI(DispContBB, DL, TII->get(VE::LDLZXrri), OReg)
+ .addReg(BReg, getKillRegState(true))
+ .addReg(Tmp1, getKillRegState(true))
+ .addImm(0);
+ Register BReg2 =
+ prepareSymbol(*DispContBB, DispContBB->end(),
+ DispContBB->getParent()->getName(), DL, /* Local */ true);
+ BuildMI(DispContBB, DL, TII->get(VE::ADDSLrr), TReg)
+ .addReg(OReg, getKillRegState(true))
+ .addReg(BReg2, getKillRegState(true));
+ BuildMI(DispContBB, DL, TII->get(VE::BCFLari_t))
+ .addReg(TReg, getKillRegState(true))
+ .addImm(0);
+ break;
+ }
+ default:
+ llvm_unreachable("Unexpected jump table encoding");
+ }
+
+ // Add the jump table entries as successors to the MBB.
+ SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
+ for (auto &LP : LPadList)
+ if (SeenMBBs.insert(LP).second)
+ DispContBB->addSuccessor(LP);
+
+ // N.B. the order the invoke BBs are processed in doesn't matter here.
+ SmallVector<MachineBasicBlock *, 64> MBBLPads;
+ const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
+ for (MachineBasicBlock *MBB : InvokeBBs) {
+ // Remove the landing pad successor from the invoke block and replace it
+ // with the new dispatch block.
+ // Keep a copy of Successors since it's modified inside the loop.
+ SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
+ MBB->succ_rend());
+ // FIXME: Avoid quadratic complexity.
+ for (auto MBBS : Successors) {
+ if (MBBS->isEHPad()) {
+ MBB->removeSuccessor(MBBS);
+ MBBLPads.push_back(MBBS);
+ }
+ }
+
+ MBB->addSuccessor(DispatchBB);
+
+ // Find the invoke call and mark all of the callee-saved registers as
+ // 'implicit defined' so that they're spilled. This prevents code from
+ // moving instructions to before the EH block, where they will never be
+ // executed.
+ for (auto &II : reverse(*MBB)) {
+ if (!II.isCall())
+ continue;
+
+ DenseMap<Register, bool> DefRegs;
+ for (auto &MOp : II.operands())
+ if (MOp.isReg())
+ DefRegs[MOp.getReg()] = true;
+
+ MachineInstrBuilder MIB(*MF, &II);
+ for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
+ Register Reg = SavedRegs[RI];
+ if (!DefRegs[Reg])
+ MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
+ }
+
+ break;
+ }
+ }
+
+ // Mark all former landing pads as non-landing pads. The dispatch is the only
+ // landing pad now.
+ for (auto &LP : MBBLPads)
+ LP->setIsEHPad(false);
+
+ // The instruction is gone now.
+ MI.eraseFromParent();
+ return BB;
+}
+
+MachineBasicBlock *
+VETargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unknown Custom Instruction!");
+ case VE::EH_SjLj_LongJmp:
+ return emitEHSjLjLongJmp(MI, BB);
+ case VE::EH_SjLj_SetJmp:
+ return emitEHSjLjSetJmp(MI, BB);
+ case VE::EH_SjLj_Setup_Dispatch:
+ return emitSjLjDispatchBlock(MI, BB);
+ }
+}
+
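+// Return true if User, a use of the i32 truncate node N, treats its operand
+// as an i32 value (directly, or, for selects, bit operations, and copies,
+// through all of its own uses), so that combineTRUNCATE below can drop the
+// truncate in favor of a subregister extraction.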
+static bool isI32Insn(const SDNode *User, const SDNode *N) {
+ switch (User->getOpcode()) {
+ default:
+ return false;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SETCC:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::BSWAP:
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ case ISD::BR_CC:
+ case ISD::BITCAST:
+ case ISD::ATOMIC_CMP_SWAP:
+ case ISD::ATOMIC_SWAP:
+ return true;
+ case ISD::SRL:
+ if (N->getOperand(0).getOpcode() != ISD::SRL)
+ return true;
+    // (srl (trunc (srl ...))) may be optimized by combining the srls, so
+    // don't optimize the trunc here.
+ return false;
+ case ISD::SELECT_CC:
+ if (User->getOperand(2).getNode() != N &&
+ User->getOperand(3).getNode() != N)
+ return true;
+ LLVM_FALLTHROUGH;
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::SELECT:
+ case ISD::CopyToReg:
+    // Check all uses of selections, bit operations, and copies. If all of
+    // them are safe, optimize the truncate to extract_subreg.
+ for (SDNode::use_iterator UI = User->use_begin(), UE = User->use_end();
+ UI != UE; ++UI) {
+ switch ((*UI)->getOpcode()) {
+ default:
+ // If the use is an instruction which treats the source operand as i32,
+ // it is safe to avoid truncate here.
+ if (isI32Insn(*UI, N))
+ continue;
+ break;
+ case ISD::ANY_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND: {
+ // Special optimizations to the combination of ext and trunc.
+ // (ext ... (select ... (trunc ...))) is safe to avoid the truncate here
+ // since this truncate instruction clears the upper 32 bits, which are
+ // filled by one of the ext instructions later.
+ assert(N->getValueType(0) == MVT::i32 &&
+ "find truncate to not i32 integer");
+ if (User->getOpcode() == ISD::SELECT_CC ||
+ User->getOpcode() == ISD::SELECT)
+ continue;
+ break;
+ }
+ }
+ return false;
+ }
+ return true;
+ }
+}
+
+// Optimize TRUNCATE in DAG combining. Optimizing it in CUSTOM lower is
+// sometimes too early. Optimizing it in DAG pattern matching in VEInstrInfo.td
+// is sometimes too late. So, do it here.
+SDValue VETargetLowering::combineTRUNCATE(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ assert(N->getOpcode() == ISD::TRUNCATE &&
+ "Should be called with a TRUNCATE node");
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ // We prefer to do this when all types are legal.
+ if (!DCI.isAfterLegalizeDAG())
+ return SDValue();
+
+ // For now, skip combining TRUNCATE if the operand of TRUNCATE might be a constant.
+ if (N->getOperand(0)->getOpcode() == ISD::SELECT_CC &&
+ isa<ConstantSDNode>(N->getOperand(0)->getOperand(0)) &&
+ isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
+ return SDValue();
+
+ // Check all uses of this TRUNCATE.
+ for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
+ ++UI) {
+ SDNode *User = *UI;
+
+ // Make sure that we're not going to replace the TRUNCATE for non-i32
+ // instructions.
+ //
+ // FIXME: Although we could sometimes handle this, and it does occur in
+ // practice that one of the condition inputs to the select is also one of
+ // the outputs, we currently can't deal with this.
+ if (isI32Insn(User, N))
+ continue;
+
+ return SDValue();
+ }
+
+ SDValue SubI32 = DAG.getTargetConstant(VE::sub_i32, DL, MVT::i32);
+ return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT,
+ N->getOperand(0), SubI32),
+ 0);
+}
+
+SDValue VETargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ switch (N->getOpcode()) {
+ default:
+ break;
+ case ISD::TRUNCATE:
+ return combineTRUNCATE(N, DCI);
+ }
+
+ return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+// VE Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+VETargetLowering::ConstraintType
+VETargetLowering::getConstraintType(StringRef Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ default:
+ break;
+ case 'v': // vector registers
+ return C_RegisterClass;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+VETargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint,
+ MVT VT) const {
+ const TargetRegisterClass *RC = nullptr;
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ default:
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+ case 'r':
+ RC = &VE::I64RegClass;
+ break;
+ case 'v':
+ RC = &VE::V64RegClass;
+ break;
+ }
+ return std::make_pair(0U, RC);
+ }
+
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+//===----------------------------------------------------------------------===//
+// VE Target Optimization Support
+//===----------------------------------------------------------------------===//
+
+unsigned VETargetLowering::getMinimumJumpTableEntries() const {
+ // Specify 8 for the PIC model to reduce the impact of PIC load instructions.
+ if (isJumpTableRelative())
+ return 8;
+
+ return TargetLowering::getMinimumJumpTableEntries();
+}
+
+bool VETargetLowering::hasAndNot(SDValue Y) const {
+ EVT VT = Y.getValueType();
+
+ // VE doesn't have a vector and-not instruction.
+ if (VT.isVector())
+ return false;
+
+ // VE allows different immediate forms for X and Y in ~X & Y: only simm7
+ // works for X, and only mimm works for Y. However, this function is used
+ // to check whether an immediate value is OK for an and-not instruction as
+ // both X and Y. Generating an additional instruction to materialize the
+ // immediate is not worthwhile, since the purpose of this function is to
+ // convert a series of 3 instructions to another series of 3 instructions
+ // with better parallelism. Therefore, we return false for all immediate
+ // values for now.
+ // FIXME: Change the hasAndNot function to take two operands to make it
+ // work correctly with Aurora VE.
+ if (isa<ConstantSDNode>(Y))
+ return false;
+
+ // It's ok for generic registers.
+ return true;
+}
+
+/// \returns the VVP_* SDNode opcode corresponding to \p Opcode.
+static Optional<unsigned> getVVPOpcode(unsigned Opcode) {
+ switch (Opcode) {
+#define HANDLE_VP_TO_VVP(VPOPC, VVPNAME) \
+ case ISD::VPOPC: \
+ return VEISD::VVPNAME;
+#define ADD_VVP_OP(VVPNAME, SDNAME) \
+ case VEISD::VVPNAME: \
+ case ISD::SDNAME: \
+ return VEISD::VVPNAME;
+#include "VVPNodes.def"
+ }
+ return None;
+}
+
+SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const {
+ // Can we represent this as a VVP node?
+ const unsigned Opcode = Op->getOpcode();
+ auto VVPOpcodeOpt = getVVPOpcode(Opcode);
+ if (!VVPOpcodeOpt.hasValue())
+ return SDValue();
+ unsigned VVPOpcode = VVPOpcodeOpt.getValue();
+ const bool FromVP = ISD::isVPOpcode(Opcode);
+
+ // The representative and legalized vector type of this operation.
+ SDLoc DL(Op);
+ MVT MaskVT = MVT::v256i1; // TODO: packed mode.
+ EVT OpVecVT = Op.getValueType();
+ EVT LegalVecVT = getTypeToTransformTo(*DAG.getContext(), OpVecVT);
+
+ SDValue AVL;
+ SDValue Mask;
+
+ if (FromVP) {
+ // All upstream VP SDNodes always have a mask and avl.
+ auto MaskIdx = ISD::getVPMaskIdx(Opcode).getValue();
+ auto AVLIdx = ISD::getVPExplicitVectorLengthIdx(Opcode).getValue();
+ Mask = Op->getOperand(MaskIdx);
+ AVL = Op->getOperand(AVLIdx);
+
+ } else {
+ // Materialize the VL parameter.
+ AVL = DAG.getConstant(OpVecVT.getVectorNumElements(), DL, MVT::i32);
+ SDValue ConstTrue = DAG.getConstant(1, DL, MVT::i32);
+ Mask = DAG.getNode(VEISD::VEC_BROADCAST, DL, MaskVT,
+ ConstTrue); // emit a VEISD::VEC_BROADCAST here.
+ }
+
+ // Categories we are interested in.
+ bool IsBinaryOp = false;
+
+ switch (VVPOpcode) {
+#define ADD_BINARY_VVP_OP(VVPNAME, ...) \
+ case VEISD::VVPNAME: \
+ IsBinaryOp = true; \
+ break;
+#include "VVPNodes.def"
+ }
+
+ if (IsBinaryOp) {
+ assert(LegalVecVT.isSimple());
+ return DAG.getNode(VVPOpcode, DL, LegalVecVT, Op->getOperand(0),
+ Op->getOperand(1), Mask, AVL);
+ }
+ llvm_unreachable("lowerToVVP called for unexpected SDNode.");
+}
+
+SDValue VETargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
+ MVT VT = Op.getOperand(0).getSimpleValueType();
+
+ // Special treatment for packed V64 types.
+ assert(VT == MVT::v512i32 || VT == MVT::v512f32);
+ // Example of generated code:
+ // %packed_v = extractelt %vr, %idx / 2
+ // %shift = ((%idx & 1) ^ 1) << 5
+ // %res = (%packed_v >> %shift) & 0xffffffff
+
+ SDValue Vec = Op.getOperand(0);
+ SDValue Idx = Op.getOperand(1);
+ SDLoc DL(Op);
+ SDValue Result = Op;
+ if (0 /* Idx->isConstant() */) {
+ // TODO: optimized implementation using constant values
+ } else {
+ SDValue Const1 = DAG.getConstant(1, DL, MVT::i64);
+ SDValue HalfIdx = DAG.getNode(ISD::SRL, DL, MVT::i64, {Idx, Const1});
+ SDValue PackedElt =
+ SDValue(DAG.getMachineNode(VE::LVSvr, DL, MVT::i64, {Vec, HalfIdx}), 0);
+ SDValue AndIdx = DAG.getNode(ISD::AND, DL, MVT::i64, {Idx, Const1});
+ SDValue Shift = DAG.getNode(ISD::XOR, DL, MVT::i64, {AndIdx, Const1});
+ SDValue Const5 = DAG.getConstant(5, DL, MVT::i64);
+ Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, {Shift, Const5});
+ PackedElt = DAG.getNode(ISD::SRL, DL, MVT::i64, {PackedElt, Shift});
+ SDValue Mask = DAG.getConstant(0xFFFFFFFFL, DL, MVT::i64);
+ PackedElt = DAG.getNode(ISD::AND, DL, MVT::i64, {PackedElt, Mask});
+ SDValue SubI32 = DAG.getTargetConstant(VE::sub_i32, DL, MVT::i32);
+ Result = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+ MVT::i32, PackedElt, SubI32),
+ 0);
+
+ if (Op.getSimpleValueType() == MVT::f32) {
+ Result = DAG.getBitcast(MVT::f32, Result);
+ } else {
+ assert(Op.getSimpleValueType() == MVT::i32);
+ }
+ }
+ return Result;
+}
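As a cross-check of the shift-and-mask sequence generated above, the same packed-lane extraction can be written as plain scalar C++; a minimal sketch, with extractPackedLane as a hypothetical helper operating on an in-memory copy of the packed 64-bit vector elements:

#include <cstdint>

// Read 32-bit lane Idx from a v512i32/v512f32 value stored as 256 packed
// 64-bit elements: the even lane lives in the upper 32 bits, the odd lane in
// the lower 32 bits.
static uint32_t extractPackedLane(const uint64_t *PackedVec, unsigned Idx) {
  uint64_t PackedElt = PackedVec[Idx >> 1];  // %packed_v = extractelt %vr, %idx / 2
  unsigned Shift = ((Idx & 1) ^ 1) << 5;     // even lane -> 32, odd lane -> 0
  return uint32_t((PackedElt >> Shift) & 0xffffffffu);
}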
+
+SDValue VETargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
+ MVT VT = Op.getOperand(0).getSimpleValueType();
+
+ // Special treatment for packed V64 types.
+ assert(VT == MVT::v512i32 || VT == MVT::v512f32);
+ // The v512i32 and v512f32 types start from the upper bits (0..31). These
+ // "upper bits" require `val << 32` from a C implementation's point of view.
+ //
+ // Example of generated code:
+ // %packed_elt = extractelt %vr, (%idx >> 1)
+ // %shift = ((%idx & 1) ^ 1) << 5
+ // %packed_elt &= 0xffffffff00000000 >> shift
+ // %packed_elt |= (zext %val) << shift
+ // %vr = insertelt %vr, %packed_elt, (%idx >> 1)
+
+ SDLoc DL(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue Val = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+ if (Idx.getSimpleValueType() == MVT::i32)
+ Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
+ if (Val.getSimpleValueType() == MVT::f32)
+ Val = DAG.getBitcast(MVT::i32, Val);
+ assert(Val.getSimpleValueType() == MVT::i32);
+ Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
+
+ SDValue Result = Op;
+ if (0 /* Idx->isConstant()*/) {
+ // TODO: optimized implementation using constant values
+ } else {
+ SDValue Const1 = DAG.getConstant(1, DL, MVT::i64);
+ SDValue HalfIdx = DAG.getNode(ISD::SRL, DL, MVT::i64, {Idx, Const1});
+ SDValue PackedElt =
+ SDValue(DAG.getMachineNode(VE::LVSvr, DL, MVT::i64, {Vec, HalfIdx}), 0);
+ SDValue AndIdx = DAG.getNode(ISD::AND, DL, MVT::i64, {Idx, Const1});
+ SDValue Shift = DAG.getNode(ISD::XOR, DL, MVT::i64, {AndIdx, Const1});
+ SDValue Const5 = DAG.getConstant(5, DL, MVT::i64);
+ Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, {Shift, Const5});
+ SDValue Mask = DAG.getConstant(0xFFFFFFFF00000000L, DL, MVT::i64);
+ Mask = DAG.getNode(ISD::SRL, DL, MVT::i64, {Mask, Shift});
+ PackedElt = DAG.getNode(ISD::AND, DL, MVT::i64, {PackedElt, Mask});
+ Val = DAG.getNode(ISD::SHL, DL, MVT::i64, {Val, Shift});
+ PackedElt = DAG.getNode(ISD::OR, DL, MVT::i64, {PackedElt, Val});
+ Result =
+ SDValue(DAG.getMachineNode(VE::LSVrr_v, DL, Vec.getSimpleValueType(),
+ {HalfIdx, PackedElt, Vec}),
+ 0);
+ }
+ return Result;
+}
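The matching insert path follows the clear-then-merge sequence built above; again a minimal scalar sketch, with insertPackedLane as a hypothetical helper:

#include <cstdint>

// Overwrite 32-bit lane Idx of the packed 64-bit element with Val, leaving
// the other lane untouched.
static void insertPackedLane(uint64_t *PackedVec, unsigned Idx, uint32_t Val) {
  uint64_t &PackedElt = PackedVec[Idx >> 1];    // %packed_elt = extractelt %vr, %idx >> 1
  unsigned Shift = ((Idx & 1) ^ 1) << 5;        // %shift = ((%idx & 1) ^ 1) << 5
  PackedElt &= 0xffffffff00000000ull >> Shift;  // clear the target lane
  PackedElt |= uint64_t(Val) << Shift;          // merge in the new value
}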
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VEISelLowering.h b/contrib/llvm-project/llvm/lib/Target/VE/VEISelLowering.h
index 4633220efaa1..a6e1bf396035 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/VEISelLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VEISelLowering.h
@@ -24,23 +24,36 @@ namespace VEISD {
enum NodeType : unsigned {
FIRST_NUMBER = ISD::BUILTIN_OP_END,
- Hi,
- Lo, // Hi/Lo operations, typically on a global address.
-
- GETFUNPLT, // load function address through %plt insturction
- GETTLSADDR, // load address for TLS access
- GETSTACKTOP, // retrieve address of stack top (first address of
- // locals and temporaries)
-
- CALL, // A call instruction.
- RET_FLAG, // Return with a flag operand.
- GLOBAL_BASE_REG, // Global base reg for PIC.
+ CALL, // A call instruction.
+ EH_SJLJ_LONGJMP, // SjLj exception handling longjmp.
+ EH_SJLJ_SETJMP, // SjLj exception handling setjmp.
+ EH_SJLJ_SETUP_DISPATCH, // SjLj exception handling setup_dispatch.
+ GETFUNPLT, // Load function address through %plt instruction.
+ GETTLSADDR, // Load address for TLS access.
+ GETSTACKTOP, // Retrieve address of stack top (first address of
+ // locals and temporaries).
+ GLOBAL_BASE_REG, // Global base reg for PIC.
+ Hi, // Hi/Lo operations, typically on a global address.
+ Lo, // Hi/Lo operations, typically on a global address.
+ MEMBARRIER, // Compiler barrier only; generate a no-op.
+ RET_FLAG, // Return with a flag operand.
+ TS1AM, // A TS1AM instruction used for 1/2 bytes swap.
+ VEC_BROADCAST, // A vector broadcast instruction.
+ // 0: scalar value, 1: VL
+
+// VVP_* nodes.
+#define ADD_VVP_OP(VVP_NAME, ...) VVP_NAME,
+#include "VVPNodes.def"
};
}
class VETargetLowering : public TargetLowering {
const VESubtarget *Subtarget;
+ void initRegisterClasses();
+ void initSPUActions();
+ void initVPUActions();
+
public:
VETargetLowering(const TargetMachine &TM, const VESubtarget &STI);
@@ -74,23 +87,98 @@ public:
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl,
SelectionDAG &DAG) const override;
+ /// Helper functions for atomic operations.
+ bool shouldInsertFencesForAtomic(const Instruction *I) const override {
+ // VE uses release consistency, so we need a fence for each atomic operation.
+ return true;
+ }
+ Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
+ Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+
/// Custom Lower {
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
-
- SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerToTLSGeneralDynamicModel(SDValue Op, SelectionDAG &DAG) const;
+ unsigned getJumpTableEncoding() const override;
+ const MCExpr *LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB,
+ unsigned Uid,
+ MCContext &Ctx) const override;
+ SDValue getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const override;
+ // VE doesn't need getPICJumpTableRelocBaseExpr since it is used only for
+ // EK_LabelDifference32.
+
+ SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerATOMIC_SWAP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerToTLSGeneralDynamicModel(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
/// } Custom Lower
+ /// Replace the results of node with an illegal result
+ /// type with new values built out of custom code.
+ ///
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+
+ /// Custom Inserter {
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const override;
+ MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+ MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+ MachineBasicBlock *emitSjLjDispatchBlock(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
+ void setupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
+ MachineBasicBlock *DispatchBB, int FI,
+ int Offset) const;
+ // Set up a basic block address.
+ Register prepareMBB(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ MachineBasicBlock *TargetBB, const DebugLoc &DL) const;
+ // Prepare function/variable address.
+ Register prepareSymbol(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ StringRef Symbol, const DebugLoc &DL, bool IsLocal,
+ bool IsCall) const;
+ /// } Custom Inserter
+
+ /// VVP Lowering {
+ SDValue lowerToVVP(SDValue Op, SelectionDAG &DAG) const;
+ /// } VVP Lowering
+
+ /// Custom DAGCombine {
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+ SDValue combineTRUNCATE(SDNode *N, DAGCombinerInfo &DCI) const;
+ /// } Custom DAGCombine
+
SDValue withTargetFlags(SDValue Op, unsigned TF, SelectionDAG &DAG) const;
SDValue makeHiLoPair(SDValue Op, unsigned HiTF, unsigned LoTF,
SelectionDAG &DAG) const;
SDValue makeAddress(SDValue Op, SelectionDAG &DAG) const;
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
bool isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const override;
/// Returns true if the target allows unaligned memory accesses of the
@@ -99,10 +187,32 @@ public:
MachineMemOperand::Flags Flags,
bool *Fast) const override;
- // Block s/udiv lowering for now
- bool isIntDivCheap(EVT VT, AttributeList Attr) const override { return true; }
+ /// Inline Assembly {
+
+ ConstraintType getConstraintType(StringRef Constraint) const override;
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+
+ /// } Inline Assembly
+ /// Target Optimization {
+
+ // Return lower limit for number of blocks in a jump table.
+ unsigned getMinimumJumpTableEntries() const override;
+
+ // SX-Aurora VE's s/udiv is 5-9 times slower than multiply.
+ bool isIntDivCheap(EVT, AttributeList) const override { return false; }
+ // VE doesn't have rem.
+ bool hasStandaloneRem(EVT) const override { return false; }
+ // VE LDZ instruction returns 64 if the input is zero.
+ bool isCheapToSpeculateCtlz() const override { return true; }
+ // VE LDZ instruction is fast.
+ bool isCtlzFast() const override { return true; }
+ // VE has NND instruction.
bool hasAndNot(SDValue Y) const override;
+
+ /// } Target Optimization
};
} // namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VEInstrBuilder.h b/contrib/llvm-project/llvm/lib/Target/VE/VEInstrBuilder.h
new file mode 100644
index 000000000000..1b0e07546931
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VEInstrBuilder.h
@@ -0,0 +1,41 @@
+//===-- VEInstrBuilder.h - Aids for building VE insts -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to simplify generating frame and constant pool
+// references.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Dest Reg, Base Reg, and either Reg Index or Immediate
+// Displacement.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_VE_VEINSTRBUILDER_H
+#define LLVM_LIB_TARGET_VE_VEINSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+namespace llvm {
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function. The
+/// reference uses the FrameIndex as its base register until it is resolved.
+/// This allows a constant offset to be specified as well.
+///
+static inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0,
+ bool ThreeOp = true) {
+ if (ThreeOp)
+ return MIB.addFrameIndex(FI).addImm(0).addImm(Offset);
+ return MIB.addFrameIndex(FI).addImm(Offset);
+}
+
+} // namespace llvm
+
+#endif
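For reference, a frame-based store built through this helper lines up with the STQrii pattern used later in VEInstrInfo.cpp. A fragment only, assuming the usual MBB/I/DL/TII/FI/MMO values from a storeRegToStackSlot-style caller:

// Emits: STQrii FI, 0, 0, SrcReg, i.e. the same operand order as
// BuildMI(...).addFrameIndex(FI).addImm(0).addImm(0).addReg(SrcReg).
addFrameReference(BuildMI(MBB, I, DL, TII.get(VE::STQrii)), FI)
    .addReg(SrcReg, getKillRegState(isKill))
    .addMemOperand(MMO);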
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VEInstrFormats.td b/contrib/llvm-project/llvm/lib/Target/VE/VEInstrFormats.td
index 0c02411ff916..f43c9755f1b9 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/VEInstrFormats.td
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VEInstrFormats.td
@@ -35,6 +35,25 @@ class InstVE<dag outs, dag ins, string asmstr, list<dag> pattern>
let AsmString = asmstr;
let Pattern = pattern;
+ bits<1> VE_Vector = 0;
+ bits<1> VE_VLInUse = 0;
+ bits<3> VE_VLIndex = 0;
+ bits<1> VE_VLWithMask = 0;
+
+ /// These fields correspond to the fields in VEInstrInfo.h. Any changes to
+ /// these must be reflected there! See comments there for what these are.
+ ///
+ /// VLIndex is the index of the VL register in the MI's operands. The HW
+ /// instruction doesn't have that field, but we add it in the MI for the ease
+ /// of optimization. For example, the index of VL of (VST $sy, $sz, $sx, $vl)
+ /// is 3 (beginning from 0), and the index of VL of
+ /// (VST $sy, $sz, $sx, $vm, $vl) is 4. We define vector instructions
+ /// hierarchically, so use VE_VLIndex, which is defined by the type of
+ /// instruction, and VE_VLWithMask, which is defined by whether the
+ /// instruction uses a mask or not.
+ let TSFlags{0} = VE_Vector;
+ let TSFlags{1} = VE_VLInUse;
+ let TSFlags{4-2} = !add(VE_VLIndex, VE_VLWithMask);
+
let DecoderNamespace = "VE";
field bits<64> SoftFail = 0;
}
@@ -179,12 +198,82 @@ class RRFENCE<bits<8>opVal, dag outs, dag ins, string asmstr,
//-----------------------------------------------------------------------------
// Section 5.6 RVM Type
+//
+// RVM type is for vector transfer instructions.
//-----------------------------------------------------------------------------
+class RVM<bits<8>opVal, dag outs, dag ins, string asmstr,
+ list<dag> pattern = []>
+ : InstVE<outs, ins, asmstr, pattern> {
+ bits<1> cx = 0;
+ bits<1> vc = 0;
+ bits<1> cs = 0;
+ bits<4> m = 0;
+ bits<1> cy = 1;
+ bits<7> sy;
+ bits<1> cz = 1;
+ bits<7> sz;
+ bits<8> vx;
+ bits<8> vy = 0;
+ bits<7> sw = 0;
+ let op = opVal;
+ let Inst{55} = cx;
+ let Inst{54} = vc;
+ let Inst{53} = cs;
+ let Inst{52} = 0;
+ let Inst{51-48} = m;
+ let Inst{47} = cy;
+ let Inst{46-40} = sy;
+ let Inst{39} = cz;
+ let Inst{38-32} = sz;
+ let Inst{31-24} = vx;
+ let Inst{23-16} = vy;
+ let Inst{15-8} = 0;
+ let Inst{7} = 0;
+ let Inst{6-0} = sw;
+
+ let VE_Vector = 1;
+}
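As a worked example of the Inst{...} assignments above, the RVM operand fields can be packed into the low 56 bits of the instruction word as follows; encodeRVMFields is a hypothetical standalone helper, and the opcode byte set by the InstVE base class is left out:

#include <cstdint>

static uint64_t encodeRVMFields(bool cx, bool vc, bool cs, uint8_t m, bool cy,
                                uint8_t sy, bool cz, uint8_t sz, uint8_t vx,
                                uint8_t vy, uint8_t sw) {
  uint64_t Inst = 0;
  Inst |= uint64_t(cx) << 55;        // Inst{55} = cx
  Inst |= uint64_t(vc) << 54;        // Inst{54} = vc
  Inst |= uint64_t(cs) << 53;        // Inst{53} = cs
  Inst |= uint64_t(m & 0xf) << 48;   // Inst{51-48} = m
  Inst |= uint64_t(cy) << 47;        // Inst{47} = cy
  Inst |= uint64_t(sy & 0x7f) << 40; // Inst{46-40} = sy
  Inst |= uint64_t(cz) << 39;        // Inst{39} = cz
  Inst |= uint64_t(sz & 0x7f) << 32; // Inst{38-32} = sz
  Inst |= uint64_t(vx) << 24;        // Inst{31-24} = vx
  Inst |= uint64_t(vy) << 16;        // Inst{23-16} = vy
  Inst |= uint64_t(sw & 0x7f);       // Inst{6-0} = sw
  return Inst;                       // bits 52, 15-8, and 7 stay zero
}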
+
//-----------------------------------------------------------------------------
// Section 5.7 RV Type
+//
+// RV type is for vector instructions.
//-----------------------------------------------------------------------------
+class RV<bits<8>opVal, dag outs, dag ins, string asmstr, list<dag> pattern = []>
+ : InstVE<outs, ins, asmstr, pattern> {
+ bits<1> cx = 0;
+ bits<1> cx2 = 0;
+ bits<1> cs = 0;
+ bits<1> cs2 = 0;
+ bits<4> m = 0;
+ bits<1> cy = 1;
+ bits<7> sy;
+ bits<1> cz = 0;
+ bits<7> sz = 0;
+ bits<8> vx = 0;
+ bits<8> vy = 0;
+ bits<8> vz = 0;
+ bits<8> vw = 0;
+ let op = opVal;
+ let Inst{55} = cx;
+ let Inst{54} = cx2;
+ let Inst{53} = cs;
+ let Inst{52} = cs2;
+ let Inst{51-48} = m;
+ let Inst{47} = cy;
+ let Inst{46-40} = sy;
+ let Inst{39} = cz;
+ let Inst{38-32} = sz;
+ let Inst{31-24} = vx;
+ let Inst{23-16} = vy;
+ let Inst{15-8} = vz;
+ let Inst{7-0} = vw;
+
+ let VE_Vector = 1;
+}
+
// Pseudo instructions.
class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern = []>
: InstVE<outs, ins, asmstr, pattern> {
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.cpp
index 86b2ac2078b1..9770052ff913 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.cpp
@@ -92,38 +92,46 @@ static VECC::CondCode GetOppositeBranchCondition(VECC::CondCode CC) {
llvm_unreachable("Invalid cond code");
}
-// Treat br.l [BRCF AT] as unconditional branch
+// Treat a branch relative long always instruction as an unconditional branch.
+// For example, br.l.t and br.l.
static bool isUncondBranchOpcode(int Opc) {
- return Opc == VE::BRCFLa || Opc == VE::BRCFWa ||
- Opc == VE::BRCFLa_nt || Opc == VE::BRCFWa_nt ||
- Opc == VE::BRCFLa_t || Opc == VE::BRCFWa_t ||
- Opc == VE::BRCFDa || Opc == VE::BRCFSa ||
- Opc == VE::BRCFDa_nt || Opc == VE::BRCFSa_nt ||
- Opc == VE::BRCFDa_t || Opc == VE::BRCFSa_t;
+ using namespace llvm::VE;
+
+#define BRKIND(NAME) (Opc == NAME##a || Opc == NAME##a_nt || Opc == NAME##a_t)
+ // VE has other branch relative always instructions for word/double/float,
+ // but we use only long branches in our lowering, so sanity-check that here.
+ assert(!BRKIND(BRCFW) && !BRKIND(BRCFD) && !BRKIND(BRCFS) &&
+ "Branch relative word/double/float always instructions should not be "
+ "used!");
+ return BRKIND(BRCFL);
+#undef BRKIND
}
+// Treat branch relative conditional instructions as conditional branches.
+// For example, brgt.l.t and brle.s.nt.
static bool isCondBranchOpcode(int Opc) {
- return Opc == VE::BRCFLrr || Opc == VE::BRCFLir ||
- Opc == VE::BRCFLrr_nt || Opc == VE::BRCFLir_nt ||
- Opc == VE::BRCFLrr_t || Opc == VE::BRCFLir_t ||
- Opc == VE::BRCFWrr || Opc == VE::BRCFWir ||
- Opc == VE::BRCFWrr_nt || Opc == VE::BRCFWir_nt ||
- Opc == VE::BRCFWrr_t || Opc == VE::BRCFWir_t ||
- Opc == VE::BRCFDrr || Opc == VE::BRCFDir ||
- Opc == VE::BRCFDrr_nt || Opc == VE::BRCFDir_nt ||
- Opc == VE::BRCFDrr_t || Opc == VE::BRCFDir_t ||
- Opc == VE::BRCFSrr || Opc == VE::BRCFSir ||
- Opc == VE::BRCFSrr_nt || Opc == VE::BRCFSir_nt ||
- Opc == VE::BRCFSrr_t || Opc == VE::BRCFSir_t;
+ using namespace llvm::VE;
+
+#define BRKIND(NAME) \
+ (Opc == NAME##rr || Opc == NAME##rr_nt || Opc == NAME##rr_t || \
+ Opc == NAME##ir || Opc == NAME##ir_nt || Opc == NAME##ir_t)
+ return BRKIND(BRCFL) || BRKIND(BRCFW) || BRKIND(BRCFD) || BRKIND(BRCFS);
+#undef BRKIND
}
+// Treat branch long always instructions as indirect branches.
+// For example, b.l.t and b.l.
static bool isIndirectBranchOpcode(int Opc) {
- return Opc == VE::BCFLari || Opc == VE::BCFLari ||
- Opc == VE::BCFLari_nt || Opc == VE::BCFLari_nt ||
- Opc == VE::BCFLari_t || Opc == VE::BCFLari_t ||
- Opc == VE::BCFLari || Opc == VE::BCFLari ||
- Opc == VE::BCFLari_nt || Opc == VE::BCFLari_nt ||
- Opc == VE::BCFLari_t || Opc == VE::BCFLari_t;
+ using namespace llvm::VE;
+
+#define BRKIND(NAME) \
+ (Opc == NAME##ari || Opc == NAME##ari_nt || Opc == NAME##ari_t)
+ // VE has other branch always instructions for word/double/float, but
+ // we use only long branches in our lowering, so sanity-check that here.
+ assert(!BRKIND(BCFW) && !BRKIND(BCFD) && !BRKIND(BCFS) &&
+ "Branch word/double/float always instructions should not be used!");
+ return BRKIND(BCFL);
+#undef BRKIND
}
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
@@ -311,11 +319,43 @@ bool VEInstrInfo::reverseBranchCondition(
}
static bool IsAliasOfSX(Register Reg) {
- return VE::I8RegClass.contains(Reg) || VE::I16RegClass.contains(Reg) ||
- VE::I32RegClass.contains(Reg) || VE::I64RegClass.contains(Reg) ||
+ return VE::I32RegClass.contains(Reg) || VE::I64RegClass.contains(Reg) ||
VE::F32RegClass.contains(Reg);
}
+static void copyPhysSubRegs(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
+ const MCInstrDesc &MCID, unsigned int NumSubRegs,
+ const unsigned *SubRegIdx,
+ const TargetRegisterInfo *TRI) {
+ MachineInstr *MovMI = nullptr;
+
+ for (unsigned Idx = 0; Idx != NumSubRegs; ++Idx) {
+ Register SubDest = TRI->getSubReg(DestReg, SubRegIdx[Idx]);
+ Register SubSrc = TRI->getSubReg(SrcReg, SubRegIdx[Idx]);
+ assert(SubDest && SubSrc && "Bad sub-register");
+
+ if (MCID.getOpcode() == VE::ORri) {
+ // generate "ORri, dest, src, 0" instruction.
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, I, DL, MCID, SubDest).addReg(SubSrc).addImm(0);
+ MovMI = MIB.getInstr();
+ } else if (MCID.getOpcode() == VE::ANDMmm) {
+ // generate "ANDM, dest, vm0, src" instruction.
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, I, DL, MCID, SubDest).addReg(VE::VM0).addReg(SubSrc);
+ MovMI = MIB.getInstr();
+ } else {
+ llvm_unreachable("Unexpected reg-to-reg copy instruction");
+ }
+ }
+ // Add implicit super-register defs and kills to the last MovMI.
+ MovMI->addRegisterDefined(DestReg, TRI);
+ if (KillSrc)
+ MovMI->addRegisterKilled(SrcReg, TRI, true);
+}
+
void VEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
MCRegister DestReg, MCRegister SrcReg,
@@ -325,6 +365,41 @@ void VEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, get(VE::ORri), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc))
.addImm(0);
+ } else if (VE::V64RegClass.contains(DestReg, SrcReg)) {
+ // Generate the following instructions:
+ // %sw16 = LEA32zii 256
+ // VORmvl %dest, (0)1, %src, %sw16
+ // TODO: reuse a register if vl is already assigned to a register
+ // FIXME: it would be better to scavenge a register here instead of
+ // reserving SX16 all of the time.
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ Register TmpReg = VE::SX16;
+ Register SubTmp = TRI->getSubReg(TmpReg, VE::sub_i32);
+ BuildMI(MBB, I, DL, get(VE::LEAzii), TmpReg)
+ .addImm(0)
+ .addImm(0)
+ .addImm(256);
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(VE::VORmvl), DestReg)
+ .addImm(M1(0)) // Represent (0)1.
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addReg(SubTmp, getKillRegState(true));
+ MIB.getInstr()->addRegisterKilled(TmpReg, TRI, true);
+ } else if (VE::VMRegClass.contains(DestReg, SrcReg)) {
+ BuildMI(MBB, I, DL, get(VE::ANDMmm), DestReg)
+ .addReg(VE::VM0)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ } else if (VE::VM512RegClass.contains(DestReg, SrcReg)) {
+ // Use two instructions.
+ const unsigned SubRegIdx[] = {VE::sub_vm_even, VE::sub_vm_odd};
+ unsigned int NumSubRegs = 2;
+ copyPhysSubRegs(MBB, I, DL, DestReg, SrcReg, KillSrc, get(VE::ANDMmm),
+ NumSubRegs, SubRegIdx, &getRegisterInfo());
+ } else if (VE::F128RegClass.contains(DestReg, SrcReg)) {
+ // Use two instructions.
+ const unsigned SubRegIdx[] = {VE::sub_even, VE::sub_odd};
+ unsigned int NumSubRegs = 2;
+ copyPhysSubRegs(MBB, I, DL, DestReg, SrcReg, KillSrc, get(VE::ORri),
+ NumSubRegs, SubRegIdx, &getRegisterInfo());
} else {
const TargetRegisterInfo *TRI = &getRegisterInfo();
dbgs() << "Impossible reg-to-reg copy from " << printReg(SrcReg, TRI)
@@ -342,7 +417,8 @@ unsigned VEInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
if (MI.getOpcode() == VE::LDrii || // I64
MI.getOpcode() == VE::LDLSXrii || // I32
- MI.getOpcode() == VE::LDUrii // F32
+ MI.getOpcode() == VE::LDUrii || // F32
+ MI.getOpcode() == VE::LDQrii // F128 (pseudo)
) {
if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
MI.getOperand(2).getImm() == 0 && MI.getOperand(3).isImm() &&
@@ -363,7 +439,8 @@ unsigned VEInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
if (MI.getOpcode() == VE::STrii || // I64
MI.getOpcode() == VE::STLrii || // I32
- MI.getOpcode() == VE::STUrii // F32
+ MI.getOpcode() == VE::STUrii || // F32
+ MI.getOpcode() == VE::STQrii // F128 (pseudo)
) {
if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() &&
MI.getOperand(1).getImm() == 0 && MI.getOperand(2).isImm() &&
@@ -412,6 +489,13 @@ void VEInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addImm(0)
.addReg(SrcReg, getKillRegState(isKill))
.addMemOperand(MMO);
+ } else if (VE::F128RegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(VE::STQrii))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addMemOperand(MMO);
} else
report_fatal_error("Can't store this register to stack slot");
}
@@ -449,10 +533,194 @@ void VEInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
.addImm(0)
.addImm(0)
.addMemOperand(MMO);
+ } else if (VE::F128RegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(VE::LDQrii), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addImm(0)
+ .addMemOperand(MMO);
} else
report_fatal_error("Can't load this register from stack slot");
}
+bool VEInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
+ Register Reg, MachineRegisterInfo *MRI) const {
+ LLVM_DEBUG(dbgs() << "FoldImmediate\n");
+
+ LLVM_DEBUG(dbgs() << "checking DefMI\n");
+ int64_t ImmVal;
+ switch (DefMI.getOpcode()) {
+ default:
+ return false;
+ case VE::ORim:
+ // General move small immediate instruction on VE.
+ LLVM_DEBUG(dbgs() << "checking ORim\n");
+ LLVM_DEBUG(DefMI.dump());
+ // FIXME: We may need to support FPImm too.
+ assert(DefMI.getOperand(1).isImm());
+ assert(DefMI.getOperand(2).isImm());
+ ImmVal =
+ DefMI.getOperand(1).getImm() + mimm2Val(DefMI.getOperand(2).getImm());
+ LLVM_DEBUG(dbgs() << "ImmVal is " << ImmVal << "\n");
+ break;
+ case VE::LEAzii:
+ // General move immediate instruction on VE.
+ LLVM_DEBUG(dbgs() << "checking LEAzii\n");
+ LLVM_DEBUG(DefMI.dump());
+ // FIXME: We may need to support FPImm too.
+ assert(DefMI.getOperand(2).isImm());
+ if (!DefMI.getOperand(3).isImm())
+ // LEAzii may refer to a label.
+ return false;
+ ImmVal = DefMI.getOperand(2).getImm() + DefMI.getOperand(3).getImm();
+ LLVM_DEBUG(dbgs() << "ImmVal is " << ImmVal << "\n");
+ break;
+ }
+
+ // Try to fold like below:
+ // %1:i64 = ORim 0, 0(1)
+ // %2:i64 = CMPSLrr %0, %1
+ // To
+ // %2:i64 = CMPSLrm %0, 0(1)
+ //
+ // Another example:
+ // %1:i64 = ORim 6, 0(1)
+ // %2:i64 = CMPSLrr %1, %0
+ // To
+ // %2:i64 = CMPSLir 6, %0
+ //
+ // Support commutable instructions like below:
+ // %1:i64 = ORim 6, 0(1)
+ // %2:i64 = ADDSLrr %1, %0
+ // To
+ // %2:i64 = ADDSLri %0, 6
+ //
+ // FIXME: Need to support i32. The current implementation requires
+ // EXTRACT_SUBREG, so the input has a following COPY, which prevents folding:
+ // %1:i64 = ORim 6, 0(1)
+ // %2:i32 = COPY %1.sub_i32
+ // %3:i32 = ADDSWSXrr %0, %2
+ // FIXME: Need to support shift, cmov, and more instructions.
+ // FIXME: Need to support lvl too, but LVLGen runs after peephole-opt.
+
+ LLVM_DEBUG(dbgs() << "checking UseMI\n");
+ LLVM_DEBUG(UseMI.dump());
+ unsigned NewUseOpcSImm7;
+ unsigned NewUseOpcMImm;
+ enum InstType {
+ rr2ri_rm, // rr -> ri or rm, commutable
+ rr2ir_rm, // rr -> ir or rm
+ } InstType;
+
+ using namespace llvm::VE;
+#define INSTRKIND(NAME) \
+ case NAME##rr: \
+ NewUseOpcSImm7 = NAME##ri; \
+ NewUseOpcMImm = NAME##rm; \
+ InstType = rr2ri_rm; \
+ break
+#define NCINSTRKIND(NAME) \
+ case NAME##rr: \
+ NewUseOpcSImm7 = NAME##ir; \
+ NewUseOpcMImm = NAME##rm; \
+ InstType = rr2ir_rm; \
+ break
+
+ switch (UseMI.getOpcode()) {
+ default:
+ return false;
+
+ INSTRKIND(ADDUL);
+ INSTRKIND(ADDSWSX);
+ INSTRKIND(ADDSWZX);
+ INSTRKIND(ADDSL);
+ NCINSTRKIND(SUBUL);
+ NCINSTRKIND(SUBSWSX);
+ NCINSTRKIND(SUBSWZX);
+ NCINSTRKIND(SUBSL);
+ INSTRKIND(MULUL);
+ INSTRKIND(MULSWSX);
+ INSTRKIND(MULSWZX);
+ INSTRKIND(MULSL);
+ NCINSTRKIND(DIVUL);
+ NCINSTRKIND(DIVSWSX);
+ NCINSTRKIND(DIVSWZX);
+ NCINSTRKIND(DIVSL);
+ NCINSTRKIND(CMPUL);
+ NCINSTRKIND(CMPSWSX);
+ NCINSTRKIND(CMPSWZX);
+ NCINSTRKIND(CMPSL);
+ INSTRKIND(MAXSWSX);
+ INSTRKIND(MAXSWZX);
+ INSTRKIND(MAXSL);
+ INSTRKIND(MINSWSX);
+ INSTRKIND(MINSWZX);
+ INSTRKIND(MINSL);
+ INSTRKIND(AND);
+ INSTRKIND(OR);
+ INSTRKIND(XOR);
+ INSTRKIND(EQV);
+ NCINSTRKIND(NND);
+ NCINSTRKIND(MRG);
+ }
+
+#undef INSTRKIND
+
+ unsigned NewUseOpc;
+ unsigned UseIdx;
+ bool Commute = false;
+ LLVM_DEBUG(dbgs() << "checking UseMI operands\n");
+ switch (InstType) {
+ case rr2ri_rm:
+ UseIdx = 2;
+ if (UseMI.getOperand(1).getReg() == Reg) {
+ Commute = true;
+ } else {
+ assert(UseMI.getOperand(2).getReg() == Reg);
+ }
+ if (isInt<7>(ImmVal)) {
+ // This ImmVal fits in the simm7 slot, so change UseOpc to an instruction
+ // that holds a simm7 immediate.
+ NewUseOpc = NewUseOpcSImm7;
+ } else if (isMImmVal(ImmVal)) {
+ // Similarly, change UseOpc to an instruction that holds a mimm immediate.
+ NewUseOpc = NewUseOpcMImm;
+ ImmVal = val2MImm(ImmVal);
+ } else
+ return false;
+ break;
+ case rr2ir_rm:
+ if (UseMI.getOperand(1).getReg() == Reg) {
+ // Check whether the immediate value fits the UseMI instruction.
+ if (!isInt<7>(ImmVal))
+ return false;
+ NewUseOpc = NewUseOpcSImm7;
+ UseIdx = 1;
+ } else {
+ assert(UseMI.getOperand(2).getReg() == Reg);
+ // Check whether the immediate value fits the UseMI instruction.
+ if (!isMImmVal(ImmVal))
+ return false;
+ NewUseOpc = NewUseOpcMImm;
+ ImmVal = val2MImm(ImmVal);
+ UseIdx = 2;
+ }
+ break;
+ }
+
+ LLVM_DEBUG(dbgs() << "modifying UseMI\n");
+ bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
+ UseMI.setDesc(get(NewUseOpc));
+ if (Commute) {
+ UseMI.getOperand(1).setReg(UseMI.getOperand(UseIdx).getReg());
+ }
+ UseMI.getOperand(UseIdx).ChangeToImmediate(ImmVal);
+ if (DeleteDef)
+ DefMI.eraseFromParent();
+
+ return true;
+}
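The fold above only fires when ImmVal fits one of VE's two immediate slots. A standalone sketch of that classification with hypothetical helpers; the in-tree checks are isInt<7> and isMImmVal/val2MImm from VE.h:

#include <cstdint>

// simm7: any signed 7-bit value.
static bool fitsSImm7(int64_t V) { return V >= -64 && V <= 63; }

// mimm: either "(m)1" (m ones from the MSB followed by zeros) or "(m)0"
// (m zeros from the MSB followed by ones).
static bool fitsMImm(uint64_t V) {
  if (V == 0)
    return true;                                // (0)1
  if (((V + 1) & V) == 0)
    return true;                                // trailing-ones mask: (m)0
  uint64_t Inv = ~V;
  return (V >> 63) && (((Inv + 1) & Inv) == 0); // leading-ones mask: (m)1
}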
+
Register VEInstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
VEMachineFunctionInfo *VEFI = MF->getInfo<VEMachineFunctionInfo>();
Register GlobalBaseReg = VEFI->getGlobalBaseReg();
@@ -472,6 +740,106 @@ Register VEInstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
return GlobalBaseReg;
}
+static Register getVM512Upper(Register reg) {
+ return (reg - VE::VMP0) * 2 + VE::VM0;
+}
+
+static Register getVM512Lower(Register reg) { return getVM512Upper(reg) + 1; }
+
+// Expand pseudo logical vector instructions for VM512 registers.
+static void expandPseudoLogM(MachineInstr &MI, const MCInstrDesc &MCID) {
+ MachineBasicBlock *MBB = MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+
+ Register VMXu = getVM512Upper(MI.getOperand(0).getReg());
+ Register VMXl = getVM512Lower(MI.getOperand(0).getReg());
+ Register VMYu = getVM512Upper(MI.getOperand(1).getReg());
+ Register VMYl = getVM512Lower(MI.getOperand(1).getReg());
+
+ switch (MI.getOpcode()) {
+ default: {
+ Register VMZu = getVM512Upper(MI.getOperand(2).getReg());
+ Register VMZl = getVM512Lower(MI.getOperand(2).getReg());
+ BuildMI(*MBB, MI, DL, MCID).addDef(VMXu).addUse(VMYu).addUse(VMZu);
+ BuildMI(*MBB, MI, DL, MCID).addDef(VMXl).addUse(VMYl).addUse(VMZl);
+ break;
+ }
+ case VE::NEGMy:
+ BuildMI(*MBB, MI, DL, MCID).addDef(VMXu).addUse(VMYu);
+ BuildMI(*MBB, MI, DL, MCID).addDef(VMXl).addUse(VMYl);
+ break;
+ }
+ MI.eraseFromParent();
+}
+
+static void addOperandsForVFMK(MachineInstrBuilder &MIB, MachineInstr &MI,
+ bool Upper) {
+ // VM512
+ MIB.addReg(Upper ? getVM512Upper(MI.getOperand(0).getReg())
+ : getVM512Lower(MI.getOperand(0).getReg()));
+
+ switch (MI.getNumExplicitOperands()) {
+ default:
+ report_fatal_error("unexpected number of operands for pvfmk");
+ case 2: // _Ml: VM512, VL
+ // VL
+ MIB.addReg(MI.getOperand(1).getReg());
+ break;
+ case 4: // _Mvl: VM512, CC, VR, VL
+ // CC
+ MIB.addImm(MI.getOperand(1).getImm());
+ // VR
+ MIB.addReg(MI.getOperand(2).getReg());
+ // VL
+ MIB.addReg(MI.getOperand(3).getReg());
+ break;
+ case 5: // _MvMl: VM512, CC, VR, VM512, VL
+ // CC
+ MIB.addImm(MI.getOperand(1).getImm());
+ // VR
+ MIB.addReg(MI.getOperand(2).getReg());
+ // VM512
+ MIB.addReg(Upper ? getVM512Upper(MI.getOperand(3).getReg())
+ : getVM512Lower(MI.getOperand(3).getReg()));
+ // VL
+ MIB.addReg(MI.getOperand(4).getReg());
+ break;
+ }
+}
+
+static void expandPseudoVFMK(const TargetInstrInfo &TI, MachineInstr &MI) {
+ // Replace VFMKW* pseudos with pvfmk.w.up and pvfmk.w.lo, and VFMKS* pseudos
+ // with pvfmk.s.up and pvfmk.s.lo.
+
+ static std::map<unsigned, std::pair<unsigned, unsigned>> VFMKMap = {
+ {VE::VFMKyal, {VE::VFMKLal, VE::VFMKLal}},
+ {VE::VFMKynal, {VE::VFMKLnal, VE::VFMKLnal}},
+ {VE::VFMKWyvl, {VE::PVFMKWUPvl, VE::PVFMKWLOvl}},
+ {VE::VFMKWyvyl, {VE::PVFMKWUPvml, VE::PVFMKWLOvml}},
+ {VE::VFMKSyvl, {VE::PVFMKSUPvl, VE::PVFMKSLOvl}},
+ {VE::VFMKSyvyl, {VE::PVFMKSUPvml, VE::PVFMKSLOvml}},
+ };
+
+ unsigned Opcode = MI.getOpcode();
+
+ auto Found = VFMKMap.find(Opcode);
+ if (Found == VFMKMap.end())
+ report_fatal_error("unexpected opcode for pseudo vfmk");
+
+ unsigned OpcodeUpper = (*Found).second.first;
+ unsigned OpcodeLower = (*Found).second.second;
+
+ MachineBasicBlock *MBB = MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+
+ MachineInstrBuilder Bu = BuildMI(*MBB, MI, DL, TI.get(OpcodeUpper));
+ addOperandsForVFMK(Bu, MI, /* Upper */ true);
+ MachineInstrBuilder Bl = BuildMI(*MBB, MI, DL, TI.get(OpcodeLower));
+ addOperandsForVFMK(Bl, MI, /* Upper */ false);
+
+ MI.eraseFromParent();
+}
+
bool VEInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
switch (MI.getOpcode()) {
case VE::EXTEND_STACK: {
@@ -484,6 +852,110 @@ bool VEInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case VE::GETSTACKTOP: {
return expandGetStackTopPseudo(MI);
}
+
+ case VE::ANDMyy:
+ expandPseudoLogM(MI, get(VE::ANDMmm));
+ return true;
+ case VE::ORMyy:
+ expandPseudoLogM(MI, get(VE::ORMmm));
+ return true;
+ case VE::XORMyy:
+ expandPseudoLogM(MI, get(VE::XORMmm));
+ return true;
+ case VE::EQVMyy:
+ expandPseudoLogM(MI, get(VE::EQVMmm));
+ return true;
+ case VE::NNDMyy:
+ expandPseudoLogM(MI, get(VE::NNDMmm));
+ return true;
+ case VE::NEGMy:
+ expandPseudoLogM(MI, get(VE::NEGMm));
+ return true;
+
+ case VE::LVMyir:
+ case VE::LVMyim:
+ case VE::LVMyir_y:
+ case VE::LVMyim_y: {
+ Register VMXu = getVM512Upper(MI.getOperand(0).getReg());
+ Register VMXl = getVM512Lower(MI.getOperand(0).getReg());
+ int64_t Imm = MI.getOperand(1).getImm();
+ bool IsSrcReg =
+ MI.getOpcode() == VE::LVMyir || MI.getOpcode() == VE::LVMyir_y;
+ Register Src = IsSrcReg ? MI.getOperand(2).getReg() : VE::NoRegister;
+ int64_t MImm = IsSrcReg ? 0 : MI.getOperand(2).getImm();
+ bool KillSrc = IsSrcReg ? MI.getOperand(2).isKill() : false;
+ Register VMX = VMXl;
+ if (Imm >= 4) {
+ VMX = VMXu;
+ Imm -= 4;
+ }
+ MachineBasicBlock *MBB = MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+ switch (MI.getOpcode()) {
+ case VE::LVMyir:
+ BuildMI(*MBB, MI, DL, get(VE::LVMir))
+ .addDef(VMX)
+ .addImm(Imm)
+ .addReg(Src, getKillRegState(KillSrc));
+ break;
+ case VE::LVMyim:
+ BuildMI(*MBB, MI, DL, get(VE::LVMim))
+ .addDef(VMX)
+ .addImm(Imm)
+ .addImm(MImm);
+ break;
+ case VE::LVMyir_y:
+ assert(MI.getOperand(0).getReg() == MI.getOperand(3).getReg() &&
+ "LVMyir_y has different register in 3rd operand");
+ BuildMI(*MBB, MI, DL, get(VE::LVMir_m))
+ .addDef(VMX)
+ .addImm(Imm)
+ .addReg(Src, getKillRegState(KillSrc))
+ .addReg(VMX);
+ break;
+ case VE::LVMyim_y:
+ assert(MI.getOperand(0).getReg() == MI.getOperand(3).getReg() &&
+ "LVMyim_y has different register in 3rd operand");
+ BuildMI(*MBB, MI, DL, get(VE::LVMim_m))
+ .addDef(VMX)
+ .addImm(Imm)
+ .addImm(MImm)
+ .addReg(VMX);
+ break;
+ }
+ MI.eraseFromParent();
+ return true;
+ }
+ case VE::SVMyi: {
+ Register Dest = MI.getOperand(0).getReg();
+ Register VMZu = getVM512Upper(MI.getOperand(1).getReg());
+ Register VMZl = getVM512Lower(MI.getOperand(1).getReg());
+ bool KillSrc = MI.getOperand(1).isKill();
+ int64_t Imm = MI.getOperand(2).getImm();
+ Register VMZ = VMZl;
+ if (Imm >= 4) {
+ VMZ = VMZu;
+ Imm -= 4;
+ }
+ MachineBasicBlock *MBB = MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, MI, DL, get(VE::SVMmi), Dest).addReg(VMZ).addImm(Imm);
+ MachineInstr *Inst = MIB.getInstr();
+ MI.eraseFromParent();
+ if (KillSrc) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ Inst->addRegisterKilled(MI.getOperand(1).getReg(), TRI, true);
+ }
+ return true;
+ }
+ case VE::VFMKyal:
+ case VE::VFMKynal:
+ case VE::VFMKWyvl:
+ case VE::VFMKWyvyl:
+ case VE::VFMKSyvl:
+ case VE::VFMKSyvyl:
+ expandPseudoVFMK(*this, MI);
}
return false;
}
@@ -586,8 +1058,8 @@ bool VEInstrInfo::expandGetStackTopPseudo(MachineInstr &MI) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const VEFrameLowering &TFL = *STI.getFrameLowering();
- // The VE ABI requires a reserved 176 bytes area at the top
- // of stack as described in VESubtarget.cpp. So, we adjust it here.
+ // The VE ABI requires a reserved area at the top of stack as described
+ // in VEFrameLowering.cpp. So, we adjust it here.
unsigned NumBytes = STI.getAdjustedFrameSize(0);
// Also adds the size of parameter area.
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.h b/contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.h
index 7b6662df1d60..ed1f49182150 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.h
@@ -23,6 +23,31 @@ namespace llvm {
class VESubtarget;
+/// VEII - This namespace holds all of the Aurora VE target-specific
+/// per-instruction flags. These must match the corresponding definitions in
+/// VEInstrFormats.td.
+namespace VEII {
+enum {
+ // Aurora VE Instruction Flags. These flags describe the characteristics of
+ // the Aurora VE instructions for vector handling.
+
+ /// VE_Vector - This instruction is Vector Instruction.
+ VE_Vector = 0x1,
+
+ /// VE_VLInUse - This instruction has a vector register in its operands.
+ VE_VLInUse = 0x2,
+
+ /// VE_VLMask/Shift - This is a bitmask that selects the index number where
+ /// an instruction holds vector length information (0 to 6; 7 means undef).
+ VE_VLShift = 2,
+ VE_VLMask = 0x07 << VE_VLShift,
+};
+
+#define HAS_VLINDEX(TSF) ((TSF)&VEII::VE_VLInUse)
+#define GET_VLINDEX(TSF) \
+ (HAS_VLINDEX(TSF) ? (int)(((TSF)&VEII::VE_VLMask) >> VEII::VE_VLShift) : -1)
+} // end namespace VEII
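A minimal sketch of how these bits are written and read back, using hypothetical packVEFlags/getVLIndex helpers that mirror the TSFlags layout in VEInstrFormats.td:

#include <cassert>

// Bit 0: vector instruction, bit 1: VL operand in use, bits 4-2: VL index.
static unsigned packVEFlags(bool IsVector, bool VLInUse, unsigned VLIndex) {
  assert(VLIndex <= 7 && "VL index is a 3-bit field");
  return (IsVector ? VEII::VE_Vector : 0u) | (VLInUse ? VEII::VE_VLInUse : 0u) |
         (VLIndex << VEII::VE_VLShift);
}

// Mirrors GET_VLINDEX: the 3-bit field is only meaningful when VE_VLInUse is
// set; otherwise report -1.
static int getVLIndex(unsigned TSF) {
  if (!(TSF & VEII::VE_VLInUse))
    return -1;
  return int((TSF & VEII::VE_VLMask) >> VEII::VE_VLShift);
}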
+
class VEInstrInfo : public VEGenInstrInfo {
const VERegisterInfo RI;
virtual void anchor();
@@ -75,6 +100,13 @@ public:
const TargetRegisterInfo *TRI) const override;
/// } Stack Spill & Reload
+ /// Optimization {
+
+ bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
+ MachineRegisterInfo *MRI) const override;
+
+ /// } Optimization
+
Register getGlobalBaseReg(MachineFunction *MF) const;
// Lower pseudo instructions after register allocation.
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.td
index 8500f8ef1292..b6862cf7b30d 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.td
@@ -48,7 +48,7 @@ def LO7 : SDNodeXForm<imm, [{
SDLoc(N), MVT::i32);
}]>;
def MIMM : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(convMImmVal(getImmVal(N)),
+ return CurDAG->getTargetConstant(val2MImm(getImmVal(N)),
SDLoc(N), MVT::i32);
}]>;
def LO32 : SDNodeXForm<imm, [{
@@ -66,7 +66,7 @@ def LO7FP : SDNodeXForm<fpimm, [{
return CurDAG->getTargetConstant(SignExtend32(Val, 7), SDLoc(N), MVT::i32);
}]>;
def MIMMFP : SDNodeXForm<fpimm, [{
- return CurDAG->getTargetConstant(convMImmVal(getFpImmVal(N)),
+ return CurDAG->getTargetConstant(val2MImm(getFpImmVal(N)),
SDLoc(N), MVT::i32);
}]>;
def LOFP32 : SDNodeXForm<fpimm, [{
@@ -157,6 +157,15 @@ def uimm3 : Operand<i32>, PatLeaf<(imm), [{
let ParserMatchClass = UImm3AsmOperand;
}
+// uimm4 - Generic immediate value.
+def UImm4AsmOperand : AsmOperandClass {
+ let Name = "UImm4";
+}
+def uimm4 : Operand<i32>, PatLeaf<(imm), [{
+ return isUInt<4>(N->getZExtValue()); }], ULO7> {
+ let ParserMatchClass = UImm4AsmOperand;
+}
+
// uimm6 - Generic immediate value.
def UImm6AsmOperand : AsmOperandClass {
let Name = "UImm6";
@@ -196,6 +205,12 @@ def mimm : Operand<i32>, PatLeaf<(imm), [{
let PrintMethod = "printMImmOperand";
}
+// zerofp - Generic fp immediate zero value.
+def zerofp : Operand<i32>, PatLeaf<(fpimm), [{
+ return getFpImmVal(N) == 0; }]> {
+ let ParserMatchClass = ZeroAsmOperand;
+}
+
// simm7fp - Generic fp immediate value.
def simm7fp : Operand<i32>, PatLeaf<(fpimm), [{
return isInt<7>(getFpImmVal(N));
@@ -230,6 +245,7 @@ def fplomsbzero : PatLeaf<(fpimm), [{ return (getFpImmVal(N) & 0x80000000)
== 0; }]>;
def fplozero : PatLeaf<(fpimm), [{ return (getFpImmVal(N) & 0xffffffff)
== 0; }]>;
+def nonzero : PatLeaf<(imm), [{ return N->getSExtValue() != 0; }]>;
def CCSIOp : PatLeaf<(cond), [{
switch (N->get()) {
@@ -430,6 +446,17 @@ def retflag : SDNode<"VEISD::RET_FLAG", SDTNone,
def getGOT : Operand<iPTR>;
+def VEeh_sjlj_setjmp: SDNode<"VEISD::EH_SJLJ_SETJMP",
+ SDTypeProfile<1, 1, [SDTCisInt<0>,
+ SDTCisPtrTy<1>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+def VEeh_sjlj_longjmp: SDNode<"VEISD::EH_SJLJ_LONGJMP",
+ SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+def VEeh_sjlj_setup_dispatch: SDNode<"VEISD::EH_SJLJ_SETUP_DISPATCH",
+ SDTypeProfile<0, 0, []>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
// GETFUNPLT for PIC
def GetFunPLT : SDNode<"VEISD::GETFUNPLT", SDTIntUnaryOp>;
@@ -442,6 +469,16 @@ def GetTLSAddr : SDNode<"VEISD::GETTLSADDR", SDT_SPCall,
def GetStackTop : SDNode<"VEISD::GETSTACKTOP", SDTNone,
[SDNPHasChain, SDNPSideEffect]>;
+// MEMBARRIER
+def MemBarrier : SDNode<"VEISD::MEMBARRIER", SDTNone,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+// TS1AM
+def SDT_TS1AM : SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisPtrTy<1>,
+ SDTCisVT<2, i32>, SDTCisInt<3>]>;
+def ts1am : SDNode<"VEISD::TS1AM", SDT_TS1AM,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+ SDNPMemOperand]>;
//===----------------------------------------------------------------------===//
// VE Flag Conditions
@@ -497,7 +534,8 @@ multiclass RRbm<string opcStr, bits<8>opc,
RegisterClass RCo, ValueType Tyo,
RegisterClass RCi, ValueType Tyi,
SDPatternOperator OpNode = null_frag,
- Operand immOp = simm7, Operand mOp = mimm> {
+ Operand immOp = simm7, Operand mOp = mimm,
+ bit MoveImm = 0> {
def rr : RR<opc, (outs RCo:$sx), (ins RCi:$sy, RCi:$sz),
!strconcat(opcStr, " $sx, $sy, $sz"),
[(set Tyo:$sx, (OpNode Tyi:$sy, Tyi:$sz))]>;
@@ -514,7 +552,12 @@ multiclass RRbm<string opcStr, bits<8>opc,
let cy = 0, cz = 0 in
def im : RR<opc, (outs RCo:$sx), (ins immOp:$sy, mOp:$sz),
!strconcat(opcStr, " $sx, $sy, $sz"),
- [(set Tyo:$sx, (OpNode (Tyi immOp:$sy), (Tyi mOp:$sz)))]>;
+ [(set Tyo:$sx, (OpNode (Tyi immOp:$sy), (Tyi mOp:$sz)))]> {
+ // VE uses ORim as a move immediate instruction, so declare it here.
+ // An instruction declared as MoveImm will be optimized in FoldImmediate
+ // later.
+ let isMoveImm = MoveImm;
+ }
}
// Multiclass for non-commutative RR type instructions
@@ -546,8 +589,8 @@ multiclass RRNCbm<string opcStr, bits<8>opc,
multiclass RRm<string opcStr, bits<8>opc,
RegisterClass RC, ValueType Ty,
SDPatternOperator OpNode = null_frag,
- Operand immOp = simm7, Operand mOp = mimm> :
- RRbm<opcStr, opc, RC, Ty, RC, Ty, OpNode, immOp, mOp>;
+ Operand immOp = simm7, Operand mOp = mimm, bit MoveImm = 0> :
+ RRbm<opcStr, opc, RC, Ty, RC, Ty, OpNode, immOp, mOp, MoveImm>;
// Generic RR multiclass for non-commutative instructions with 2 arguments.
// e.g. SUBUL, SUBUW, SUBSWSX, and etc.
@@ -775,10 +818,10 @@ multiclass BCbpfm<string opcStr, string cmpStr, bits<8> opc, dag cond,
let bpf = 0 /* NONE */ in
def "" : CF<opc, (outs), !con(cond, (ins ADDR:$addr)),
!strconcat(opcStr, " ", cmpStr, "$addr")>;
- let bpf = 2 /* NOT TaKEN */ in
+ let bpf = 2 /* NOT TAKEN */ in
def _nt : CF<opc, (outs), !con(cond, (ins ADDR:$addr)),
!strconcat(opcStr, ".nt ", cmpStr, "$addr")>;
- let bpf = 3 /* TaKEN */ in
+ let bpf = 3 /* TAKEN */ in
def _t : CF<opc, (outs), !con(cond, (ins ADDR:$addr)),
!strconcat(opcStr, ".t ", cmpStr, "$addr")>;
}
@@ -807,18 +850,25 @@ multiclass BCRbpfm<string opcStr, string cmpStr, bits<8> opc, dag cond> {
let bpf = 0 /* NONE */ in
def "" : CF<opc, (outs), !con(cond, (ins brtarget32:$imm32)),
!strconcat(opcStr, " ", cmpStr, "$imm32")>;
- let bpf = 2 /* NOT TaKEN */ in
+ let bpf = 2 /* NOT TAKEN */ in
def _nt : CF<opc, (outs), !con(cond, (ins brtarget32:$imm32)),
!strconcat(opcStr, ".nt ", cmpStr, "$imm32")>;
- let bpf = 3 /* TaKEN */ in
+ let bpf = 3 /* TAKEN */ in
def _t : CF<opc, (outs), !con(cond, (ins brtarget32:$imm32)),
!strconcat(opcStr, ".t ", cmpStr, "$imm32")>;
}
multiclass BCRm<string opcStr, string opcStrAt, string opcStrAf, bits<8> opc,
- RegisterClass RC, Operand immOp> {
+ RegisterClass RC, Operand immOp, Operand zeroOp> {
defm rr : BCRbpfm<opcStr, "$sy, $sz, ", opc, (ins CCOp:$cf, RC:$sy, RC:$sz)>;
let cy = 0 in
- defm ir : BCRbpfm<opcStr, "$sy, $sz, ", opc, (ins CCOp:$cf, immOp:$sy, RC:$sz)>;
+ defm ir : BCRbpfm<opcStr, "$sy, $sz, ", opc, (ins CCOp:$cf, immOp:$sy,
+ RC:$sz)>;
+ let cz = 0 in
+ defm rz : BCRbpfm<opcStr, "$sy, $sz, ", opc, (ins CCOp:$cf, RC:$sy,
+ zeroOp:$sz)>;
+ let cy = 0, cz = 0 in
+ defm iz : BCRbpfm<opcStr, "$sy, $sz, ", opc, (ins CCOp:$cf, immOp:$sy,
+ zeroOp:$sz)>;
let cy = 0, sy = 0, cz = 0, sz = 0, cf = 15 /* AT */, isBarrier = 1 in
defm a : BCRbpfm<opcStrAt, "", opc, (ins)>;
let cy = 0, sy = 0, cz = 0, sz = 0, cf = 0 /* AF */ in
@@ -898,7 +948,7 @@ multiclass SHMm<string opcStr, bits<8> opc, RegisterClass RC> {
//-----------------------------------------------------------------------------
// Multiclass for generic RM instructions
-multiclass RMm<string opcStr, bits<8>opc, RegisterClass RC> {
+multiclass RMm<string opcStr, bits<8>opc, RegisterClass RC, bit MoveImm = 0> {
def rri : RM<opc, (outs RC:$dest), (ins MEMrri:$addr),
!strconcat(opcStr, " $dest, $addr"), []>;
let cy = 0 in
@@ -909,36 +959,27 @@ multiclass RMm<string opcStr, bits<8>opc, RegisterClass RC> {
!strconcat(opcStr, " $dest, $addr"), []>;
let cy = 0, cz = 0 in
def zii : RM<opc, (outs RC:$dest), (ins MEMzii:$addr),
- !strconcat(opcStr, " $dest, $addr"), []>;
+ !strconcat(opcStr, " $dest, $addr"), []> {
+ // VE uses LEAzii and LEASLzii as a move immediate instruction, so declare
+ // it here. An instruction declared as MoveImm will be optimized in
+ // FoldImmediate later.
+ let isMoveImm = MoveImm;
+ }
}
// Section 8.2.1 - LEA
-let cx = 0, DecoderMethod = "DecodeLoadI64" in
-defm LEA : RMm<"lea", 0x06, I64>;
-let cx = 1, DecoderMethod = "DecodeLoadI64" in
-defm LEASL : RMm<"lea.sl", 0x06, I64>;
-let cx = 0, DecoderMethod = "DecodeLoadI32", isCodeGenOnly = 1 in
-defm LEA32 : RMm<"lea", 0x06, I32>;
+let isReMaterializable = 1, isAsCheapAsAMove = 1,
+ DecoderMethod = "DecodeLoadI64" in {
+ let cx = 0 in defm LEA : RMm<"lea", 0x06, I64, /* MoveImm */ 1>;
+ let cx = 1 in defm LEASL : RMm<"lea.sl", 0x06, I64, /* MoveImm */ 1>;
+}
+// LEA basic patterns.
+// Need to be defined here to prioritize LEA over ADX.
def : Pat<(iPTR ADDRrri:$addr), (LEArri MEMrri:$addr)>;
def : Pat<(iPTR ADDRrii:$addr), (LEArii MEMrii:$addr)>;
def : Pat<(add I64:$base, simm32:$disp), (LEArii $base, 0, (LO32 $disp))>;
def : Pat<(add I64:$base, lozero:$disp), (LEASLrii $base, 0, (HI32 $disp))>;
-def : Pat<(add I32:$base, simm32:$disp),
- (LEA32rii (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $base, sub_i32), 0,
- (LO32 $disp))>;
-
-def lea_add : PatFrags<(ops node:$base, node:$idx, node:$disp),
- [(add (add node:$base, node:$idx), node:$disp),
- (add (add node:$base, node:$disp), node:$idx)]>;
-def : Pat<(lea_add I64:$base, simm7:$idx, simm32:$disp),
- (LEArii $base, (LO7 $idx), (LO32 $disp))>;
-def : Pat<(lea_add I64:$base, I64:$idx, simm32:$disp),
- (LEArri $base, $idx, (LO32 $disp))>;
-def : Pat<(lea_add I64:$base, simm7:$idx, lozero:$disp),
- (LEASLrii $base, (LO7 $idx), (HI32 $disp))>;
-def : Pat<(lea_add I64:$base, I64:$idx, lozero:$disp),
- (LEASLrri $base, $idx, (HI32 $disp))>;
// Multiclass for load instructions.
let mayLoad = 1, hasSideEffects = 0 in
@@ -991,6 +1032,13 @@ defm LD1BSX : LOADm<"ld1b.sx", 0x05, I32, i32, sextloadi8>;
let cx = 1, DecoderMethod = "DecodeLoadI32" in
defm LD1BZX : LOADm<"ld1b.zx", 0x05, I32, i32, zextloadi8>;
+// LDQ pseudo instructions
+let mayLoad = 1, hasSideEffects = 0 in {
+ def LDQrii : Pseudo<(outs F128:$dest), (ins MEMrii:$addr),
+ "# pseudo ldq $dest, $addr",
+ [(set f128:$dest, (load ADDRrii:$addr))]>;
+}
+
// Multiclass for store instructions.
let mayStore = 1 in
multiclass STOREm<string opcStr, bits<8> opc, RegisterClass RC, ValueType Ty,
@@ -1036,6 +1084,13 @@ defm ST2B : STOREm<"st2b", 0x14, I32, i32, truncstorei16>;
let DecoderMethod = "DecodeStoreI32" in
defm ST1B : STOREm<"st1b", 0x15, I32, i32, truncstorei8>;
+// STQ pseudo instructions
+let mayStore = 1, hasSideEffects = 0 in {
+ def STQrii : Pseudo<(outs), (ins MEMrii:$addr, F128:$sx),
+ "# pseudo stq $sx, $addr",
+ [(store f128:$sx, ADDRrii:$addr)]>;
+}
+
// Section 8.2.12 - DLDS
let DecoderMethod = "DecodeLoadI64" in
defm DLD : LOADm<"dld", 0x09, I64, i64, load>;
@@ -1074,9 +1129,9 @@ defm ATMAM : RRCASm<"atmam", 0x53, I64, i64, uimm0to2>;
// Section 8.2.20 - CAS (Compare and Swap)
let DecoderMethod = "DecodeCASI64" in
-defm CASL : RRCASm<"cas.l", 0x62, I64, i64, simm7>;
+defm CASL : RRCASm<"cas.l", 0x62, I64, i64, simm7, atomic_cmp_swap_64>;
let DecoderMethod = "DecodeCASI32", cx = 1 in
-defm CASW : RRCASm<"cas.w", 0x62, I32, i32, simm7>;
+defm CASW : RRCASm<"cas.w", 0x62, I32, i32, simm7, atomic_cmp_swap_32>;
//-----------------------------------------------------------------------------
// Section 8.3 - Transfer Control Instructions
@@ -1106,6 +1161,8 @@ def SVOB : RR<0x30, (outs), (ins), "svob">;
// Section 8.4 - Fixed-point Operation Instructions
//-----------------------------------------------------------------------------
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+
// Section 8.4.1 - ADD (Add)
defm ADDUL : RRm<"addu.l", 0x48, I64, i64>;
let cx = 1 in defm ADDUW : RRm<"addu.w", 0x48, I32, i32>;
@@ -1128,6 +1185,8 @@ let cx = 1 in defm SUBSWZX : RRNCm<"subs.w.zx", 0x5A, I32, i32>;
// Section 8.4.6 - SBX (Subtract)
defm SUBSL : RRNCm<"subs.l", 0x5B, I64, i64, sub>;
+} // isReMaterializable, isAsCheapAsAMove
+
// Section 8.4.7 - MPY (Multiply)
defm MULUL : RRm<"mulu.l", 0x49, I64, i64>;
let cx = 1 in defm MULUW : RRm<"mulu.w", 0x49, I32, i32>;
@@ -1153,6 +1212,8 @@ let cx = 1 in defm DIVSWZX : RRNCm<"divs.w.zx", 0x7B, I32, i32>;
// Section 8.4.13 - DVX (Divide)
defm DIVSL : RRNCm<"divs.l", 0x7F, I64, i64, sdiv>;
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+
// Section 8.4.14 - CMP (Compare)
defm CMPUL : RRNCm<"cmpu.l", 0x55, I64, i64>;
let cx = 1 in defm CMPUW : RRNCm<"cmpu.w", 0x55, I32, i32>;
@@ -1175,45 +1236,66 @@ let cx = 1, cw = 1 in defm MINSWZX : RRm<"mins.w.zx", 0x78, I32, i32>;
defm MAXSL : RRm<"maxs.l", 0x68, I64, i64>;
let cw = 1 in defm MINSL : RRm<"mins.l", 0x68, I64, i64>;
+} // isReMaterializable, isAsCheapAsAMove
+
//-----------------------------------------------------------------------------
// Section 8.5 - Logical Operation Instructions
//-----------------------------------------------------------------------------
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+
// Section 8.5.1 - AND (AND)
defm AND : RRm<"and", 0x44, I64, i64, and>;
-let isCodeGenOnly = 1 in defm AND32 : RRm<"and", 0x44, I32, i32, and>;
// Section 8.5.2 - OR (OR)
-defm OR : RRm<"or", 0x45, I64, i64, or>;
-let isCodeGenOnly = 1 in defm OR32 : RRm<"or", 0x45, I32, i32, or>;
+defm OR : RRm<"or", 0x45, I64, i64, or, simm7, mimm, /* MoveImm */ 1>;
// Section 8.5.3 - XOR (Exclusive OR)
defm XOR : RRm<"xor", 0x46, I64, i64, xor>;
-let isCodeGenOnly = 1 in defm XOR32 : RRm<"xor", 0x46, I32, i32, xor>;
// Section 8.5.4 - EQV (Equivalence)
defm EQV : RRm<"eqv", 0x47, I64, i64>;
+} // isReMaterializable, isAsCheapAsAMove
+
// Section 8.5.5 - NND (Negate AND)
def and_not : PatFrags<(ops node:$x, node:$y),
[(and (not node:$x), node:$y)]>;
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
defm NND : RRNCm<"nnd", 0x54, I64, i64, and_not>;
// Section 8.5.6 - MRG (Merge)
defm MRG : RRMRGm<"mrg", 0x56, I64, i64>;
// Section 8.5.7 - LDZ (Leading Zero Count)
-defm LDZ : RRI1m<"ldz", 0x67, I64, i64, ctlz>;
+def ctlz_pat : PatFrags<(ops node:$src),
+ [(ctlz node:$src),
+ (ctlz_zero_undef node:$src)]>;
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+defm LDZ : RRI1m<"ldz", 0x67, I64, i64, ctlz_pat>;
// Section 8.5.8 - PCNT (Population Count)
defm PCNT : RRI1m<"pcnt", 0x38, I64, i64, ctpop>;
// Section 8.5.9 - BRV (Bit Reverse)
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
defm BRV : RRI1m<"brv", 0x39, I64, i64, bitreverse>;
// Section 8.5.10 - BSWP (Byte Swap)
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
defm BSWP : RRSWPm<"bswp", 0x2B, I64, i64>;
+def : Pat<(i64 (bswap i64:$src)),
+ (BSWPri $src, 0)>;
+def : Pat<(i64 (bswap (i64 mimm:$src))),
+ (BSWPmi (MIMM $src), 0)>;
+def : Pat<(i32 (bswap i32:$src)),
+ (EXTRACT_SUBREG
+ (BSWPri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $src, sub_i32), 1),
+ sub_i32)>;
+def : Pat<(i32 (bswap (i32 mimm:$src))),
+ (EXTRACT_SUBREG (BSWPmi (MIMM $src), 1), sub_i32)>;
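+// In the patterns above, BSWP's second operand appears to select the swap
+// width: 0 swaps the full 64-bit value, while 1 swaps only the low 32 bits,
+// which is why the i32 patterns pass 1.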
+
// Section 8.5.11 - CMOV (Conditional Move)
let cw = 0, cw2 = 0 in defm CMOVL : RRCMOVm<"cmov.l.${cfw}", 0x3B, I64, i64>;
let cw = 1, cw2 = 0 in defm CMOVW : RRCMOVm<"cmov.w.${cfw}", 0x3B, I32, i32>;
@@ -1229,17 +1311,21 @@ def : MnemonicAlias<"cmov.s", "cmov.s.at">;
//-----------------------------------------------------------------------------
// Section 8.6.1 - SLL (Shift Left Logical)
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
defm SLL : RRIm<"sll", 0x65, I64, i64, shl>;
// Section 8.6.2 - SLD (Shift Left Double)
defm SLD : RRILDm<"sld", 0x64, I64, i64>;
// Section 8.6.3 - SRL (Shift Right Logical)
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
defm SRL : RRIm<"srl", 0x75, I64, i64, srl>;
// Section 8.6.4 - SRD (Shift Right Double)
defm SRD : RRIRDm<"srd", 0x74, I64, i64>;
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+
// Section 8.6.5 - SLA (Shift Left Arithmetic)
defm SLAWSX : RRIm<"sla.w.sx", 0x66, I32, i32, shl>;
let cx = 1 in defm SLAWZX : RRIm<"sla.w.zx", 0x66, I32, i32>;
@@ -1254,6 +1340,8 @@ let cx = 1 in defm SRAWZX : RRIm<"sra.w.zx", 0x76, I32, i32>;
// Section 8.6.8 - SRAX (Shift Right Arithmetic)
defm SRAL : RRIm<"sra.l", 0x77, I64, i64, sra>;
+} // isReMaterializable, isAsCheapAsAMove
+
def : Pat<(i32 (srl i32:$src, (i32 simm7:$val))),
(EXTRACT_SUBREG (SRLri (ANDrm (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
$src, sub_i32), !add(32, 64)), imm:$val), sub_i32)>;
@@ -1302,13 +1390,13 @@ let cw = 1, cx = 1 in
defm FMINS : RRFm<"fmin.s", 0x3E, F32, f32, fminnum, simm7fp, mimmfp32>;
// Section 8.7.7 - FAQ (Floating Add Quadruple)
-defm FADDQ : RRFm<"fadd.q", 0x6C, F128, f128>;
+defm FADDQ : RRFm<"fadd.q", 0x6C, F128, f128, fadd>;
// Section 8.7.8 - FSQ (Floating Subtract Quadruple)
-defm FSUBQ : RRFm<"fsub.q", 0x7C, F128, f128>;
+defm FSUBQ : RRFm<"fsub.q", 0x7C, F128, f128, fsub>;
// Section 8.7.9 - FMQ (Floating Multiply Quadruple)
-defm FMULQ : RRFm<"fmul.q", 0x6D, F128, f128>;
+defm FMULQ : RRFm<"fmul.q", 0x6D, F128, f128, fmul>;
// Section 8.7.10 - FCQ (Floating Compare Quadruple)
defm FCMPQ : RRNCbm<"fcmp.q", 0x7D, I64, f64, F128, f128, null_frag, simm7fp,
@@ -1339,17 +1427,17 @@ defm CVTDL : CVTm<"cvt.d.l", 0x5F, I64, f64, I64, i64, sint_to_fp>;
// Section 8.7.15 - CVS (Convert to Single-format)
defm CVTSD : CVTm<"cvt.s.d", 0x1F, F32, f32, I64, f64, fpround>;
let cx = 1 in
-defm CVTSQ : CVTm<"cvt.s.q", 0x1F, F32, f32, F128, f128>;
+defm CVTSQ : CVTm<"cvt.s.q", 0x1F, F32, f32, F128, f128, fpround>;
// Section 8.7.16 - CVD (Convert to Double-format)
defm CVTDS : CVTm<"cvt.d.s", 0x0F, I64, f64, F32, f32, fpextend>;
let cx = 1 in
-defm CVTDQ : CVTm<"cvt.d.q", 0x0F, I64, f64, F128, f128>;
+defm CVTDQ : CVTm<"cvt.d.q", 0x0F, I64, f64, F128, f128, fpround>;
// Section 8.7.17 - CVQ (Convert to Quadruple-format)
-defm CVTQD : CVTm<"cvt.q.d", 0x2D, F128, f128, I64, f64>;
+defm CVTQD : CVTm<"cvt.q.d", 0x2D, F128, f128, I64, f64, fpextend>;
let cx = 1 in
-defm CVTQS : CVTm<"cvt.q.s", 0x2D, F128, f128, F32, f32>;
+defm CVTQS : CVTm<"cvt.q.s", 0x2D, F128, f128, F32, f32, fpextend>;
//-----------------------------------------------------------------------------
// Section 8.8 - Branch instructions
@@ -1378,13 +1466,13 @@ defm BCFS : BCm<"b${cond}.s", "b.s", "baf.s", 0x1C, F32, simm7fp>;
// Section 8.8.4 - BCR (Branch on Condition Relative)
let cx = 0, cx2 = 0 in
-defm BRCFL : BCRm<"br${cf}.l", "br.l", "braf.l", 0x18, I64, simm7>;
+defm BRCFL : BCRm<"br${cf}.l", "br.l", "braf.l", 0x18, I64, simm7, zero>;
let cx = 1, cx2 = 0 in
-defm BRCFW : BCRm<"br${cf}.w", "br.w", "braf.w", 0x18, I32, simm7>;
+defm BRCFW : BCRm<"br${cf}.w", "br.w", "braf.w", 0x18, I32, simm7, zero>;
let cx = 0, cx2 = 1 in
-defm BRCFD : BCRm<"br${cf}.d", "br.d", "braf.d", 0x18, I64, simm7fp>;
+defm BRCFD : BCRm<"br${cf}.d", "br.d", "braf.d", 0x18, I64, simm7fp, zerofp>;
let cx = 1, cx2 = 1 in
-defm BRCFS : BCRm<"br${cf}.s", "br.s", "braf.s", 0x18, F32, simm7fp>;
+defm BRCFS : BCRm<"br${cf}.s", "br.s", "braf.s", 0x18, F32, simm7fp, zerofp>;
// Section 8.8.5 - BSIC (Branch and Save IC)
let isCall = 1, hasSideEffects = 0, DecoderMethod = "DecodeCall" in
@@ -1481,11 +1569,23 @@ defm SHMB : SHMm<"shm.b", 0x31, I64>;
// Pattern Matchings
//===----------------------------------------------------------------------===//
+// Basic casts between registers. These are used frequently in ISel patterns,
+// so define them as OutPatFrags.
+def i2l : OutPatFrag<(ops node:$exp),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $exp, sub_i32)>;
+def l2i : OutPatFrag<(ops node:$exp),
+ (EXTRACT_SUBREG $exp, sub_i32)>;
+def f2l : OutPatFrag<(ops node:$exp),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $exp, sub_f32)>;
+def l2f : OutPatFrag<(ops node:$exp),
+ (EXTRACT_SUBREG $exp, sub_f32)>;
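+// For example, (i2l $x) places a 32-bit value into the low half of a fresh
+// 64-bit register via INSERT_SUBREG, and (l2i $x) extracts it again; the
+// atomic-load patterns below use these to avoid repeating that boilerplate.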
+
// Small immediates.
-def : Pat<(i32 simm7:$val), (OR32im (LO7 $val), 0)>;
+def : Pat<(i32 simm7:$val), (EXTRACT_SUBREG (ORim (LO7 $val), 0), sub_i32)>;
def : Pat<(i64 simm7:$val), (ORim (LO7 $val), 0)>;
// Medium immediates.
-def : Pat<(i32 simm32:$val), (LEA32zii 0, 0, (LO32 $val))>;
+def : Pat<(i32 simm32:$val),
+ (EXTRACT_SUBREG (LEAzii 0, 0, (LO32 $val)), sub_i32)>;
def : Pat<(i64 simm32:$val), (LEAzii 0, 0, (LO32 $val))>;
def : Pat<(i64 uimm32:$val), (ANDrm (LEAzii 0, 0, (LO32 $val)), !add(32, 64))>;
// Arbitrary immediates.
@@ -1497,6 +1597,54 @@ def : Pat<(i64 imm:$val),
(LEASLrii (ANDrm (LEAzii 0, 0, (LO32 imm:$val)), !add(32, 64)), 0,
(HI32 imm:$val))>;
+// LEA patterns
+def lea_add : PatFrags<(ops node:$base, node:$idx, node:$disp),
+ [(add (add node:$base, node:$idx), node:$disp),
+ (add (add node:$base, node:$disp), node:$idx),
+ (add node:$base, (add $idx, $disp))]>;
+def : Pat<(lea_add I64:$base, simm7:$idx, simm32:$disp),
+ (LEArii $base, (LO7 $idx), (LO32 $disp))>;
+def : Pat<(lea_add I64:$base, I64:$idx, simm32:$disp),
+ (LEArri $base, $idx, (LO32 $disp))>;
+def : Pat<(lea_add I64:$base, simm7:$idx, lozero:$disp),
+ (LEASLrii $base, (LO7 $idx), (HI32 $disp))>;
+def : Pat<(lea_add I64:$base, I64:$idx, lozero:$disp),
+ (LEASLrri $base, $idx, (HI32 $disp))>;
+
+// Address calculation patterns and optimizations
+//
+// These generate the following instruction sequences:
+// 1. LEA %reg, label@LO32
+// AND %reg, %reg, (32)0
+// 2. LEASL %reg, label@HI32
+// 3. (LEA %reg, label@LO32)
+// (AND %reg, %reg, (32)0)
+// LEASL %reg, label@HI32(, %reg)
+// 4. (LEA %reg, label@LO32)
+// (AND %reg, %reg, (32)0)
+// LEASL %reg, label@HI32(%reg, %got)
+//
+def velo_only : OutPatFrag<(ops node:$lo),
+ (ANDrm (LEAzii 0, 0, $lo), !add(32, 64))>;
+def vehi_only : OutPatFrag<(ops node:$hi),
+ (LEASLzii 0, 0, $hi)>;
+def vehi_lo : OutPatFrag<(ops node:$hi, node:$lo),
+ (LEASLrii $lo, 0, $hi)>;
+def vehi_lo_imm : OutPatFrag<(ops node:$hi, node:$lo, node:$idx),
+ (LEASLrii $lo, $idx, $hi)>;
+def vehi_baselo : OutPatFrag<(ops node:$base, node:$hi, node:$lo),
+ (LEASLrri $base, $lo, $hi)>;
+foreach type = [ "tblockaddress", "tconstpool", "texternalsym", "tglobaladdr",
+ "tglobaltlsaddr", "tjumptable" ] in {
+ def : Pat<(VElo !cast<SDNode>(type):$lo), (velo_only $lo)>;
+ def : Pat<(VEhi !cast<SDNode>(type):$hi), (vehi_only $hi)>;
+ def : Pat<(add (VEhi !cast<SDNode>(type):$hi), I64:$lo), (vehi_lo $hi, $lo)>;
+ def : Pat<(add (add (VEhi !cast<SDNode>(type):$hi), I64:$lo), simm7:$val),
+ (vehi_lo_imm $hi, $lo, (LO7 $val))>;
+ def : Pat<(add I64:$base, (add (VEhi !cast<SDNode>(type):$hi), I64:$lo)),
+ (vehi_baselo $base, $hi, $lo)>;
+}
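+// Illustrative example (following case 3 above): taking the address of a
+// symbol `sym` roughly expands to
+//   lea %s0, sym@LO32
+//   and %s0, %s0, (32)0
+//   lea.sl %s0, sym@HI32(, %s0)
+// where `sym` stands for any of the symbol kinds listed in the foreach.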
+
// floating point
def : Pat<(f32 fpimm:$val),
(EXTRACT_SUBREG (LEASLzii 0, 0, (HIFP32 $val)), sub_f32)>;
@@ -1526,8 +1674,8 @@ def : Pat<(sext_inreg I64:$src, i8),
(SRALri (SLLri $src, 56), 56)>;
def : Pat<(sext_inreg (i32 (trunc i64:$src)), i8),
(EXTRACT_SUBREG (SRALri (SLLri $src, 56), 56), sub_i32)>;
-def : Pat<(and (trunc i64:$src), 0xff),
- (AND32rm (EXTRACT_SUBREG $src, sub_i32), !add(56, 64))>;
+def : Pat<(i32 (and (trunc i64:$src), 0xff)),
+ (EXTRACT_SUBREG (ANDrm $src, !add(56, 64)), sub_i32)>;
// Cast to i16
def : Pat<(sext_inreg I32:$src, i16),
@@ -1536,28 +1684,34 @@ def : Pat<(sext_inreg I64:$src, i16),
(SRALri (SLLri $src, 48), 48)>;
def : Pat<(sext_inreg (i32 (trunc i64:$src)), i16),
(EXTRACT_SUBREG (SRALri (SLLri $src, 48), 48), sub_i32)>;
-def : Pat<(and (trunc i64:$src), 0xffff),
- (AND32rm (EXTRACT_SUBREG $src, sub_i32), !add(48, 64))>;
+def : Pat<(i32 (and (trunc i64:$src), 0xffff)),
+ (EXTRACT_SUBREG (ANDrm $src, !add(48, 64)), sub_i32)>;
// Cast to i32
def : Pat<(i32 (trunc i64:$src)),
- (ADDSWSXrm (EXTRACT_SUBREG $src, sub_i32), 0)>;
-def : Pat<(i32 (fp_to_sint I64:$reg)), (CVTWDSXr RD_RZ, $reg)>;
-def : Pat<(i32 (fp_to_sint F32:$reg)), (CVTWSSXr RD_RZ, $reg)>;
+ (EXTRACT_SUBREG (ANDrm $src, !add(32, 64)), sub_i32)>;
+def : Pat<(i32 (fp_to_sint f32:$src)), (CVTWSSXr RD_RZ, $src)>;
+def : Pat<(i32 (fp_to_sint f64:$src)), (CVTWDSXr RD_RZ, $src)>;
+def : Pat<(i32 (fp_to_sint f128:$src)), (CVTWDSXr RD_RZ, (CVTDQr $src))>;
// Cast to i64
-def : Pat<(sext_inreg I64:$src, i32),
+def : Pat<(sext_inreg i64:$src, i32),
(INSERT_SUBREG (i64 (IMPLICIT_DEF)),
(ADDSWSXrm (EXTRACT_SUBREG $src, sub_i32), 0), sub_i32)>;
-def : Pat<(i64 (sext i32:$sy)),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (ADDSWSXrm $sy, 0), sub_i32)>;
-def : Pat<(i64 (zext i32:$sy)),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (ADDSWZXrm $sy, 0), sub_i32)>;
-def : Pat<(i64 (fp_to_sint f32:$sy)), (CVTLDr RD_RZ, (CVTDSr $sy))>;
-def : Pat<(i64 (fp_to_sint I64:$reg)), (CVTLDr RD_RZ, $reg)>;
+def : Pat<(i64 (sext i32:$src)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (ADDSWSXrm $src, 0), sub_i32)>;
+def : Pat<(i64 (zext i32:$src)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (ADDSWZXrm $src, 0), sub_i32)>;
+def : Pat<(i64 (fp_to_sint f32:$src)), (CVTLDr RD_RZ, (CVTDSr $src))>;
+def : Pat<(i64 (fp_to_sint f64:$src)), (CVTLDr RD_RZ, $src)>;
+def : Pat<(i64 (fp_to_sint f128:$src)), (CVTLDr RD_RZ, (CVTDQr $src))>;
// Cast to f32
-def : Pat<(f32 (sint_to_fp i64:$sy)), (CVTSDr (CVTDLr i64:$sy))>;
+def : Pat<(f32 (sint_to_fp i64:$src)), (CVTSDr (CVTDLr i64:$src))>;
+
+// Cast to f128
+def : Pat<(f128 (sint_to_fp i32:$src)), (CVTQDr (CVTDWr $src))>;
+def : Pat<(f128 (sint_to_fp i64:$src)), (CVTQDr (CVTDLr $src))>;
def : Pat<(i64 (anyext i32:$sy)),
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), $sy, sub_i32)>;
@@ -1625,29 +1779,150 @@ defm : TRUNC64m<truncstorei8, ST1Brri, ST1Brii, ST1Bzri, ST1Bzii>;
defm : TRUNC64m<truncstorei16, ST2Brri, ST2Brii, ST2Bzri, ST2Bzii>;
defm : TRUNC64m<truncstorei32, STLrri, STLrii, STLzri, STLzii>;
-// Address calculation and its optimization
-def : Pat<(VEhi tglobaladdr:$in), (LEASLzii 0, 0, tglobaladdr:$in)>;
-def : Pat<(VElo tglobaladdr:$in),
- (ANDrm (LEAzii 0, 0, tglobaladdr:$in), !add(32, 64))>;
-def : Pat<(add (VEhi tglobaladdr:$in1), (VElo tglobaladdr:$in2)),
- (LEASLrii (ANDrm (LEAzii 0, 0, tglobaladdr:$in2), !add(32, 64)), 0,
- (tglobaladdr:$in1))>;
-
-// GlobalTLS address calculation and its optimization
-def : Pat<(VEhi tglobaltlsaddr:$in), (LEASLzii 0, 0, tglobaltlsaddr:$in)>;
-def : Pat<(VElo tglobaltlsaddr:$in),
- (ANDrm (LEAzii 0, 0, tglobaltlsaddr:$in), !add(32, 64))>;
-def : Pat<(add (VEhi tglobaltlsaddr:$in1), (VElo tglobaltlsaddr:$in2)),
- (LEASLrii (ANDrm (LEAzii 0, 0, tglobaltlsaddr:$in2), !add(32, 64)), 0,
- (tglobaltlsaddr:$in1))>;
-
-// Address calculation and its optimization
-def : Pat<(VEhi texternalsym:$in), (LEASLzii 0, 0, texternalsym:$in)>;
-def : Pat<(VElo texternalsym:$in),
- (ANDrm (LEAzii 0, 0, texternalsym:$in), !add(32, 64))>;
-def : Pat<(add (VEhi texternalsym:$in1), (VElo texternalsym:$in2)),
- (LEASLrii (ANDrm (LEAzii 0, 0, texternalsym:$in2), !add(32, 64)), 0,
- (texternalsym:$in1))>;
+// Atomic loads
+multiclass ATMLDm<SDPatternOperator from,
+ SDPatternOperator torri, SDPatternOperator torii,
+ SDPatternOperator tozri, SDPatternOperator tozii> {
+ def : Pat<(from ADDRrri:$addr), (torri MEMrri:$addr)>;
+ def : Pat<(from ADDRrii:$addr), (torii MEMrii:$addr)>;
+ def : Pat<(from ADDRzri:$addr), (tozri MEMzri:$addr)>;
+ def : Pat<(from ADDRzii:$addr), (tozii MEMzii:$addr)>;
+}
+defm : ATMLDm<atomic_load_8, LD1BZXrri, LD1BZXrii, LD1BZXzri, LD1BZXzii>;
+defm : ATMLDm<atomic_load_16, LD2BZXrri, LD2BZXrii, LD2BZXzri, LD2BZXzii>;
+defm : ATMLDm<atomic_load_32, LDLZXrri, LDLZXrii, LDLZXzri, LDLZXzii>;
+defm : ATMLDm<atomic_load_64, LDrri, LDrii, LDzri, LDzii>;
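+// Plain atomic loads simply reuse the ordinary load instructions for each
+// addressing mode; the SX/ZX variants below additionally fold a surrounding
+// sign or zero extension into the extending load forms.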
+
+// Optimized atomic loads with sext
+multiclass SXATMLDm<SDPatternOperator from, Operand TY,
+ SDPatternOperator torri, SDPatternOperator torii,
+ SDPatternOperator tozri, SDPatternOperator tozii> {
+ def : Pat<(i64 (sext_inreg (i64 (anyext (from ADDRrri:$addr))), TY)),
+ (i2l (torri MEMrri:$addr))>;
+ def : Pat<(i64 (sext_inreg (i64 (anyext (from ADDRrii:$addr))), TY)),
+ (i2l (torii MEMrii:$addr))>;
+ def : Pat<(i64 (sext_inreg (i64 (anyext (from ADDRzri:$addr))), TY)),
+ (i2l (tozri MEMzri:$addr))>;
+ def : Pat<(i64 (sext_inreg (i64 (anyext (from ADDRzii:$addr))), TY)),
+ (i2l (tozii MEMzii:$addr))>;
+}
+multiclass SXATMLD32m<SDPatternOperator from,
+ SDPatternOperator torri, SDPatternOperator torii,
+ SDPatternOperator tozri, SDPatternOperator tozii> {
+ def : Pat<(i64 (sext (from ADDRrri:$addr))),
+ (i2l (torri MEMrri:$addr))>;
+ def : Pat<(i64 (sext (from ADDRrii:$addr))),
+ (i2l (torii MEMrii:$addr))>;
+ def : Pat<(i64 (sext (from ADDRzri:$addr))),
+ (i2l (tozri MEMzri:$addr))>;
+ def : Pat<(i64 (sext (from ADDRzii:$addr))),
+ (i2l (tozii MEMzii:$addr))>;
+}
+defm : SXATMLDm<atomic_load_8, i8, LD1BSXrri, LD1BSXrii, LD1BSXzri, LD1BSXzii>;
+defm : SXATMLDm<atomic_load_16, i16, LD2BSXrri, LD2BSXrii, LD2BSXzri,
+ LD2BSXzii>;
+defm : SXATMLD32m<atomic_load_32, LDLSXrri, LDLSXrii, LDLSXzri, LDLSXzii>;
+
+// Optimized atomic loads with zext
+multiclass ZXATMLDm<SDPatternOperator from, Operand VAL,
+ SDPatternOperator torri, SDPatternOperator torii,
+ SDPatternOperator tozri, SDPatternOperator tozii> {
+ def : Pat<(i64 (and (anyext (from ADDRrri:$addr)), VAL)),
+ (i2l (torri MEMrri:$addr))>;
+ def : Pat<(i64 (and (anyext (from ADDRrii:$addr)), VAL)),
+ (i2l (torii MEMrii:$addr))>;
+ def : Pat<(i64 (and (anyext (from ADDRzri:$addr)), VAL)),
+ (i2l (tozri MEMzri:$addr))>;
+ def : Pat<(i64 (and (anyext (from ADDRzii:$addr)), VAL)),
+ (i2l (tozii MEMzii:$addr))>;
+}
+multiclass ZXATMLD32m<SDPatternOperator from, Operand VAL,
+ SDPatternOperator torri, SDPatternOperator torii,
+ SDPatternOperator tozri, SDPatternOperator tozii> {
+ def : Pat<(i64 (zext (from ADDRrri:$addr))),
+ (i2l (torri MEMrri:$addr))>;
+ def : Pat<(i64 (zext (from ADDRrii:$addr))),
+ (i2l (torii MEMrii:$addr))>;
+ def : Pat<(i64 (zext (from ADDRzri:$addr))),
+ (i2l (tozri MEMzri:$addr))>;
+ def : Pat<(i64 (zext (from ADDRzii:$addr))),
+ (i2l (tozii MEMzii:$addr))>;
+}
+defm : ZXATMLDm<atomic_load_8, 0xFF, LD1BZXrri, LD1BZXrii, LD1BZXzri,
+ LD1BZXzii>;
+defm : ZXATMLDm<atomic_load_16, 0xFFFF, LD2BZXrri, LD2BZXrii, LD2BZXzri,
+ LD2BZXzii>;
+defm : ZXATMLD32m<atomic_load_32, 0xFFFFFFFF, LDLZXrri, LDLZXrii, LDLZXzri,
+ LDLZXzii>;
+
+// Atomic stores
+multiclass ATMSTm<SDPatternOperator from, ValueType ty,
+ SDPatternOperator torri, SDPatternOperator torii,
+ SDPatternOperator tozri, SDPatternOperator tozii> {
+ def : Pat<(from ADDRrri:$addr, ty:$src), (torri MEMrri:$addr, $src)>;
+ def : Pat<(from ADDRrii:$addr, ty:$src), (torii MEMrii:$addr, $src)>;
+ def : Pat<(from ADDRzri:$addr, ty:$src), (tozri MEMzri:$addr, $src)>;
+ def : Pat<(from ADDRzii:$addr, ty:$src), (tozii MEMzii:$addr, $src)>;
+}
+defm : ATMSTm<atomic_store_8, i32, ST1Brri, ST1Brii, ST1Bzri, ST1Bzii>;
+defm : ATMSTm<atomic_store_16, i32, ST2Brri, ST2Brii, ST2Bzri, ST2Bzii>;
+defm : ATMSTm<atomic_store_32, i32, STLrri, STLrii, STLzri, STLzii>;
+defm : ATMSTm<atomic_store_64, i64, STrri, STrii, STzri, STzii>;
+
+// Optimized atomic stores with truncate
+multiclass TRATMSTm<SDPatternOperator from,
+ ValueType ty,
+ SDPatternOperator torri,
+ SDPatternOperator torii,
+ SDPatternOperator tozri,
+ SDPatternOperator tozii> {
+ def : Pat<(from ADDRrri:$addr, (i32 (trunc i64:$src))),
+ (torri MEMrri:$addr, (EXTRACT_SUBREG $src, sub_i32))>;
+ def : Pat<(from ADDRrii:$addr, (i32 (trunc i64:$src))),
+ (torii MEMrii:$addr, (EXTRACT_SUBREG $src, sub_i32))>;
+ def : Pat<(from ADDRzri:$addr, (i32 (trunc i64:$src))),
+ (tozri MEMzri:$addr, (EXTRACT_SUBREG $src, sub_i32))>;
+ def : Pat<(from ADDRzii:$addr, (i32 (trunc i64:$src))),
+ (tozii MEMzii:$addr, (EXTRACT_SUBREG $src, sub_i32))>;
+}
+defm : TRATMSTm<atomic_store_8, i32, ST1Brri, ST1Brii, ST1Bzri, ST1Bzii>;
+defm : TRATMSTm<atomic_store_16, i32, ST2Brri, ST2Brii, ST2Bzri, ST2Bzii>;
+defm : TRATMSTm<atomic_store_32, i32, STLrri, STLrii, STLzri, STLzii>;
+
+// Atomic swaps
+def : Pat<(i32 (ts1am i64:$src, i32:$flag, i32:$new)),
+ (TS1AMWrir $src, 0, $flag, $new)>;
+def : Pat<(i32 (atomic_swap_32 ADDRri:$src, i32:$new)),
+ (TS1AMWrii MEMriRRM:$src, 15, $new)>;
+def : Pat<(i64 (atomic_swap_64 ADDRri:$src, i64:$new)),
+ (TS1AMLrir MEMriRRM:$src, (LEAzii 0, 0, 255), i64:$new)>;
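+// The TS1AM flag operand is a byte mask selecting which bytes are swapped:
+// 15 (0b1111) covers the 4 bytes of a 32-bit value, and the 255 materialized
+// by LEAzii covers all 8 bytes for the 64-bit swap (as read from the patterns
+// above).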
+
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling patterns
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+ usesCustomInserter = 1 in {
+ let isTerminator = 1 in
+ def EH_SjLj_LongJmp : Pseudo<(outs), (ins I64:$buf),
+ "# EH_SJLJ_LONGJMP",
+ [(VEeh_sjlj_longjmp I64:$buf)]>;
+
+ def EH_SjLj_SetJmp : Pseudo<(outs I32:$dst), (ins I64:$buf),
+ "# EH_SJLJ_SETJMP",
+ [(set I32:$dst, (VEeh_sjlj_setjmp I64:$buf))]>;
+
+ def EH_SjLj_Setup_Dispatch : Pseudo<(outs), (ins), "# EH_SJLJ_SETUP_DISPATCH",
+ [(VEeh_sjlj_setup_dispatch)]>;
+}
+
+let isTerminator = 1, isBranch = 1, isCodeGenOnly = 1 in
+ def EH_SjLj_Setup : Pseudo<(outs), (ins brtarget32:$dst),
+ "# EH_SJlJ_SETUP $dst">;
+
+//===----------------------------------------------------------------------===//
+// Branch related patterns
+//===----------------------------------------------------------------------===//
// Branches
def : Pat<(br bb:$addr), (BRCFLa bb:$addr)>;
@@ -1681,6 +1956,8 @@ multiclass BRCCFm<ValueType ty, SDPatternOperator BrOpNode1,
}
defm : BRCCFm<f32, BRCFSrr, BRCFSir>;
defm : BRCCFm<f64, BRCFDrr, BRCFDir>;
+def : Pat<(brcc cond:$cond, f128:$l, f128:$r, bb:$addr),
+ (BRCFDir (fcond2cc $cond), 0, (FCMPQrr $r, $l), bb:$addr)>;
//===----------------------------------------------------------------------===//
// Pseudo Instructions
@@ -1737,53 +2014,42 @@ let Uses = [SX11], hasSideEffects = 1 in
def GETSTACKTOP : Pseudo<(outs I64:$dst), (ins),
"# GET STACK TOP",
[(set iPTR:$dst, (GetStackTop))]>;
+
+// MEMBARRIER
+let hasSideEffects = 1 in
+def MEMBARRIER : Pseudo<(outs), (ins), "# MEMBARRIER", [(MemBarrier)] >;
+
+//===----------------------------------------------------------------------===//
+// Other patterns
+//===----------------------------------------------------------------------===//
+
// SETCC pattern matches
//
// CMP %tmp, lhs, rhs ; compare lhs and rhs
// or %res, 0, (0)1 ; initialize to 0
// CMOV %res, (63)0, %tmp ; set 1 if %tmp is true
-def : Pat<(i32 (setcc i64:$LHS, i64:$RHS, CCSIOp:$cond)),
- (EXTRACT_SUBREG
- (CMOVLrm (icond2cc $cond),
- (CMPSLrr i64:$LHS, i64:$RHS),
- !add(63, 64),
- (ORim 0, 0)), sub_i32)>;
-
-def : Pat<(i32 (setcc i64:$LHS, i64:$RHS, CCUIOp:$cond)),
- (EXTRACT_SUBREG
- (CMOVLrm (icond2cc $cond),
- (CMPULrr i64:$LHS, i64:$RHS),
- !add(63, 64),
- (ORim 0, 0)), sub_i32)>;
-
-def : Pat<(i32 (setcc i32:$LHS, i32:$RHS, CCSIOp:$cond)),
- (EXTRACT_SUBREG
- (CMOVWrm (icond2cc $cond),
- (CMPSWSXrr i32:$LHS, i32:$RHS),
- !add(63, 64),
- (ORim 0, 0)), sub_i32)>;
-
-def : Pat<(i32 (setcc i32:$LHS, i32:$RHS, CCUIOp:$cond)),
- (EXTRACT_SUBREG
- (CMOVWrm (icond2cc $cond),
- (CMPUWrr i32:$LHS, i32:$RHS),
- !add(63, 64),
- (ORim 0, 0)), sub_i32)>;
-
-def : Pat<(i32 (setcc f64:$LHS, f64:$RHS, cond:$cond)),
- (EXTRACT_SUBREG
- (CMOVDrm (fcond2cc $cond),
- (FCMPDrr f64:$LHS, f64:$RHS),
- !add(63, 64),
- (ORim 0, 0)), sub_i32)>;
-
-def : Pat<(i32 (setcc f32:$LHS, f32:$RHS, cond:$cond)),
- (EXTRACT_SUBREG
- (CMOVSrm (fcond2cc $cond),
- (FCMPSrr f32:$LHS, f32:$RHS),
- !add(63, 64),
- (ORim 0, 0)), sub_i32)>;
+class setccrr<Instruction INSN> :
+ OutPatFrag<(ops node:$cond, node:$comp),
+ (EXTRACT_SUBREG
+ (INSN $cond, $comp,
+ !add(63, 64), // means (63)0 == 1
+ (ORim 0, 0)), sub_i32)>;
+
+def : Pat<(i32 (setcc i32:$l, i32:$r, CCSIOp:$cond)),
+ (setccrr<CMOVWrm> (icond2cc $cond), (CMPSWSXrr $l, $r))>;
+def : Pat<(i32 (setcc i32:$l, i32:$r, CCUIOp:$cond)),
+ (setccrr<CMOVWrm> (icond2cc $cond), (CMPUWrr $l, $r))>;
+def : Pat<(i32 (setcc i64:$l, i64:$r, CCSIOp:$cond)),
+ (setccrr<CMOVLrm> (icond2cc $cond), (CMPSLrr $l, $r))>;
+def : Pat<(i32 (setcc i64:$l, i64:$r, CCUIOp:$cond)),
+ (setccrr<CMOVLrm> (icond2cc $cond), (CMPULrr $l, $r))>;
+def : Pat<(i32 (setcc f32:$l, f32:$r, cond:$cond)),
+ (setccrr<CMOVSrm> (fcond2cc $cond), (FCMPSrr $l, $r))>;
+def : Pat<(i32 (setcc f64:$l, f64:$r, cond:$cond)),
+ (setccrr<CMOVDrm> (fcond2cc $cond), (FCMPDrr $l, $r))>;
+def : Pat<(i32 (setcc f128:$l, f128:$r, cond:$cond)),
+ (setccrr<CMOVDrm> (fcond2cc $cond), (FCMPQrr $l, $r))>;
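+// Illustrative example of the scheme above: (i32 (setcc i64:$l, i64:$r,
+// SETLT)) selects roughly to
+//   cmps.l %tmp, %l, %r
+//   or %res, 0, (0)1
+//   cmov.l.lt %res, (63)0, %tmp
+// with the final result taken from the low 32 bits of %res.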
// Special SELECTCC pattern matches
// Use min/max for better performance.
@@ -1824,152 +2090,171 @@ def : Pat<(i64 (selectcc i64:$LHS, i64:$RHS, i64:$LHS, i64:$RHS, SETLE)),
def : Pat<(i32 (selectcc i32:$LHS, i32:$RHS, i32:$LHS, i32:$RHS, SETLE)),
(MINSWSXrr $LHS, $RHS)>;
+// Helper classes to construct cmov patterns for convenience.
+//
+// These hide the INSERT_SUBREG/EXTRACT_SUBREG boilerplate.
+
+class cmovrr<Instruction INSN> :
+ OutPatFrag<(ops node:$cond, node:$comp, node:$t, node:$f),
+ (INSN $cond, $comp, $t, $f)>;
+class cmovrm<Instruction INSN, SDNodeXForm MOP = MIMM> :
+ OutPatFrag<(ops node:$cond, node:$comp, node:$t, node:$f),
+ (INSN $cond, $comp, (MOP $t), $f)>;
+class cmov32rr<Instruction INSN, SubRegIndex sub_oty> :
+ OutPatFrag<(ops node:$cond, node:$comp, node:$t, node:$f),
+ (EXTRACT_SUBREG
+ (INSN $cond, $comp,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $t, sub_oty),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $f, sub_oty)),
+ sub_oty)>;
+class cmov32rm<Instruction INSN, SubRegIndex sub_oty, SDNodeXForm MOP = MIMM> :
+ OutPatFrag<(ops node:$cond, node:$comp, node:$t, node:$f),
+ (EXTRACT_SUBREG
+ (INSN $cond, $comp,
+ (MOP $t),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $f, sub_oty)),
+ sub_oty)>;
+class cmov128rr<Instruction INSN> :
+ OutPatFrag<(ops node:$cond, node:$comp, node:$t, node:$f),
+ (INSERT_SUBREG
+ (INSERT_SUBREG (f128 (IMPLICIT_DEF)),
+ (INSN $cond, $comp,
+ (EXTRACT_SUBREG $t, sub_odd),
+ (EXTRACT_SUBREG $f, sub_odd)), sub_odd),
+ (INSN $cond, $comp,
+ (EXTRACT_SUBREG $t, sub_even),
+ (EXTRACT_SUBREG $f, sub_even)), sub_even)>;
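+// cmov128rr applies the conditional move to the even and odd 64-bit halves of
+// the f128 register pair separately and reassembles the result with
+// INSERT_SUBREG, so one f128 select becomes two CMOVs on the same condition.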
+
// Generic SELECTCC pattern matches
//
// CMP %tmp, %l, %r ; compare %l and %r
// or %res, %f, (0)1 ; initialize to %f
// CMOV %res, %t, %tmp ; set %t if %tmp is true
-// selectcc for i64 result
-def : Pat<(i64 (selectcc i32:$l, i32:$r, i64:$t, i64:$f, CCSIOp:$cond)),
- (CMOVWrr (icond2cc $cond), (CMPSWSXrr $l, $r), $t, $f)>;
-def : Pat<(i64 (selectcc i32:$l, i32:$r, i64:$t, i64:$f, CCUIOp:$cond)),
- (CMOVWrr (icond2cc $cond), (CMPUWrr $l, $r), $t, $f)>;
-def : Pat<(i64 (selectcc i64:$l, i64:$r, i64:$t, i64:$f, CCSIOp:$cond)),
- (CMOVLrr (icond2cc $cond), (CMPSLrr $l, $r), $t, $f)>;
-def : Pat<(i64 (selectcc i64:$l, i64:$r, i64:$t, i64:$f, CCUIOp:$cond)),
- (CMOVLrr (icond2cc $cond), (CMPULrr $l, $r), $t, $f)>;
-def : Pat<(i64 (selectcc f32:$l, f32:$r, i64:$t, i64:$f, cond:$cond)),
- (CMOVSrr (fcond2cc $cond), (FCMPSrr $l, $r), $t, $f)>;
-def : Pat<(i64 (selectcc f64:$l, f64:$r, i64:$t, i64:$f, cond:$cond)),
- (CMOVDrr (fcond2cc $cond), (FCMPDrr $l, $r), $t, $f)>;
-
-// selectcc for i32 result
def : Pat<(i32 (selectcc i32:$l, i32:$r, i32:$t, i32:$f, CCSIOp:$cond)),
- (EXTRACT_SUBREG
- (CMOVWrr (icond2cc $cond),
- (CMPSWSXrr $l, $r),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $t, sub_i32),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $f, sub_i32)),
- sub_i32)>;
+ (cmov32rr<CMOVWrr, sub_i32> (icond2cc $cond), (CMPSWSXrr $l, $r),
+ $t, $f)>;
def : Pat<(i32 (selectcc i32:$l, i32:$r, i32:$t, i32:$f, CCUIOp:$cond)),
- (EXTRACT_SUBREG
- (CMOVWrr (icond2cc $cond),
- (CMPUWrr $l, $r),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $t, sub_i32),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $f, sub_i32)),
- sub_i32)>;
+ (cmov32rr<CMOVWrr, sub_i32> (icond2cc $cond), (CMPUWrr $l, $r),
+ $t, $f)>;
def : Pat<(i32 (selectcc i64:$l, i64:$r, i32:$t, i32:$f, CCSIOp:$cond)),
- (EXTRACT_SUBREG
- (CMOVLrr (icond2cc $cond),
- (CMPSLrr $l, $r),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $t, sub_i32),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $f, sub_i32)),
- sub_i32)>;
+ (cmov32rr<CMOVLrr, sub_i32> (icond2cc $cond), (CMPSLrr $l, $r),
+ $t, $f)>;
def : Pat<(i32 (selectcc i64:$l, i64:$r, i32:$t, i32:$f, CCUIOp:$cond)),
- (EXTRACT_SUBREG
- (CMOVLrr (icond2cc $cond),
- (CMPULrr $l, $r),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $t, sub_i32),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $f, sub_i32)),
- sub_i32)>;
+ (cmov32rr<CMOVLrr, sub_i32> (icond2cc $cond), (CMPULrr $l, $r),
+ $t, $f)>;
def : Pat<(i32 (selectcc f32:$l, f32:$r, i32:$t, i32:$f, cond:$cond)),
- (EXTRACT_SUBREG
- (CMOVSrr (fcond2cc $cond),
- (FCMPSrr $l, $r),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $t, sub_i32),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $f, sub_i32)),
- sub_i32)>;
+ (cmov32rr<CMOVSrr, sub_i32> (fcond2cc $cond), (FCMPSrr $l, $r),
+ $t, $f)>;
def : Pat<(i32 (selectcc f64:$l, f64:$r, i32:$t, i32:$f, cond:$cond)),
- (EXTRACT_SUBREG
- (CMOVDrr (fcond2cc $cond),
- (FCMPDrr $l, $r),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $t, sub_i32),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $f, sub_i32)),
- sub_i32)>;
+ (cmov32rr<CMOVDrr, sub_i32> (fcond2cc $cond), (FCMPDrr $l, $r),
+ $t, $f)>;
+def : Pat<(i32 (selectcc f128:$l, f128:$r, i32:$t, i32:$f, cond:$cond)),
+ (cmov32rr<CMOVDrr, sub_i32> (fcond2cc $cond), (FCMPQrr $l, $r),
+ $t, $f)>;
-// selectcc for f64 result
-def : Pat<(f64 (selectcc i32:$l, i32:$r, f64:$t, f64:$f, CCSIOp:$cond)),
- (CMOVWrr (icond2cc $cond), (CMPSWSXrr $l, $r), $t, $f)>;
-def : Pat<(f64 (selectcc i32:$l, i32:$r, f64:$t, f64:$f, CCUIOp:$cond)),
- (CMOVWrr (icond2cc $cond), (CMPUWrr $l, $r), $t, $f)>;
-def : Pat<(f64 (selectcc i64:$l, i64:$r, f64:$t, f64:$f, CCSIOp:$cond)),
- (CMOVLrr (icond2cc $cond), (CMPSLrr $l, $r), $t, $f)>;
-def : Pat<(f64 (selectcc i64:$l, i64:$r, f64:$t, f64:$f, CCUIOp:$cond)),
- (CMOVLrr (icond2cc $cond), (CMPULrr $l, $r), $t, $f)>;
-def : Pat<(f64 (selectcc f32:$l, f32:$r, f64:$t, f64:$f, cond:$cond)),
- (CMOVSrr (fcond2cc $cond), (FCMPSrr $l, $r), $t, $f)>;
-def : Pat<(f64 (selectcc f64:$l, f64:$r, f64:$t, f64:$f, cond:$cond)),
- (CMOVDrr (fcond2cc $cond), (FCMPDrr $l, $r), $t, $f)>;
+def : Pat<(i64 (selectcc i32:$l, i32:$r, i64:$t, i64:$f, CCSIOp:$cond)),
+ (cmovrr<CMOVWrr> (icond2cc $cond), (CMPSWSXrr $l, $r), $t, $f)>;
+def : Pat<(i64 (selectcc i32:$l, i32:$r, i64:$t, i64:$f, CCUIOp:$cond)),
+ (cmovrr<CMOVWrr> (icond2cc $cond), (CMPUWrr $l, $r), $t, $f)>;
+def : Pat<(i64 (selectcc i64:$l, i64:$r, i64:$t, i64:$f, CCSIOp:$cond)),
+ (cmovrr<CMOVLrr> (icond2cc $cond), (CMPSLrr $l, $r), $t, $f)>;
+def : Pat<(i64 (selectcc i64:$l, i64:$r, i64:$t, i64:$f, CCUIOp:$cond)),
+ (cmovrr<CMOVLrr> (icond2cc $cond), (CMPULrr $l, $r), $t, $f)>;
+def : Pat<(i64 (selectcc f32:$l, f32:$r, i64:$t, i64:$f, cond:$cond)),
+ (cmovrr<CMOVSrr> (fcond2cc $cond), (FCMPSrr $l, $r), $t, $f)>;
+def : Pat<(i64 (selectcc f64:$l, f64:$r, i64:$t, i64:$f, cond:$cond)),
+ (cmovrr<CMOVDrr> (fcond2cc $cond), (FCMPDrr $l, $r), $t, $f)>;
+def : Pat<(i64 (selectcc f128:$l, f128:$r, i64:$t, i64:$f, cond:$cond)),
+ (cmovrr<CMOVDrr> (fcond2cc $cond), (FCMPQrr $l, $r), $t, $f)>;
-// selectcc for f32 result
def : Pat<(f32 (selectcc i32:$l, i32:$r, f32:$t, f32:$f, CCSIOp:$cond)),
- (EXTRACT_SUBREG
- (CMOVWrr (icond2cc $cond),
- (CMPSWSXrr $l, $r),
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $t, sub_f32),
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $f, sub_f32)),
- sub_f32)>;
+ (cmov32rr<CMOVWrr, sub_f32> (icond2cc $cond), (CMPSWSXrr $l, $r),
+ $t, $f)>;
def : Pat<(f32 (selectcc i32:$l, i32:$r, f32:$t, f32:$f, CCUIOp:$cond)),
- (EXTRACT_SUBREG
- (CMOVWrr (icond2cc $cond),
- (CMPUWrr $l, $r),
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $t, sub_f32),
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $f, sub_f32)),
- sub_f32)>;
+ (cmov32rr<CMOVWrr, sub_f32> (icond2cc $cond), (CMPUWrr $l, $r),
+ $t, $f)>;
def : Pat<(f32 (selectcc i64:$l, i64:$r, f32:$t, f32:$f, CCSIOp:$cond)),
- (EXTRACT_SUBREG
- (CMOVLrr (icond2cc $cond),
- (CMPSLrr $l, $r),
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $t, sub_f32),
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $f, sub_f32)),
- sub_f32)>;
+ (cmov32rr<CMOVLrr, sub_f32> (icond2cc $cond), (CMPSLrr $l, $r),
+ $t, $f)>;
def : Pat<(f32 (selectcc i64:$l, i64:$r, f32:$t, f32:$f, CCUIOp:$cond)),
- (EXTRACT_SUBREG
- (CMOVLrr (icond2cc $cond),
- (CMPULrr $l, $r),
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $t, sub_f32),
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $f, sub_f32)),
- sub_f32)>;
+ (cmov32rr<CMOVLrr, sub_f32> (icond2cc $cond), (CMPULrr $l, $r),
+ $t, $f)>;
def : Pat<(f32 (selectcc f32:$l, f32:$r, f32:$t, f32:$f, cond:$cond)),
- (EXTRACT_SUBREG
- (CMOVSrr (fcond2cc $cond),
- (FCMPSrr $l, $r),
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $t, sub_f32),
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $f, sub_f32)),
- sub_f32)>;
+ (cmov32rr<CMOVSrr, sub_f32> (fcond2cc $cond), (FCMPSrr $l, $r),
+ $t, $f)>;
def : Pat<(f32 (selectcc f64:$l, f64:$r, f32:$t, f32:$f, cond:$cond)),
- (EXTRACT_SUBREG
- (CMOVDrr (fcond2cc $cond),
- (FCMPDrr $l, $r),
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $t, sub_f32),
- (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $f, sub_f32)),
- sub_f32)>;
+ (cmov32rr<CMOVDrr, sub_f32> (fcond2cc $cond), (FCMPDrr $l, $r),
+ $t, $f)>;
+def : Pat<(f32 (selectcc f128:$l, f128:$r, f32:$t, f32:$f, cond:$cond)),
+ (cmov32rr<CMOVDrr, sub_f32> (fcond2cc $cond), (FCMPQrr $l, $r),
+ $t, $f)>;
+
+def : Pat<(f64 (selectcc i32:$l, i32:$r, f64:$t, f64:$f, CCSIOp:$cond)),
+ (cmovrr<CMOVWrr> (icond2cc $cond), (CMPSWSXrr $l, $r), $t, $f)>;
+def : Pat<(f64 (selectcc i32:$l, i32:$r, f64:$t, f64:$f, CCUIOp:$cond)),
+ (cmovrr<CMOVWrr> (icond2cc $cond), (CMPUWrr $l, $r), $t, $f)>;
+def : Pat<(f64 (selectcc i64:$l, i64:$r, f64:$t, f64:$f, CCSIOp:$cond)),
+ (cmovrr<CMOVLrr> (icond2cc $cond), (CMPSLrr $l, $r), $t, $f)>;
+def : Pat<(f64 (selectcc i64:$l, i64:$r, f64:$t, f64:$f, CCUIOp:$cond)),
+ (cmovrr<CMOVLrr> (icond2cc $cond), (CMPULrr $l, $r), $t, $f)>;
+def : Pat<(f64 (selectcc f32:$l, f32:$r, f64:$t, f64:$f, cond:$cond)),
+ (cmovrr<CMOVSrr> (fcond2cc $cond), (FCMPSrr $l, $r), $t, $f)>;
+def : Pat<(f64 (selectcc f64:$l, f64:$r, f64:$t, f64:$f, cond:$cond)),
+ (cmovrr<CMOVDrr> (fcond2cc $cond), (FCMPDrr $l, $r), $t, $f)>;
+def : Pat<(f64 (selectcc f128:$l, f128:$r, f64:$t, f64:$f, cond:$cond)),
+ (cmovrr<CMOVDrr> (fcond2cc $cond), (FCMPQrr $l, $r), $t, $f)>;
+
+def : Pat<(f128 (selectcc i32:$l, i32:$r, f128:$t, f128:$f, CCSIOp:$cond)),
+ (cmov128rr<CMOVWrr> (icond2cc $cond), (CMPSWSXrr $l, $r), $t, $f)>;
+def : Pat<(f128 (selectcc i32:$l, i32:$r, f128:$t, f128:$f, CCUIOp:$cond)),
+ (cmov128rr<CMOVWrr> (icond2cc $cond), (CMPUWrr $l, $r), $t, $f)>;
+def : Pat<(f128 (selectcc i64:$l, i64:$r, f128:$t, f128:$f, CCSIOp:$cond)),
+ (cmov128rr<CMOVLrr> (icond2cc $cond), (CMPSLrr $l, $r), $t, $f)>;
+def : Pat<(f128 (selectcc i64:$l, i64:$r, f128:$t, f128:$f, CCUIOp:$cond)),
+ (cmov128rr<CMOVLrr> (icond2cc $cond), (CMPULrr $l, $r), $t, $f)>;
+def : Pat<(f128 (selectcc f32:$l, f32:$r, f128:$t, f128:$f, cond:$cond)),
+ (cmov128rr<CMOVSrr> (fcond2cc $cond), (FCMPSrr $l, $r), $t, $f)>;
+def : Pat<(f128 (selectcc f64:$l, f64:$r, f128:$t, f128:$f, cond:$cond)),
+ (cmov128rr<CMOVDrr> (fcond2cc $cond), (FCMPDrr $l, $r), $t, $f)>;
+def : Pat<(f128 (selectcc f128:$l, f128:$r, f128:$t, f128:$f, cond:$cond)),
+ (cmov128rr<CMOVDrr> (fcond2cc $cond), (FCMPQrr $l, $r), $t, $f)>;
// Generic SELECT pattern matches
// Use cmov.w for all cases since %pred holds i32.
//
// CMOV.w.ne %res, %tval, %tmp ; set tval if %tmp is true
+def : Pat<(i32 (select i32:$pred, i32:$t, i32:$f)),
+ (cmov32rr<CMOVWrr, sub_i32> CC_INE, $pred, $t, $f)>;
+def : Pat<(i32 (select i32:$pred, (i32 mimm:$t), i32:$f)),
+ (cmov32rm<CMOVWrm, sub_i32> CC_INE, $pred, $t, $f)>;
+def : Pat<(i32 (select i32:$pred, i32:$t, (i32 mimm:$f))),
+ (cmov32rm<CMOVWrm, sub_i32> CC_IEQ, $pred, $f, $t)>;
+
def : Pat<(i64 (select i32:$pred, i64:$t, i64:$f)),
- (CMOVWrr CC_INE, $pred, $t, $f)>;
+ (cmovrr<CMOVWrr> CC_INE, $pred, $t, $f)>;
+def : Pat<(i64 (select i32:$pred, (i64 mimm:$t), i64:$f)),
+ (cmovrm<CMOVWrm, MIMM> CC_INE, $pred, $t, $f)>;
+def : Pat<(i64 (select i32:$pred, i64:$t, (i64 mimm:$f))),
+ (cmovrm<CMOVWrm, MIMM> CC_IEQ, $pred, $f, $t)>;
-def : Pat<(i32 (select i32:$pred, i32:$t, i32:$f)),
- (EXTRACT_SUBREG
- (CMOVWrr CC_INE, $pred,
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $t, sub_i32),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $f, sub_i32)),
- sub_i32)>;
+def : Pat<(f32 (select i32:$pred, f32:$t, f32:$f)),
+ (cmov32rr<CMOVWrr, sub_f32> CC_INE, $pred, $t, $f)>;
+def : Pat<(f32 (select i32:$pred, (f32 mimmfp:$t), f32:$f)),
+ (cmov32rm<CMOVWrm, sub_f32, MIMMFP> CC_INE, $pred, $t, $f)>;
+def : Pat<(f32 (select i32:$pred, f32:$t, (f32 mimmfp:$f))),
+ (cmov32rm<CMOVWrm, sub_f32, MIMMFP> CC_IEQ, $pred, $f, $t)>;
def : Pat<(f64 (select i32:$pred, f64:$t, f64:$f)),
- (CMOVWrr CC_INE, $pred, $t, $f)>;
+ (cmovrr<CMOVWrr> CC_INE, $pred, $t, $f)>;
+def : Pat<(f64 (select i32:$pred, (f64 mimmfp:$t), f64:$f)),
+ (cmovrm<CMOVWrm, MIMMFP> CC_INE, $pred, $t, $f)>;
+def : Pat<(f64 (select i32:$pred, f64:$t, (f64 mimmfp:$f))),
+ (cmovrm<CMOVWrm, MIMMFP> CC_IEQ, $pred, $f, $t)>;
-def : Pat<(f32 (select i32:$pred, f32:$t, f32:$f)),
- (EXTRACT_SUBREG
- (CMOVWrr CC_INE, $pred,
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $t, sub_f32),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $f, sub_f32)),
- sub_f32)>;
+def : Pat<(f128 (select i32:$pred, f128:$t, f128:$f)),
+ (cmov128rr<CMOVWrr> CC_INE, $pred, $t, $f)>;
// bitconvert
def : Pat<(f64 (bitconvert i64:$src)), (COPY_TO_REGCLASS $src, I64)>;
@@ -1982,24 +2267,48 @@ def : Pat<(f32 (bitconvert i32:$op)),
(EXTRACT_SUBREG (SLLri (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
$op, sub_i32), 32), sub_f32)>;
-// Bits operations pattern matchings.
-def : Pat<(i32 (ctpop i32:$src)),
- (EXTRACT_SUBREG (PCNTr (ANDrm (INSERT_SUBREG
- (i64 (IMPLICIT_DEF)), $src, sub_i32), !add(32, 64))), sub_i32)>;
-def : Pat<(i32 (ctlz i32:$src)),
- (EXTRACT_SUBREG (LDZr (SLLri (INSERT_SUBREG
- (i64 (IMPLICIT_DEF)), $src, sub_i32), 32)), sub_i32)>;
-def : Pat<(i64 (bswap i64:$src)),
- (BSWPri $src, 0)>;
-def : Pat<(i32 (bswap i32:$src)),
- (EXTRACT_SUBREG (BSWPri (INSERT_SUBREG
- (i64 (IMPLICIT_DEF)), $src, sub_i32), 1), sub_i32)>;
+// Optimize code like A, generated from `(unsigned char)c << 5`, into B.
+// A) sla.w.sx %s0, %s0, 5
+// lea %s1, 224 ; 0xE0
+// and %s0, %s0, %s1
+// B) sla.w.sx %s0, %s0, 5
+// and %s0, %s0, (56)0
+
+def : Pat<(i32 (and i32:$val, 0xff)),
+ (EXTRACT_SUBREG
+ (ANDrm (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $val, sub_i32),
+ !add(56, 64)), sub_i32)>;
+def : Pat<(i32 (and i32:$val, 0xffff)),
+ (EXTRACT_SUBREG
+ (ANDrm (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $val, sub_i32),
+ !add(48, 64)), sub_i32)>;
+def : Pat<(i64 (and i64:$val, 0xffffffff)),
+ (ANDrm $val, !add(32, 64))>;
+
+//===----------------------------------------------------------------------===//
+// Vector Instruction Pattern Stuff
+//===----------------------------------------------------------------------===//
+
+// Custom intermediate ISDs.
+class IsVLVT<int OpIdx> : SDTCisVT<OpIdx,i32>;
+def vec_broadcast : SDNode<"VEISD::VEC_BROADCAST", SDTypeProfile<1, 2,
+ [SDTCisVec<0>, IsVLVT<2>]>>;
+
+// Whether this is an all-true mask (assuming undef-bits above VL are all-true).
+def true_mask : PatLeaf<
+ (vec_broadcast (i32 nonzero), (i32 srcvalue))>;
+// Match any broadcast (ignoring VL).
+def any_broadcast : PatFrag<(ops node:$sx),
+ (vec_broadcast node:$sx, (i32 srcvalue))>;
+
+// Vector instructions.
+include "VEInstrVec.td"
+
+// Patterns for the VE vector-length (ve_vl) intrinsics.
+include "VEInstrIntrinsicVL.td"
-// Several special pattern matches to optimize code
+// Patterns and intermediate SD nodes (VEC_*).
+include "VEInstrPatternsVec.td"
-def : Pat<(i32 (and i32:$lhs, 0xff)),
- (AND32rm $lhs, !add(56, 64))>;
-def : Pat<(i32 (and i32:$lhs, 0xffff)),
- (AND32rm $lhs, !add(48, 64))>;
-def : Pat<(i32 (and i32:$lhs, 0xffffffff)),
- (AND32rm $lhs, !add(32, 64))>;
+// Patterns and intermediate SD nodes (VVP_*).
+include "VVPInstrPatternsVec.td"
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VEInstrIntrinsicVL.gen.td b/contrib/llvm-project/llvm/lib/Target/VE/VEInstrIntrinsicVL.gen.td
new file mode 100644
index 000000000000..9ec10838db05
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VEInstrIntrinsicVL.gen.td
@@ -0,0 +1,1604 @@
+def : Pat<(int_ve_vl_vld_vssl i64:$sy, i64:$sz, i32:$vl), (VLDrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vld_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vld_vssl simm7:$I, i64:$sz, i32:$vl), (VLDirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vld_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldnc_vssl i64:$sy, i64:$sz, i32:$vl), (VLDNCrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldnc_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDNCrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldnc_vssl simm7:$I, i64:$sz, i32:$vl), (VLDNCirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldnc_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDNCirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldu_vssl i64:$sy, i64:$sz, i32:$vl), (VLDUrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldu_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDUrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldu_vssl simm7:$I, i64:$sz, i32:$vl), (VLDUirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldu_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDUirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldunc_vssl i64:$sy, i64:$sz, i32:$vl), (VLDUNCrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldunc_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDUNCrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldunc_vssl simm7:$I, i64:$sz, i32:$vl), (VLDUNCirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldunc_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDUNCirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldlsx_vssl i64:$sy, i64:$sz, i32:$vl), (VLDLSXrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldlsx_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDLSXrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldlsx_vssl simm7:$I, i64:$sz, i32:$vl), (VLDLSXirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldlsx_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDLSXirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldlsxnc_vssl i64:$sy, i64:$sz, i32:$vl), (VLDLSXNCrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldlsxnc_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDLSXNCrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldlsxnc_vssl simm7:$I, i64:$sz, i32:$vl), (VLDLSXNCirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldlsxnc_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDLSXNCirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldlzx_vssl i64:$sy, i64:$sz, i32:$vl), (VLDLZXrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldlzx_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDLZXrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldlzx_vssl simm7:$I, i64:$sz, i32:$vl), (VLDLZXirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldlzx_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDLZXirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldlzxnc_vssl i64:$sy, i64:$sz, i32:$vl), (VLDLZXNCrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldlzxnc_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDLZXNCrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldlzxnc_vssl simm7:$I, i64:$sz, i32:$vl), (VLDLZXNCirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldlzxnc_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDLZXNCirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vld2d_vssl i64:$sy, i64:$sz, i32:$vl), (VLD2Drrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vld2d_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLD2Drrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vld2d_vssl simm7:$I, i64:$sz, i32:$vl), (VLD2Dirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vld2d_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLD2Dirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vld2dnc_vssl i64:$sy, i64:$sz, i32:$vl), (VLD2DNCrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vld2dnc_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLD2DNCrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vld2dnc_vssl simm7:$I, i64:$sz, i32:$vl), (VLD2DNCirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vld2dnc_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLD2DNCirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldu2d_vssl i64:$sy, i64:$sz, i32:$vl), (VLDU2Drrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldu2d_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDU2Drrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldu2d_vssl simm7:$I, i64:$sz, i32:$vl), (VLDU2Dirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldu2d_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDU2Dirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldu2dnc_vssl i64:$sy, i64:$sz, i32:$vl), (VLDU2DNCrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldu2dnc_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDU2DNCrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldu2dnc_vssl simm7:$I, i64:$sz, i32:$vl), (VLDU2DNCirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldu2dnc_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDU2DNCirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldl2dsx_vssl i64:$sy, i64:$sz, i32:$vl), (VLDL2DSXrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldl2dsx_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDL2DSXrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldl2dsx_vssl simm7:$I, i64:$sz, i32:$vl), (VLDL2DSXirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldl2dsx_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDL2DSXirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldl2dsxnc_vssl i64:$sy, i64:$sz, i32:$vl), (VLDL2DSXNCrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldl2dsxnc_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDL2DSXNCrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldl2dsxnc_vssl simm7:$I, i64:$sz, i32:$vl), (VLDL2DSXNCirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldl2dsxnc_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDL2DSXNCirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldl2dzx_vssl i64:$sy, i64:$sz, i32:$vl), (VLDL2DZXrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldl2dzx_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDL2DZXrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldl2dzx_vssl simm7:$I, i64:$sz, i32:$vl), (VLDL2DZXirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldl2dzx_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDL2DZXirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldl2dzxnc_vssl i64:$sy, i64:$sz, i32:$vl), (VLDL2DZXNCrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldl2dzxnc_vssvl i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VLDL2DZXNCrrl_v i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vldl2dzxnc_vssl simm7:$I, i64:$sz, i32:$vl), (VLDL2DZXNCirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vldl2dzxnc_vssvl simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VLDL2DZXNCirl_v (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vst_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VSTrrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vst_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VSTirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vst_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSTrrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vst_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSTirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstnc_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VSTNCrrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstnc_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VSTNCirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstnc_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSTNCrrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstnc_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSTNCirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstot_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VSTOTrrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstot_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VSTOTirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstot_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSTOTrrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstot_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSTOTirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstncot_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VSTNCOTrrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstncot_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VSTNCOTirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstncot_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSTNCOTrrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstncot_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSTNCOTirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstu_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VSTUrrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstu_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VSTUirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstu_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSTUrrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstu_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSTUirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstunc_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VSTUNCrrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstunc_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VSTUNCirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstunc_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSTUNCrrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstunc_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSTUNCirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstuot_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VSTUOTrrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstuot_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VSTUOTirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstuot_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSTUOTrrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstuot_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSTUOTirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstuncot_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VSTUNCOTrrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstuncot_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VSTUNCOTirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstuncot_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSTUNCOTrrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstuncot_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSTUNCOTirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstl_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VSTLrrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstl_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VSTLirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstl_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSTLrrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstl_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSTLirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstlnc_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VSTLNCrrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstlnc_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VSTLNCirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstlnc_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSTLNCrrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstlnc_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSTLNCirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstlot_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VSTLOTrrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstlot_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VSTLOTirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstlot_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSTLOTrrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstlot_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSTLOTirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstlncot_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VSTLNCOTrrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstlncot_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VSTLNCOTirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstlncot_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSTLNCOTrrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstlncot_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSTLNCOTirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vst2d_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VST2Drrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vst2d_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VST2Dirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vst2d_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VST2Drrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vst2d_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VST2Dirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vst2dnc_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VST2DNCrrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vst2dnc_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VST2DNCirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vst2dnc_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VST2DNCrrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vst2dnc_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VST2DNCirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vst2dot_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VST2DOTrrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vst2dot_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VST2DOTirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vst2dot_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VST2DOTrrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vst2dot_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VST2DOTirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vst2dncot_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VST2DNCOTrrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vst2dncot_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VST2DNCOTirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vst2dncot_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VST2DNCOTrrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vst2dncot_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VST2DNCOTirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstu2d_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VSTU2Drrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstu2d_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VSTU2Dirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstu2d_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSTU2Drrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstu2d_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSTU2Dirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstu2dnc_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VSTU2DNCrrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstu2dnc_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VSTU2DNCirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstu2dnc_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSTU2DNCrrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstu2dnc_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSTU2DNCirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstu2dot_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VSTU2DOTrrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstu2dot_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VSTU2DOTirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstu2dot_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSTU2DOTrrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstu2dot_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSTU2DOTirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstu2dncot_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VSTU2DNCOTrrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstu2dncot_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VSTU2DNCOTirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstu2dncot_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSTU2DNCOTrrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstu2dncot_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSTU2DNCOTirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstl2d_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VSTL2Drrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstl2d_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VSTL2Dirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstl2d_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSTL2Drrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstl2d_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSTL2Dirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstl2dnc_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VSTL2DNCrrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstl2dnc_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VSTL2DNCirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstl2dnc_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSTL2DNCrrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstl2dnc_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSTL2DNCirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstl2dot_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VSTL2DOTrrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstl2dot_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VSTL2DOTirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstl2dot_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSTL2DOTrrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstl2dot_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSTL2DOTirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstl2dncot_vssl v256f64:$vx, i64:$sy, i64:$sz, i32:$vl), (VSTL2DNCOTrrvl i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstl2dncot_vssl v256f64:$vx, simm7:$I, i64:$sz, i32:$vl), (VSTL2DNCOTirvl (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vstl2dncot_vssml v256f64:$vx, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSTL2DNCOTrrvml i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vstl2dncot_vssml v256f64:$vx, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSTL2DNCOTirvml (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
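+// Vector prefetch: the int_ve_vl_pfchv* intrinsics below select PFCHV/PFCHVNC,
+// taking $sy either as a scalar register ("rr" forms) or as a 7-bit signed
+// immediate ("ir" forms, wrapped in LO7).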
+def : Pat<(int_ve_vl_pfchv_ssl i64:$sy, i64:$sz, i32:$vl), (PFCHVrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_pfchv_ssl simm7:$I, i64:$sz, i32:$vl), (PFCHVirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_pfchvnc_ssl i64:$sy, i64:$sz, i32:$vl), (PFCHVNCrrl i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_pfchvnc_ssl simm7:$I, i64:$sz, i32:$vl), (PFCHVNCirl (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_lvm_mmss v256i1:$ptm, uimm6:$N, i64:$sz), (LVMir_m (ULO7 $N), i64:$sz, v256i1:$ptm)>;
+def : Pat<(int_ve_vl_lvm_MMss v512i1:$ptm, uimm6:$N, i64:$sz), (LVMyir_y (ULO7 $N), i64:$sz, v512i1:$ptm)>;
+def : Pat<(int_ve_vl_svm_sms v256i1:$vmz, uimm6:$N), (SVMmi v256i1:$vmz, (ULO7 $N))>;
+def : Pat<(int_ve_vl_svm_sMs v512i1:$vmz, uimm6:$N), (SVMyi v512i1:$vmz, (ULO7 $N))>;
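+// Scalar-to-vector broadcasts (VBRD/VBRDU/VBRDL/PVBRD) and vector moves (VMV).
+// Operand-suffix convention in the selected instructions, as used throughout the
+// patterns below: "r" scalar register, "i" immediate (narrowed via LO7/ULO7),
+// "m" adds a mask ($vm), "l" takes the vector length ($vl), and "_v" variants
+// carry a pass-through vector ($pt).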
+def : Pat<(int_ve_vl_vbrdd_vsl f64:$sy, i32:$vl), (VBRDrl f64:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_vbrdd_vsvl f64:$sy, v256f64:$pt, i32:$vl), (VBRDrl_v f64:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vbrdd_vsmvl f64:$sy, v256i1:$vm, v256f64:$pt, i32:$vl), (VBRDrml_v f64:$sy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vbrdl_vsl i64:$sy, i32:$vl), (VBRDrl i64:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_vbrdl_vsvl i64:$sy, v256f64:$pt, i32:$vl), (VBRDrl_v i64:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vbrdl_vsmvl i64:$sy, v256i1:$vm, v256f64:$pt, i32:$vl), (VBRDrml_v i64:$sy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vbrdl_vsl simm7:$I, i32:$vl), (VBRDil (LO7 $I), i32:$vl)>;
+def : Pat<(int_ve_vl_vbrdl_vsvl simm7:$I, v256f64:$pt, i32:$vl), (VBRDil_v (LO7 $I), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vbrdl_vsmvl simm7:$I, v256i1:$vm, v256f64:$pt, i32:$vl), (VBRDiml_v (LO7 $I), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vbrds_vsl f32:$sy, i32:$vl), (VBRDUrl f32:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_vbrds_vsvl f32:$sy, v256f64:$pt, i32:$vl), (VBRDUrl_v f32:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vbrds_vsmvl f32:$sy, v256i1:$vm, v256f64:$pt, i32:$vl), (VBRDUrml_v f32:$sy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vbrdw_vsl i32:$sy, i32:$vl), (VBRDLrl i32:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_vbrdw_vsvl i32:$sy, v256f64:$pt, i32:$vl), (VBRDLrl_v i32:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vbrdw_vsmvl i32:$sy, v256i1:$vm, v256f64:$pt, i32:$vl), (VBRDLrml_v i32:$sy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vbrdw_vsl simm7:$I, i32:$vl), (VBRDLil (LO7 $I), i32:$vl)>;
+def : Pat<(int_ve_vl_vbrdw_vsvl simm7:$I, v256f64:$pt, i32:$vl), (VBRDLil_v (LO7 $I), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vbrdw_vsmvl simm7:$I, v256i1:$vm, v256f64:$pt, i32:$vl), (VBRDLiml_v (LO7 $I), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvbrd_vsl i64:$sy, i32:$vl), (PVBRDrl i64:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_pvbrd_vsvl i64:$sy, v256f64:$pt, i32:$vl), (PVBRDrl_v i64:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvbrd_vsMvl i64:$sy, v512i1:$vm, v256f64:$pt, i32:$vl), (PVBRDrml_v i64:$sy, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmv_vsvl uimm7:$N, v256f64:$vz, i32:$vl), (VMVivl (ULO7 $N), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmv_vsvvl uimm7:$N, v256f64:$vz, v256f64:$pt, i32:$vl), (VMVivl_v (ULO7 $N), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmv_vsvmvl uimm7:$N, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMVivml_v (ULO7 $N), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
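+// Element-wise integer arithmetic, compare, min/max, and logical patterns
+// (VADD*/VSUB*/VMUL*/VDIV*/VCMP*/VMAXS*/VMINS*/VAND and the packed PV* forms):
+// each intrinsic is matched in vector-vector and scalar-vector (register or
+// LO7 immediate) forms, with additional masked ("m") and pass-through ("_v")
+// variants following the convention noted above.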
+def : Pat<(int_ve_vl_vaddul_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VADDULvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vaddul_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VADDULvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddul_vsvl i64:$sy, v256f64:$vz, i32:$vl), (VADDULrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vaddul_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VADDULrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddul_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VADDULivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vaddul_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VADDULivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddul_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VADDULvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddul_vsvmvl i64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VADDULrvml_v i64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddul_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VADDULivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vadduw_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VADDUWvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vadduw_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VADDUWvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vadduw_vsvl i32:$sy, v256f64:$vz, i32:$vl), (VADDUWrvl i32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vadduw_vsvvl i32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VADDUWrvl_v i32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vadduw_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VADDUWivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vadduw_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VADDUWivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vadduw_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VADDUWvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vadduw_vsvmvl i32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VADDUWrvml_v i32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vadduw_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VADDUWivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvaddu_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (PVADDUvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvaddu_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVADDUvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvaddu_vsvl i64:$sy, v256f64:$vz, i32:$vl), (PVADDUrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvaddu_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVADDUrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvaddu_vvvMvl v256f64:$vy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVADDUvvml_v v256f64:$vy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvaddu_vsvMvl i64:$sy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVADDUrvml_v i64:$sy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddswsx_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VADDSWSXvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vaddswsx_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VADDSWSXvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddswsx_vsvl i32:$sy, v256f64:$vz, i32:$vl), (VADDSWSXrvl i32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vaddswsx_vsvvl i32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VADDSWSXrvl_v i32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddswsx_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VADDSWSXivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vaddswsx_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VADDSWSXivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddswsx_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VADDSWSXvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddswsx_vsvmvl i32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VADDSWSXrvml_v i32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddswsx_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VADDSWSXivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddswzx_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VADDSWZXvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vaddswzx_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VADDSWZXvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddswzx_vsvl i32:$sy, v256f64:$vz, i32:$vl), (VADDSWZXrvl i32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vaddswzx_vsvvl i32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VADDSWZXrvl_v i32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddswzx_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VADDSWZXivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vaddswzx_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VADDSWZXivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddswzx_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VADDSWZXvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddswzx_vsvmvl i32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VADDSWZXrvml_v i32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddswzx_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VADDSWZXivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvadds_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (PVADDSvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvadds_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVADDSvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvadds_vsvl i64:$sy, v256f64:$vz, i32:$vl), (PVADDSrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvadds_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVADDSrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvadds_vvvMvl v256f64:$vy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVADDSvvml_v v256f64:$vy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvadds_vsvMvl i64:$sy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVADDSrvml_v i64:$sy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddsl_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VADDSLvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vaddsl_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VADDSLvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddsl_vsvl i64:$sy, v256f64:$vz, i32:$vl), (VADDSLrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vaddsl_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VADDSLrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddsl_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VADDSLivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vaddsl_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VADDSLivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddsl_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VADDSLvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddsl_vsvmvl i64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VADDSLrvml_v i64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vaddsl_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VADDSLivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubul_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VSUBULvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vsubul_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VSUBULvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubul_vsvl i64:$sy, v256f64:$vz, i32:$vl), (VSUBULrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vsubul_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VSUBULrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubul_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VSUBULivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vsubul_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VSUBULivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubul_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VSUBULvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubul_vsvmvl i64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VSUBULrvml_v i64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubul_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VSUBULivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubuw_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VSUBUWvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vsubuw_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VSUBUWvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubuw_vsvl i32:$sy, v256f64:$vz, i32:$vl), (VSUBUWrvl i32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vsubuw_vsvvl i32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VSUBUWrvl_v i32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubuw_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VSUBUWivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vsubuw_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VSUBUWivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubuw_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VSUBUWvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubuw_vsvmvl i32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VSUBUWrvml_v i32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubuw_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VSUBUWivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsubu_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (PVSUBUvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvsubu_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVSUBUvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsubu_vsvl i64:$sy, v256f64:$vz, i32:$vl), (PVSUBUrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvsubu_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVSUBUrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsubu_vvvMvl v256f64:$vy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVSUBUvvml_v v256f64:$vy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsubu_vsvMvl i64:$sy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVSUBUrvml_v i64:$sy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubswsx_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VSUBSWSXvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vsubswsx_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VSUBSWSXvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubswsx_vsvl i32:$sy, v256f64:$vz, i32:$vl), (VSUBSWSXrvl i32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vsubswsx_vsvvl i32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VSUBSWSXrvl_v i32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubswsx_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VSUBSWSXivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vsubswsx_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VSUBSWSXivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubswsx_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VSUBSWSXvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubswsx_vsvmvl i32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VSUBSWSXrvml_v i32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubswsx_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VSUBSWSXivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubswzx_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VSUBSWZXvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vsubswzx_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VSUBSWZXvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubswzx_vsvl i32:$sy, v256f64:$vz, i32:$vl), (VSUBSWZXrvl i32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vsubswzx_vsvvl i32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VSUBSWZXrvl_v i32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubswzx_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VSUBSWZXivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vsubswzx_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VSUBSWZXivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubswzx_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VSUBSWZXvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubswzx_vsvmvl i32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VSUBSWZXrvml_v i32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubswzx_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VSUBSWZXivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsubs_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (PVSUBSvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvsubs_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVSUBSvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsubs_vsvl i64:$sy, v256f64:$vz, i32:$vl), (PVSUBSrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvsubs_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVSUBSrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsubs_vvvMvl v256f64:$vy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVSUBSvvml_v v256f64:$vy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsubs_vsvMvl i64:$sy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVSUBSrvml_v i64:$sy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubsl_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VSUBSLvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vsubsl_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VSUBSLvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubsl_vsvl i64:$sy, v256f64:$vz, i32:$vl), (VSUBSLrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vsubsl_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VSUBSLrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubsl_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VSUBSLivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vsubsl_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VSUBSLivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubsl_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VSUBSLvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubsl_vsvmvl i64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VSUBSLrvml_v i64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsubsl_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VSUBSLivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulul_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VMULULvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmulul_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMULULvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulul_vsvl i64:$sy, v256f64:$vz, i32:$vl), (VMULULrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmulul_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMULULrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulul_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VMULULivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmulul_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VMULULivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulul_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMULULvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulul_vsvmvl i64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMULULrvml_v i64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulul_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMULULivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmuluw_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VMULUWvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmuluw_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMULUWvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmuluw_vsvl i32:$sy, v256f64:$vz, i32:$vl), (VMULUWrvl i32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmuluw_vsvvl i32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMULUWrvl_v i32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmuluw_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VMULUWivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmuluw_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VMULUWivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmuluw_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMULUWvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmuluw_vsvmvl i32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMULUWrvml_v i32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmuluw_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMULUWivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulswsx_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VMULSWSXvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmulswsx_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMULSWSXvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulswsx_vsvl i32:$sy, v256f64:$vz, i32:$vl), (VMULSWSXrvl i32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmulswsx_vsvvl i32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMULSWSXrvl_v i32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulswsx_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VMULSWSXivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmulswsx_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VMULSWSXivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulswsx_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMULSWSXvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulswsx_vsvmvl i32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMULSWSXrvml_v i32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulswsx_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMULSWSXivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulswzx_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VMULSWZXvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmulswzx_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMULSWZXvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulswzx_vsvl i32:$sy, v256f64:$vz, i32:$vl), (VMULSWZXrvl i32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmulswzx_vsvvl i32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMULSWZXrvl_v i32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulswzx_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VMULSWZXivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmulswzx_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VMULSWZXivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulswzx_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMULSWZXvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulswzx_vsvmvl i32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMULSWZXrvml_v i32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulswzx_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMULSWZXivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulsl_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VMULSLvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmulsl_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMULSLvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulsl_vsvl i64:$sy, v256f64:$vz, i32:$vl), (VMULSLrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmulsl_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMULSLrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulsl_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VMULSLivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmulsl_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VMULSLivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulsl_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMULSLvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulsl_vsvmvl i64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMULSLrvml_v i64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulsl_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMULSLivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulslw_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VMULSLWvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmulslw_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMULSLWvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulslw_vsvl i32:$sy, v256f64:$vz, i32:$vl), (VMULSLWrvl i32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmulslw_vsvvl i32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMULSLWrvl_v i32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmulslw_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VMULSLWivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmulslw_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VMULSLWivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivul_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VDIVULvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vdivul_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VDIVULvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivul_vsvl i64:$sy, v256f64:$vz, i32:$vl), (VDIVULrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vdivul_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VDIVULrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivul_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VDIVULivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vdivul_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VDIVULivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivul_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVULvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivul_vsvmvl i64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVULrvml_v i64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivul_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVULivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivuw_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VDIVUWvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vdivuw_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VDIVUWvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivuw_vsvl i32:$sy, v256f64:$vz, i32:$vl), (VDIVUWrvl i32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vdivuw_vsvvl i32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VDIVUWrvl_v i32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivuw_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VDIVUWivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vdivuw_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VDIVUWivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivuw_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVUWvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivuw_vsvmvl i32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVUWrvml_v i32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivuw_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVUWivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivul_vvsl v256f64:$vy, i64:$sy, i32:$vl), (VDIVULvrl v256f64:$vy, i64:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_vdivul_vvsvl v256f64:$vy, i64:$sy, v256f64:$pt, i32:$vl), (VDIVULvrl_v v256f64:$vy, i64:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivul_vvsl v256f64:$vy, simm7:$I, i32:$vl), (VDIVULvil v256f64:$vy, (LO7 $I), i32:$vl)>;
+def : Pat<(int_ve_vl_vdivul_vvsvl v256f64:$vy, simm7:$I, v256f64:$pt, i32:$vl), (VDIVULvil_v v256f64:$vy, (LO7 $I), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivul_vvsmvl v256f64:$vy, i64:$sy, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVULvrml_v v256f64:$vy, i64:$sy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivul_vvsmvl v256f64:$vy, simm7:$I, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVULviml_v v256f64:$vy, (LO7 $I), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivuw_vvsl v256f64:$vy, i32:$sy, i32:$vl), (VDIVUWvrl v256f64:$vy, i32:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_vdivuw_vvsvl v256f64:$vy, i32:$sy, v256f64:$pt, i32:$vl), (VDIVUWvrl_v v256f64:$vy, i32:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivuw_vvsl v256f64:$vy, simm7:$I, i32:$vl), (VDIVUWvil v256f64:$vy, (LO7 $I), i32:$vl)>;
+def : Pat<(int_ve_vl_vdivuw_vvsvl v256f64:$vy, simm7:$I, v256f64:$pt, i32:$vl), (VDIVUWvil_v v256f64:$vy, (LO7 $I), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivuw_vvsmvl v256f64:$vy, i32:$sy, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVUWvrml_v v256f64:$vy, i32:$sy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivuw_vvsmvl v256f64:$vy, simm7:$I, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVUWviml_v v256f64:$vy, (LO7 $I), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivswsx_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VDIVSWSXvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vdivswsx_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VDIVSWSXvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivswsx_vsvl i32:$sy, v256f64:$vz, i32:$vl), (VDIVSWSXrvl i32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vdivswsx_vsvvl i32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VDIVSWSXrvl_v i32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivswsx_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VDIVSWSXivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vdivswsx_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VDIVSWSXivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivswsx_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVSWSXvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivswsx_vsvmvl i32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVSWSXrvml_v i32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivswsx_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVSWSXivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivswzx_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VDIVSWZXvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vdivswzx_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VDIVSWZXvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivswzx_vsvl i32:$sy, v256f64:$vz, i32:$vl), (VDIVSWZXrvl i32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vdivswzx_vsvvl i32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VDIVSWZXrvl_v i32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivswzx_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VDIVSWZXivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vdivswzx_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VDIVSWZXivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivswzx_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVSWZXvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivswzx_vsvmvl i32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVSWZXrvml_v i32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivswzx_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVSWZXivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivswsx_vvsl v256f64:$vy, i32:$sy, i32:$vl), (VDIVSWSXvrl v256f64:$vy, i32:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_vdivswsx_vvsvl v256f64:$vy, i32:$sy, v256f64:$pt, i32:$vl), (VDIVSWSXvrl_v v256f64:$vy, i32:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivswsx_vvsl v256f64:$vy, simm7:$I, i32:$vl), (VDIVSWSXvil v256f64:$vy, (LO7 $I), i32:$vl)>;
+def : Pat<(int_ve_vl_vdivswsx_vvsvl v256f64:$vy, simm7:$I, v256f64:$pt, i32:$vl), (VDIVSWSXvil_v v256f64:$vy, (LO7 $I), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivswsx_vvsmvl v256f64:$vy, i32:$sy, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVSWSXvrml_v v256f64:$vy, i32:$sy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivswsx_vvsmvl v256f64:$vy, simm7:$I, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVSWSXviml_v v256f64:$vy, (LO7 $I), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivswzx_vvsl v256f64:$vy, i32:$sy, i32:$vl), (VDIVSWZXvrl v256f64:$vy, i32:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_vdivswzx_vvsvl v256f64:$vy, i32:$sy, v256f64:$pt, i32:$vl), (VDIVSWZXvrl_v v256f64:$vy, i32:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivswzx_vvsl v256f64:$vy, simm7:$I, i32:$vl), (VDIVSWZXvil v256f64:$vy, (LO7 $I), i32:$vl)>;
+def : Pat<(int_ve_vl_vdivswzx_vvsvl v256f64:$vy, simm7:$I, v256f64:$pt, i32:$vl), (VDIVSWZXvil_v v256f64:$vy, (LO7 $I), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivswzx_vvsmvl v256f64:$vy, i32:$sy, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVSWZXvrml_v v256f64:$vy, i32:$sy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivswzx_vvsmvl v256f64:$vy, simm7:$I, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVSWZXviml_v v256f64:$vy, (LO7 $I), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivsl_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VDIVSLvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vdivsl_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VDIVSLvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivsl_vsvl i64:$sy, v256f64:$vz, i32:$vl), (VDIVSLrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vdivsl_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VDIVSLrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivsl_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VDIVSLivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vdivsl_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VDIVSLivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivsl_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVSLvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivsl_vsvmvl i64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVSLrvml_v i64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivsl_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVSLivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivsl_vvsl v256f64:$vy, i64:$sy, i32:$vl), (VDIVSLvrl v256f64:$vy, i64:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_vdivsl_vvsvl v256f64:$vy, i64:$sy, v256f64:$pt, i32:$vl), (VDIVSLvrl_v v256f64:$vy, i64:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivsl_vvsl v256f64:$vy, simm7:$I, i32:$vl), (VDIVSLvil v256f64:$vy, (LO7 $I), i32:$vl)>;
+def : Pat<(int_ve_vl_vdivsl_vvsvl v256f64:$vy, simm7:$I, v256f64:$pt, i32:$vl), (VDIVSLvil_v v256f64:$vy, (LO7 $I), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivsl_vvsmvl v256f64:$vy, i64:$sy, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVSLvrml_v v256f64:$vy, i64:$sy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vdivsl_vvsmvl v256f64:$vy, simm7:$I, v256i1:$vm, v256f64:$pt, i32:$vl), (VDIVSLviml_v v256f64:$vy, (LO7 $I), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpul_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VCMPULvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vcmpul_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VCMPULvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpul_vsvl i64:$sy, v256f64:$vz, i32:$vl), (VCMPULrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vcmpul_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VCMPULrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpul_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VCMPULivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vcmpul_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VCMPULivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpul_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VCMPULvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpul_vsvmvl i64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VCMPULrvml_v i64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpul_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VCMPULivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpuw_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VCMPUWvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vcmpuw_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VCMPUWvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpuw_vsvl i32:$sy, v256f64:$vz, i32:$vl), (VCMPUWrvl i32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vcmpuw_vsvvl i32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VCMPUWrvl_v i32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpuw_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VCMPUWivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vcmpuw_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VCMPUWivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpuw_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VCMPUWvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpuw_vsvmvl i32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VCMPUWrvml_v i32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpuw_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VCMPUWivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvcmpu_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (PVCMPUvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvcmpu_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVCMPUvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvcmpu_vsvl i64:$sy, v256f64:$vz, i32:$vl), (PVCMPUrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvcmpu_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVCMPUrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvcmpu_vvvMvl v256f64:$vy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVCMPUvvml_v v256f64:$vy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvcmpu_vsvMvl i64:$sy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVCMPUrvml_v i64:$sy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpswsx_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VCMPSWSXvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vcmpswsx_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VCMPSWSXvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpswsx_vsvl i32:$sy, v256f64:$vz, i32:$vl), (VCMPSWSXrvl i32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vcmpswsx_vsvvl i32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VCMPSWSXrvl_v i32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpswsx_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VCMPSWSXivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vcmpswsx_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VCMPSWSXivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpswsx_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VCMPSWSXvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpswsx_vsvmvl i32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VCMPSWSXrvml_v i32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpswsx_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VCMPSWSXivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpswzx_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VCMPSWZXvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vcmpswzx_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VCMPSWZXvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpswzx_vsvl i32:$sy, v256f64:$vz, i32:$vl), (VCMPSWZXrvl i32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vcmpswzx_vsvvl i32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VCMPSWZXrvl_v i32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpswzx_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VCMPSWZXivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vcmpswzx_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VCMPSWZXivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpswzx_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VCMPSWZXvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpswzx_vsvmvl i32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VCMPSWZXrvml_v i32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpswzx_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VCMPSWZXivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvcmps_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (PVCMPSvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvcmps_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVCMPSvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvcmps_vsvl i64:$sy, v256f64:$vz, i32:$vl), (PVCMPSrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvcmps_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVCMPSrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvcmps_vvvMvl v256f64:$vy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVCMPSvvml_v v256f64:$vy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvcmps_vsvMvl i64:$sy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVCMPSrvml_v i64:$sy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpsl_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VCMPSLvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vcmpsl_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VCMPSLvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpsl_vsvl i64:$sy, v256f64:$vz, i32:$vl), (VCMPSLrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vcmpsl_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VCMPSLrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpsl_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VCMPSLivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vcmpsl_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VCMPSLivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpsl_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VCMPSLvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpsl_vsvmvl i64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VCMPSLrvml_v i64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcmpsl_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VCMPSLivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmaxswsx_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VMAXSWSXvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmaxswsx_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMAXSWSXvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmaxswsx_vsvl i32:$sy, v256f64:$vz, i32:$vl), (VMAXSWSXrvl i32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmaxswsx_vsvvl i32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMAXSWSXrvl_v i32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmaxswsx_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VMAXSWSXivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmaxswsx_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VMAXSWSXivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmaxswsx_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMAXSWSXvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmaxswsx_vsvmvl i32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMAXSWSXrvml_v i32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmaxswsx_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMAXSWSXivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmaxswzx_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VMAXSWZXvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmaxswzx_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMAXSWZXvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmaxswzx_vsvl i32:$sy, v256f64:$vz, i32:$vl), (VMAXSWZXrvl i32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmaxswzx_vsvvl i32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMAXSWZXrvl_v i32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmaxswzx_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VMAXSWZXivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmaxswzx_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VMAXSWZXivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmaxswzx_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMAXSWZXvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmaxswzx_vsvmvl i32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMAXSWZXrvml_v i32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmaxswzx_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMAXSWZXivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvmaxs_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (PVMAXSvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvmaxs_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVMAXSvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvmaxs_vsvl i64:$sy, v256f64:$vz, i32:$vl), (PVMAXSrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvmaxs_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVMAXSrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvmaxs_vvvMvl v256f64:$vy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVMAXSvvml_v v256f64:$vy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvmaxs_vsvMvl i64:$sy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVMAXSrvml_v i64:$sy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vminswsx_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VMINSWSXvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vminswsx_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMINSWSXvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vminswsx_vsvl i32:$sy, v256f64:$vz, i32:$vl), (VMINSWSXrvl i32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vminswsx_vsvvl i32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMINSWSXrvl_v i32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vminswsx_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VMINSWSXivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vminswsx_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VMINSWSXivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vminswsx_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMINSWSXvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vminswsx_vsvmvl i32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMINSWSXrvml_v i32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vminswsx_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMINSWSXivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vminswzx_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VMINSWZXvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vminswzx_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMINSWZXvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vminswzx_vsvl i32:$sy, v256f64:$vz, i32:$vl), (VMINSWZXrvl i32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vminswzx_vsvvl i32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMINSWZXrvl_v i32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vminswzx_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VMINSWZXivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vminswzx_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VMINSWZXivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vminswzx_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMINSWZXvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vminswzx_vsvmvl i32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMINSWZXrvml_v i32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vminswzx_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMINSWZXivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvmins_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (PVMINSvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvmins_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVMINSvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvmins_vsvl i64:$sy, v256f64:$vz, i32:$vl), (PVMINSrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvmins_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVMINSrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvmins_vvvMvl v256f64:$vy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVMINSvvml_v v256f64:$vy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvmins_vsvMvl i64:$sy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVMINSrvml_v i64:$sy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmaxsl_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VMAXSLvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmaxsl_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMAXSLvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmaxsl_vsvl i64:$sy, v256f64:$vz, i32:$vl), (VMAXSLrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmaxsl_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMAXSLrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmaxsl_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VMAXSLivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmaxsl_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VMAXSLivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmaxsl_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMAXSLvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmaxsl_vsvmvl i64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMAXSLrvml_v i64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmaxsl_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMAXSLivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vminsl_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VMINSLvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vminsl_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMINSLvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vminsl_vsvl i64:$sy, v256f64:$vz, i32:$vl), (VMINSLrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vminsl_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VMINSLrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vminsl_vsvl simm7:$I, v256f64:$vz, i32:$vl), (VMINSLivl (LO7 $I), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vminsl_vsvvl simm7:$I, v256f64:$vz, v256f64:$pt, i32:$vl), (VMINSLivl_v (LO7 $I), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vminsl_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMINSLvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vminsl_vsvmvl i64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMINSLrvml_v i64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vminsl_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMINSLivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vand_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VANDvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vand_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VANDvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vand_vsvl i64:$sy, v256f64:$vz, i32:$vl), (VANDrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vand_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VANDrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vand_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VANDvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vand_vsvmvl i64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VANDrvml_v i64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvand_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (PVANDvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvand_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVANDvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvand_vsvl i64:$sy, v256f64:$vz, i32:$vl), (PVANDrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvand_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVANDrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvand_vvvMvl v256f64:$vy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVANDvvml_v v256f64:$vy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvand_vsvMvl i64:$sy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVANDrvml_v i64:$sy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vor_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VORvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vor_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VORvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vor_vsvl i64:$sy, v256f64:$vz, i32:$vl), (VORrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vor_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VORrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vor_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VORvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vor_vsvmvl i64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VORrvml_v i64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvor_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (PVORvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvor_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVORvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvor_vsvl i64:$sy, v256f64:$vz, i32:$vl), (PVORrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvor_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVORrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvor_vvvMvl v256f64:$vy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVORvvml_v v256f64:$vy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvor_vsvMvl i64:$sy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVORrvml_v i64:$sy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vxor_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VXORvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vxor_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VXORvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vxor_vsvl i64:$sy, v256f64:$vz, i32:$vl), (VXORrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vxor_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VXORrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vxor_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VXORvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vxor_vsvmvl i64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VXORrvml_v i64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvxor_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (PVXORvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvxor_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVXORvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvxor_vsvl i64:$sy, v256f64:$vz, i32:$vl), (PVXORrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvxor_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVXORrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvxor_vvvMvl v256f64:$vy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVXORvvml_v v256f64:$vy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvxor_vsvMvl i64:$sy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVXORrvml_v i64:$sy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_veqv_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VEQVvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_veqv_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VEQVvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_veqv_vsvl i64:$sy, v256f64:$vz, i32:$vl), (VEQVrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_veqv_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VEQVrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_veqv_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VEQVvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_veqv_vsvmvl i64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VEQVrvml_v i64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pveqv_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (PVEQVvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pveqv_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVEQVvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pveqv_vsvl i64:$sy, v256f64:$vz, i32:$vl), (PVEQVrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pveqv_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVEQVrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pveqv_vvvMvl v256f64:$vy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVEQVvvml_v v256f64:$vy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pveqv_vsvMvl i64:$sy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVEQVrvml_v i64:$sy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vseq_vl i32:$vl), (VSEQl i32:$vl)>;
+def : Pat<(int_ve_vl_vseq_vvl v256f64:$pt, i32:$vl), (VSEQl_v i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvseqlo_vl i32:$vl), (PVSEQLOl i32:$vl)>;
+def : Pat<(int_ve_vl_pvseqlo_vvl v256f64:$pt, i32:$vl), (PVSEQLOl_v i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsequp_vl i32:$vl), (PVSEQUPl i32:$vl)>;
+def : Pat<(int_ve_vl_pvsequp_vvl v256f64:$pt, i32:$vl), (PVSEQUPl_v i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvseq_vl i32:$vl), (PVSEQl i32:$vl)>;
+def : Pat<(int_ve_vl_pvseq_vvl v256f64:$pt, i32:$vl), (PVSEQl_v i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsll_vvvl v256f64:$vz, v256f64:$vy, i32:$vl), (VSLLvvl v256f64:$vz, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vsll_vvvvl v256f64:$vz, v256f64:$vy, v256f64:$pt, i32:$vl), (VSLLvvl_v v256f64:$vz, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsll_vvsl v256f64:$vz, i64:$sy, i32:$vl), (VSLLvrl v256f64:$vz, i64:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_vsll_vvsvl v256f64:$vz, i64:$sy, v256f64:$pt, i32:$vl), (VSLLvrl_v v256f64:$vz, i64:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsll_vvsl v256f64:$vz, uimm6:$N, i32:$vl), (VSLLvil v256f64:$vz, (ULO7 $N), i32:$vl)>;
+def : Pat<(int_ve_vl_vsll_vvsvl v256f64:$vz, uimm6:$N, v256f64:$pt, i32:$vl), (VSLLvil_v v256f64:$vz, (ULO7 $N), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsll_vvvmvl v256f64:$vz, v256f64:$vy, v256i1:$vm, v256f64:$pt, i32:$vl), (VSLLvvml_v v256f64:$vz, v256f64:$vy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsll_vvsmvl v256f64:$vz, i64:$sy, v256i1:$vm, v256f64:$pt, i32:$vl), (VSLLvrml_v v256f64:$vz, i64:$sy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsll_vvsmvl v256f64:$vz, uimm6:$N, v256i1:$vm, v256f64:$pt, i32:$vl), (VSLLviml_v v256f64:$vz, (ULO7 $N), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsll_vvvl v256f64:$vz, v256f64:$vy, i32:$vl), (PVSLLvvl v256f64:$vz, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_pvsll_vvvvl v256f64:$vz, v256f64:$vy, v256f64:$pt, i32:$vl), (PVSLLvvl_v v256f64:$vz, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsll_vvsl v256f64:$vz, i64:$sy, i32:$vl), (PVSLLvrl v256f64:$vz, i64:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_pvsll_vvsvl v256f64:$vz, i64:$sy, v256f64:$pt, i32:$vl), (PVSLLvrl_v v256f64:$vz, i64:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsll_vvvMvl v256f64:$vz, v256f64:$vy, v512i1:$vm, v256f64:$pt, i32:$vl), (PVSLLvvml_v v256f64:$vz, v256f64:$vy, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsll_vvsMvl v256f64:$vz, i64:$sy, v512i1:$vm, v256f64:$pt, i32:$vl), (PVSLLvrml_v v256f64:$vz, i64:$sy, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsrl_vvvl v256f64:$vz, v256f64:$vy, i32:$vl), (VSRLvvl v256f64:$vz, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vsrl_vvvvl v256f64:$vz, v256f64:$vy, v256f64:$pt, i32:$vl), (VSRLvvl_v v256f64:$vz, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsrl_vvsl v256f64:$vz, i64:$sy, i32:$vl), (VSRLvrl v256f64:$vz, i64:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_vsrl_vvsvl v256f64:$vz, i64:$sy, v256f64:$pt, i32:$vl), (VSRLvrl_v v256f64:$vz, i64:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsrl_vvsl v256f64:$vz, uimm6:$N, i32:$vl), (VSRLvil v256f64:$vz, (ULO7 $N), i32:$vl)>;
+def : Pat<(int_ve_vl_vsrl_vvsvl v256f64:$vz, uimm6:$N, v256f64:$pt, i32:$vl), (VSRLvil_v v256f64:$vz, (ULO7 $N), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsrl_vvvmvl v256f64:$vz, v256f64:$vy, v256i1:$vm, v256f64:$pt, i32:$vl), (VSRLvvml_v v256f64:$vz, v256f64:$vy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsrl_vvsmvl v256f64:$vz, i64:$sy, v256i1:$vm, v256f64:$pt, i32:$vl), (VSRLvrml_v v256f64:$vz, i64:$sy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsrl_vvsmvl v256f64:$vz, uimm6:$N, v256i1:$vm, v256f64:$pt, i32:$vl), (VSRLviml_v v256f64:$vz, (ULO7 $N), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsrl_vvvl v256f64:$vz, v256f64:$vy, i32:$vl), (PVSRLvvl v256f64:$vz, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_pvsrl_vvvvl v256f64:$vz, v256f64:$vy, v256f64:$pt, i32:$vl), (PVSRLvvl_v v256f64:$vz, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsrl_vvsl v256f64:$vz, i64:$sy, i32:$vl), (PVSRLvrl v256f64:$vz, i64:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_pvsrl_vvsvl v256f64:$vz, i64:$sy, v256f64:$pt, i32:$vl), (PVSRLvrl_v v256f64:$vz, i64:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsrl_vvvMvl v256f64:$vz, v256f64:$vy, v512i1:$vm, v256f64:$pt, i32:$vl), (PVSRLvvml_v v256f64:$vz, v256f64:$vy, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsrl_vvsMvl v256f64:$vz, i64:$sy, v512i1:$vm, v256f64:$pt, i32:$vl), (PVSRLvrml_v v256f64:$vz, i64:$sy, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vslawsx_vvvl v256f64:$vz, v256f64:$vy, i32:$vl), (VSLAWSXvvl v256f64:$vz, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vslawsx_vvvvl v256f64:$vz, v256f64:$vy, v256f64:$pt, i32:$vl), (VSLAWSXvvl_v v256f64:$vz, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vslawsx_vvsl v256f64:$vz, i32:$sy, i32:$vl), (VSLAWSXvrl v256f64:$vz, i32:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_vslawsx_vvsvl v256f64:$vz, i32:$sy, v256f64:$pt, i32:$vl), (VSLAWSXvrl_v v256f64:$vz, i32:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vslawsx_vvsl v256f64:$vz, uimm6:$N, i32:$vl), (VSLAWSXvil v256f64:$vz, (ULO7 $N), i32:$vl)>;
+def : Pat<(int_ve_vl_vslawsx_vvsvl v256f64:$vz, uimm6:$N, v256f64:$pt, i32:$vl), (VSLAWSXvil_v v256f64:$vz, (ULO7 $N), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vslawsx_vvvmvl v256f64:$vz, v256f64:$vy, v256i1:$vm, v256f64:$pt, i32:$vl), (VSLAWSXvvml_v v256f64:$vz, v256f64:$vy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vslawsx_vvsmvl v256f64:$vz, i32:$sy, v256i1:$vm, v256f64:$pt, i32:$vl), (VSLAWSXvrml_v v256f64:$vz, i32:$sy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vslawsx_vvsmvl v256f64:$vz, uimm6:$N, v256i1:$vm, v256f64:$pt, i32:$vl), (VSLAWSXviml_v v256f64:$vz, (ULO7 $N), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vslawzx_vvvl v256f64:$vz, v256f64:$vy, i32:$vl), (VSLAWZXvvl v256f64:$vz, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vslawzx_vvvvl v256f64:$vz, v256f64:$vy, v256f64:$pt, i32:$vl), (VSLAWZXvvl_v v256f64:$vz, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vslawzx_vvsl v256f64:$vz, i32:$sy, i32:$vl), (VSLAWZXvrl v256f64:$vz, i32:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_vslawzx_vvsvl v256f64:$vz, i32:$sy, v256f64:$pt, i32:$vl), (VSLAWZXvrl_v v256f64:$vz, i32:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vslawzx_vvsl v256f64:$vz, uimm6:$N, i32:$vl), (VSLAWZXvil v256f64:$vz, (ULO7 $N), i32:$vl)>;
+def : Pat<(int_ve_vl_vslawzx_vvsvl v256f64:$vz, uimm6:$N, v256f64:$pt, i32:$vl), (VSLAWZXvil_v v256f64:$vz, (ULO7 $N), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vslawzx_vvvmvl v256f64:$vz, v256f64:$vy, v256i1:$vm, v256f64:$pt, i32:$vl), (VSLAWZXvvml_v v256f64:$vz, v256f64:$vy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vslawzx_vvsmvl v256f64:$vz, i32:$sy, v256i1:$vm, v256f64:$pt, i32:$vl), (VSLAWZXvrml_v v256f64:$vz, i32:$sy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vslawzx_vvsmvl v256f64:$vz, uimm6:$N, v256i1:$vm, v256f64:$pt, i32:$vl), (VSLAWZXviml_v v256f64:$vz, (ULO7 $N), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsla_vvvl v256f64:$vz, v256f64:$vy, i32:$vl), (PVSLAvvl v256f64:$vz, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_pvsla_vvvvl v256f64:$vz, v256f64:$vy, v256f64:$pt, i32:$vl), (PVSLAvvl_v v256f64:$vz, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsla_vvsl v256f64:$vz, i64:$sy, i32:$vl), (PVSLAvrl v256f64:$vz, i64:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_pvsla_vvsvl v256f64:$vz, i64:$sy, v256f64:$pt, i32:$vl), (PVSLAvrl_v v256f64:$vz, i64:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsla_vvvMvl v256f64:$vz, v256f64:$vy, v512i1:$vm, v256f64:$pt, i32:$vl), (PVSLAvvml_v v256f64:$vz, v256f64:$vy, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsla_vvsMvl v256f64:$vz, i64:$sy, v512i1:$vm, v256f64:$pt, i32:$vl), (PVSLAvrml_v v256f64:$vz, i64:$sy, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vslal_vvvl v256f64:$vz, v256f64:$vy, i32:$vl), (VSLALvvl v256f64:$vz, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vslal_vvvvl v256f64:$vz, v256f64:$vy, v256f64:$pt, i32:$vl), (VSLALvvl_v v256f64:$vz, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vslal_vvsl v256f64:$vz, i64:$sy, i32:$vl), (VSLALvrl v256f64:$vz, i64:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_vslal_vvsvl v256f64:$vz, i64:$sy, v256f64:$pt, i32:$vl), (VSLALvrl_v v256f64:$vz, i64:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vslal_vvsl v256f64:$vz, uimm6:$N, i32:$vl), (VSLALvil v256f64:$vz, (ULO7 $N), i32:$vl)>;
+def : Pat<(int_ve_vl_vslal_vvsvl v256f64:$vz, uimm6:$N, v256f64:$pt, i32:$vl), (VSLALvil_v v256f64:$vz, (ULO7 $N), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vslal_vvvmvl v256f64:$vz, v256f64:$vy, v256i1:$vm, v256f64:$pt, i32:$vl), (VSLALvvml_v v256f64:$vz, v256f64:$vy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vslal_vvsmvl v256f64:$vz, i64:$sy, v256i1:$vm, v256f64:$pt, i32:$vl), (VSLALvrml_v v256f64:$vz, i64:$sy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vslal_vvsmvl v256f64:$vz, uimm6:$N, v256i1:$vm, v256f64:$pt, i32:$vl), (VSLALviml_v v256f64:$vz, (ULO7 $N), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsrawsx_vvvl v256f64:$vz, v256f64:$vy, i32:$vl), (VSRAWSXvvl v256f64:$vz, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vsrawsx_vvvvl v256f64:$vz, v256f64:$vy, v256f64:$pt, i32:$vl), (VSRAWSXvvl_v v256f64:$vz, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsrawsx_vvsl v256f64:$vz, i32:$sy, i32:$vl), (VSRAWSXvrl v256f64:$vz, i32:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_vsrawsx_vvsvl v256f64:$vz, i32:$sy, v256f64:$pt, i32:$vl), (VSRAWSXvrl_v v256f64:$vz, i32:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsrawsx_vvsl v256f64:$vz, uimm6:$N, i32:$vl), (VSRAWSXvil v256f64:$vz, (ULO7 $N), i32:$vl)>;
+def : Pat<(int_ve_vl_vsrawsx_vvsvl v256f64:$vz, uimm6:$N, v256f64:$pt, i32:$vl), (VSRAWSXvil_v v256f64:$vz, (ULO7 $N), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsrawsx_vvvmvl v256f64:$vz, v256f64:$vy, v256i1:$vm, v256f64:$pt, i32:$vl), (VSRAWSXvvml_v v256f64:$vz, v256f64:$vy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsrawsx_vvsmvl v256f64:$vz, i32:$sy, v256i1:$vm, v256f64:$pt, i32:$vl), (VSRAWSXvrml_v v256f64:$vz, i32:$sy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsrawsx_vvsmvl v256f64:$vz, uimm6:$N, v256i1:$vm, v256f64:$pt, i32:$vl), (VSRAWSXviml_v v256f64:$vz, (ULO7 $N), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsrawzx_vvvl v256f64:$vz, v256f64:$vy, i32:$vl), (VSRAWZXvvl v256f64:$vz, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vsrawzx_vvvvl v256f64:$vz, v256f64:$vy, v256f64:$pt, i32:$vl), (VSRAWZXvvl_v v256f64:$vz, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsrawzx_vvsl v256f64:$vz, i32:$sy, i32:$vl), (VSRAWZXvrl v256f64:$vz, i32:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_vsrawzx_vvsvl v256f64:$vz, i32:$sy, v256f64:$pt, i32:$vl), (VSRAWZXvrl_v v256f64:$vz, i32:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsrawzx_vvsl v256f64:$vz, uimm6:$N, i32:$vl), (VSRAWZXvil v256f64:$vz, (ULO7 $N), i32:$vl)>;
+def : Pat<(int_ve_vl_vsrawzx_vvsvl v256f64:$vz, uimm6:$N, v256f64:$pt, i32:$vl), (VSRAWZXvil_v v256f64:$vz, (ULO7 $N), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsrawzx_vvvmvl v256f64:$vz, v256f64:$vy, v256i1:$vm, v256f64:$pt, i32:$vl), (VSRAWZXvvml_v v256f64:$vz, v256f64:$vy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsrawzx_vvsmvl v256f64:$vz, i32:$sy, v256i1:$vm, v256f64:$pt, i32:$vl), (VSRAWZXvrml_v v256f64:$vz, i32:$sy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsrawzx_vvsmvl v256f64:$vz, uimm6:$N, v256i1:$vm, v256f64:$pt, i32:$vl), (VSRAWZXviml_v v256f64:$vz, (ULO7 $N), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsra_vvvl v256f64:$vz, v256f64:$vy, i32:$vl), (PVSRAvvl v256f64:$vz, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_pvsra_vvvvl v256f64:$vz, v256f64:$vy, v256f64:$pt, i32:$vl), (PVSRAvvl_v v256f64:$vz, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsra_vvsl v256f64:$vz, i64:$sy, i32:$vl), (PVSRAvrl v256f64:$vz, i64:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_pvsra_vvsvl v256f64:$vz, i64:$sy, v256f64:$pt, i32:$vl), (PVSRAvrl_v v256f64:$vz, i64:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsra_vvvMvl v256f64:$vz, v256f64:$vy, v512i1:$vm, v256f64:$pt, i32:$vl), (PVSRAvvml_v v256f64:$vz, v256f64:$vy, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvsra_vvsMvl v256f64:$vz, i64:$sy, v512i1:$vm, v256f64:$pt, i32:$vl), (PVSRAvrml_v v256f64:$vz, i64:$sy, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsral_vvvl v256f64:$vz, v256f64:$vy, i32:$vl), (VSRALvvl v256f64:$vz, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vsral_vvvvl v256f64:$vz, v256f64:$vy, v256f64:$pt, i32:$vl), (VSRALvvl_v v256f64:$vz, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsral_vvsl v256f64:$vz, i64:$sy, i32:$vl), (VSRALvrl v256f64:$vz, i64:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_vsral_vvsvl v256f64:$vz, i64:$sy, v256f64:$pt, i32:$vl), (VSRALvrl_v v256f64:$vz, i64:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsral_vvsl v256f64:$vz, uimm6:$N, i32:$vl), (VSRALvil v256f64:$vz, (ULO7 $N), i32:$vl)>;
+def : Pat<(int_ve_vl_vsral_vvsvl v256f64:$vz, uimm6:$N, v256f64:$pt, i32:$vl), (VSRALvil_v v256f64:$vz, (ULO7 $N), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsral_vvvmvl v256f64:$vz, v256f64:$vy, v256i1:$vm, v256f64:$pt, i32:$vl), (VSRALvvml_v v256f64:$vz, v256f64:$vy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsral_vvsmvl v256f64:$vz, i64:$sy, v256i1:$vm, v256f64:$pt, i32:$vl), (VSRALvrml_v v256f64:$vz, i64:$sy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsral_vvsmvl v256f64:$vz, uimm6:$N, v256i1:$vm, v256f64:$pt, i32:$vl), (VSRALviml_v v256f64:$vz, (ULO7 $N), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsfa_vvssl v256f64:$vz, i64:$sy, i64:$sz, i32:$vl), (VSFAvrrl v256f64:$vz, i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vsfa_vvssvl v256f64:$vz, i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VSFAvrrl_v v256f64:$vz, i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsfa_vvssl v256f64:$vz, simm7:$I, i64:$sz, i32:$vl), (VSFAvirl v256f64:$vz, (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vsfa_vvssvl v256f64:$vz, simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VSFAvirl_v v256f64:$vz, (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsfa_vvssmvl v256f64:$vz, i64:$sy, i64:$sz, v256i1:$vm, v256f64:$pt, i32:$vl), (VSFAvrrml_v v256f64:$vz, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsfa_vvssmvl v256f64:$vz, simm7:$I, i64:$sz, v256i1:$vm, v256f64:$pt, i32:$vl), (VSFAvirml_v v256f64:$vz, (LO7 $I), i64:$sz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfaddd_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VFADDDvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfaddd_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFADDDvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfaddd_vsvl f64:$sy, v256f64:$vz, i32:$vl), (VFADDDrvl f64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfaddd_vsvvl f64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFADDDrvl_v f64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfaddd_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFADDDvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfaddd_vsvmvl f64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFADDDrvml_v f64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfadds_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VFADDSvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfadds_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFADDSvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfadds_vsvl f32:$sy, v256f64:$vz, i32:$vl), (VFADDSrvl f32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfadds_vsvvl f32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFADDSrvl_v f32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfadds_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFADDSvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfadds_vsvmvl f32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFADDSrvml_v f32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfadd_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (PVFADDvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfadd_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVFADDvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfadd_vsvl i64:$sy, v256f64:$vz, i32:$vl), (PVFADDrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfadd_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVFADDrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfadd_vvvMvl v256f64:$vy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFADDvvml_v v256f64:$vy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfadd_vsvMvl i64:$sy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFADDrvml_v i64:$sy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfsubd_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VFSUBDvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfsubd_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFSUBDvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfsubd_vsvl f64:$sy, v256f64:$vz, i32:$vl), (VFSUBDrvl f64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfsubd_vsvvl f64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFSUBDrvl_v f64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfsubd_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFSUBDvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfsubd_vsvmvl f64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFSUBDrvml_v f64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfsubs_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VFSUBSvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfsubs_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFSUBSvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfsubs_vsvl f32:$sy, v256f64:$vz, i32:$vl), (VFSUBSrvl f32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfsubs_vsvvl f32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFSUBSrvl_v f32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfsubs_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFSUBSvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfsubs_vsvmvl f32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFSUBSrvml_v f32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfsub_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (PVFSUBvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfsub_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVFSUBvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfsub_vsvl i64:$sy, v256f64:$vz, i32:$vl), (PVFSUBrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfsub_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVFSUBrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfsub_vvvMvl v256f64:$vy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFSUBvvml_v v256f64:$vy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfsub_vsvMvl i64:$sy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFSUBrvml_v i64:$sy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmuld_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VFMULDvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmuld_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFMULDvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmuld_vsvl f64:$sy, v256f64:$vz, i32:$vl), (VFMULDrvl f64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmuld_vsvvl f64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFMULDrvl_v f64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmuld_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMULDvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmuld_vsvmvl f64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMULDrvml_v f64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmuls_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VFMULSvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmuls_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFMULSvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmuls_vsvl f32:$sy, v256f64:$vz, i32:$vl), (VFMULSrvl f32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmuls_vsvvl f32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFMULSrvl_v f32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmuls_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMULSvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmuls_vsvmvl f32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMULSrvml_v f32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmul_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (PVFMULvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmul_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVFMULvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmul_vsvl i64:$sy, v256f64:$vz, i32:$vl), (PVFMULrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmul_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVFMULrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmul_vvvMvl v256f64:$vy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFMULvvml_v v256f64:$vy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmul_vsvMvl i64:$sy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFMULrvml_v i64:$sy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfdivd_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VFDIVDvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfdivd_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFDIVDvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfdivd_vsvl f64:$sy, v256f64:$vz, i32:$vl), (VFDIVDrvl f64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfdivd_vsvvl f64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFDIVDrvl_v f64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfdivd_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFDIVDvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfdivd_vsvmvl f64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFDIVDrvml_v f64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfdivs_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VFDIVSvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfdivs_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFDIVSvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfdivs_vsvl f32:$sy, v256f64:$vz, i32:$vl), (VFDIVSrvl f32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfdivs_vsvvl f32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFDIVSrvl_v f32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfdivs_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFDIVSvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfdivs_vsvmvl f32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFDIVSrvml_v f32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfsqrtd_vvl v256f64:$vy, i32:$vl), (VFSQRTDvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vfsqrtd_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VFSQRTDvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfsqrts_vvl v256f64:$vy, i32:$vl), (VFSQRTSvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vfsqrts_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VFSQRTSvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfcmpd_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VFCMPDvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfcmpd_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFCMPDvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfcmpd_vsvl f64:$sy, v256f64:$vz, i32:$vl), (VFCMPDrvl f64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfcmpd_vsvvl f64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFCMPDrvl_v f64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfcmpd_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFCMPDvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfcmpd_vsvmvl f64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFCMPDrvml_v f64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfcmps_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VFCMPSvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfcmps_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFCMPSvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfcmps_vsvl f32:$sy, v256f64:$vz, i32:$vl), (VFCMPSrvl f32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfcmps_vsvvl f32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFCMPSrvl_v f32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfcmps_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFCMPSvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfcmps_vsvmvl f32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFCMPSrvml_v f32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfcmp_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (PVFCMPvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfcmp_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVFCMPvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfcmp_vsvl i64:$sy, v256f64:$vz, i32:$vl), (PVFCMPrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfcmp_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVFCMPrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfcmp_vvvMvl v256f64:$vy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFCMPvvml_v v256f64:$vy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfcmp_vsvMvl i64:$sy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFCMPrvml_v i64:$sy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmaxd_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VFMAXDvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmaxd_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFMAXDvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmaxd_vsvl f64:$sy, v256f64:$vz, i32:$vl), (VFMAXDrvl f64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmaxd_vsvvl f64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFMAXDrvl_v f64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmaxd_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMAXDvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmaxd_vsvmvl f64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMAXDrvml_v f64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmaxs_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VFMAXSvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmaxs_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFMAXSvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmaxs_vsvl f32:$sy, v256f64:$vz, i32:$vl), (VFMAXSrvl f32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmaxs_vsvvl f32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFMAXSrvl_v f32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmaxs_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMAXSvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmaxs_vsvmvl f32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMAXSrvml_v f32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmax_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (PVFMAXvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmax_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVFMAXvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmax_vsvl i64:$sy, v256f64:$vz, i32:$vl), (PVFMAXrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmax_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVFMAXrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmax_vvvMvl v256f64:$vy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFMAXvvml_v v256f64:$vy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmax_vsvMvl i64:$sy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFMAXrvml_v i64:$sy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmind_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VFMINDvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmind_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFMINDvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmind_vsvl f64:$sy, v256f64:$vz, i32:$vl), (VFMINDrvl f64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmind_vsvvl f64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFMINDrvl_v f64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmind_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMINDvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmind_vsvmvl f64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMINDrvml_v f64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmins_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (VFMINSvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmins_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFMINSvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmins_vsvl f32:$sy, v256f64:$vz, i32:$vl), (VFMINSrvl f32:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmins_vsvvl f32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (VFMINSrvl_v f32:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmins_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMINSvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmins_vsvmvl f32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMINSrvml_v f32:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmin_vvvl v256f64:$vy, v256f64:$vz, i32:$vl), (PVFMINvvl v256f64:$vy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmin_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVFMINvvl_v v256f64:$vy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmin_vsvl i64:$sy, v256f64:$vz, i32:$vl), (PVFMINrvl i64:$sy, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmin_vsvvl i64:$sy, v256f64:$vz, v256f64:$pt, i32:$vl), (PVFMINrvl_v i64:$sy, v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmin_vvvMvl v256f64:$vy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFMINvvml_v v256f64:$vy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmin_vsvMvl i64:$sy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFMINrvml_v i64:$sy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmadd_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl), (VFMADDvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmadd_vvvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (VFMADDvvvl_v v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmadd_vsvvl f64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl), (VFMADDrvvl f64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmadd_vsvvvl f64:$sy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (VFMADDrvvl_v f64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmadd_vvsvl v256f64:$vy, f64:$sy, v256f64:$vw, i32:$vl), (VFMADDvrvl v256f64:$vy, f64:$sy, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmadd_vvsvvl v256f64:$vy, f64:$sy, v256f64:$vw, v256f64:$pt, i32:$vl), (VFMADDvrvl_v v256f64:$vy, f64:$sy, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmadd_vvvvmvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMADDvvvml_v v256f64:$vy, v256f64:$vz, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmadd_vsvvmvl f64:$sy, v256f64:$vz, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMADDrvvml_v f64:$sy, v256f64:$vz, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmadd_vvsvmvl v256f64:$vy, f64:$sy, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMADDvrvml_v v256f64:$vy, f64:$sy, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmads_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl), (VFMADSvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmads_vvvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (VFMADSvvvl_v v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmads_vsvvl f32:$sy, v256f64:$vz, v256f64:$vw, i32:$vl), (VFMADSrvvl f32:$sy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmads_vsvvvl f32:$sy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (VFMADSrvvl_v f32:$sy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmads_vvsvl v256f64:$vy, f32:$sy, v256f64:$vw, i32:$vl), (VFMADSvrvl v256f64:$vy, f32:$sy, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmads_vvsvvl v256f64:$vy, f32:$sy, v256f64:$vw, v256f64:$pt, i32:$vl), (VFMADSvrvl_v v256f64:$vy, f32:$sy, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmads_vvvvmvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMADSvvvml_v v256f64:$vy, v256f64:$vz, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmads_vsvvmvl f32:$sy, v256f64:$vz, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMADSrvvml_v f32:$sy, v256f64:$vz, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmads_vvsvmvl v256f64:$vy, f32:$sy, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMADSvrvml_v v256f64:$vy, f32:$sy, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmad_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl), (PVFMADvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmad_vvvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (PVFMADvvvl_v v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmad_vsvvl i64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl), (PVFMADrvvl i64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmad_vsvvvl i64:$sy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (PVFMADrvvl_v i64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmad_vvsvl v256f64:$vy, i64:$sy, v256f64:$vw, i32:$vl), (PVFMADvrvl v256f64:$vy, i64:$sy, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmad_vvsvvl v256f64:$vy, i64:$sy, v256f64:$vw, v256f64:$pt, i32:$vl), (PVFMADvrvl_v v256f64:$vy, i64:$sy, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmad_vvvvMvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFMADvvvml_v v256f64:$vy, v256f64:$vz, v256f64:$vw, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmad_vsvvMvl i64:$sy, v256f64:$vz, v256f64:$vw, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFMADrvvml_v i64:$sy, v256f64:$vz, v256f64:$vw, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmad_vvsvMvl v256f64:$vy, i64:$sy, v256f64:$vw, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFMADvrvml_v v256f64:$vy, i64:$sy, v256f64:$vw, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmsbd_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl), (VFMSBDvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmsbd_vvvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (VFMSBDvvvl_v v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmsbd_vsvvl f64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl), (VFMSBDrvvl f64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmsbd_vsvvvl f64:$sy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (VFMSBDrvvl_v f64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmsbd_vvsvl v256f64:$vy, f64:$sy, v256f64:$vw, i32:$vl), (VFMSBDvrvl v256f64:$vy, f64:$sy, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmsbd_vvsvvl v256f64:$vy, f64:$sy, v256f64:$vw, v256f64:$pt, i32:$vl), (VFMSBDvrvl_v v256f64:$vy, f64:$sy, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmsbd_vvvvmvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMSBDvvvml_v v256f64:$vy, v256f64:$vz, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmsbd_vsvvmvl f64:$sy, v256f64:$vz, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMSBDrvvml_v f64:$sy, v256f64:$vz, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmsbd_vvsvmvl v256f64:$vy, f64:$sy, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMSBDvrvml_v v256f64:$vy, f64:$sy, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmsbs_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl), (VFMSBSvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmsbs_vvvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (VFMSBSvvvl_v v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmsbs_vsvvl f32:$sy, v256f64:$vz, v256f64:$vw, i32:$vl), (VFMSBSrvvl f32:$sy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmsbs_vsvvvl f32:$sy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (VFMSBSrvvl_v f32:$sy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmsbs_vvsvl v256f64:$vy, f32:$sy, v256f64:$vw, i32:$vl), (VFMSBSvrvl v256f64:$vy, f32:$sy, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmsbs_vvsvvl v256f64:$vy, f32:$sy, v256f64:$vw, v256f64:$pt, i32:$vl), (VFMSBSvrvl_v v256f64:$vy, f32:$sy, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmsbs_vvvvmvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMSBSvvvml_v v256f64:$vy, v256f64:$vz, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmsbs_vsvvmvl f32:$sy, v256f64:$vz, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMSBSrvvml_v f32:$sy, v256f64:$vz, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmsbs_vvsvmvl v256f64:$vy, f32:$sy, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFMSBSvrvml_v v256f64:$vy, f32:$sy, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmsb_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl), (PVFMSBvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmsb_vvvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (PVFMSBvvvl_v v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmsb_vsvvl i64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl), (PVFMSBrvvl i64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmsb_vsvvvl i64:$sy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (PVFMSBrvvl_v i64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmsb_vvsvl v256f64:$vy, i64:$sy, v256f64:$vw, i32:$vl), (PVFMSBvrvl v256f64:$vy, i64:$sy, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmsb_vvsvvl v256f64:$vy, i64:$sy, v256f64:$vw, v256f64:$pt, i32:$vl), (PVFMSBvrvl_v v256f64:$vy, i64:$sy, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmsb_vvvvMvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFMSBvvvml_v v256f64:$vy, v256f64:$vz, v256f64:$vw, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmsb_vsvvMvl i64:$sy, v256f64:$vz, v256f64:$vw, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFMSBrvvml_v i64:$sy, v256f64:$vz, v256f64:$vw, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfmsb_vvsvMvl v256f64:$vy, i64:$sy, v256f64:$vw, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFMSBvrvml_v v256f64:$vy, i64:$sy, v256f64:$vw, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmadd_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl), (VFNMADDvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfnmadd_vvvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (VFNMADDvvvl_v v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmadd_vsvvl f64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl), (VFNMADDrvvl f64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfnmadd_vsvvvl f64:$sy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (VFNMADDrvvl_v f64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmadd_vvsvl v256f64:$vy, f64:$sy, v256f64:$vw, i32:$vl), (VFNMADDvrvl v256f64:$vy, f64:$sy, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfnmadd_vvsvvl v256f64:$vy, f64:$sy, v256f64:$vw, v256f64:$pt, i32:$vl), (VFNMADDvrvl_v v256f64:$vy, f64:$sy, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmadd_vvvvmvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFNMADDvvvml_v v256f64:$vy, v256f64:$vz, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmadd_vsvvmvl f64:$sy, v256f64:$vz, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFNMADDrvvml_v f64:$sy, v256f64:$vz, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmadd_vvsvmvl v256f64:$vy, f64:$sy, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFNMADDvrvml_v v256f64:$vy, f64:$sy, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmads_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl), (VFNMADSvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfnmads_vvvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (VFNMADSvvvl_v v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmads_vsvvl f32:$sy, v256f64:$vz, v256f64:$vw, i32:$vl), (VFNMADSrvvl f32:$sy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfnmads_vsvvvl f32:$sy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (VFNMADSrvvl_v f32:$sy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmads_vvsvl v256f64:$vy, f32:$sy, v256f64:$vw, i32:$vl), (VFNMADSvrvl v256f64:$vy, f32:$sy, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfnmads_vvsvvl v256f64:$vy, f32:$sy, v256f64:$vw, v256f64:$pt, i32:$vl), (VFNMADSvrvl_v v256f64:$vy, f32:$sy, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmads_vvvvmvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFNMADSvvvml_v v256f64:$vy, v256f64:$vz, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmads_vsvvmvl f32:$sy, v256f64:$vz, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFNMADSrvvml_v f32:$sy, v256f64:$vz, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmads_vvsvmvl v256f64:$vy, f32:$sy, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFNMADSvrvml_v v256f64:$vy, f32:$sy, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfnmad_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl), (PVFNMADvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfnmad_vvvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (PVFNMADvvvl_v v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfnmad_vsvvl i64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl), (PVFNMADrvvl i64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfnmad_vsvvvl i64:$sy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (PVFNMADrvvl_v i64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfnmad_vvsvl v256f64:$vy, i64:$sy, v256f64:$vw, i32:$vl), (PVFNMADvrvl v256f64:$vy, i64:$sy, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfnmad_vvsvvl v256f64:$vy, i64:$sy, v256f64:$vw, v256f64:$pt, i32:$vl), (PVFNMADvrvl_v v256f64:$vy, i64:$sy, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfnmad_vvvvMvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFNMADvvvml_v v256f64:$vy, v256f64:$vz, v256f64:$vw, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfnmad_vsvvMvl i64:$sy, v256f64:$vz, v256f64:$vw, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFNMADrvvml_v i64:$sy, v256f64:$vz, v256f64:$vw, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfnmad_vvsvMvl v256f64:$vy, i64:$sy, v256f64:$vw, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFNMADvrvml_v v256f64:$vy, i64:$sy, v256f64:$vw, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmsbd_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl), (VFNMSBDvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfnmsbd_vvvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (VFNMSBDvvvl_v v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmsbd_vsvvl f64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl), (VFNMSBDrvvl f64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfnmsbd_vsvvvl f64:$sy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (VFNMSBDrvvl_v f64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmsbd_vvsvl v256f64:$vy, f64:$sy, v256f64:$vw, i32:$vl), (VFNMSBDvrvl v256f64:$vy, f64:$sy, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfnmsbd_vvsvvl v256f64:$vy, f64:$sy, v256f64:$vw, v256f64:$pt, i32:$vl), (VFNMSBDvrvl_v v256f64:$vy, f64:$sy, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmsbd_vvvvmvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFNMSBDvvvml_v v256f64:$vy, v256f64:$vz, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmsbd_vsvvmvl f64:$sy, v256f64:$vz, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFNMSBDrvvml_v f64:$sy, v256f64:$vz, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmsbd_vvsvmvl v256f64:$vy, f64:$sy, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFNMSBDvrvml_v v256f64:$vy, f64:$sy, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmsbs_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl), (VFNMSBSvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfnmsbs_vvvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (VFNMSBSvvvl_v v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmsbs_vsvvl f32:$sy, v256f64:$vz, v256f64:$vw, i32:$vl), (VFNMSBSrvvl f32:$sy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfnmsbs_vsvvvl f32:$sy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (VFNMSBSrvvl_v f32:$sy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmsbs_vvsvl v256f64:$vy, f32:$sy, v256f64:$vw, i32:$vl), (VFNMSBSvrvl v256f64:$vy, f32:$sy, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_vfnmsbs_vvsvvl v256f64:$vy, f32:$sy, v256f64:$vw, v256f64:$pt, i32:$vl), (VFNMSBSvrvl_v v256f64:$vy, f32:$sy, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmsbs_vvvvmvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFNMSBSvvvml_v v256f64:$vy, v256f64:$vz, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmsbs_vsvvmvl f32:$sy, v256f64:$vz, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFNMSBSrvvml_v f32:$sy, v256f64:$vz, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfnmsbs_vvsvmvl v256f64:$vy, f32:$sy, v256f64:$vw, v256i1:$vm, v256f64:$pt, i32:$vl), (VFNMSBSvrvml_v v256f64:$vy, f32:$sy, v256f64:$vw, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfnmsb_vvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl), (PVFNMSBvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfnmsb_vvvvvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (PVFNMSBvvvl_v v256f64:$vy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfnmsb_vsvvl i64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl), (PVFNMSBrvvl i64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfnmsb_vsvvvl i64:$sy, v256f64:$vz, v256f64:$vw, v256f64:$pt, i32:$vl), (PVFNMSBrvvl_v i64:$sy, v256f64:$vz, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfnmsb_vvsvl v256f64:$vy, i64:$sy, v256f64:$vw, i32:$vl), (PVFNMSBvrvl v256f64:$vy, i64:$sy, v256f64:$vw, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfnmsb_vvsvvl v256f64:$vy, i64:$sy, v256f64:$vw, v256f64:$pt, i32:$vl), (PVFNMSBvrvl_v v256f64:$vy, i64:$sy, v256f64:$vw, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfnmsb_vvvvMvl v256f64:$vy, v256f64:$vz, v256f64:$vw, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFNMSBvvvml_v v256f64:$vy, v256f64:$vz, v256f64:$vw, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfnmsb_vsvvMvl i64:$sy, v256f64:$vz, v256f64:$vw, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFNMSBrvvml_v i64:$sy, v256f64:$vz, v256f64:$vw, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvfnmsb_vvsvMvl v256f64:$vy, i64:$sy, v256f64:$vw, v512i1:$vm, v256f64:$pt, i32:$vl), (PVFNMSBvrvml_v v256f64:$vy, i64:$sy, v256f64:$vw, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vrcpd_vvl v256f64:$vy, i32:$vl), (VRCPDvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vrcpd_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VRCPDvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vrcps_vvl v256f64:$vy, i32:$vl), (VRCPSvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vrcps_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VRCPSvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvrcp_vvl v256f64:$vy, i32:$vl), (PVRCPvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_pvrcp_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (PVRCPvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vrsqrtd_vvl v256f64:$vy, i32:$vl), (VRSQRTDvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vrsqrtd_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VRSQRTDvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vrsqrts_vvl v256f64:$vy, i32:$vl), (VRSQRTSvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vrsqrts_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VRSQRTSvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvrsqrt_vvl v256f64:$vy, i32:$vl), (PVRSQRTvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_pvrsqrt_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (PVRSQRTvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vrsqrtdnex_vvl v256f64:$vy, i32:$vl), (VRSQRTDNEXvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vrsqrtdnex_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VRSQRTDNEXvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vrsqrtsnex_vvl v256f64:$vy, i32:$vl), (VRSQRTSNEXvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vrsqrtsnex_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VRSQRTSNEXvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvrsqrtnex_vvl v256f64:$vy, i32:$vl), (PVRSQRTNEXvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_pvrsqrtnex_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (PVRSQRTNEXvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtwdsx_vvl v256f64:$vy, i32:$vl), (VCVTWDSXvl RD_NONE, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vcvtwdsx_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VCVTWDSXvl_v RD_NONE, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtwdsx_vvmvl v256f64:$vy, v256i1:$vm, v256f64:$pt, i32:$vl), (VCVTWDSXvml_v RD_NONE, v256f64:$vy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtwdsxrz_vvl v256f64:$vy, i32:$vl), (VCVTWDSXvl RD_RZ, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vcvtwdsxrz_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VCVTWDSXvl_v RD_RZ, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtwdsxrz_vvmvl v256f64:$vy, v256i1:$vm, v256f64:$pt, i32:$vl), (VCVTWDSXvml_v RD_RZ, v256f64:$vy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtwdzx_vvl v256f64:$vy, i32:$vl), (VCVTWDZXvl RD_NONE, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vcvtwdzx_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VCVTWDZXvl_v RD_NONE, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtwdzx_vvmvl v256f64:$vy, v256i1:$vm, v256f64:$pt, i32:$vl), (VCVTWDZXvml_v RD_NONE, v256f64:$vy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtwdzxrz_vvl v256f64:$vy, i32:$vl), (VCVTWDZXvl RD_RZ, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vcvtwdzxrz_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VCVTWDZXvl_v RD_RZ, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtwdzxrz_vvmvl v256f64:$vy, v256i1:$vm, v256f64:$pt, i32:$vl), (VCVTWDZXvml_v RD_RZ, v256f64:$vy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtwssx_vvl v256f64:$vy, i32:$vl), (VCVTWSSXvl RD_NONE, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vcvtwssx_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VCVTWSSXvl_v RD_NONE, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtwssx_vvmvl v256f64:$vy, v256i1:$vm, v256f64:$pt, i32:$vl), (VCVTWSSXvml_v RD_NONE, v256f64:$vy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtwssxrz_vvl v256f64:$vy, i32:$vl), (VCVTWSSXvl RD_RZ, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vcvtwssxrz_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VCVTWSSXvl_v RD_RZ, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtwssxrz_vvmvl v256f64:$vy, v256i1:$vm, v256f64:$pt, i32:$vl), (VCVTWSSXvml_v RD_RZ, v256f64:$vy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtwszx_vvl v256f64:$vy, i32:$vl), (VCVTWSZXvl RD_NONE, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vcvtwszx_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VCVTWSZXvl_v RD_NONE, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtwszx_vvmvl v256f64:$vy, v256i1:$vm, v256f64:$pt, i32:$vl), (VCVTWSZXvml_v RD_NONE, v256f64:$vy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtwszxrz_vvl v256f64:$vy, i32:$vl), (VCVTWSZXvl RD_RZ, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vcvtwszxrz_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VCVTWSZXvl_v RD_RZ, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtwszxrz_vvmvl v256f64:$vy, v256i1:$vm, v256f64:$pt, i32:$vl), (VCVTWSZXvml_v RD_RZ, v256f64:$vy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvcvtws_vvl v256f64:$vy, i32:$vl), (PVCVTWSvl RD_NONE, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_pvcvtws_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (PVCVTWSvl_v RD_NONE, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvcvtws_vvMvl v256f64:$vy, v512i1:$vm, v256f64:$pt, i32:$vl), (PVCVTWSvml_v RD_NONE, v256f64:$vy, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvcvtwsrz_vvl v256f64:$vy, i32:$vl), (PVCVTWSvl RD_RZ, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_pvcvtwsrz_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (PVCVTWSvl_v RD_RZ, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvcvtwsrz_vvMvl v256f64:$vy, v512i1:$vm, v256f64:$pt, i32:$vl), (PVCVTWSvml_v RD_RZ, v256f64:$vy, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtld_vvl v256f64:$vy, i32:$vl), (VCVTLDvl RD_NONE, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vcvtld_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VCVTLDvl_v RD_NONE, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtld_vvmvl v256f64:$vy, v256i1:$vm, v256f64:$pt, i32:$vl), (VCVTLDvml_v RD_NONE, v256f64:$vy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtldrz_vvl v256f64:$vy, i32:$vl), (VCVTLDvl RD_RZ, v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vcvtldrz_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VCVTLDvl_v RD_RZ, v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtldrz_vvmvl v256f64:$vy, v256i1:$vm, v256f64:$pt, i32:$vl), (VCVTLDvml_v RD_RZ, v256f64:$vy, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtdw_vvl v256f64:$vy, i32:$vl), (VCVTDWvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vcvtdw_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VCVTDWvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtsw_vvl v256f64:$vy, i32:$vl), (VCVTSWvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vcvtsw_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VCVTSWvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_pvcvtsw_vvl v256f64:$vy, i32:$vl), (PVCVTSWvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_pvcvtsw_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (PVCVTSWvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtdl_vvl v256f64:$vy, i32:$vl), (VCVTDLvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vcvtdl_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VCVTDLvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtds_vvl v256f64:$vy, i32:$vl), (VCVTDSvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vcvtds_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VCVTDSvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcvtsd_vvl v256f64:$vy, i32:$vl), (VCVTSDvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vcvtsd_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VCVTSDvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmrg_vvvml v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl), (VMRGvvml v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vmrg_vvvmvl v256f64:$vy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMRGvvml_v v256f64:$vy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmrg_vsvml i64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl), (VMRGrvml i64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vmrg_vsvmvl i64:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMRGrvml_v i64:$sy, v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmrg_vsvml simm7:$I, v256f64:$vz, v256i1:$vm, i32:$vl), (VMRGivml (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vmrg_vsvmvl simm7:$I, v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VMRGivml_v (LO7 $I), v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmrgw_vvvMl v256f64:$vy, v256f64:$vz, v512i1:$vm, i32:$vl), (VMRGWvvml v256f64:$vy, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vmrgw_vvvMvl v256f64:$vy, v256f64:$vz, v512i1:$vm, v256f64:$pt, i32:$vl), (VMRGWvvml_v v256f64:$vy, v256f64:$vz, v512i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vshf_vvvsl v256f64:$vy, v256f64:$vz, i64:$sy, i32:$vl), (VSHFvvrl v256f64:$vy, v256f64:$vz, i64:$sy, i32:$vl)>;
+def : Pat<(int_ve_vl_vshf_vvvsvl v256f64:$vy, v256f64:$vz, i64:$sy, v256f64:$pt, i32:$vl), (VSHFvvrl_v v256f64:$vy, v256f64:$vz, i64:$sy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vshf_vvvsl v256f64:$vy, v256f64:$vz, uimm6:$N, i32:$vl), (VSHFvvil v256f64:$vy, v256f64:$vz, (ULO7 $N), i32:$vl)>;
+def : Pat<(int_ve_vl_vshf_vvvsvl v256f64:$vy, v256f64:$vz, uimm6:$N, v256f64:$pt, i32:$vl), (VSHFvvil_v v256f64:$vy, v256f64:$vz, (ULO7 $N), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vcp_vvmvl v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VCPvml_v v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vex_vvmvl v256f64:$vz, v256i1:$vm, v256f64:$pt, i32:$vl), (VEXvml_v v256f64:$vz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfmklat_ml i32:$vl), (VFMKLal i32:$vl)>;
+def : Pat<(int_ve_vl_vfmklaf_ml i32:$vl), (VFMKLnal i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkat_Ml i32:$vl), (VFMKyal i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkaf_Ml i32:$vl), (VFMKynal i32:$vl)>;
+def : Pat<(int_ve_vl_vfmklgt_mvl v256f64:$vz, i32:$vl), (VFMKLvl CC_IG, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmklgt_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKLvml CC_IG, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkllt_mvl v256f64:$vz, i32:$vl), (VFMKLvl CC_IL, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkllt_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKLvml CC_IL, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmklne_mvl v256f64:$vz, i32:$vl), (VFMKLvl CC_INE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmklne_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKLvml CC_INE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkleq_mvl v256f64:$vz, i32:$vl), (VFMKLvl CC_IEQ, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkleq_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKLvml CC_IEQ, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmklge_mvl v256f64:$vz, i32:$vl), (VFMKLvl CC_IGE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmklge_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKLvml CC_IGE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmklle_mvl v256f64:$vz, i32:$vl), (VFMKLvl CC_ILE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmklle_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKLvml CC_ILE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmklnum_mvl v256f64:$vz, i32:$vl), (VFMKLvl CC_NUM, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmklnum_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKLvml CC_NUM, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmklnan_mvl v256f64:$vz, i32:$vl), (VFMKLvl CC_NAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmklnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKLvml CC_NAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmklgtnan_mvl v256f64:$vz, i32:$vl), (VFMKLvl CC_GNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmklgtnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKLvml CC_GNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmklltnan_mvl v256f64:$vz, i32:$vl), (VFMKLvl CC_LNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmklltnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKLvml CC_LNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmklnenan_mvl v256f64:$vz, i32:$vl), (VFMKLvl CC_NENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmklnenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKLvml CC_NENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkleqnan_mvl v256f64:$vz, i32:$vl), (VFMKLvl CC_EQNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkleqnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKLvml CC_EQNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmklgenan_mvl v256f64:$vz, i32:$vl), (VFMKLvl CC_GENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmklgenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKLvml CC_GENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkllenan_mvl v256f64:$vz, i32:$vl), (VFMKLvl CC_LENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkllenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKLvml CC_LENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwgt_mvl v256f64:$vz, i32:$vl), (VFMKWvl CC_IG, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwgt_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKWvml CC_IG, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwlt_mvl v256f64:$vz, i32:$vl), (VFMKWvl CC_IL, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwlt_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKWvml CC_IL, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwne_mvl v256f64:$vz, i32:$vl), (VFMKWvl CC_INE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwne_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKWvml CC_INE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkweq_mvl v256f64:$vz, i32:$vl), (VFMKWvl CC_IEQ, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkweq_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKWvml CC_IEQ, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwge_mvl v256f64:$vz, i32:$vl), (VFMKWvl CC_IGE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwge_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKWvml CC_IGE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwle_mvl v256f64:$vz, i32:$vl), (VFMKWvl CC_ILE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwle_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKWvml CC_ILE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwnum_mvl v256f64:$vz, i32:$vl), (VFMKWvl CC_NUM, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwnum_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKWvml CC_NUM, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwnan_mvl v256f64:$vz, i32:$vl), (VFMKWvl CC_NAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKWvml CC_NAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwgtnan_mvl v256f64:$vz, i32:$vl), (VFMKWvl CC_GNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwgtnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKWvml CC_GNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwltnan_mvl v256f64:$vz, i32:$vl), (VFMKWvl CC_LNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwltnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKWvml CC_LNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwnenan_mvl v256f64:$vz, i32:$vl), (VFMKWvl CC_NENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwnenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKWvml CC_NENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkweqnan_mvl v256f64:$vz, i32:$vl), (VFMKWvl CC_EQNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkweqnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKWvml CC_EQNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwgenan_mvl v256f64:$vz, i32:$vl), (VFMKWvl CC_GENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwgenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKWvml CC_GENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwlenan_mvl v256f64:$vz, i32:$vl), (VFMKWvl CC_LENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkwlenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKWvml CC_LENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlogt_mvl v256f64:$vz, i32:$vl), (PVFMKWLOvl CC_IG, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwupgt_mvl v256f64:$vz, i32:$vl), (PVFMKWUPvl CC_IG, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlogt_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWLOvml CC_IG, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwupgt_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWUPvml CC_IG, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlolt_mvl v256f64:$vz, i32:$vl), (PVFMKWLOvl CC_IL, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwuplt_mvl v256f64:$vz, i32:$vl), (PVFMKWUPvl CC_IL, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlolt_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWLOvml CC_IL, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwuplt_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWUPvml CC_IL, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlone_mvl v256f64:$vz, i32:$vl), (PVFMKWLOvl CC_INE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwupne_mvl v256f64:$vz, i32:$vl), (PVFMKWUPvl CC_INE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlone_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWLOvml CC_INE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwupne_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWUPvml CC_INE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwloeq_mvl v256f64:$vz, i32:$vl), (PVFMKWLOvl CC_IEQ, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwupeq_mvl v256f64:$vz, i32:$vl), (PVFMKWUPvl CC_IEQ, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwloeq_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWLOvml CC_IEQ, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwupeq_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWUPvml CC_IEQ, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwloge_mvl v256f64:$vz, i32:$vl), (PVFMKWLOvl CC_IGE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwupge_mvl v256f64:$vz, i32:$vl), (PVFMKWUPvl CC_IGE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwloge_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWLOvml CC_IGE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwupge_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWUPvml CC_IGE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlole_mvl v256f64:$vz, i32:$vl), (PVFMKWLOvl CC_ILE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwuple_mvl v256f64:$vz, i32:$vl), (PVFMKWUPvl CC_ILE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlole_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWLOvml CC_ILE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwuple_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWUPvml CC_ILE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlonum_mvl v256f64:$vz, i32:$vl), (PVFMKWLOvl CC_NUM, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwupnum_mvl v256f64:$vz, i32:$vl), (PVFMKWUPvl CC_NUM, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlonum_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWLOvml CC_NUM, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwupnum_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWUPvml CC_NUM, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlonan_mvl v256f64:$vz, i32:$vl), (PVFMKWLOvl CC_NAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwupnan_mvl v256f64:$vz, i32:$vl), (PVFMKWUPvl CC_NAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlonan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWLOvml CC_NAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwupnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWUPvml CC_NAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlogtnan_mvl v256f64:$vz, i32:$vl), (PVFMKWLOvl CC_GNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwupgtnan_mvl v256f64:$vz, i32:$vl), (PVFMKWUPvl CC_GNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlogtnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWLOvml CC_GNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwupgtnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWUPvml CC_GNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwloltnan_mvl v256f64:$vz, i32:$vl), (PVFMKWLOvl CC_LNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwupltnan_mvl v256f64:$vz, i32:$vl), (PVFMKWUPvl CC_LNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwloltnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWLOvml CC_LNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwupltnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWUPvml CC_LNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlonenan_mvl v256f64:$vz, i32:$vl), (PVFMKWLOvl CC_NENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwupnenan_mvl v256f64:$vz, i32:$vl), (PVFMKWUPvl CC_NENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlonenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWLOvml CC_NENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwupnenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWUPvml CC_NENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwloeqnan_mvl v256f64:$vz, i32:$vl), (PVFMKWLOvl CC_EQNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwupeqnan_mvl v256f64:$vz, i32:$vl), (PVFMKWUPvl CC_EQNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwloeqnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWLOvml CC_EQNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwupeqnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWUPvml CC_EQNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlogenan_mvl v256f64:$vz, i32:$vl), (PVFMKWLOvl CC_GENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwupgenan_mvl v256f64:$vz, i32:$vl), (PVFMKWUPvl CC_GENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlogenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWLOvml CC_GENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwupgenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWUPvml CC_GENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlolenan_mvl v256f64:$vz, i32:$vl), (PVFMKWLOvl CC_LENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwuplenan_mvl v256f64:$vz, i32:$vl), (PVFMKWUPvl CC_LENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlolenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWLOvml CC_LENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwuplenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKWUPvml CC_LENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwgt_Mvl v256f64:$vz, i32:$vl), (VFMKWyvl CC_IG, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwgt_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKWyvyl CC_IG, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlt_Mvl v256f64:$vz, i32:$vl), (VFMKWyvl CC_IL, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlt_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKWyvyl CC_IL, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwne_Mvl v256f64:$vz, i32:$vl), (VFMKWyvl CC_INE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwne_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKWyvyl CC_INE, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkweq_Mvl v256f64:$vz, i32:$vl), (VFMKWyvl CC_IEQ, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkweq_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKWyvyl CC_IEQ, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwge_Mvl v256f64:$vz, i32:$vl), (VFMKWyvl CC_IGE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwge_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKWyvyl CC_IGE, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwle_Mvl v256f64:$vz, i32:$vl), (VFMKWyvl CC_ILE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwle_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKWyvyl CC_ILE, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwnum_Mvl v256f64:$vz, i32:$vl), (VFMKWyvl CC_NUM, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwnum_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKWyvyl CC_NUM, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwnan_Mvl v256f64:$vz, i32:$vl), (VFMKWyvl CC_NAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwnan_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKWyvyl CC_NAN, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwgtnan_Mvl v256f64:$vz, i32:$vl), (VFMKWyvl CC_GNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwgtnan_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKWyvyl CC_GNAN, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwltnan_Mvl v256f64:$vz, i32:$vl), (VFMKWyvl CC_LNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwltnan_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKWyvyl CC_LNAN, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwnenan_Mvl v256f64:$vz, i32:$vl), (VFMKWyvl CC_NENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwnenan_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKWyvyl CC_NENAN, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkweqnan_Mvl v256f64:$vz, i32:$vl), (VFMKWyvl CC_EQNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkweqnan_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKWyvyl CC_EQNAN, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwgenan_Mvl v256f64:$vz, i32:$vl), (VFMKWyvl CC_GENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwgenan_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKWyvyl CC_GENAN, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlenan_Mvl v256f64:$vz, i32:$vl), (VFMKWyvl CC_LENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkwlenan_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKWyvyl CC_LENAN, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdgt_mvl v256f64:$vz, i32:$vl), (VFMKDvl CC_G, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdgt_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKDvml CC_G, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdlt_mvl v256f64:$vz, i32:$vl), (VFMKDvl CC_L, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdlt_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKDvml CC_L, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdne_mvl v256f64:$vz, i32:$vl), (VFMKDvl CC_NE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdne_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKDvml CC_NE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdeq_mvl v256f64:$vz, i32:$vl), (VFMKDvl CC_EQ, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdeq_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKDvml CC_EQ, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdge_mvl v256f64:$vz, i32:$vl), (VFMKDvl CC_GE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdge_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKDvml CC_GE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdle_mvl v256f64:$vz, i32:$vl), (VFMKDvl CC_LE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdle_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKDvml CC_LE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdnum_mvl v256f64:$vz, i32:$vl), (VFMKDvl CC_NUM, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdnum_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKDvml CC_NUM, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdnan_mvl v256f64:$vz, i32:$vl), (VFMKDvl CC_NAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKDvml CC_NAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdgtnan_mvl v256f64:$vz, i32:$vl), (VFMKDvl CC_GNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdgtnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKDvml CC_GNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdltnan_mvl v256f64:$vz, i32:$vl), (VFMKDvl CC_LNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdltnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKDvml CC_LNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdnenan_mvl v256f64:$vz, i32:$vl), (VFMKDvl CC_NENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdnenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKDvml CC_NENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdeqnan_mvl v256f64:$vz, i32:$vl), (VFMKDvl CC_EQNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdeqnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKDvml CC_EQNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdgenan_mvl v256f64:$vz, i32:$vl), (VFMKDvl CC_GENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdgenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKDvml CC_GENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdlenan_mvl v256f64:$vz, i32:$vl), (VFMKDvl CC_LENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkdlenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKDvml CC_LENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmksgt_mvl v256f64:$vz, i32:$vl), (VFMKSvl CC_G, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmksgt_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKSvml CC_G, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkslt_mvl v256f64:$vz, i32:$vl), (VFMKSvl CC_L, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkslt_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKSvml CC_L, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmksne_mvl v256f64:$vz, i32:$vl), (VFMKSvl CC_NE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmksne_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKSvml CC_NE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkseq_mvl v256f64:$vz, i32:$vl), (VFMKSvl CC_EQ, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkseq_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKSvml CC_EQ, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmksge_mvl v256f64:$vz, i32:$vl), (VFMKSvl CC_GE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmksge_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKSvml CC_GE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmksle_mvl v256f64:$vz, i32:$vl), (VFMKSvl CC_LE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmksle_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKSvml CC_LE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmksnum_mvl v256f64:$vz, i32:$vl), (VFMKSvl CC_NUM, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmksnum_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKSvml CC_NUM, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmksnan_mvl v256f64:$vz, i32:$vl), (VFMKSvl CC_NAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmksnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKSvml CC_NAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmksgtnan_mvl v256f64:$vz, i32:$vl), (VFMKSvl CC_GNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmksgtnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKSvml CC_GNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmksltnan_mvl v256f64:$vz, i32:$vl), (VFMKSvl CC_LNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmksltnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKSvml CC_LNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmksnenan_mvl v256f64:$vz, i32:$vl), (VFMKSvl CC_NENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmksnenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKSvml CC_NENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkseqnan_mvl v256f64:$vz, i32:$vl), (VFMKSvl CC_EQNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkseqnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKSvml CC_EQNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmksgenan_mvl v256f64:$vz, i32:$vl), (VFMKSvl CC_GENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmksgenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKSvml CC_GENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkslenan_mvl v256f64:$vz, i32:$vl), (VFMKSvl CC_LENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vfmkslenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (VFMKSvml CC_LENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslogt_mvl v256f64:$vz, i32:$vl), (PVFMKSLOvl CC_G, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksupgt_mvl v256f64:$vz, i32:$vl), (PVFMKSUPvl CC_G, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslogt_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSLOvml CC_G, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksupgt_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSUPvml CC_G, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslolt_mvl v256f64:$vz, i32:$vl), (PVFMKSLOvl CC_L, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksuplt_mvl v256f64:$vz, i32:$vl), (PVFMKSUPvl CC_L, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslolt_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSLOvml CC_L, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksuplt_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSUPvml CC_L, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslone_mvl v256f64:$vz, i32:$vl), (PVFMKSLOvl CC_NE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksupne_mvl v256f64:$vz, i32:$vl), (PVFMKSUPvl CC_NE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslone_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSLOvml CC_NE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksupne_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSUPvml CC_NE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksloeq_mvl v256f64:$vz, i32:$vl), (PVFMKSLOvl CC_EQ, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksupeq_mvl v256f64:$vz, i32:$vl), (PVFMKSUPvl CC_EQ, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksloeq_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSLOvml CC_EQ, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksupeq_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSUPvml CC_EQ, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksloge_mvl v256f64:$vz, i32:$vl), (PVFMKSLOvl CC_GE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksupge_mvl v256f64:$vz, i32:$vl), (PVFMKSUPvl CC_GE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksloge_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSLOvml CC_GE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksupge_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSUPvml CC_GE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslole_mvl v256f64:$vz, i32:$vl), (PVFMKSLOvl CC_LE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksuple_mvl v256f64:$vz, i32:$vl), (PVFMKSUPvl CC_LE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslole_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSLOvml CC_LE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksuple_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSUPvml CC_LE, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslonum_mvl v256f64:$vz, i32:$vl), (PVFMKSLOvl CC_NUM, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksupnum_mvl v256f64:$vz, i32:$vl), (PVFMKSUPvl CC_NUM, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslonum_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSLOvml CC_NUM, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksupnum_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSUPvml CC_NUM, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslonan_mvl v256f64:$vz, i32:$vl), (PVFMKSLOvl CC_NAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksupnan_mvl v256f64:$vz, i32:$vl), (PVFMKSUPvl CC_NAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslonan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSLOvml CC_NAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksupnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSUPvml CC_NAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslogtnan_mvl v256f64:$vz, i32:$vl), (PVFMKSLOvl CC_GNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksupgtnan_mvl v256f64:$vz, i32:$vl), (PVFMKSUPvl CC_GNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslogtnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSLOvml CC_GNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksupgtnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSUPvml CC_GNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksloltnan_mvl v256f64:$vz, i32:$vl), (PVFMKSLOvl CC_LNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksupltnan_mvl v256f64:$vz, i32:$vl), (PVFMKSUPvl CC_LNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksloltnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSLOvml CC_LNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksupltnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSUPvml CC_LNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslonenan_mvl v256f64:$vz, i32:$vl), (PVFMKSLOvl CC_NENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksupnenan_mvl v256f64:$vz, i32:$vl), (PVFMKSUPvl CC_NENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslonenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSLOvml CC_NENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksupnenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSUPvml CC_NENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksloeqnan_mvl v256f64:$vz, i32:$vl), (PVFMKSLOvl CC_EQNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksupeqnan_mvl v256f64:$vz, i32:$vl), (PVFMKSUPvl CC_EQNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksloeqnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSLOvml CC_EQNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksupeqnan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSUPvml CC_EQNAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslogenan_mvl v256f64:$vz, i32:$vl), (PVFMKSLOvl CC_GENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksupgenan_mvl v256f64:$vz, i32:$vl), (PVFMKSUPvl CC_GENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslogenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSLOvml CC_GENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksupgenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSUPvml CC_GENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslolenan_mvl v256f64:$vz, i32:$vl), (PVFMKSLOvl CC_LENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksuplenan_mvl v256f64:$vz, i32:$vl), (PVFMKSUPvl CC_LENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslolenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSLOvml CC_LENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksuplenan_mvml v256f64:$vz, v256i1:$vm, i32:$vl), (PVFMKSUPvml CC_LENAN, v256f64:$vz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksgt_Mvl v256f64:$vz, i32:$vl), (VFMKSyvl CC_G, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksgt_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKSyvyl CC_G, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslt_Mvl v256f64:$vz, i32:$vl), (VFMKSyvl CC_L, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslt_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKSyvyl CC_L, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksne_Mvl v256f64:$vz, i32:$vl), (VFMKSyvl CC_NE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksne_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKSyvyl CC_NE, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkseq_Mvl v256f64:$vz, i32:$vl), (VFMKSyvl CC_EQ, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkseq_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKSyvyl CC_EQ, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksge_Mvl v256f64:$vz, i32:$vl), (VFMKSyvl CC_GE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksge_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKSyvyl CC_GE, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksle_Mvl v256f64:$vz, i32:$vl), (VFMKSyvl CC_LE, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksle_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKSyvyl CC_LE, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksnum_Mvl v256f64:$vz, i32:$vl), (VFMKSyvl CC_NUM, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksnum_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKSyvyl CC_NUM, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksnan_Mvl v256f64:$vz, i32:$vl), (VFMKSyvl CC_NAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksnan_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKSyvyl CC_NAN, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksgtnan_Mvl v256f64:$vz, i32:$vl), (VFMKSyvl CC_GNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksgtnan_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKSyvyl CC_GNAN, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksltnan_Mvl v256f64:$vz, i32:$vl), (VFMKSyvl CC_LNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksltnan_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKSyvyl CC_LNAN, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksnenan_Mvl v256f64:$vz, i32:$vl), (VFMKSyvl CC_NENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksnenan_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKSyvyl CC_NENAN, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkseqnan_Mvl v256f64:$vz, i32:$vl), (VFMKSyvl CC_EQNAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkseqnan_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKSyvyl CC_EQNAN, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksgenan_Mvl v256f64:$vz, i32:$vl), (VFMKSyvl CC_GENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmksgenan_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKSyvyl CC_GENAN, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslenan_Mvl v256f64:$vz, i32:$vl), (VFMKSyvl CC_LENAN, v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_pvfmkslenan_MvMl v256f64:$vz, v512i1:$vm, i32:$vl), (VFMKSyvyl CC_LENAN, v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vsumwsx_vvl v256f64:$vy, i32:$vl), (VSUMWSXvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vsumwsx_vvml v256f64:$vy, v256i1:$vm, i32:$vl), (VSUMWSXvml v256f64:$vy, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vsumwzx_vvl v256f64:$vy, i32:$vl), (VSUMWZXvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vsumwzx_vvml v256f64:$vy, v256i1:$vm, i32:$vl), (VSUMWZXvml v256f64:$vy, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vsuml_vvl v256f64:$vy, i32:$vl), (VSUMLvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vsuml_vvml v256f64:$vy, v256i1:$vm, i32:$vl), (VSUMLvml v256f64:$vy, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfsumd_vvl v256f64:$vy, i32:$vl), (VFSUMDvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vfsumd_vvml v256f64:$vy, v256i1:$vm, i32:$vl), (VFSUMDvml v256f64:$vy, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vfsums_vvl v256f64:$vy, i32:$vl), (VFSUMSvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vfsums_vvml v256f64:$vy, v256i1:$vm, i32:$vl), (VFSUMSvml v256f64:$vy, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vrmaxswfstsx_vvl v256f64:$vy, i32:$vl), (VRMAXSWFSTSXvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vrmaxswfstsx_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VRMAXSWFSTSXvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vrmaxswlstsx_vvl v256f64:$vy, i32:$vl), (VRMAXSWLSTSXvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vrmaxswlstsx_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VRMAXSWLSTSXvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vrmaxswfstzx_vvl v256f64:$vy, i32:$vl), (VRMAXSWFSTZXvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vrmaxswfstzx_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VRMAXSWFSTZXvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vrmaxswlstzx_vvl v256f64:$vy, i32:$vl), (VRMAXSWLSTZXvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vrmaxswlstzx_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VRMAXSWLSTZXvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vrminswfstsx_vvl v256f64:$vy, i32:$vl), (VRMINSWFSTSXvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vrminswfstsx_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VRMINSWFSTSXvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vrminswlstsx_vvl v256f64:$vy, i32:$vl), (VRMINSWLSTSXvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vrminswlstsx_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VRMINSWLSTSXvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vrminswfstzx_vvl v256f64:$vy, i32:$vl), (VRMINSWFSTZXvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vrminswfstzx_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VRMINSWFSTZXvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vrminswlstzx_vvl v256f64:$vy, i32:$vl), (VRMINSWLSTZXvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vrminswlstzx_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VRMINSWLSTZXvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vrmaxslfst_vvl v256f64:$vy, i32:$vl), (VRMAXSLFSTvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vrmaxslfst_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VRMAXSLFSTvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vrmaxsllst_vvl v256f64:$vy, i32:$vl), (VRMAXSLLSTvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vrmaxsllst_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VRMAXSLLSTvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vrminslfst_vvl v256f64:$vy, i32:$vl), (VRMINSLFSTvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vrminslfst_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VRMINSLFSTvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vrminsllst_vvl v256f64:$vy, i32:$vl), (VRMINSLLSTvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vrminsllst_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VRMINSLLSTvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfrmaxdfst_vvl v256f64:$vy, i32:$vl), (VFRMAXDFSTvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vfrmaxdfst_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VFRMAXDFSTvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfrmaxdlst_vvl v256f64:$vy, i32:$vl), (VFRMAXDLSTvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vfrmaxdlst_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VFRMAXDLSTvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfrmaxsfst_vvl v256f64:$vy, i32:$vl), (VFRMAXSFSTvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vfrmaxsfst_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VFRMAXSFSTvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfrmaxslst_vvl v256f64:$vy, i32:$vl), (VFRMAXSLSTvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vfrmaxslst_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VFRMAXSLSTvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfrmindfst_vvl v256f64:$vy, i32:$vl), (VFRMINDFSTvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vfrmindfst_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VFRMINDFSTvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfrmindlst_vvl v256f64:$vy, i32:$vl), (VFRMINDLSTvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vfrmindlst_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VFRMINDLSTvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfrminsfst_vvl v256f64:$vy, i32:$vl), (VFRMINSFSTvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vfrminsfst_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VFRMINSFSTvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vfrminslst_vvl v256f64:$vy, i32:$vl), (VFRMINSLSTvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vfrminslst_vvvl v256f64:$vy, v256f64:$pt, i32:$vl), (VFRMINSLSTvl_v v256f64:$vy, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vrand_vvl v256f64:$vy, i32:$vl), (VRANDvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vrand_vvml v256f64:$vy, v256i1:$vm, i32:$vl), (VRANDvml v256f64:$vy, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vror_vvl v256f64:$vy, i32:$vl), (VRORvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vror_vvml v256f64:$vy, v256i1:$vm, i32:$vl), (VRORvml v256f64:$vy, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vrxor_vvl v256f64:$vy, i32:$vl), (VRXORvl v256f64:$vy, i32:$vl)>;
+def : Pat<(int_ve_vl_vrxor_vvml v256f64:$vy, v256i1:$vm, i32:$vl), (VRXORvml v256f64:$vy, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgt_vvssl v256f64:$vy, i64:$sy, i64:$sz, i32:$vl), (VGTvrrl v256f64:$vy, i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vgt_vvssvl v256f64:$vy, i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VGTvrrl_v v256f64:$vy, i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgt_vvssl v256f64:$vy, i64:$sy, zero:$Z, i32:$vl), (VGTvrzl v256f64:$vy, i64:$sy, (LO7 $Z), i32:$vl)>;
+def : Pat<(int_ve_vl_vgt_vvssvl v256f64:$vy, i64:$sy, zero:$Z, v256f64:$pt, i32:$vl), (VGTvrzl_v v256f64:$vy, i64:$sy, (LO7 $Z), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgt_vvssl v256f64:$vy, simm7:$I, i64:$sz, i32:$vl), (VGTvirl v256f64:$vy, (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vgt_vvssvl v256f64:$vy, simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VGTvirl_v v256f64:$vy, (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgt_vvssl v256f64:$vy, simm7:$I, zero:$Z, i32:$vl), (VGTvizl v256f64:$vy, (LO7 $I), (LO7 $Z), i32:$vl)>;
+def : Pat<(int_ve_vl_vgt_vvssvl v256f64:$vy, simm7:$I, zero:$Z, v256f64:$pt, i32:$vl), (VGTvizl_v v256f64:$vy, (LO7 $I), (LO7 $Z), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgt_vvssml v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VGTvrrml v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgt_vvssmvl v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTvrrml_v v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgt_vvssml v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, i32:$vl), (VGTvrzml v256f64:$vy, i64:$sy, (LO7 $Z), v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgt_vvssmvl v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTvrzml_v v256f64:$vy, i64:$sy, (LO7 $Z), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgt_vvssml v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VGTvirml v256f64:$vy, (LO7 $I), i64:$sz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgt_vvssmvl v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTvirml_v v256f64:$vy, (LO7 $I), i64:$sz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgt_vvssml v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, i32:$vl), (VGTvizml v256f64:$vy, (LO7 $I), (LO7 $Z), v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgt_vvssmvl v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTvizml_v v256f64:$vy, (LO7 $I), (LO7 $Z), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtnc_vvssl v256f64:$vy, i64:$sy, i64:$sz, i32:$vl), (VGTNCvrrl v256f64:$vy, i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtnc_vvssvl v256f64:$vy, i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VGTNCvrrl_v v256f64:$vy, i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtnc_vvssl v256f64:$vy, i64:$sy, zero:$Z, i32:$vl), (VGTNCvrzl v256f64:$vy, i64:$sy, (LO7 $Z), i32:$vl)>;
+def : Pat<(int_ve_vl_vgtnc_vvssvl v256f64:$vy, i64:$sy, zero:$Z, v256f64:$pt, i32:$vl), (VGTNCvrzl_v v256f64:$vy, i64:$sy, (LO7 $Z), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtnc_vvssl v256f64:$vy, simm7:$I, i64:$sz, i32:$vl), (VGTNCvirl v256f64:$vy, (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtnc_vvssvl v256f64:$vy, simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VGTNCvirl_v v256f64:$vy, (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtnc_vvssl v256f64:$vy, simm7:$I, zero:$Z, i32:$vl), (VGTNCvizl v256f64:$vy, (LO7 $I), (LO7 $Z), i32:$vl)>;
+def : Pat<(int_ve_vl_vgtnc_vvssvl v256f64:$vy, simm7:$I, zero:$Z, v256f64:$pt, i32:$vl), (VGTNCvizl_v v256f64:$vy, (LO7 $I), (LO7 $Z), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtnc_vvssml v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VGTNCvrrml v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtnc_vvssmvl v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTNCvrrml_v v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtnc_vvssml v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, i32:$vl), (VGTNCvrzml v256f64:$vy, i64:$sy, (LO7 $Z), v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtnc_vvssmvl v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTNCvrzml_v v256f64:$vy, i64:$sy, (LO7 $Z), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtnc_vvssml v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VGTNCvirml v256f64:$vy, (LO7 $I), i64:$sz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtnc_vvssmvl v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTNCvirml_v v256f64:$vy, (LO7 $I), i64:$sz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtnc_vvssml v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, i32:$vl), (VGTNCvizml v256f64:$vy, (LO7 $I), (LO7 $Z), v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtnc_vvssmvl v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTNCvizml_v v256f64:$vy, (LO7 $I), (LO7 $Z), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtu_vvssl v256f64:$vy, i64:$sy, i64:$sz, i32:$vl), (VGTUvrrl v256f64:$vy, i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtu_vvssvl v256f64:$vy, i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VGTUvrrl_v v256f64:$vy, i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtu_vvssl v256f64:$vy, i64:$sy, zero:$Z, i32:$vl), (VGTUvrzl v256f64:$vy, i64:$sy, (LO7 $Z), i32:$vl)>;
+def : Pat<(int_ve_vl_vgtu_vvssvl v256f64:$vy, i64:$sy, zero:$Z, v256f64:$pt, i32:$vl), (VGTUvrzl_v v256f64:$vy, i64:$sy, (LO7 $Z), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtu_vvssl v256f64:$vy, simm7:$I, i64:$sz, i32:$vl), (VGTUvirl v256f64:$vy, (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtu_vvssvl v256f64:$vy, simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VGTUvirl_v v256f64:$vy, (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtu_vvssl v256f64:$vy, simm7:$I, zero:$Z, i32:$vl), (VGTUvizl v256f64:$vy, (LO7 $I), (LO7 $Z), i32:$vl)>;
+def : Pat<(int_ve_vl_vgtu_vvssvl v256f64:$vy, simm7:$I, zero:$Z, v256f64:$pt, i32:$vl), (VGTUvizl_v v256f64:$vy, (LO7 $I), (LO7 $Z), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtu_vvssml v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VGTUvrrml v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtu_vvssmvl v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTUvrrml_v v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtu_vvssml v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, i32:$vl), (VGTUvrzml v256f64:$vy, i64:$sy, (LO7 $Z), v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtu_vvssmvl v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTUvrzml_v v256f64:$vy, i64:$sy, (LO7 $Z), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtu_vvssml v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VGTUvirml v256f64:$vy, (LO7 $I), i64:$sz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtu_vvssmvl v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTUvirml_v v256f64:$vy, (LO7 $I), i64:$sz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtu_vvssml v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, i32:$vl), (VGTUvizml v256f64:$vy, (LO7 $I), (LO7 $Z), v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtu_vvssmvl v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTUvizml_v v256f64:$vy, (LO7 $I), (LO7 $Z), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtunc_vvssl v256f64:$vy, i64:$sy, i64:$sz, i32:$vl), (VGTUNCvrrl v256f64:$vy, i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtunc_vvssvl v256f64:$vy, i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VGTUNCvrrl_v v256f64:$vy, i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtunc_vvssl v256f64:$vy, i64:$sy, zero:$Z, i32:$vl), (VGTUNCvrzl v256f64:$vy, i64:$sy, (LO7 $Z), i32:$vl)>;
+def : Pat<(int_ve_vl_vgtunc_vvssvl v256f64:$vy, i64:$sy, zero:$Z, v256f64:$pt, i32:$vl), (VGTUNCvrzl_v v256f64:$vy, i64:$sy, (LO7 $Z), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtunc_vvssl v256f64:$vy, simm7:$I, i64:$sz, i32:$vl), (VGTUNCvirl v256f64:$vy, (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtunc_vvssvl v256f64:$vy, simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VGTUNCvirl_v v256f64:$vy, (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtunc_vvssl v256f64:$vy, simm7:$I, zero:$Z, i32:$vl), (VGTUNCvizl v256f64:$vy, (LO7 $I), (LO7 $Z), i32:$vl)>;
+def : Pat<(int_ve_vl_vgtunc_vvssvl v256f64:$vy, simm7:$I, zero:$Z, v256f64:$pt, i32:$vl), (VGTUNCvizl_v v256f64:$vy, (LO7 $I), (LO7 $Z), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtunc_vvssml v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VGTUNCvrrml v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtunc_vvssmvl v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTUNCvrrml_v v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtunc_vvssml v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, i32:$vl), (VGTUNCvrzml v256f64:$vy, i64:$sy, (LO7 $Z), v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtunc_vvssmvl v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTUNCvrzml_v v256f64:$vy, i64:$sy, (LO7 $Z), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtunc_vvssml v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VGTUNCvirml v256f64:$vy, (LO7 $I), i64:$sz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtunc_vvssmvl v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTUNCvirml_v v256f64:$vy, (LO7 $I), i64:$sz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtunc_vvssml v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, i32:$vl), (VGTUNCvizml v256f64:$vy, (LO7 $I), (LO7 $Z), v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtunc_vvssmvl v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTUNCvizml_v v256f64:$vy, (LO7 $I), (LO7 $Z), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlsx_vvssl v256f64:$vy, i64:$sy, i64:$sz, i32:$vl), (VGTLSXvrrl v256f64:$vy, i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlsx_vvssvl v256f64:$vy, i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VGTLSXvrrl_v v256f64:$vy, i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlsx_vvssl v256f64:$vy, i64:$sy, zero:$Z, i32:$vl), (VGTLSXvrzl v256f64:$vy, i64:$sy, (LO7 $Z), i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlsx_vvssvl v256f64:$vy, i64:$sy, zero:$Z, v256f64:$pt, i32:$vl), (VGTLSXvrzl_v v256f64:$vy, i64:$sy, (LO7 $Z), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlsx_vvssl v256f64:$vy, simm7:$I, i64:$sz, i32:$vl), (VGTLSXvirl v256f64:$vy, (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlsx_vvssvl v256f64:$vy, simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VGTLSXvirl_v v256f64:$vy, (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlsx_vvssl v256f64:$vy, simm7:$I, zero:$Z, i32:$vl), (VGTLSXvizl v256f64:$vy, (LO7 $I), (LO7 $Z), i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlsx_vvssvl v256f64:$vy, simm7:$I, zero:$Z, v256f64:$pt, i32:$vl), (VGTLSXvizl_v v256f64:$vy, (LO7 $I), (LO7 $Z), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlsx_vvssml v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VGTLSXvrrml v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlsx_vvssmvl v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTLSXvrrml_v v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlsx_vvssml v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, i32:$vl), (VGTLSXvrzml v256f64:$vy, i64:$sy, (LO7 $Z), v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlsx_vvssmvl v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTLSXvrzml_v v256f64:$vy, i64:$sy, (LO7 $Z), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlsx_vvssml v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VGTLSXvirml v256f64:$vy, (LO7 $I), i64:$sz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlsx_vvssmvl v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTLSXvirml_v v256f64:$vy, (LO7 $I), i64:$sz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlsx_vvssml v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, i32:$vl), (VGTLSXvizml v256f64:$vy, (LO7 $I), (LO7 $Z), v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlsx_vvssmvl v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTLSXvizml_v v256f64:$vy, (LO7 $I), (LO7 $Z), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlsxnc_vvssl v256f64:$vy, i64:$sy, i64:$sz, i32:$vl), (VGTLSXNCvrrl v256f64:$vy, i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlsxnc_vvssvl v256f64:$vy, i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VGTLSXNCvrrl_v v256f64:$vy, i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlsxnc_vvssl v256f64:$vy, i64:$sy, zero:$Z, i32:$vl), (VGTLSXNCvrzl v256f64:$vy, i64:$sy, (LO7 $Z), i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlsxnc_vvssvl v256f64:$vy, i64:$sy, zero:$Z, v256f64:$pt, i32:$vl), (VGTLSXNCvrzl_v v256f64:$vy, i64:$sy, (LO7 $Z), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlsxnc_vvssl v256f64:$vy, simm7:$I, i64:$sz, i32:$vl), (VGTLSXNCvirl v256f64:$vy, (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlsxnc_vvssvl v256f64:$vy, simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VGTLSXNCvirl_v v256f64:$vy, (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlsxnc_vvssl v256f64:$vy, simm7:$I, zero:$Z, i32:$vl), (VGTLSXNCvizl v256f64:$vy, (LO7 $I), (LO7 $Z), i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlsxnc_vvssvl v256f64:$vy, simm7:$I, zero:$Z, v256f64:$pt, i32:$vl), (VGTLSXNCvizl_v v256f64:$vy, (LO7 $I), (LO7 $Z), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlsxnc_vvssml v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VGTLSXNCvrrml v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlsxnc_vvssmvl v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTLSXNCvrrml_v v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlsxnc_vvssml v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, i32:$vl), (VGTLSXNCvrzml v256f64:$vy, i64:$sy, (LO7 $Z), v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlsxnc_vvssmvl v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTLSXNCvrzml_v v256f64:$vy, i64:$sy, (LO7 $Z), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlsxnc_vvssml v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VGTLSXNCvirml v256f64:$vy, (LO7 $I), i64:$sz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlsxnc_vvssmvl v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTLSXNCvirml_v v256f64:$vy, (LO7 $I), i64:$sz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlsxnc_vvssml v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, i32:$vl), (VGTLSXNCvizml v256f64:$vy, (LO7 $I), (LO7 $Z), v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlsxnc_vvssmvl v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTLSXNCvizml_v v256f64:$vy, (LO7 $I), (LO7 $Z), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlzx_vvssl v256f64:$vy, i64:$sy, i64:$sz, i32:$vl), (VGTLZXvrrl v256f64:$vy, i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlzx_vvssvl v256f64:$vy, i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VGTLZXvrrl_v v256f64:$vy, i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlzx_vvssl v256f64:$vy, i64:$sy, zero:$Z, i32:$vl), (VGTLZXvrzl v256f64:$vy, i64:$sy, (LO7 $Z), i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlzx_vvssvl v256f64:$vy, i64:$sy, zero:$Z, v256f64:$pt, i32:$vl), (VGTLZXvrzl_v v256f64:$vy, i64:$sy, (LO7 $Z), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlzx_vvssl v256f64:$vy, simm7:$I, i64:$sz, i32:$vl), (VGTLZXvirl v256f64:$vy, (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlzx_vvssvl v256f64:$vy, simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VGTLZXvirl_v v256f64:$vy, (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlzx_vvssl v256f64:$vy, simm7:$I, zero:$Z, i32:$vl), (VGTLZXvizl v256f64:$vy, (LO7 $I), (LO7 $Z), i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlzx_vvssvl v256f64:$vy, simm7:$I, zero:$Z, v256f64:$pt, i32:$vl), (VGTLZXvizl_v v256f64:$vy, (LO7 $I), (LO7 $Z), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlzx_vvssml v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VGTLZXvrrml v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlzx_vvssmvl v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTLZXvrrml_v v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlzx_vvssml v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, i32:$vl), (VGTLZXvrzml v256f64:$vy, i64:$sy, (LO7 $Z), v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlzx_vvssmvl v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTLZXvrzml_v v256f64:$vy, i64:$sy, (LO7 $Z), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlzx_vvssml v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VGTLZXvirml v256f64:$vy, (LO7 $I), i64:$sz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlzx_vvssmvl v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTLZXvirml_v v256f64:$vy, (LO7 $I), i64:$sz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlzx_vvssml v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, i32:$vl), (VGTLZXvizml v256f64:$vy, (LO7 $I), (LO7 $Z), v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlzx_vvssmvl v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTLZXvizml_v v256f64:$vy, (LO7 $I), (LO7 $Z), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlzxnc_vvssl v256f64:$vy, i64:$sy, i64:$sz, i32:$vl), (VGTLZXNCvrrl v256f64:$vy, i64:$sy, i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlzxnc_vvssvl v256f64:$vy, i64:$sy, i64:$sz, v256f64:$pt, i32:$vl), (VGTLZXNCvrrl_v v256f64:$vy, i64:$sy, i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlzxnc_vvssl v256f64:$vy, i64:$sy, zero:$Z, i32:$vl), (VGTLZXNCvrzl v256f64:$vy, i64:$sy, (LO7 $Z), i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlzxnc_vvssvl v256f64:$vy, i64:$sy, zero:$Z, v256f64:$pt, i32:$vl), (VGTLZXNCvrzl_v v256f64:$vy, i64:$sy, (LO7 $Z), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlzxnc_vvssl v256f64:$vy, simm7:$I, i64:$sz, i32:$vl), (VGTLZXNCvirl v256f64:$vy, (LO7 $I), i64:$sz, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlzxnc_vvssvl v256f64:$vy, simm7:$I, i64:$sz, v256f64:$pt, i32:$vl), (VGTLZXNCvirl_v v256f64:$vy, (LO7 $I), i64:$sz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlzxnc_vvssl v256f64:$vy, simm7:$I, zero:$Z, i32:$vl), (VGTLZXNCvizl v256f64:$vy, (LO7 $I), (LO7 $Z), i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlzxnc_vvssvl v256f64:$vy, simm7:$I, zero:$Z, v256f64:$pt, i32:$vl), (VGTLZXNCvizl_v v256f64:$vy, (LO7 $I), (LO7 $Z), i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlzxnc_vvssml v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VGTLZXNCvrrml v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlzxnc_vvssmvl v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTLZXNCvrrml_v v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlzxnc_vvssml v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, i32:$vl), (VGTLZXNCvrzml v256f64:$vy, i64:$sy, (LO7 $Z), v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlzxnc_vvssmvl v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTLZXNCvrzml_v v256f64:$vy, i64:$sy, (LO7 $Z), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlzxnc_vvssml v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VGTLZXNCvirml v256f64:$vy, (LO7 $I), i64:$sz, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlzxnc_vvssmvl v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTLZXNCvirml_v v256f64:$vy, (LO7 $I), i64:$sz, v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vgtlzxnc_vvssml v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, i32:$vl), (VGTLZXNCvizml v256f64:$vy, (LO7 $I), (LO7 $Z), v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vgtlzxnc_vvssmvl v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, v256f64:$pt, i32:$vl), (VGTLZXNCvizml_v v256f64:$vy, (LO7 $I), (LO7 $Z), v256i1:$vm, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vsc_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, i32:$vl), (VSCvrrvl v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vsc_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, i32:$vl), (VSCvrzvl v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vsc_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, i32:$vl), (VSCvirvl v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vsc_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, i32:$vl), (VSCvizvl v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vsc_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSCvrrvml v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vsc_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, i32:$vl), (VSCvrzvml v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vsc_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSCvirvml v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vsc_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, i32:$vl), (VSCvizvml v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscnc_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, i32:$vl), (VSCNCvrrvl v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscnc_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, i32:$vl), (VSCNCvrzvl v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscnc_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, i32:$vl), (VSCNCvirvl v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscnc_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, i32:$vl), (VSCNCvizvl v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscnc_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSCNCvrrvml v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscnc_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, i32:$vl), (VSCNCvrzvml v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscnc_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSCNCvirvml v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscnc_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, i32:$vl), (VSCNCvizvml v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscot_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, i32:$vl), (VSCOTvrrvl v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscot_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, i32:$vl), (VSCOTvrzvl v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscot_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, i32:$vl), (VSCOTvirvl v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscot_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, i32:$vl), (VSCOTvizvl v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscot_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSCOTvrrvml v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscot_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, i32:$vl), (VSCOTvrzvml v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscot_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSCOTvirvml v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscot_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, i32:$vl), (VSCOTvizvml v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscncot_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, i32:$vl), (VSCNCOTvrrvl v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscncot_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, i32:$vl), (VSCNCOTvrzvl v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscncot_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, i32:$vl), (VSCNCOTvirvl v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscncot_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, i32:$vl), (VSCNCOTvizvl v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscncot_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSCNCOTvrrvml v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscncot_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, i32:$vl), (VSCNCOTvrzvml v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscncot_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSCNCOTvirvml v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscncot_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, i32:$vl), (VSCNCOTvizvml v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscu_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, i32:$vl), (VSCUvrrvl v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscu_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, i32:$vl), (VSCUvrzvl v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscu_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, i32:$vl), (VSCUvirvl v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscu_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, i32:$vl), (VSCUvizvl v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscu_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSCUvrrvml v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscu_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, i32:$vl), (VSCUvrzvml v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscu_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSCUvirvml v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscu_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, i32:$vl), (VSCUvizvml v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscunc_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, i32:$vl), (VSCUNCvrrvl v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscunc_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, i32:$vl), (VSCUNCvrzvl v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscunc_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, i32:$vl), (VSCUNCvirvl v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscunc_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, i32:$vl), (VSCUNCvizvl v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscunc_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSCUNCvrrvml v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscunc_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, i32:$vl), (VSCUNCvrzvml v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscunc_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSCUNCvirvml v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscunc_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, i32:$vl), (VSCUNCvizvml v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscuot_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, i32:$vl), (VSCUOTvrrvl v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscuot_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, i32:$vl), (VSCUOTvrzvl v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscuot_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, i32:$vl), (VSCUOTvirvl v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscuot_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, i32:$vl), (VSCUOTvizvl v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscuot_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSCUOTvrrvml v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscuot_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, i32:$vl), (VSCUOTvrzvml v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscuot_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSCUOTvirvml v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscuot_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, i32:$vl), (VSCUOTvizvml v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscuncot_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, i32:$vl), (VSCUNCOTvrrvl v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscuncot_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, i32:$vl), (VSCUNCOTvrzvl v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscuncot_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, i32:$vl), (VSCUNCOTvirvl v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscuncot_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, i32:$vl), (VSCUNCOTvizvl v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscuncot_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSCUNCOTvrrvml v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscuncot_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, i32:$vl), (VSCUNCOTvrzvml v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscuncot_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSCUNCOTvirvml v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscuncot_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, i32:$vl), (VSCUNCOTvizvml v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscl_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, i32:$vl), (VSCLvrrvl v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscl_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, i32:$vl), (VSCLvrzvl v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscl_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, i32:$vl), (VSCLvirvl v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscl_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, i32:$vl), (VSCLvizvl v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vscl_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSCLvrrvml v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscl_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, i32:$vl), (VSCLvrzvml v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscl_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSCLvirvml v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vscl_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, i32:$vl), (VSCLvizvml v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclnc_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, i32:$vl), (VSCLNCvrrvl v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclnc_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, i32:$vl), (VSCLNCvrzvl v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclnc_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, i32:$vl), (VSCLNCvirvl v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclnc_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, i32:$vl), (VSCLNCvizvl v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclnc_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSCLNCvrrvml v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclnc_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, i32:$vl), (VSCLNCvrzvml v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclnc_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSCLNCvirvml v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclnc_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, i32:$vl), (VSCLNCvizvml v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclot_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, i32:$vl), (VSCLOTvrrvl v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclot_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, i32:$vl), (VSCLOTvrzvl v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclot_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, i32:$vl), (VSCLOTvirvl v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclot_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, i32:$vl), (VSCLOTvizvl v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclot_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSCLOTvrrvml v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclot_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, i32:$vl), (VSCLOTvrzvml v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclot_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSCLOTvirvml v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclot_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, i32:$vl), (VSCLOTvizvml v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclncot_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, i32:$vl), (VSCLNCOTvrrvl v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclncot_vvssl v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, i32:$vl), (VSCLNCOTvrzvl v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclncot_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, i32:$vl), (VSCLNCOTvirvl v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclncot_vvssl v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, i32:$vl), (VSCLNCOTvizvl v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclncot_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, i64:$sz, v256i1:$vm, i32:$vl), (VSCLNCOTvrrvml v256f64:$vy, i64:$sy, i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclncot_vvssml v256f64:$vx, v256f64:$vy, i64:$sy, zero:$Z, v256i1:$vm, i32:$vl), (VSCLNCOTvrzvml v256f64:$vy, i64:$sy, (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclncot_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, i64:$sz, v256i1:$vm, i32:$vl), (VSCLNCOTvirvml v256f64:$vy, (LO7 $I), i64:$sz, v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vsclncot_vvssml v256f64:$vx, v256f64:$vy, simm7:$I, zero:$Z, v256i1:$vm, i32:$vl), (VSCLNCOTvizvml v256f64:$vy, (LO7 $I), (LO7 $Z), v256f64:$vx, v256i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_andm_mmm v256i1:$vmy, v256i1:$vmz), (ANDMmm v256i1:$vmy, v256i1:$vmz)>;
+def : Pat<(int_ve_vl_andm_MMM v512i1:$vmy, v512i1:$vmz), (ANDMyy v512i1:$vmy, v512i1:$vmz)>;
+def : Pat<(int_ve_vl_orm_mmm v256i1:$vmy, v256i1:$vmz), (ORMmm v256i1:$vmy, v256i1:$vmz)>;
+def : Pat<(int_ve_vl_orm_MMM v512i1:$vmy, v512i1:$vmz), (ORMyy v512i1:$vmy, v512i1:$vmz)>;
+def : Pat<(int_ve_vl_xorm_mmm v256i1:$vmy, v256i1:$vmz), (XORMmm v256i1:$vmy, v256i1:$vmz)>;
+def : Pat<(int_ve_vl_xorm_MMM v512i1:$vmy, v512i1:$vmz), (XORMyy v512i1:$vmy, v512i1:$vmz)>;
+def : Pat<(int_ve_vl_eqvm_mmm v256i1:$vmy, v256i1:$vmz), (EQVMmm v256i1:$vmy, v256i1:$vmz)>;
+def : Pat<(int_ve_vl_eqvm_MMM v512i1:$vmy, v512i1:$vmz), (EQVMyy v512i1:$vmy, v512i1:$vmz)>;
+def : Pat<(int_ve_vl_nndm_mmm v256i1:$vmy, v256i1:$vmz), (NNDMmm v256i1:$vmy, v256i1:$vmz)>;
+def : Pat<(int_ve_vl_nndm_MMM v512i1:$vmy, v512i1:$vmz), (NNDMyy v512i1:$vmy, v512i1:$vmz)>;
+def : Pat<(int_ve_vl_negm_mm v256i1:$vmy), (NEGMm v256i1:$vmy)>;
+def : Pat<(int_ve_vl_negm_MM v512i1:$vmy), (NEGMy v512i1:$vmy)>;
+def : Pat<(int_ve_vl_pcvm_sml v256i1:$vmy, i32:$vl), (PCVMml v256i1:$vmy, i32:$vl)>;
+def : Pat<(int_ve_vl_lzvm_sml v256i1:$vmy, i32:$vl), (LZVMml v256i1:$vmy, i32:$vl)>;
+def : Pat<(int_ve_vl_tovm_sml v256i1:$vmy, i32:$vl), (TOVMml v256i1:$vmy, i32:$vl)>;
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VEInstrIntrinsicVL.td b/contrib/llvm-project/llvm/lib/Target/VE/VEInstrIntrinsicVL.td
new file mode 100644
index 000000000000..69ea133ceed0
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VEInstrIntrinsicVL.td
@@ -0,0 +1,64 @@
+/// Pattern matching for VEL intrinsic instructions.
+
+/// Intrinsic patterns written by hand.
+
+// SVOB pattern.
+def : Pat<(int_ve_vl_svob), (SVOB)>;
+
+// Pack patterns.
+def : Pat<(i64 (int_ve_vl_pack_f32p ADDRrii:$addr0, ADDRrii:$addr1)),
+ (ORrr (f2l (LDUrii MEMrii:$addr0)),
+ (i2l (LDLZXrii MEMrii:$addr1)))>;
+
+def : Pat<(i64 (int_ve_vl_pack_f32a ADDRrii:$addr)),
+ (MULULrr
+ (i2l (LDLZXrii MEMrii:$addr)),
+ (LEASLrii (ANDrm (LEAzii 0, 0, (LO32 (i64 0x0000000100000001))),
+ !add(32, 64)), 0,
+ (HI32 (i64 0x0000000100000001))))>;
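+// Note: pack_f32p packs the 32-bit values at $addr0 and $addr1 into one i64;
+// LDU loads into the upper half of the scalar register, LDLZX zero-extends
+// into the lower half, and ORrr merges them.  pack_f32a instead replicates a
+// single 32-bit value into both halves by multiplying it with the constant
+// 0x0000000100000001.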
+
+// The extract/insert patterns.
+def : Pat<(v256i1 (int_ve_vl_extract_vm512u v512i1:$vm)),
+ (EXTRACT_SUBREG v512i1:$vm, sub_vm_even)>;
+
+def : Pat<(v256i1 (int_ve_vl_extract_vm512l v512i1:$vm)),
+ (EXTRACT_SUBREG v512i1:$vm, sub_vm_odd)>;
+
+def : Pat<(v512i1 (int_ve_vl_insert_vm512u v512i1:$vmx, v256i1:$vmy)),
+ (INSERT_SUBREG v512i1:$vmx, v256i1:$vmy, sub_vm_even)>;
+
+def : Pat<(v512i1 (int_ve_vl_insert_vm512l v512i1:$vmx, v256i1:$vmy)),
+ (INSERT_SUBREG v512i1:$vmx, v256i1:$vmy, sub_vm_odd)>;
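+// Note: as the patterns above encode, the "u" half of a VM512 pair lives in
+// the even-numbered VM sub-register (sub_vm_even) and the "l" half in the
+// odd one (sub_vm_odd).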
+
+// VMRG patterns.
+def : Pat<(int_ve_vl_vmrgw_vsvMl i32:$sy, v256f64:$vz, v512i1:$vm, i32:$vl),
+ (VMRGWrvml (i2l i32:$sy), v256f64:$vz, v512i1:$vm, i32:$vl)>;
+def : Pat<(int_ve_vl_vmrgw_vsvMvl i32:$sy, v256f64:$vz, v512i1:$vm,
+ v256f64:$pt, i32:$vl),
+ (VMRGWrvml_v (i2l i32:$sy), v256f64:$vz, v512i1:$vm, i32:$vl,
+ v256f64:$pt)>;
+
+// VMV patterns.
+def : Pat<(int_ve_vl_vmv_vsvl i32:$sy, v256f64:$vz, i32:$vl),
+ (VMVrvl (i2l i32:$sy), v256f64:$vz, i32:$vl)>;
+def : Pat<(int_ve_vl_vmv_vsvvl i32:$sy, v256f64:$vz, v256f64:$pt, i32:$vl),
+ (VMVrvl_v (i2l i32:$sy), v256f64:$vz, i32:$vl, v256f64:$pt)>;
+def : Pat<(int_ve_vl_vmv_vsvmvl i32:$sy, v256f64:$vz, v256i1:$vm, v256f64:$pt,
+ i32:$vl),
+ (VMVrvml_v (i2l i32:$sy), v256f64:$vz, v256i1:$vm, i32:$vl,
+ v256f64:$pt)>;
+
+// LSV patterns.
+def : Pat<(int_ve_vl_lsv_vvss v256f64:$pt, i32:$sy, i64:$sz),
+ (LSVrr_v (i2l i32:$sy), i64:$sz, v256f64:$pt)>;
+
+// LVS patterns.
+def : Pat<(int_ve_vl_lvsl_svs v256f64:$vx, i32:$sy),
+ (LVSvr v256f64:$vx, (i2l i32:$sy))>;
+def : Pat<(int_ve_vl_lvsd_svs v256f64:$vx, i32:$sy),
+ (LVSvr v256f64:$vx, (i2l i32:$sy))>;
+def : Pat<(int_ve_vl_lvss_svs v256f64:$vx, i32:$sy),
+ (l2f (LVSvr v256f64:$vx, (i2l i32:$sy)))>;
+
+/// Intrinsic patterns automatically generated.
+include "VEInstrIntrinsicVL.gen.td"
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VEInstrPatternsVec.td b/contrib/llvm-project/llvm/lib/Target/VE/VEInstrPatternsVec.td
new file mode 100644
index 000000000000..0084876f9f1b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VEInstrPatternsVec.td
@@ -0,0 +1,91 @@
+//===-- VEInstrPatternsVec.td - VEC_-type SDNodes and isel for VE Target --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the VEC_* prefixed intermediate SDNodes and their
+// isel patterns.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+multiclass vbrd_elem32<ValueType v32, ValueType s32, SDPatternOperator ImmOp,
+ SDNodeXForm ImmCast, SDNodeXForm SuperRegCast> {
+ // VBRDil
+ def : Pat<(v32 (vec_broadcast (s32 ImmOp:$sy), i32:$vl)),
+ (VBRDil (ImmCast $sy), i32:$vl)>;
+
+ // VBRDrl
+ def : Pat<(v32 (vec_broadcast s32:$sy, i32:$vl)),
+ (VBRDrl (SuperRegCast $sy), i32:$vl)>;
+}
+
+multiclass vbrd_elem64<ValueType v64, ValueType s64,
+ SDPatternOperator ImmOp, SDNodeXForm ImmCast> {
+ // VBRDil
+ def : Pat<(v64 (vec_broadcast (s64 ImmOp:$sy), i32:$vl)),
+ (VBRDil (ImmCast $sy), i32:$vl)>;
+
+ // VBRDrl
+ def : Pat<(v64 (vec_broadcast s64:$sy, i32:$vl)),
+ (VBRDrl s64:$sy, i32:$vl)>;
+}
+
+multiclass extract_insert_elem32<ValueType v32, ValueType s32,
+ SDNodeXForm SubRegCast,
+ SDNodeXForm SuperRegCast> {
+ // LVSvi
+ def: Pat<(s32 (extractelt v32:$vec, uimm7:$idx)),
+ (SubRegCast (LVSvi v32:$vec, (ULO7 $idx)))>;
+ // LVSvr
+ def: Pat<(s32 (extractelt v32:$vec, i64:$idx)),
+ (SubRegCast (LVSvr v32:$vec, $idx))>;
+
+ // LSVir
+ def: Pat<(v32 (insertelt v32:$vec, s32:$val, uimm7:$idx)),
+ (LSVir_v (ULO7 $idx), (SuperRegCast $val), $vec)>;
+ // LSVrr
+ def: Pat<(v32 (insertelt v32:$vec, s32:$val, i64:$idx)),
+ (LSVrr_v $idx, (SuperRegCast $val), $vec)>;
+}
+
+multiclass extract_insert_elem64<ValueType v64, ValueType s64> {
+ // LVSvi
+ def: Pat<(s64 (extractelt v64:$vec, uimm7:$idx)),
+ (LVSvi v64:$vec, (ULO7 $idx))>;
+ // LVSvr
+ def: Pat<(s64 (extractelt v64:$vec, i64:$idx)),
+ (LVSvr v64:$vec, $idx)>;
+
+ // LSVir
+ def: Pat<(v64 (insertelt v64:$vec, s64:$val, uimm7:$idx)),
+ (LSVir_v (ULO7 $idx), $val, $vec)>;
+ // LSVrr
+ def: Pat<(v64 (insertelt v64:$vec, s64:$val, i64:$idx)),
+ (LSVrr_v $idx, $val, $vec)>;
+}
+
+multiclass patterns_elem32<ValueType v32, ValueType s32,
+ SDPatternOperator ImmOp, SDNodeXForm ImmCast,
+ SDNodeXForm SubRegCast, SDNodeXForm SuperRegCast> {
+ defm : vbrd_elem32<v32, s32, ImmOp, ImmCast, SuperRegCast>;
+ defm : extract_insert_elem32<v32, s32, SubRegCast, SuperRegCast>;
+}
+
+multiclass patterns_elem64<ValueType v64, ValueType s64,
+ SDPatternOperator ImmOp, SDNodeXForm ImmCast> {
+ defm : vbrd_elem64<v64, s64, ImmOp, ImmCast>;
+ defm : extract_insert_elem64<v64, s64>;
+}
+
+defm : patterns_elem32<v256i32, i32, simm7, LO7, l2i, i2l>;
+defm : patterns_elem32<v256f32, f32, simm7fp, LO7FP, l2f, f2l>;
+
+defm : patterns_elem64<v256i64, i64, simm7, LO7>;
+defm : patterns_elem64<v256f64, f64, simm7fp, LO7FP>;
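+
+// Note: as one example, the v256i32/i32 instantiation above yields an extract
+// pattern equivalent to
+//
+//   def : Pat<(i32 (extractelt v256i32:$vec, uimm7:$idx)),
+//             (l2i (LVSvi v256i32:$vec, (ULO7 $idx)))>;
+//
+// i.e. the element is read with LVS and then narrowed from the 64-bit scalar
+// register with the l2i sub-register cast.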
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VEInstrVec.td b/contrib/llvm-project/llvm/lib/Target/VE/VEInstrVec.td
new file mode 100644
index 000000000000..4a8476f7288a
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VEInstrVec.td
@@ -0,0 +1,1510 @@
+//===----------------------------------------------------------------------===//
+// Vector Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions for VM512 modifications
+//===----------------------------------------------------------------------===//
+
+// LVM/SVM instructions using VM512
+let hasSideEffects = 0, isCodeGenOnly = 1 in {
+ let Constraints = "$vx = $vd", DisableEncoding = "$vd" in {
+ def LVMyir_y : Pseudo<(outs VM512:$vx), (ins uimm3:$sy, I64:$sz, VM512:$vd),
+ "# pseudo LVM $vx, $sy, $sz, $vd">;
+ def LVMyim_y : Pseudo<(outs VM512:$vx),
+ (ins uimm3:$sy, mimm:$sz, VM512:$vd),
+ "# pseudo LVM $vx, $sy, $sz, $vd">;
+ }
+ def LVMyir : Pseudo<(outs VM512:$vx), (ins uimm3:$sy, I64:$sz),
+ "# pseudo LVM $vx, $sy, $sz">;
+ def LVMyim : Pseudo<(outs VM512:$vx), (ins uimm3:$sy, mimm:$sz),
+ "# pseudo LVM $vx, $sy, $sz">;
+ def SVMyi : Pseudo<(outs I64:$sx), (ins VM512:$vz, uimm3:$sy),
+ "# pseudo SVM $sx, $vz, $sy">;
+}
+
+// VFMK/VFMKW/VFMKS instructions using VM512
+let hasSideEffects = 0, isCodeGenOnly = 1, DisableEncoding = "$vl" in {
+ def VFMKyal : Pseudo<(outs VM512:$vmx), (ins I32:$vl),
+ "# pseudo-vfmk.at $vmx">;
+ def VFMKynal : Pseudo<(outs VM512:$vmx), (ins I32:$vl),
+ "# pseudo-vfmk.af $vmx">;
+ def VFMKWyvl : Pseudo<(outs VM512:$vmx),
+ (ins CCOp:$cf, V64:$vz, I32:$vl),
+ "# pseudo-vfmk.w.$cf $vmx, $vz">;
+ def VFMKWyvyl : Pseudo<(outs VM512:$vmx),
+ (ins CCOp:$cf, V64:$vz, VM512:$vm, I32:$vl),
+ "# pseudo-vfmk.w.$cf $vmx, $vz, $vm">;
+ def VFMKSyvl : Pseudo<(outs VM512:$vmx),
+ (ins CCOp:$cf, V64:$vz, I32:$vl),
+ "# pseudo-vfmk.s.$cf $vmx, $vz">;
+ def VFMKSyvyl : Pseudo<(outs VM512:$vmx),
+ (ins CCOp:$cf, V64:$vz, VM512:$vm, I32:$vl),
+ "# pseudo-vfmk.s.$cf $vmx, $vz, $vm">;
+}
+
+// ANDM/ORM/XORM/EQVM/NNDM/NEGM instructions using VM512
+let hasSideEffects = 0, isCodeGenOnly = 1 in {
+ def ANDMyy : Pseudo<(outs VM512:$vmx), (ins VM512:$vmy, VM512:$vmz),
+ "# andm $vmx, $vmy, $vmz">;
+ def ORMyy : Pseudo<(outs VM512:$vmx), (ins VM512:$vmy, VM512:$vmz),
+ "# orm $vmx, $vmy, $vmz">;
+ def XORMyy : Pseudo<(outs VM512:$vmx), (ins VM512:$vmy, VM512:$vmz),
+ "# xorm $vmx, $vmy, $vmz">;
+ def EQVMyy : Pseudo<(outs VM512:$vmx), (ins VM512:$vmy, VM512:$vmz),
+ "# eqvm $vmx, $vmy, $vmz">;
+ def NNDMyy : Pseudo<(outs VM512:$vmx), (ins VM512:$vmy, VM512:$vmz),
+ "# nndm $vmx, $vmy, $vmz">;
+ def NEGMy : Pseudo<(outs VM512:$vmx), (ins VM512:$vmy),
+ "# negm $vmx, $vmy">;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//
+// Define all vector instructions described in the SX-Aurora TSUBASA
+// Architecture Guide here. For their mnemonics, we use the mnemonics defined
+// in the Vector Engine Assembly Language Reference Manual.
+//
+// Some instructions can update existing data using an instruction sequence
+// like the following:
+//
+// lea %s0, 256
+// lea %s1, 128
+// lvl %s0
+// vbrd %v0, 2 # v0 = { 2, 2, 2, ..., 2, 2, 2 }
+// lvl %s1
+// vbrd %v0, 3 # v0 = { 3, 3, 3, ..., 3, 2, 2, 2, ..., 2, 2, 2 }
+//
+// In order to represent the above with virtual registers, we define
+// instructions with an additional base register and a `_v` suffix in the
+// mnemonic.
+//
+// lea t0, 256
+// lea t1, 128
+// lvl t0
+// vbrd tv0, 2
+// lvl t1
+// vbrd_v tv1, 3, tv0
+//
+// We also have some instructions that use the VL register with a pseudo VL
+// value, indicated by the following suffixes in the mnemonic:
+//
+// l: have an additional I32 register to represent the VL value.
+// L: have an additional VL register to represent the VL value.
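+//
+// Note: combining these conventions, a single definition such as
+//
+//   defm VLD : VLDm<"vld", 0x81, V64>;   // see Section 8.9.1 below
+//
+// expands (per the multiclasses below) into a family of opcodes such as
+// VLDrr (register-register addressing), VLDNCirl (vld.nc, immediate $sy, I32
+// vector-length operand), and VLDrrL_v (VLS vector-length operand plus a
+// pass-through $base operand).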
+//===----------------------------------------------------------------------===//
+
+//-----------------------------------------------------------------------------
+// Section 8.9 - Vector Load/Store and Move Instructions
+//-----------------------------------------------------------------------------
+
+// Multiclass for VLD instructions
+let mayLoad = 1, hasSideEffects = 0, Uses = [VL] in
+multiclass VLDbm<string opcStr, bits<8>opc, RegisterClass RC, dag dag_in,
+ string disEnc = ""> {
+ let DisableEncoding = disEnc in
+ def "" : RVM<opc, (outs RC:$vx), dag_in,
+ !strconcat(opcStr, " $vx, $sy, $sz")>;
+ let Constraints = "$vx = $base", DisableEncoding = disEnc#"$base",
+ isCodeGenOnly = 1 in
+ def _v : RVM<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)),
+ !strconcat(opcStr, " $vx, $sy, $sz")>;
+}
+multiclass VLDlm<string opcStr, bits<8>opc, RegisterClass RC, dag dag_in> {
+ defm "" : VLDbm<opcStr, opc, RC, dag_in>;
+ let isCodeGenOnly = 1, VE_VLInUse = 1 in {
+ defm l : VLDbm<opcStr, opc, RC, !con(dag_in, (ins I32:$vl)), "$vl,">;
+ defm L : VLDbm<opcStr, opc, RC, !con(dag_in, (ins VLS:$vl)), "$vl,">;
+ }
+}
+let VE_VLIndex = 3 in
+multiclass VLDtgm<string opcStr, bits<8>opc, RegisterClass RC> {
+ defm rr : VLDlm<opcStr, opc, RC, (ins I64:$sy, I64:$sz)>;
+ let cy = 0 in
+ defm ir : VLDlm<opcStr, opc, RC, (ins simm7:$sy, I64:$sz)>;
+ let cz = 0 in
+ defm rz : VLDlm<opcStr, opc, RC, (ins I64:$sy, zero:$sz)>;
+ let cy = 0, cz = 0 in
+ defm iz : VLDlm<opcStr, opc, RC, (ins simm7:$sy, zero:$sz)>;
+}
+multiclass VLDm<string opcStr, bits<8>opc, RegisterClass RC> {
+ let vc = 1 in defm "" : VLDtgm<opcStr, opc, RC>;
+ let vc = 0 in defm NC : VLDtgm<opcStr#".nc", opc, RC>;
+}
+
+// Section 8.9.1 - VLD (Vector Load)
+defm VLD : VLDm<"vld", 0x81, V64>;
+
+// Section 8.9.2 - VLDU (Vector Load Upper)
+defm VLDU : VLDm<"vldu", 0x82, V64>;
+
+// Section 8.9.3 - VLDL (Vector Load Lower)
+defm VLDLSX : VLDm<"vldl.sx", 0x83, V64>;
+let cx = 1 in defm VLDLZX : VLDm<"vldl.zx", 0x83, V64>;
+
+// Section 8.9.4 - VLD2D (Vector Load 2D)
+defm VLD2D : VLDm<"vld2d", 0xc1, V64>;
+
+// Section 8.9.5 - VLDU2D (Vector Load Upper 2D)
+defm VLDU2D : VLDm<"vldu2d", 0xc2, V64>;
+
+// Section 8.9.6 - VLDL2D (Vector Load Lower 2D)
+defm VLDL2DSX : VLDm<"vldl2d.sx", 0xc3, V64>;
+let cx = 1 in defm VLDL2DZX : VLDm<"vldl2d.zx", 0xc3, V64>;
+
+// Multiclass for VST instructions
+let mayStore = 1, hasSideEffects = 0, Uses = [VL] in
+multiclass VSTbm<string opcStr, string argStr, bits<8>opc, dag dag_in> {
+ def "" : RVM<opc, (outs), dag_in, !strconcat(opcStr, argStr)>;
+ let DisableEncoding = "$vl", isCodeGenOnly = 1, VE_VLInUse = 1 in {
+ def l : RVM<opc, (outs), !con(dag_in, (ins I32:$vl)),
+ !strconcat(opcStr, argStr)>;
+ def L : RVM<opc, (outs), !con(dag_in, (ins VLS:$vl)),
+ !strconcat(opcStr, argStr)>;
+ }
+}
+multiclass VSTmm<string opcStr, bits<8>opc, dag dag_in> {
+ defm "" : VSTbm<opcStr, " $vx, $sy, $sz", opc, dag_in>;
+ let m = ?, VE_VLWithMask = 1 in
+ defm m : VSTbm<opcStr, " $vx, $sy, $sz, $m", opc, !con(dag_in, (ins VM:$m))>;
+}
+let VE_VLIndex = 3 in
+multiclass VSTtgm<string opcStr, bits<8>opc, RegisterClass RC> {
+ defm rrv : VSTmm<opcStr, opc, (ins I64:$sy, I64:$sz, RC:$vx)>;
+ let cy = 0 in
+ defm irv : VSTmm<opcStr, opc, (ins simm7:$sy, I64:$sz, RC:$vx)>;
+ let cz = 0 in
+ defm rzv : VSTmm<opcStr, opc, (ins I64:$sy, zero:$sz, RC:$vx)>;
+ let cy = 0, cz = 0 in
+ defm izv : VSTmm<opcStr, opc, (ins simm7:$sy, zero:$sz, RC:$vx)>;
+}
+multiclass VSTm<string opcStr, bits<8>opc, RegisterClass RC> {
+ let vc = 1, cx = 0 in defm "" : VSTtgm<opcStr, opc, RC>;
+ let vc = 0, cx = 0 in defm NC : VSTtgm<opcStr#".nc", opc, RC>;
+ let vc = 1, cx = 1 in defm OT : VSTtgm<opcStr#".ot", opc, RC>;
+ let vc = 0, cx = 1 in defm NCOT : VSTtgm<opcStr#".nc.ot", opc, RC>;
+}
+
+// Section 8.9.7 - VST (Vector Store)
+defm VST : VSTm<"vst", 0x91, V64>;
+
+// Section 8.9.8 - VST (Vector Store Upper)
+defm VSTU : VSTm<"vstu", 0x92, V64>;
+
+// Section 8.9.9 - VSTL (Vector Store Lower)
+defm VSTL : VSTm<"vstl", 0x93, V64>;
+
+// Section 8.9.10 - VST2D (Vector Store 2D)
+defm VST2D : VSTm<"vst2d", 0xd1, V64>;
+
+// Section 8.9.11 - VSTU2D (Vector Store Upper 2D)
+defm VSTU2D : VSTm<"vstu2d", 0xd2, V64>;
+
+// Section 8.9.12 - VSTL2D (Vector Store Lower 2D)
+defm VSTL2D : VSTm<"vstl2d", 0xd3, V64>;
+
+// Multiclass for VGT instructions
+let mayLoad = 1, hasSideEffects = 0, Uses = [VL] in
+multiclass VGTbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
+ dag dag_in, string disEnc = ""> {
+ let DisableEncoding = disEnc in
+ def "" : RVM<opc, (outs RC:$vx), dag_in,
+ !strconcat(opcStr, " $vx, ", argStr)>;
+ let Constraints = "$vx = $base", DisableEncoding = disEnc#"$base",
+ isCodeGenOnly = 1 in
+ def _v : RVM<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)),
+ !strconcat(opcStr, " $vx, ", argStr)>;
+}
+multiclass VGTlm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
+ dag dag_in> {
+ defm "" : VGTbm<opcStr, argStr, opc, RC, dag_in>;
+ let isCodeGenOnly = 1, VE_VLInUse = 1 in {
+ defm l : VGTbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl)),
+ "$vl,">;
+ defm L : VGTbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl)),
+ "$vl,">;
+ }
+}
+multiclass VGTmm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
+ dag dag_in> {
+ defm "" : VGTlm<opcStr, argStr, opc, RC, dag_in>;
+ let m = ?, VE_VLWithMask = 1 in
+ defm m : VGTlm<opcStr, argStr#", $m", opc, RC, !con(dag_in, (ins VM:$m))>;
+}
+let VE_VLIndex = 4 in
+multiclass VGTlhm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
+ dag dag_in> {
+ defm rr : VGTmm<opcStr, argStr#", $sy, $sz", opc, RC,
+ !con(dag_in, (ins I64:$sy, I64:$sz))>;
+ let cy = 0 in
+ defm ir : VGTmm<opcStr, argStr#", $sy, $sz", opc, RC,
+ !con(dag_in, (ins simm7:$sy, I64:$sz))>;
+ let cz = 0 in
+ defm rz : VGTmm<opcStr, argStr#", $sy, $sz", opc, RC,
+ !con(dag_in, (ins I64:$sy, zero:$sz))>;
+ let cy = 0, cz = 0 in
+ defm iz : VGTmm<opcStr, argStr#", $sy, $sz", opc, RC,
+ !con(dag_in, (ins simm7:$sy, zero:$sz))>;
+}
+multiclass VGTtgm<string opcStr, bits<8>opc, RegisterClass RC> {
+ let vy = ? in defm v : VGTlhm<opcStr, "$vy", opc, RC, (ins V64:$vy)>;
+ let cs = 1, sw = ? in defm s : VGTlhm<opcStr, "$sw", opc, RC, (ins I64:$sw)>;
+}
+multiclass VGTm<string opcStr, bits<8>opc, RegisterClass RC> {
+ let vc = 1 in defm "" : VGTtgm<opcStr, opc, RC>;
+ let vc = 0 in defm NC : VGTtgm<opcStr#".nc", opc, RC>;
+}
+
+// Section 8.9.13 - VGT (Vector Gather)
+defm VGT : VGTm<"vgt", 0xa1, V64>;
+
+// Section 8.9.14 - VGTU (Vector Gather Upper)
+defm VGTU : VGTm<"vgtu", 0xa2, V64>;
+
+// Section 8.9.15 - VGTL (Vector Gather Lower)
+defm VGTLSX : VGTm<"vgtl.sx", 0xa3, V64>;
+let cx = 1 in defm VGTLZX : VGTm<"vgtl.zx", 0xa3, V64>;
+def : MnemonicAlias<"vgtl", "vgtl.zx">;
+def : MnemonicAlias<"vgtl.nc", "vgtl.zx.nc">;
+
+// Multiclass for VSC instructions
+let mayStore = 1, hasSideEffects = 0, Uses = [VL] in
+multiclass VSCbm<string opcStr, string argStr, bits<8>opc, dag dag_in> {
+ def "" : RVM<opc, (outs), dag_in, !strconcat(opcStr, argStr)>;
+ let DisableEncoding = "$vl", isCodeGenOnly = 1, VE_VLInUse = 1 in {
+ def l : RVM<opc, (outs), !con(dag_in, (ins I32:$vl)),
+ !strconcat(opcStr, argStr)>;
+ def L : RVM<opc, (outs), !con(dag_in, (ins VLS:$vl)),
+ !strconcat(opcStr, argStr)>;
+ }
+}
+multiclass VSCmm<string opcStr, string argStr, bits<8>opc, dag dag_in> {
+ defm "" : VSCbm<opcStr, argStr, opc, dag_in>;
+ let m = ?, VE_VLWithMask = 1 in
+ defm m : VSCbm<opcStr, argStr#", $m", opc, !con(dag_in, (ins VM:$m))>;
+}
+let VE_VLIndex = 4 in
+multiclass VSClhm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
+ dag dag_in> {
+ defm rrv : VSCmm<opcStr, " $vx, "#argStr#", $sy, $sz", opc,
+ !con(dag_in, (ins I64:$sy, I64:$sz, RC:$vx))>;
+ let cy = 0 in
+ defm irv : VSCmm<opcStr, " $vx, "#argStr#", $sy, $sz", opc,
+ !con(dag_in, (ins simm7:$sy, I64:$sz, RC:$vx))>;
+ let cz = 0 in
+ defm rzv : VSCmm<opcStr, " $vx, "#argStr#", $sy, $sz", opc,
+ !con(dag_in, (ins I64:$sy, zero:$sz, RC:$vx))>;
+ let cy = 0, cz = 0 in
+ defm izv : VSCmm<opcStr, " $vx, "#argStr#", $sy, $sz", opc,
+ !con(dag_in, (ins simm7:$sy, zero:$sz, RC:$vx))>;
+}
+multiclass VSCtgm<string opcStr, bits<8>opc, RegisterClass RC> {
+ let vy = ? in defm v : VSClhm<opcStr, "$vy", opc, RC, (ins V64:$vy)>;
+ let cs = 1, sw = ? in defm s : VSClhm<opcStr, "$sw", opc, RC, (ins I64:$sw)>;
+}
+multiclass VSCm<string opcStr, bits<8>opc, RegisterClass RC> {
+ let vc = 1, cx = 0 in defm "" : VSCtgm<opcStr, opc, RC>;
+ let vc = 0, cx = 0 in defm NC : VSCtgm<opcStr#".nc", opc, RC>;
+ let vc = 1, cx = 1 in defm OT : VSCtgm<opcStr#".ot", opc, RC>;
+ let vc = 0, cx = 1 in defm NCOT : VSCtgm<opcStr#".nc.ot", opc, RC>;
+}
+
+// Section 8.9.16 - VSC (Vector Scatter)
+defm VSC : VSCm<"vsc", 0xb1, V64>;
+
+// Section 8.9.17 - VSCU (Vector Scatter Upper)
+defm VSCU : VSCm<"vscu", 0xb2, V64>;
+
+// Section 8.9.18 - VSCL (Vector Scatter Lower)
+defm VSCL : VSCm<"vscl", 0xb3, V64>;
+
+// Section 8.9.19 - PFCHV (Prefetch Vector)
+let Uses = [VL] in
+multiclass PFCHVbm<string opcStr, string argStr, bits<8>opc, dag dag_in> {
+ def "" : RVM<opc, (outs), dag_in, !strconcat(opcStr, argStr)>;
+ let DisableEncoding = "$vl", isCodeGenOnly = 1, VE_VLInUse = 1 in {
+ def l : RVM<opc, (outs), !con(dag_in, (ins I32:$vl)),
+ !strconcat(opcStr, argStr)>;
+ def L : RVM<opc, (outs), !con(dag_in, (ins VLS:$vl)),
+ !strconcat(opcStr, argStr)>;
+ }
+}
+let VE_VLIndex = 2 in
+multiclass PFCHVm<string opcStr, bits<8>opc> {
+ defm rr : PFCHVbm<opcStr, " $sy, $sz", opc, (ins I64:$sy, I64:$sz)>;
+ let cy = 0 in
+ defm ir : PFCHVbm<opcStr, " $sy, $sz", opc, (ins simm7:$sy, I64:$sz)>;
+ let cz = 0 in
+ defm rz : PFCHVbm<opcStr, " $sy, $sz", opc, (ins I64:$sy, zero:$sz)>;
+ let cy = 0, cz = 0 in
+ defm iz : PFCHVbm<opcStr, " $sy, $sz", opc, (ins simm7:$sy, zero:$sz)>;
+}
+let vc = 1, vx = 0 in defm PFCHV : PFCHVm<"pfchv", 0x80>;
+let vc = 0, vx = 0 in defm PFCHVNC : PFCHVm<"pfchv.nc", 0x80>;
+
+// Section 8.9.20 - LSV (Load S to V)
+let sx = 0, vx = ?, hasSideEffects = 0 in
+multiclass LSVbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
+ dag dag_in> {
+ def "" : RR<opc, (outs RC:$vx), dag_in, !strconcat(opcStr, " ${vx}", argStr)>;
+ let Constraints = "$vx = $base", DisableEncoding = "$base",
+ isCodeGenOnly = 1 in
+ def _v : RR<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)),
+ !strconcat(opcStr, " ${vx}", argStr)>;
+}
+multiclass LSVm<string opcStr, bits<8>opc, RegisterClass RC> {
+ defm rr : LSVbm<opcStr, "(${sy}), $sz", opc, RC, (ins I64:$sy, I64:$sz)>;
+ let cy = 0 in
+ defm ir : LSVbm<opcStr, "(${sy}), $sz", opc, RC, (ins uimm7:$sy, I64:$sz)>;
+ let cz = 0 in
+ defm rm : LSVbm<opcStr, "(${sy}), $sz", opc, RC, (ins I64:$sy, mimm:$sz)>;
+ let cy = 0, cz = 0 in
+ defm im : LSVbm<opcStr, "(${sy}), $sz", opc, RC, (ins uimm7:$sy, mimm:$sz)>;
+}
+defm LSV : LSVm<"lsv", 0x8e, V64>;
+
+// Section 8.9.21 - LVS (Load V to S)
+let cz = 0, sz = 0, vx = ?, hasSideEffects = 0 in
+multiclass LVSm<string opcStr, bits<8>opc, RegisterClass RC> {
+ def vr : RR<opc, (outs I64:$sx), (ins RC:$vx, I64:$sy),
+ opcStr#" $sx, ${vx}(${sy})">;
+ let cy = 0 in
+ def vi : RR<opc, (outs I64:$sx), (ins RC:$vx, uimm7:$sy),
+ opcStr#" $sx, ${vx}(${sy})">;
+}
+defm LVS : LVSm<"lvs", 0x9e, V64>;
+
+// Section 8.9.22 - LVM (Load VM)
+let sx = 0, vx = ?, hasSideEffects = 0 in
+multiclass LVMbm<string opcStr, string argStr, bits<8>opc, RegisterClass RCM,
+ dag dag_in> {
+ def "" : RR<opc, (outs RCM:$vx), dag_in,
+ !strconcat(opcStr, " $vx, ", argStr)>;
+ let Constraints = "$vx = $base", DisableEncoding = "$base",
+ isCodeGenOnly = 1 in {
+ def _m : RR<opc, (outs RCM:$vx), !con(dag_in, (ins RCM:$base)),
+ !strconcat(opcStr, " $vx, ", argStr)>;
+ }
+}
+multiclass LVMom<string opcStr, bits<8>opc, RegisterClass RCM> {
+ defm rr : LVMbm<opcStr, "$sy, $sz", opc, RCM, (ins I64:$sy, I64:$sz)>;
+ let cy = 0 in
+ defm ir : LVMbm<opcStr, "$sy, $sz", opc, RCM, (ins uimm2:$sy, I64:$sz)>;
+ let cz = 0 in
+ defm rm : LVMbm<opcStr, "$sy, $sz", opc, RCM, (ins I64:$sy, mimm:$sz)>;
+ let cy = 0, cz = 0 in
+ defm im : LVMbm<opcStr, "$sy, $sz", opc, RCM, (ins uimm2:$sy, mimm:$sz)>;
+}
+multiclass LVMm<string opcStr, bits<8>opc, RegisterClass RCM> {
+ defm "" : LVMom<opcStr, opc, RCM>;
+}
+defm LVM : LVMm<"lvm", 0xb7, VM>;
+
+// Section 8.9.23 - SVM (Save VM)
+let cz = 0, sz = 0, vz = ?, hasSideEffects = 0 in
+multiclass SVMm<string opcStr, bits<8>opc, RegisterClass RCM> {
+ def mr : RR<opc, (outs I64:$sx), (ins RCM:$vz, I64:$sy),
+ opcStr#" $sx, $vz, $sy">;
+ let cy = 0 in
+ def mi : RR<opc, (outs I64:$sx), (ins RCM:$vz, uimm2:$sy),
+ opcStr#" $sx, $vz, $sy">;
+}
+defm SVM : SVMm<"svm", 0xa7, VM>;
+
+// Section 8.9.24 - VBRD (Vector Broadcast)
+let vx = ?, hasSideEffects = 0, Uses = [VL] in
+multiclass VBRDbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
+ dag dag_in, string disEnc = ""> {
+ let DisableEncoding = disEnc in
+ def "" : RV<opc, (outs RC:$vx), dag_in,
+ !strconcat(opcStr, " $vx, ", argStr)>;
+ let Constraints = "$vx = $base", DisableEncoding = disEnc#"$base",
+ isCodeGenOnly = 1 in
+ def _v : RV<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)),
+ !strconcat(opcStr, " $vx, ", argStr)>;
+}
+multiclass VBRDlm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
+ dag dag_in> {
+ defm "" : VBRDbm<opcStr, argStr, opc, RC, dag_in>;
+ let isCodeGenOnly = 1, VE_VLInUse = 1 in {
+ defm l : VBRDbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl)),
+ "$vl,">;
+ defm L : VBRDbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl)),
+ "$vl,">;
+ }
+}
+multiclass VBRDmm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
+ RegisterClass RCM, dag dag_in> {
+ defm "" : VBRDlm<opcStr, argStr, opc, RC, dag_in>;
+ let m = ?, VE_VLWithMask = 1 in
+ defm m : VBRDlm<opcStr, argStr#", $m", opc, RC, !con(dag_in, (ins RCM:$m))>;
+}
+let VE_VLIndex = 2 in
+multiclass VBRDm<string opcStr, bits<8>opc, RegisterClass VRC, RegisterClass RC,
+ RegisterClass RCM> {
+ defm r : VBRDmm<opcStr, "$sy", opc, VRC, RCM, (ins RC:$sy)>;
+ let cy = 0 in
+ defm i : VBRDmm<opcStr, "$sy", opc, VRC, RCM, (ins simm7:$sy)>;
+}
+let cx = 0, cx2 = 0 in
+defm VBRD : VBRDm<"vbrd", 0x8c, V64, I64, VM>;
+let cx = 0, cx2 = 1 in
+defm VBRDL : VBRDm<"vbrdl", 0x8c, V64, I32, VM>;
+let cx = 1, cx2 = 0 in
+defm VBRDU : VBRDm<"vbrdu", 0x8c, V64, F32, VM>;
+let cx = 1, cx2 = 1 in
+defm PVBRD : VBRDm<"pvbrd", 0x8c, V64, I64, VM512>;
+
+// Section 8.9.25 - VMV (Vector Move)
+let vx = ?, vz = ?, hasSideEffects = 0, Uses = [VL] in
+multiclass VMVbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
+ dag dag_in, string disEnc = ""> {
+ let DisableEncoding = disEnc in
+ def "" : RV<opc, (outs RC:$vx), dag_in,
+ !strconcat(opcStr, " $vx, ", argStr)>;
+ let Constraints = "$vx = $base", DisableEncoding = disEnc#"$base",
+ isCodeGenOnly = 1 in
+ def _v : RV<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)),
+ !strconcat(opcStr, " $vx, ", argStr)>;
+}
+multiclass VMVlm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
+ dag dag_in> {
+ defm "" : VMVbm<opcStr, argStr, opc, RC, dag_in>;
+ let isCodeGenOnly = 1, VE_VLInUse = 1 in {
+ defm l : VMVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl)),
+ "$vl,">;
+ defm L : VMVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl)),
+ "$vl,">;
+ }
+}
+multiclass VMVmm<string opcStr, bits<8>opc, RegisterClass RC,
+ RegisterClass RCM, dag dag_in> {
+ defm "" : VMVlm<opcStr, "$sy, $vz", opc, RC, dag_in>;
+ let m = ?, VE_VLWithMask = 1 in
+ defm m : VMVlm<opcStr, "$sy, $vz, $m", opc, RC, !con(dag_in, (ins RCM:$m))>;
+}
+let VE_VLIndex = 3 in
+multiclass VMVm<string opcStr, bits<8>opc, RegisterClass RC,
+ RegisterClass RCM> {
+ defm rv : VMVmm<opcStr, opc, RC, RCM, (ins I64:$sy, RC:$vz)>;
+ let cy = 0 in
+ defm iv : VMVmm<opcStr, opc, RC, RCM, (ins uimm7:$sy, RC:$vz)>;
+}
+defm VMV : VMVm<"vmv", 0x9c, V64, VM>;
+
+//-----------------------------------------------------------------------------
+// Section 8.10 - Vector Fixed-Point Arithmetic Instructions
+//-----------------------------------------------------------------------------
+
+// Multiclass for generic vector calculation
+let vx = ?, hasSideEffects = 0, Uses = [VL] in
+multiclass RVbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
+ dag dag_in, string disEnc = ""> {
+ let DisableEncoding = disEnc in
+ def "" : RV<opc, (outs RC:$vx), dag_in,
+ !strconcat(opcStr, " $vx", argStr)>;
+ let Constraints = "$vx = $base", DisableEncoding = disEnc#"$base",
+ isCodeGenOnly = 1 in
+ def _v : RV<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)),
+ !strconcat(opcStr, " $vx", argStr)>;
+}
+multiclass RVlm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
+ dag dag_in> {
+ defm "" : RVbm<opcStr, argStr, opc, RC, dag_in>;
+ let isCodeGenOnly = 1, VE_VLInUse = 1 in {
+ defm l : RVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl)),
+ "$vl,">;
+ defm L : RVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl)),
+ "$vl,">;
+ }
+}
+multiclass RVmm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
+ RegisterClass RCM, dag dag_in> {
+ defm "" : RVlm<opcStr, argStr, opc, RC, dag_in>;
+ let m = ?, VE_VLWithMask = 1 in
+ defm m : RVlm<opcStr, argStr#", $m", opc, RC, !con(dag_in, (ins RCM:$m))>;
+}
+// Generic RV multiclass with 2 arguments.
+// e.g. VADD, VSUB, VMPY, etc.
+let VE_VLIndex = 3 in
+multiclass RVm<string opcStr, bits<8>opc, RegisterClass VRC, RegisterClass RC,
+ RegisterClass RCM, Operand SIMM = simm7> {
+ let cy = 0, sy = 0, vy = ?, vz = ? in
+ defm vv : RVmm<opcStr, ", $vy, $vz", opc, VRC, RCM, (ins VRC:$vy, VRC:$vz)>;
+ let cs = 1, vz = ? in
+ defm rv : RVmm<opcStr, ", $sy, $vz", opc, VRC, RCM, (ins RC:$sy, VRC:$vz)>;
+ let cs = 1, cy = 0, vz = ? in
+ defm iv : RVmm<opcStr, ", $sy, $vz", opc, VRC, RCM, (ins SIMM:$sy, VRC:$vz)>;
+}
+// Special RV multiclass with 2 arguments using cs2.
+// e.g. VDIV, VDVS, and VDVX.
+let VE_VLIndex = 3 in
+multiclass RVDIVm<string opcStr, bits<8>opc, RegisterClass VRC,
+ RegisterClass RC, RegisterClass RCM, Operand SIMM = simm7> {
+ let cy = 0, sy = 0, vy = ?, vz = ? in
+ defm vv : RVmm<opcStr, ", $vy, $vz", opc, VRC, RCM, (ins VRC:$vy, VRC:$vz)>;
+ let cs2 = 1, vy = ? in
+ defm vr : RVmm<opcStr, ", $vy, $sy", opc, VRC, RCM, (ins VRC:$vy, RC:$sy)>;
+ let cs2 = 1, cy = 0, vy = ? in
+ defm vi : RVmm<opcStr, ", $vy, $sy", opc, VRC, RCM, (ins VRC:$vy, SIMM:$sy)>;
+ let cs = 1, vz = ? in
+ defm rv : RVmm<opcStr, ", $sy, $vz", opc, VRC, RCM, (ins RC:$sy, VRC:$vz)>;
+ let cs = 1, cy = 0, vz = ? in
+ defm iv : RVmm<opcStr, ", $sy, $vz", opc, VRC, RCM, (ins SIMM:$sy, VRC:$vz)>;
+}
+// Generic RV multiclass with 2 arguments for logical operations.
+// e.g. VAND, VOR, VXOR, etc.
+let VE_VLIndex = 3 in
+multiclass RVLm<string opcStr, bits<8>opc, RegisterClass ScaRC,
+ RegisterClass RC, RegisterClass RCM> {
+ let cy = 0, sy = 0, vy = ?, vz = ? in
+ defm vv : RVmm<opcStr, ", $vy, $vz", opc, RC, RCM, (ins RC:$vy, RC:$vz)>;
+ let cs = 1, vz = ? in
+ defm rv : RVmm<opcStr, ", $sy, $vz", opc, RC, RCM, (ins ScaRC:$sy, RC:$vz)>;
+ let cs = 1, cy = 0, vz = ? in
+ defm mv : RVmm<opcStr, ", $sy, $vz", opc, RC, RCM, (ins mimm:$sy, RC:$vz)>;
+}
+// Generic RV multiclass with 1 argument.
+// e.g. VLDZ, VPCNT, and VBRV.
+let VE_VLIndex = 2 in
+multiclass RV1m<string opcStr, bits<8>opc, RegisterClass RC,
+ RegisterClass RCM> {
+ let cy = 0, sy = 0, vz = ? in
+ defm v : RVmm<opcStr, ", $vz", opc, RC, RCM, (ins RC:$vz)>;
+}
+// Generic RV multiclass with no argument.
+// e.g. VSEQ.
+let VE_VLIndex = 1 in
+multiclass RV0m<string opcStr, bits<8>opc, RegisterClass RC,
+ RegisterClass RCM> {
+ let cy = 0, sy = 0 in
+ defm "" : RVmm<opcStr, "", opc, RC, RCM, (ins)>;
+}
+// Generic RV multiclass with 2 arguments for shift operations.
+// e.g. VSLL, VSRL, VSLA, etc.
+let VE_VLIndex = 3 in
+multiclass RVSm<string opcStr, bits<8>opc, RegisterClass ScaRC,
+ RegisterClass RC, RegisterClass RCM> {
+ let cy = 0, sy = 0, vy = ?, vz = ? in
+ defm vv : RVmm<opcStr, ", $vz, $vy", opc, RC, RCM, (ins RC:$vz, RC:$vy)>;
+ let cs = 1, vz = ? in
+ defm vr : RVmm<opcStr, ", $vz, $sy", opc, RC, RCM, (ins RC:$vz, ScaRC:$sy)>;
+ let cs = 1, cy = 0, vz = ? in
+ defm vi : RVmm<opcStr, ", $vz, $sy", opc, RC, RCM, (ins RC:$vz, uimm7:$sy)>;
+}
+// Generic RV multiclass with 3 arguments for shift operations.
+// e.g. VSLD and VSRD.
+let VE_VLIndex = 4 in
+multiclass RVSDm<string opcStr, bits<8>opc, RegisterClass RC,
+ RegisterClass RCM> {
+ let vy = ?, vz = ? in
+ defm vvr : RVmm<opcStr, ", ($vy, ${vz}), $sy", opc, RC, RCM,
+ (ins RC:$vy, RC:$vz, I64:$sy)>;
+ let cy = 0, vy = ?, vz = ? in
+ defm vvi : RVmm<opcStr, ", ($vy, ${vz}), $sy", opc, RC, RCM,
+ (ins RC:$vy, RC:$vz, uimm7:$sy)>;
+}
+// Special RV multiclass with 3 arguments.
+// e.g. VSFA
+let VE_VLIndex = 4 in
+multiclass RVSAm<string opcStr, bits<8>opc, RegisterClass RC,
+ RegisterClass RCM> {
+ let cz = 1, sz = ?, vz = ? in
+ defm vrr : RVmm<opcStr, ", $vz, $sy, $sz", opc, RC, RCM,
+ (ins RC:$vz, I64:$sy, I64:$sz)>;
+ let cz = 0, sz = ?, vz = ? in
+ defm vrm : RVmm<opcStr, ", $vz, $sy, $sz", opc, RC, RCM,
+ (ins RC:$vz, I64:$sy, mimm:$sz)>;
+ let cy = 0, cz = 1, sz = ?, vz = ? in
+ defm vir : RVmm<opcStr, ", $vz, $sy, $sz", opc, RC, RCM,
+ (ins RC:$vz, uimm3:$sy, I64:$sz)>;
+ let cy = 0, cz = 0, sz = ?, vz = ? in
+ defm vim : RVmm<opcStr, ", $vz, $sy, $sz", opc, RC, RCM,
+ (ins RC:$vz, uimm3:$sy, mimm:$sz)>;
+}
+// Generic RV multiclass with 1 argument using the vy field.
+// e.g. VFSQRT, VRCP, and VRSQRT.
+let VE_VLIndex = 2 in
+multiclass RVF1m<string opcStr, bits<8>opc, RegisterClass RC,
+ RegisterClass RCM> {
+ let cy = 0, sy = 0, vy = ? in
+ defm v : RVmm<opcStr, ", $vy", opc, RC, RCM, (ins RC:$vy)>;
+}
+// Special RV multiclass with 3 arguments using cs2.
+// e.g. VFMAD, VFMSB, VFNMAD, etc.
+let VE_VLIndex = 4 in
+multiclass RVMm<string opcStr, bits<8>opc, RegisterClass VRC, RegisterClass RC,
+ RegisterClass RCM, Operand SIMM = simm7> {
+ let cy = 0, sy = 0, vy = ?, vz = ?, vw = ? in
+ defm vvv : RVmm<opcStr, ", $vy, $vz, $vw", opc, VRC, RCM,
+ (ins VRC:$vy, VRC:$vz, VRC:$vw)>;
+ let cs2 = 1, vy = ?, vw = ? in
+ defm vrv : RVmm<opcStr, ", $vy, $sy, $vw", opc, VRC, RCM,
+ (ins VRC:$vy, RC:$sy, VRC:$vw)>;
+ let cs2 = 1, cy = 0, vy = ?, vw = ? in
+ defm viv : RVmm<opcStr, ", $vy, $sy, $vw", opc, VRC, RCM,
+ (ins VRC:$vy, SIMM:$sy, VRC:$vw)>;
+ let cs = 1, vz = ?, vw = ? in
+ defm rvv : RVmm<opcStr, ", $sy, $vz, $vw", opc, VRC, RCM,
+ (ins RC:$sy, VRC:$vz, VRC:$vw)>;
+ let cs = 1, cy = 0, vz = ?, vw = ? in
+ defm ivv : RVmm<opcStr, ", $sy, $vz, $vw", opc, VRC, RCM,
+ (ins SIMM:$sy, VRC:$vz, VRC:$vw)>;
+}
+// Special RV multiclass with 2 arguments for floating point conversions.
+// e.g. VFIX and VFIXX
+let hasSideEffects = 0, VE_VLIndex = 3 in
+multiclass RVFIXm<string opcStr, bits<8> opc, RegisterClass RC,
+ RegisterClass RCM> {
+ let cy = 0, sy = 0, vy = ?, vz = ? in
+ defm v : RVmm<opcStr#"$vz", ", $vy", opc, RC, RCM, (ins RDOp:$vz, RC:$vy)>;
+}
+// Multiclass for generic iterative vector calculation
+let vx = ?, hasSideEffects = 0, Uses = [VL] in
+multiclass RVIbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
+ dag dag_in, string disEnc = ""> {
+ let DisableEncoding = disEnc in
+ def "" : RV<opc, (outs RC:$vx), dag_in,
+ !strconcat(opcStr, " $vx", argStr)>;
+ let isCodeGenOnly = 1, Constraints = "$vx = $base", DisableEncoding = disEnc#"$base" in
+ def _v : RV<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)),
+ !strconcat(opcStr, " $vx", argStr)>;
+}
+multiclass RVIlm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
+ dag dag_in> {
+ defm "" : RVIbm<opcStr, argStr, opc, RC, dag_in>;
+ let isCodeGenOnly = 1, VE_VLInUse = 1 in {
+ defm l : RVIbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl)),
+ "$vl,">;
+ defm L : RVIbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl)),
+ "$vl,">;
+ }
+}
+// Generic RV multiclass for iterative operations with 2 arguments.
+// e.g. VFIA, VFIS, and VFIM
+let VE_VLIndex = 3 in
+multiclass RVI2m<string opcStr, bits<8>opc, RegisterClass VRC,
+ RegisterClass RC> {
+ let vy = ? in
+ defm vr : RVIlm<opcStr, ", $vy, $sy", opc, VRC, (ins VRC:$vy, RC:$sy)>;
+ let cy = 0, vy = ? in
+ defm vi : RVIlm<opcStr, ", $vy, $sy", opc, VRC, (ins VRC:$vy, simm7fp:$sy)>;
+}
+// Generic RV multiclass for iterative operations with 3 arguments.
+// e.g. VFIAM, VFISM, VFIMA, etc.
+let VE_VLIndex = 4 in
+multiclass RVI3m<string opcStr, bits<8>opc, RegisterClass VRC,
+ RegisterClass RC> {
+ let vy = ?, vz = ? in
+ defm vvr : RVIlm<opcStr, ", $vy, $vz, $sy", opc, VRC,
+ (ins VRC:$vy, VRC:$vz, RC:$sy)>;
+ let cy = 0, vy = ?, vz = ? in
+ defm vvi : RVIlm<opcStr, ", $vy, $vz, $sy", opc, VRC,
+ (ins VRC:$vy, VRC:$vz, simm7fp:$sy)>;
+}
+// Special RV multiclass with 3 arguments for the vector shuffle operation.
+// e.g. VSHF.
+let vy = ?, vz = ?, VE_VLIndex = 4 in
+multiclass RVSHFm<string opcStr, bits<8>opc, RegisterClass RC,
+ Operand SIMM = uimm4> {
+ defm vvr : RVlm<opcStr, ", $vy, $vz, $sy", opc, RC,
+ (ins RC:$vy, RC:$vz, I64:$sy)>;
+ let cy = 0 in defm vvi : RVlm<opcStr, ", $vy, $vz, $sy", opc, RC,
+ (ins RC:$vy, RC:$vz, SIMM:$sy)>;
+}
+// Multiclass for generic mask calculation
+let vx = ?, hasSideEffects = 0, Uses = [VL] in
+multiclass RVMKbm<string opcStr, string argStr, bits<8>opc, dag dag_out,
+ dag dag_in> {
+ def "" : RV<opc, dag_out, dag_in, !strconcat(opcStr, argStr)>;
+ let DisableEncoding = "$vl", isCodeGenOnly = 1, VE_VLInUse = 1 in {
+ def l : RV<opc, dag_out, !con(dag_in, (ins I32:$vl)),
+ !strconcat(opcStr, argStr)>;
+ def L : RV<opc, dag_out, !con(dag_in, (ins VLS:$vl)),
+ !strconcat(opcStr, argStr)>;
+ }
+}
+multiclass RVMKlm<string opcStr, string argStr, bits<8>opc, RegisterClass RCM,
+ dag dag_in> {
+ defm "" : RVMKbm<opcStr, " $vx"#argStr, opc, (outs RCM:$vx), dag_in>;
+ let m = ?, VE_VLWithMask = 1 in
+ defm m : RVMKbm<opcStr, " $vx"#argStr#", $m", opc, (outs RCM:$vx),
+ !con(dag_in, (ins RCM:$m))>;
+}
+// Generic RV multiclass for mask calculation with a condition.
+// e.g. VFMK, VFMS, and VFMF
+let cy = 0, sy = 0 in
+multiclass RVMKom<string opcStr, bits<8> opc, RegisterClass RC,
+ RegisterClass RCM> {
+ let vy = ?, vz = ?, VE_VLIndex = 3 in
+ defm v : RVMKlm<opcStr#"$vy", ", $vz", opc, RCM, (ins CCOp:$vy, RC:$vz)>;
+ let vy = 15 /* AT */, VE_VLIndex = 1 in
+ defm a : RVMKlm<opcStr#"at", "", opc, RCM, (ins)>;
+ let vy = 0 /* AF */, VE_VLIndex = 1 in
+ defm na : RVMKlm<opcStr#"af", "", opc, RCM, (ins)>;
+}
+multiclass RVMKm<string opcStr, bits<8> opc, RegisterClass RC,
+ RegisterClass RCM> {
+ defm "" : RVMKom<opcStr, opc, RC, RCM>;
+}
+// Generic RV multiclass for mask calculation with 2 arguments.
+// e.g. ANDM, ORM, XORM, etc.
+let cy = 0, sy = 0, vx = ?, vy = ?, vz = ?, hasSideEffects = 0 in
+multiclass RVM2m<string opcStr, bits<8> opc, RegisterClass RCM> {
+ def mm : RV<opc, (outs RCM:$vx), (ins RCM:$vy, RCM:$vz),
+ !strconcat(opcStr, " $vx, $vy, $vz")>;
+}
+// Generic RV multiclass for mask calculation with 1 argument.
+// e.g. NEGM
+let cy = 0, sy = 0, vx = ?, vy = ?, hasSideEffects = 0 in
+multiclass RVM1m<string opcStr, bits<8> opc, RegisterClass RCM> {
+ def m : RV<opc, (outs RCM:$vx), (ins RCM:$vy),
+ !strconcat(opcStr, " $vx, $vy")>;
+}
+// Generic RV multiclass for mask operations that produce a scalar result.
+// e.g. PCVM, LZVM, and TOVM
+let cy = 0, sy = 0, vy = ?, hasSideEffects = 0, Uses = [VL] in
+multiclass RVMSbm<string opcStr, string argStr, bits<8>opc, dag dag_in> {
+ def "" : RV<opc, (outs I64:$sx), dag_in,
+ !strconcat(opcStr, " $sx,", argStr)> {
+ bits<7> sx;
+ let Inst{54-48} = sx;
+ }
+ let DisableEncoding = "$vl", isCodeGenOnly = 1, VE_VLInUse = 1 in {
+ def l : RV<opc, (outs I64:$sx), !con(dag_in, (ins I32:$vl)),
+ !strconcat(opcStr, " $sx,", argStr)> {
+ bits<7> sx;
+ let Inst{54-48} = sx;
+ }
+ def L : RV<opc, (outs I64:$sx), !con(dag_in, (ins VLS:$vl)),
+ !strconcat(opcStr, " $sx,", argStr)> {
+ bits<7> sx;
+ let Inst{54-48} = sx;
+ }
+ }
+}
+let VE_VLIndex = 2 in
+multiclass RVMSm<string opcStr, bits<8> opc, RegisterClass RCM> {
+ defm m : RVMSbm<opcStr, " $vy", opc, (ins RCM:$vy)>;
+}
+
+// Section 8.10.1 - VADD (Vector Add)
+let cx = 0, cx2 = 0 in
+defm VADDUL : RVm<"vaddu.l", 0xc8, V64, I64, VM>;
+let cx = 0, cx2 = 1 in {
+ defm PVADDULO : RVm<"pvaddu.lo", 0xc8, V64, I32, VM>;
+ let isCodeGenOnly = 1 in
+ defm VADDUW : RVm<"vaddu.w", 0xc8, V64, I32, VM>;
+}
+let cx = 1, cx2 = 0 in
+defm PVADDUUP : RVm<"pvaddu.up", 0xc8, V64, I64, VM>;
+let cx = 1, cx2 = 1 in
+defm PVADDU : RVm<"pvaddu", 0xc8, V64, I64, VM512>;
+def : MnemonicAlias<"vaddu.w", "pvaddu.lo">;
+
+// Section 8.10.2 - VADS (Vector Add Single)
+let cx = 0, cx2 = 0 in
+defm VADDSWSX : RVm<"vadds.w.sx", 0xca, V64, I32, VM>;
+let cx = 0, cx2 = 1 in {
+ defm PVADDSLO : RVm<"pvadds.lo", 0xca, V64, I32, VM>;
+ let isCodeGenOnly = 1 in
+ defm VADDSWZX : RVm<"vadds.w.zx", 0xca, V64, I32, VM>;
+}
+let cx = 1, cx2 = 0 in
+defm PVADDSUP : RVm<"pvadds.up", 0xca, V64, I64, VM>;
+let cx = 1, cx2 = 1 in
+defm PVADDS : RVm<"pvadds", 0xca, V64, I64, VM512>;
+def : MnemonicAlias<"pvadds.lo.sx", "vadds.w.sx">;
+def : MnemonicAlias<"vadds.w.zx", "pvadds.lo">;
+def : MnemonicAlias<"vadds.w", "pvadds.lo">;
+def : MnemonicAlias<"pvadds.lo.zx", "pvadds.lo">;
+
+// Section 8.10.3 - VADX (Vector Add)
+defm VADDSL : RVm<"vadds.l", 0x8b, V64, I64, VM>;
+
+// Section 8.10.4 - VSUB (Vector Subtract)
+let cx = 0, cx2 = 0 in
+defm VSUBUL : RVm<"vsubu.l", 0xd8, V64, I64, VM>;
+let cx = 0, cx2 = 1 in {
+ defm PVSUBULO : RVm<"pvsubu.lo", 0xd8, V64, I32, VM>;
+ let isCodeGenOnly = 1 in
+ defm VSUBUW : RVm<"vsubu.w", 0xd8, V64, I32, VM>;
+}
+let cx = 1, cx2 = 0 in
+defm PVSUBUUP : RVm<"pvsubu.up", 0xd8, V64, I64, VM>;
+let cx = 1, cx2 = 1 in
+defm PVSUBU : RVm<"pvsubu", 0xd8, V64, I64, VM512>;
+def : MnemonicAlias<"vsubu.w", "pvsubu.lo">;
+
+// Section 8.10.5 - VSBS (Vector Subtract Single)
+let cx = 0, cx2 = 0 in
+defm VSUBSWSX : RVm<"vsubs.w.sx", 0xda, V64, I32, VM>;
+let cx = 0, cx2 = 1 in {
+ defm PVSUBSLO : RVm<"pvsubs.lo", 0xda, V64, I32, VM>;
+ let isCodeGenOnly = 1 in
+ defm VSUBSWZX : RVm<"vsubs.w.zx", 0xda, V64, I32, VM>;
+}
+let cx = 1, cx2 = 0 in
+defm PVSUBSUP : RVm<"pvsubs.up", 0xda, V64, I64, VM>;
+let cx = 1, cx2 = 1 in
+defm PVSUBS : RVm<"pvsubs", 0xda, V64, I64, VM512>;
+def : MnemonicAlias<"pvsubs.lo.sx", "vsubs.w.sx">;
+def : MnemonicAlias<"vsubs.w.zx", "pvsubs.lo">;
+def : MnemonicAlias<"vsubs.w", "pvsubs.lo">;
+def : MnemonicAlias<"pvsubs.lo.zx", "pvsubs.lo">;
+
+// Section 8.10.6 - VSBX (Vector Subtract)
+defm VSUBSL : RVm<"vsubs.l", 0x9b, V64, I64, VM>;
+
+// Section 8.10.7 - VMPY (Vector Multiply)
+let cx2 = 0 in
+defm VMULUL : RVm<"vmulu.l", 0xc9, V64, I64, VM>;
+let cx2 = 1 in
+defm VMULUW : RVm<"vmulu.w", 0xc9, V64, I32, VM>;
+
+// Section 8.10.8 - VMPS (Vector Multiply Single)
+let cx2 = 0 in
+defm VMULSWSX : RVm<"vmuls.w.sx", 0xcb, V64, I32, VM>;
+let cx2 = 1 in
+defm VMULSWZX : RVm<"vmuls.w.zx", 0xcb, V64, I32, VM>;
+def : MnemonicAlias<"vmuls.w", "vmuls.w.zx">;
+
+// Section 8.10.9 - VMPX (Vector Multiply)
+defm VMULSL : RVm<"vmuls.l", 0xdb, V64, I64, VM>;
+
+// Section 8.10.10 - VMPD (Vector Multiply)
+defm VMULSLW : RVm<"vmuls.l.w", 0xd9, V64, I32, VM>;
+
+// Section 8.10.11 - VDIV (Vector Divide)
+let cx2 = 0 in
+defm VDIVUL : RVDIVm<"vdivu.l", 0xe9, V64, I64, VM>;
+let cx2 = 1 in
+defm VDIVUW : RVDIVm<"vdivu.w", 0xe9, V64, I32, VM>;
+
+// Section 8.10.12 - VDVS (Vector Divide Single)
+let cx2 = 0 in
+defm VDIVSWSX : RVDIVm<"vdivs.w.sx", 0xeb, V64, I32, VM>;
+let cx2 = 1 in
+defm VDIVSWZX : RVDIVm<"vdivs.w.zx", 0xeb, V64, I32, VM>;
+def : MnemonicAlias<"vdivs.w", "vdivs.w.zx">;
+
+// Section 8.10.13 - VDVX (Vector Divide)
+defm VDIVSL : RVDIVm<"vdivs.l", 0xfb, V64, I64, VM>;
+
+// Section 8.10.14 - VCMP (Vector Compare)
+let cx = 0, cx2 = 0 in
+defm VCMPUL : RVm<"vcmpu.l", 0xb9, V64, I64, VM>;
+let cx = 0, cx2 = 1 in {
+ defm PVCMPULO : RVm<"pvcmpu.lo", 0xb9, V64, I32, VM>;
+ let isCodeGenOnly = 1 in
+ defm VCMPUW : RVm<"vcmpu.w", 0xb9, V64, I32, VM>;
+}
+let cx = 1, cx2 = 0 in
+defm PVCMPUUP : RVm<"pvcmpu.up", 0xb9, V64, I64, VM>;
+let cx = 1, cx2 = 1 in
+defm PVCMPU : RVm<"pvcmpu", 0xb9, V64, I64, VM512>;
+def : MnemonicAlias<"vcmpu.w", "pvcmpu.lo">;
+
+// Section 8.10.15 - VCPS (Vector Compare Single)
+let cx = 0, cx2 = 0 in
+defm VCMPSWSX : RVm<"vcmps.w.sx", 0xfa, V64, I32, VM>;
+let cx = 0, cx2 = 1 in {
+ defm PVCMPSLO : RVm<"pvcmps.lo", 0xfa, V64, I32, VM>;
+ let isCodeGenOnly = 1 in
+ defm VCMPSWZX : RVm<"vcmps.w.zx", 0xfa, V64, I32, VM>;
+}
+let cx = 1, cx2 = 0 in
+defm PVCMPSUP : RVm<"pvcmps.up", 0xfa, V64, I64, VM>;
+let cx = 1, cx2 = 1 in
+defm PVCMPS : RVm<"pvcmps", 0xfa, V64, I64, VM512>;
+def : MnemonicAlias<"pvcmps.lo.sx", "vcmps.w.sx">;
+def : MnemonicAlias<"vcmps.w.zx", "pvcmps.lo">;
+def : MnemonicAlias<"vcmps.w", "pvcmps.lo">;
+def : MnemonicAlias<"pvcmps.lo.zx", "pvcmps.lo">;
+
+// Section 8.10.16 - VCPX (Vector Compare)
+defm VCMPSL : RVm<"vcmps.l", 0xba, V64, I64, VM>;
+
+// Section 8.10.17 - VCMS (Vector Compare and Select Maximum/Minimum Single)
+let cx = 0, cx2 = 0 in
+defm VMAXSWSX : RVm<"vmaxs.w.sx", 0x8a, V64, I32, VM>;
+let cx = 0, cx2 = 1 in {
+ defm PVMAXSLO : RVm<"pvmaxs.lo", 0x8a, V64, I32, VM>;
+ let isCodeGenOnly = 1 in
+ defm VMAXSWZX : RVm<"vmaxs.w.zx", 0x8a, V64, I32, VM>;
+}
+let cx = 1, cx2 = 0 in
+defm PVMAXSUP : RVm<"pvmaxs.up", 0x8a, V64, I64, VM>;
+let cx = 1, cx2 = 1 in
+defm PVMAXS : RVm<"pvmaxs", 0x8a, V64, I64, VM512>;
+let cs2 = 1 in {
+ let cx = 0, cx2 = 0 in
+ defm VMINSWSX : RVm<"vmins.w.sx", 0x8a, V64, I32, VM>;
+ let cx = 0, cx2 = 1 in {
+ defm PVMINSLO : RVm<"pvmins.lo", 0x8a, V64, I32, VM>;
+ let isCodeGenOnly = 1 in
+ defm VMINSWZX : RVm<"vmins.w.zx", 0x8a, V64, I32, VM>;
+ }
+ let cx = 1, cx2 = 0 in
+ defm PVMINSUP : RVm<"pvmins.up", 0x8a, V64, I64, VM>;
+ let cx = 1, cx2 = 1 in
+ defm PVMINS : RVm<"pvmins", 0x8a, V64, I64, VM512>;
+}
+def : MnemonicAlias<"pvmaxs.lo.sx", "vmaxs.w.sx">;
+def : MnemonicAlias<"vmaxs.w.zx", "pvmaxs.lo">;
+def : MnemonicAlias<"vmaxs.w", "pvmaxs.lo">;
+def : MnemonicAlias<"pvmaxs.lo.zx", "pvmaxs.lo">;
+def : MnemonicAlias<"pvmins.lo.sx", "vmins.w.sx">;
+def : MnemonicAlias<"vmins.w.zx", "pvmins.lo">;
+def : MnemonicAlias<"vmins.w", "pvmins.lo">;
+def : MnemonicAlias<"pvmins.lo.zx", "pvmins.lo">;
+
+// Section 8.10.18 - VCMX (Vector Compare and Select Maximum/Minimum)
+defm VMAXSL : RVm<"vmaxs.l", 0x9a, V64, I64, VM>;
+let cs2 = 1 in
+defm VMINSL : RVm<"vmins.l", 0x9a, V64, I64, VM>;
+
+//-----------------------------------------------------------------------------
+// Section 8.11 - Vector Logical Operation Instructions
+//-----------------------------------------------------------------------------
+
+// Section 8.11.1 - VAND (Vector And)
+let cx = 0, cx2 = 0 in defm VAND : RVLm<"vand", 0xc4, I64, V64, VM>;
+let cx = 0, cx2 = 1 in defm PVANDLO : RVLm<"pvand.lo", 0xc4, I32, V64, VM>;
+let cx = 1, cx2 = 0 in defm PVANDUP : RVLm<"pvand.up", 0xc4, F32, V64, VM>;
+let cx = 1, cx2 = 1 in defm PVAND : RVLm<"pvand", 0xc4, I64, V64, VM512>;
+
+// Section 8.11.2 - VOR (Vector Or)
+let cx = 0, cx2 = 0 in defm VOR : RVLm<"vor", 0xc5, I64, V64, VM>;
+let cx = 0, cx2 = 1 in defm PVORLO : RVLm<"pvor.lo", 0xc5, I32, V64, VM>;
+let cx = 1, cx2 = 0 in defm PVORUP : RVLm<"pvor.up", 0xc5, F32, V64, VM>;
+let cx = 1, cx2 = 1 in defm PVOR : RVLm<"pvor", 0xc5, I64, V64, VM512>;
+
+// Section 8.11.3 - VXOR (Vector Exclusive Or)
+let cx = 0, cx2 = 0 in defm VXOR : RVLm<"vxor", 0xc6, I64, V64, VM>;
+let cx = 0, cx2 = 1 in defm PVXORLO : RVLm<"pvxor.lo", 0xc6, I32, V64, VM>;
+let cx = 1, cx2 = 0 in defm PVXORUP : RVLm<"pvxor.up", 0xc6, F32, V64, VM>;
+let cx = 1, cx2 = 1 in defm PVXOR : RVLm<"pvxor", 0xc6, I64, V64, VM512>;
+
+// Section 8.11.4 - VEQV (Vector Equivalence)
+let cx = 0, cx2 = 0 in defm VEQV : RVLm<"veqv", 0xc7, I64, V64, VM>;
+let cx = 0, cx2 = 1 in defm PVEQVLO : RVLm<"pveqv.lo", 0xc7, I32, V64, VM>;
+let cx = 1, cx2 = 0 in defm PVEQVUP : RVLm<"pveqv.up", 0xc7, F32, V64, VM>;
+let cx = 1, cx2 = 1 in defm PVEQV : RVLm<"pveqv", 0xc7, I64, V64, VM512>;
+
+// Section 8.11.5 - VLDZ (Vector Leading Zero Count)
+let cx = 0, cx2 = 0 in defm VLDZ : RV1m<"vldz", 0xe7, V64, VM>;
+let cx = 0, cx2 = 1 in defm PVLDZLO : RV1m<"pvldz.lo", 0xe7, V64, VM>;
+let cx = 1, cx2 = 0 in defm PVLDZUP : RV1m<"pvldz.up", 0xe7, V64, VM>;
+let cx = 1, cx2 = 1 in defm PVLDZ : RV1m<"pvldz", 0xe7, V64, VM512>;
+
+// Section 8.11.6 - VPCNT (Vector Population Count)
+let cx = 0, cx2 = 0 in defm VPCNT : RV1m<"vpcnt", 0xac, V64, VM>;
+let cx = 0, cx2 = 1 in defm PVPCNTLO : RV1m<"pvpcnt.lo", 0xac, V64, VM>;
+let cx = 1, cx2 = 0 in defm PVPCNTUP : RV1m<"pvpcnt.up", 0xac, V64, VM>;
+let cx = 1, cx2 = 1 in defm PVPCNT : RV1m<"pvpcnt", 0xac, V64, VM512>;
+
+// Section 8.11.7 - VBRV (Vector Bit Reverse)
+let cx = 0, cx2 = 0 in defm VBRV : RV1m<"vbrv", 0xf7, V64, VM>;
+let cx = 0, cx2 = 1 in defm PVBRVLO : RV1m<"pvbrv.lo", 0xf7, V64, VM>;
+let cx = 1, cx2 = 0 in defm PVBRVUP : RV1m<"pvbrv.up", 0xf7, V64, VM>;
+let cx = 1, cx2 = 1 in defm PVBRV : RV1m<"pvbrv", 0xf7, V64, VM512>;
+
+// Section 8.11.8 - VSEQ (Vector Sequential Number)
+let cx = 0, cx2 = 0 in defm VSEQ : RV0m<"vseq", 0x99, V64, VM>;
+let cx = 0, cx2 = 1 in defm PVSEQLO : RV0m<"pvseq.lo", 0x99, V64, VM>;
+let cx = 1, cx2 = 0 in defm PVSEQUP : RV0m<"pvseq.up", 0x99, V64, VM>;
+let cx = 1, cx2 = 1 in defm PVSEQ : RV0m<"pvseq", 0x99, V64, VM512>;
+
+//-----------------------------------------------------------------------------
+// Section 8.12 - Vector Shift Operation Instructions
+//-----------------------------------------------------------------------------
+
+// Section 8.12.1 - VSLL (Vector Shift Left Logical)
+let cx = 0, cx2 = 0 in defm VSLL : RVSm<"vsll", 0xe5, I64, V64, VM>;
+let cx = 0, cx2 = 1 in defm PVSLLLO : RVSm<"pvsll.lo", 0xe5, I32, V64, VM>;
+let cx = 1, cx2 = 0 in defm PVSLLUP : RVSm<"pvsll.up", 0xe5, F32, V64, VM>;
+let cx = 1, cx2 = 1 in defm PVSLL : RVSm<"pvsll", 0xe5, I64, V64, VM512>;
+
+// Section 8.12.2 - VSLD (Vector Shift Left Double)
+defm VSLD : RVSDm<"vsld", 0xe4, V64, VM>;
+
+// Section 8.12.3 - VSRL (Vector Shift Right Logical)
+let cx = 0, cx2 = 0 in defm VSRL : RVSm<"vsrl", 0xf5, I64, V64, VM>;
+let cx = 0, cx2 = 1 in defm PVSRLLO : RVSm<"pvsrl.lo", 0xf5, I32, V64, VM>;
+let cx = 1, cx2 = 0 in defm PVSRLUP : RVSm<"pvsrl.up", 0xf5, F32, V64, VM>;
+let cx = 1, cx2 = 1 in defm PVSRL : RVSm<"pvsrl", 0xf5, I64, V64, VM512>;
+
+// Section 8.12.4 - VSRD (Vector Shift Right Double)
+defm VSRD : RVSDm<"vsrd", 0xf4, V64, VM>;
+
+// Section 8.12.5 - VSLA (Vector Shift Left Arithmetic)
+let cx = 0, cx2 = 0 in defm VSLAWSX : RVSm<"vsla.w.sx", 0xe6, I32, V64, VM>;
+let cx = 0, cx2 = 1 in {
+ defm PVSLALO : RVSm<"pvsla.lo", 0xe6, I32, V64, VM>;
+ let isCodeGenOnly = 1 in defm VSLAWZX : RVSm<"vsla.w.zx", 0xe6, I32, V64, VM>;
+}
+let cx = 1, cx2 = 0 in defm PVSLAUP : RVSm<"pvsla.up", 0xe6, F32, V64, VM>;
+let cx = 1, cx2 = 1 in defm PVSLA : RVSm<"pvsla", 0xe6, I64, V64, VM512>;
+def : MnemonicAlias<"pvsla.lo.sx", "vsla.w.sx">;
+def : MnemonicAlias<"vsla.w.zx", "pvsla.lo">;
+def : MnemonicAlias<"vsla.w", "pvsla.lo">;
+def : MnemonicAlias<"pvsla.lo.zx", "pvsla.lo">;
+
+// Section 8.12.6 - VSLAX (Vector Shift Left Arithmetic)
+defm VSLAL : RVSm<"vsla.l", 0xd4, I64, V64, VM>;
+
+// Section 8.12.7 - VSRA (Vector Shift Right Arithmetic)
+let cx = 0, cx2 = 0 in defm VSRAWSX : RVSm<"vsra.w.sx", 0xf6, I32, V64, VM>;
+let cx = 0, cx2 = 1 in {
+ defm PVSRALO : RVSm<"pvsra.lo", 0xf6, I32, V64, VM>;
+ let isCodeGenOnly = 1 in defm VSRAWZX : RVSm<"vsra.w.zx", 0xf6, I32, V64, VM>;
+}
+let cx = 1, cx2 = 0 in defm PVSRAUP : RVSm<"pvsra.up", 0xf6, F32, V64, VM>;
+let cx = 1, cx2 = 1 in defm PVSRA : RVSm<"pvsra", 0xf6, I64, V64, VM512>;
+def : MnemonicAlias<"pvsra.lo.sx", "vsra.w.sx">;
+def : MnemonicAlias<"vsra.w.zx", "pvsra.lo">;
+def : MnemonicAlias<"vsra.w", "pvsra.lo">;
+def : MnemonicAlias<"pvsra.lo.zx", "pvsra.lo">;
+
+// Section 8.12.8 - VSRAX (Vector Shift Right Arithmetic)
+defm VSRAL : RVSm<"vsra.l", 0xd5, I64, V64, VM>;
+
+// Section 8.12.9 - VSFA (Vector Shift Left and Add)
+defm VSFA : RVSAm<"vsfa", 0xd7, V64, VM>;
+
+//-----------------------------------------------------------------------------
+// Section 8.13 - Vector Floating-Point Arithmetic Instructions
+//-----------------------------------------------------------------------------
+
+// Section 8.13.1 - VFAD (Vector Floating Add)
+let cx = 0, cx2 = 0 in
+defm VFADDD : RVm<"vfadd.d", 0xcc, V64, I64, VM, simm7fp>;
+let cx = 0, cx2 = 1 in
+defm PVFADDLO : RVm<"pvfadd.lo", 0xcc, V64, I64, VM, simm7fp>;
+let cx = 1, cx2 = 0 in {
+ defm PVFADDUP : RVm<"pvfadd.up", 0xcc, V64, F32, VM, simm7fp>;
+ let isCodeGenOnly = 1 in
+ defm VFADDS : RVm<"vfadd.s", 0xcc, V64, F32, VM, simm7fp>;
+}
+let cx = 1, cx2 = 1 in
+defm PVFADD : RVm<"pvfadd", 0xcc, V64, I64, VM512, simm7fp>;
+def : MnemonicAlias<"vfadd.s", "pvfadd.up">;
+
+// Section 8.13.2 - VFSB (Vector Floating Subtract)
+let cx = 0, cx2 = 0 in
+defm VFSUBD : RVm<"vfsub.d", 0xdc, V64, I64, VM, simm7fp>;
+let cx = 0, cx2 = 1 in
+defm PVFSUBLO : RVm<"pvfsub.lo", 0xdc, V64, I64, VM, simm7fp>;
+let cx = 1, cx2 = 0 in {
+ defm PVFSUBUP : RVm<"pvfsub.up", 0xdc, V64, F32, VM, simm7fp>;
+ let isCodeGenOnly = 1 in
+ defm VFSUBS : RVm<"vfsub.s", 0xdc, V64, F32, VM, simm7fp>;
+}
+let cx = 1, cx2 = 1 in
+defm PVFSUB : RVm<"pvfsub", 0xdc, V64, I64, VM512, simm7fp>;
+def : MnemonicAlias<"vfsub.s", "pvfsub.up">;
+
+// Section 8.13.3 - VFMP (Vector Floating Multiply)
+let cx = 0, cx2 = 0 in
+defm VFMULD : RVm<"vfmul.d", 0xcd, V64, I64, VM, simm7fp>;
+let cx = 0, cx2 = 1 in
+defm PVFMULLO : RVm<"pvfmul.lo", 0xcd, V64, I64, VM, simm7fp>;
+let cx = 1, cx2 = 0 in {
+ defm PVFMULUP : RVm<"pvfmul.up", 0xcd, V64, F32, VM, simm7fp>;
+ let isCodeGenOnly = 1 in
+ defm VFMULS : RVm<"vfmul.s", 0xcd, V64, F32, VM, simm7fp>;
+}
+let cx = 1, cx2 = 1 in
+defm PVFMUL : RVm<"pvfmul", 0xcd, V64, I64, VM512, simm7fp>;
+def : MnemonicAlias<"vfmul.s", "pvfmul.up">;
+
+// Section 8.13.4 - VFDV (Vector Floating Divide)
+defm VFDIVD : RVDIVm<"vfdiv.d", 0xdd, V64, I64, VM, simm7fp>;
+let cx = 1 in
+defm VFDIVS : RVDIVm<"vfdiv.s", 0xdd, V64, F32, VM, simm7fp>;
+
+// Section 8.13.5 - VFSQRT (Vector Floating Square Root)
+defm VFSQRTD : RVF1m<"vfsqrt.d", 0xed, V64, VM>;
+let cx = 1 in
+defm VFSQRTS : RVF1m<"vfsqrt.s", 0xed, V64, VM>;
+
+// Section 8.13.6 - VFCP (Vector Floating Compare)
+let cx = 0, cx2 = 0 in
+defm VFCMPD : RVm<"vfcmp.d", 0xfc, V64, I64, VM, simm7fp>;
+let cx = 0, cx2 = 1 in
+defm PVFCMPLO : RVm<"pvfcmp.lo", 0xfc, V64, I64, VM, simm7fp>;
+let cx = 1, cx2 = 0 in {
+ defm PVFCMPUP : RVm<"pvfcmp.up", 0xfc, V64, F32, VM, simm7fp>;
+ let isCodeGenOnly = 1 in
+ defm VFCMPS : RVm<"vfcmp.s", 0xfc, V64, F32, VM, simm7fp>;
+}
+let cx = 1, cx2 = 1 in
+defm PVFCMP : RVm<"pvfcmp", 0xfc, V64, I64, VM512, simm7fp>;
+def : MnemonicAlias<"vfcmp.s", "pvfcmp.up">;
+
+// Section 8.13.7 - VFCM (Vector Floating Compare and Select Maximum/Minimum)
+let cx = 0, cx2 = 0 in
+defm VFMAXD : RVm<"vfmax.d", 0xbd, V64, I64, VM, simm7fp>;
+let cx = 0, cx2 = 1 in
+defm PVFMAXLO : RVm<"pvfmax.lo", 0xbd, V64, I64, VM, simm7fp>;
+let cx = 1, cx2 = 0 in {
+ defm PVFMAXUP : RVm<"pvfmax.up", 0xbd, V64, F32, VM, simm7fp>;
+ let isCodeGenOnly = 1 in
+ defm VFMAXS : RVm<"vfmax.s", 0xbd, V64, F32, VM, simm7fp>;
+}
+let cx = 1, cx2 = 1 in
+defm PVFMAX : RVm<"pvfmax", 0xbd, V64, I64, VM512, simm7fp>;
+let cs2 = 1 in {
+ let cx = 0, cx2 = 0 in
+ defm VFMIND : RVm<"vfmin.d", 0xbd, V64, I64, VM, simm7fp>;
+ let cx = 0, cx2 = 1 in
+ defm PVFMINLO : RVm<"pvfmin.lo", 0xbd, V64, I64, VM, simm7fp>;
+ let cx = 1, cx2 = 0 in {
+ defm PVFMINUP : RVm<"pvfmin.up", 0xbd, V64, F32, VM, simm7fp>;
+ let isCodeGenOnly = 1 in
+ defm VFMINS : RVm<"vfmin.s", 0xbd, V64, F32, VM, simm7fp>;
+ }
+ let cx = 1, cx2 = 1 in
+ defm PVFMIN : RVm<"pvfmin", 0xbd, V64, I64, VM512, simm7fp>;
+}
+def : MnemonicAlias<"vfmax.s", "pvfmax.up">;
+def : MnemonicAlias<"vfmin.s", "pvfmin.up">;
+
+// Section 8.13.8 - VFMAD (Vector Floating Fused Multiply Add)
+let cx = 0, cx2 = 0 in
+defm VFMADD : RVMm<"vfmad.d", 0xe2, V64, I64, VM, simm7fp>;
+let cx = 0, cx2 = 1 in
+defm PVFMADLO : RVMm<"pvfmad.lo", 0xe2, V64, I64, VM, simm7fp>;
+let cx = 1, cx2 = 0 in {
+ defm PVFMADUP : RVMm<"pvfmad.up", 0xe2, V64, F32, VM, simm7fp>;
+ let isCodeGenOnly = 1 in
+ defm VFMADS : RVMm<"vfmad.s", 0xe2, V64, F32, VM, simm7fp>;
+}
+let cx = 1, cx2 = 1 in
+defm PVFMAD : RVMm<"pvfmad", 0xe2, V64, I64, VM512, simm7fp>;
+def : MnemonicAlias<"vfmad.s", "pvfmad.up">;
+
+// Section 8.13.9 - VFMSB (Vector Floating Fused Multiply Subtract)
+let cx = 0, cx2 = 0 in
+defm VFMSBD : RVMm<"vfmsb.d", 0xf2, V64, I64, VM, simm7fp>;
+let cx = 0, cx2 = 1 in
+defm PVFMSBLO : RVMm<"pvfmsb.lo", 0xf2, V64, I64, VM, simm7fp>;
+let cx = 1, cx2 = 0 in {
+ defm PVFMSBUP : RVMm<"pvfmsb.up", 0xf2, V64, F32, VM, simm7fp>;
+ let isCodeGenOnly = 1 in
+ defm VFMSBS : RVMm<"vfmsb.s", 0xf2, V64, F32, VM, simm7fp>;
+}
+let cx = 1, cx2 = 1 in
+defm PVFMSB : RVMm<"pvfmsb", 0xf2, V64, I64, VM512, simm7fp>;
+def : MnemonicAlias<"vfmsb.s", "pvfmsb.up">;
+
+// Section 8.13.10 - VFNMAD (Vector Floating Fused Negative Multiply Add)
+let cx = 0, cx2 = 0 in
+defm VFNMADD : RVMm<"vfnmad.d", 0xe3, V64, I64, VM, simm7fp>;
+let cx = 0, cx2 = 1 in
+defm PVFNMADLO : RVMm<"pvfnmad.lo", 0xe3, V64, I64, VM, simm7fp>;
+let cx = 1, cx2 = 0 in {
+ defm PVFNMADUP : RVMm<"pvfnmad.up", 0xe3, V64, F32, VM, simm7fp>;
+ let isCodeGenOnly = 1 in
+ defm VFNMADS : RVMm<"vfnmad.s", 0xe3, V64, F32, VM, simm7fp>;
+}
+let cx = 1, cx2 = 1 in
+defm PVFNMAD : RVMm<"pvfnmad", 0xe3, V64, I64, VM512, simm7fp>;
+def : MnemonicAlias<"vfnmad.s", "pvfnmad.up">;
+
+// Section 8.13.11 - VFNMSB (Vector Floating Fused Negative Multiply Subtract)
+let cx = 0, cx2 = 0 in
+defm VFNMSBD : RVMm<"vfnmsb.d", 0xf3, V64, I64, VM, simm7fp>;
+let cx = 0, cx2 = 1 in
+defm PVFNMSBLO : RVMm<"pvfnmsb.lo", 0xf3, V64, I64, VM, simm7fp>;
+let cx = 1, cx2 = 0 in {
+ defm PVFNMSBUP : RVMm<"pvfnmsb.up", 0xf3, V64, F32, VM, simm7fp>;
+ let isCodeGenOnly = 1 in
+ defm VFNMSBS : RVMm<"vfnmsb.s", 0xf3, V64, F32, VM, simm7fp>;
+}
+let cx = 1, cx2 = 1 in
+defm PVFNMSB : RVMm<"pvfnmsb", 0xf3, V64, I64, VM512, simm7fp>;
+def : MnemonicAlias<"vfnmsb.s", "pvfnmsb.up">;
+
+// Section 8.13.12 - VRCP (Vector Floating Reciprocal)
+let cx = 0, cx2 = 0 in defm VRCPD : RVF1m<"vrcp.d", 0xe1, V64, VM>;
+let cx = 0, cx2 = 1 in defm PVRCPLO : RVF1m<"pvrcp.lo", 0xe1, V64, VM>;
+let cx = 1, cx2 = 0 in {
+ defm PVRCPUP : RVF1m<"pvrcp.up", 0xe1, V64, VM>;
+ let isCodeGenOnly = 1 in defm VRCPS : RVF1m<"vrcp.s", 0xe1, V64, VM>;
+}
+let cx = 1, cx2 = 1 in defm PVRCP : RVF1m<"pvrcp", 0xe1, V64, VM512>;
+def : MnemonicAlias<"vrcp.s", "pvrcp.up">;
+
+// Section 8.13.13 - VRSQRT (Vector Floating Reciprocal Square Root)
+let cx = 0, cx2 = 0 in defm VRSQRTD : RVF1m<"vrsqrt.d", 0xf1, V64, VM>;
+let cx = 0, cx2 = 1 in defm PVRSQRTLO : RVF1m<"pvrsqrt.lo", 0xf1, V64, VM>;
+let cx = 1, cx2 = 0 in {
+ defm PVRSQRTUP : RVF1m<"pvrsqrt.up", 0xf1, V64, VM>;
+ let isCodeGenOnly = 1 in
+ defm VRSQRTS : RVF1m<"vrsqrt.s", 0xf1, V64, VM>;
+}
+let cx = 1, cx2 = 1 in
+defm PVRSQRT : RVF1m<"pvrsqrt", 0xf1, V64, VM512>;
+let cs2 = 1 in {
+ let cx = 0, cx2 = 0 in
+ defm VRSQRTDNEX : RVF1m<"vrsqrt.d.nex", 0xf1, V64, VM>;
+ let cx = 0, cx2 = 1 in
+ defm PVRSQRTLONEX : RVF1m<"pvrsqrt.lo.nex", 0xf1, V64, VM>;
+ let cx = 1, cx2 = 0 in {
+ defm PVRSQRTUPNEX : RVF1m<"pvrsqrt.up.nex", 0xf1, V64, VM>;
+ let isCodeGenOnly = 1 in
+ defm VRSQRTSNEX : RVF1m<"vrsqrt.s.nex", 0xf1, V64, VM>;
+ }
+ let cx = 1, cx2 = 1 in
+ defm PVRSQRTNEX : RVF1m<"pvrsqrt.nex", 0xf1, V64, VM512>;
+}
+def : MnemonicAlias<"vrsqrt.s", "pvrsqrt.up">;
+def : MnemonicAlias<"vrsqrt.s.nex", "pvrsqrt.up.nex">;
+
+// Section 8.13.14 - VFIX (Vector Convert to Fixed Point)
+let cx = 0, cx2 = 0, cs2 = 0 in
+defm VCVTWDSX : RVFIXm<"vcvt.w.d.sx", 0xe8, V64, VM>;
+let cx = 0, cx2 = 1, cs2 = 0 in
+defm VCVTWDZX : RVFIXm<"vcvt.w.d.zx", 0xe8, V64, VM>;
+let cx = 1, cx2 = 0, cs2 = 0 in
+defm VCVTWSSX : RVFIXm<"vcvt.w.s.sx", 0xe8, V64, VM>;
+let cx = 1, cx2 = 1, cs2 = 0 in
+defm VCVTWSZX : RVFIXm<"vcvt.w.s.zx", 0xe8, V64, VM>;
+let cx = 0, cx2 = 1, cs2 = 1 in
+defm PVCVTWSLO : RVFIXm<"pvcvt.w.s.lo", 0xe8, V64, VM>;
+let cx = 1, cx2 = 0, cs2 = 1 in
+defm PVCVTWSUP : RVFIXm<"pvcvt.w.s.up", 0xe8, V64, VM>;
+let cx = 1, cx2 = 1, cs2 = 1 in
+defm PVCVTWS : RVFIXm<"pvcvt.w.s", 0xe8, V64, VM512>;
+
+// Section 8.13.15 - VFIXX (Vector Convert to Fixed Point)
+defm VCVTLD : RVFIXm<"vcvt.l.d", 0xa8, V64, VM>;
+
+// Section 8.13.16 - VFLT (Vector Convert to Floating Point)
+let cx = 0, cx2 = 0, cs2 = 0 in
+defm VCVTDW : RVF1m<"vcvt.d.w", 0xf8, V64, VM>;
+let cx = 1, cx2 = 0, cs2 = 0 in
+defm VCVTSW : RVF1m<"vcvt.s.w", 0xf8, V64, VM>;
+let cx = 0, cx2 = 1, cs2 = 1 in
+defm PVCVTSWLO : RVF1m<"pvcvt.s.w.lo", 0xf8, V64, VM>;
+let cx = 1, cx2 = 0, cs2 = 1 in
+defm PVCVTSWUP : RVF1m<"pvcvt.s.w.up", 0xf8, V64, VM>;
+let cx = 1, cx2 = 1, cs2 = 1 in
+defm PVCVTSW : RVF1m<"pvcvt.s.w", 0xf8, V64, VM512>;
+
+// Section 8.13.17 - VFLTX (Vector Convert to Floating Point)
+defm VCVTDL : RVF1m<"vcvt.d.l", 0xb8, V64, VM>;
+
+// Section 8.13.18 - VCVS (Vector Convert to Single-format)
+defm VCVTSD : RVF1m<"vcvt.s.d", 0x9f, V64, VM>;
+
+// Section 8.13.19 - VCVD (Vector Convert to Double-format)
+defm VCVTDS : RVF1m<"vcvt.d.s", 0x8f, V64, VM>;
+
+//-----------------------------------------------------------------------------
+// Section 8.14 - Vector Reduction Instructions
+//-----------------------------------------------------------------------------
+
+// Section 8.14.1 - VSUMS (Vector Sum Single)
+defm VSUMWSX : RVF1m<"vsum.w.sx", 0xea, V64, VM>;
+let cx2 = 1 in defm VSUMWZX : RVF1m<"vsum.w.zx", 0xea, V64, VM>;
+
+// Section 8.14.2 - VSUMX (Vector Sum)
+defm VSUML : RVF1m<"vsum.l", 0xaa, V64, VM>;
+
+// Section 8.14.3 - VFSUM (Vector Floating Sum)
+defm VFSUMD : RVF1m<"vfsum.d", 0xec, V64, VM>;
+let cx = 1 in defm VFSUMS : RVF1m<"vfsum.s", 0xec, V64, VM>;
+
+// Section 8.14.4 - VMAXS (Vector Maximum/Minimum Single)
+let cx2 = 0 in defm VRMAXSWFSTSX : RVF1m<"vrmaxs.w.fst.sx", 0xbb, V64, VM>;
+let cx2 = 1 in defm VRMAXSWFSTZX : RVF1m<"vrmaxs.w.fst.zx", 0xbb, V64, VM>;
+let cs = 1 in {
+ let cx2 = 0 in
+ defm VRMAXSWLSTSX : RVF1m<"vrmaxs.w.lst.sx", 0xbb, V64, VM>;
+ let cx2 = 1 in
+ defm VRMAXSWLSTZX : RVF1m<"vrmaxs.w.lst.zx", 0xbb, V64, VM>;
+}
+let cs2 = 1 in {
+ let cx2 = 0 in
+ defm VRMINSWFSTSX : RVF1m<"vrmins.w.fst.sx", 0xbb, V64, VM>;
+ let cx2 = 1 in
+ defm VRMINSWFSTZX : RVF1m<"vrmins.w.fst.zx", 0xbb, V64, VM>;
+ let cs = 1 in {
+ let cx2 = 0 in
+ defm VRMINSWLSTSX : RVF1m<"vrmins.w.lst.sx", 0xbb, V64, VM>;
+ let cx2 = 1 in
+ defm VRMINSWLSTZX : RVF1m<"vrmins.w.lst.zx", 0xbb, V64, VM>;
+ }
+}
+
+// Section 8.14.5 - VMAXX (Vector Maximum/Minimum)
+let cs = 0 in defm VRMAXSLFST : RVF1m<"vrmaxs.l.fst", 0xab, V64, VM>;
+let cs = 1 in defm VRMAXSLLST : RVF1m<"vrmaxs.l.lst", 0xab, V64, VM>;
+let cs2 = 1 in {
+ let cs = 0 in defm VRMINSLFST : RVF1m<"vrmins.l.fst", 0xab, V64, VM>;
+ let cs = 1 in defm VRMINSLLST : RVF1m<"vrmins.l.lst", 0xab, V64, VM>;
+}
+
+// Section 8.14.6 - VFMAX (Vector Floating Maximum/Minimum)
+let cs = 0 in defm VFRMAXDFST : RVF1m<"vfrmax.d.fst", 0xad, V64, VM>;
+let cs = 1 in defm VFRMAXDLST : RVF1m<"vfrmax.d.lst", 0xad, V64, VM>;
+let cs2 = 1 in {
+ let cs = 0 in defm VFRMINDFST : RVF1m<"vfrmin.d.fst", 0xad, V64, VM>;
+ let cs = 1 in defm VFRMINDLST : RVF1m<"vfrmin.d.lst", 0xad, V64, VM>;
+}
+let cx = 1 in {
+ let cs = 0 in defm VFRMAXSFST : RVF1m<"vfrmax.s.fst", 0xad, V64, VM>;
+ let cs = 1 in defm VFRMAXSLST : RVF1m<"vfrmax.s.lst", 0xad, V64, VM>;
+ let cs2 = 1 in {
+ let cs = 0 in defm VFRMINSFST : RVF1m<"vfrmin.s.fst", 0xad, V64, VM>;
+ let cs = 1 in defm VFRMINSLST : RVF1m<"vfrmin.s.lst", 0xad, V64, VM>;
+ }
+}
+
+// Section 8.14.7 - VRAND (Vector Reduction And)
+defm VRAND : RVF1m<"vrand", 0x88, V64, VM>;
+
+// Section 8.14.8 - VROR (Vector Reduction Or)
+defm VROR : RVF1m<"vror", 0x98, V64, VM>;
+
+// Section 8.14.9 - VRXOR (Vector Reduction Exclusive Or)
+defm VRXOR : RVF1m<"vrxor", 0x89, V64, VM>;
+
+//-----------------------------------------------------------------------------
+// Section 8.15 - Vector Iterative Operation Instructions
+//-----------------------------------------------------------------------------
+
+// Section 8.15.1 - VFIA (Vector Floating Iteration Add)
+let cx = 0 in defm VFIAD : RVI2m<"vfia.d", 0xce, V64, I64>;
+let cx = 1 in defm VFIAS : RVI2m<"vfia.s", 0xce, V64, F32>;
+
+// Section 8.15.2 - VFIS (Vector Floating Iteration Subtract)
+let cx = 0 in defm VFISD : RVI2m<"vfis.d", 0xde, V64, I64>;
+let cx = 1 in defm VFISS : RVI2m<"vfis.s", 0xde, V64, F32>;
+
+// Section 8.15.3 - VFIM (Vector Floating Iteration Multiply)
+let cx = 0 in defm VFIMD : RVI2m<"vfim.d", 0xcf, V64, I64>;
+let cx = 1 in defm VFIMS : RVI2m<"vfim.s", 0xcf, V64, F32>;
+
+// Section 8.15.4 - VFIAM (Vector Floating Iteration Add and Multiply)
+let cx = 0 in defm VFIAMD : RVI3m<"vfiam.d", 0xee, V64, I64>;
+let cx = 1 in defm VFIAMS : RVI3m<"vfiam.s", 0xee, V64, F32>;
+
+// Section 8.15.5 - VFISM (Vector Floating Iteration Subtract and Multiply)
+let cx = 0 in defm VFISMD : RVI3m<"vfism.d", 0xfe, V64, I64>;
+let cx = 1 in defm VFISMS : RVI3m<"vfism.s", 0xfe, V64, F32>;
+
+// Section 8.15.6 - VFIMA (Vector Floating Iteration Multiply and Add)
+let cx = 0 in defm VFIMAD : RVI3m<"vfima.d", 0xef, V64, I64>;
+let cx = 1 in defm VFIMAS : RVI3m<"vfima.s", 0xef, V64, F32>;
+
+// Section 8.15.7 - VFIMS (Vector Floating Iteration Multiply and Subtract)
+let cx = 0 in defm VFIMSD : RVI3m<"vfims.d", 0xff, V64, I64>;
+let cx = 1 in defm VFIMSS : RVI3m<"vfims.s", 0xff, V64, F32>;
+
+//-----------------------------------------------------------------------------
+// Section 8.16 - Vector Merger Operation Instructions
+//-----------------------------------------------------------------------------
+
+// Section 8.16.1 - VMRG (Vector Merge)
+let cx = 0 in defm VMRG : RVm<"vmrg", 0xd6, V64, I64, VM>;
+// FIXME: vmrg.w should be named pvmrg, but we follow the assembly manual here.
+let cx = 1 in defm VMRGW : RVm<"vmrg.w", 0xd6, V64, I64, VM512>;
+def : MnemonicAlias<"vmrg.l", "vmrg">;
+
+// Section 8.16.2 - VSHF (Vector Shuffle)
+defm VSHF : RVSHFm<"vshf", 0xbc, V64>;
+
+// Section 8.16.3 - VCP (Vector Compress)
+defm VCP : RV1m<"vcp", 0x8d, V64, VM>;
+
+// Section 8.16.4 - VEX (Vector Expand)
+defm VEX : RV1m<"vex", 0x9d, V64, VM>;
+
+//-----------------------------------------------------------------------------
+// Section 8.17 - Vector Mask Operation Instructions
+//-----------------------------------------------------------------------------
+
+// Section 8.17.1 - VFMK (Vector Form Mask)
+defm VFMKL : RVMKm<"vfmk.l.", 0xb4, V64, VM>;
+def : MnemonicAlias<"vfmk.l", "vfmk.l.at">;
+
+// Section 8.17.2 - VFMS (Vector Form Mask Single)
+defm VFMKW : RVMKm<"vfmk.w.", 0xb5, V64, VM>;
+let isCodeGenOnly = 1 in defm PVFMKWLO : RVMKm<"vfmk.w.", 0xb5, V64, VM>;
+let cx = 1 in defm PVFMKWUP : RVMKm<"pvfmk.w.up.", 0xb5, V64, VM>;
+def : MnemonicAlias<"vfmk.w", "vfmk.w.at">;
+def : MnemonicAlias<"pvfmk.w.up", "pvfmk.w.up.at">;
+def : MnemonicAlias<"pvfmk.w.lo", "vfmk.w.at">;
+foreach CC = [ "af", "gt", "lt", "ne", "eq", "ge", "le", "at" ] in {
+ def : MnemonicAlias<"pvfmk.w.lo."#CC, "vfmk.w."#CC>;
+}
+
+// Section 8.17.3 - VFMF (Vector Form Mask Floating Point)
+defm VFMKD : RVMKm<"vfmk.d.", 0xb6, V64, VM>;
+let cx2 = 1 in defm PVFMKSLO : RVMKm<"pvfmk.s.lo.", 0xb6, V64, VM>;
+let cx = 1 in {
+ defm PVFMKSUP : RVMKm<"pvfmk.s.up.", 0xb6, V64, VM>;
+ let isCodeGenOnly = 1 in defm VFMKS : RVMKm<"vfmk.s.", 0xb6, V64, VM>;
+}
+def : MnemonicAlias<"vfmk.d", "vfmk.d.at">;
+def : MnemonicAlias<"pvfmk.s.lo", "pvfmk.s.lo.at">;
+def : MnemonicAlias<"pvfmk.s.up", "pvfmk.s.up.at">;
+def : MnemonicAlias<"vfmk.s", "pvfmk.s.up.at">;
+foreach CC = [ "af", "gt", "lt", "ne", "eq", "ge", "le", "at", "num", "nan",
+ "gtnan", "ltnan", "nenan", "eqnan", "genan", "lenan" ] in {
+ def : MnemonicAlias<"vfmk.s."#CC, "pvfmk.s.up."#CC>;
+}
+
+// Section 8.17.4 - ANDM (And VM)
+defm ANDM : RVM2m<"andm", 0x84, VM>;
+
+// Section 8.17.5 - ORM (Or VM)
+defm ORM : RVM2m<"orm", 0x85, VM>;
+
+// Section 8.17.6 - XORM (Exclusive Or VM)
+defm XORM : RVM2m<"xorm", 0x86, VM>;
+
+// Section 8.17.7 - EQVM (Equivalence VM)
+defm EQVM : RVM2m<"eqvm", 0x87, VM>;
+
+// Section 8.17.8 - NNDM (Negate And VM)
+defm NNDM : RVM2m<"nndm", 0x94, VM>;
+
+// Section 8.17.9 - NEGM (Negate VM)
+defm NEGM : RVM1m<"negm", 0x95, VM>;
+
+// Section 8.17.10 - PCVM (Population Count of VM)
+defm PCVM : RVMSm<"pcvm", 0xa4, VM>;
+
+// Section 8.17.11 - LZVM (Leading Zero of VM)
+defm LZVM : RVMSm<"lzvm", 0xa5, VM>;
+
+// Section 8.17.12 - TOVM (Trailing One of VM)
+defm TOVM : RVMSm<"tovm", 0xa6, VM>;
+
+//-----------------------------------------------------------------------------
+// Section 8.18 - Vector Control Instructions
+//-----------------------------------------------------------------------------
+
+// Section 8.18.1 - LVL (Load VL)
+let sx = 0, cz = 0, sz = 0, hasSideEffects = 0, Defs = [VL] in {
+ def LVLr : RR<0xbf, (outs), (ins I64:$sy), "lvl $sy">;
+ let cy = 0 in def LVLi : RR<0xbf, (outs), (ins simm7:$sy), "lvl $sy">;
+}
+
+// Section 8.18.2 - SVL (Save VL)
+let cy = 0, sy = 0, cz = 0, sz = 0, hasSideEffects = 0, Uses = [VL] in
+def SVL : RR<0x2f, (outs I64:$sx), (ins), "svl $sx">;
+
+// Section 8.18.3 - SMVL (Save Maximum Vector Length)
+let cy = 0, sy = 0, cz = 0, sz = 0, hasSideEffects = 0 in
+def SMVL : RR<0x2e, (outs I64:$sx), (ins), "smvl $sx">;
+
+// Section 8.18.4 - LVIX (Load Vector Data Index)
+let sx = 0, cz = 0, sz = 0, hasSideEffects = 0, Defs = [VIX] in {
+ def LVIXr : RR<0xaf, (outs), (ins I64:$sy), "lvix $sy">;
+ let cy = 0 in def LVIXi : RR<0xaf, (outs), (ins uimm6:$sy), "lvix $sy">;
+}
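Taken together, the multiclasses above expand each defm into a family of records: RVm and its relatives contribute the operand-kind suffix (vv, rv, iv, and so on), RVmm optionally appends m for the masked form, RVlm appends l or L when an explicit vector-length operand is carried, and RVbm appends _v for the tied pass-through variant. The l, L, and _v records are marked isCodeGenOnly above, so only the remaining forms are assembler-visible. The following standalone C++ snippet is only a sketch of that naming scheme, enumerating the names one defm such as VADDUL would produce; it is not part of the LLVM sources, and TableGen itself performs the real expansion.

#include <cstdio>
#include <string>
#include <vector>

// Enumerate the record names "defm VADDUL : RVm<...>" would yield, following
// the suffix layers RVm -> RVmm -> RVlm -> RVbm defined in the .td above.
int main() {
  const std::vector<std::string> Operands = {"vv", "rv", "iv"}; // RVm
  const std::vector<std::string> Masked = {"", "m"};            // RVmm
  const std::vector<std::string> VecLen = {"", "l", "L"};       // RVlm
  const std::vector<std::string> PassThru = {"", "_v"};         // RVbm
  for (const auto &Op : Operands)
    for (const auto &M : Masked)
      for (const auto &VL : VecLen)
        for (const auto &PT : PassThru)
          std::printf("VADDUL%s%s%s%s\n", Op.c_str(), M.c_str(), VL.c_str(),
                      PT.c_str());
  return 0;
}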
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VEMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/VE/VEMCInstLower.cpp
index 9815610510e1..bc5577ce4f97 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/VEMCInstLower.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VEMCInstLower.cpp
@@ -51,6 +51,11 @@ static MCOperand LowerOperand(const MachineInstr *MI, const MachineOperand &MO,
break;
return MCOperand::createReg(MO.getReg());
+ case MachineOperand::MO_BlockAddress:
+ return LowerSymbolOperand(
+ MI, MO, AP.GetBlockAddressSymbol(MO.getBlockAddress()), AP);
+ case MachineOperand::MO_ConstantPoolIndex:
+ return LowerSymbolOperand(MI, MO, AP.GetCPISymbol(MO.getIndex()), AP);
case MachineOperand::MO_ExternalSymbol:
return LowerSymbolOperand(
MI, MO, AP.GetExternalSymbolSymbol(MO.getSymbolName()), AP);
@@ -58,7 +63,8 @@ static MCOperand LowerOperand(const MachineInstr *MI, const MachineOperand &MO,
return LowerSymbolOperand(MI, MO, AP.getSymbol(MO.getGlobal()), AP);
case MachineOperand::MO_Immediate:
return MCOperand::createImm(MO.getImm());
-
+ case MachineOperand::MO_JumpTableIndex:
+ return LowerSymbolOperand(MI, MO, AP.GetJTISymbol(MO.getIndex()), AP);
case MachineOperand::MO_MachineBasicBlock:
return LowerSymbolOperand(MI, MO, MO.getMBB()->getSymbol(), AP);
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VERegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/VE/VERegisterInfo.cpp
index 5783a8df69d2..d175ad26c742 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/VERegisterInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VERegisterInfo.cpp
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
@@ -35,6 +36,8 @@ VERegisterInfo::VERegisterInfo() : VEGenRegisterInfo(VE::SX10) {}
const MCPhysReg *
VERegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
switch (MF->getFunction().getCallingConv()) {
+ case CallingConv::Fast:
+ // Being explicit (same as standard CC).
default:
return CSR_SaveList;
case CallingConv::PreserveAll:
@@ -45,6 +48,8 @@ VERegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const uint32_t *VERegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
switch (CC) {
+ case CallingConv::Fast:
+ // Being explicit (same as standard CC).
default:
return CSR_RegMask;
case CallingConv::PreserveAll:
@@ -82,10 +87,22 @@ BitVector VERegisterInfo::getReservedRegs(const MachineFunction &MF) const {
++ItAlias)
Reserved.set(*ItAlias);
+ // Reserve constant registers.
+ Reserved.set(VE::VM0);
+ Reserved.set(VE::VMP0);
+
return Reserved;
}
-bool VERegisterInfo::isConstantPhysReg(MCRegister PhysReg) const { return false; }
+bool VERegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
+ switch (PhysReg) {
+ case VE::VM0:
+ case VE::VMP0:
+ return true;
+ default:
+ return false;
+ }
+}
const TargetRegisterClass *
VERegisterInfo::getPointerRegClass(const MachineFunction &MF,
@@ -93,6 +110,29 @@ VERegisterInfo::getPointerRegClass(const MachineFunction &MF,
return &VE::I64RegClass;
}
+static unsigned offsetToDisp(MachineInstr &MI) {
+ // Default index of the displacement operand (reg+reg+imm addressing).
+ unsigned OffDisp = 2;
+
+#define RRCAS_multi_cases(NAME) NAME##rir : case NAME##rii
+
+ {
+ using namespace llvm::VE;
+ switch (MI.getOpcode()) {
+ case RRCAS_multi_cases(TS1AML):
+ case RRCAS_multi_cases(TS1AMW):
+ case RRCAS_multi_cases(CASL):
+ case RRCAS_multi_cases(CASW):
+ // These instructions use AS format (reg+imm).
+ OffDisp = 1;
+ break;
+ }
+ }
+#undef RRCAS_multi_cases
+
+ return OffDisp;
+}
+
static void replaceFI(MachineFunction &MF, MachineBasicBlock::iterator II,
MachineInstr &MI, const DebugLoc &dl,
unsigned FIOperandNum, int Offset, Register FrameReg) {
@@ -100,7 +140,7 @@ static void replaceFI(MachineFunction &MF, MachineBasicBlock::iterator II,
// VE has 32 bit offset field, so no need to expand a target instruction.
// Directly encode it.
MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false);
- MI.getOperand(FIOperandNum + 2).ChangeToImmediate(Offset);
+ MI.getOperand(FIOperandNum + offsetToDisp(MI)).ChangeToImmediate(Offset);
}
void VERegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
@@ -116,9 +156,41 @@ void VERegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
Register FrameReg;
int Offset;
- Offset = TFI->getFrameIndexReference(MF, FrameIndex, FrameReg);
-
- Offset += MI.getOperand(FIOperandNum + 2).getImm();
+ Offset = TFI->getFrameIndexReference(MF, FrameIndex, FrameReg).getFixed();
+
+ Offset += MI.getOperand(FIOperandNum + offsetToDisp(MI)).getImm();
+
+ if (MI.getOpcode() == VE::STQrii) {
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ Register SrcReg = MI.getOperand(3).getReg();
+ Register SrcHiReg = getSubReg(SrcReg, VE::sub_even);
+ Register SrcLoReg = getSubReg(SrcReg, VE::sub_odd);
+ // VE stores HiReg to 8(addr) and LoReg to 0(addr)
+ MachineInstr *StMI = BuildMI(*MI.getParent(), II, dl, TII.get(VE::STrii))
+ .addReg(FrameReg)
+ .addImm(0)
+ .addImm(0)
+ .addReg(SrcLoReg);
+ replaceFI(MF, II, *StMI, dl, 0, Offset, FrameReg);
+ MI.setDesc(TII.get(VE::STrii));
+ MI.getOperand(3).setReg(SrcHiReg);
+ Offset += 8;
+ } else if (MI.getOpcode() == VE::LDQrii) {
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ Register DestReg = MI.getOperand(0).getReg();
+ Register DestHiReg = getSubReg(DestReg, VE::sub_even);
+ Register DestLoReg = getSubReg(DestReg, VE::sub_odd);
+ // VE loads HiReg from 8(addr) and LoReg from 0(addr)
+ MachineInstr *StMI =
+ BuildMI(*MI.getParent(), II, dl, TII.get(VE::LDrii), DestLoReg)
+ .addReg(FrameReg)
+ .addImm(0)
+ .addImm(0);
+ replaceFI(MF, II, *StMI, dl, 1, Offset, FrameReg);
+ MI.setDesc(TII.get(VE::LDrii));
+ MI.getOperand(0).setReg(DestHiReg);
+ Offset += 8;
+ }
replaceFI(MF, II, MI, dl, FIOperandNum, Offset, FrameReg);
}
@@ -126,26 +198,3 @@ void VERegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
Register VERegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return VE::SX9;
}
-
-// VE has no architectural need for stack realignment support,
-// except that LLVM unfortunately currently implements overaligned
-// stack objects by depending upon stack realignment support.
-// If that ever changes, this can probably be deleted.
-bool VERegisterInfo::canRealignStack(const MachineFunction &MF) const {
- if (!TargetRegisterInfo::canRealignStack(MF))
- return false;
-
- // VE always has a fixed frame pointer register, so don't need to
- // worry about needing to reserve it. [even if we don't have a frame
- // pointer for our frame, it still cannot be used for other things,
- // or register window traps will be SADNESS.]
-
- // If there's a reserved call frame, we can use VE to access locals.
- if (getFrameLowering(MF)->hasReservedCallFrame(MF))
- return true;
-
- // Otherwise, we'd need a base pointer, but those aren't implemented
- // for VE at the moment.
-
- return false;
-}
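The STQrii and LDQrii handling added above splits a 128-bit frame access through a register pair into two 64-bit accesses, with the low (odd) subregister at the original offset and the high (even) subregister at offset + 8, as the inline comments note. The snippet below is a minimal standalone sketch of just that offset layout; the struct and function names are invented for illustration and are not LLVM API.

#include <cstdint>
#include <cstdio>
#include <utility>

// One 64-bit half of a split 128-bit (quad) frame access.
struct HalfAccess {
  int64_t Offset; // byte offset from the frame register
  bool HighHalf;  // true for the even (high) subregister
};

// Mirror of the STQrii/LDQrii expansion: LoReg at Offset, HiReg at Offset + 8.
static std::pair<HalfAccess, HalfAccess> splitQuadAccess(int64_t Offset) {
  return {HalfAccess{Offset, false}, HalfAccess{Offset + 8, true}};
}

int main() {
  auto [Lo, Hi] = splitQuadAccess(16);
  std::printf("low half at %lld, high half at %lld\n",
              (long long)Lo.Offset, (long long)Hi.Offset);
  return 0;
}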
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VERegisterInfo.h b/contrib/llvm-project/llvm/lib/Target/VE/VERegisterInfo.h
index 9a32da16bea6..334fb965a986 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/VERegisterInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VERegisterInfo.h
@@ -40,8 +40,6 @@ public:
RegScavenger *RS = nullptr) const override;
Register getFrameRegister(const MachineFunction &MF) const override;
-
- bool canRealignStack(const MachineFunction &MF) const override;
};
} // namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VERegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/VE/VERegisterInfo.td
index 29708d35c730..70ff104b65b7 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/VERegisterInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VERegisterInfo.td
@@ -26,13 +26,33 @@ class VEMiscReg<bits<6> enc, string n>: Register<n> {
let Namespace = "VE";
}
+class VEVecReg<bits<8> enc, string n, list<Register> subregs = [],
+ list<string> altNames = [], list<Register> aliases = []>
+ : Register<n, altNames> {
+ let HWEncoding{15-8} = 0;
+ let HWEncoding{7-0} = enc;
+ let Namespace = "VE";
+ let SubRegs = subregs;
+ let Aliases = aliases;
+}
+
+class VEMaskReg<bits<4> enc, string n, list<Register> subregs = [],
+ list<string> altNames = [], list<Register> aliases = []>
+ : Register<n, altNames> {
+ let HWEncoding{15-4} = 0;
+ let HWEncoding{3-0} = enc;
+ let Namespace = "VE";
+ let SubRegs = subregs;
+ let Aliases = aliases;
+}
+
let Namespace = "VE" in {
- def sub_i8 : SubRegIndex<8, 56>; // Low 8 bit (56..63)
- def sub_i16 : SubRegIndex<16, 48>; // Low 16 bit (48..63)
def sub_i32 : SubRegIndex<32, 32>; // Low 32 bit (32..63)
def sub_f32 : SubRegIndex<32>; // High 32 bit (0..31)
def sub_even : SubRegIndex<64>; // High 64 bit (0..63)
def sub_odd : SubRegIndex<64, 64>; // Low 64 bit (64..127)
+ def sub_vm_even : SubRegIndex<256>; // High 256 bit (0..255)
+ def sub_vm_odd : SubRegIndex<256, 256>; // Low 256 bit (256..511)
def AsmName : RegAltNameIndex;
}
@@ -66,26 +86,23 @@ def MISC : RegisterClass<"VE", [i64], 64,
def IC : VEMiscReg<62, "ic">;
//-----------------------------------------------------------------------------
-// Gneric Registers
+// Vector Length Register
//-----------------------------------------------------------------------------
-let RegAltNameIndices = [AsmName] in {
+def VL : VEMiscReg<63, "vl">;
-// Generic integer registers - 8 bits wide
-foreach I = 0-63 in
- def SB#I : VEReg<I, "sb"#I, [], ["s"#I]>, DwarfRegNum<[I]>;
+// Register classes.
+def VLS : RegisterClass<"VE", [i32], 64, (add VL)>;
-// Generic integer registers - 16 bits wide
-let SubRegIndices = [sub_i8] in
-foreach I = 0-63 in
- def SH#I : VEReg<I, "sh"#I, [!cast<VEReg>("SB"#I)], ["s"#I]>,
- DwarfRegNum<[I]>;
+//-----------------------------------------------------------------------------
+// Generic Registers
+//-----------------------------------------------------------------------------
+
+let RegAltNameIndices = [AsmName] in {
// Generic integer registers - 32 bits wide
-let SubRegIndices = [sub_i16] in
foreach I = 0-63 in
- def SW#I : VEReg<I, "sw"#I, [!cast<VEReg>("SH"#I)], ["s"#I]>,
- DwarfRegNum<[I]>;
+ def SW#I : VEReg<I, "sw"#I, [], ["s"#I]>, DwarfRegNum<[I]>;
// Generic floating point registers - 32 bits wide
// NOTE: Mark SF#I as alias of SW#I temporary to avoid register allocation
@@ -95,10 +112,21 @@ foreach I = 0-63 in
DwarfRegNum<[I]>;
// Generic integer registers - 64 bits wide
-let SubRegIndices = [sub_i32, sub_f32], CoveredBySubRegs = 1 in
-foreach I = 0-63 in
- def SX#I : VEReg<I, "s"#I, [!cast<VEReg>("SW"#I), !cast<VEReg>("SF"#I)],
- ["s"#I]>, DwarfRegNum<[I]>;
+let SubRegIndices = [sub_i32, sub_f32], CoveredBySubRegs = 1 in {
+ // Several registers have specific names, so add them to one of aliases.
+ def SX8 : VEReg<8, "s8", [SW8, SF8], ["s8", "sl"]>, DwarfRegNum<[8]>;
+ def SX9 : VEReg<9, "s9", [SW9, SF9], ["s9", "fp"]>, DwarfRegNum<[9]>;
+ def SX10 : VEReg<10, "s10", [SW10, SF10], ["s10", "lr"]>, DwarfRegNum<[10]>;
+ def SX11 : VEReg<11, "s11", [SW11, SF11], ["s11", "sp"]>, DwarfRegNum<[11]>;
+ def SX14 : VEReg<14, "s14", [SW14, SF14], ["s14", "tp"]>, DwarfRegNum<[14]>;
+ def SX15 : VEReg<15, "s15", [SW15, SF15], ["s15", "got"]>, DwarfRegNum<[15]>;
+ def SX16 : VEReg<16, "s16", [SW16, SF16], ["s16", "plt"]>, DwarfRegNum<[16]>;
+
+ // Other generic registers.
+ foreach I = { 0-7, 12-13, 17-63 } in
+ def SX#I : VEReg<I, "s"#I, [!cast<VEReg>("SW"#I), !cast<VEReg>("SF"#I)],
+ ["s"#I]>, DwarfRegNum<[I]>;
+}
// Aliases of the S* registers used to hold 128-bit for values (long doubles).
// Following foreach represents something like:
@@ -112,20 +140,31 @@ foreach I = 0-31 in
!cast<VEReg>("SX"#!add(!shl(I,1),1))],
["s"#!shl(I,1)]>;
+// Vector registers - 256 elements of 64 bits each
+foreach I = 0-63 in
+ def V#I : VEVecReg<I, "v"#I, [], ["v"#I]>, DwarfRegNum<[!add(64,I)]>;
+
+// Vector Index Register
+def VIX : VEVecReg<255, "vix", [], ["vix"]>;
+
+// Vector mask registers - 256 bits wide
+foreach I = 0-15 in
+ def VM#I : VEMaskReg<I, "vm"#I, [], ["vm"#I]>, DwarfRegNum<[!add(128,I)]>;
+
+// Aliases pairing two VM registers for use by packed instructions
+let SubRegIndices = [sub_vm_even, sub_vm_odd], CoveredBySubRegs = 1 in
+foreach I = 0-7 in
+ def VMP#I : VEMaskReg<!shl(I,1), "vmp"#I,
+ [!cast<VEMaskReg>("VM"#!shl(I,1)),
+ !cast<VEMaskReg>("VM"#!add(!shl(I,1),1))],
+ ["vm"#!shl(I,1)]>;
+
} // RegAltNameIndices = [AsmName]
// Register classes.
//
// The register order is defined in terms of the preferred
// allocation order.
-def I8 : RegisterClass<"VE", [i8], 8,
- (add (sequence "SB%u", 0, 7),
- (sequence "SB%u", 34, 63),
- (sequence "SB%u", 8, 33))>;
-def I16 : RegisterClass<"VE", [i16], 16,
- (add (sequence "SH%u", 0, 7),
- (sequence "SH%u", 34, 63),
- (sequence "SH%u", 8, 33))>;
def I32 : RegisterClass<"VE", [i32], 32,
(add (sequence "SW%u", 0, 7),
(sequence "SW%u", 34, 63),
@@ -142,3 +181,14 @@ def F128 : RegisterClass<"VE", [f128], 128,
(add (sequence "Q%u", 0, 3),
(sequence "Q%u", 17, 31),
(sequence "Q%u", 4, 16))>;
+
+def V64 : RegisterClass<"VE",
+ [v256f64, // default type for vector registers
+ v512i32, v512f32,
+ v256i64, v256i32, v256f32, /* v256f64, */], 64,
+ (add (sequence "V%u", 0, 63),
+ VIX)>;
+
+// vm0 is reserved for always true
+def VM : RegisterClass<"VE", [v256i1], 64, (sequence "VM%u", 0, 15)>;
+def VM512 : RegisterClass<"VE", [v512i1], 64, (sequence "VMP%u", 0, 7)>;
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VESubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/VE/VESubtarget.cpp
index a0b78d95e3cf..daa6cfb8aa84 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/VESubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VESubtarget.cpp
@@ -27,73 +27,35 @@ void VESubtarget::anchor() {}
VESubtarget &VESubtarget::initializeSubtargetDependencies(StringRef CPU,
StringRef FS) {
+ // Default feature settings
+ EnableVPU = false;
+
// Determine default and user specified characteristics
std::string CPUName = std::string(CPU);
if (CPUName.empty())
- CPUName = "ve";
+ CPUName = "generic";
// Parse features string.
- ParseSubtargetFeatures(CPUName, FS);
+ ParseSubtargetFeatures(CPUName, /*TuneCPU=*/CPU, FS);
return *this;
}
VESubtarget::VESubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const TargetMachine &TM)
- : VEGenSubtargetInfo(TT, CPU, FS), TargetTriple(TT),
+ : VEGenSubtargetInfo(TT, CPU, /*TuneCPU=*/CPU, FS), TargetTriple(TT),
InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
FrameLowering(*this) {}
-int VESubtarget::getAdjustedFrameSize(int frameSize) const {
-
- // VE stack frame:
- //
- // +----------------------------------------+
- // | Locals and temporaries |
- // +----------------------------------------+
- // | Parameter area for callee |
- // 176(fp) | |
- // +----------------------------------------+
- // | Register save area (RSA) for callee |
- // | |
- // 16(fp) | 20 * 8 bytes |
- // +----------------------------------------+
- // 8(fp) | Return address |
- // +----------------------------------------+
- // 0(fp) | Frame pointer of caller |
- // --------+----------------------------------------+--------
- // | Locals and temporaries for callee |
- // +----------------------------------------+
- // | Parameter area for callee of callee |
- // +----------------------------------------+
- // 16(sp) | RSA for callee of callee |
- // +----------------------------------------+
- // 8(sp) | Return address |
- // +----------------------------------------+
- // 0(sp) | Frame pointer of callee |
- // +----------------------------------------+
-
- // RSA frame:
- // +----------------------------------------------+
- // 168(fp) | %s33 |
- // +----------------------------------------------+
- // | %s19...%s32 |
- // +----------------------------------------------+
- // 48(fp) | %s18 |
- // +----------------------------------------------+
- // 40(fp) | Linkage area register (%s17) |
- // +----------------------------------------------+
- // 32(fp) | Procedure linkage table register (%plt=%s16) |
- // +----------------------------------------------+
- // 24(fp) | Global offset table register (%got=%s15) |
- // +----------------------------------------------+
- // 16(fp) | Thread pointer register (%tp=%s14) |
- // +----------------------------------------------+
+uint64_t VESubtarget::getAdjustedFrameSize(uint64_t FrameSize) const {
+  // Calculate the adjusted frame size by adding the sizes of the RSA frame,
+  // return address, and frame pointer as described in VEFrameLowering.cpp.
+ const VEFrameLowering *TFL = getFrameLowering();
- frameSize += 176; // for RSA, RA, and FP
- frameSize = alignTo(frameSize, 16); // requires 16 bytes alignment
+ FrameSize += getRsaSize();
+ FrameSize = alignTo(FrameSize, TFL->getStackAlign());
- return frameSize;
+ return FrameSize;
}
bool VESubtarget::enableMachineScheduler() const { return true; }
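For reference, a minimal standalone sketch (not VE backend code) of the adjustment getAdjustedFrameSize now performs, assuming the 176-byte RSA/return-address/frame-pointer area and 16-byte stack alignment described in the removed comment block:

#include <cstdint>
#include "llvm/Support/MathExtras.h"

// Hypothetical free function mirroring the computation above.
static uint64_t adjustedFrameSize(uint64_t FrameSize) {
  FrameSize += 176;                    // RSA (20 * 8 bytes) + return address + frame pointer
  return llvm::alignTo(FrameSize, 16); // VE requires 16-byte stack alignment
}
// Example: adjustedFrameSize(40) == 224.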
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VESubtarget.h b/contrib/llvm-project/llvm/lib/Target/VE/VESubtarget.h
index f3a2c206162e..213aca2ea3f9 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/VESubtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VESubtarget.h
@@ -32,6 +32,13 @@ class VESubtarget : public VEGenSubtargetInfo {
Triple TargetTriple;
virtual void anchor();
+ /// Features {
+
+ // Emit VPU instructions
+ bool EnableVPU;
+
+ /// } Features
+
VEInstrInfo InstrInfo;
VETargetLowering TLInfo;
SelectionDAGTargetInfo TSInfo;
@@ -55,15 +62,21 @@ public:
bool enableMachineScheduler() const override;
+ bool enableVPU() const { return EnableVPU; }
+
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
VESubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
  /// Given an actual stack size as determined by FrameInfo, this function
- /// returns adjusted framesize which includes space for register window
- /// spills and arguments.
- int getAdjustedFrameSize(int stackSize) const;
+  /// returns the adjusted frame size, which includes space for the RSA, return
+  /// address, and frame pointer.
+ uint64_t getAdjustedFrameSize(uint64_t FrameSize) const;
+
+ /// Get the size of RSA, return address, and frame pointer as described
+ /// in VEFrameLowering.cpp.
+ unsigned getRsaSize(void) const { return 176; };
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
};
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VETargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/VE/VETargetMachine.cpp
index 08b55eebbc98..414ae09431c0 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/VETargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VETargetMachine.cpp
@@ -23,7 +23,7 @@ using namespace llvm;
#define DEBUG_TYPE "ve"
-extern "C" void LLVMInitializeVETarget() {
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETarget() {
// Register the target.
RegisterTargetMachine<VETargetMachine> X(getTheVETarget());
}
@@ -44,13 +44,24 @@ static std::string computeDataLayout(const Triple &T) {
// Stack alignment is 128 bits
Ret += "-S128";
+ // Vector alignments are 64 bits
+ // Need to define all of them. Otherwise, each alignment becomes
+  // the size of each data type by default.
+ Ret += "-v64:64:64"; // for v2f32
+ Ret += "-v128:64:64";
+ Ret += "-v256:64:64";
+ Ret += "-v512:64:64";
+ Ret += "-v1024:64:64";
+ Ret += "-v2048:64:64";
+ Ret += "-v4096:64:64";
+ Ret += "-v8192:64:64";
+ Ret += "-v16384:64:64"; // for v256f64
+
return Ret;
}
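A small illustration (not part of this patch) of what a component such as "-v256:64:64" means: once the layout string specifies it, a 256-bit vector type gets 64-bit (8-byte) ABI alignment instead of the 32-byte alignment it would default to. The layout string below is a trimmed, hypothetical example rather than the full VE string.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  DataLayout DL("e-v256:64:64"); // hypothetical, trimmed layout string
  auto *V8F32 = FixedVectorType::get(Type::getFloatTy(Ctx), 8); // a 256-bit vector
  outs() << DL.getABITypeAlign(V8F32).value() << "\n"; // prints 8 (bytes), not 32
  return 0;
}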
static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
- if (!RM.hasValue())
- return Reloc::Static;
- return *RM;
+ return RM.getValueOr(Reloc::Static);
}
class VEELFTargetObjectFile : public TargetLoweringObjectFileELF {
@@ -96,7 +107,9 @@ public:
return getTM<VETargetMachine>();
}
+ void addIRPasses() override;
bool addInstSelector() override;
+ void addPreEmitPass() override;
};
} // namespace
@@ -104,7 +117,18 @@ TargetPassConfig *VETargetMachine::createPassConfig(PassManagerBase &PM) {
return new VEPassConfig(*this, PM);
}
+void VEPassConfig::addIRPasses() {
+ // VE requires atomic expand pass.
+ addPass(createAtomicExpandPass());
+ TargetPassConfig::addIRPasses();
+}
+
bool VEPassConfig::addInstSelector() {
addPass(createVEISelDag(getVETargetMachine()));
return false;
}
+
+void VEPassConfig::addPreEmitPass() {
+ // LVLGen should be called after scheduling and register allocation
+ addPass(createLVLGenPass());
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VETargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/VE/VETargetTransformInfo.h
index c267c4d9a578..68af66597485 100644
--- a/contrib/llvm-project/llvm/lib/Target/VE/VETargetTransformInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VETargetTransformInfo.h
@@ -33,16 +33,35 @@ class VETTIImpl : public BasicTTIImplBase<VETTIImpl> {
const VESubtarget *getST() const { return ST; }
const VETargetLowering *getTLI() const { return TLI; }
+ bool enableVPU() const { return getST()->enableVPU(); }
+
public:
explicit VETTIImpl(const VETargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
- unsigned getNumberOfRegisters(unsigned ClassID) const { return 64; }
+ unsigned getNumberOfRegisters(unsigned ClassID) const {
+ bool VectorRegs = (ClassID == 1);
+ if (VectorRegs) {
+ // TODO report vregs once vector isel is stable.
+ return 0;
+ }
+
+ return 64;
+ }
- unsigned getRegisterBitWidth(bool Vector) const { return 64; }
+ unsigned getRegisterBitWidth(bool Vector) const {
+ if (Vector) {
+ // TODO report vregs once vector isel is stable.
+ return 0;
+ }
+ return 64;
+ }
- unsigned getMinVectorRegisterBitWidth() const { return 64; }
+ unsigned getMinVectorRegisterBitWidth() const {
+ // TODO report vregs once vector isel is stable.
+ return 0;
+ }
};
} // namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VVPInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/VE/VVPInstrInfo.td
new file mode 100644
index 000000000000..2c88d5099a7b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VVPInstrInfo.td
@@ -0,0 +1,46 @@
+//===-------------- VVPInstrInfo.td - VVP_* SDNode patterns ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the VE Vector Predicated SDNodes (VVP SDNodes). VVP
+// SDNodes are an intermediate isel layer between the vector SDNodes emitted by
+// LLVM and the actual VE vector instructions. For example:
+//
+// ADD(x,y) --> VVP_ADD(x,y,mask,evl) --> VADDSWSXrvml(x,y,mask,evl)
+// ^ ^ ^
+// The standard The VVP layer SDNode. The VE vector instruction.
+// SDNode.
+//
+// TODO explain how VVP nodes relate to VP SDNodes once VP ISel is upstream.
+//===----------------------------------------------------------------------===//
+
+// Binary Operators {
+
+// BinaryOp(x,y,mask,vl)
+def SDTIntBinOpVVP : SDTypeProfile<1, 4, [ // vp_add, vp_and, etc.
+ SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisInt<0>,
+ SDTCisSameNumEltsAs<0, 3>,
+ IsVLVT<4>
+]>;
+
+// Binary operator commutative pattern.
+class vvp_commutative<SDNode RootOp> :
+ PatFrags<
+ (ops node:$lhs, node:$rhs, node:$mask, node:$vlen),
+ [(RootOp node:$lhs, node:$rhs, node:$mask, node:$vlen),
+ (RootOp node:$rhs, node:$lhs, node:$mask, node:$vlen)]>;
+
+// VVP node definitions.
+def vvp_add : SDNode<"VEISD::VVP_ADD", SDTIntBinOpVVP>;
+def c_vvp_add : vvp_commutative<vvp_add>;
+
+def vvp_and : SDNode<"VEISD::VVP_AND", SDTIntBinOpVVP>;
+def c_vvp_and : vvp_commutative<vvp_and>;
+
+// } Binary Operators
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VVPInstrPatternsVec.td b/contrib/llvm-project/llvm/lib/Target/VE/VVPInstrPatternsVec.td
new file mode 100644
index 000000000000..7003fb387670
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VVPInstrPatternsVec.td
@@ -0,0 +1,71 @@
+//===----------- VVPInstrPatternsVec.td - VVP_* SDNode patterns -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes how VVP_* SDNodes are lowered to machine instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// VVP SDNode definitions.
+//
+//===----------------------------------------------------------------------===//
+include "VVPInstrInfo.td"
+
+multiclass VectorBinaryArith<
+ SDPatternOperator OpNode,
+ ValueType ScalarVT, ValueType DataVT, ValueType MaskVT,
+ string OpBaseName,
+ SDPatternOperator ImmOp, SDNodeXForm ImmCast> {
+ // No mask.
+ def : Pat<(OpNode
+ (any_broadcast ScalarVT:$sx),
+ DataVT:$vy, (MaskVT true_mask), i32:$avl),
+ (!cast<Instruction>(OpBaseName#"rvl")
+ ScalarVT:$sx, $vy, $avl)>;
+ def : Pat<(OpNode DataVT:$vx, DataVT:$vy, (MaskVT true_mask), i32:$avl),
+ (!cast<Instruction>(OpBaseName#"vvl")
+ $vx, $vy, $avl)>;
+
+ // Mask.
+ def : Pat<(OpNode
+ (any_broadcast ScalarVT:$sx),
+ DataVT:$vy, MaskVT:$mask, i32:$avl),
+ (!cast<Instruction>(OpBaseName#"rvml")
+ ScalarVT:$sx, $vy, $mask, $avl)>;
+ def : Pat<(OpNode DataVT:$vx, DataVT:$vy, MaskVT:$mask, i32:$avl),
+ (!cast<Instruction>(OpBaseName#"vvml")
+ $vx, $vy, $mask, $avl)>;
+
+ // TODO We do not specify patterns for the immediate variants here. There
+ // will be an immediate folding pass that takes care of switching to the
+ // immediate variant where applicable.
+
+ // TODO Fold vvp_select into passthru.
+}
+
+// Expand both the 64-bit and 32-bit variants (256 elements).
+multiclass VectorBinaryArith_ShortLong<
+ SDPatternOperator OpNode,
+ ValueType LongScalarVT, ValueType LongDataVT, string LongOpBaseName,
+ ValueType ShortScalarVT, ValueType ShortDataVT, string ShortOpBaseName> {
+ defm : VectorBinaryArith<OpNode,
+ LongScalarVT, LongDataVT, v256i1,
+ LongOpBaseName, simm7, LO7>;
+ defm : VectorBinaryArith<OpNode,
+ ShortScalarVT, ShortDataVT, v256i1,
+ ShortOpBaseName, simm7, LO7>;
+}
+
+
+defm : VectorBinaryArith_ShortLong<c_vvp_add,
+ i64, v256i64, "VADDSL",
+ i32, v256i32, "VADDSWSX">;
+defm : VectorBinaryArith_ShortLong<c_vvp_and,
+ i64, v256i64, "VAND",
+ i32, v256i32, "PVANDLO">;
diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VVPNodes.def b/contrib/llvm-project/llvm/lib/Target/VE/VVPNodes.def
new file mode 100644
index 000000000000..a68402e9ea10
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/VE/VVPNodes.def
@@ -0,0 +1,41 @@
+//===-- VVPNodes.def - Lists & properties of VE Vector Predication Nodes --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines all VVP_* SDNodes and their properties
+//
+//===----------------------------------------------------------------------===//
+
+/// HANDLE_VP_TO_VVP(VPOPC, VVPOPC)
+/// \p VPOPC is the VP_* SDNode opcode.
+/// \p VVPOPC is the VVP_* SDNode opcode.
+#ifndef HANDLE_VP_TO_VVP
+#define HANDLE_VP_TO_VVP(VPOPC, VVPOPC)
+#endif
+
+/// ADD_VVP_OP(VVPNAME,SDNAME)
+/// \p VVPNAME is a VVP SDNode operator.
+/// \p SDNAME is the generic SD opcode corresponding to \p VVPNAME.
+#ifndef ADD_VVP_OP
+#define ADD_VVP_OP(X, Y)
+#endif
+
+/// ADD_BINARY_VVP_OP(VVPNAME,SDNAME)
+/// \p VVPNAME is a VVP binary operator.
+/// \p SDNAME is the generic SD opcode corresponding to \p VVPNAME.
+#ifndef ADD_BINARY_VVP_OP
+#define ADD_BINARY_VVP_OP(X,Y) ADD_VVP_OP(X,Y) HANDLE_VP_TO_VVP(VP_##Y, X)
+#endif
+
+// Integer arithmetic.
+ADD_BINARY_VVP_OP(VVP_ADD,ADD)
+
+ADD_BINARY_VVP_OP(VVP_AND,AND)
+
+#undef HANDLE_VP_TO_VVP
+#undef ADD_BINARY_VVP_OP
+#undef ADD_VVP_OP
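A minimal sketch (not the actual VE backend code) of how a .def file in this X-macro style is normally consumed; the enum name below is hypothetical:

// Generates one enumerator per ADD_VVP_OP entry, i.e. VVP_ADD and VVP_AND.
enum HypotheticalVVPOpcode {
#define ADD_VVP_OP(VVPNAME, SDNAME) VVPNAME,
#include "VVPNodes.def"
  VVP_OPCODE_COUNT
};

Note that the .def file #undefs its macros itself, so no cleanup is needed after the include.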
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
index e29d85d7588d..60ac3248b9e7 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -35,10 +35,12 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-asm-parser"
+static const char *getSubtargetFeatureName(uint64_t Val);
+
namespace {
/// WebAssemblyOperand - Instances of this class represent the operands in a
-/// parsed WASM machine instruction.
+/// parsed Wasm machine instruction.
struct WebAssemblyOperand : public MCParsedAsmOperand {
enum KindTy { Token, Integer, Float, Symbol, BrList } Kind;
@@ -158,6 +160,24 @@ struct WebAssemblyOperand : public MCParsedAsmOperand {
}
};
+static MCSymbolWasm *GetOrCreateFunctionTableSymbol(MCContext &Ctx,
+ const StringRef &Name) {
+ // FIXME: Duplicates functionality from
+ // MC/WasmObjectWriter::recordRelocation, as well as WebAssemblyCodegen's
+ // WebAssembly:getOrCreateFunctionTableSymbol.
+ MCSymbolWasm *Sym = cast_or_null<MCSymbolWasm>(Ctx.lookupSymbol(Name));
+ if (Sym) {
+ if (!Sym->isFunctionTable())
+ Ctx.reportError(SMLoc(), "symbol is not a wasm funcref table");
+ } else {
+ Sym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(Name));
+ Sym->setFunctionTable();
+ // The default function table is synthesized by the linker.
+ Sym->setUndefined();
+ }
+ return Sym;
+}
+
class WebAssemblyAsmParser final : public MCTargetAsmParser {
MCAsmParser &Parser;
MCAsmLexer &Lexer;
@@ -320,8 +340,8 @@ public:
Type == "i32x4" || Type == "i64x2" || Type == "f32x4" ||
Type == "f64x2")
return wasm::ValType::V128;
- if (Type == "exnref")
- return wasm::ValType::EXNREF;
+ if (Type == "funcref")
+ return wasm::ValType::FUNCREF;
if (Type == "externref")
return wasm::ValType::EXTERNREF;
return Optional<wasm::ValType>();
@@ -335,7 +355,8 @@ public:
.Case("f32", WebAssembly::BlockType::F32)
.Case("f64", WebAssembly::BlockType::F64)
.Case("v128", WebAssembly::BlockType::V128)
- .Case("exnref", WebAssembly::BlockType::Exnref)
+ .Case("funcref", WebAssembly::BlockType::Funcref)
+ .Case("externref", WebAssembly::BlockType::Externref)
.Case("void", WebAssembly::BlockType::Void)
.Default(WebAssembly::BlockType::Invalid);
}
@@ -403,7 +424,8 @@ public:
bool checkForP2AlignIfLoadStore(OperandVector &Operands, StringRef InstName) {
// FIXME: there is probably a cleaner way to do this.
auto IsLoadStore = InstName.find(".load") != StringRef::npos ||
- InstName.find(".store") != StringRef::npos;
+ InstName.find(".store") != StringRef::npos ||
+ InstName.find("prefetch") != StringRef::npos;
auto IsAtomic = InstName.find("atomic.") != StringRef::npos;
if (IsLoadStore || IsAtomic) {
// Parse load/store operands of the form: offset:p2align=align
@@ -417,6 +439,12 @@ public:
return error("Expected integer constant");
parseSingleInteger(false, Operands);
} else {
+ // v128.{load,store}{8,16,32,64}_lane has both a memarg and a lane
+ // index. We need to avoid parsing an extra alignment operand for the
+ // lane index.
+ auto IsLoadStoreLane = InstName.find("_lane") != StringRef::npos;
+ if (IsLoadStoreLane && Operands.size() == 4)
+ return false;
// Alignment not specified (or atomics, must use default alignment).
// We can't just call WebAssembly::GetDefaultP2Align since we don't have
// an opcode until after the assembly matcher, so set a default to fix
@@ -430,6 +458,13 @@ public:
return false;
}
+ WebAssembly::HeapType parseHeapType(StringRef Id) {
+ return StringSwitch<WebAssembly::HeapType>(Id)
+ .Case("extern", WebAssembly::HeapType::Externref)
+ .Case("func", WebAssembly::HeapType::Funcref)
+ .Default(WebAssembly::HeapType::Invalid);
+ }
+
void addBlockTypeOperand(OperandVector &Operands, SMLoc NameLoc,
WebAssembly::BlockType BT) {
Operands.push_back(std::make_unique<WebAssemblyOperand>(
@@ -472,6 +507,7 @@ public:
// proper nesting.
bool ExpectBlockType = false;
bool ExpectFuncType = false;
+ bool ExpectHeapType = false;
if (Name == "block") {
push(Block);
ExpectBlockType = true;
@@ -511,6 +547,17 @@ public:
return true;
} else if (Name == "call_indirect" || Name == "return_call_indirect") {
ExpectFuncType = true;
+ // Ensure that the object file has a __indirect_function_table import, as
+ // we call_indirect against it.
+ auto &Ctx = getStreamer().getContext();
+ MCSymbolWasm *Sym =
+ GetOrCreateFunctionTableSymbol(Ctx, "__indirect_function_table");
+ // Until call_indirect emits TABLE_NUMBER relocs against this symbol, mark
+    // it as NO_STRIP so that the indirect function table makes it into the
+    // linked output.
+ Sym->setNoStrip();
+ } else if (Name == "ref.null") {
+ ExpectHeapType = true;
}
if (ExpectFuncType || (ExpectBlockType && Lexer.is(AsmToken::LParen))) {
@@ -552,6 +599,15 @@ public:
return error("Unknown block type: ", Id);
addBlockTypeOperand(Operands, NameLoc, BT);
Parser.Lex();
+ } else if (ExpectHeapType) {
+ auto HeapType = parseHeapType(Id.getString());
+ if (HeapType == WebAssembly::HeapType::Invalid) {
+ return error("Expected a heap type: ", Id);
+ }
+ Operands.push_back(std::make_unique<WebAssemblyOperand>(
+ WebAssemblyOperand::Integer, Id.getLoc(), Id.getEndLoc(),
+ WebAssemblyOperand::IntOp{static_cast<int64_t>(HeapType)}));
+ Parser.Lex();
} else {
// Assume this identifier is a label.
const MCExpr *Val;
@@ -687,16 +743,52 @@ public:
auto Type = parseType(TypeName);
if (!Type)
return error("Unknown type in .globaltype directive: ", TypeTok);
+ // Optional mutable modifier. Default to mutable for historical reasons.
+ // Ideally we would have gone with immutable as the default and used `mut`
+ // as the modifier to match the `.wat` format.
+ bool Mutable = true;
+ if (isNext(AsmToken::Comma)) {
+ TypeTok = Lexer.getTok();
+ auto Id = expectIdent();
+ if (Id == "immutable")
+ Mutable = false;
+ else
+ // Should we also allow `mutable` and `mut` here for clarity?
+ return error("Unknown type in .globaltype modifier: ", TypeTok);
+ }
// Now set this symbol with the correct type.
auto WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
WasmSym->setGlobalType(
- wasm::WasmGlobalType{uint8_t(Type.getValue()), true});
+ wasm::WasmGlobalType{uint8_t(Type.getValue()), Mutable});
// And emit the directive again.
TOut.emitGlobalType(WasmSym);
return expect(AsmToken::EndOfStatement, "EOL");
}
+ if (DirectiveID.getString() == ".tabletype") {
+ auto SymName = expectIdent();
+ if (SymName.empty())
+ return true;
+ if (expect(AsmToken::Comma, ","))
+ return true;
+ auto TypeTok = Lexer.getTok();
+ auto TypeName = expectIdent();
+ if (TypeName.empty())
+ return true;
+ auto Type = parseType(TypeName);
+ if (!Type)
+ return error("Unknown type in .tabletype directive: ", TypeTok);
+
+ // Now that we have the name and table type, we can actually create the
+      // symbol.
+ auto WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
+ WasmSym->setType(wasm::WASM_SYMBOL_TYPE_TABLE);
+ WasmSym->setTableType(Type.getValue());
+ TOut.emitTableType(WasmSym);
+ return expect(AsmToken::EndOfStatement, "EOL");
+ }
+
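For reference, the directive accepted here takes a symbol name followed by a reference type, for example ".tabletype __indirect_function_table, funcref"; this line is illustrative rather than taken from the patch's tests.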
if (DirectiveID.getString() == ".functype") {
// This code has to send things to the streamer similar to
// WebAssemblyAsmPrinter::EmitFunctionBodyStart.
@@ -836,8 +928,9 @@ public:
bool MatchingInlineAsm) override {
MCInst Inst;
Inst.setLoc(IDLoc);
- unsigned MatchResult =
- MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
+ FeatureBitset MissingFeatures;
+ unsigned MatchResult = MatchInstructionImpl(
+ Operands, Inst, ErrorInfo, MissingFeatures, MatchingInlineAsm);
switch (MatchResult) {
case Match_Success: {
ensureLocals(Out);
@@ -866,9 +959,16 @@ public:
}
return false;
}
- case Match_MissingFeature:
- return Parser.Error(
- IDLoc, "instruction requires a WASM feature not currently enabled");
+ case Match_MissingFeature: {
+ assert(MissingFeatures.count() > 0 && "Expected missing features");
+ SmallString<128> Message;
+ raw_svector_ostream OS(Message);
+ OS << "instruction requires:";
+ for (unsigned i = 0, e = MissingFeatures.size(); i != e; ++i)
+ if (MissingFeatures.test(i))
+ OS << ' ' << getSubtargetFeatureName(i);
+ return Parser.Error(IDLoc, Message);
+ }
case Match_MnemonicFail:
return Parser.Error(IDLoc, "invalid instruction");
case Match_NearMisses:
@@ -896,12 +996,27 @@ public:
auto SymName = Symbol->getName();
if (SymName.startswith(".L"))
return; // Local Symbol.
+
// Only create a new text section if we're already in one.
+ // TODO: If the user explicitly creates a new function section, we ignore
+ // its name when we create this one. It would be nice to honor their
+ // choice, while still ensuring that we create one if they forget.
+ // (that requires coordination with WasmAsmParser::parseSectionDirective)
auto CWS = cast<MCSectionWasm>(getStreamer().getCurrentSection().first);
if (!CWS || !CWS->getKind().isText())
return;
auto SecName = ".text." + SymName;
- auto WS = getContext().getWasmSection(SecName, SectionKind::getText());
+
+ auto *Group = CWS->getGroup();
+ // If the current section is a COMDAT, also set the flag on the symbol.
+ // TODO: Currently the only place that the symbols' comdat flag matters is
+ // for importing comdat functions. But there's no way to specify that in
+ // assembly currently.
+ if (Group)
+ cast<MCSymbolWasm>(Symbol)->setComdat(true);
+ auto *WS =
+ getContext().getWasmSection(SecName, SectionKind::getText(), Group,
+ MCContext::GenericSectionID, nullptr);
getStreamer().SwitchSection(WS);
// Also generate DWARF for this section if requested.
if (getContext().getGenDwarfForAssembly())
@@ -932,5 +1047,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyAsmParser() {
}
#define GET_REGISTER_MATCHER
+#define GET_SUBTARGET_FEATURE_NAME
#define GET_MATCHER_IMPLEMENTATION
#include "WebAssemblyGenAsmMatcher.inc"
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
index 42fa6d58fffd..1b7cc093f7ad 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
@@ -198,6 +198,7 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
case WebAssembly::OPERAND_LOCAL:
case WebAssembly::OPERAND_GLOBAL:
case WebAssembly::OPERAND_FUNCTION32:
+ case WebAssembly::OPERAND_TABLE:
case WebAssembly::OPERAND_OFFSET32:
case WebAssembly::OPERAND_OFFSET64:
case WebAssembly::OPERAND_P2ALIGN:
@@ -240,6 +241,28 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
}
break;
}
+ // heap_type operands, for e.g. ref.null:
+ case WebAssembly::OPERAND_HEAPTYPE: {
+ int64_t Val;
+ uint64_t PrevSize = Size;
+ if (!nextLEB(Val, Bytes, Size, true))
+ return MCDisassembler::Fail;
+ if (Val < 0 && Size == PrevSize + 1) {
+ // The HeapType encoding is like BlockType, in that encodings that
+ // decode as negative values indicate ValTypes. In practice we expect
+ // either wasm::ValType::EXTERNREF or wasm::ValType::FUNCREF here.
+ //
+ // The positive SLEB values are reserved for future expansion and are
+ // expected to be type indices in the typed function references
+ // proposal, and should disassemble as MCSymbolRefExpr as in BlockType
+ // above.
+ MI.addOperand(MCOperand::createImm(Val & 0x7f));
+ } else {
+ MI.addOperand(
+ MCOperand::createImm(int64_t(WebAssembly::HeapType::Invalid)));
+ }
+ break;
+ }
// FP operands.
case WebAssembly::OPERAND_F32IMM: {
if (!parseImmediate<float>(MI, Size, Bytes))
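A standalone sketch (not the disassembler itself) of the encoding handled by the OPERAND_HEAPTYPE case above, assuming the usual wasm type bytes of 0x70 for funcref and 0x6f for externref: single-byte heap types decode as negative SLEB128 values, so masking with 0x7f recovers the raw type byte.

#include <cassert>
#include <cstdint>

int main() {
  int64_t Funcref = -0x10;            // SLEB128 decoding of the byte 0x70
  int64_t Externref = -0x11;          // SLEB128 decoding of the byte 0x6f
  assert((Funcref & 0x7f) == 0x70);   // wasm::ValType::FUNCREF
  assert((Externref & 0x7f) == 0x6f); // wasm::ValType::EXTERNREF
  return 0;
}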
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
index 8ecd7c53621d..d88311197c1a 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
@@ -59,11 +59,6 @@ public:
return false;
}
- bool mayNeedRelaxation(const MCInst &Inst,
- const MCSubtargetInfo &STI) const override {
- return false;
- }
-
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
};
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
index f60b5fcd14ec..fb8b0c364f30 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
@@ -94,19 +94,18 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, uint64_t Address,
case WebAssembly::LOOP_S:
printAnnotation(OS, "label" + utostr(ControlFlowCounter) + ':');
ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, true));
- break;
+ return;
case WebAssembly::BLOCK:
case WebAssembly::BLOCK_S:
ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false));
- break;
+ return;
case WebAssembly::TRY:
case WebAssembly::TRY_S:
- ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false));
- EHPadStack.push_back(EHPadStackCounter++);
- LastSeenEHInst = TRY;
- break;
+ ControlFlowStack.push_back(std::make_pair(ControlFlowCounter, false));
+ EHPadStack.push_back(ControlFlowCounter++);
+ return;
case WebAssembly::END_LOOP:
case WebAssembly::END_LOOP_S:
@@ -115,7 +114,7 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, uint64_t Address,
} else {
ControlFlowStack.pop_back();
}
- break;
+ return;
case WebAssembly::END_BLOCK:
case WebAssembly::END_BLOCK_S:
@@ -125,7 +124,7 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, uint64_t Address,
printAnnotation(
OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':');
}
- break;
+ return;
case WebAssembly::END_TRY:
case WebAssembly::END_TRY_S:
@@ -134,60 +133,60 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, uint64_t Address,
} else {
printAnnotation(
OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':');
- LastSeenEHInst = END_TRY;
}
- break;
+ return;
case WebAssembly::CATCH:
case WebAssembly::CATCH_S:
+ case WebAssembly::CATCH_ALL:
+ case WebAssembly::CATCH_ALL_S:
if (EHPadStack.empty()) {
printAnnotation(OS, "try-catch mismatch!");
} else {
printAnnotation(OS, "catch" + utostr(EHPadStack.pop_back_val()) + ':');
}
- break;
- }
-
- // Annotate any control flow label references.
+ return;
- // rethrow instruction does not take any depth argument and rethrows to the
- // nearest enclosing catch scope, if any. If there's no enclosing catch
- // scope, it throws up to the caller.
- if (Opc == WebAssembly::RETHROW || Opc == WebAssembly::RETHROW_S) {
+ case WebAssembly::RETHROW:
+ case WebAssembly::RETHROW_S:
+ // 'rethrow' rethrows to the nearest enclosing catch scope, if any. If
+ // there's no enclosing catch scope, it throws up to the caller.
if (EHPadStack.empty()) {
printAnnotation(OS, "to caller");
} else {
printAnnotation(OS, "down to catch" + utostr(EHPadStack.back()));
}
+ return;
+ }
- } else {
- unsigned NumFixedOperands = Desc.NumOperands;
- SmallSet<uint64_t, 8> Printed;
- for (unsigned I = 0, E = MI->getNumOperands(); I < E; ++I) {
- // See if this operand denotes a basic block target.
- if (I < NumFixedOperands) {
- // A non-variable_ops operand, check its type.
- if (Desc.OpInfo[I].OperandType != WebAssembly::OPERAND_BASIC_BLOCK)
- continue;
- } else {
- // A variable_ops operand, which currently can be immediates (used in
- // br_table) which are basic block targets, or for call instructions
- // when using -wasm-keep-registers (in which case they are registers,
- // and should not be processed).
- if (!MI->getOperand(I).isImm())
- continue;
- }
- uint64_t Depth = MI->getOperand(I).getImm();
- if (!Printed.insert(Depth).second)
+ // Annotate any control flow label references.
+
+ unsigned NumFixedOperands = Desc.NumOperands;
+ SmallSet<uint64_t, 8> Printed;
+ for (unsigned I = 0, E = MI->getNumOperands(); I < E; ++I) {
+ // See if this operand denotes a basic block target.
+ if (I < NumFixedOperands) {
+ // A non-variable_ops operand, check its type.
+ if (Desc.OpInfo[I].OperandType != WebAssembly::OPERAND_BASIC_BLOCK)
continue;
- if (Depth >= ControlFlowStack.size()) {
- printAnnotation(OS, "Invalid depth argument!");
- } else {
- const auto &Pair = ControlFlowStack.rbegin()[Depth];
- printAnnotation(OS, utostr(Depth) + ": " +
- (Pair.second ? "up" : "down") + " to label" +
- utostr(Pair.first));
- }
+ } else {
+ // A variable_ops operand, which currently can be immediates (used in
+ // br_table) which are basic block targets, or for call instructions
+ // when using -wasm-keep-registers (in which case they are registers,
+ // and should not be processed).
+ if (!MI->getOperand(I).isImm())
+ continue;
+ }
+ uint64_t Depth = MI->getOperand(I).getImm();
+ if (!Printed.insert(Depth).second)
+ continue;
+ if (Depth >= ControlFlowStack.size()) {
+ printAnnotation(OS, "Invalid depth argument!");
+ } else {
+ const auto &Pair = ControlFlowStack.rbegin()[Depth];
+ printAnnotation(OS, utostr(Depth) + ": " +
+ (Pair.second ? "up" : "down") + " to label" +
+ utostr(Pair.first));
}
}
}
@@ -302,6 +301,29 @@ void WebAssemblyInstPrinter::printWebAssemblySignatureOperand(const MCInst *MI,
}
}
+void WebAssemblyInstPrinter::printWebAssemblyHeapTypeOperand(const MCInst *MI,
+ unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isImm()) {
+ switch (Op.getImm()) {
+ case long(wasm::ValType::EXTERNREF):
+ O << "extern";
+ break;
+ case long(wasm::ValType::FUNCREF):
+ O << "func";
+ break;
+ default:
+ O << "unsupported_heap_type_value";
+ break;
+ }
+ } else {
+ // Typed function references and other subtypes of funcref and externref
+ // currently unimplemented.
+ O << "unsupported_heap_type_operand";
+ }
+}
+
// We have various enums representing a subset of these types, use this
// function to convert any of them to text.
const char *WebAssembly::anyTypeToString(unsigned Ty) {
@@ -318,10 +340,10 @@ const char *WebAssembly::anyTypeToString(unsigned Ty) {
return "v128";
case wasm::WASM_TYPE_FUNCREF:
return "funcref";
+ case wasm::WASM_TYPE_EXTERNREF:
+ return "externref";
case wasm::WASM_TYPE_FUNC:
return "func";
- case wasm::WASM_TYPE_EXNREF:
- return "exnref";
case wasm::WASM_TYPE_NORESULT:
return "void";
default:
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h
index 1387a1928b3f..2ed6d562acff 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h
@@ -25,13 +25,9 @@ class MCSubtargetInfo;
class WebAssemblyInstPrinter final : public MCInstPrinter {
uint64_t ControlFlowCounter = 0;
- uint64_t EHPadStackCounter = 0;
SmallVector<std::pair<uint64_t, bool>, 4> ControlFlowStack;
SmallVector<uint64_t, 4> EHPadStack;
- enum EHInstKind { TRY, CATCH, END_TRY };
- EHInstKind LastSeenEHInst = END_TRY;
-
public:
WebAssemblyInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
const MCRegisterInfo &MRI);
@@ -48,8 +44,11 @@ public:
raw_ostream &O);
void printWebAssemblySignatureOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O);
+ void printWebAssemblyHeapTypeOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O);
// Autogenerated by tblgen.
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
};
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
index dfed3451e45b..55bf5d14fdac 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
@@ -62,12 +62,16 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
uint64_t Start = OS.tell();
uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
- if (Binary <= UINT8_MAX) {
+ if (Binary < (1 << 8)) {
OS << uint8_t(Binary);
- } else {
- assert(Binary <= UINT16_MAX && "Several-byte opcodes not supported yet");
+ } else if (Binary < (1 << 16)) {
OS << uint8_t(Binary >> 8);
encodeULEB128(uint8_t(Binary), OS);
+ } else if (Binary < (1 << 24)) {
+ OS << uint8_t(Binary >> 16);
+ encodeULEB128(uint16_t(Binary), OS);
+ } else {
+ llvm_unreachable("Very large (prefix + 3 byte) opcodes not supported");
}
// For br_table instructions, encode the size of the table. In the MCInst,
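A standalone sketch (not the emitter itself) of the two-byte branch above: a prefixed opcode, say 0xFD0B with 0xFD as the prefix byte (a value chosen purely for illustration), is written as the raw prefix followed by the ULEB128-encoded low byte.

#include <cstdint>
#include "llvm/Support/LEB128.h"
#include "llvm/Support/raw_ostream.h"

// Mirrors the "Binary < (1 << 16)" branch for a hypothetical two-byte opcode.
static void emitPrefixedOpcode(uint64_t Binary, llvm::raw_ostream &OS) {
  OS << uint8_t(Binary >> 8);               // prefix byte, e.g. 0xFD
  llvm::encodeULEB128(uint8_t(Binary), OS); // sub-opcode as ULEB128, e.g. 0x0B
}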
@@ -102,6 +106,7 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
encodeSLEB128(int64_t(MO.getImm()), OS);
break;
case WebAssembly::OPERAND_SIGNATURE:
+ case WebAssembly::OPERAND_HEAPTYPE:
OS << uint8_t(MO.getImm());
break;
case WebAssembly::OPERAND_VEC_I8IMM:
@@ -151,6 +156,7 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
PaddedSize = 10;
break;
case WebAssembly::OPERAND_FUNCTION32:
+ case WebAssembly::OPERAND_TABLE:
case WebAssembly::OPERAND_OFFSET32:
case WebAssembly::OPERAND_SIGNATURE:
case WebAssembly::OPERAND_TYPEINDEX:
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
index 027e5408c633..064e613cfc8e 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -76,7 +76,7 @@ static MCAsmBackend *createAsmBackend(const Target & /*T*/,
static MCSubtargetInfo *createMCSubtargetInfo(const Triple &TT, StringRef CPU,
StringRef FS) {
- return createWebAssemblyMCSubtargetInfoImpl(TT, CPU, FS);
+ return createWebAssemblyMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS);
}
static MCTargetStreamer *
@@ -147,8 +147,10 @@ wasm::ValType WebAssembly::toValType(const MVT &Ty) {
case MVT::v4f32:
case MVT::v2f64:
return wasm::ValType::V128;
- case MVT::exnref:
- return wasm::ValType::EXNREF;
+ case MVT::funcref:
+ return wasm::ValType::FUNCREF;
+ case MVT::externref:
+ return wasm::ValType::EXTERNREF;
default:
llvm_unreachable("unexpected type");
}
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index 02b310628ee1..5b77b8495adf 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -76,6 +76,10 @@ enum OperandType {
OPERAND_EVENT,
/// A list of branch targets for br_list.
OPERAND_BRLIST,
+ /// 32-bit unsigned table number.
+ OPERAND_TABLE,
+ /// heap type immediate for ref.null.
+ OPERAND_HEAPTYPE,
};
} // end namespace WebAssembly
@@ -97,6 +101,11 @@ enum TOF {
MO_MEMORY_BASE_REL,
// On a symbol operand this indicates that the immediate is the symbol
+  // address relative to the __tls_base wasm global.
+ // Only applicable to data symbols.
+ MO_TLS_BASE_REL,
+
+ // On a symbol operand this indicates that the immediate is the symbol
// address relative the __table_base wasm global.
// Only applicable to function symbols.
MO_TABLE_BASE_REL,
@@ -129,7 +138,8 @@ enum class BlockType : unsigned {
F32 = unsigned(wasm::ValType::F32),
F64 = unsigned(wasm::ValType::F64),
V128 = unsigned(wasm::ValType::V128),
- Exnref = unsigned(wasm::ValType::EXNREF),
+ Externref = unsigned(wasm::ValType::EXTERNREF),
+ Funcref = unsigned(wasm::ValType::FUNCREF),
// Multivalue blocks (and other non-void blocks) are only emitted when the
// blocks will never be exited and are at the ends of functions (see
// WebAssemblyCFGStackify::fixEndsAtEndOfFunction). They also are never made
@@ -138,6 +148,13 @@ enum class BlockType : unsigned {
Multivalue = 0xffff,
};
+/// Used as immediate MachineOperands for heap types, e.g. for ref.null.
+enum class HeapType : unsigned {
+ Invalid = 0x00,
+ Externref = unsigned(wasm::ValType::EXTERNREF),
+ Funcref = unsigned(wasm::ValType::FUNCREF),
+};
+
/// Instruction opcodes emitted via means other than CodeGen.
static const unsigned Nop = 0x01;
static const unsigned End = 0x0b;
@@ -176,8 +193,12 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
WASM_LOAD_STORE(ATOMIC_RMW8_U_XCHG_I64)
WASM_LOAD_STORE(ATOMIC_RMW8_U_CMPXCHG_I32)
WASM_LOAD_STORE(ATOMIC_RMW8_U_CMPXCHG_I64)
- WASM_LOAD_STORE(LOAD_SPLAT_v8x16)
- return 0;
+ WASM_LOAD_STORE(LOAD8_SPLAT)
+ WASM_LOAD_STORE(LOAD_LANE_I8x16)
+ WASM_LOAD_STORE(STORE_LANE_I8x16)
+ WASM_LOAD_STORE(PREFETCH_T)
+ WASM_LOAD_STORE(PREFETCH_NT)
+ return 0;
WASM_LOAD_STORE(LOAD16_S_I32)
WASM_LOAD_STORE(LOAD16_U_I32)
WASM_LOAD_STORE(LOAD16_S_I64)
@@ -202,8 +223,10 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
WASM_LOAD_STORE(ATOMIC_RMW16_U_XCHG_I64)
WASM_LOAD_STORE(ATOMIC_RMW16_U_CMPXCHG_I32)
WASM_LOAD_STORE(ATOMIC_RMW16_U_CMPXCHG_I64)
- WASM_LOAD_STORE(LOAD_SPLAT_v16x8)
- return 1;
+ WASM_LOAD_STORE(LOAD16_SPLAT)
+ WASM_LOAD_STORE(LOAD_LANE_I16x8)
+ WASM_LOAD_STORE(STORE_LANE_I16x8)
+ return 1;
WASM_LOAD_STORE(LOAD_I32)
WASM_LOAD_STORE(LOAD_F32)
WASM_LOAD_STORE(STORE_I32)
@@ -229,10 +252,13 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
WASM_LOAD_STORE(ATOMIC_RMW32_U_XCHG_I64)
WASM_LOAD_STORE(ATOMIC_RMW_CMPXCHG_I32)
WASM_LOAD_STORE(ATOMIC_RMW32_U_CMPXCHG_I64)
- WASM_LOAD_STORE(ATOMIC_NOTIFY)
- WASM_LOAD_STORE(ATOMIC_WAIT_I32)
- WASM_LOAD_STORE(LOAD_SPLAT_v32x4)
- return 2;
+ WASM_LOAD_STORE(MEMORY_ATOMIC_NOTIFY)
+ WASM_LOAD_STORE(MEMORY_ATOMIC_WAIT32)
+ WASM_LOAD_STORE(LOAD32_SPLAT)
+ WASM_LOAD_STORE(LOAD_ZERO_I32x4)
+ WASM_LOAD_STORE(LOAD_LANE_I32x4)
+ WASM_LOAD_STORE(STORE_LANE_I32x4)
+ return 2;
WASM_LOAD_STORE(LOAD_I64)
WASM_LOAD_STORE(LOAD_F64)
WASM_LOAD_STORE(STORE_I64)
@@ -246,15 +272,18 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
WASM_LOAD_STORE(ATOMIC_RMW_XOR_I64)
WASM_LOAD_STORE(ATOMIC_RMW_XCHG_I64)
WASM_LOAD_STORE(ATOMIC_RMW_CMPXCHG_I64)
- WASM_LOAD_STORE(ATOMIC_WAIT_I64)
- WASM_LOAD_STORE(LOAD_SPLAT_v64x2)
- WASM_LOAD_STORE(LOAD_EXTEND_S_v8i16)
- WASM_LOAD_STORE(LOAD_EXTEND_U_v8i16)
- WASM_LOAD_STORE(LOAD_EXTEND_S_v4i32)
- WASM_LOAD_STORE(LOAD_EXTEND_U_v4i32)
- WASM_LOAD_STORE(LOAD_EXTEND_S_v2i64)
- WASM_LOAD_STORE(LOAD_EXTEND_U_v2i64)
- return 3;
+ WASM_LOAD_STORE(MEMORY_ATOMIC_WAIT64)
+ WASM_LOAD_STORE(LOAD64_SPLAT)
+ WASM_LOAD_STORE(LOAD_EXTEND_S_I16x8)
+ WASM_LOAD_STORE(LOAD_EXTEND_U_I16x8)
+ WASM_LOAD_STORE(LOAD_EXTEND_S_I32x4)
+ WASM_LOAD_STORE(LOAD_EXTEND_U_I32x4)
+ WASM_LOAD_STORE(LOAD_EXTEND_S_I64x2)
+ WASM_LOAD_STORE(LOAD_EXTEND_U_I64x2)
+ WASM_LOAD_STORE(LOAD_ZERO_I64x2)
+ WASM_LOAD_STORE(LOAD_LANE_I64x2)
+ WASM_LOAD_STORE(STORE_LANE_I64x2)
+ return 3;
WASM_LOAD_STORE(LOAD_V128)
WASM_LOAD_STORE(STORE_V128)
return 4;
@@ -294,8 +323,10 @@ inline bool isArgument(unsigned Opc) {
case WebAssembly::ARGUMENT_v4f32_S:
case WebAssembly::ARGUMENT_v2f64:
case WebAssembly::ARGUMENT_v2f64_S:
- case WebAssembly::ARGUMENT_exnref:
- case WebAssembly::ARGUMENT_exnref_S:
+ case WebAssembly::ARGUMENT_funcref:
+ case WebAssembly::ARGUMENT_funcref_S:
+ case WebAssembly::ARGUMENT_externref:
+ case WebAssembly::ARGUMENT_externref_S:
return true;
default:
return false;
@@ -314,8 +345,10 @@ inline bool isCopy(unsigned Opc) {
case WebAssembly::COPY_F64_S:
case WebAssembly::COPY_V128:
case WebAssembly::COPY_V128_S:
- case WebAssembly::COPY_EXNREF:
- case WebAssembly::COPY_EXNREF_S:
+ case WebAssembly::COPY_FUNCREF:
+ case WebAssembly::COPY_FUNCREF_S:
+ case WebAssembly::COPY_EXTERNREF:
+ case WebAssembly::COPY_EXTERNREF_S:
return true;
default:
return false;
@@ -334,8 +367,10 @@ inline bool isTee(unsigned Opc) {
case WebAssembly::TEE_F64_S:
case WebAssembly::TEE_V128:
case WebAssembly::TEE_V128_S:
- case WebAssembly::TEE_EXNREF:
- case WebAssembly::TEE_EXNREF_S:
+ case WebAssembly::TEE_FUNCREF:
+ case WebAssembly::TEE_FUNCREF_S:
+ case WebAssembly::TEE_EXTERNREF:
+ case WebAssembly::TEE_EXTERNREF_S:
return true;
default:
return false;
@@ -398,6 +433,18 @@ inline bool isMarker(unsigned Opc) {
}
}
+inline bool isCatch(unsigned Opc) {
+ switch (Opc) {
+ case WebAssembly::CATCH:
+ case WebAssembly::CATCH_S:
+ case WebAssembly::CATCH_ALL:
+ case WebAssembly::CATCH_ALL_S:
+ return true;
+ default:
+ return false;
+ }
+}
+
} // end namespace WebAssembly
} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
index e954eeaebb14..652d7a00a63c 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
@@ -71,8 +71,17 @@ void WebAssemblyTargetAsmStreamer::emitGlobalType(const MCSymbolWasm *Sym) {
assert(Sym->isGlobal());
OS << "\t.globaltype\t" << Sym->getName() << ", "
<< WebAssembly::typeToString(
- static_cast<wasm::ValType>(Sym->getGlobalType().Type))
- << '\n';
+ static_cast<wasm::ValType>(Sym->getGlobalType().Type));
+ if (!Sym->getGlobalType().Mutable)
+ OS << ", immutable";
+ OS << '\n';
+}
+
+void WebAssemblyTargetAsmStreamer::emitTableType(const MCSymbolWasm *Sym) {
+ assert(Sym->isTable());
+ OS << "\t.tabletype\t" << Sym->getName() << ", "
+ << WebAssembly::typeToString(Sym->getTableType());
+ OS << '\n';
}
void WebAssemblyTargetAsmStreamer::emitEventType(const MCSymbolWasm *Sym) {
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
index d6fba05c9986..75c9fb4e289d 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
@@ -39,6 +39,8 @@ public:
virtual void emitIndIdx(const MCExpr *Value) = 0;
/// .globaltype
virtual void emitGlobalType(const MCSymbolWasm *Sym) = 0;
+ /// .tabletype
+ virtual void emitTableType(const MCSymbolWasm *Sym) = 0;
/// .eventtype
virtual void emitEventType(const MCSymbolWasm *Sym) = 0;
/// .import_module
@@ -67,6 +69,7 @@ public:
void emitFunctionType(const MCSymbolWasm *Sym) override;
void emitIndIdx(const MCExpr *Value) override;
void emitGlobalType(const MCSymbolWasm *Sym) override;
+ void emitTableType(const MCSymbolWasm *Sym) override;
void emitEventType(const MCSymbolWasm *Sym) override;
void emitImportModule(const MCSymbolWasm *Sym, StringRef ImportModule) override;
void emitImportName(const MCSymbolWasm *Sym, StringRef ImportName) override;
@@ -83,6 +86,7 @@ public:
void emitFunctionType(const MCSymbolWasm *Sym) override {}
void emitIndIdx(const MCExpr *Value) override;
void emitGlobalType(const MCSymbolWasm *Sym) override {}
+ void emitTableType(const MCSymbolWasm *Sym) override {}
void emitEventType(const MCSymbolWasm *Sym) override {}
void emitImportModule(const MCSymbolWasm *Sym,
StringRef ImportModule) override {}
@@ -103,6 +107,7 @@ public:
void emitFunctionType(const MCSymbolWasm *) override {}
void emitIndIdx(const MCExpr *) override {}
void emitGlobalType(const MCSymbolWasm *) override {}
+ void emitTableType(const MCSymbolWasm *) override {}
void emitEventType(const MCSymbolWasm *) override {}
void emitImportModule(const MCSymbolWasm *, StringRef) override {}
void emitImportName(const MCSymbolWasm *, StringRef) override {}
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
index 779e921c1d94..aa7e2311d240 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
@@ -76,6 +76,8 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target,
case MCSymbolRefExpr::VK_WASM_TBREL:
assert(SymA.isFunction());
return wasm::R_WASM_TABLE_INDEX_REL_SLEB;
+ case MCSymbolRefExpr::VK_WASM_TLSREL:
+ return wasm::R_WASM_MEMORY_ADDR_TLS_SLEB;
case MCSymbolRefExpr::VK_WASM_MBREL:
assert(SymA.isData());
return is64Bit() ? wasm::R_WASM_MEMORY_ADDR_REL_SLEB64
@@ -92,7 +94,8 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target,
return wasm::R_WASM_TABLE_INDEX_SLEB;
return wasm::R_WASM_MEMORY_ADDR_SLEB;
case WebAssembly::fixup_sleb128_i64:
- assert(SymA.isData());
+ if (SymA.isFunction())
+ return wasm::R_WASM_TABLE_INDEX_SLEB64;
return wasm::R_WASM_MEMORY_ADDR_SLEB64;
case WebAssembly::fixup_uleb128_i32:
if (SymA.isGlobal())
@@ -101,6 +104,8 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target,
return wasm::R_WASM_FUNCTION_INDEX_LEB;
if (SymA.isEvent())
return wasm::R_WASM_EVENT_INDEX_LEB;
+ if (SymA.isTable())
+ return wasm::R_WASM_TABLE_NUMBER_LEB;
return wasm::R_WASM_MEMORY_ADDR_LEB;
case WebAssembly::fixup_uleb128_i64:
assert(SymA.isData());
@@ -119,6 +124,17 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target,
}
return wasm::R_WASM_MEMORY_ADDR_I32;
case FK_Data_8:
+ if (SymA.isFunction())
+ return wasm::R_WASM_TABLE_INDEX_I64;
+ if (SymA.isGlobal())
+ llvm_unreachable("unimplemented R_WASM_GLOBAL_INDEX_I64");
+ if (auto Section = static_cast<const MCSectionWasm *>(
+ getFixupSection(Fixup.getValue()))) {
+ if (Section->getKind().isText())
+ return wasm::R_WASM_FUNCTION_OFFSET_I64;
+ else if (!Section->isWasmData())
+ llvm_unreachable("unimplemented R_WASM_SECTION_OFFSET_I64");
+ }
assert(SymA.isData());
return wasm::R_WASM_MEMORY_ADDR_I64;
default:
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 96fa13d30729..7f1c4bb40a4c 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -49,6 +49,8 @@ using namespace llvm;
#define DEBUG_TYPE "asm-printer"
extern cl::opt<bool> WasmKeepRegisters;
+extern cl::opt<bool> EnableEmException;
+extern cl::opt<bool> EnableEmSjLj;
//===----------------------------------------------------------------------===//
// Helpers.
@@ -81,10 +83,92 @@ WebAssemblyTargetStreamer *WebAssemblyAsmPrinter::getTargetStreamer() {
return static_cast<WebAssemblyTargetStreamer *>(TS);
}
+// Emscripten exception handling helpers
+//
+// This converts invoke names generated by LowerEmscriptenEHSjLj to real names
+// that are expected by JavaScript glue code. The invoke names generated by
+// Emscripten JS glue code are based on their argument and return types; for
+// example, for a function that takes an i32 and returns nothing, it is
+// 'invoke_vi'. But the format of invoke generated by LowerEmscriptenEHSjLj pass
+// contains a mangled string generated from their IR types, for example,
+// "__invoke_void_%struct.mystruct*_int", because final wasm types are not
+// available in the IR pass. So we convert those names to the form that
+// Emscripten JS code expects.
+//
+// Refer to LowerEmscriptenEHSjLj pass for more details.
+
+// Returns true if the given function name is an invoke name generated by
+// LowerEmscriptenEHSjLj pass.
+static bool isEmscriptenInvokeName(StringRef Name) {
+ if (Name.front() == '"' && Name.back() == '"')
+ Name = Name.substr(1, Name.size() - 2);
+ return Name.startswith("__invoke_");
+}
+
+// Returns a character that represents the given wasm value type in invoke
+// signatures.
+static char getInvokeSig(wasm::ValType VT) {
+ switch (VT) {
+ case wasm::ValType::I32:
+ return 'i';
+ case wasm::ValType::I64:
+ return 'j';
+ case wasm::ValType::F32:
+ return 'f';
+ case wasm::ValType::F64:
+ return 'd';
+ case wasm::ValType::V128:
+ return 'V';
+ case wasm::ValType::FUNCREF:
+ return 'F';
+ case wasm::ValType::EXTERNREF:
+ return 'X';
+ }
+ llvm_unreachable("Unhandled wasm::ValType enum");
+}
+
+// Given the wasm signature, generate the invoke name in the format JS glue code
+// expects.
+static std::string getEmscriptenInvokeSymbolName(wasm::WasmSignature *Sig) {
+ assert(Sig->Returns.size() <= 1);
+ std::string Ret = "invoke_";
+ if (!Sig->Returns.empty())
+ for (auto VT : Sig->Returns)
+ Ret += getInvokeSig(VT);
+ else
+ Ret += 'v';
+ // Invokes' first argument is a pointer to the original function, so skip it
+ for (unsigned I = 1, E = Sig->Params.size(); I < E; I++)
+ Ret += getInvokeSig(Sig->Params[I]);
+ return Ret;
+}
+
//===----------------------------------------------------------------------===//
// WebAssemblyAsmPrinter Implementation.
//===----------------------------------------------------------------------===//
+MCSymbolWasm *WebAssemblyAsmPrinter::getMCSymbolForFunction(
+ const Function *F, bool EnableEmEH, wasm::WasmSignature *Sig,
+ bool &InvokeDetected) {
+ MCSymbolWasm *WasmSym = nullptr;
+ if (EnableEmEH && isEmscriptenInvokeName(F->getName())) {
+ assert(Sig);
+ InvokeDetected = true;
+ if (Sig->Returns.size() > 1) {
+ std::string Msg =
+ "Emscripten EH/SjLj does not support multivalue returns: " +
+ std::string(F->getName()) + ": " +
+ WebAssembly::signatureToString(Sig);
+ report_fatal_error(Msg);
+ }
+ WasmSym = cast<MCSymbolWasm>(
+ GetExternalSymbolSymbol(getEmscriptenInvokeSymbolName(Sig)));
+ } else {
+ WasmSym = cast<MCSymbolWasm>(getSymbol(F));
+ }
+ return WasmSym;
+}
+
void WebAssemblyAsmPrinter::emitEndOfAsmFile(Module &M) {
for (auto &It : OutContext.getSymbols()) {
// Emit a .globaltype and .eventtype declaration.
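A standalone illustration (not the AsmPrinter code) of the name mapping described in the comment block above, using plain characters in place of wasm value types; a void function whose wasm parameters are a function pointer (an i32 table index), an i32, and an f64 maps to "invoke_vid".

#include <cassert>
#include <string>
#include <vector>

// 'v' when there is no return value, then one letter per parameter after the
// leading function-pointer argument, which is skipped.
static std::string invokeName(char Ret, const std::vector<char> &Params) {
  std::string Name = "invoke_";
  Name += Ret ? Ret : 'v';
  for (size_t I = 1; I < Params.size(); ++I)
    Name += Params[I];
  return Name;
}

int main() {
  assert(invokeName('\0', {'i', 'i', 'd'}) == "invoke_vid");
  return 0;
}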
@@ -95,6 +179,7 @@ void WebAssemblyAsmPrinter::emitEndOfAsmFile(Module &M) {
getTargetStreamer()->emitEventType(Sym);
}
+ DenseSet<MCSymbol *> InvokeSymbols;
for (const auto &F : M) {
if (F.isIntrinsic())
continue;
@@ -104,31 +189,46 @@ void WebAssemblyAsmPrinter::emitEndOfAsmFile(Module &M) {
SmallVector<MVT, 4> Results;
SmallVector<MVT, 4> Params;
computeSignatureVTs(F.getFunctionType(), &F, F, TM, Params, Results);
- auto *Sym = cast<MCSymbolWasm>(getSymbol(&F));
+ // At this point these MCSymbols may or may not have been created already
+ // and thus also contain a signature, but we need to get the signature
+ // anyway here in case it is an invoke that has not yet been created. We
+ // will discard it later if it turns out not to be necessary.
+ auto Signature = signatureFromMVTs(Results, Params);
+ bool InvokeDetected = false;
+ auto *Sym = getMCSymbolForFunction(&F, EnableEmException || EnableEmSjLj,
+ Signature.get(), InvokeDetected);
+
+ // Multiple functions can be mapped to the same invoke symbol. For
+ // example, two IR functions '__invoke_void_i8*' and '__invoke_void_i32'
+    // are both mapped to 'invoke_vi'. We keep them in a set once we emit an
+ // Emscripten EH symbol so we don't emit the same symbol twice.
+ if (InvokeDetected && !InvokeSymbols.insert(Sym).second)
+ continue;
+
Sym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
if (!Sym->getSignature()) {
- auto Signature = signatureFromMVTs(Results, Params);
Sym->setSignature(Signature.get());
addSignature(std::move(Signature));
+ } else {
+ // This symbol has already been created and had a signature. Discard it.
+ Signature.reset();
}
- // FIXME: this was originally intended for post-linking and was only used
- // for imports that were only called indirectly (i.e. s2wasm could not
- // infer the type from a call). With object files it applies to all
- // imports. so fix the names and the tests, or rethink how import
- // delcarations work in asm files.
+
getTargetStreamer()->emitFunctionType(Sym);
- if (TM.getTargetTriple().isOSBinFormatWasm() &&
- F.hasFnAttribute("wasm-import-module")) {
+ if (F.hasFnAttribute("wasm-import-module")) {
StringRef Name =
F.getFnAttribute("wasm-import-module").getValueAsString();
Sym->setImportModule(storeName(Name));
getTargetStreamer()->emitImportModule(Sym, Name);
}
- if (TM.getTargetTriple().isOSBinFormatWasm() &&
- F.hasFnAttribute("wasm-import-name")) {
+ if (F.hasFnAttribute("wasm-import-name")) {
+ // If this is a converted Emscripten EH/SjLj symbol, we shouldn't use
+ // the original function name but the converted symbol name.
StringRef Name =
- F.getFnAttribute("wasm-import-name").getValueAsString();
+ InvokeDetected
+ ? Sym->getName()
+ : F.getFnAttribute("wasm-import-name").getValueAsString();
Sym->setImportName(storeName(Name));
getTargetStreamer()->emitImportName(Sym, Name);
}
@@ -304,7 +404,6 @@ void WebAssemblyAsmPrinter::emitFunctionBodyStart() {
addSignature(std::move(Signature));
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
- // FIXME: clean up how params and results are emitted (use signatures)
getTargetStreamer()->emitFunctionType(WasmSym);
// Emit the function index.
@@ -362,14 +461,6 @@ void WebAssemblyAsmPrinter::emitInstruction(const MachineInstr *MI) {
// This is a compiler barrier that prevents instruction reordering during
// backend compilation, and should not be emitted.
break;
- case WebAssembly::EXTRACT_EXCEPTION_I32:
- case WebAssembly::EXTRACT_EXCEPTION_I32_S:
- // These are pseudo instructions that simulates popping values from stack.
- // We print these only when we have -wasm-keep-registers on for assembly
- // readability.
- if (!WasmKeepRegisters)
- break;
- LLVM_FALLTHROUGH;
default: {
WebAssemblyMCInstLower MCInstLowering(OutContext, *this);
MCInst TmpInst;
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
index d9281568638d..7a6a3247a19f 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
@@ -77,6 +77,9 @@ public:
MVT getRegType(unsigned RegNo) const;
std::string regToString(const MachineOperand &MO);
WebAssemblyTargetStreamer *getTargetStreamer();
+ MCSymbolWasm *getMCSymbolForFunction(const Function *F, bool EnableEmEH,
+ wasm::WasmSignature *Sig,
+ bool &InvokeDetected);
};
} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
index 8442b49e25f4..eb3e9b91d40d 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
@@ -19,6 +19,7 @@
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "WebAssembly.h"
#include "WebAssemblyExceptionInfo.h"
+#include "WebAssemblySortRegion.h"
#include "WebAssemblySubtarget.h"
#include "WebAssemblyUtilities.h"
#include "llvm/ADT/PriorityQueue.h"
@@ -31,6 +32,8 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
+using WebAssembly::SortRegion;
+using WebAssembly::SortRegionInfo;
#define DEBUG_TYPE "wasm-cfg-sort"
@@ -44,78 +47,6 @@ static cl::opt<bool> WasmDisableEHPadSort(
namespace {
-// Wrapper for loops and exceptions
-class Region {
-public:
- virtual ~Region() = default;
- virtual MachineBasicBlock *getHeader() const = 0;
- virtual bool contains(const MachineBasicBlock *MBB) const = 0;
- virtual unsigned getNumBlocks() const = 0;
- using block_iterator = typename ArrayRef<MachineBasicBlock *>::const_iterator;
- virtual iterator_range<block_iterator> blocks() const = 0;
- virtual bool isLoop() const = 0;
-};
-
-template <typename T> class ConcreteRegion : public Region {
- const T *Region;
-
-public:
- ConcreteRegion(const T *Region) : Region(Region) {}
- MachineBasicBlock *getHeader() const override { return Region->getHeader(); }
- bool contains(const MachineBasicBlock *MBB) const override {
- return Region->contains(MBB);
- }
- unsigned getNumBlocks() const override { return Region->getNumBlocks(); }
- iterator_range<block_iterator> blocks() const override {
- return Region->blocks();
- }
- bool isLoop() const override { return false; }
-};
-
-template <> bool ConcreteRegion<MachineLoop>::isLoop() const { return true; }
-
-// This class has information of nested Regions; this is analogous to what
-// LoopInfo is for loops.
-class RegionInfo {
- const MachineLoopInfo &MLI;
- const WebAssemblyExceptionInfo &WEI;
- DenseMap<const MachineLoop *, std::unique_ptr<Region>> LoopMap;
- DenseMap<const WebAssemblyException *, std::unique_ptr<Region>> ExceptionMap;
-
-public:
- RegionInfo(const MachineLoopInfo &MLI, const WebAssemblyExceptionInfo &WEI)
- : MLI(MLI), WEI(WEI) {}
-
- // Returns a smallest loop or exception that contains MBB
- const Region *getRegionFor(const MachineBasicBlock *MBB) {
- const auto *ML = MLI.getLoopFor(MBB);
- const auto *WE = WEI.getExceptionFor(MBB);
- if (!ML && !WE)
- return nullptr;
- // We determine subregion relationship by domination of their headers, i.e.,
- // if region A's header dominates region B's header, B is a subregion of A.
- // WebAssemblyException contains BBs in all its subregions (loops or
- // exceptions), but MachineLoop may not, because MachineLoop does not contain
- // BBs that don't have a path to its header even if they are dominated by
- // its header. So here we should use WE->contains(ML->getHeader()), but not
- // ML->contains(WE->getHeader()).
- if ((ML && !WE) || (ML && WE && WE->contains(ML->getHeader()))) {
- // If the smallest region containing MBB is a loop
- if (LoopMap.count(ML))
- return LoopMap[ML].get();
- LoopMap[ML] = std::make_unique<ConcreteRegion<MachineLoop>>(ML);
- return LoopMap[ML].get();
- } else {
- // If the smallest region containing MBB is an exception
- if (ExceptionMap.count(WE))
- return ExceptionMap[WE].get();
- ExceptionMap[WE] =
- std::make_unique<ConcreteRegion<WebAssemblyException>>(WE);
- return ExceptionMap[WE].get();
- }
- }
-};
-
class WebAssemblyCFGSort final : public MachineFunctionPass {
StringRef getPassName() const override { return "WebAssembly CFG Sort"; }
@@ -236,14 +167,14 @@ struct CompareBlockNumbersBackwards {
/// Bookkeeping for a region to help ensure that we don't mix blocks not
/// dominated by its header among its blocks.
struct Entry {
- const Region *TheRegion;
+ const SortRegion *TheRegion;
unsigned NumBlocksLeft;
/// List of blocks not dominated by Loop's header that are deferred until
/// after all of Loop's blocks have been seen.
std::vector<MachineBasicBlock *> Deferred;
- explicit Entry(const class Region *R)
+ explicit Entry(const SortRegion *R)
: TheRegion(R), NumBlocksLeft(R->getNumBlocks()) {}
};
} // end anonymous namespace
@@ -287,10 +218,10 @@ static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
CompareBlockNumbersBackwards>
Ready;
- RegionInfo RI(MLI, WEI);
+ SortRegionInfo SRI(MLI, WEI);
SmallVector<Entry, 4> Entries;
for (MachineBasicBlock *MBB = &MF.front();;) {
- const Region *R = RI.getRegionFor(MBB);
+ const SortRegion *R = SRI.getRegionFor(MBB);
if (R) {
// If MBB is a region header, add it to the active region list. We can't
// put any blocks that it doesn't dominate until we see the end of the
@@ -373,7 +304,7 @@ static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
MF.RenumberBlocks();
#ifndef NDEBUG
- SmallSetVector<const Region *, 8> OnStack;
+ SmallSetVector<const SortRegion *, 8> OnStack;
// Insert a sentinel representing the degenerate loop that starts at the
// function entry block and includes the entire function as a "loop" that
@@ -382,7 +313,7 @@ static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
for (auto &MBB : MF) {
assert(MBB.getNumber() >= 0 && "Renumbered blocks should be non-negative.");
- const Region *Region = RI.getRegionFor(&MBB);
+ const SortRegion *Region = SRI.getRegionFor(&MBB);
if (Region && &MBB == Region->getHeader()) {
// Region header.
@@ -408,10 +339,10 @@ static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
for (auto Pred : MBB.predecessors())
assert(Pred->getNumber() < MBB.getNumber() &&
"Non-loop-header predecessors should be topologically sorted");
- assert(OnStack.count(RI.getRegionFor(&MBB)) &&
+ assert(OnStack.count(SRI.getRegionFor(&MBB)) &&
"Blocks must be nested in their regions");
}
- while (OnStack.size() > 1 && &MBB == WebAssembly::getBottom(OnStack.back()))
+ while (OnStack.size() > 1 && &MBB == SRI.getBottom(OnStack.back()))
OnStack.pop_back();
}
assert(OnStack.pop_back_val() == nullptr &&
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index 8cbfc98e8197..a8e0c3efea0e 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -24,6 +24,7 @@
#include "WebAssembly.h"
#include "WebAssemblyExceptionInfo.h"
#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySortRegion.h"
#include "WebAssemblySubtarget.h"
#include "WebAssemblyUtilities.h"
#include "llvm/ADT/Statistic.h"
@@ -33,6 +34,7 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
+using WebAssembly::SortRegionInfo;
#define DEBUG_TYPE "wasm-cfg-stackify"
@@ -55,6 +57,11 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass {
// which holds the beginning of the scope. This will allow us to quickly skip
// over scoped regions when walking blocks.
SmallVector<MachineBasicBlock *, 8> ScopeTops;
+ void updateScopeTops(MachineBasicBlock *Begin, MachineBasicBlock *End) {
+ int EndNo = End->getNumber();
+ if (!ScopeTops[EndNo] || ScopeTops[EndNo]->getNumber() > Begin->getNumber())
+ ScopeTops[EndNo] = Begin;
+ }
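A standalone sketch, using plain integers instead of MachineBasicBlocks, of the invariant the new updateScopeTops helper maintains: for each block number where one or more scopes end, ScopeTops records the earliest-numbered block where any of those scopes begins.

#include <vector>

struct ScopeTopsSketch {
  // Indexed by block number; -1 means no scope ends at that block.
  std::vector<int> ScopeTops;
  explicit ScopeTopsSketch(int NumBlocks) : ScopeTops(NumBlocks, -1) {}
  void updateScopeTops(int BeginNo, int EndNo) {
    if (ScopeTops[EndNo] < 0 || ScopeTops[EndNo] > BeginNo)
      ScopeTops[EndNo] = BeginNo;
  }
};

// If a BLOCK spans blocks 1..4 and a LOOP spans 0..4, ScopeTops[4] ends up as 0:
// the farthest-spanning scope that ends at block 4.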
// Placing markers.
void placeMarkers(MachineFunction &MF);
@@ -133,10 +140,10 @@ static bool explicitlyBranchesTo(MachineBasicBlock *Pred,
// contains instructions that should go before the marker, and AfterSet contains
// ones that should go after the marker. In this function, AfterSet is only
// used for sanity checking.
+template <typename Container>
static MachineBasicBlock::iterator
-getEarliestInsertPos(MachineBasicBlock *MBB,
- const SmallPtrSet<const MachineInstr *, 4> &BeforeSet,
- const SmallPtrSet<const MachineInstr *, 4> &AfterSet) {
+getEarliestInsertPos(MachineBasicBlock *MBB, const Container &BeforeSet,
+ const Container &AfterSet) {
auto InsertPos = MBB->end();
while (InsertPos != MBB->begin()) {
if (BeforeSet.count(&*std::prev(InsertPos))) {
@@ -157,10 +164,10 @@ getEarliestInsertPos(MachineBasicBlock *MBB,
// contains instructions that should go before the marker, and AfterSet contains
// ones that should go after the marker. In this function, BeforeSet is only
// used for sanity checking.
+template <typename Container>
static MachineBasicBlock::iterator
-getLatestInsertPos(MachineBasicBlock *MBB,
- const SmallPtrSet<const MachineInstr *, 4> &BeforeSet,
- const SmallPtrSet<const MachineInstr *, 4> &AfterSet) {
+getLatestInsertPos(MachineBasicBlock *MBB, const Container &BeforeSet,
+ const Container &AfterSet) {
auto InsertPos = MBB->begin();
while (InsertPos != MBB->end()) {
if (AfterSet.count(&*InsertPos)) {
@@ -219,20 +226,12 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
// which reduces overall stack height.
MachineBasicBlock *Header = nullptr;
bool IsBranchedTo = false;
- bool IsBrOnExn = false;
- MachineInstr *BrOnExn = nullptr;
int MBBNumber = MBB.getNumber();
for (MachineBasicBlock *Pred : MBB.predecessors()) {
if (Pred->getNumber() < MBBNumber) {
Header = Header ? MDT.findNearestCommonDominator(Header, Pred) : Pred;
- if (explicitlyBranchesTo(Pred, &MBB)) {
+ if (explicitlyBranchesTo(Pred, &MBB))
IsBranchedTo = true;
- if (Pred->getFirstTerminator()->getOpcode() == WebAssembly::BR_ON_EXN) {
- IsBrOnExn = true;
- assert(!BrOnExn && "There should be only one br_on_exn per block");
- BrOnExn = &*Pred->getFirstTerminator();
- }
- }
}
}
if (!Header)
@@ -317,22 +316,7 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
}
// Add the BLOCK.
-
- // 'br_on_exn' extracts exnref object and pushes variable number of values
- // depending on its tag. For C++ exception, its a single i32 value, and the
- // generated code will be in the form of:
- // block i32
- // br_on_exn 0, $__cpp_exception
- // rethrow
- // end_block
WebAssembly::BlockType ReturnType = WebAssembly::BlockType::Void;
- if (IsBrOnExn) {
- const char *TagName = BrOnExn->getOperand(1).getSymbolName();
- if (std::strcmp(TagName, "__cpp_exception") != 0)
- llvm_unreachable("Only C++ exception is supported");
- ReturnType = WebAssembly::BlockType::I32;
- }
-
auto InsertPos = getLatestInsertPos(Header, BeforeSet, AfterSet);
MachineInstr *Begin =
BuildMI(*Header, InsertPos, Header->findDebugLoc(InsertPos),
@@ -372,16 +356,15 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
registerScope(Begin, End);
// Track the farthest-spanning scope that ends at this point.
- int Number = MBB.getNumber();
- if (!ScopeTops[Number] ||
- ScopeTops[Number]->getNumber() > Header->getNumber())
- ScopeTops[Number] = Header;
+ updateScopeTops(Header, &MBB);
}
/// Insert a LOOP marker for a loop starting at MBB (if it's a loop header).
void WebAssemblyCFGStackify::placeLoopMarker(MachineBasicBlock &MBB) {
MachineFunction &MF = *MBB.getParent();
const auto &MLI = getAnalysis<MachineLoopInfo>();
+ const auto &WEI = getAnalysis<WebAssemblyExceptionInfo>();
+ SortRegionInfo SRI(MLI, WEI);
const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
MachineLoop *Loop = MLI.getLoopFor(&MBB);
@@ -390,7 +373,7 @@ void WebAssemblyCFGStackify::placeLoopMarker(MachineBasicBlock &MBB) {
// The operand of a LOOP is the first block after the loop. If the loop is the
// bottom of the function, insert a dummy block at the end.
- MachineBasicBlock *Bottom = WebAssembly::getBottom(Loop);
+ MachineBasicBlock *Bottom = SRI.getBottom(Loop);
auto Iter = std::next(Bottom->getIterator());
if (Iter == MF.end()) {
getAppendixBlock(MF);
@@ -441,8 +424,7 @@ void WebAssemblyCFGStackify::placeLoopMarker(MachineBasicBlock &MBB) {
assert((!ScopeTops[AfterLoop->getNumber()] ||
ScopeTops[AfterLoop->getNumber()]->getNumber() < MBB.getNumber()) &&
"With block sorting the outermost loop for a block should be first.");
- if (!ScopeTops[AfterLoop->getNumber()])
- ScopeTops[AfterLoop->getNumber()] = &MBB;
+ updateScopeTops(&MBB, AfterLoop);
}
void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) {
@@ -450,7 +432,9 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) {
MachineFunction &MF = *MBB.getParent();
auto &MDT = getAnalysis<MachineDominatorTree>();
const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ const auto &MLI = getAnalysis<MachineLoopInfo>();
const auto &WEI = getAnalysis<WebAssemblyExceptionInfo>();
+ SortRegionInfo SRI(MLI, WEI);
const auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
// Compute the nearest common dominator of all unwind predecessors
@@ -470,7 +454,7 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) {
// end.
WebAssemblyException *WE = WEI.getExceptionFor(&MBB);
assert(WE);
- MachineBasicBlock *Bottom = WebAssembly::getBottom(WE);
+ MachineBasicBlock *Bottom = SRI.getBottom(WE);
auto Iter = std::next(Bottom->getIterator());
if (Iter == MF.end()) {
@@ -639,11 +623,8 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) {
// catch |
// end_block --|
// end_try
- for (int Number : {Cont->getNumber(), MBB.getNumber()}) {
- if (!ScopeTops[Number] ||
- ScopeTops[Number]->getNumber() > Header->getNumber())
- ScopeTops[Number] = Header;
- }
+ for (auto *End : {&MBB, Cont})
+ updateScopeTops(Header, End);
}
void WebAssemblyCFGStackify::removeUnnecessaryInstrs(MachineFunction &MF) {
@@ -656,11 +637,32 @@ void WebAssemblyCFGStackify::removeUnnecessaryInstrs(MachineFunction &MF) {
// try
// ...
// br bb2 <- Not necessary
- // bb1:
+ // bb1 (ehpad):
+ // catch
+ // ...
+ // bb2: <- Continuation BB
+ // end
+ //
+ // A more involved case: when the BB where 'end' is located is itself another EH
+ // pad, the Cont (= continuation) BB is that EH pad's 'end' BB. For example,
+ // bb0:
+ // try
+ // try
+ // ...
+ // br bb3 <- Not necessary
+ // bb1 (ehpad):
+ // catch
+ // bb2 (ehpad):
+ // end
// catch
// ...
- // bb2:
+ // bb3: <- Continuation BB
// end
+ //
+ // When the EH pad at hand is bb1, its matching end_try is in bb2. But it is
+ // another EH pad, so bb0's continuation BB becomes bb3. So 'br bb3' in the
+ // code can be deleted. This is why we run 'while' until 'Cont' is not an EH
+ // pad.
for (auto &MBB : MF) {
if (!MBB.isEHPad())
continue;
@@ -668,7 +670,14 @@ void WebAssemblyCFGStackify::removeUnnecessaryInstrs(MachineFunction &MF) {
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
MachineBasicBlock *EHPadLayoutPred = MBB.getPrevNode();
- MachineBasicBlock *Cont = BeginToEnd[EHPadToTry[&MBB]]->getParent();
+
+ MachineBasicBlock *Cont = &MBB;
+ while (Cont->isEHPad()) {
+ MachineInstr *Try = EHPadToTry[Cont];
+ MachineInstr *EndTry = BeginToEnd[Try];
+ Cont = EndTry->getParent();
+ }
+
bool Analyzable = !TII.analyzeBranch(*EHPadLayoutPred, TBB, FBB, Cond);
// This condition means either
// 1. This BB ends with a single unconditional branch whose destination is
@@ -745,18 +754,26 @@ static unsigned getCopyOpcode(const TargetRegisterClass *RC) {
return WebAssembly::COPY_F64;
if (RC == &WebAssembly::V128RegClass)
return WebAssembly::COPY_V128;
- if (RC == &WebAssembly::EXNREFRegClass)
- return WebAssembly::COPY_EXNREF;
+ if (RC == &WebAssembly::FUNCREFRegClass)
+ return WebAssembly::COPY_FUNCREF;
+ if (RC == &WebAssembly::EXTERNREFRegClass)
+ return WebAssembly::COPY_EXTERNREF;
llvm_unreachable("Unexpected register class");
}
// When MBB is split into MBB and Split, we should unstackify defs in MBB that
// have their uses in Split.
-static void unstackifyVRegsUsedInSplitBB(MachineBasicBlock &MBB,
- MachineBasicBlock &Split,
- WebAssemblyFunctionInfo &MFI,
- MachineRegisterInfo &MRI,
- const WebAssemblyInstrInfo &TII) {
+// FIXME This function will be used when fixing unwind mismatches, but the old
+// version of that function was removed for the moment and the new version has
+// not yet been added. So 'LLVM_ATTRIBUTE_UNUSED' is added to suppress the
+// warning. Remove the attribute after the new functionality is added.
+LLVM_ATTRIBUTE_UNUSED static void
+unstackifyVRegsUsedInSplitBB(MachineBasicBlock &MBB, MachineBasicBlock &Split) {
+ MachineFunction &MF = *MBB.getParent();
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+ auto &MRI = MF.getRegInfo();
+
for (auto &MI : Split) {
for (auto &MO : MI.explicit_uses()) {
if (!MO.isReg() || Register::isPhysicalRegister(MO.getReg()))
@@ -810,525 +827,8 @@ static void unstackifyVRegsUsedInSplitBB(MachineBasicBlock &MBB,
}
bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) {
- const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
- auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
- MachineRegisterInfo &MRI = MF.getRegInfo();
-
- // Linearizing the control flow by placing TRY / END_TRY markers can create
- // mismatches in unwind destinations. There are two kinds of mismatches we
- // try to solve here.
-
- // 1. When an instruction may throw, but the EH pad it will unwind to can be
- // different from the original CFG.
- //
- // Example: we have the following CFG:
- // bb0:
- // call @foo (if it throws, unwind to bb2)
- // bb1:
- // call @bar (if it throws, unwind to bb3)
- // bb2 (ehpad):
- // catch
- // ...
- // bb3 (ehpad)
- // catch
- // handler body
- //
- // And the CFG is sorted in this order. Then after placing TRY markers, it
- // will look like: (BB markers are omitted)
- // try $label1
- // try
- // call @foo
- // call @bar (if it throws, unwind to bb3)
- // catch <- ehpad (bb2)
- // ...
- // end_try
- // catch <- ehpad (bb3)
- // handler body
- // end_try
- //
- // Now if bar() throws, it is going to end up ip in bb2, not bb3, where it
- // is supposed to end up. We solve this problem by
- // a. Split the target unwind EH pad (here bb3) so that the handler body is
- // right after 'end_try', which means we extract the handler body out of
- // the catch block. We do this because this handler body should be
- // somewhere branch-eable from the inner scope.
- // b. Wrap the call that has an incorrect unwind destination ('call @bar'
- // here) with a nested try/catch/end_try scope, and within the new catch
- // block, branches to the handler body.
- // c. Place a branch after the newly inserted nested end_try so it can bypass
- // the handler body, which is now outside of a catch block.
- //
- // The result will like as follows. (new: a) means this instruction is newly
- // created in the process of doing 'a' above.
- //
- // block $label0 (new: placeBlockMarker)
- // try $label1
- // try
- // call @foo
- // try (new: b)
- // call @bar
- // catch (new: b)
- // local.set n / drop (new: b)
- // br $label1 (new: b)
- // end_try (new: b)
- // catch <- ehpad (bb2)
- // end_try
- // br $label0 (new: c)
- // catch <- ehpad (bb3)
- // end_try (hoisted: a)
- // handler body
- // end_block (new: placeBlockMarker)
- //
- // Note that the new wrapping block/end_block will be generated later in
- // placeBlockMarker.
- //
- // TODO Currently local.set and local.gets are generated to move exnref value
- // created by catches. That's because we don't support yielding values from a
- // block in LLVM machine IR yet, even though it is supported by wasm. Delete
- // unnecessary local.get/local.sets once yielding values from a block is
- // supported. The full EH spec requires multi-value support to do this, but
- // for C++ we don't yet need it because we only throw a single i32.
- //
- // ---
- // 2. The same as 1, but in this case an instruction unwinds to a caller
- // function and not another EH pad.
- //
- // Example: we have the following CFG:
- // bb0:
- // call @foo (if it throws, unwind to bb2)
- // bb1:
- // call @bar (if it throws, unwind to caller)
- // bb2 (ehpad):
- // catch
- // ...
- //
- // And the CFG is sorted in this order. Then after placing TRY markers, it
- // will look like:
- // try
- // call @foo
- // call @bar (if it throws, unwind to caller)
- // catch <- ehpad (bb2)
- // ...
- // end_try
- //
- // Now if bar() throws, it is going to end up ip in bb2, when it is supposed
- // throw up to the caller.
- // We solve this problem by
- // a. Create a new 'appendix' BB at the end of the function and put a single
- // 'rethrow' instruction (+ local.get) in there.
- // b. Wrap the call that has an incorrect unwind destination ('call @bar'
- // here) with a nested try/catch/end_try scope, and within the new catch
- // block, branches to the new appendix block.
- //
- // block $label0 (new: placeBlockMarker)
- // try
- // call @foo
- // try (new: b)
- // call @bar
- // catch (new: b)
- // local.set n (new: b)
- // br $label0 (new: b)
- // end_try (new: b)
- // catch <- ehpad (bb2)
- // ...
- // end_try
- // ...
- // end_block (new: placeBlockMarker)
- // local.get n (new: a) <- appendix block
- // rethrow (new: a)
- //
- // In case there are multiple calls in a BB that may throw to the caller, they
- // can be wrapped together in one nested try scope. (In 1, this couldn't
- // happen, because may-throwing instruction there had an unwind destination,
- // i.e., it was an invoke before, and there could be only one invoke within a
- // BB.)
-
- SmallVector<const MachineBasicBlock *, 8> EHPadStack;
- // Range of intructions to be wrapped in a new nested try/catch
- using TryRange = std::pair<MachineInstr *, MachineInstr *>;
- // In original CFG, <unwind destination BB, a vector of try ranges>
- DenseMap<MachineBasicBlock *, SmallVector<TryRange, 4>> UnwindDestToTryRanges;
- // In new CFG, <destination to branch to, a vector of try ranges>
- DenseMap<MachineBasicBlock *, SmallVector<TryRange, 4>> BrDestToTryRanges;
- // In new CFG, <destination to branch to, register containing exnref>
- DenseMap<MachineBasicBlock *, unsigned> BrDestToExnReg;
-
- // Destinations for branches that will be newly added, for which a new
- // BLOCK/END_BLOCK markers are necessary.
- SmallVector<MachineBasicBlock *, 8> BrDests;
-
- // Gather possibly throwing calls (i.e., previously invokes) whose current
- // unwind destination is not the same as the original CFG.
- for (auto &MBB : reverse(MF)) {
- bool SeenThrowableInstInBB = false;
- for (auto &MI : reverse(MBB)) {
- if (MI.getOpcode() == WebAssembly::TRY)
- EHPadStack.pop_back();
- else if (MI.getOpcode() == WebAssembly::CATCH)
- EHPadStack.push_back(MI.getParent());
-
- // In this loop we only gather calls that have an EH pad to unwind. So
- // there will be at most 1 such call (= invoke) in a BB, so after we've
- // seen one, we can skip the rest of BB. Also if MBB has no EH pad
- // successor or MI does not throw, this is not an invoke.
- if (SeenThrowableInstInBB || !MBB.hasEHPadSuccessor() ||
- !WebAssembly::mayThrow(MI))
- continue;
- SeenThrowableInstInBB = true;
-
- // If the EH pad on the stack top is where this instruction should unwind
- // next, we're good.
- MachineBasicBlock *UnwindDest = nullptr;
- for (auto *Succ : MBB.successors()) {
- if (Succ->isEHPad()) {
- UnwindDest = Succ;
- break;
- }
- }
- if (EHPadStack.back() == UnwindDest)
- continue;
-
- // If not, record the range.
- UnwindDestToTryRanges[UnwindDest].push_back(TryRange(&MI, &MI));
- }
- }
-
- assert(EHPadStack.empty());
-
- // Gather possibly throwing calls that are supposed to unwind up to the caller
- // if they throw, but currently unwind to an incorrect destination. Unlike the
- // loop above, there can be multiple calls within a BB that unwind to the
- // caller, which we should group together in a range.
- bool NeedAppendixBlock = false;
- for (auto &MBB : reverse(MF)) {
- MachineInstr *RangeBegin = nullptr, *RangeEnd = nullptr; // inclusive
- for (auto &MI : reverse(MBB)) {
- if (MI.getOpcode() == WebAssembly::TRY)
- EHPadStack.pop_back();
- else if (MI.getOpcode() == WebAssembly::CATCH)
- EHPadStack.push_back(MI.getParent());
-
- // If MBB has an EH pad successor, this inst does not unwind to caller.
- if (MBB.hasEHPadSuccessor())
- continue;
-
- // We wrap up the current range when we see a marker even if we haven't
- // finished a BB.
- if (RangeEnd && WebAssembly::isMarker(MI.getOpcode())) {
- NeedAppendixBlock = true;
- // Record the range. nullptr here means the unwind destination is the
- // caller.
- UnwindDestToTryRanges[nullptr].push_back(
- TryRange(RangeBegin, RangeEnd));
- RangeBegin = RangeEnd = nullptr; // Reset range pointers
- }
-
- // If EHPadStack is empty, that means it is correctly unwind to caller if
- // it throws, so we're good. If MI does not throw, we're good too.
- if (EHPadStack.empty() || !WebAssembly::mayThrow(MI))
- continue;
-
- // We found an instruction that unwinds to the caller but currently has an
- // incorrect unwind destination. Create a new range or increment the
- // currently existing range.
- if (!RangeEnd)
- RangeBegin = RangeEnd = &MI;
- else
- RangeBegin = &MI;
- }
-
- if (RangeEnd) {
- NeedAppendixBlock = true;
- // Record the range. nullptr here means the unwind destination is the
- // caller.
- UnwindDestToTryRanges[nullptr].push_back(TryRange(RangeBegin, RangeEnd));
- RangeBegin = RangeEnd = nullptr; // Reset range pointers
- }
- }
-
- assert(EHPadStack.empty());
- // We don't have any unwind destination mismatches to resolve.
- if (UnwindDestToTryRanges.empty())
- return false;
-
- // If we found instructions that should unwind to the caller but currently
- // have incorrect unwind destination, we create an appendix block at the end
- // of the function with a local.get and a rethrow instruction.
- if (NeedAppendixBlock) {
- auto *AppendixBB = getAppendixBlock(MF);
- Register ExnReg = MRI.createVirtualRegister(&WebAssembly::EXNREFRegClass);
- BuildMI(AppendixBB, DebugLoc(), TII.get(WebAssembly::RETHROW))
- .addReg(ExnReg);
- // These instruction ranges should branch to this appendix BB.
- for (auto Range : UnwindDestToTryRanges[nullptr])
- BrDestToTryRanges[AppendixBB].push_back(Range);
- BrDestToExnReg[AppendixBB] = ExnReg;
- }
-
- // We loop through unwind destination EH pads that are targeted from some
- // inner scopes. Because these EH pads are destination of more than one scope
- // now, we split them so that the handler body is after 'end_try'.
- // - Before
- // ehpad:
- // catch
- // local.set n / drop
- // handler body
- // ...
- // cont:
- // end_try
- //
- // - After
- // ehpad:
- // catch
- // local.set n / drop
- // brdest: (new)
- // end_try (hoisted from 'cont' BB)
- // handler body (taken from 'ehpad')
- // ...
- // cont:
- for (auto &P : UnwindDestToTryRanges) {
- NumUnwindMismatches += P.second.size();
-
- // This means the destination is the appendix BB, which was separately
- // handled above.
- if (!P.first)
- continue;
-
- MachineBasicBlock *EHPad = P.first;
-
- // Find 'catch' and 'local.set' or 'drop' instruction that follows the
- // 'catch'. If -wasm-disable-explicit-locals is not set, 'catch' should be
- // always followed by either 'local.set' or a 'drop', because 'br_on_exn' is
- // generated after 'catch' in LateEHPrepare and we don't support blocks
- // taking values yet.
- MachineInstr *Catch = nullptr;
- unsigned ExnReg = 0;
- for (auto &MI : *EHPad) {
- switch (MI.getOpcode()) {
- case WebAssembly::CATCH:
- Catch = &MI;
- ExnReg = Catch->getOperand(0).getReg();
- break;
- }
- }
- assert(Catch && "EH pad does not have a catch");
- assert(ExnReg != 0 && "Invalid register");
-
- auto SplitPos = std::next(Catch->getIterator());
-
- // Create a new BB that's gonna be the destination for branches from the
- // inner mismatched scope.
- MachineInstr *BeginTry = EHPadToTry[EHPad];
- MachineInstr *EndTry = BeginToEnd[BeginTry];
- MachineBasicBlock *Cont = EndTry->getParent();
- auto *BrDest = MF.CreateMachineBasicBlock();
- MF.insert(std::next(EHPad->getIterator()), BrDest);
- // Hoist up the existing 'end_try'.
- BrDest->insert(BrDest->end(), EndTry->removeFromParent());
- // Take out the handler body from EH pad to the new branch destination BB.
- BrDest->splice(BrDest->end(), EHPad, SplitPos, EHPad->end());
- unstackifyVRegsUsedInSplitBB(*EHPad, *BrDest, MFI, MRI, TII);
- // Fix predecessor-successor relationship.
- BrDest->transferSuccessors(EHPad);
- EHPad->addSuccessor(BrDest);
-
- // All try ranges that were supposed to unwind to this EH pad now have to
- // branch to this new branch dest BB.
- for (auto Range : UnwindDestToTryRanges[EHPad])
- BrDestToTryRanges[BrDest].push_back(Range);
- BrDestToExnReg[BrDest] = ExnReg;
-
- // In case we fall through to the continuation BB after the catch block, we
- // now have to add a branch to it.
- // - Before
- // try
- // ...
- // (falls through to 'cont')
- // catch
- // handler body
- // end
- // <-- cont
- //
- // - After
- // try
- // ...
- // br %cont (new)
- // catch
- // end
- // handler body
- // <-- cont
- MachineBasicBlock *EHPadLayoutPred = &*std::prev(EHPad->getIterator());
- MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
- SmallVector<MachineOperand, 4> Cond;
- bool Analyzable = !TII.analyzeBranch(*EHPadLayoutPred, TBB, FBB, Cond);
- if (Analyzable && !TBB && !FBB) {
- DebugLoc DL = EHPadLayoutPred->empty()
- ? DebugLoc()
- : EHPadLayoutPred->rbegin()->getDebugLoc();
- BuildMI(EHPadLayoutPred, DL, TII.get(WebAssembly::BR)).addMBB(Cont);
- BrDests.push_back(Cont);
- }
- }
-
- // For possibly throwing calls whose unwind destinations are currently
- // incorrect because of CFG linearization, we wrap them with a nested
- // try/catch/end_try, and within the new catch block, we branch to the correct
- // handler.
- // - Before
- // mbb:
- // call @foo <- Unwind destination mismatch!
- // ehpad:
- // ...
- //
- // - After
- // mbb:
- // try (new)
- // call @foo
- // nested-ehpad: (new)
- // catch (new)
- // local.set n / drop (new)
- // br %brdest (new)
- // nested-end: (new)
- // end_try (new)
- // ehpad:
- // ...
- for (auto &P : BrDestToTryRanges) {
- MachineBasicBlock *BrDest = P.first;
- auto &TryRanges = P.second;
- unsigned ExnReg = BrDestToExnReg[BrDest];
-
- for (auto Range : TryRanges) {
- MachineInstr *RangeBegin = nullptr, *RangeEnd = nullptr;
- std::tie(RangeBegin, RangeEnd) = Range;
- auto *MBB = RangeBegin->getParent();
- // Store the first function call from this range, because RangeBegin can
- // be moved to point EH_LABEL before the call
- MachineInstr *RangeBeginCall = RangeBegin;
-
- // Include possible EH_LABELs in the range
- if (RangeBegin->getIterator() != MBB->begin() &&
- std::prev(RangeBegin->getIterator())->isEHLabel())
- RangeBegin = &*std::prev(RangeBegin->getIterator());
- if (std::next(RangeEnd->getIterator()) != MBB->end() &&
- std::next(RangeEnd->getIterator())->isEHLabel())
- RangeEnd = &*std::next(RangeEnd->getIterator());
-
- MachineBasicBlock *EHPad = nullptr;
- for (auto *Succ : MBB->successors()) {
- if (Succ->isEHPad()) {
- EHPad = Succ;
- break;
- }
- }
-
- // Local expression tree before the first call of this range should go
- // after the nested TRY.
- SmallPtrSet<const MachineInstr *, 4> AfterSet;
- AfterSet.insert(RangeBegin);
- AfterSet.insert(RangeBeginCall);
- for (auto I = MachineBasicBlock::iterator(RangeBeginCall),
- E = MBB->begin();
- I != E; --I) {
- if (std::prev(I)->isDebugInstr() || std::prev(I)->isPosition())
- continue;
- if (WebAssembly::isChild(*std::prev(I), MFI))
- AfterSet.insert(&*std::prev(I));
- else
- break;
- }
-
- // Create the nested try instruction.
- auto InsertPos = getLatestInsertPos(
- MBB, SmallPtrSet<const MachineInstr *, 4>(), AfterSet);
- MachineInstr *NestedTry =
- BuildMI(*MBB, InsertPos, RangeBegin->getDebugLoc(),
- TII.get(WebAssembly::TRY))
- .addImm(int64_t(WebAssembly::BlockType::Void));
-
- // Create the nested EH pad and fill instructions in.
- MachineBasicBlock *NestedEHPad = MF.CreateMachineBasicBlock();
- MF.insert(std::next(MBB->getIterator()), NestedEHPad);
- NestedEHPad->setIsEHPad();
- NestedEHPad->setIsEHScopeEntry();
- BuildMI(NestedEHPad, RangeEnd->getDebugLoc(), TII.get(WebAssembly::CATCH),
- ExnReg);
- BuildMI(NestedEHPad, RangeEnd->getDebugLoc(), TII.get(WebAssembly::BR))
- .addMBB(BrDest);
-
- // Create the nested continuation BB and end_try instruction.
- MachineBasicBlock *NestedCont = MF.CreateMachineBasicBlock();
- MF.insert(std::next(NestedEHPad->getIterator()), NestedCont);
- MachineInstr *NestedEndTry =
- BuildMI(*NestedCont, NestedCont->begin(), RangeEnd->getDebugLoc(),
- TII.get(WebAssembly::END_TRY));
- // In case MBB has more instructions after the try range, move them to the
- // new nested continuation BB.
- NestedCont->splice(NestedCont->end(), MBB,
- std::next(RangeEnd->getIterator()), MBB->end());
- unstackifyVRegsUsedInSplitBB(*MBB, *NestedCont, MFI, MRI, TII);
- registerTryScope(NestedTry, NestedEndTry, NestedEHPad);
-
- // Fix predecessor-successor relationship.
- NestedCont->transferSuccessors(MBB);
- if (EHPad) {
- NestedCont->removeSuccessor(EHPad);
- // If EHPad does not have any predecessors left after removing
- // NextedCont predecessor, remove its successor too, because this EHPad
- // is not reachable from the entry BB anyway. We can't remove EHPad BB
- // itself because it can contain 'catch' or 'end', which are necessary
- // for keeping try-catch-end structure.
- if (EHPad->pred_empty())
- EHPad->removeSuccessor(BrDest);
- }
- MBB->addSuccessor(NestedEHPad);
- MBB->addSuccessor(NestedCont);
- NestedEHPad->addSuccessor(BrDest);
- }
- }
-
- // Renumber BBs and recalculate ScopeTop info because new BBs might have been
- // created and inserted above.
- MF.RenumberBlocks();
- ScopeTops.clear();
- ScopeTops.resize(MF.getNumBlockIDs());
- for (auto &MBB : reverse(MF)) {
- for (auto &MI : reverse(MBB)) {
- if (ScopeTops[MBB.getNumber()])
- break;
- switch (MI.getOpcode()) {
- case WebAssembly::END_BLOCK:
- case WebAssembly::END_LOOP:
- case WebAssembly::END_TRY:
- ScopeTops[MBB.getNumber()] = EndToBegin[&MI]->getParent();
- break;
- case WebAssembly::CATCH:
- ScopeTops[MBB.getNumber()] = EHPadToTry[&MBB]->getParent();
- break;
- }
- }
- }
-
- // Recompute the dominator tree.
- getAnalysis<MachineDominatorTree>().runOnMachineFunction(MF);
-
- // Place block markers for newly added branches, if necessary.
-
- // If we've created an appendix BB and a branch to it, place a block/end_block
- // marker for that. For some new branches, those branch destination BBs start
- // with a hoisted end_try marker, so we don't need a new marker there.
- if (AppendixBB)
- BrDests.push_back(AppendixBB);
-
- llvm::sort(BrDests,
- [&](const MachineBasicBlock *A, const MachineBasicBlock *B) {
- auto ANum = A->getNumber();
- auto BNum = B->getNumber();
- return ANum < BNum;
- });
- for (auto *Dest : BrDests)
- placeBlockMarker(*Dest);
-
- return true;
+ // TODO Implement this
+ return false;
}
static unsigned
@@ -1365,22 +865,44 @@ void WebAssemblyCFGStackify::fixEndsAtEndOfFunction(MachineFunction &MF) {
: WebAssembly::BlockType(
WebAssembly::toValType(MFI.getResults().front()));
- for (MachineBasicBlock &MBB : reverse(MF)) {
- for (MachineInstr &MI : reverse(MBB)) {
+ SmallVector<MachineBasicBlock::reverse_iterator, 4> Worklist;
+ Worklist.push_back(MF.rbegin()->rbegin());
+
+ auto Process = [&](MachineBasicBlock::reverse_iterator It) {
+ auto *MBB = It->getParent();
+ while (It != MBB->rend()) {
+ MachineInstr &MI = *It++;
if (MI.isPosition() || MI.isDebugInstr())
continue;
switch (MI.getOpcode()) {
+ case WebAssembly::END_TRY: {
+ // If a 'try''s return type is fixed, both its try body and catch body
+ // should satisfy the return type, so we need to search 'end'
+ // instructions before its corresponding 'catch' too.
+ auto *EHPad = TryToEHPad.lookup(EndToBegin[&MI]);
+ assert(EHPad);
+ auto NextIt =
+ std::next(WebAssembly::findCatch(EHPad)->getReverseIterator());
+ if (NextIt != EHPad->rend())
+ Worklist.push_back(NextIt);
+ LLVM_FALLTHROUGH;
+ }
case WebAssembly::END_BLOCK:
case WebAssembly::END_LOOP:
- case WebAssembly::END_TRY:
EndToBegin[&MI]->getOperand(0).setImm(int32_t(RetType));
continue;
default:
- // Something other than an `end`. We're done.
+ // Something other than an `end`. We're done for this BB.
return;
}
}
- }
+ // We've reached the beginning of a BB. Continue the search in the previous
+ // BB.
+ Worklist.push_back(MBB->getPrevNode()->rbegin());
+ };
+
+ while (!Worklist.empty())
+ Process(Worklist.pop_back_val());
}
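The toy function below is a sketch, under simplifying assumptions, of the worklist idea used above, operating on strings instead of MachineInstrs: walk the function's trailing `end` markers backwards, tag each with the result type, and when an `end_try` is reached also queue a second walk starting just before its matching `catch`, so the try body's trailing ends are handled too. (The real code updates the type operand on the matching begin marker; the toy simply appends the type to the `end` string.)

#include <string>
#include <vector>

static void fixEndsSketch(std::vector<std::string> &Insts,
                          const std::string &RetType) {
  if (Insts.empty())
    return;
  std::vector<int> Worklist{static_cast<int>(Insts.size()) - 1};
  while (!Worklist.empty()) {
    int I = Worklist.back();
    Worklist.pop_back();
    while (I >= 0) {
      std::string &Inst = Insts[I--];
      if (Inst == "end_try") {
        // Find the matching 'catch' and queue a walk of the try body's ends.
        int Depth = 0, J = I;
        while (J >= 0) {
          if (Insts[J] == "end_try")
            ++Depth;
          else if (Insts[J] == "catch") {
            if (Depth == 0)
              break; // this catch pairs with the current end_try
            --Depth;
          }
          --J;
        }
        if (J > 0)
          Worklist.push_back(J - 1);
        Inst += ' ' + RetType; // tag the end_try itself as well
      } else if (Inst == "end_block" || Inst == "end_loop") {
        Inst += ' ' + RetType;
      } else {
        break; // something other than an `end`; this walk is done
      }
    }
  }
}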
// WebAssembly functions end with an end instruction, as if the function body
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp
index 159fb4c00ddc..78191ae758fe 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp
@@ -20,7 +20,19 @@ using namespace llvm;
WebAssemblyDebugValueManager::WebAssemblyDebugValueManager(
MachineInstr *Instr) {
- Instr->collectDebugValues(DbgValues);
+ // This code differs from MachineInstr::collectDebugValues in that it scans
+ // the whole BB, not just contiguous DBG_VALUEs.
+ if (!Instr->getOperand(0).isReg())
+ return;
+
+ MachineBasicBlock::iterator DI = *Instr;
+ ++DI;
+ for (MachineBasicBlock::iterator DE = Instr->getParent()->end(); DI != DE;
+ ++DI) {
+ if (DI->isDebugValue() &&
+ DI->getDebugOperandForReg(Instr->getOperand(0).getReg()))
+ DbgValues.push_back(&*DI);
+ }
}
void WebAssemblyDebugValueManager::move(MachineInstr *Insert) {
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
index 55925bcbe771..ac94e9e80d01 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@@ -96,8 +96,10 @@ static unsigned getDropOpcode(const TargetRegisterClass *RC) {
return WebAssembly::DROP_F64;
if (RC == &WebAssembly::V128RegClass)
return WebAssembly::DROP_V128;
- if (RC == &WebAssembly::EXNREFRegClass)
- return WebAssembly::DROP_EXNREF;
+ if (RC == &WebAssembly::FUNCREFRegClass)
+ return WebAssembly::DROP_FUNCREF;
+ if (RC == &WebAssembly::EXTERNREFRegClass)
+ return WebAssembly::DROP_EXTERNREF;
llvm_unreachable("Unexpected register class");
}
@@ -113,8 +115,10 @@ static unsigned getLocalGetOpcode(const TargetRegisterClass *RC) {
return WebAssembly::LOCAL_GET_F64;
if (RC == &WebAssembly::V128RegClass)
return WebAssembly::LOCAL_GET_V128;
- if (RC == &WebAssembly::EXNREFRegClass)
- return WebAssembly::LOCAL_GET_EXNREF;
+ if (RC == &WebAssembly::FUNCREFRegClass)
+ return WebAssembly::LOCAL_GET_FUNCREF;
+ if (RC == &WebAssembly::EXTERNREFRegClass)
+ return WebAssembly::LOCAL_GET_EXTERNREF;
llvm_unreachable("Unexpected register class");
}
@@ -130,8 +134,10 @@ static unsigned getLocalSetOpcode(const TargetRegisterClass *RC) {
return WebAssembly::LOCAL_SET_F64;
if (RC == &WebAssembly::V128RegClass)
return WebAssembly::LOCAL_SET_V128;
- if (RC == &WebAssembly::EXNREFRegClass)
- return WebAssembly::LOCAL_SET_EXNREF;
+ if (RC == &WebAssembly::FUNCREFRegClass)
+ return WebAssembly::LOCAL_SET_FUNCREF;
+ if (RC == &WebAssembly::EXTERNREFRegClass)
+ return WebAssembly::LOCAL_SET_EXTERNREF;
llvm_unreachable("Unexpected register class");
}
@@ -147,8 +153,10 @@ static unsigned getLocalTeeOpcode(const TargetRegisterClass *RC) {
return WebAssembly::LOCAL_TEE_F64;
if (RC == &WebAssembly::V128RegClass)
return WebAssembly::LOCAL_TEE_V128;
- if (RC == &WebAssembly::EXNREFRegClass)
- return WebAssembly::LOCAL_TEE_EXNREF;
+ if (RC == &WebAssembly::FUNCREFRegClass)
+ return WebAssembly::LOCAL_TEE_FUNCREF;
+ if (RC == &WebAssembly::EXTERNREFRegClass)
+ return WebAssembly::LOCAL_TEE_EXTERNREF;
llvm_unreachable("Unexpected register class");
}
@@ -164,8 +172,10 @@ static MVT typeForRegClass(const TargetRegisterClass *RC) {
return MVT::f64;
if (RC == &WebAssembly::V128RegClass)
return MVT::v16i8;
- if (RC == &WebAssembly::EXNREFRegClass)
- return MVT::exnref;
+ if (RC == &WebAssembly::FUNCREFRegClass)
+ return MVT::funcref;
+ if (RC == &WebAssembly::EXTERNREFRegClass)
+ return MVT::externref;
llvm_unreachable("unrecognized register class");
}
@@ -221,6 +231,10 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
auto Local = static_cast<unsigned>(MI.getOperand(1).getImm());
Reg2Local[Reg] = Local;
checkFrameBase(MFI, Local, Reg);
+
+ // Update debug value to point to the local before removing.
+ WebAssemblyDebugValueManager(&MI).replaceWithLocal(Local);
+
MI.eraseFromParent();
Changed = true;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index c2a0d3e01740..82b032267d55 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -20,12 +20,14 @@
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "WebAssemblyTargetMachine.h"
+#include "WebAssemblyUtilities.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -128,7 +130,8 @@ private:
case MVT::i64:
case MVT::f32:
case MVT::f64:
- case MVT::exnref:
+ case MVT::funcref:
+ case MVT::externref:
return VT;
case MVT::f16:
return MVT::f32;
@@ -704,9 +707,13 @@ bool WebAssemblyFastISel::fastLowerArguments() {
Opc = WebAssembly::ARGUMENT_v2f64;
RC = &WebAssembly::V128RegClass;
break;
- case MVT::exnref:
- Opc = WebAssembly::ARGUMENT_exnref;
- RC = &WebAssembly::EXNREFRegClass;
+ case MVT::funcref:
+ Opc = WebAssembly::ARGUMENT_funcref;
+ RC = &WebAssembly::FUNCREFRegClass;
+ break;
+ case MVT::externref:
+ Opc = WebAssembly::ARGUMENT_externref;
+ RC = &WebAssembly::EXTERNREFRegClass;
break;
default:
return false;
@@ -806,8 +813,11 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
case MVT::v2f64:
ResultReg = createResultReg(&WebAssembly::V128RegClass);
break;
- case MVT::exnref:
- ResultReg = createResultReg(&WebAssembly::EXNREFRegClass);
+ case MVT::funcref:
+ ResultReg = createResultReg(&WebAssembly::FUNCREFRegClass);
+ break;
+ case MVT::externref:
+ ResultReg = createResultReg(&WebAssembly::EXTERNREFRegClass);
break;
default:
return false;
@@ -862,6 +872,15 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
// Add placeholders for the type index and immediate flags
MIB.addImm(0);
MIB.addImm(0);
+
+ // Ensure that the object file has a __indirect_function_table import, as we
+ // call_indirect against it.
+ MCSymbolWasm *Sym = WebAssembly::getOrCreateFunctionTableSymbol(
+ MF->getMMI().getContext(), "__indirect_function_table");
+ // Until call_indirect emits TABLE_NUMBER relocs against this symbol, mark
+ // it as NO_STRIP so as to ensure that the indirect function table makes it
+ // to linked output.
+ Sym->setNoStrip();
}
for (unsigned ArgReg : Args)
@@ -916,9 +935,13 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) {
Opc = WebAssembly::SELECT_F64;
RC = &WebAssembly::F64RegClass;
break;
- case MVT::exnref:
- Opc = WebAssembly::SELECT_EXNREF;
- RC = &WebAssembly::EXNREFRegClass;
+ case MVT::funcref:
+ Opc = WebAssembly::SELECT_FUNCREF;
+ RC = &WebAssembly::FUNCREFRegClass;
+ break;
+ case MVT::externref:
+ Opc = WebAssembly::SELECT_EXTERNREF;
+ RC = &WebAssembly::EXTERNREFRegClass;
break;
default:
return false;
@@ -1321,7 +1344,8 @@ bool WebAssemblyFastISel::selectRet(const Instruction *I) {
case MVT::v2i64:
case MVT::v4f32:
case MVT::v2f64:
- case MVT::exnref:
+ case MVT::funcref:
+ case MVT::externref:
break;
default:
return false;
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp
index 7f805b34b499..52aa3534c78e 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp
@@ -41,13 +41,51 @@ public:
char WebAssemblyFixBrTableDefaults::ID = 0;
+// Target-independent SelectionDAG assumes that it is OK to use PointerTy
+// as the index for a "switch", whereas Wasm so far only has a 32-bit br_table.
+// See e.g. SelectionDAGBuilder::visitJumpTableHeader
+// We have a 64-bit br_table in the tablegen defs as a result, which does get
+// selected, and thus we get incorrect truncates/extensions happening on
+// wasm64. Here we fix that.
+void fixBrTableIndex(MachineInstr &MI, MachineBasicBlock *MBB,
+ MachineFunction &MF) {
+ // Only happens on wasm64.
+ auto &WST = MF.getSubtarget<WebAssemblySubtarget>();
+ if (!WST.hasAddr64())
+ return;
+
+ assert(MI.getDesc().getOpcode() == WebAssembly::BR_TABLE_I64 &&
+ "64-bit br_table pseudo instruction expected");
+
+ // Find extension op, if any. It sits in the previous BB before the branch.
+ auto ExtMI = MF.getRegInfo().getVRegDef(MI.getOperand(0).getReg());
+ if (ExtMI->getOpcode() == WebAssembly::I64_EXTEND_U_I32) {
+ // An unnecessary extension of a 32-bit value to 64 bits; remove it.
+ assert(MI.getOperand(0).getReg() == ExtMI->getOperand(0).getReg());
+ MI.getOperand(0).setReg(ExtMI->getOperand(1).getReg());
+ ExtMI->eraseFromParent();
+ } else {
+ // Incoming 64-bit value that needs to be truncated.
+ Register Reg32 =
+ MF.getRegInfo().createVirtualRegister(&WebAssembly::I32RegClass);
+ BuildMI(*MBB, MI.getIterator(), MI.getDebugLoc(),
+ WST.getInstrInfo()->get(WebAssembly::I32_WRAP_I64), Reg32)
+ .addReg(MI.getOperand(0).getReg());
+ MI.getOperand(0).setReg(Reg32);
+ }
+
+ // We now have a 32-bit operand in all cases, so change the instruction
+ // accordingly.
+ MI.setDesc(WST.getInstrInfo()->get(WebAssembly::BR_TABLE_I32));
+}
+
// `MI` is a br_table instruction with a dummy default target argument. This
// function finds and adds the default target argument and removes any redundant
// range check preceding the br_table. Returns the MBB that the br_table is
// moved into so it can be removed from further consideration, or nullptr if the
// br_table cannot be optimized.
-MachineBasicBlock *fixBrTable(MachineInstr &MI, MachineBasicBlock *MBB,
- MachineFunction &MF) {
+MachineBasicBlock *fixBrTableDefault(MachineInstr &MI, MachineBasicBlock *MBB,
+ MachineFunction &MF) {
// Get the header block, which contains the redundant range check.
assert(MBB->pred_size() == 1 && "Expected a single guard predecessor");
auto *HeaderMBB = *MBB->pred_begin();
@@ -125,7 +163,8 @@ bool WebAssemblyFixBrTableDefaults::runOnMachineFunction(MachineFunction &MF) {
MBBSet.erase(MBB);
for (auto &MI : *MBB) {
if (WebAssembly::isBrTable(MI)) {
- auto *Fixed = fixBrTable(MI, MBB, MF);
+ fixBrTableIndex(MI, MBB, MF);
+ auto *Fixed = fixBrTableDefault(MI, MBB, MF);
if (Fixed != nullptr) {
MBBSet.erase(Fixed);
Changed = true;
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
index dee1c4e28149..d75afdcefb7d 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
@@ -29,7 +29,12 @@ HANDLE_NODETYPE(SWIZZLE)
HANDLE_NODETYPE(VEC_SHL)
HANDLE_NODETYPE(VEC_SHR_S)
HANDLE_NODETYPE(VEC_SHR_U)
+HANDLE_NODETYPE(WIDEN_LOW_S)
+HANDLE_NODETYPE(WIDEN_LOW_U)
+HANDLE_NODETYPE(WIDEN_HIGH_S)
+HANDLE_NODETYPE(WIDEN_HIGH_U)
HANDLE_NODETYPE(THROW)
+HANDLE_NODETYPE(CATCH)
HANDLE_NODETYPE(MEMORY_COPY)
HANDLE_NODETYPE(MEMORY_FILL)
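A standalone illustration of what the new WIDEN_LOW / WIDEN_HIGH nodes are assumed to compute, shown for the v8i16-to-v4i32 case on plain arrays (the _U variants would zero-extend instead of sign-extend). These are the ops that the SIGN_EXTEND/ZERO_EXTEND combines added later in this patch, in WebAssemblyISelLowering.cpp, produce from extends of extract_subvector.

#include <array>
#include <cstdint>

static std::array<int32_t, 4> widenLowS(const std::array<int16_t, 8> &V) {
  return {V[0], V[1], V[2], V[3]}; // sign-extend the low four lanes to i32
}

static std::array<int32_t, 4> widenHighS(const std::array<int16_t, 8> &V) {
  return {V[4], V[5], V[6], V[7]}; // sign-extend the high four lanes to i32
}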
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
index d1a696f854f8..b9154b09fbbc 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -80,9 +80,6 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
MVT PtrVT = TLI->getPointerTy(CurDAG->getDataLayout());
auto GlobalGetIns = PtrVT == MVT::i64 ? WebAssembly::GLOBAL_GET_I64
: WebAssembly::GLOBAL_GET_I32;
- auto ConstIns =
- PtrVT == MVT::i64 ? WebAssembly::CONST_I64 : WebAssembly::CONST_I32;
- auto AddIns = PtrVT == MVT::i64 ? WebAssembly::ADD_I64 : WebAssembly::ADD_I32;
// Few custom selection stuff.
SDLoc DL(Node);
@@ -126,41 +123,6 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
return;
}
- case ISD::GlobalTLSAddress: {
- const auto *GA = cast<GlobalAddressSDNode>(Node);
-
- if (!MF.getSubtarget<WebAssemblySubtarget>().hasBulkMemory())
- report_fatal_error("cannot use thread-local storage without bulk memory",
- false);
-
- // Currently Emscripten does not support dynamic linking with threads.
- // Therefore, if we have thread-local storage, only the local-exec model
- // is possible.
- // TODO: remove this and implement proper TLS models once Emscripten
- // supports dynamic linking with threads.
- if (GA->getGlobal()->getThreadLocalMode() !=
- GlobalValue::LocalExecTLSModel &&
- !Subtarget->getTargetTriple().isOSEmscripten()) {
- report_fatal_error("only -ftls-model=local-exec is supported for now on "
- "non-Emscripten OSes: variable " +
- GA->getGlobal()->getName(),
- false);
- }
-
- SDValue TLSBaseSym = CurDAG->getTargetExternalSymbol("__tls_base", PtrVT);
- SDValue TLSOffsetSym = CurDAG->getTargetGlobalAddress(
- GA->getGlobal(), DL, PtrVT, GA->getOffset(), 0);
-
- MachineSDNode *TLSBase =
- CurDAG->getMachineNode(GlobalGetIns, DL, PtrVT, TLSBaseSym);
- MachineSDNode *TLSOffset =
- CurDAG->getMachineNode(ConstIns, DL, PtrVT, TLSOffsetSym);
- MachineSDNode *TLSAddress = CurDAG->getMachineNode(
- AddIns, DL, PtrVT, SDValue(TLSBase, 0), SDValue(TLSOffset, 0));
- ReplaceNode(Node, TLSAddress);
- return;
- }
-
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
switch (IntNo) {
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 925636c82321..e348bba2b04c 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -16,6 +16,7 @@
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "WebAssemblyTargetMachine.h"
+#include "WebAssemblyUtilities.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -31,6 +32,7 @@
#include "llvm/IR/IntrinsicsWebAssembly.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
@@ -68,6 +70,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
computeRegisterProperties(Subtarget->getRegisterInfo());
setOperationAction(ISD::GlobalAddress, MVTPtr, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVTPtr, Custom);
setOperationAction(ISD::ExternalSymbol, MVTPtr, Custom);
setOperationAction(ISD::JumpTable, MVTPtr, Custom);
setOperationAction(ISD::BlockAddress, MVTPtr, Custom);
@@ -123,6 +126,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// Hoist bitcasts out of shuffles
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+ // Combine extends of extract_subvectors into widening ops
+ setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
+
// Support saturating add for i8x16 and i16x8
for (auto Op : {ISD::SADDSAT, ISD::UADDSAT})
for (auto T : {MVT::v16i8, MVT::v8i16})
@@ -156,11 +163,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// There is no i8x16.mul instruction
setOperationAction(ISD::MUL, MVT::v16i8, Expand);
- // There are no vector select instructions
- for (auto Op : {ISD::VSELECT, ISD::SELECT_CC, ISD::SELECT})
- for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32, MVT::v2i64,
- MVT::v2f64})
- setOperationAction(Op, T, Expand);
+ // There is no vector conditional select instruction
+ for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32, MVT::v2i64,
+ MVT::v2f64})
+ setOperationAction(ISD::SELECT_CC, T, Expand);
// Expand integer operations supported for scalars but not SIMD
for (auto Op : {ISD::CTLZ, ISD::CTTZ, ISD::CTPOP, ISD::SDIV, ISD::UDIV,
@@ -247,6 +253,9 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setLoadExtAction(Ext, MVT::v4i32, MVT::v4i16, Legal);
setLoadExtAction(Ext, MVT::v2i64, MVT::v2i32, Legal);
}
+ // And some truncating stores are legal as well
+ setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
}
// Don't do anything clever with build_pairs
@@ -258,6 +267,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// Exception handling intrinsics
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setMaxAtomicSizeInBitsSupported(64);
@@ -268,7 +278,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
// Define the emscripten name for return address helper.
- // TODO: when implementing other WASM backends, make this generic or only do
+ // TODO: when implementing other Wasm backends, make this generic or only do
// this on emscripten depending on what they end up doing.
setLibcallName(RTLIB::RETURN_ADDRESS, "emscripten_return_address");
@@ -442,6 +452,19 @@ static MachineBasicBlock *LowerCallResults(MachineInstr &CallResults,
const MCInstrDesc &MCID = TII.get(CallOp);
MachineInstrBuilder MIB(MF, MF.CreateMachineInstr(MCID, DL));
+ // See if we must truncate the function pointer.
+ // CALL_INDIRECT takes an i32, but in wasm64 we represent function pointers
+ // as 64-bit for uniformity with other pointer types.
+ if (IsIndirect && MF.getSubtarget<WebAssemblySubtarget>().hasAddr64()) {
+ Register Reg32 =
+ MF.getRegInfo().createVirtualRegister(&WebAssembly::I32RegClass);
+ auto &FnPtr = CallParams.getOperand(0);
+ BuildMI(*BB, CallResults.getIterator(), DL,
+ TII.get(WebAssembly::I32_WRAP_I64), Reg32)
+ .addReg(FnPtr.getReg());
+ FnPtr.setReg(Reg32);
+ }
+
// Move the function pointer to the end of the arguments for indirect calls
if (IsIndirect) {
auto FnPtr = CallParams.getOperand(0);
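The hunk above narrows the function pointer before an indirect call: call_indirect indexes the function table with an i32, while wasm64 carries function pointers as i64. A minimal standalone sketch of that narrowing follows; the helper name and sample value are illustrative, not part of the patch.

    #include <cassert>
    #include <cstdint>

    // Hypothetical helper mirroring what the inserted I32_WRAP_I64 does:
    // keep only the low 32 bits of a wasm64 function "pointer" so it can
    // serve as the i32 table index that call_indirect expects.
    static uint32_t wrapFunctionPointer(uint64_t FnPtr64) {
      return static_cast<uint32_t>(FnPtr64); // i32.wrap_i64 semantics
    }

    int main() {
      assert(wrapFunctionPointer(0x0000000000000007ULL) == 7u);
      return 0;
    }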
@@ -456,6 +479,15 @@ static MachineBasicBlock *LowerCallResults(MachineInstr &CallResults,
if (IsIndirect) {
MIB.addImm(0);
MIB.addImm(0);
+
+ // Ensure that the object file has a __indirect_function_table import, as we
+ // call_indirect against it.
+ MCSymbolWasm *Sym = WebAssembly::getOrCreateFunctionTableSymbol(
+ MF.getContext(), "__indirect_function_table");
+ // Until call_indirect emits TABLE_NUMBER relocs against this symbol, mark
+ // it as NO_STRIP to ensure that the indirect function table makes it into
+ // the linked output.
+ Sym->setNoStrip();
}
for (auto Use : CallParams.uses())
@@ -542,6 +574,16 @@ WebAssemblyTargetLowering::getRegForInlineAsmConstraint(
if (VT.getSizeInBits() <= 64)
return std::make_pair(0U, &WebAssembly::I64RegClass);
}
+ if (VT.isFloatingPoint() && !VT.isVector()) {
+ switch (VT.getSizeInBits()) {
+ case 32:
+ return std::make_pair(0U, &WebAssembly::F32RegClass);
+ case 64:
+ return std::make_pair(0U, &WebAssembly::F64RegClass);
+ default:
+ break;
+ }
+ }
break;
default:
break;
@@ -626,7 +668,7 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineFunction &MF,
unsigned Intrinsic) const {
switch (Intrinsic) {
- case Intrinsic::wasm_atomic_notify:
+ case Intrinsic::wasm_memory_atomic_notify:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::i32;
Info.ptrVal = I.getArgOperand(0);
@@ -640,7 +682,7 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
// consistent. The same applies for wasm_atomic_wait intrinsics too.
Info.flags = MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad;
return true;
- case Intrinsic::wasm_atomic_wait_i32:
+ case Intrinsic::wasm_memory_atomic_wait32:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::i32;
Info.ptrVal = I.getArgOperand(0);
@@ -648,7 +690,7 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.align = Align(4);
Info.flags = MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad;
return true;
- case Intrinsic::wasm_atomic_wait_i64:
+ case Intrinsic::wasm_memory_atomic_wait64:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::i64;
Info.ptrVal = I.getArgOperand(0);
@@ -656,6 +698,75 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.align = Align(8);
Info.flags = MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad;
return true;
+ case Intrinsic::wasm_load32_zero:
+ case Intrinsic::wasm_load64_zero:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = Intrinsic == Intrinsic::wasm_load32_zero ? MVT::i32 : MVT::i64;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.align = Info.memVT == MVT::i32 ? Align(4) : Align(8);
+ Info.flags = MachineMemOperand::MOLoad;
+ return true;
+ case Intrinsic::wasm_load8_lane:
+ case Intrinsic::wasm_load16_lane:
+ case Intrinsic::wasm_load32_lane:
+ case Intrinsic::wasm_load64_lane:
+ case Intrinsic::wasm_store8_lane:
+ case Intrinsic::wasm_store16_lane:
+ case Intrinsic::wasm_store32_lane:
+ case Intrinsic::wasm_store64_lane: {
+ MVT MemVT;
+ Align MemAlign;
+ switch (Intrinsic) {
+ case Intrinsic::wasm_load8_lane:
+ case Intrinsic::wasm_store8_lane:
+ MemVT = MVT::i8;
+ MemAlign = Align(1);
+ break;
+ case Intrinsic::wasm_load16_lane:
+ case Intrinsic::wasm_store16_lane:
+ MemVT = MVT::i16;
+ MemAlign = Align(2);
+ break;
+ case Intrinsic::wasm_load32_lane:
+ case Intrinsic::wasm_store32_lane:
+ MemVT = MVT::i32;
+ MemAlign = Align(4);
+ break;
+ case Intrinsic::wasm_load64_lane:
+ case Intrinsic::wasm_store64_lane:
+ MemVT = MVT::i64;
+ MemAlign = Align(8);
+ break;
+ default:
+ llvm_unreachable("unexpected intrinsic");
+ }
+ if (Intrinsic == Intrinsic::wasm_load8_lane ||
+ Intrinsic == Intrinsic::wasm_load16_lane ||
+ Intrinsic == Intrinsic::wasm_load32_lane ||
+ Intrinsic == Intrinsic::wasm_load64_lane) {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.flags = MachineMemOperand::MOLoad;
+ } else {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.flags = MachineMemOperand::MOStore;
+ }
+ Info.ptrVal = I.getArgOperand(0);
+ Info.memVT = MemVT;
+ Info.offset = 0;
+ Info.align = MemAlign;
+ return true;
+ }
+ case Intrinsic::wasm_prefetch_t:
+ case Intrinsic::wasm_prefetch_nt: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::i8;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.align = Align(1);
+ Info.flags = MachineMemOperand::MOLoad;
+ return true;
+ }
default:
return false;
}
@@ -866,8 +977,7 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
/*isSS=*/false);
unsigned ValNo = 0;
SmallVector<SDValue, 8> Chains;
- for (SDValue Arg :
- make_range(OutVals.begin() + NumFixedArgs, OutVals.end())) {
+ for (SDValue Arg : drop_begin(OutVals, NumFixedArgs)) {
assert(ArgLocs[ValNo].getValNo() == ValNo &&
"ArgLocs should remain in order and only hold varargs args");
unsigned Offset = ArgLocs[ValNo++].getLocMemOffset();
@@ -876,7 +986,7 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
DAG.getConstant(Offset, DL, PtrVT));
Chains.push_back(
DAG.getStore(Chain, DL, Arg, Add,
- MachinePointerInfo::getFixedStack(MF, FI, Offset), 0));
+ MachinePointerInfo::getFixedStack(MF, FI, Offset)));
}
if (!Chains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
@@ -1091,6 +1201,8 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
return LowerFrameIndex(Op, DAG);
case ISD::GlobalAddress:
return LowerGlobalAddress(Op, DAG);
+ case ISD::GlobalTLSAddress:
+ return LowerGlobalTLSAddress(Op, DAG);
case ISD::ExternalSymbol:
return LowerExternalSymbol(Op, DAG);
case ISD::JumpTable:
@@ -1199,6 +1311,49 @@ SDValue WebAssemblyTargetLowering::LowerFRAMEADDR(SDValue Op,
return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op), FP, VT);
}
+SDValue
+WebAssemblyTargetLowering::LowerGlobalTLSAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ const auto *GA = cast<GlobalAddressSDNode>(Op);
+ MVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ if (!MF.getSubtarget<WebAssemblySubtarget>().hasBulkMemory())
+ report_fatal_error("cannot use thread-local storage without bulk memory",
+ false);
+
+ const GlobalValue *GV = GA->getGlobal();
+
+ // Currently Emscripten does not support dynamic linking with threads.
+ // Therefore, if we have thread-local storage, only the local-exec model
+ // is possible.
+ // TODO: remove this and implement proper TLS models once Emscripten
+ // supports dynamic linking with threads.
+ if (GV->getThreadLocalMode() != GlobalValue::LocalExecTLSModel &&
+ !Subtarget->getTargetTriple().isOSEmscripten()) {
+ report_fatal_error("only -ftls-model=local-exec is supported for now on "
+ "non-Emscripten OSes: variable " +
+ GV->getName(),
+ false);
+ }
+
+ auto GlobalGet = PtrVT == MVT::i64 ? WebAssembly::GLOBAL_GET_I64
+ : WebAssembly::GLOBAL_GET_I32;
+ const char *BaseName = MF.createExternalSymbolName("__tls_base");
+
+ SDValue BaseAddr(
+ DAG.getMachineNode(GlobalGet, DL, PtrVT,
+ DAG.getTargetExternalSymbol(BaseName, PtrVT)),
+ 0);
+
+ SDValue TLSOffset = DAG.getTargetGlobalAddress(
+ GV, DL, PtrVT, GA->getOffset(), WebAssemblyII::MO_TLS_BASE_REL);
+ SDValue SymAddr = DAG.getNode(WebAssemblyISD::Wrapper, DL, PtrVT, TLSOffset);
+
+ return DAG.getNode(ISD::ADD, DL, PtrVT, BaseAddr, SymAddr);
+}
+
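Under the local-exec model implemented above, the address of a thread-local variable is the value of the __tls_base global plus the variable's constant offset within the TLS block (the MO_TLS_BASE_REL operand). A minimal sketch of that arithmetic; the function name and sample values are made up for illustration.

    #include <cassert>
    #include <cstdint>

    // Local-exec TLS address: global.get __tls_base, then add the
    // variable's offset within the TLS block.
    static uint64_t localExecTLSAddress(uint64_t TlsBase, uint64_t VarOffset) {
      return TlsBase + VarOffset;
    }

    int main() {
      // e.g. __tls_base == 0x10000 and the variable sits 16 bytes into the block
      assert(localExecTLSAddress(0x10000, 16) == 0x10010);
      return 0;
    }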
SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
@@ -1303,7 +1458,22 @@ SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op,
SDValue ArgN = DAG.getCopyFromReg(DAG.getEntryNode(), DL,
MFI->getVarargBufferVreg(), PtrVT);
return DAG.getStore(Op.getOperand(0), DL, ArgN, Op.getOperand(1),
- MachinePointerInfo(SV), 0);
+ MachinePointerInfo(SV));
+}
+
+static SDValue getCppExceptionSymNode(SDValue Op, unsigned TagIndex,
+ SelectionDAG &DAG) {
+ // We only support C++ exceptions for now
+ int Tag =
+ cast<ConstantSDNode>(Op.getOperand(TagIndex).getNode())->getZExtValue();
+ if (Tag != WebAssembly::CPP_EXCEPTION)
+ llvm_unreachable("Invalid tag: We only support C++ exceptions for now");
+ auto &MF = DAG.getMachineFunction();
+ const auto &TLI = DAG.getTargetLoweringInfo();
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ const char *SymName = MF.createExternalSymbolName("__cpp_exception");
+ return DAG.getNode(WebAssemblyISD::Wrapper, SDLoc(Op), PtrVT,
+ DAG.getTargetExternalSymbol(SymName, PtrVT));
}
SDValue WebAssemblyTargetLowering::LowerIntrinsic(SDValue Op,
@@ -1339,15 +1509,7 @@ SDValue WebAssemblyTargetLowering::LowerIntrinsic(SDValue Op,
}
case Intrinsic::wasm_throw: {
- // We only support C++ exceptions for now
- int Tag = cast<ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
- if (Tag != CPP_EXCEPTION)
- llvm_unreachable("Invalid tag!");
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
- const char *SymName = MF.createExternalSymbolName("__cpp_exception");
- SDValue SymNode = DAG.getNode(WebAssemblyISD::Wrapper, DL, PtrVT,
- DAG.getTargetExternalSymbol(SymName, PtrVT));
+ SDValue SymNode = getCppExceptionSymNode(Op, 2, DAG);
return DAG.getNode(WebAssemblyISD::THROW, DL,
MVT::Other, // outchain type
{
@@ -1357,6 +1519,19 @@ SDValue WebAssemblyTargetLowering::LowerIntrinsic(SDValue Op,
});
}
+ case Intrinsic::wasm_catch: {
+ SDValue SymNode = getCppExceptionSymNode(Op, 2, DAG);
+ return DAG.getNode(WebAssemblyISD::CATCH, DL,
+ {
+ MVT::i32, // return value
+ MVT::Other // outchain type
+ },
+ {
+ Op.getOperand(0), // inchain
+ SymNode // exception symbol
+ });
+ }
+
case Intrinsic::wasm_shuffle: {
// Drop in-chain and replace undefs, but otherwise pass through unchanged
SDValue Ops[18];
@@ -1474,8 +1649,8 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
SmallVector<SwizzleEntry, 16> SwizzleCounts;
auto AddCount = [](auto &Counts, const auto &Val) {
- auto CountIt = std::find_if(Counts.begin(), Counts.end(),
- [&Val](auto E) { return E.first == Val; });
+ auto CountIt =
+ llvm::find_if(Counts, [&Val](auto E) { return E.first == Val; });
if (CountIt == Counts.end()) {
Counts.emplace_back(Val, 1);
} else {
@@ -1537,6 +1712,7 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
};
} else if (NumConstantLanes >= NumSplatLanes &&
Subtarget->hasUnimplementedSIMD128()) {
+ // If we support v128.const, emit it directly
SmallVector<SDValue, 16> ConstLanes;
for (const SDValue &Lane : Op->op_values()) {
if (IsConstant(Lane)) {
@@ -1548,11 +1724,59 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
}
}
Result = DAG.getBuildVector(VecT, DL, ConstLanes);
- IsLaneConstructed = [&](size_t _, const SDValue &Lane) {
+ IsLaneConstructed = [&IsConstant](size_t _, const SDValue &Lane) {
return IsConstant(Lane);
};
- }
- if (!Result) {
+ } else if (NumConstantLanes >= NumSplatLanes && VecT.isInteger()) {
+ // Otherwise, if this is an integer vector, pack the lane values together so
+ // we can construct the 128-bit constant from a pair of i64s using a splat
+ // followed by at most one i64x2.replace_lane. Also keep track of the lanes
+ // that actually matter so we can avoid the replace_lane in more cases.
+ std::array<uint64_t, 2> I64s{{0, 0}};
+ std::array<uint64_t, 2> ConstLaneMasks{{0, 0}};
+ size_t LaneBits = 128 / Lanes;
+ size_t HalfLanes = Lanes / 2;
+ for (size_t I = 0; I < Lanes; ++I) {
+ const SDValue &Lane = Op.getOperand(I);
+ if (IsConstant(Lane)) {
+ // How much we need to shift Val to position it in an i64
+ auto Shift = LaneBits * (I % HalfLanes);
+ auto Mask = maskTrailingOnes<uint64_t>(LaneBits);
+ auto Val = cast<ConstantSDNode>(Lane.getNode())->getZExtValue() & Mask;
+ I64s[I / HalfLanes] |= Val << Shift;
+ ConstLaneMasks[I / HalfLanes] |= Mask << Shift;
+ }
+ }
+ // Check whether the constant lanes in the second half of the vector match
+ // the corresponding lanes in the first half, or vice versa, to determine
+ // whether splatting either half is sufficient to materialize the constant.
+ // As a special case, if the first and second halves have no constant lanes
+ // in common, we can just combine them.
+ bool FirstHalfSufficient = (I64s[0] & ConstLaneMasks[1]) == I64s[1];
+ bool SecondHalfSufficient = (I64s[1] & ConstLaneMasks[0]) == I64s[0];
+ bool CombinedSufficient = (ConstLaneMasks[0] & ConstLaneMasks[1]) == 0;
+
+ uint64_t Splatted;
+ if (SecondHalfSufficient) {
+ Splatted = I64s[1];
+ } else if (CombinedSufficient) {
+ Splatted = I64s[0] | I64s[1];
+ } else {
+ Splatted = I64s[0];
+ }
+
+ Result = DAG.getSplatBuildVector(MVT::v2i64, DL,
+ DAG.getConstant(Splatted, DL, MVT::i64));
+ if (!FirstHalfSufficient && !SecondHalfSufficient && !CombinedSufficient) {
+ Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2i64, Result,
+ DAG.getConstant(I64s[1], DL, MVT::i64),
+ DAG.getConstant(1, DL, MVT::i32));
+ }
+ Result = DAG.getBitcast(VecT, Result);
+ IsLaneConstructed = [&IsConstant](size_t _, const SDValue &Lane) {
+ return IsConstant(Lane);
+ };
+ } else {
// Use a splat, but possibly a load_splat
LoadSDNode *SplattedLoad;
if ((SplattedLoad = dyn_cast<LoadSDNode>(SplatValue)) &&
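The integer-constant branch above packs the lane values into two i64s and only falls back to an extra i64x2.replace_lane when neither half, nor their combination, can be splatted. The sketch below re-derives that decision for a fully constant v8i16 vector; the struct and function names are illustrative, and the all-constant assumption simplifies the mask handling.

    #include <array>
    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // Pack eight constant i16 lanes into two i64s and decide whether a single
    // i64x2.splat is enough or a replace_lane of lane 1 is also needed.
    struct Packed {
      std::array<uint64_t, 2> I64s{{0, 0}};
      uint64_t Splatted = 0;
      bool NeedsReplaceLane = false;
    };

    static Packed packV8I16(const std::array<uint16_t, 8> &Lanes) {
      Packed P;
      std::array<uint64_t, 2> Masks{{0, 0}};
      const size_t LaneBits = 16, HalfLanes = 4;
      for (size_t I = 0; I < 8; ++I) {
        uint64_t Shift = LaneBits * (I % HalfLanes);
        uint64_t Mask = 0xffffULL;
        P.I64s[I / HalfLanes] |= (uint64_t(Lanes[I]) & Mask) << Shift;
        Masks[I / HalfLanes] |= Mask << Shift;
      }
      bool FirstHalfSufficient = (P.I64s[0] & Masks[1]) == P.I64s[1];
      bool SecondHalfSufficient = (P.I64s[1] & Masks[0]) == P.I64s[0];
      bool CombinedSufficient = (Masks[0] & Masks[1]) == 0;
      if (SecondHalfSufficient)
        P.Splatted = P.I64s[1];
      else if (CombinedSufficient)
        P.Splatted = P.I64s[0] | P.I64s[1];
      else
        P.Splatted = P.I64s[0];
      P.NeedsReplaceLane =
          !FirstHalfSufficient && !SecondHalfSufficient && !CombinedSufficient;
      return P;
    }

    int main() {
      // Both halves identical: a single splat materializes the constant.
      Packed Same = packV8I16({1, 2, 3, 4, 1, 2, 3, 4});
      assert(!Same.NeedsReplaceLane);
      // Halves differ: splat the first i64, then replace lane 1 with the second.
      Packed Diff = packV8I16({1, 2, 3, 4, 5, 6, 7, 8});
      assert(Diff.NeedsReplaceLane);
      return 0;
    }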
@@ -1565,11 +1789,14 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
} else {
Result = DAG.getSplatBuildVector(VecT, DL, SplatValue);
}
- IsLaneConstructed = [&](size_t _, const SDValue &Lane) {
+ IsLaneConstructed = [&SplatValue](size_t _, const SDValue &Lane) {
return Lane == SplatValue;
};
}
+ assert(Result);
+ assert(IsLaneConstructed);
+
// Add replace_lane instructions for any unhandled values
for (size_t I = 0; I < Lanes; ++I) {
const SDValue &Lane = Op->getOperand(I);
@@ -1730,6 +1957,49 @@ performVECTOR_SHUFFLECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return DAG.getBitcast(DstType, NewShuffle);
}
+static SDValue performVectorWidenCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ auto &DAG = DCI.DAG;
+ assert(N->getOpcode() == ISD::SIGN_EXTEND ||
+ N->getOpcode() == ISD::ZERO_EXTEND);
+
+ // Combine ({s,z}ext (extract_subvector src, i)) into a widening operation if
+ // possible before the extract_subvector can be expanded.
+ auto Extract = N->getOperand(0);
+ if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+ return SDValue();
+ auto Source = Extract.getOperand(0);
+ auto *IndexNode = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
+ if (IndexNode == nullptr)
+ return SDValue();
+ auto Index = IndexNode->getZExtValue();
+
+ // Only v8i8 and v4i16 extracts can be widened, and only if the extracted
+ // subvector is the low or high half of its source.
+ EVT ResVT = N->getValueType(0);
+ if (ResVT == MVT::v8i16) {
+ if (Extract.getValueType() != MVT::v8i8 ||
+ Source.getValueType() != MVT::v16i8 || (Index != 0 && Index != 8))
+ return SDValue();
+ } else if (ResVT == MVT::v4i32) {
+ if (Extract.getValueType() != MVT::v4i16 ||
+ Source.getValueType() != MVT::v8i16 || (Index != 0 && Index != 4))
+ return SDValue();
+ } else {
+ return SDValue();
+ }
+
+ bool IsSext = N->getOpcode() == ISD::SIGN_EXTEND;
+ bool IsLow = Index == 0;
+
+ unsigned Op = IsSext ? (IsLow ? WebAssemblyISD::WIDEN_LOW_S
+ : WebAssemblyISD::WIDEN_HIGH_S)
+ : (IsLow ? WebAssemblyISD::WIDEN_LOW_U
+ : WebAssemblyISD::WIDEN_HIGH_U);
+
+ return DAG.getNode(Op, SDLoc(N), ResVT, Source);
+}
+
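Once the shape checks pass, the combine above reduces to choosing one of four widen nodes from the extend's signedness and from whether the extracted subvector is the low half. A tiny sketch of that final selection, with strings standing in for the WebAssemblyISD enumerators (illustrative only):

    #include <cassert>
    #include <string>

    // Map (signedness, low/high half) to the widen node the combine emits.
    static std::string pickWidenOp(bool IsSext, bool IsLow) {
      if (IsSext)
        return IsLow ? "WIDEN_LOW_S" : "WIDEN_HIGH_S";
      return IsLow ? "WIDEN_LOW_U" : "WIDEN_HIGH_U";
    }

    int main() {
      // (zext (extract_subvector v16i8:src, 8)) widens the high half unsigned.
      assert(pickWidenOp(/*IsSext=*/false, /*IsLow=*/false) == "WIDEN_HIGH_U");
      return 0;
    }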
SDValue
WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -1738,5 +2008,8 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
return SDValue();
case ISD::VECTOR_SHUFFLE:
return performVECTOR_SHUFFLECombine(N, DCI);
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ return performVectorWidenCombine(N, DCI);
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index b8e612377529..c8a052d01199 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -106,6 +106,7 @@ private:
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
index 256b77e33db9..22103b0bfb38 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
@@ -33,112 +33,117 @@ multiclass ATOMIC_NRI<dag oops, dag iops, list<dag> pattern, string asmstr = "",
//===----------------------------------------------------------------------===//
let hasSideEffects = 1 in {
-defm ATOMIC_NOTIFY_A32 :
+defm MEMORY_ATOMIC_NOTIFY_A32 :
ATOMIC_I<(outs I32:$dst),
(ins P2Align:$p2align, offset32_op:$off, I32:$addr, I32:$count),
(outs), (ins P2Align:$p2align, offset32_op:$off), [],
- "atomic.notify \t$dst, ${off}(${addr})${p2align}, $count",
- "atomic.notify \t${off}${p2align}", 0x00, "false">;
-defm ATOMIC_NOTIFY_A64 :
+ "memory.atomic.notify \t$dst, ${off}(${addr})${p2align}, $count",
+ "memory.atomic.notify \t${off}${p2align}", 0x00, "false">;
+defm MEMORY_ATOMIC_NOTIFY_A64 :
ATOMIC_I<(outs I32:$dst),
(ins P2Align:$p2align, offset64_op:$off, I64:$addr, I32:$count),
(outs), (ins P2Align:$p2align, offset64_op:$off), [],
- "atomic.notify \t$dst, ${off}(${addr})${p2align}, $count",
- "atomic.notify \t${off}${p2align}", 0x00, "true">;
+ "memory.atomic.notify \t$dst, ${off}(${addr})${p2align}, $count",
+ "memory.atomic.notify \t${off}${p2align}", 0x00, "true">;
let mayLoad = 1 in {
-defm ATOMIC_WAIT_I32_A32 :
+defm MEMORY_ATOMIC_WAIT32_A32 :
ATOMIC_I<(outs I32:$dst),
(ins P2Align:$p2align, offset32_op:$off, I32:$addr, I32:$exp,
I64:$timeout),
(outs), (ins P2Align:$p2align, offset32_op:$off), [],
- "i32.atomic.wait \t$dst, ${off}(${addr})${p2align}, $exp, $timeout",
- "i32.atomic.wait \t${off}${p2align}", 0x01, "false">;
-defm ATOMIC_WAIT_I32_A64 :
+ "memory.atomic.wait32 \t$dst, ${off}(${addr})${p2align}, $exp, $timeout",
+ "memory.atomic.wait32 \t${off}${p2align}", 0x01, "false">;
+defm MEMORY_ATOMIC_WAIT32_A64 :
ATOMIC_I<(outs I32:$dst),
(ins P2Align:$p2align, offset64_op:$off, I64:$addr, I32:$exp,
I64:$timeout),
(outs), (ins P2Align:$p2align, offset64_op:$off), [],
- "i32.atomic.wait \t$dst, ${off}(${addr})${p2align}, $exp, $timeout",
- "i32.atomic.wait \t${off}${p2align}", 0x01, "true">;
-defm ATOMIC_WAIT_I64_A32 :
+ "memory.atomic.wait32 \t$dst, ${off}(${addr})${p2align}, $exp, $timeout",
+ "memory.atomic.wait32 \t${off}${p2align}", 0x01, "true">;
+defm MEMORY_ATOMIC_WAIT64_A32 :
ATOMIC_I<(outs I32:$dst),
(ins P2Align:$p2align, offset32_op:$off, I32:$addr, I64:$exp,
I64:$timeout),
(outs), (ins P2Align:$p2align, offset32_op:$off), [],
- "i64.atomic.wait \t$dst, ${off}(${addr})${p2align}, $exp, $timeout",
- "i64.atomic.wait \t${off}${p2align}", 0x02, "false">;
-defm ATOMIC_WAIT_I64_A64 :
+ "memory.atomic.wait64 \t$dst, ${off}(${addr})${p2align}, $exp, $timeout",
+ "memory.atomic.wait64 \t${off}${p2align}", 0x02, "false">;
+defm MEMORY_ATOMIC_WAIT64_A64 :
ATOMIC_I<(outs I32:$dst),
(ins P2Align:$p2align, offset64_op:$off, I64:$addr, I64:$exp,
I64:$timeout),
(outs), (ins P2Align:$p2align, offset64_op:$off), [],
- "i64.atomic.wait \t$dst, ${off}(${addr})${p2align}, $exp, $timeout",
- "i64.atomic.wait \t${off}${p2align}", 0x02, "true">;
+ "memory.atomic.wait64 \t$dst, ${off}(${addr})${p2align}, $exp, $timeout",
+ "memory.atomic.wait64 \t${off}${p2align}", 0x02, "true">;
} // mayLoad = 1
} // hasSideEffects = 1
-let Predicates = [HasAtomics] in {
// Select notifys with no constant offset.
def NotifyPatNoOffset_A32 :
- Pat<(i32 (int_wasm_atomic_notify I32:$addr, I32:$count)),
- (ATOMIC_NOTIFY_A32 0, 0, I32:$addr, I32:$count)>,
- Requires<[HasAddr32]>;
+ Pat<(i32 (int_wasm_memory_atomic_notify I32:$addr, I32:$count)),
+ (MEMORY_ATOMIC_NOTIFY_A32 0, 0, I32:$addr, I32:$count)>,
+ Requires<[HasAddr32, HasAtomics]>;
def NotifyPatNoOffset_A64 :
- Pat<(i32 (int_wasm_atomic_notify I64:$addr, I32:$count)),
- (ATOMIC_NOTIFY_A64 0, 0, I64:$addr, I32:$count)>,
- Requires<[HasAddr64]>;
+ Pat<(i32 (int_wasm_memory_atomic_notify I64:$addr, I32:$count)),
+ (MEMORY_ATOMIC_NOTIFY_A64 0, 0, I64:$addr, I32:$count)>,
+ Requires<[HasAddr64, HasAtomics]>;
// Select notifys with a constant offset.
// Pattern with address + immediate offset
multiclass NotifyPatImmOff<PatFrag operand, string inst> {
- def : Pat<(i32 (int_wasm_atomic_notify (operand I32:$addr, imm:$off),
+ def : Pat<(i32 (int_wasm_memory_atomic_notify (operand I32:$addr, imm:$off),
I32:$count)),
(!cast<NI>(inst#_A32) 0, imm:$off, I32:$addr, I32:$count)>,
- Requires<[HasAddr32]>;
- def : Pat<(i32 (int_wasm_atomic_notify (operand I64:$addr, imm:$off),
+ Requires<[HasAddr32, HasAtomics]>;
+ def : Pat<(i32 (int_wasm_memory_atomic_notify (operand I64:$addr, imm:$off),
I32:$count)),
(!cast<NI>(inst#_A64) 0, imm:$off, I64:$addr, I32:$count)>,
- Requires<[HasAddr64]>;
+ Requires<[HasAddr64, HasAtomics]>;
}
-defm : NotifyPatImmOff<regPlusImm, "ATOMIC_NOTIFY">;
-defm : NotifyPatImmOff<or_is_add, "ATOMIC_NOTIFY">;
+defm : NotifyPatImmOff<regPlusImm, "MEMORY_ATOMIC_NOTIFY">;
+defm : NotifyPatImmOff<or_is_add, "MEMORY_ATOMIC_NOTIFY">;
// Select notifys with just a constant offset.
def NotifyPatOffsetOnly_A32 :
- Pat<(i32 (int_wasm_atomic_notify imm:$off, I32:$count)),
- (ATOMIC_NOTIFY_A32 0, imm:$off, (CONST_I32 0), I32:$count)>,
- Requires<[HasAddr32]>;
+ Pat<(i32 (int_wasm_memory_atomic_notify imm:$off, I32:$count)),
+ (MEMORY_ATOMIC_NOTIFY_A32 0, imm:$off, (CONST_I32 0), I32:$count)>,
+ Requires<[HasAddr32, HasAtomics]>;
def NotifyPatOffsetOnly_A64 :
- Pat<(i32 (int_wasm_atomic_notify imm:$off, I32:$count)),
- (ATOMIC_NOTIFY_A64 0, imm:$off, (CONST_I64 0), I32:$count)>,
- Requires<[HasAddr64]>;
+ Pat<(i32 (int_wasm_memory_atomic_notify imm:$off, I32:$count)),
+ (MEMORY_ATOMIC_NOTIFY_A64 0, imm:$off, (CONST_I64 0), I32:$count)>,
+ Requires<[HasAddr64, HasAtomics]>;
def NotifyPatGlobalAddrOffOnly_A32 :
- Pat<(i32 (int_wasm_atomic_notify (WebAssemblywrapper tglobaladdr:$off),
- I32:$count)),
- (ATOMIC_NOTIFY_A32 0, tglobaladdr:$off, (CONST_I32 0), I32:$count)>,
- Requires<[HasAddr32]>;
+ Pat<(i32 (int_wasm_memory_atomic_notify (WebAssemblywrapper tglobaladdr:$off),
+ I32:$count)),
+ (MEMORY_ATOMIC_NOTIFY_A32 0, tglobaladdr:$off, (CONST_I32 0), I32:$count)
+ >,
+ Requires<[HasAddr32, HasAtomics, IsNotPIC]>;
def NotifyPatGlobalAddrOffOnly_A64 :
- Pat<(i32 (int_wasm_atomic_notify (WebAssemblywrapper tglobaladdr:$off),
- I32:$count)),
- (ATOMIC_NOTIFY_A64 0, tglobaladdr:$off, (CONST_I64 0), I32:$count)>,
- Requires<[HasAddr64]>;
+ Pat<(i32 (int_wasm_memory_atomic_notify (WebAssemblywrapper tglobaladdr:$off),
+ I32:$count)),
+ (MEMORY_ATOMIC_NOTIFY_A64 0, tglobaladdr:$off, (CONST_I64 0), I32:$count)
+ >,
+ Requires<[HasAddr64, HasAtomics, IsNotPIC]>;
// Select waits with no constant offset.
multiclass WaitPatNoOffset<ValueType ty, Intrinsic kind,
string inst> {
def : Pat<(i32 (kind I32:$addr, ty:$exp, I64:$timeout)),
(!cast<NI>(inst#_A32) 0, 0, I32:$addr, ty:$exp, I64:$timeout)>,
- Requires<[HasAddr32]>;
+ Requires<[HasAddr32, HasAtomics]>;
def : Pat<(i32 (kind I64:$addr, ty:$exp, I64:$timeout)),
(!cast<NI>(inst#_A64) 0, 0, I64:$addr, ty:$exp, I64:$timeout)>,
- Requires<[HasAddr64]>;
+ Requires<[HasAddr64, HasAtomics]>;
}
-defm : WaitPatNoOffset<i32, int_wasm_atomic_wait_i32, "ATOMIC_WAIT_I32">;
-defm : WaitPatNoOffset<i64, int_wasm_atomic_wait_i64, "ATOMIC_WAIT_I64">;
-defm : WaitPatNoOffset<i32, int_wasm_atomic_wait_i32, "ATOMIC_WAIT_I32">;
-defm : WaitPatNoOffset<i64, int_wasm_atomic_wait_i64, "ATOMIC_WAIT_I64">;
+defm : WaitPatNoOffset<i32, int_wasm_memory_atomic_wait32,
+ "MEMORY_ATOMIC_WAIT32">;
+defm : WaitPatNoOffset<i64, int_wasm_memory_atomic_wait64,
+ "MEMORY_ATOMIC_WAIT64">;
+defm : WaitPatNoOffset<i32, int_wasm_memory_atomic_wait32,
+ "MEMORY_ATOMIC_WAIT32">;
+defm : WaitPatNoOffset<i64, int_wasm_memory_atomic_wait64,
+ "MEMORY_ATOMIC_WAIT64">;
// Select waits with a constant offset.
@@ -148,52 +153,53 @@ multiclass WaitPatImmOff<ValueType ty, Intrinsic kind, PatFrag operand,
def : Pat<(i32 (kind (operand I32:$addr, imm:$off), ty:$exp, I64:$timeout)),
(!cast<NI>(inst#_A32) 0, imm:$off, I32:$addr, ty:$exp,
I64:$timeout)>,
- Requires<[HasAddr32]>;
+ Requires<[HasAddr32, HasAtomics]>;
def : Pat<(i32 (kind (operand I64:$addr, imm:$off), ty:$exp, I64:$timeout)),
(!cast<NI>(inst#_A64) 0, imm:$off, I64:$addr, ty:$exp,
I64:$timeout)>,
- Requires<[HasAddr64]>;
+ Requires<[HasAddr64, HasAtomics]>;
}
-defm : WaitPatImmOff<i32, int_wasm_atomic_wait_i32, regPlusImm,
- "ATOMIC_WAIT_I32">;
-defm : WaitPatImmOff<i32, int_wasm_atomic_wait_i32, or_is_add,
- "ATOMIC_WAIT_I32">;
-defm : WaitPatImmOff<i64, int_wasm_atomic_wait_i64, regPlusImm,
- "ATOMIC_WAIT_I64">;
-defm : WaitPatImmOff<i64, int_wasm_atomic_wait_i64, or_is_add,
- "ATOMIC_WAIT_I64">;
-
-// Select wait_i32, "ATOMIC_WAIT_I32s with just a constant offset.
+defm : WaitPatImmOff<i32, int_wasm_memory_atomic_wait32, regPlusImm,
+ "MEMORY_ATOMIC_WAIT32">;
+defm : WaitPatImmOff<i32, int_wasm_memory_atomic_wait32, or_is_add,
+ "MEMORY_ATOMIC_WAIT32">;
+defm : WaitPatImmOff<i64, int_wasm_memory_atomic_wait64, regPlusImm,
+ "MEMORY_ATOMIC_WAIT64">;
+defm : WaitPatImmOff<i64, int_wasm_memory_atomic_wait64, or_is_add,
+ "MEMORY_ATOMIC_WAIT64">;
+
+// Select waits with just a constant offset.
multiclass WaitPatOffsetOnly<ValueType ty, Intrinsic kind, string inst> {
def : Pat<(i32 (kind imm:$off, ty:$exp, I64:$timeout)),
(!cast<NI>(inst#_A32) 0, imm:$off, (CONST_I32 0), ty:$exp,
I64:$timeout)>,
- Requires<[HasAddr32]>;
+ Requires<[HasAddr32, HasAtomics]>;
def : Pat<(i32 (kind imm:$off, ty:$exp, I64:$timeout)),
(!cast<NI>(inst#_A64) 0, imm:$off, (CONST_I64 0), ty:$exp,
I64:$timeout)>,
- Requires<[HasAddr64]>;
+ Requires<[HasAddr64, HasAtomics]>;
}
-defm : WaitPatOffsetOnly<i32, int_wasm_atomic_wait_i32, "ATOMIC_WAIT_I32">;
-defm : WaitPatOffsetOnly<i64, int_wasm_atomic_wait_i64, "ATOMIC_WAIT_I64">;
+defm : WaitPatOffsetOnly<i32, int_wasm_memory_atomic_wait32,
+ "MEMORY_ATOMIC_WAIT32">;
+defm : WaitPatOffsetOnly<i64, int_wasm_memory_atomic_wait64,
+ "MEMORY_ATOMIC_WAIT64">;
multiclass WaitPatGlobalAddrOffOnly<ValueType ty, Intrinsic kind, string inst> {
def : Pat<(i32 (kind (WebAssemblywrapper tglobaladdr:$off), ty:$exp,
I64:$timeout)),
(!cast<NI>(inst#_A32) 0, tglobaladdr:$off, (CONST_I32 0), ty:$exp,
I64:$timeout)>,
- Requires<[HasAddr32]>;
+ Requires<[HasAddr32, HasAtomics, IsNotPIC]>;
def : Pat<(i32 (kind (WebAssemblywrapper tglobaladdr:$off), ty:$exp,
I64:$timeout)),
(!cast<NI>(inst#_A64) 0, tglobaladdr:$off, (CONST_I64 0), ty:$exp,
I64:$timeout)>,
- Requires<[HasAddr64]>;
+ Requires<[HasAddr64, HasAtomics, IsNotPIC]>;
}
-defm : WaitPatGlobalAddrOffOnly<i32, int_wasm_atomic_wait_i32,
- "ATOMIC_WAIT_I32">;
-defm : WaitPatGlobalAddrOffOnly<i64, int_wasm_atomic_wait_i64,
- "ATOMIC_WAIT_I64">;
-} // Predicates = [HasAtomics]
+defm : WaitPatGlobalAddrOffOnly<i32, int_wasm_memory_atomic_wait32,
+ "MEMORY_ATOMIC_WAIT32">;
+defm : WaitPatGlobalAddrOffOnly<i64, int_wasm_memory_atomic_wait64,
+ "MEMORY_ATOMIC_WAIT64">;
//===----------------------------------------------------------------------===//
// Atomic fences
@@ -221,7 +227,6 @@ defm ATOMIC_LOAD_I32 : AtomicLoad<I32, "i32.atomic.load", 0x10>;
defm ATOMIC_LOAD_I64 : AtomicLoad<I64, "i64.atomic.load", 0x11>;
// Select loads with no constant offset.
-let Predicates = [HasAtomics] in {
defm : LoadPatNoOffset<i32, atomic_load_32, "ATOMIC_LOAD_I32">;
defm : LoadPatNoOffset<i64, atomic_load_64, "ATOMIC_LOAD_I64">;
@@ -240,7 +245,6 @@ defm : LoadPatOffsetOnly<i64, atomic_load_64, "ATOMIC_LOAD_I64">;
defm : LoadPatGlobalAddrOffOnly<i32, atomic_load_32, "ATOMIC_LOAD_I32">;
defm : LoadPatGlobalAddrOffOnly<i64, atomic_load_64, "ATOMIC_LOAD_I64">;
-} // Predicates = [HasAtomics]
// Extending loads. Note that there are only zero-extending atomic loads, no
// sign-extending loads.
@@ -285,7 +289,6 @@ def sext_aload_8_64 :
def sext_aload_16_64 :
PatFrag<(ops node:$addr), (anyext (i32 (atomic_load_16 node:$addr)))>;
-let Predicates = [HasAtomics] in {
// Select zero-extending loads with no constant offset.
defm : LoadPatNoOffset<i32, zext_aload_8_32, "ATOMIC_LOAD8_U_I32">;
defm : LoadPatNoOffset<i32, zext_aload_16_32, "ATOMIC_LOAD16_U_I32">;
@@ -344,7 +347,6 @@ defm : LoadPatGlobalAddrOffOnly<i32, atomic_load_16, "ATOMIC_LOAD16_U_I32">;
defm : LoadPatGlobalAddrOffOnly<i64, sext_aload_8_64, "ATOMIC_LOAD8_U_I64">;
defm : LoadPatGlobalAddrOffOnly<i64, sext_aload_16_64, "ATOMIC_LOAD16_U_I64">;
-} // Predicates = [HasAtomics]
//===----------------------------------------------------------------------===//
// Atomic stores
@@ -363,16 +365,15 @@ defm ATOMIC_STORE_I64 : AtomicStore<I64, "i64.atomic.store", 0x18>;
// store: (store $val, $ptr)
// atomic_store: (store $ptr, $val)
-let Predicates = [HasAtomics] in {
// Select stores with no constant offset.
multiclass AStorePatNoOffset<ValueType ty, PatFrag kind, string inst> {
def : Pat<(kind I32:$addr, ty:$val),
(!cast<NI>(inst#_A32) 0, 0, I32:$addr, ty:$val)>,
- Requires<[HasAddr32]>;
+ Requires<[HasAddr32, HasAtomics]>;
def : Pat<(kind I64:$addr, ty:$val),
(!cast<NI>(inst#_A64) 0, 0, I64:$addr, ty:$val)>,
- Requires<[HasAddr64]>;
+ Requires<[HasAddr64, HasAtomics]>;
}
defm : AStorePatNoOffset<i32, atomic_store_32, "ATOMIC_STORE_I32">;
defm : AStorePatNoOffset<i64, atomic_store_64, "ATOMIC_STORE_I64">;
@@ -384,10 +385,10 @@ multiclass AStorePatImmOff<ValueType ty, PatFrag kind, PatFrag operand,
string inst> {
def : Pat<(kind (operand I32:$addr, imm:$off), ty:$val),
(!cast<NI>(inst#_A32) 0, imm:$off, I32:$addr, ty:$val)>,
- Requires<[HasAddr32]>;
+ Requires<[HasAddr32, HasAtomics]>;
def : Pat<(kind (operand I64:$addr, imm:$off), ty:$val),
(!cast<NI>(inst#_A64) 0, imm:$off, I64:$addr, ty:$val)>,
- Requires<[HasAddr64]>;
+ Requires<[HasAddr64, HasAtomics]>;
}
defm : AStorePatImmOff<i32, atomic_store_32, regPlusImm, "ATOMIC_STORE_I32">;
defm : AStorePatImmOff<i64, atomic_store_64, regPlusImm, "ATOMIC_STORE_I64">;
@@ -396,10 +397,10 @@ defm : AStorePatImmOff<i64, atomic_store_64, regPlusImm, "ATOMIC_STORE_I64">;
multiclass AStorePatOffsetOnly<ValueType ty, PatFrag kind, string inst> {
def : Pat<(kind imm:$off, ty:$val),
(!cast<NI>(inst#_A32) 0, imm:$off, (CONST_I32 0), ty:$val)>,
- Requires<[HasAddr32]>;
+ Requires<[HasAddr32, HasAtomics]>;
def : Pat<(kind imm:$off, ty:$val),
(!cast<NI>(inst#_A64) 0, imm:$off, (CONST_I64 0), ty:$val)>,
- Requires<[HasAddr64]>;
+ Requires<[HasAddr64, HasAtomics]>;
}
defm : AStorePatOffsetOnly<i32, atomic_store_32, "ATOMIC_STORE_I32">;
defm : AStorePatOffsetOnly<i64, atomic_store_64, "ATOMIC_STORE_I64">;
@@ -407,15 +408,14 @@ defm : AStorePatOffsetOnly<i64, atomic_store_64, "ATOMIC_STORE_I64">;
multiclass AStorePatGlobalAddrOffOnly<ValueType ty, PatFrag kind, string inst> {
def : Pat<(kind (WebAssemblywrapper tglobaladdr:$off), ty:$val),
(!cast<NI>(inst#_A32) 0, tglobaladdr:$off, (CONST_I32 0), ty:$val)>,
- Requires<[HasAddr32]>;
+ Requires<[HasAddr32, HasAtomics, IsNotPIC]>;
def : Pat<(kind (WebAssemblywrapper tglobaladdr:$off), ty:$val),
(!cast<NI>(inst#_A64) 0, tglobaladdr:$off, (CONST_I64 0), ty:$val)>,
- Requires<[HasAddr64]>;
+ Requires<[HasAddr64, HasAtomics, IsNotPIC]>;
}
defm : AStorePatGlobalAddrOffOnly<i32, atomic_store_32, "ATOMIC_STORE_I32">;
defm : AStorePatGlobalAddrOffOnly<i64, atomic_store_64, "ATOMIC_STORE_I64">;
-} // Predicates = [HasAtomics]
// Truncating stores.
defm ATOMIC_STORE8_I32 : AtomicStore<I32, "i32.atomic.store8", 0x19>;
@@ -436,7 +436,6 @@ def trunc_astore_8_64 : trunc_astore_64<atomic_store_8>;
def trunc_astore_16_64 : trunc_astore_64<atomic_store_16>;
def trunc_astore_32_64 : trunc_astore_64<atomic_store_32>;
-let Predicates = [HasAtomics] in {
// Truncating stores with no constant offset
defm : AStorePatNoOffset<i32, atomic_store_8, "ATOMIC_STORE8_I32">;
@@ -474,7 +473,6 @@ defm : AStorePatGlobalAddrOffOnly<i64, trunc_astore_8_64, "ATOMIC_STORE8_I64">;
defm : AStorePatGlobalAddrOffOnly<i64, trunc_astore_16_64, "ATOMIC_STORE16_I64">;
defm : AStorePatGlobalAddrOffOnly<i64, trunc_astore_32_64, "ATOMIC_STORE32_I64">;
-} // Predicates = [HasAtomics]
//===----------------------------------------------------------------------===//
// Atomic binary read-modify-writes
@@ -580,10 +578,10 @@ defm ATOMIC_RMW32_U_XCHG_I64 :
multiclass BinRMWPatNoOffset<ValueType ty, PatFrag kind, string inst> {
def : Pat<(ty (kind I32:$addr, ty:$val)),
(!cast<NI>(inst#_A32) 0, 0, I32:$addr, ty:$val)>,
- Requires<[HasAddr32]>;
+ Requires<[HasAddr32, HasAtomics]>;
def : Pat<(ty (kind I64:$addr, ty:$val)),
(!cast<NI>(inst#_A64) 0, 0, I64:$addr, ty:$val)>,
- Requires<[HasAddr64]>;
+ Requires<[HasAddr64, HasAtomics]>;
}
// Select binary RMWs with a constant offset.
@@ -593,29 +591,29 @@ multiclass BinRMWPatImmOff<ValueType ty, PatFrag kind, PatFrag operand,
string inst> {
def : Pat<(ty (kind (operand I32:$addr, imm:$off), ty:$val)),
(!cast<NI>(inst#_A32) 0, imm:$off, I32:$addr, ty:$val)>,
- Requires<[HasAddr32]>;
+ Requires<[HasAddr32, HasAtomics]>;
def : Pat<(ty (kind (operand I64:$addr, imm:$off), ty:$val)),
(!cast<NI>(inst#_A64) 0, imm:$off, I64:$addr, ty:$val)>,
- Requires<[HasAddr64]>;
+ Requires<[HasAddr64, HasAtomics]>;
}
// Select binary RMWs with just a constant offset.
multiclass BinRMWPatOffsetOnly<ValueType ty, PatFrag kind, string inst> {
def : Pat<(ty (kind imm:$off, ty:$val)),
(!cast<NI>(inst#_A32) 0, imm:$off, (CONST_I32 0), ty:$val)>,
- Requires<[HasAddr32]>;
+ Requires<[HasAddr32, HasAtomics]>;
def : Pat<(ty (kind imm:$off, ty:$val)),
(!cast<NI>(inst#_A64) 0, imm:$off, (CONST_I64 0), ty:$val)>,
- Requires<[HasAddr64]>;
+ Requires<[HasAddr64, HasAtomics]>;
}
multiclass BinRMWPatGlobalAddrOffOnly<ValueType ty, PatFrag kind, NI inst> {
def : Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off), ty:$val)),
(!cast<NI>(inst#_A32) 0, tglobaladdr:$off, (CONST_I32 0), ty:$val)>,
- Requires<[HasAddr32]>;
+ Requires<[HasAddr32, HasAtomics, IsNotPIC]>;
def : Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off), ty:$val)),
(!cast<NI>(inst#_A64) 0, tglobaladdr:$off, (CONST_I64 0), ty:$val)>,
- Requires<[HasAddr64]>;
+ Requires<[HasAddr64, HasAtomics, IsNotPIC]>;
}
// Patterns for various addressing modes.
@@ -636,7 +634,6 @@ multiclass BinRMWPattern<PatFrag rmw_32, PatFrag rmw_64, string inst_32,
defm : BinRMWPatGlobalAddrOffOnly<i64, rmw_64, inst_64>;
}
-let Predicates = [HasAtomics] in {
defm : BinRMWPattern<atomic_load_add_32, atomic_load_add_64,
"ATOMIC_RMW_ADD_I32", "ATOMIC_RMW_ADD_I64">;
defm : BinRMWPattern<atomic_load_sub_32, atomic_load_sub_64,
@@ -649,7 +646,6 @@ defm : BinRMWPattern<atomic_load_xor_32, atomic_load_xor_64,
"ATOMIC_RMW_XOR_I32", "ATOMIC_RMW_XOR_I64">;
defm : BinRMWPattern<atomic_swap_32, atomic_swap_64,
"ATOMIC_RMW_XCHG_I32", "ATOMIC_RMW_XCHG_I64">;
-} // Predicates = [HasAtomics]
// Truncating & zero-extending binary RMW patterns.
// These are combined patterns of truncating store patterns and zero-extending
@@ -752,7 +748,6 @@ multiclass BinRMWTruncExtPattern<
defm : BinRMWPatGlobalAddrOffOnly<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
}
-let Predicates = [HasAtomics] in {
defm : BinRMWTruncExtPattern<
atomic_load_add_8, atomic_load_add_16, atomic_load_add_32, atomic_load_add_64,
"ATOMIC_RMW8_U_ADD_I32", "ATOMIC_RMW16_U_ADD_I32",
@@ -778,7 +773,6 @@ defm : BinRMWTruncExtPattern<
"ATOMIC_RMW8_U_XCHG_I32", "ATOMIC_RMW16_U_XCHG_I32",
"ATOMIC_RMW8_U_XCHG_I64", "ATOMIC_RMW16_U_XCHG_I64",
"ATOMIC_RMW32_U_XCHG_I64">;
-} // Predicates = [HasAtomics]
//===----------------------------------------------------------------------===//
// Atomic ternary read-modify-writes
@@ -827,10 +821,10 @@ defm ATOMIC_RMW32_U_CMPXCHG_I64 :
multiclass TerRMWPatNoOffset<ValueType ty, PatFrag kind, string inst> {
def : Pat<(ty (kind I32:$addr, ty:$exp, ty:$new)),
(!cast<NI>(inst#_A32) 0, 0, I32:$addr, ty:$exp, ty:$new)>,
- Requires<[HasAddr32]>;
+ Requires<[HasAddr32, HasAtomics]>;
def : Pat<(ty (kind I64:$addr, ty:$exp, ty:$new)),
(!cast<NI>(inst#_A64) 0, 0, I64:$addr, ty:$exp, ty:$new)>,
- Requires<[HasAddr64]>;
+ Requires<[HasAddr64, HasAtomics]>;
}
// Select ternary RMWs with a constant offset.
@@ -840,10 +834,10 @@ multiclass TerRMWPatImmOff<ValueType ty, PatFrag kind, PatFrag operand,
string inst> {
def : Pat<(ty (kind (operand I32:$addr, imm:$off), ty:$exp, ty:$new)),
(!cast<NI>(inst#_A32) 0, imm:$off, I32:$addr, ty:$exp, ty:$new)>,
- Requires<[HasAddr32]>;
+ Requires<[HasAddr32, HasAtomics]>;
def : Pat<(ty (kind (operand I64:$addr, imm:$off), ty:$exp, ty:$new)),
(!cast<NI>(inst#_A64) 0, imm:$off, I64:$addr, ty:$exp, ty:$new)>,
- Requires<[HasAddr64]>;
+ Requires<[HasAddr64, HasAtomics]>;
}
// Select ternary RMWs with just a constant offset.
@@ -860,11 +854,11 @@ multiclass TerRMWPatGlobalAddrOffOnly<ValueType ty, PatFrag kind, string inst> {
def : Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off), ty:$exp, ty:$new)),
(!cast<NI>(inst#_A32) 0, tglobaladdr:$off, (CONST_I32 0), ty:$exp,
ty:$new)>,
- Requires<[HasAddr32]>;
+ Requires<[HasAddr32, HasAtomics, IsNotPIC]>;
def : Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off), ty:$exp, ty:$new)),
(!cast<NI>(inst#_A64) 0, tglobaladdr:$off, (CONST_I64 0), ty:$exp,
ty:$new)>,
- Requires<[HasAddr64]>;
+ Requires<[HasAddr64, HasAtomics, IsNotPIC]>;
}
// Patterns for various addressing modes.
@@ -885,7 +879,6 @@ multiclass TerRMWPattern<PatFrag rmw_32, PatFrag rmw_64, string inst_32,
defm : TerRMWPatGlobalAddrOffOnly<i64, rmw_64, inst_64>;
}
-let Predicates = [HasAtomics] in
defm : TerRMWPattern<atomic_cmp_swap_32, atomic_cmp_swap_64,
"ATOMIC_RMW_CMPXCHG_I32", "ATOMIC_RMW_CMPXCHG_I64">;
@@ -994,7 +987,6 @@ multiclass TerRMWTruncExtPattern<
defm : TerRMWPatGlobalAddrOffOnly<i64, sext_ter_rmw_16_64<rmw_16>, inst16_64>;
}
-let Predicates = [HasAtomics] in
defm : TerRMWTruncExtPattern<
atomic_cmp_swap_8, atomic_cmp_swap_16, atomic_cmp_swap_32, atomic_cmp_swap_64,
"ATOMIC_RMW8_U_CMPXCHG_I32", "ATOMIC_RMW16_U_CMPXCHG_I32",
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td
index 3e9ef6fbc7ea..7aeae54d95a8 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td
@@ -39,7 +39,7 @@ let mayStore = 1, hasSideEffects = 1 in
defm MEMORY_INIT_A#B :
BULK_I<(outs),
(ins i32imm_op:$seg, i32imm_op:$idx, rc:$dest,
- rc:$offset, rc:$size),
+ I32:$offset, I32:$size),
(outs), (ins i32imm_op:$seg, i32imm_op:$idx),
[],
"memory.init\t$seg, $idx, $dest, $offset, $size",
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index 171dd9a67beb..702560bea100 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -103,7 +103,7 @@ defm FALLTHROUGH_RETURN : I<(outs), (ins variable_ops), (outs), (ins), []>;
} // isReturn = 1
-let isTrap = 1 in
+let IsCanonical = 1, isTrap = 1 in
defm UNREACHABLE : NRI<(outs), (ins), [(trap)], "unreachable", 0x00>;
} // isTerminator = 1
@@ -131,14 +131,11 @@ defm THROW : I<(outs), (ins event_op:$tag, variable_ops),
(outs), (ins event_op:$tag),
[(WebAssemblythrow (WebAssemblywrapper texternalsym:$tag))],
"throw \t$tag", "throw \t$tag", 0x08>;
-defm RETHROW : I<(outs), (ins EXNREF:$exn), (outs), (ins), [],
- "rethrow \t$exn", "rethrow", 0x09>;
-// Pseudo instruction to be the lowering target of int_wasm_rethrow_in_catch
-// intrinsic. Will be converted to the real rethrow instruction later.
-let isPseudo = 1 in
-defm RETHROW_IN_CATCH : NRI<(outs), (ins), [(int_wasm_rethrow_in_catch)],
- "rethrow_in_catch", 0>;
+defm RETHROW : NRI<(outs), (ins i32imm:$depth), [], "rethrow \t$depth", 0x09>;
} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
+// For C++ support, we only rethrow the latest exception, thus always setting
+// the depth to 0.
+def : Pat<(int_wasm_rethrow), (RETHROW 0)>;
// Region within which an exception is caught: try / end_try
let Uses = [VALUE_STACK], Defs = [VALUE_STACK] in {
@@ -146,26 +143,18 @@ defm TRY : NRI<(outs), (ins Signature:$sig), [], "try \t$sig", 0x06>;
defm END_TRY : NRI<(outs), (ins), [], "end_try", 0x0b>;
} // Uses = [VALUE_STACK], Defs = [VALUE_STACK]
-// Catching an exception: catch / extract_exception
-let hasCtrlDep = 1, hasSideEffects = 1 in
-defm CATCH : I<(outs EXNREF:$dst), (ins), (outs), (ins), [],
- "catch \t$dst", "catch", 0x07>;
-
-// Querying / extracing exception: br_on_exn
-// br_on_exn queries an exnref to see if it matches the corresponding exception
-// tag index. If true it branches to the given label and pushes the
-// corresponding argument values of the exception onto the stack.
-let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in
-defm BR_ON_EXN : I<(outs), (ins bb_op:$dst, event_op:$tag, EXNREF:$exn),
- (outs), (ins bb_op:$dst, event_op:$tag), [],
- "br_on_exn \t$dst, $tag, $exn", "br_on_exn \t$dst, $tag",
- 0x0a>;
-// This is a pseudo instruction that simulates popping a value from stack, which
-// has been pushed by br_on_exn
-let isCodeGenOnly = 1, hasSideEffects = 1 in
-defm EXTRACT_EXCEPTION_I32 : NRI<(outs I32:$dst), (ins),
- [(set I32:$dst, (int_wasm_extract_exception))],
- "extract_exception\t$dst">;
+// Catching an exception: catch / catch_all
+let hasCtrlDep = 1, hasSideEffects = 1 in {
+// Currently 'catch' can only extract an i32, which is sufficient for C++
+// support, but according to the spec 'catch' can extract any number of values
+// based on the event type.
+defm CATCH : I<(outs I32:$dst), (ins event_op:$tag),
+ (outs), (ins event_op:$tag),
+ [(set I32:$dst,
+ (WebAssemblycatch (WebAssemblywrapper texternalsym:$tag)))],
+ "catch \t$dst, $tag", "catch \t$tag", 0x07>;
+defm CATCH_ALL : NRI<(outs), (ins), [], "catch_all", 0x05>;
+}
// Pseudo instructions: cleanupret / catchret
let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
index 6fe1fd2b5c5a..db2ad05b4cdf 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -76,8 +76,10 @@ void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
CopyOpcode = WebAssembly::COPY_F64;
else if (RC == &WebAssembly::V128RegClass)
CopyOpcode = WebAssembly::COPY_V128;
- else if (RC == &WebAssembly::EXNREFRegClass)
- CopyOpcode = WebAssembly::COPY_EXNREF;
+ else if (RC == &WebAssembly::FUNCREFRegClass)
+ CopyOpcode = WebAssembly::COPY_FUNCREF;
+ else if (RC == &WebAssembly::EXTERNREFRegClass)
+ CopyOpcode = WebAssembly::COPY_EXTERNREF;
else
llvm_unreachable("Unexpected register class");
@@ -139,14 +141,6 @@ bool WebAssemblyInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
else
FBB = MI.getOperand(0).getMBB();
break;
- case WebAssembly::BR_ON_EXN:
- if (HaveCond)
- return true;
- Cond.push_back(MachineOperand::CreateImm(true));
- Cond.push_back(MI.getOperand(2));
- TBB = MI.getOperand(0).getMBB();
- HaveCond = true;
- break;
}
if (MI.isBarrier())
break;
@@ -192,24 +186,10 @@ unsigned WebAssemblyInstrInfo::insertBranch(
assert(Cond.size() == 2 && "Expected a flag and a successor block");
- MachineFunction &MF = *MBB.getParent();
- auto &MRI = MF.getRegInfo();
- bool IsBrOnExn = Cond[1].isReg() && MRI.getRegClass(Cond[1].getReg()) ==
- &WebAssembly::EXNREFRegClass;
-
- if (Cond[0].getImm()) {
- if (IsBrOnExn) {
- const char *CPPExnSymbol = MF.createExternalSymbolName("__cpp_exception");
- BuildMI(&MBB, DL, get(WebAssembly::BR_ON_EXN))
- .addMBB(TBB)
- .addExternalSymbol(CPPExnSymbol)
- .add(Cond[1]);
- } else
- BuildMI(&MBB, DL, get(WebAssembly::BR_IF)).addMBB(TBB).add(Cond[1]);
- } else {
- assert(!IsBrOnExn && "br_on_exn does not have a reversed condition");
+ if (Cond[0].getImm())
+ BuildMI(&MBB, DL, get(WebAssembly::BR_IF)).addMBB(TBB).add(Cond[1]);
+ else
BuildMI(&MBB, DL, get(WebAssembly::BR_UNLESS)).addMBB(TBB).add(Cond[1]);
- }
if (!FBB)
return 1;
@@ -220,14 +200,6 @@ unsigned WebAssemblyInstrInfo::insertBranch(
bool WebAssemblyInstrInfo::reverseBranchCondition(
SmallVectorImpl<MachineOperand> &Cond) const {
assert(Cond.size() == 2 && "Expected a flag and a condition expression");
-
- // br_on_exn's condition cannot be reversed
- MachineFunction &MF = *Cond[1].getParent()->getParent()->getParent();
- auto &MRI = MF.getRegInfo();
- if (Cond[1].isReg() &&
- MRI.getRegClass(Cond[1].getReg()) == &WebAssembly::EXNREFRegClass)
- return true;
-
Cond.front() = MachineOperand::CreateImm(!Cond.front().getImm());
return false;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index 085910f01ee6..2f5a64a87a59 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -74,8 +74,6 @@ def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>,
SDTCisVT<1, iPTR>]>;
def SDT_WebAssemblyCallSeqEnd :
SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
-def SDT_WebAssemblyCall0 : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
-def SDT_WebAssemblyCall1 : SDTypeProfile<1, -1, [SDTCisPtrTy<1>]>;
def SDT_WebAssemblyBrTable : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
def SDT_WebAssemblyArgument : SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>;
def SDT_WebAssemblyReturn : SDTypeProfile<0, -1, []>;
@@ -83,7 +81,8 @@ def SDT_WebAssemblyWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
SDTCisPtrTy<0>]>;
def SDT_WebAssemblyWrapperPIC : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
SDTCisPtrTy<0>]>;
-def SDT_WebAssemblyThrow : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
+def SDT_WebAssemblyThrow : SDTypeProfile<0, -1, []>;
+def SDT_WebAssemblyCatch : SDTypeProfile<1, 1, [SDTCisPtrTy<0>]>;
//===----------------------------------------------------------------------===//
// WebAssembly-specific DAG Nodes.
@@ -109,6 +108,8 @@ def WebAssemblywrapperPIC : SDNode<"WebAssemblyISD::WrapperPIC",
SDT_WebAssemblyWrapperPIC>;
def WebAssemblythrow : SDNode<"WebAssemblyISD::THROW", SDT_WebAssemblyThrow,
[SDNPHasChain, SDNPVariadic]>;
+def WebAssemblycatch : SDNode<"WebAssemblyISD::CATCH", SDT_WebAssemblyCatch,
+ [SDNPHasChain, SDNPSideEffect]>;
//===----------------------------------------------------------------------===//
// WebAssembly-specific Operands.
@@ -163,6 +164,9 @@ def vec_i64imm_op : Operand<i64>;
let OperandType = "OPERAND_FUNCTION32" in
def function32_op : Operand<i32>;
+let OperandType = "OPERAND_TABLE" in
+def table32_op : Operand<i32>;
+
let OperandType = "OPERAND_OFFSET32" in
def offset32_op : Operand<i32>;
@@ -184,6 +188,11 @@ def Signature : Operand<i32> {
let PrintMethod = "printWebAssemblySignatureOperand";
}
+let OperandType = "OPERAND_HEAPTYPE" in
+def HeapType : Operand<i32> {
+ let PrintMethod = "printWebAssemblyHeapTypeOperand";
+}
+
let OperandType = "OPERAND_TYPEINDEX" in
def TypeIndex : Operand<i32>;
@@ -236,7 +245,8 @@ defm "": ARGUMENT<I32, i32>;
defm "": ARGUMENT<I64, i64>;
defm "": ARGUMENT<F32, f32>;
defm "": ARGUMENT<F64, f64>;
-defm "": ARGUMENT<EXNREF, exnref>;
+defm "": ARGUMENT<FUNCREF, funcref>;
+defm "": ARGUMENT<EXTERNREF, externref>;
// local.get and local.set are not generated by instruction selection; they
// are implied by virtual register uses and defs.
@@ -306,7 +316,8 @@ defm "" : LOCAL<I64>;
defm "" : LOCAL<F32>;
defm "" : LOCAL<F64>;
defm "" : LOCAL<V128>, Requires<[HasSIMD128]>;
-defm "" : LOCAL<EXNREF>, Requires<[HasExceptionHandling]>;
+defm "" : LOCAL<FUNCREF>, Requires<[HasReferenceTypes]>;
+defm "" : LOCAL<EXTERNREF>, Requires<[HasReferenceTypes]>;
let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 in {
defm CONST_I32 : I<(outs I32:$res), (ins i32imm_op:$imm),
@@ -333,16 +344,25 @@ def : Pat<(i64 (WebAssemblywrapper tglobaladdr:$addr)),
(CONST_I64 tglobaladdr:$addr)>, Requires<[IsNotPIC, HasAddr64]>;
def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)),
- (GLOBAL_GET_I32 tglobaladdr:$addr)>, Requires<[IsPIC]>;
+ (GLOBAL_GET_I32 tglobaladdr:$addr)>, Requires<[IsPIC, HasAddr32]>;
def : Pat<(i32 (WebAssemblywrapperPIC tglobaladdr:$addr)),
- (CONST_I32 tglobaladdr:$addr)>, Requires<[IsPIC]>;
+ (CONST_I32 tglobaladdr:$addr)>, Requires<[IsPIC, HasAddr32]>;
+def : Pat<(i64 (WebAssemblywrapperPIC tglobaladdr:$addr)),
+ (CONST_I64 tglobaladdr:$addr)>, Requires<[IsPIC, HasAddr64]>;
+
+def : Pat<(i32 (WebAssemblywrapper tglobaltlsaddr:$addr)),
+ (CONST_I32 tglobaltlsaddr:$addr)>, Requires<[HasAddr32]>;
+def : Pat<(i64 (WebAssemblywrapper tglobaltlsaddr:$addr)),
+ (CONST_I64 tglobaltlsaddr:$addr)>, Requires<[HasAddr64]>;
def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)),
- (GLOBAL_GET_I32 texternalsym:$addr)>, Requires<[IsPIC]>;
+ (GLOBAL_GET_I32 texternalsym:$addr)>, Requires<[IsPIC, HasAddr32]>;
def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)),
- (CONST_I32 texternalsym:$addr)>, Requires<[IsNotPIC]>;
+ (CONST_I32 texternalsym:$addr)>, Requires<[IsNotPIC, HasAddr32]>;
+def : Pat<(i64 (WebAssemblywrapper texternalsym:$addr)),
+ (CONST_I64 texternalsym:$addr)>, Requires<[IsNotPIC, HasAddr64]>;
def : Pat<(i32 (WebAssemblywrapper mcsym:$sym)), (CONST_I32 mcsym:$sym)>;
def : Pat<(i64 (WebAssemblywrapper mcsym:$sym)), (CONST_I64 mcsym:$sym)>;
@@ -361,3 +381,4 @@ include "WebAssemblyInstrAtomics.td"
include "WebAssemblyInstrSIMD.td"
include "WebAssemblyInstrRef.td"
include "WebAssemblyInstrBulkMemory.td"
+include "WebAssemblyInstrTable.td"
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
index b3c63cc1f884..48b934457267 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -70,7 +70,7 @@ defm LOAD_F64 : WebAssemblyLoad<F64, "f64.load", 0x2b, []>;
multiclass LoadPatNoOffset<ValueType ty, PatFrag kind, string inst> {
def : Pat<(ty (kind I32:$addr)), (!cast<NI>(inst # "_A32") 0, 0, I32:$addr)>,
Requires<[HasAddr32]>;
- def : Pat<(ty (kind I64:$addr)), (!cast<NI>(inst # "_A64") 0, 0, I64:$addr)>,
+ def : Pat<(ty (kind (i64 I64:$addr))), (!cast<NI>(inst # "_A64") 0, 0, I64:$addr)>,
Requires<[HasAddr64]>;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
index 14d723750f07..7f324fc11210 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
@@ -11,15 +11,29 @@
///
//===----------------------------------------------------------------------===//
-defm SELECT_EXNREF : I<(outs EXNREF:$dst),
- (ins EXNREF:$lhs, EXNREF:$rhs, I32:$cond),
- (outs), (ins),
- [(set EXNREF:$dst,
- (select I32:$cond, EXNREF:$lhs, EXNREF:$rhs))],
- "exnref.select\t$dst, $lhs, $rhs, $cond",
- "exnref.select", 0x1b>;
+multiclass REF_I<WebAssemblyRegClass reg, ValueType vt> {
+ defm REF_NULL_#reg : I<(outs reg:$res), (ins HeapType:$heaptype),
+ (outs), (ins HeapType:$heaptype),
+ [],
+ "ref.null\t$res, $heaptype",
+ "ref.null\t$heaptype",
+ 0xd0>,
+ Requires<[HasReferenceTypes]>;
+ defm SELECT_#reg: I<(outs reg:$dst), (ins reg:$lhs, reg:$rhs, I32:$cond),
+ (outs), (ins),
+ [(set reg:$dst,
+ (select I32:$cond, reg:$lhs, reg:$rhs))],
+ vt#".select\t$dst, $lhs, $rhs, $cond",
+ vt#".select", 0x1b>,
+ Requires<[HasReferenceTypes]>;
+}
-def : Pat<(select (i32 (setne I32:$cond, 0)), EXNREF:$lhs, EXNREF:$rhs),
- (SELECT_EXNREF EXNREF:$lhs, EXNREF:$rhs, I32:$cond)>;
-def : Pat<(select (i32 (seteq I32:$cond, 0)), EXNREF:$lhs, EXNREF:$rhs),
- (SELECT_EXNREF EXNREF:$rhs, EXNREF:$lhs, I32:$cond)>;
+defm "" : REF_I<FUNCREF, funcref>;
+defm "" : REF_I<EXTERNREF, externref>;
+
+foreach reg = [FUNCREF, EXTERNREF] in {
+def : Pat<(select (i32 (setne I32:$cond, 0)), reg:$lhs, reg:$rhs),
+ (!cast<Instruction>("SELECT_"#reg) reg:$lhs, reg:$rhs, I32:$cond)>;
+def : Pat<(select (i32 (seteq I32:$cond, 0)), reg:$lhs, reg:$rhs),
+ (!cast<Instruction>("SELECT_"#reg) reg:$rhs, reg:$lhs, I32:$cond)>;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 4f3da2f35c61..9f3d0f4ab2c3 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -16,7 +16,9 @@ multiclass SIMD_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
list<dag> pattern_r, string asmstr_r = "",
string asmstr_s = "", bits<32> simdop = -1> {
defm "" : I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r, asmstr_s,
- !or(0xfd00, !and(0xff, simdop))>,
+ !if(!ge(simdop, 0x100),
+ !or(0xfd0000, !and(0xffff, simdop)),
+ !or(0xfd00, !and(0xff, simdop)))>,
Requires<[HasSIMD128]>;
}
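The !if in SIMD_I above merges the SIMD prefix with the per-instruction opcode, keeping sixteen opcode bits once the opcode no longer fits in a single byte. The same computation in plain C++, as a hedged sketch with arbitrary sample opcodes:

    #include <cassert>
    #include <cstdint>

    // Combine the 0xfd SIMD prefix with an instruction's opcode, matching the
    // !if/!or/!and expression in the SIMD_I multiclass.
    static uint32_t packSimdOpcode(uint32_t SimdOp) {
      if (SimdOp >= 0x100)
        return 0xfd0000u | (SimdOp & 0xffffu);
      return 0xfd00u | (SimdOp & 0xffu);
    }

    int main() {
      assert(packSimdOpcode(0x0f) == 0xfd0fu);    // one-byte SIMD opcode
      assert(packSimdOpcode(0x101) == 0xfd0101u); // two-byte SIMD opcode
      return 0;
    }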
@@ -35,6 +37,99 @@ def ImmI#SIZE : ImmLeaf<i32,
foreach SIZE = [2, 4, 8, 16, 32] in
def LaneIdx#SIZE : ImmLeaf<i32, "return 0 <= Imm && Imm < "#SIZE#";">;
+// Create vector with identical lanes: splat
+def splat2 : PatFrag<(ops node:$x), (build_vector $x, $x)>;
+def splat4 : PatFrag<(ops node:$x), (build_vector $x, $x, $x, $x)>;
+def splat8 : PatFrag<(ops node:$x), (build_vector $x, $x, $x, $x,
+ $x, $x, $x, $x)>;
+def splat16 : PatFrag<(ops node:$x),
+ (build_vector $x, $x, $x, $x, $x, $x, $x, $x,
+ $x, $x, $x, $x, $x, $x, $x, $x)>;
+
+class Vec {
+ ValueType vt;
+ ValueType int_vt;
+ ValueType lane_vt;
+ WebAssemblyRegClass lane_rc;
+ int lane_bits;
+ ImmLeaf lane_idx;
+ PatFrag splat;
+ string prefix;
+ Vec split;
+}
+
+def I8x16 : Vec {
+ let vt = v16i8;
+ let int_vt = vt;
+ let lane_vt = i32;
+ let lane_rc = I32;
+ let lane_bits = 8;
+ let lane_idx = LaneIdx16;
+ let splat = splat16;
+ let prefix = "i8x16";
+}
+
+def I16x8 : Vec {
+ let vt = v8i16;
+ let int_vt = vt;
+ let lane_vt = i32;
+ let lane_rc = I32;
+ let lane_bits = 16;
+ let lane_idx = LaneIdx8;
+ let splat = splat8;
+ let prefix = "i16x8";
+ let split = I8x16;
+}
+
+def I32x4 : Vec {
+ let vt = v4i32;
+ let int_vt = vt;
+ let lane_vt = i32;
+ let lane_rc = I32;
+ let lane_bits = 32;
+ let lane_idx = LaneIdx4;
+ let splat = splat4;
+ let prefix = "i32x4";
+ let split = I16x8;
+}
+
+def I64x2 : Vec {
+ let vt = v2i64;
+ let int_vt = vt;
+ let lane_vt = i64;
+ let lane_rc = I64;
+ let lane_bits = 64;
+ let lane_idx = LaneIdx2;
+ let splat = splat2;
+ let prefix = "i64x2";
+ let split = I32x4;
+}
+
+def F32x4 : Vec {
+ let vt = v4f32;
+ let int_vt = v4i32;
+ let lane_vt = f32;
+ let lane_rc = F32;
+ let lane_bits = 32;
+ let lane_idx = LaneIdx4;
+ let splat = splat4;
+ let prefix = "f32x4";
+}
+
+def F64x2 : Vec {
+ let vt = v2f64;
+ let int_vt = v2i64;
+ let lane_vt = f64;
+ let lane_rc = F64;
+ let lane_bits = 64;
+ let lane_idx = LaneIdx2;
+ let splat = splat2;
+ let prefix = "f64x2";
+}
+
+defvar AllVecs = [I8x16, I16x8, I32x4, I64x2, F32x4, F64x2];
+defvar IntVecs = [I8x16, I16x8, I32x4, I64x2];
+
//===----------------------------------------------------------------------===//
// Load and store
//===----------------------------------------------------------------------===//
@@ -53,116 +148,186 @@ defm LOAD_V128_A64 :
"v128.load\t$off$p2align", 0>;
}
-// Def load and store patterns from WebAssemblyInstrMemory.td for vector types
-foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in {
-defm : LoadPatNoOffset<vec_t, load, "LOAD_V128">;
-defm : LoadPatImmOff<vec_t, load, regPlusImm, "LOAD_V128">;
-defm : LoadPatImmOff<vec_t, load, or_is_add, "LOAD_V128">;
-defm : LoadPatOffsetOnly<vec_t, load, "LOAD_V128">;
-defm : LoadPatGlobalAddrOffOnly<vec_t, load, "LOAD_V128">;
+// Def load patterns from WebAssemblyInstrMemory.td for vector types
+foreach vec = AllVecs in {
+defm : LoadPatNoOffset<vec.vt, load, "LOAD_V128">;
+defm : LoadPatImmOff<vec.vt, load, regPlusImm, "LOAD_V128">;
+defm : LoadPatImmOff<vec.vt, load, or_is_add, "LOAD_V128">;
+defm : LoadPatOffsetOnly<vec.vt, load, "LOAD_V128">;
+defm : LoadPatGlobalAddrOffOnly<vec.vt, load, "LOAD_V128">;
}
-// vNxM.load_splat
-multiclass SIMDLoadSplat<string vec, bits<32> simdop> {
+// v128.loadX_splat
+multiclass SIMDLoadSplat<int size, bits<32> simdop> {
let mayLoad = 1, UseNamedOperandTable = 1 in {
- defm LOAD_SPLAT_#vec#_A32 :
+ defm LOAD#size#_SPLAT_A32 :
SIMD_I<(outs V128:$dst),
(ins P2Align:$p2align, offset32_op:$off, I32:$addr),
(outs),
(ins P2Align:$p2align, offset32_op:$off), [],
- vec#".load_splat\t$dst, ${off}(${addr})$p2align",
- vec#".load_splat\t$off$p2align", simdop>;
- defm LOAD_SPLAT_#vec#_A64 :
+ "v128.load"#size#"_splat\t$dst, ${off}(${addr})$p2align",
+ "v128.load"#size#"_splat\t$off$p2align", simdop>;
+ defm LOAD#size#_SPLAT_A64 :
SIMD_I<(outs V128:$dst),
(ins P2Align:$p2align, offset64_op:$off, I64:$addr),
(outs),
(ins P2Align:$p2align, offset64_op:$off), [],
- vec#".load_splat\t$dst, ${off}(${addr})$p2align",
- vec#".load_splat\t$off$p2align", simdop>;
+ "v128.load"#size#"_splat\t$dst, ${off}(${addr})$p2align",
+ "v128.load"#size#"_splat\t$off$p2align", simdop>;
}
}
-defm "" : SIMDLoadSplat<"v8x16", 7>;
-defm "" : SIMDLoadSplat<"v16x8", 8>;
-defm "" : SIMDLoadSplat<"v32x4", 9>;
-defm "" : SIMDLoadSplat<"v64x2", 10>;
+defm "" : SIMDLoadSplat<8, 7>;
+defm "" : SIMDLoadSplat<16, 8>;
+defm "" : SIMDLoadSplat<32, 9>;
+defm "" : SIMDLoadSplat<64, 10>;
def wasm_load_splat_t : SDTypeProfile<1, 1, [SDTCisPtrTy<1>]>;
def wasm_load_splat : SDNode<"WebAssemblyISD::LOAD_SPLAT", wasm_load_splat_t,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def load_splat : PatFrag<(ops node:$addr), (wasm_load_splat node:$addr)>;
-foreach args = [["v16i8", "v8x16"], ["v8i16", "v16x8"], ["v4i32", "v32x4"],
- ["v2i64", "v64x2"], ["v4f32", "v32x4"], ["v2f64", "v64x2"]] in {
-defm : LoadPatNoOffset<!cast<ValueType>(args[0]),
- load_splat,
- "LOAD_SPLAT_"#args[1]>;
-defm : LoadPatImmOff<!cast<ValueType>(args[0]),
- load_splat,
- regPlusImm,
- "LOAD_SPLAT_"#args[1]>;
-defm : LoadPatImmOff<!cast<ValueType>(args[0]),
- load_splat,
- or_is_add,
- "LOAD_SPLAT_"#args[1]>;
-defm : LoadPatOffsetOnly<!cast<ValueType>(args[0]),
- load_splat,
- "LOAD_SPLAT_"#args[1]>;
-defm : LoadPatGlobalAddrOffOnly<!cast<ValueType>(args[0]),
- load_splat,
- "LOAD_SPLAT_"#args[1]>;
+foreach vec = AllVecs in {
+defvar inst = "LOAD"#vec.lane_bits#"_SPLAT";
+defm : LoadPatNoOffset<vec.vt, load_splat, inst>;
+defm : LoadPatImmOff<vec.vt, load_splat, regPlusImm, inst>;
+defm : LoadPatImmOff<vec.vt, load_splat, or_is_add, inst>;
+defm : LoadPatOffsetOnly<vec.vt, load_splat, inst>;
+defm : LoadPatGlobalAddrOffOnly<vec.vt, load_splat, inst>;
}
// Load and extend
-multiclass SIMDLoadExtend<ValueType vec_t, string name, bits<32> simdop> {
+multiclass SIMDLoadExtend<Vec vec, string loadPat, bits<32> simdop> {
+ defvar signed = vec.prefix#".load"#loadPat#"_s";
+ defvar unsigned = vec.prefix#".load"#loadPat#"_u";
let mayLoad = 1, UseNamedOperandTable = 1 in {
- defm LOAD_EXTEND_S_#vec_t#_A32 :
+ defm LOAD_EXTEND_S_#vec#_A32 :
SIMD_I<(outs V128:$dst),
(ins P2Align:$p2align, offset32_op:$off, I32:$addr),
(outs), (ins P2Align:$p2align, offset32_op:$off), [],
- name#"_s\t$dst, ${off}(${addr})$p2align",
- name#"_s\t$off$p2align", simdop>;
- defm LOAD_EXTEND_U_#vec_t#_A32 :
+ signed#"\t$dst, ${off}(${addr})$p2align",
+ signed#"\t$off$p2align", simdop>;
+ defm LOAD_EXTEND_U_#vec#_A32 :
SIMD_I<(outs V128:$dst),
(ins P2Align:$p2align, offset32_op:$off, I32:$addr),
(outs), (ins P2Align:$p2align, offset32_op:$off), [],
- name#"_u\t$dst, ${off}(${addr})$p2align",
- name#"_u\t$off$p2align", !add(simdop, 1)>;
- defm LOAD_EXTEND_S_#vec_t#_A64 :
+ unsigned#"\t$dst, ${off}(${addr})$p2align",
+ unsigned#"\t$off$p2align", !add(simdop, 1)>;
+ defm LOAD_EXTEND_S_#vec#_A64 :
SIMD_I<(outs V128:$dst),
(ins P2Align:$p2align, offset64_op:$off, I64:$addr),
(outs), (ins P2Align:$p2align, offset64_op:$off), [],
- name#"_s\t$dst, ${off}(${addr})$p2align",
- name#"_s\t$off$p2align", simdop>;
- defm LOAD_EXTEND_U_#vec_t#_A64 :
+ signed#"\t$dst, ${off}(${addr})$p2align",
+ signed#"\t$off$p2align", simdop>;
+ defm LOAD_EXTEND_U_#vec#_A64 :
SIMD_I<(outs V128:$dst),
(ins P2Align:$p2align, offset64_op:$off, I64:$addr),
(outs), (ins P2Align:$p2align, offset64_op:$off), [],
- name#"_u\t$dst, ${off}(${addr})$p2align",
- name#"_u\t$off$p2align", !add(simdop, 1)>;
+ unsigned#"\t$dst, ${off}(${addr})$p2align",
+ unsigned#"\t$off$p2align", !add(simdop, 1)>;
}
}
-defm "" : SIMDLoadExtend<v8i16, "i16x8.load8x8", 1>;
-defm "" : SIMDLoadExtend<v4i32, "i32x4.load16x4", 3>;
-defm "" : SIMDLoadExtend<v2i64, "i64x2.load32x2", 5>;
+defm "" : SIMDLoadExtend<I16x8, "8x8", 1>;
+defm "" : SIMDLoadExtend<I32x4, "16x4", 3>;
+defm "" : SIMDLoadExtend<I64x2, "32x2", 5>;
+
+foreach vec = [I16x8, I32x4, I64x2] in
+foreach exts = [["sextloadvi", "_S"],
+ ["zextloadvi", "_U"],
+ ["extloadvi", "_U"]] in {
+defvar loadpat = !cast<PatFrag>(exts[0]#vec.split.lane_bits);
+defvar inst = "LOAD_EXTEND"#exts[1]#"_"#vec;
+defm : LoadPatNoOffset<vec.vt, loadpat, inst>;
+defm : LoadPatImmOff<vec.vt, loadpat, regPlusImm, inst>;
+defm : LoadPatImmOff<vec.vt, loadpat, or_is_add, inst>;
+defm : LoadPatOffsetOnly<vec.vt, loadpat, inst>;
+defm : LoadPatGlobalAddrOffOnly<vec.vt, loadpat, inst>;
+}
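In the foreach above both zextloadvi* and extloadvi* (any-extend) select the _U instruction: an any-extending load leaves the widened bits unspecified, so the unsigned form is always an acceptable choice. The kind of source these patterns are aimed at, as a hedged C++ sketch (no guarantee that any particular compiler version vectorizes it exactly this way):

    // Widening eight i8 values to i16 is the shape that
    // i16x8.load8x8_s / i16x8.load8x8_u are meant to cover.
    void widen8(const signed char *src, short *dst) {
      for (int i = 0; i < 8; ++i)
        dst[i] = src[i]; // sign-extending loads would feed the _s variant
    }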
+
+// Load lane into zero vector
+multiclass SIMDLoadZero<Vec vec, bits<32> simdop> {
+ defvar name = "v128.load"#vec.lane_bits#"_zero";
+ let mayLoad = 1, UseNamedOperandTable = 1 in {
+ defm LOAD_ZERO_#vec#_A32 :
+ SIMD_I<(outs V128:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ (outs), (ins P2Align:$p2align, offset32_op:$off), [],
+ name#"\t$dst, ${off}(${addr})$p2align",
+ name#"\t$off$p2align", simdop>;
+ defm LOAD_ZERO_#vec#_A64 :
+ SIMD_I<(outs V128:$dst),
+ (ins P2Align:$p2align, offset64_op:$off, I64:$addr),
+ (outs), (ins P2Align:$p2align, offset64_op:$off), [],
+ name#"\t$dst, ${off}(${addr})$p2align",
+ name#"\t$off$p2align", simdop>;
+ } // mayLoad = 1, UseNamedOperandTable = 1
+}
+
+// TODO: Also support v4f32 and v2f64 once the instructions are merged
+// to the proposal
+defm "" : SIMDLoadZero<I32x4, 252>;
+defm "" : SIMDLoadZero<I64x2, 253>;
+
+foreach vec = [I32x4, I64x2] in {
+defvar loadpat = !cast<Intrinsic>("int_wasm_load"#vec.lane_bits#"_zero");
+defvar inst = "LOAD_ZERO_"#vec;
+defm : LoadPatNoOffset<vec.vt, loadpat, inst>;
+defm : LoadPatImmOff<vec.vt, loadpat, regPlusImm, inst>;
+defm : LoadPatImmOff<vec.vt, loadpat, or_is_add, inst>;
+defm : LoadPatOffsetOnly<vec.vt, loadpat, inst>;
+defm : LoadPatGlobalAddrOffOnly<vec.vt, loadpat, inst>;
+}
+
+// Load lane
+multiclass SIMDLoadLane<Vec vec, bits<32> simdop> {
+ defvar name = "v128.load"#vec.lane_bits#"_lane";
+ let mayLoad = 1, UseNamedOperandTable = 1 in {
+ defm LOAD_LANE_#vec#_A32 :
+ SIMD_I<(outs V128:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, vec_i8imm_op:$idx,
+ I32:$addr, V128:$vec),
+ (outs), (ins P2Align:$p2align, offset32_op:$off, vec_i8imm_op:$idx),
+ [], name#"\t$dst, ${off}(${addr})$p2align, $vec, $idx",
+ name#"\t$off$p2align, $idx", simdop>;
+ defm LOAD_LANE_#vec#_A64 :
+ SIMD_I<(outs V128:$dst),
+ (ins P2Align:$p2align, offset64_op:$off, vec_i8imm_op:$idx,
+ I64:$addr, V128:$vec),
+ (outs), (ins P2Align:$p2align, offset64_op:$off, vec_i8imm_op:$idx),
+ [], name#"\t$dst, ${off}(${addr})$p2align, $vec, $idx",
+ name#"\t$off$p2align, $idx", simdop>;
+ } // mayLoad = 1, UseNamedOperandTable = 1
+}
-foreach types = [[v8i16, i8], [v4i32, i16], [v2i64, i32]] in
-foreach exts = [["sextloadv", "_S"],
- ["zextloadv", "_U"],
- ["extloadv", "_U"]] in {
-defm : LoadPatNoOffset<types[0], !cast<PatFrag>(exts[0]#types[1]),
- "LOAD_EXTEND"#exts[1]#"_"#types[0]>;
-defm : LoadPatImmOff<types[0], !cast<PatFrag>(exts[0]#types[1]), regPlusImm,
- "LOAD_EXTEND"#exts[1]#"_"#types[0]>;
-defm : LoadPatImmOff<types[0], !cast<PatFrag>(exts[0]#types[1]), or_is_add,
- "LOAD_EXTEND"#exts[1]#"_"#types[0]>;
-defm : LoadPatOffsetOnly<types[0], !cast<PatFrag>(exts[0]#types[1]),
- "LOAD_EXTEND"#exts[1]#"_"#types[0]>;
-defm : LoadPatGlobalAddrOffOnly<types[0], !cast<PatFrag>(exts[0]#types[1]),
- "LOAD_EXTEND"#exts[1]#"_"#types[0]>;
+// TODO: Also support v4f32 and v2f64 once the instructions are merged
+// to the proposal
+defm "" : SIMDLoadLane<I8x16, 88>;
+defm "" : SIMDLoadLane<I16x8, 89>;
+defm "" : SIMDLoadLane<I32x4, 90>;
+defm "" : SIMDLoadLane<I64x2, 91>;
+
+// Select loads with no constant offset.
+multiclass LoadLanePatNoOffset<Vec vec, PatFrag kind> {
+ defvar load_lane_a32 = !cast<NI>("LOAD_LANE_"#vec#"_A32");
+ defvar load_lane_a64 = !cast<NI>("LOAD_LANE_"#vec#"_A64");
+ def : Pat<(vec.vt (kind (i32 I32:$addr),
+ (vec.vt V128:$vec), (i32 vec.lane_idx:$idx))),
+ (load_lane_a32 0, 0, imm:$idx, $addr, $vec)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(vec.vt (kind (i64 I64:$addr),
+ (vec.vt V128:$vec), (i32 vec.lane_idx:$idx))),
+ (load_lane_a64 0, 0, imm:$idx, $addr, $vec)>,
+ Requires<[HasAddr64]>;
}
+defm : LoadLanePatNoOffset<I8x16, int_wasm_load8_lane>;
+defm : LoadLanePatNoOffset<I16x8, int_wasm_load16_lane>;
+defm : LoadLanePatNoOffset<I32x4, int_wasm_load32_lane>;
+defm : LoadLanePatNoOffset<I64x2, int_wasm_load64_lane>;
+
+// TODO: Also support the other load patterns for load_lane once the instructions
+// are merged to the proposal.
// Store: v128.store
let mayStore = 1, UseNamedOperandTable = 1 in {
@@ -177,30 +342,77 @@ defm STORE_V128_A64 :
"v128.store\t${off}(${addr})$p2align, $vec",
"v128.store\t$off$p2align", 11>;
}
-foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in {
-// Def load and store patterns from WebAssemblyInstrMemory.td for vector types
-defm : StorePatNoOffset<vec_t, store, "STORE_V128">;
-defm : StorePatImmOff<vec_t, store, regPlusImm, "STORE_V128">;
-defm : StorePatImmOff<vec_t, store, or_is_add, "STORE_V128">;
-defm : StorePatOffsetOnly<vec_t, store, "STORE_V128">;
-defm : StorePatGlobalAddrOffOnly<vec_t, store, "STORE_V128">;
+
+// Def store patterns from WebAssemblyInstrMemory.td for vector types
+foreach vec = AllVecs in {
+defm : StorePatNoOffset<vec.vt, store, "STORE_V128">;
+defm : StorePatImmOff<vec.vt, store, regPlusImm, "STORE_V128">;
+defm : StorePatImmOff<vec.vt, store, or_is_add, "STORE_V128">;
+defm : StorePatOffsetOnly<vec.vt, store, "STORE_V128">;
+defm : StorePatGlobalAddrOffOnly<vec.vt, store, "STORE_V128">;
+}
+
+// Store lane
+multiclass SIMDStoreLane<Vec vec, bits<32> simdop> {
+ defvar name = "v128.store"#vec.lane_bits#"_lane";
+ let mayStore = 1, UseNamedOperandTable = 1 in {
+ defm STORE_LANE_#vec#_A32 :
+ SIMD_I<(outs),
+ (ins P2Align:$p2align, offset32_op:$off, vec_i8imm_op:$idx,
+ I32:$addr, V128:$vec),
+ (outs), (ins P2Align:$p2align, offset32_op:$off, vec_i8imm_op:$idx),
+ [], name#"\t${off}(${addr})$p2align, $vec, $idx",
+ name#"\t$off$p2align, $idx", simdop>;
+ defm STORE_LANE_#vec#_A64 :
+ SIMD_I<(outs V128:$dst),
+ (ins P2Align:$p2align, offset64_op:$off, vec_i8imm_op:$idx,
+ I64:$addr, V128:$vec),
+ (outs), (ins P2Align:$p2align, offset64_op:$off, vec_i8imm_op:$idx),
+ [], name#"\t${off}(${addr})$p2align, $vec, $idx",
+ name#"\t$off$p2align, $idx", simdop>;
+ } // mayStore = 1, UseNamedOperandTable = 1
+}
+
+// TODO: Also support v4f32 and v2f64 once the instructions are merged
+// to the proposal
+defm "" : SIMDStoreLane<I8x16, 92>;
+defm "" : SIMDStoreLane<I16x8, 93>;
+defm "" : SIMDStoreLane<I32x4, 94>;
+defm "" : SIMDStoreLane<I64x2, 95>;
+
+// Select stores with no constant offset.
+multiclass StoreLanePatNoOffset<Vec vec, PatFrag kind> {
+ def : Pat<(kind (i32 I32:$addr), (vec.vt V128:$vec), (i32 vec.lane_idx:$idx)),
+ (!cast<NI>("STORE_LANE_"#vec#"_A32") 0, 0, imm:$idx, $addr, $vec)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(kind (i64 I64:$addr), (vec.vt V128:$vec), (i32 vec.lane_idx:$idx)),
+ (!cast<NI>("STORE_LANE_"#vec#"_A64") 0, 0, imm:$idx, $addr, $vec)>,
+ Requires<[HasAddr64]>;
}
+defm : StoreLanePatNoOffset<I8x16, int_wasm_store8_lane>;
+defm : StoreLanePatNoOffset<I16x8, int_wasm_store16_lane>;
+defm : StoreLanePatNoOffset<I32x4, int_wasm_store32_lane>;
+defm : StoreLanePatNoOffset<I64x2, int_wasm_store64_lane>;
+
+// TODO: Also support the other store patterns for store_lane once the
+// instructions are merged to the proposal.
+
//===----------------------------------------------------------------------===//
// Constructing SIMD values
//===----------------------------------------------------------------------===//
// Constant: v128.const
-multiclass ConstVec<ValueType vec_t, dag ops, dag pat, string args> {
+multiclass ConstVec<Vec vec, dag ops, dag pat, string args> {
let isMoveImm = 1, isReMaterializable = 1,
Predicates = [HasUnimplementedSIMD128] in
- defm CONST_V128_#vec_t : SIMD_I<(outs V128:$dst), ops, (outs), ops,
- [(set V128:$dst, (vec_t pat))],
- "v128.const\t$dst, "#args,
- "v128.const\t"#args, 12>;
+ defm CONST_V128_#vec : SIMD_I<(outs V128:$dst), ops, (outs), ops,
+ [(set V128:$dst, (vec.vt pat))],
+ "v128.const\t$dst, "#args,
+ "v128.const\t"#args, 12>;
}
-defm "" : ConstVec<v16i8,
+defm "" : ConstVec<I8x16,
(ins vec_i8imm_op:$i0, vec_i8imm_op:$i1,
vec_i8imm_op:$i2, vec_i8imm_op:$i3,
vec_i8imm_op:$i4, vec_i8imm_op:$i5,
@@ -215,7 +427,7 @@ defm "" : ConstVec<v16i8,
ImmI8:$iC, ImmI8:$iD, ImmI8:$iE, ImmI8:$iF),
!strconcat("$i0, $i1, $i2, $i3, $i4, $i5, $i6, $i7, ",
"$i8, $i9, $iA, $iB, $iC, $iD, $iE, $iF")>;
-defm "" : ConstVec<v8i16,
+defm "" : ConstVec<I16x8,
(ins vec_i16imm_op:$i0, vec_i16imm_op:$i1,
vec_i16imm_op:$i2, vec_i16imm_op:$i3,
vec_i16imm_op:$i4, vec_i16imm_op:$i5,
@@ -225,23 +437,23 @@ defm "" : ConstVec<v8i16,
ImmI16:$i4, ImmI16:$i5, ImmI16:$i6, ImmI16:$i7),
"$i0, $i1, $i2, $i3, $i4, $i5, $i6, $i7">;
let IsCanonical = 1 in
-defm "" : ConstVec<v4i32,
+defm "" : ConstVec<I32x4,
(ins vec_i32imm_op:$i0, vec_i32imm_op:$i1,
vec_i32imm_op:$i2, vec_i32imm_op:$i3),
(build_vector (i32 imm:$i0), (i32 imm:$i1),
(i32 imm:$i2), (i32 imm:$i3)),
"$i0, $i1, $i2, $i3">;
-defm "" : ConstVec<v2i64,
+defm "" : ConstVec<I64x2,
(ins vec_i64imm_op:$i0, vec_i64imm_op:$i1),
(build_vector (i64 imm:$i0), (i64 imm:$i1)),
"$i0, $i1">;
-defm "" : ConstVec<v4f32,
+defm "" : ConstVec<F32x4,
(ins f32imm_op:$i0, f32imm_op:$i1,
f32imm_op:$i2, f32imm_op:$i3),
(build_vector (f32 fpimm:$i0), (f32 fpimm:$i1),
(f32 fpimm:$i2), (f32 fpimm:$i3)),
"$i0, $i1, $i2, $i3">;
-defm "" : ConstVec<v2f64,
+defm "" : ConstVec<F64x2,
(ins f64imm_op:$i0, f64imm_op:$i1),
(build_vector (f64 fpimm:$i0), (f64 fpimm:$i1)),
"$i0, $i1">;
@@ -269,10 +481,10 @@ defm SHUFFLE :
vec_i8imm_op:$mC, vec_i8imm_op:$mD,
vec_i8imm_op:$mE, vec_i8imm_op:$mF),
[],
- "v8x16.shuffle\t$dst, $x, $y, "#
+ "i8x16.shuffle\t$dst, $x, $y, "#
"$m0, $m1, $m2, $m3, $m4, $m5, $m6, $m7, "#
"$m8, $m9, $mA, $mB, $mC, $mD, $mE, $mF",
- "v8x16.shuffle\t"#
+ "i8x16.shuffle\t"#
"$m0, $m1, $m2, $m3, $m4, $m5, $m6, $m7, "#
"$m8, $m9, $mA, $mB, $mC, $mD, $mE, $mF",
13>;
@@ -280,8 +492,8 @@ defm SHUFFLE :
// Shuffles after custom lowering
def wasm_shuffle_t : SDTypeProfile<1, 18, []>;
def wasm_shuffle : SDNode<"WebAssemblyISD::SHUFFLE", wasm_shuffle_t>;
-foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in {
-def : Pat<(vec_t (wasm_shuffle (vec_t V128:$x), (vec_t V128:$y),
+foreach vec = AllVecs in {
+def : Pat<(vec.vt (wasm_shuffle (vec.vt V128:$x), (vec.vt V128:$y),
(i32 LaneIdx32:$m0), (i32 LaneIdx32:$m1),
(i32 LaneIdx32:$m2), (i32 LaneIdx32:$m3),
(i32 LaneIdx32:$m4), (i32 LaneIdx32:$m5),
@@ -290,178 +502,150 @@ def : Pat<(vec_t (wasm_shuffle (vec_t V128:$x), (vec_t V128:$y),
(i32 LaneIdx32:$mA), (i32 LaneIdx32:$mB),
(i32 LaneIdx32:$mC), (i32 LaneIdx32:$mD),
(i32 LaneIdx32:$mE), (i32 LaneIdx32:$mF))),
- (vec_t (SHUFFLE (vec_t V128:$x), (vec_t V128:$y),
- (i32 LaneIdx32:$m0), (i32 LaneIdx32:$m1),
- (i32 LaneIdx32:$m2), (i32 LaneIdx32:$m3),
- (i32 LaneIdx32:$m4), (i32 LaneIdx32:$m5),
- (i32 LaneIdx32:$m6), (i32 LaneIdx32:$m7),
- (i32 LaneIdx32:$m8), (i32 LaneIdx32:$m9),
- (i32 LaneIdx32:$mA), (i32 LaneIdx32:$mB),
- (i32 LaneIdx32:$mC), (i32 LaneIdx32:$mD),
- (i32 LaneIdx32:$mE), (i32 LaneIdx32:$mF)))>;
+ (SHUFFLE $x, $y,
+ imm:$m0, imm:$m1, imm:$m2, imm:$m3,
+ imm:$m4, imm:$m5, imm:$m6, imm:$m7,
+ imm:$m8, imm:$m9, imm:$mA, imm:$mB,
+ imm:$mC, imm:$mD, imm:$mE, imm:$mF)>;
}
-// Swizzle lanes: v8x16.swizzle
+// Swizzle lanes: i8x16.swizzle
def wasm_swizzle_t : SDTypeProfile<1, 2, []>;
def wasm_swizzle : SDNode<"WebAssemblyISD::SWIZZLE", wasm_swizzle_t>;
defm SWIZZLE :
SIMD_I<(outs V128:$dst), (ins V128:$src, V128:$mask), (outs), (ins),
[(set (v16i8 V128:$dst),
(wasm_swizzle (v16i8 V128:$src), (v16i8 V128:$mask)))],
- "v8x16.swizzle\t$dst, $src, $mask", "v8x16.swizzle", 14>;
+ "i8x16.swizzle\t$dst, $src, $mask", "i8x16.swizzle", 14>;
def : Pat<(int_wasm_swizzle (v16i8 V128:$src), (v16i8 V128:$mask)),
- (SWIZZLE V128:$src, V128:$mask)>;
+ (SWIZZLE $src, $mask)>;
+
+multiclass Splat<Vec vec, bits<32> simdop> {
+ defm SPLAT_#vec : SIMD_I<(outs V128:$dst), (ins vec.lane_rc:$x),
+ (outs), (ins),
+ [(set (vec.vt V128:$dst),
+ (vec.splat vec.lane_rc:$x))],
+ vec.prefix#".splat\t$dst, $x", vec.prefix#".splat",
+ simdop>;
+}
-// Create vector with identical lanes: splat
-def splat2 : PatFrag<(ops node:$x), (build_vector node:$x, node:$x)>;
-def splat4 : PatFrag<(ops node:$x), (build_vector
- node:$x, node:$x, node:$x, node:$x)>;
-def splat8 : PatFrag<(ops node:$x), (build_vector
- node:$x, node:$x, node:$x, node:$x,
- node:$x, node:$x, node:$x, node:$x)>;
-def splat16 : PatFrag<(ops node:$x), (build_vector
- node:$x, node:$x, node:$x, node:$x,
- node:$x, node:$x, node:$x, node:$x,
- node:$x, node:$x, node:$x, node:$x,
- node:$x, node:$x, node:$x, node:$x)>;
-
-multiclass Splat<ValueType vec_t, string vec, WebAssemblyRegClass reg_t,
- PatFrag splat_pat, bits<32> simdop> {
- defm SPLAT_#vec_t : SIMD_I<(outs V128:$dst), (ins reg_t:$x), (outs), (ins),
- [(set (vec_t V128:$dst), (splat_pat reg_t:$x))],
- vec#".splat\t$dst, $x", vec#".splat", simdop>;
-}
-
-defm "" : Splat<v16i8, "i8x16", I32, splat16, 15>;
-defm "" : Splat<v8i16, "i16x8", I32, splat8, 16>;
-defm "" : Splat<v4i32, "i32x4", I32, splat4, 17>;
-defm "" : Splat<v2i64, "i64x2", I64, splat2, 18>;
-defm "" : Splat<v4f32, "f32x4", F32, splat4, 19>;
-defm "" : Splat<v2f64, "f64x2", F64, splat2, 20>;
+defm "" : Splat<I8x16, 15>;
+defm "" : Splat<I16x8, 16>;
+defm "" : Splat<I32x4, 17>;
+defm "" : Splat<I64x2, 18>;
+defm "" : Splat<F32x4, 19>;
+defm "" : Splat<F64x2, 20>;
// scalar_to_vector leaves high lanes undefined, so can be a splat
-class ScalarSplatPat<ValueType vec_t, ValueType lane_t,
- WebAssemblyRegClass reg_t> :
- Pat<(vec_t (scalar_to_vector (lane_t reg_t:$x))),
- (!cast<Instruction>("SPLAT_"#vec_t) reg_t:$x)>;
-
-def : ScalarSplatPat<v16i8, i32, I32>;
-def : ScalarSplatPat<v8i16, i32, I32>;
-def : ScalarSplatPat<v4i32, i32, I32>;
-def : ScalarSplatPat<v2i64, i64, I64>;
-def : ScalarSplatPat<v4f32, f32, F32>;
-def : ScalarSplatPat<v2f64, f64, F64>;
+foreach vec = AllVecs in
+def : Pat<(vec.vt (scalar_to_vector (vec.lane_vt vec.lane_rc:$x))),
+ (!cast<Instruction>("SPLAT_"#vec) $x)>;
//===----------------------------------------------------------------------===//
// Accessing lanes
//===----------------------------------------------------------------------===//
// Extract lane as a scalar: extract_lane / extract_lane_s / extract_lane_u
-multiclass ExtractLane<ValueType vec_t, string vec, WebAssemblyRegClass reg_t,
- bits<32> simdop, string suffix = ""> {
- defm EXTRACT_LANE_#vec_t#suffix :
- SIMD_I<(outs reg_t:$dst), (ins V128:$vec, vec_i8imm_op:$idx),
+multiclass ExtractLane<Vec vec, bits<32> simdop, string suffix = ""> {
+ defm EXTRACT_LANE_#vec#suffix :
+ SIMD_I<(outs vec.lane_rc:$dst), (ins V128:$vec, vec_i8imm_op:$idx),
(outs), (ins vec_i8imm_op:$idx), [],
- vec#".extract_lane"#suffix#"\t$dst, $vec, $idx",
- vec#".extract_lane"#suffix#"\t$idx", simdop>;
+ vec.prefix#".extract_lane"#suffix#"\t$dst, $vec, $idx",
+ vec.prefix#".extract_lane"#suffix#"\t$idx", simdop>;
}
-defm "" : ExtractLane<v16i8, "i8x16", I32, 21, "_s">;
-defm "" : ExtractLane<v16i8, "i8x16", I32, 22, "_u">;
-defm "" : ExtractLane<v8i16, "i16x8", I32, 24, "_s">;
-defm "" : ExtractLane<v8i16, "i16x8", I32, 25, "_u">;
-defm "" : ExtractLane<v4i32, "i32x4", I32, 27>;
-defm "" : ExtractLane<v2i64, "i64x2", I64, 29>;
-defm "" : ExtractLane<v4f32, "f32x4", F32, 31>;
-defm "" : ExtractLane<v2f64, "f64x2", F64, 33>;
+defm "" : ExtractLane<I8x16, 21, "_s">;
+defm "" : ExtractLane<I8x16, 22, "_u">;
+defm "" : ExtractLane<I16x8, 24, "_s">;
+defm "" : ExtractLane<I16x8, 25, "_u">;
+defm "" : ExtractLane<I32x4, 27>;
+defm "" : ExtractLane<I64x2, 29>;
+defm "" : ExtractLane<F32x4, 31>;
+defm "" : ExtractLane<F64x2, 33>;
def : Pat<(vector_extract (v16i8 V128:$vec), (i32 LaneIdx16:$idx)),
- (EXTRACT_LANE_v16i8_u V128:$vec, imm:$idx)>;
+ (EXTRACT_LANE_I8x16_u $vec, imm:$idx)>;
def : Pat<(vector_extract (v8i16 V128:$vec), (i32 LaneIdx8:$idx)),
- (EXTRACT_LANE_v8i16_u V128:$vec, imm:$idx)>;
+ (EXTRACT_LANE_I16x8_u $vec, imm:$idx)>;
def : Pat<(vector_extract (v4i32 V128:$vec), (i32 LaneIdx4:$idx)),
- (EXTRACT_LANE_v4i32 V128:$vec, imm:$idx)>;
+ (EXTRACT_LANE_I32x4 $vec, imm:$idx)>;
def : Pat<(vector_extract (v4f32 V128:$vec), (i32 LaneIdx4:$idx)),
- (EXTRACT_LANE_v4f32 V128:$vec, imm:$idx)>;
+ (EXTRACT_LANE_F32x4 $vec, imm:$idx)>;
def : Pat<(vector_extract (v2i64 V128:$vec), (i32 LaneIdx2:$idx)),
- (EXTRACT_LANE_v2i64 V128:$vec, imm:$idx)>;
+ (EXTRACT_LANE_I64x2 $vec, imm:$idx)>;
def : Pat<(vector_extract (v2f64 V128:$vec), (i32 LaneIdx2:$idx)),
- (EXTRACT_LANE_v2f64 V128:$vec, imm:$idx)>;
+ (EXTRACT_LANE_F64x2 $vec, imm:$idx)>;
def : Pat<
(sext_inreg (vector_extract (v16i8 V128:$vec), (i32 LaneIdx16:$idx)), i8),
- (EXTRACT_LANE_v16i8_s V128:$vec, imm:$idx)>;
+ (EXTRACT_LANE_I8x16_s $vec, imm:$idx)>;
def : Pat<
(and (vector_extract (v16i8 V128:$vec), (i32 LaneIdx16:$idx)), (i32 0xff)),
- (EXTRACT_LANE_v16i8_u V128:$vec, imm:$idx)>;
+ (EXTRACT_LANE_I8x16_u $vec, imm:$idx)>;
def : Pat<
(sext_inreg (vector_extract (v8i16 V128:$vec), (i32 LaneIdx8:$idx)), i16),
- (EXTRACT_LANE_v8i16_s V128:$vec, imm:$idx)>;
+ (EXTRACT_LANE_I16x8_s $vec, imm:$idx)>;
def : Pat<
(and (vector_extract (v8i16 V128:$vec), (i32 LaneIdx8:$idx)), (i32 0xffff)),
- (EXTRACT_LANE_v8i16_u V128:$vec, imm:$idx)>;
+ (EXTRACT_LANE_I16x8_u $vec, imm:$idx)>;
// Replace lane value: replace_lane
-multiclass ReplaceLane<ValueType vec_t, string vec, ImmLeaf imm_t,
- WebAssemblyRegClass reg_t, ValueType lane_t,
- bits<32> simdop> {
- defm REPLACE_LANE_#vec_t :
- SIMD_I<(outs V128:$dst), (ins V128:$vec, vec_i8imm_op:$idx, reg_t:$x),
- (outs), (ins vec_i8imm_op:$idx),
- [(set V128:$dst, (vector_insert
- (vec_t V128:$vec), (lane_t reg_t:$x), (i32 imm_t:$idx)))],
- vec#".replace_lane\t$dst, $vec, $idx, $x",
- vec#".replace_lane\t$idx", simdop>;
-}
-
-defm "" : ReplaceLane<v16i8, "i8x16", LaneIdx16, I32, i32, 23>;
-defm "" : ReplaceLane<v8i16, "i16x8", LaneIdx8, I32, i32, 26>;
-defm "" : ReplaceLane<v4i32, "i32x4", LaneIdx4, I32, i32, 28>;
-defm "" : ReplaceLane<v2i64, "i64x2", LaneIdx2, I64, i64, 30>;
-defm "" : ReplaceLane<v4f32, "f32x4", LaneIdx4, F32, f32, 32>;
-defm "" : ReplaceLane<v2f64, "f64x2", LaneIdx2, F64, f64, 34>;
+multiclass ReplaceLane<Vec vec, bits<32> simdop> {
+ defm REPLACE_LANE_#vec :
+ SIMD_I<(outs V128:$dst), (ins V128:$vec, vec_i8imm_op:$idx, vec.lane_rc:$x),
+ (outs), (ins vec_i8imm_op:$idx),
+ [(set V128:$dst, (vector_insert
+ (vec.vt V128:$vec),
+ (vec.lane_vt vec.lane_rc:$x),
+ (i32 vec.lane_idx:$idx)))],
+ vec.prefix#".replace_lane\t$dst, $vec, $idx, $x",
+ vec.prefix#".replace_lane\t$idx", simdop>;
+}
+
+defm "" : ReplaceLane<I8x16, 23>;
+defm "" : ReplaceLane<I16x8, 26>;
+defm "" : ReplaceLane<I32x4, 28>;
+defm "" : ReplaceLane<I64x2, 30>;
+defm "" : ReplaceLane<F32x4, 32>;
+defm "" : ReplaceLane<F64x2, 34>;
// Lower undef lane indices to zero
def : Pat<(vector_insert (v16i8 V128:$vec), I32:$x, undef),
- (REPLACE_LANE_v16i8 V128:$vec, 0, I32:$x)>;
+ (REPLACE_LANE_I8x16 $vec, 0, $x)>;
def : Pat<(vector_insert (v8i16 V128:$vec), I32:$x, undef),
- (REPLACE_LANE_v8i16 V128:$vec, 0, I32:$x)>;
+ (REPLACE_LANE_I16x8 $vec, 0, $x)>;
def : Pat<(vector_insert (v4i32 V128:$vec), I32:$x, undef),
- (REPLACE_LANE_v4i32 V128:$vec, 0, I32:$x)>;
+ (REPLACE_LANE_I32x4 $vec, 0, $x)>;
def : Pat<(vector_insert (v2i64 V128:$vec), I64:$x, undef),
- (REPLACE_LANE_v2i64 V128:$vec, 0, I64:$x)>;
+ (REPLACE_LANE_I64x2 $vec, 0, $x)>;
def : Pat<(vector_insert (v4f32 V128:$vec), F32:$x, undef),
- (REPLACE_LANE_v4f32 V128:$vec, 0, F32:$x)>;
+ (REPLACE_LANE_F32x4 $vec, 0, $x)>;
def : Pat<(vector_insert (v2f64 V128:$vec), F64:$x, undef),
- (REPLACE_LANE_v2f64 V128:$vec, 0, F64:$x)>;
+ (REPLACE_LANE_F64x2 $vec, 0, $x)>;
//===----------------------------------------------------------------------===//
// Comparisons
//===----------------------------------------------------------------------===//
-multiclass SIMDCondition<ValueType vec_t, ValueType out_t, string vec,
- string name, CondCode cond, bits<32> simdop> {
- defm _#vec_t :
+multiclass SIMDCondition<Vec vec, string name, CondCode cond, bits<32> simdop> {
+ defm _#vec :
SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), (outs), (ins),
- [(set (out_t V128:$dst),
- (setcc (vec_t V128:$lhs), (vec_t V128:$rhs), cond)
- )],
- vec#"."#name#"\t$dst, $lhs, $rhs", vec#"."#name, simdop>;
+ [(set (vec.int_vt V128:$dst),
+ (setcc (vec.vt V128:$lhs), (vec.vt V128:$rhs), cond))],
+ vec.prefix#"."#name#"\t$dst, $lhs, $rhs",
+ vec.prefix#"."#name, simdop>;
}
multiclass SIMDConditionInt<string name, CondCode cond, bits<32> baseInst> {
- defm "" : SIMDCondition<v16i8, v16i8, "i8x16", name, cond, baseInst>;
- defm "" : SIMDCondition<v8i16, v8i16, "i16x8", name, cond,
- !add(baseInst, 10)>;
- defm "" : SIMDCondition<v4i32, v4i32, "i32x4", name, cond,
- !add(baseInst, 20)>;
+ defm "" : SIMDCondition<I8x16, name, cond, baseInst>;
+ defm "" : SIMDCondition<I16x8, name, cond, !add(baseInst, 10)>;
+ defm "" : SIMDCondition<I32x4, name, cond, !add(baseInst, 20)>;
}
multiclass SIMDConditionFP<string name, CondCode cond, bits<32> baseInst> {
- defm "" : SIMDCondition<v4f32, v4i32, "f32x4", name, cond, baseInst>;
- defm "" : SIMDCondition<v2f64, v2i64, "f64x2", name, cond,
- !add(baseInst, 6)>;
+ defm "" : SIMDCondition<F32x4, name, cond, baseInst>;
+ defm "" : SIMDCondition<F64x2, name, cond, !add(baseInst, 6)>;
}
// Equality: eq
@@ -499,108 +683,157 @@ defm GE : SIMDConditionFP<"ge", SETOGE, 70>;
// Lower float comparisons that don't care about NaN to standard WebAssembly
// float comparisons. These instructions are generated with nnan and in the
// target-independent expansion of unordered comparisons and ordered ne.
-foreach nodes = [[seteq, EQ_v4f32], [setne, NE_v4f32], [setlt, LT_v4f32],
- [setgt, GT_v4f32], [setle, LE_v4f32], [setge, GE_v4f32]] in
+foreach nodes = [[seteq, EQ_F32x4], [setne, NE_F32x4], [setlt, LT_F32x4],
+ [setgt, GT_F32x4], [setle, LE_F32x4], [setge, GE_F32x4]] in
def : Pat<(v4i32 (nodes[0] (v4f32 V128:$lhs), (v4f32 V128:$rhs))),
- (v4i32 (nodes[1] (v4f32 V128:$lhs), (v4f32 V128:$rhs)))>;
+ (nodes[1] $lhs, $rhs)>;
-foreach nodes = [[seteq, EQ_v2f64], [setne, NE_v2f64], [setlt, LT_v2f64],
- [setgt, GT_v2f64], [setle, LE_v2f64], [setge, GE_v2f64]] in
+foreach nodes = [[seteq, EQ_F64x2], [setne, NE_F64x2], [setlt, LT_F64x2],
+ [setgt, GT_F64x2], [setle, LE_F64x2], [setge, GE_F64x2]] in
def : Pat<(v2i64 (nodes[0] (v2f64 V128:$lhs), (v2f64 V128:$rhs))),
- (v2i64 (nodes[1] (v2f64 V128:$lhs), (v2f64 V128:$rhs)))>;
+ (nodes[1] $lhs, $rhs)>;
+
+// Prototype i64x2.eq
+defm EQ_v2i64 :
+ SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), (outs), (ins),
+ [(set (v2i64 V128:$dst),
+ (int_wasm_eq (v2i64 V128:$lhs), (v2i64 V128:$rhs)))],
+ "i64x2.eq\t$dst, $lhs, $rhs", "i64x2.eq", 192>;
//===----------------------------------------------------------------------===//
// Bitwise operations
//===----------------------------------------------------------------------===//
-multiclass SIMDBinary<ValueType vec_t, string vec, SDNode node, string name,
- bits<32> simdop> {
- defm _#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
- (outs), (ins),
- [(set (vec_t V128:$dst),
- (node (vec_t V128:$lhs), (vec_t V128:$rhs))
- )],
- vec#"."#name#"\t$dst, $lhs, $rhs", vec#"."#name,
- simdop>;
+multiclass SIMDBinary<Vec vec, SDNode node, string name, bits<32> simdop> {
+ defm _#vec : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+ (outs), (ins),
+ [(set (vec.vt V128:$dst),
+ (node (vec.vt V128:$lhs), (vec.vt V128:$rhs)))],
+ vec.prefix#"."#name#"\t$dst, $lhs, $rhs",
+ vec.prefix#"."#name, simdop>;
}
-multiclass SIMDBitwise<SDNode node, string name, bits<32> simdop> {
- defm "" : SIMDBinary<v16i8, "v128", node, name, simdop>;
- defm "" : SIMDBinary<v8i16, "v128", node, name, simdop>;
- defm "" : SIMDBinary<v4i32, "v128", node, name, simdop>;
- defm "" : SIMDBinary<v2i64, "v128", node, name, simdop>;
+multiclass SIMDBitwise<SDNode node, string name, bits<32> simdop, bit commutable = false> {
+ let isCommutable = commutable in
+ defm "" : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+ (outs), (ins), [],
+ "v128."#name#"\t$dst, $lhs, $rhs", "v128."#name, simdop>;
+ foreach vec = IntVecs in
+ def : Pat<(node (vec.vt V128:$lhs), (vec.vt V128:$rhs)),
+ (!cast<NI>(NAME) $lhs, $rhs)>;
}
-multiclass SIMDUnary<ValueType vec_t, string vec, SDNode node, string name,
- bits<32> simdop> {
- defm _#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$vec), (outs), (ins),
- [(set (vec_t V128:$dst),
- (vec_t (node (vec_t V128:$vec)))
- )],
- vec#"."#name#"\t$dst, $vec", vec#"."#name, simdop>;
+multiclass SIMDUnary<Vec vec, SDNode node, string name, bits<32> simdop> {
+ defm _#vec : SIMD_I<(outs V128:$dst), (ins V128:$v), (outs), (ins),
+ [(set (vec.vt V128:$dst),
+ (vec.vt (node (vec.vt V128:$v))))],
+ vec.prefix#"."#name#"\t$dst, $v",
+ vec.prefix#"."#name, simdop>;
}
// Bitwise logic: v128.not
-foreach vec_t = [v16i8, v8i16, v4i32, v2i64] in
-defm NOT: SIMDUnary<vec_t, "v128", vnot, "not", 77>;
+defm NOT : SIMD_I<(outs V128:$dst), (ins V128:$v), (outs), (ins), [],
+ "v128.not\t$dst, $v", "v128.not", 77>;
+foreach vec = IntVecs in
+def : Pat<(vnot (vec.vt V128:$v)), (NOT $v)>;
// Bitwise logic: v128.and / v128.or / v128.xor
-let isCommutable = 1 in {
-defm AND : SIMDBitwise<and, "and", 78>;
-defm OR : SIMDBitwise<or, "or", 80>;
-defm XOR : SIMDBitwise<xor, "xor", 81>;
-} // isCommutable = 1
+defm AND : SIMDBitwise<and, "and", 78, true>;
+defm OR : SIMDBitwise<or, "or", 80, true>;
+defm XOR : SIMDBitwise<xor, "xor", 81, true>;
// Bitwise logic: v128.andnot
def andnot : PatFrag<(ops node:$left, node:$right), (and $left, (vnot $right))>;
defm ANDNOT : SIMDBitwise<andnot, "andnot", 79>;
// Bitwise select: v128.bitselect
-foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in
- defm BITSELECT_#vec_t :
- SIMD_I<(outs V128:$dst), (ins V128:$v1, V128:$v2, V128:$c), (outs), (ins),
- [(set (vec_t V128:$dst),
- (vec_t (int_wasm_bitselect
- (vec_t V128:$v1), (vec_t V128:$v2), (vec_t V128:$c)
- ))
- )],
- "v128.bitselect\t$dst, $v1, $v2, $c", "v128.bitselect", 82>;
+defm BITSELECT :
+ SIMD_I<(outs V128:$dst), (ins V128:$v1, V128:$v2, V128:$c), (outs), (ins), [],
+ "v128.bitselect\t$dst, $v1, $v2, $c", "v128.bitselect", 82>;
+
+foreach vec = AllVecs in
+def : Pat<(vec.vt (int_wasm_bitselect
+ (vec.vt V128:$v1), (vec.vt V128:$v2), (vec.vt V128:$c))),
+ (BITSELECT $v1, $v2, $c)>;
// Bitselect is equivalent to (c & v1) | (~c & v2)
-foreach vec_t = [v16i8, v8i16, v4i32, v2i64] in
- def : Pat<(vec_t (or (and (vec_t V128:$c), (vec_t V128:$v1)),
- (and (vnot V128:$c), (vec_t V128:$v2)))),
- (!cast<Instruction>("BITSELECT_"#vec_t)
- V128:$v1, V128:$v2, V128:$c)>;
+foreach vec = IntVecs in
+def : Pat<(vec.vt (or (and (vec.vt V128:$c), (vec.vt V128:$v1)),
+ (and (vnot V128:$c), (vec.vt V128:$v2)))),
+ (BITSELECT $v1, $v2, $c)>;
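The (c & v1) | (~c & v2) identity above is an ordinary bitwise mux: where a mask bit of c is set the result takes the bit from v1, elsewhere from v2. A scalar C++ sketch of the same identity, with a 64-bit integer standing in for the 128-bit vector:

    #include <cassert>
    #include <cstdint>

    // Scalar model of v128.bitselect: per-bit select between v1 and v2.
    static uint64_t bitselect(uint64_t v1, uint64_t v2, uint64_t c) {
      return (c & v1) | (~c & v2);
    }

    int main() {
      assert(bitselect(0xAAAAAAAAAAAAAAAAu, 0x5555555555555555u,
                       0xFFFF0000FFFF0000u) == 0xAAAA5555AAAA5555u);
      return 0;
    }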
+
+// Also implement vselect in terms of bitselect
+foreach vec = AllVecs in
+def : Pat<(vec.vt (vselect
+ (vec.int_vt V128:$c), (vec.vt V128:$v1), (vec.vt V128:$v2))),
+ (BITSELECT $v1, $v2, $c)>;
+
+// MVP select on v128 values
+defm SELECT_V128 :
+ I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs, I32:$cond), (outs), (ins), [],
+ "v128.select\t$dst, $lhs, $rhs, $cond", "v128.select", 0x1b>;
+
+foreach vec = AllVecs in {
+def : Pat<(select I32:$cond, (vec.vt V128:$lhs), (vec.vt V128:$rhs)),
+ (SELECT_V128 $lhs, $rhs, $cond)>;
+
+// ISD::SELECT requires its operand to conform to getBooleanContents, but
+// WebAssembly's select interprets any non-zero value as true, so we can fold
+// a setne with 0 into a select.
+def : Pat<(select
+ (i32 (setne I32:$cond, 0)), (vec.vt V128:$lhs), (vec.vt V128:$rhs)),
+ (SELECT_V128 $lhs, $rhs, $cond)>;
+
+// And again, this time with seteq instead of setne and the arms reversed.
+def : Pat<(select
+ (i32 (seteq I32:$cond, 0)), (vec.vt V128:$lhs), (vec.vt V128:$rhs)),
+ (SELECT_V128 $rhs, $lhs, $cond)>;
+} // foreach vec
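The two extra patterns only hold because WebAssembly's select treats any non-zero condition as true: (cond != 0 ? a : b) is the same as (cond ? a : b), and (cond == 0 ? a : b) is (cond ? b : a), which is why the seteq form swaps the arms. A scalar C++ check of both folds:

    #include <cassert>

    // Scalar model: wasm select takes any non-zero i32 as "true".
    static int sel(int cond, int a, int b) { return cond ? a : b; }

    int main() {
      const int conds[] = {0, 1, 7, -3};
      for (int cond : conds) {
        assert(sel(cond != 0, 1, 2) == sel(cond, 1, 2)); // setne 0 folded away
        assert(sel(cond == 0, 1, 2) == sel(cond, 2, 1)); // seteq 0: arms swapped
      }
      return 0;
    }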
+
+// Sign select
+multiclass SIMDSignSelect<Vec vec, bits<32> simdop> {
+ defm SIGNSELECT_#vec :
+ SIMD_I<(outs V128:$dst), (ins V128:$v1, V128:$v2, V128:$c), (outs), (ins),
+ [(set (vec.vt V128:$dst),
+ (vec.vt (int_wasm_signselect
+ (vec.vt V128:$v1), (vec.vt V128:$v2), (vec.vt V128:$c))))],
+ vec.prefix#".signselect\t$dst, $v1, $v2, $c",
+ vec.prefix#".signselect", simdop>;
+}
+
+defm : SIMDSignSelect<I8x16, 125>;
+defm : SIMDSignSelect<I16x8, 126>;
+defm : SIMDSignSelect<I32x4, 127>;
+defm : SIMDSignSelect<I64x2, 148>;
//===----------------------------------------------------------------------===//
// Integer unary arithmetic
//===----------------------------------------------------------------------===//
multiclass SIMDUnaryInt<SDNode node, string name, bits<32> baseInst> {
- defm "" : SIMDUnary<v16i8, "i8x16", node, name, baseInst>;
- defm "" : SIMDUnary<v8i16, "i16x8", node, name, !add(baseInst, 32)>;
- defm "" : SIMDUnary<v4i32, "i32x4", node, name, !add(baseInst, 64)>;
- defm "" : SIMDUnary<v2i64, "i64x2", node, name, !add(baseInst, 96)>;
+ defm "" : SIMDUnary<I8x16, node, name, baseInst>;
+ defm "" : SIMDUnary<I16x8, node, name, !add(baseInst, 32)>;
+ defm "" : SIMDUnary<I32x4, node, name, !add(baseInst, 64)>;
+ defm "" : SIMDUnary<I64x2, node, name, !add(baseInst, 96)>;
}
-multiclass SIMDReduceVec<ValueType vec_t, string vec, SDNode op, string name,
- bits<32> simdop> {
- defm _#vec_t : SIMD_I<(outs I32:$dst), (ins V128:$vec), (outs), (ins),
- [(set I32:$dst, (i32 (op (vec_t V128:$vec))))],
- vec#"."#name#"\t$dst, $vec", vec#"."#name, simdop>;
+multiclass SIMDReduceVec<Vec vec, SDNode op, string name, bits<32> simdop> {
+ defm _#vec : SIMD_I<(outs I32:$dst), (ins V128:$vec), (outs), (ins),
+ [(set I32:$dst, (i32 (op (vec.vt V128:$vec))))],
+ vec.prefix#"."#name#"\t$dst, $vec", vec.prefix#"."#name,
+ simdop>;
}
multiclass SIMDReduce<SDNode op, string name, bits<32> baseInst> {
- defm "" : SIMDReduceVec<v16i8, "i8x16", op, name, baseInst>;
- defm "" : SIMDReduceVec<v8i16, "i16x8", op, name, !add(baseInst, 32)>;
- defm "" : SIMDReduceVec<v4i32, "i32x4", op, name, !add(baseInst, 64)>;
- defm "" : SIMDReduceVec<v2i64, "i64x2", op, name, !add(baseInst, 96)>;
+ defm "" : SIMDReduceVec<I8x16, op, name, baseInst>;
+ defm "" : SIMDReduceVec<I16x8, op, name, !add(baseInst, 32)>;
+ defm "" : SIMDReduceVec<I32x4, op, name, !add(baseInst, 64)>;
+ defm "" : SIMDReduceVec<I64x2, op, name, !add(baseInst, 96)>;
}
// Integer vector negation
-def ivneg : PatFrag<(ops node:$in), (sub immAllZerosV, node:$in)>;
+def ivneg : PatFrag<(ops node:$in), (sub immAllZerosV, $in)>;
// Integer absolute value: abs
defm ABS : SIMDUnaryInt<abs, "abs", 96>;
@@ -614,64 +847,56 @@ defm ANYTRUE : SIMDReduce<int_wasm_anytrue, "any_true", 98>;
// All lanes true: all_true
defm ALLTRUE : SIMDReduce<int_wasm_alltrue, "all_true", 99>;
+// Population count: popcnt
+defm POPCNT : SIMDUnary<I8x16, int_wasm_popcnt, "popcnt", 124>;
+
// Reductions already return 0 or 1, so and 1, setne 0, and seteq 1
// can be folded out
foreach reduction =
[["int_wasm_anytrue", "ANYTRUE"], ["int_wasm_alltrue", "ALLTRUE"]] in
-foreach ty = [v16i8, v8i16, v4i32, v2i64] in {
-def : Pat<(i32 (and
- (i32 (!cast<Intrinsic>(reduction[0]) (ty V128:$x))),
- (i32 1)
- )),
- (i32 (!cast<NI>(reduction[1]#"_"#ty) (ty V128:$x)))>;
-def : Pat<(i32 (setne
- (i32 (!cast<Intrinsic>(reduction[0]) (ty V128:$x))),
- (i32 0)
- )),
- (i32 (!cast<NI>(reduction[1]#"_"#ty) (ty V128:$x)))>;
-def : Pat<(i32 (seteq
- (i32 (!cast<Intrinsic>(reduction[0]) (ty V128:$x))),
- (i32 1)
- )),
- (i32 (!cast<NI>(reduction[1]#"_"#ty) (ty V128:$x)))>;
-}
-
-multiclass SIMDBitmask<ValueType vec_t, string vec, bits<32> simdop> {
- defm _#vec_t : SIMD_I<(outs I32:$dst), (ins V128:$vec), (outs), (ins),
- [(set I32:$dst,
- (i32 (int_wasm_bitmask (vec_t V128:$vec)))
- )],
- vec#".bitmask\t$dst, $vec", vec#".bitmask", simdop>;
-}
-
-defm BITMASK : SIMDBitmask<v16i8, "i8x16", 100>;
-defm BITMASK : SIMDBitmask<v8i16, "i16x8", 132>;
-defm BITMASK : SIMDBitmask<v4i32, "i32x4", 164>;
+foreach vec = IntVecs in {
+defvar intrinsic = !cast<Intrinsic>(reduction[0]);
+defvar inst = !cast<NI>(reduction[1]#"_"#vec);
+def : Pat<(i32 (and (i32 (intrinsic (vec.vt V128:$x))), (i32 1))), (inst $x)>;
+def : Pat<(i32 (setne (i32 (intrinsic (vec.vt V128:$x))), (i32 0))), (inst $x)>;
+def : Pat<(i32 (seteq (i32 (intrinsic (vec.vt V128:$x))), (i32 1))), (inst $x)>;
+}
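These folds rely only on the reduction result r being 0 or 1: then r & 1, r != 0, and r == 1 all reproduce r, so the extra boolean massaging disappears. A scalar C++ check:

    #include <cassert>

    int main() {
      // any_true / all_true already produce a canonical 0-or-1 result.
      for (int r = 0; r <= 1; ++r) {
        assert((r & 1) == r);
        assert((r != 0) == r);
        assert((r == 1) == r);
      }
      return 0;
    }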
+
+multiclass SIMDBitmask<Vec vec, bits<32> simdop> {
+ defm _#vec : SIMD_I<(outs I32:$dst), (ins V128:$vec), (outs), (ins),
+ [(set I32:$dst,
+ (i32 (int_wasm_bitmask (vec.vt V128:$vec))))],
+ vec.prefix#".bitmask\t$dst, $vec", vec.prefix#".bitmask",
+ simdop>;
+}
+
+defm BITMASK : SIMDBitmask<I8x16, 100>;
+defm BITMASK : SIMDBitmask<I16x8, 132>;
+defm BITMASK : SIMDBitmask<I32x4, 164>;
+defm BITMASK : SIMDBitmask<I64x2, 196>;
//===----------------------------------------------------------------------===//
// Bit shifts
//===----------------------------------------------------------------------===//
-multiclass SIMDShift<ValueType vec_t, string vec, SDNode node, string name,
- bits<32> simdop> {
- defm _#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$vec, I32:$x),
- (outs), (ins),
- [(set (vec_t V128:$dst), (node V128:$vec, I32:$x))],
- vec#"."#name#"\t$dst, $vec, $x", vec#"."#name, simdop>;
+multiclass SIMDShift<Vec vec, SDNode node, string name, bits<32> simdop> {
+ defm _#vec : SIMD_I<(outs V128:$dst), (ins V128:$vec, I32:$x), (outs), (ins),
+ [(set (vec.vt V128:$dst), (node V128:$vec, I32:$x))],
+ vec.prefix#"."#name#"\t$dst, $vec, $x",
+ vec.prefix#"."#name, simdop>;
}
multiclass SIMDShiftInt<SDNode node, string name, bits<32> baseInst> {
- defm "" : SIMDShift<v16i8, "i8x16", node, name, baseInst>;
- defm "" : SIMDShift<v8i16, "i16x8", node, name, !add(baseInst, 32)>;
- defm "" : SIMDShift<v4i32, "i32x4", node, name, !add(baseInst, 64)>;
- defm "" : SIMDShift<v2i64, "i64x2", node, name, !add(baseInst, 96)>;
+ defm "" : SIMDShift<I8x16, node, name, baseInst>;
+ defm "" : SIMDShift<I16x8, node, name, !add(baseInst, 32)>;
+ defm "" : SIMDShift<I32x4, node, name, !add(baseInst, 64)>;
+ defm "" : SIMDShift<I64x2, node, name, !add(baseInst, 96)>;
}
// WebAssembly SIMD shifts are nonstandard in that the shift amount is
// an i32 rather than a vector, so they need custom nodes.
-def wasm_shift_t : SDTypeProfile<1, 2,
- [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>]
->;
+def wasm_shift_t :
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>]>;
def wasm_shl : SDNode<"WebAssemblyISD::VEC_SHL", wasm_shift_t>;
def wasm_shr_s : SDNode<"WebAssemblyISD::VEC_SHR_S", wasm_shift_t>;
def wasm_shr_u : SDNode<"WebAssemblyISD::VEC_SHR_U", wasm_shift_t>;
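The custom nodes exist because target-independent vector shifts take a whole vector of shift amounts, while i8x16.shl and friends take one scalar i32 that is applied to every lane, reduced modulo the lane width. A scalar C++ model of one i8 lane (hedged: the modulo behaviour follows the SIMD proposal's definition of the shift instructions):

    #include <cassert>
    #include <cstdint>

    // One u8 lane of i8x16.shl: the scalar amount is taken modulo 8.
    static uint8_t lane_shl(uint8_t lane, uint32_t amount) {
      return static_cast<uint8_t>(lane << (amount % 8));
    }

    int main() {
      assert(lane_shl(0x0f, 4) == 0xf0);
      assert(lane_shl(0x0f, 12) == 0xf0); // 12 % 8 == 4
      return 0;
    }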
@@ -688,24 +913,24 @@ defm SHR_U : SIMDShiftInt<wasm_shr_u, "shr_u", 109>;
//===----------------------------------------------------------------------===//
multiclass SIMDBinaryIntNoI8x16<SDNode node, string name, bits<32> baseInst> {
- defm "" : SIMDBinary<v8i16, "i16x8", node, name, !add(baseInst, 32)>;
- defm "" : SIMDBinary<v4i32, "i32x4", node, name, !add(baseInst, 64)>;
- defm "" : SIMDBinary<v2i64, "i64x2", node, name, !add(baseInst, 96)>;
+ defm "" : SIMDBinary<I16x8, node, name, !add(baseInst, 32)>;
+ defm "" : SIMDBinary<I32x4, node, name, !add(baseInst, 64)>;
+ defm "" : SIMDBinary<I64x2, node, name, !add(baseInst, 96)>;
}
multiclass SIMDBinaryIntSmall<SDNode node, string name, bits<32> baseInst> {
- defm "" : SIMDBinary<v16i8, "i8x16", node, name, baseInst>;
- defm "" : SIMDBinary<v8i16, "i16x8", node, name, !add(baseInst, 32)>;
+ defm "" : SIMDBinary<I8x16, node, name, baseInst>;
+ defm "" : SIMDBinary<I16x8, node, name, !add(baseInst, 32)>;
}
multiclass SIMDBinaryIntNoI64x2<SDNode node, string name, bits<32> baseInst> {
defm "" : SIMDBinaryIntSmall<node, name, baseInst>;
- defm "" : SIMDBinary<v4i32, "i32x4", node, name, !add(baseInst, 64)>;
+ defm "" : SIMDBinary<I32x4, node, name, !add(baseInst, 64)>;
}
multiclass SIMDBinaryInt<SDNode node, string name, bits<32> baseInst> {
defm "" : SIMDBinaryIntNoI64x2<node, name, baseInst>;
- defm "" : SIMDBinary<v2i64, "i64x2", node, name, !add(baseInst, 96)>;
+ defm "" : SIMDBinary<I64x2, node, name, !add(baseInst, 96)>;
}
// Integer addition: add / add_saturate_s / add_saturate_u
@@ -736,38 +961,74 @@ defm MAX_U : SIMDBinaryIntNoI64x2<umax, "max_u", 121>;
// Integer unsigned rounding average: avgr_u
let isCommutable = 1 in {
-defm AVGR_U : SIMDBinary<v16i8, "i8x16", int_wasm_avgr_unsigned, "avgr_u", 123>;
-defm AVGR_U : SIMDBinary<v8i16, "i16x8", int_wasm_avgr_unsigned, "avgr_u", 155>;
+defm AVGR_U : SIMDBinary<I8x16, int_wasm_avgr_unsigned, "avgr_u", 123>;
+defm AVGR_U : SIMDBinary<I16x8, int_wasm_avgr_unsigned, "avgr_u", 155>;
}
-def add_nuw : PatFrag<(ops node:$lhs, node:$rhs),
- (add node:$lhs, node:$rhs),
+def add_nuw : PatFrag<(ops node:$lhs, node:$rhs), (add $lhs, $rhs),
"return N->getFlags().hasNoUnsignedWrap();">;
-foreach nodes = [[v16i8, splat16], [v8i16, splat8]] in
+foreach vec = [I8x16, I16x8] in {
+defvar inst = !cast<NI>("AVGR_U_"#vec);
def : Pat<(wasm_shr_u
(add_nuw
- (add_nuw (nodes[0] V128:$lhs), (nodes[0] V128:$rhs)),
- (nodes[1] (i32 1))
- ),
- (i32 1)
- ),
- (!cast<NI>("AVGR_U_"#nodes[0]) V128:$lhs, V128:$rhs)>;
+ (add_nuw (vec.vt V128:$lhs), (vec.vt V128:$rhs)),
+ (vec.splat (i32 1))),
+ (i32 1)),
+ (inst $lhs, $rhs)>;
+}
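The pattern above recognizes the open-coded rounding average: once the adds are known not to wrap (the add_nuw guard), (a + b + 1) >> 1 is exactly what avgr_u computes. A scalar C++ check over all u8 inputs:

    #include <cassert>
    #include <cstdint>

    // Reference rounding average for one u8 lane, as avgr_u defines it.
    static uint8_t avgr_u(uint8_t a, uint8_t b) {
      return static_cast<uint8_t>((unsigned(a) + unsigned(b) + 1) / 2);
    }

    int main() {
      for (unsigned a = 0; a < 256; ++a)
        for (unsigned b = 0; b < 256; ++b)
          assert(avgr_u(uint8_t(a), uint8_t(b)) == ((a + b + 1) >> 1));
      return 0;
    }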
// Widening dot product: i32x4.dot_i16x8_s
let isCommutable = 1 in
defm DOT : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), (outs), (ins),
[(set V128:$dst, (int_wasm_dot V128:$lhs, V128:$rhs))],
"i32x4.dot_i16x8_s\t$dst, $lhs, $rhs", "i32x4.dot_i16x8_s",
- 180>;
+ 186>;
+
+// Extending multiplication: extmul_{low,high}_P_{s,u}
+multiclass SIMDExtBinary<Vec vec, SDNode node, string name, bits<32> simdop> {
+ defm _#vec : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+ (outs), (ins),
+ [(set (vec.vt V128:$dst), (node
+ (vec.split.vt V128:$lhs),(vec.split.vt V128:$rhs)))],
+ vec.prefix#"."#name#"\t$dst, $lhs, $rhs",
+ vec.prefix#"."#name, simdop>;
+}
+
+defm EXTMUL_LOW_S :
+ SIMDExtBinary<I16x8, int_wasm_extmul_low_signed, "extmul_low_i8x16_s", 154>;
+defm EXTMUL_HIGH_S :
+ SIMDExtBinary<I16x8, int_wasm_extmul_high_signed, "extmul_high_i8x16_s", 157>;
+defm EXTMUL_LOW_U :
+ SIMDExtBinary<I16x8, int_wasm_extmul_low_unsigned, "extmul_low_i8x16_u", 158>;
+defm EXTMUL_HIGH_U :
+ SIMDExtBinary<I16x8, int_wasm_extmul_high_unsigned, "extmul_high_i8x16_u", 159>;
+
+defm EXTMUL_LOW_S :
+ SIMDExtBinary<I32x4, int_wasm_extmul_low_signed, "extmul_low_i16x8_s", 187>;
+defm EXTMUL_HIGH_S :
+ SIMDExtBinary<I32x4, int_wasm_extmul_high_signed, "extmul_high_i16x8_s", 189>;
+defm EXTMUL_LOW_U :
+ SIMDExtBinary<I32x4, int_wasm_extmul_low_unsigned, "extmul_low_i16x8_u", 190>;
+defm EXTMUL_HIGH_U :
+ SIMDExtBinary<I32x4, int_wasm_extmul_high_unsigned, "extmul_high_i16x8_u", 191>;
+
+defm EXTMUL_LOW_S :
+ SIMDExtBinary<I64x2, int_wasm_extmul_low_signed, "extmul_low_i32x4_s", 210>;
+defm EXTMUL_HIGH_S :
+ SIMDExtBinary<I64x2, int_wasm_extmul_high_signed, "extmul_high_i32x4_s", 211>;
+defm EXTMUL_LOW_U :
+ SIMDExtBinary<I64x2, int_wasm_extmul_low_unsigned, "extmul_low_i32x4_u", 214>;
+defm EXTMUL_HIGH_U :
+ SIMDExtBinary<I64x2, int_wasm_extmul_high_unsigned, "extmul_high_i32x4_u", 215>;
//===----------------------------------------------------------------------===//
// Floating-point unary arithmetic
//===----------------------------------------------------------------------===//
multiclass SIMDUnaryFP<SDNode node, string name, bits<32> baseInst> {
- defm "" : SIMDUnary<v4f32, "f32x4", node, name, baseInst>;
- defm "" : SIMDUnary<v2f64, "f64x2", node, name, !add(baseInst, 12)>;
+ defm "" : SIMDUnary<F32x4, node, name, baseInst>;
+ defm "" : SIMDUnary<F64x2, node, name, !add(baseInst, 12)>;
}
// Absolute value: abs
@@ -780,22 +1041,22 @@ defm NEG : SIMDUnaryFP<fneg, "neg", 225>;
defm SQRT : SIMDUnaryFP<fsqrt, "sqrt", 227>;
// Rounding: ceil, floor, trunc, nearest
-defm CEIL : SIMDUnary<v4f32, "f32x4", int_wasm_ceil, "ceil", 216>;
-defm FLOOR : SIMDUnary<v4f32, "f32x4", int_wasm_floor, "floor", 217>;
-defm TRUNC: SIMDUnary<v4f32, "f32x4", int_wasm_trunc, "trunc", 218>;
-defm NEAREST: SIMDUnary<v4f32, "f32x4", int_wasm_nearest, "nearest", 219>;
-defm CEIL : SIMDUnary<v2f64, "f64x2", int_wasm_ceil, "ceil", 220>;
-defm FLOOR : SIMDUnary<v2f64, "f64x2", int_wasm_floor, "floor", 221>;
-defm TRUNC: SIMDUnary<v2f64, "f64x2", int_wasm_trunc, "trunc", 222>;
-defm NEAREST: SIMDUnary<v2f64, "f64x2", int_wasm_nearest, "nearest", 223>;
+defm CEIL : SIMDUnary<F32x4, int_wasm_ceil, "ceil", 216>;
+defm FLOOR : SIMDUnary<F32x4, int_wasm_floor, "floor", 217>;
+defm TRUNC: SIMDUnary<F32x4, int_wasm_trunc, "trunc", 218>;
+defm NEAREST: SIMDUnary<F32x4, int_wasm_nearest, "nearest", 219>;
+defm CEIL : SIMDUnary<F64x2, int_wasm_ceil, "ceil", 220>;
+defm FLOOR : SIMDUnary<F64x2, int_wasm_floor, "floor", 221>;
+defm TRUNC: SIMDUnary<F64x2, int_wasm_trunc, "trunc", 222>;
+defm NEAREST: SIMDUnary<F64x2, int_wasm_nearest, "nearest", 223>;
//===----------------------------------------------------------------------===//
// Floating-point binary arithmetic
//===----------------------------------------------------------------------===//
multiclass SIMDBinaryFP<SDNode node, string name, bits<32> baseInst> {
- defm "" : SIMDBinary<v4f32, "f32x4", node, name, baseInst>;
- defm "" : SIMDBinary<v2f64, "f64x2", node, name, !add(baseInst, 12)>;
+ defm "" : SIMDBinary<F32x4, node, name, baseInst>;
+ defm "" : SIMDBinary<F64x2, node, name, !add(baseInst, 12)>;
}
// Addition: add
@@ -828,63 +1089,151 @@ defm PMAX : SIMDBinaryFP<int_wasm_pmax, "pmax", 235>;
// Conversions
//===----------------------------------------------------------------------===//
-multiclass SIMDConvert<ValueType vec_t, ValueType arg_t, SDNode op,
- string name, bits<32> simdop> {
- defm op#_#vec_t#_#arg_t :
+multiclass SIMDConvert<Vec vec, Vec arg, SDNode op, string name,
+ bits<32> simdop> {
+ defm op#_#vec :
SIMD_I<(outs V128:$dst), (ins V128:$vec), (outs), (ins),
- [(set (vec_t V128:$dst), (vec_t (op (arg_t V128:$vec))))],
- name#"\t$dst, $vec", name, simdop>;
+ [(set (vec.vt V128:$dst), (vec.vt (op (arg.vt V128:$vec))))],
+ vec.prefix#"."#name#"\t$dst, $vec", vec.prefix#"."#name, simdop>;
}
// Floating point to integer with saturation: trunc_sat
-defm "" : SIMDConvert<v4i32, v4f32, fp_to_sint, "i32x4.trunc_sat_f32x4_s", 248>;
-defm "" : SIMDConvert<v4i32, v4f32, fp_to_uint, "i32x4.trunc_sat_f32x4_u", 249>;
+defm "" : SIMDConvert<I32x4, F32x4, fp_to_sint, "trunc_sat_f32x4_s", 248>;
+defm "" : SIMDConvert<I32x4, F32x4, fp_to_uint, "trunc_sat_f32x4_u", 249>;
// Integer to floating point: convert
-defm "" : SIMDConvert<v4f32, v4i32, sint_to_fp, "f32x4.convert_i32x4_s", 250>;
-defm "" : SIMDConvert<v4f32, v4i32, uint_to_fp, "f32x4.convert_i32x4_u", 251>;
+defm "" : SIMDConvert<F32x4, I32x4, sint_to_fp, "convert_i32x4_s", 250>;
+defm "" : SIMDConvert<F32x4, I32x4, uint_to_fp, "convert_i32x4_u", 251>;
+
+// Lower llvm.wasm.trunc.saturate.* to saturating instructions
+def : Pat<(v4i32 (int_wasm_trunc_saturate_signed (v4f32 V128:$src))),
+ (fp_to_sint_I32x4 $src)>;
+def : Pat<(v4i32 (int_wasm_trunc_saturate_unsigned (v4f32 V128:$src))),
+ (fp_to_uint_I32x4 $src)>;
// Widening operations
-multiclass SIMDWiden<ValueType vec_t, string vec, ValueType arg_t, string arg,
- bits<32> baseInst> {
- defm "" : SIMDConvert<vec_t, arg_t, int_wasm_widen_low_signed,
- vec#".widen_low_"#arg#"_s", baseInst>;
- defm "" : SIMDConvert<vec_t, arg_t, int_wasm_widen_high_signed,
- vec#".widen_high_"#arg#"_s", !add(baseInst, 1)>;
- defm "" : SIMDConvert<vec_t, arg_t, int_wasm_widen_low_unsigned,
- vec#".widen_low_"#arg#"_u", !add(baseInst, 2)>;
- defm "" : SIMDConvert<vec_t, arg_t, int_wasm_widen_high_unsigned,
- vec#".widen_high_"#arg#"_u", !add(baseInst, 3)>;
+def widen_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
+def widen_low_s : SDNode<"WebAssemblyISD::WIDEN_LOW_S", widen_t>;
+def widen_high_s : SDNode<"WebAssemblyISD::WIDEN_HIGH_S", widen_t>;
+def widen_low_u : SDNode<"WebAssemblyISD::WIDEN_LOW_U", widen_t>;
+def widen_high_u : SDNode<"WebAssemblyISD::WIDEN_HIGH_U", widen_t>;
+
+// TODO: refactor this to be uniform for i64x2 if the numbering is not changed.
+multiclass SIMDWiden<Vec vec, bits<32> baseInst> {
+ defm "" : SIMDConvert<vec, vec.split, widen_low_s,
+ "widen_low_"#vec.split.prefix#"_s", baseInst>;
+ defm "" : SIMDConvert<vec, vec.split, widen_high_s,
+ "widen_high_"#vec.split.prefix#"_s", !add(baseInst, 1)>;
+ defm "" : SIMDConvert<vec, vec.split, widen_low_u,
+ "widen_low_"#vec.split.prefix#"_u", !add(baseInst, 2)>;
+ defm "" : SIMDConvert<vec, vec.split, widen_high_u,
+ "widen_high_"#vec.split.prefix#"_u", !add(baseInst, 3)>;
}
-defm "" : SIMDWiden<v8i16, "i16x8", v16i8, "i8x16", 135>;
-defm "" : SIMDWiden<v4i32, "i32x4", v8i16, "i16x8", 167>;
+defm "" : SIMDWiden<I16x8, 135>;
+defm "" : SIMDWiden<I32x4, 167>;
+
+defm "" : SIMDConvert<I64x2, I32x4, int_wasm_widen_low_signed,
+ "widen_low_i32x4_s", 199>;
+defm "" : SIMDConvert<I64x2, I32x4, int_wasm_widen_high_signed,
+ "widen_high_i32x4_s", 200>;
+defm "" : SIMDConvert<I64x2, I32x4, int_wasm_widen_low_unsigned,
+ "widen_low_i32x4_u", 201>;
+defm "" : SIMDConvert<I64x2, I32x4, int_wasm_widen_high_unsigned,
+ "widen_high_i32x4_u", 202>;
// Narrowing operations
-multiclass SIMDNarrow<ValueType vec_t, string vec, ValueType arg_t, string arg,
- bits<32> baseInst> {
- defm NARROW_S_#vec_t :
+multiclass SIMDNarrow<Vec vec, bits<32> baseInst> {
+ defvar name = vec.split.prefix#".narrow_"#vec.prefix;
+ defm NARROW_S_#vec.split :
SIMD_I<(outs V128:$dst), (ins V128:$low, V128:$high), (outs), (ins),
- [(set (vec_t V128:$dst), (vec_t (int_wasm_narrow_signed
- (arg_t V128:$low), (arg_t V128:$high))))],
- vec#".narrow_"#arg#"_s\t$dst, $low, $high", vec#".narrow_"#arg#"_s",
- baseInst>;
- defm NARROW_U_#vec_t :
+ [(set (vec.split.vt V128:$dst), (vec.split.vt (int_wasm_narrow_signed
+ (vec.vt V128:$low), (vec.vt V128:$high))))],
+ name#"_s\t$dst, $low, $high", name#"_s", baseInst>;
+ defm NARROW_U_#vec.split :
SIMD_I<(outs V128:$dst), (ins V128:$low, V128:$high), (outs), (ins),
- [(set (vec_t V128:$dst), (vec_t (int_wasm_narrow_unsigned
- (arg_t V128:$low), (arg_t V128:$high))))],
- vec#".narrow_"#arg#"_u\t$dst, $low, $high", vec#".narrow_"#arg#"_u",
- !add(baseInst, 1)>;
+ [(set (vec.split.vt V128:$dst), (vec.split.vt (int_wasm_narrow_unsigned
+ (vec.vt V128:$low), (vec.vt V128:$high))))],
+ name#"_u\t$dst, $low, $high", name#"_u", !add(baseInst, 1)>;
}
-defm "" : SIMDNarrow<v16i8, "i8x16", v8i16, "i16x8", 101>;
-defm "" : SIMDNarrow<v8i16, "i16x8", v4i32, "i32x4", 133>;
+defm "" : SIMDNarrow<I16x8, 101>;
+defm "" : SIMDNarrow<I32x4, 133>;
+
+// Use narrowing operations for truncating stores. Since the narrowing
+// operations are saturating instead of truncating, we need to mask
+// the stored values first.
+// TODO: Use consts instead of splats
+def store_v8i8_trunc_v8i16 :
+ OutPatFrag<(ops node:$val),
+ (EXTRACT_LANE_I64x2
+ (NARROW_U_I8x16
+ (AND (SPLAT_I32x4 (CONST_I32 0x00ff00ff)), node:$val),
+ $val), // Unused input
+ 0)>;
+
+def store_v4i16_trunc_v4i32 :
+ OutPatFrag<(ops node:$val),
+ (EXTRACT_LANE_I64x2
+ (NARROW_U_I16x8
+ (AND (SPLAT_I32x4 (CONST_I32 0x0000ffff)), node:$val),
+ $val), // Unused input
+ 0)>;
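The AND with 0x00ff00ff (or 0x0000ffff) is what makes the saturating narrow act like a truncation: narrow_u clamps any lane above the destination's unsigned range to the maximum, so each lane is masked down to its low byte (or half-word) first and only then narrowed, with the unused upper half of the result discarded by the i64x2.extract_lane 0. A scalar C++ sketch of one lane of the i16-to-i8 case:

    #include <cassert>
    #include <cstdint>

    // One lane of i8x16.narrow_i16x8_u: unsigned saturation to 8 bits.
    static uint8_t narrow_u(uint16_t x) {
      return x > 0xff ? 0xff : static_cast<uint8_t>(x);
    }

    int main() {
      uint16_t v = 0x1234;
      assert(narrow_u(v) == 0xff);          // saturated: wrong for a truncstore
      assert(narrow_u(v & 0x00ff) == 0x34); // masked first: equals truncation
      assert(static_cast<uint8_t>(v) == 0x34);
      return 0;
    }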
+
+// Store patterns adapted from WebAssemblyInstrMemory.td
+multiclass NarrowingStorePatNoOffset<Vec vec, OutPatFrag out> {
+ defvar node = !cast<PatFrag>("truncstorevi"#vec.split.lane_bits);
+ def : Pat<(node vec.vt:$val, I32:$addr),
+ (STORE_I64_A32 0, 0, $addr, (out $val))>,
+ Requires<[HasAddr32]>;
+ def : Pat<(node vec.vt:$val, I64:$addr),
+ (STORE_I64_A64 0, 0, $addr, (out $val))>,
+ Requires<[HasAddr64]>;
+}
-// Lower llvm.wasm.trunc.saturate.* to saturating instructions
-def : Pat<(v4i32 (int_wasm_trunc_saturate_signed (v4f32 V128:$src))),
- (fp_to_sint_v4i32_v4f32 (v4f32 V128:$src))>;
-def : Pat<(v4i32 (int_wasm_trunc_saturate_unsigned (v4f32 V128:$src))),
- (fp_to_uint_v4i32_v4f32 (v4f32 V128:$src))>;
+defm : NarrowingStorePatNoOffset<I16x8, store_v8i8_trunc_v8i16>;
+defm : NarrowingStorePatNoOffset<I32x4, store_v4i16_trunc_v4i32>;
+
+multiclass NarrowingStorePatImmOff<Vec vec, PatFrag operand, OutPatFrag out> {
+ defvar node = !cast<PatFrag>("truncstorevi"#vec.split.lane_bits);
+ def : Pat<(node vec.vt:$val, (operand I32:$addr, imm:$off)),
+ (STORE_I64_A32 0, imm:$off, $addr, (out $val))>,
+ Requires<[HasAddr32]>;
+ def : Pat<(node vec.vt:$val, (operand I64:$addr, imm:$off)),
+ (STORE_I64_A64 0, imm:$off, $addr, (out $val))>,
+ Requires<[HasAddr64]>;
+}
+
+defm : NarrowingStorePatImmOff<I16x8, regPlusImm, store_v8i8_trunc_v8i16>;
+defm : NarrowingStorePatImmOff<I32x4, regPlusImm, store_v4i16_trunc_v4i32>;
+defm : NarrowingStorePatImmOff<I16x8, or_is_add, store_v8i8_trunc_v8i16>;
+defm : NarrowingStorePatImmOff<I32x4, or_is_add, store_v4i16_trunc_v4i32>;
+
+multiclass NarrowingStorePatOffsetOnly<Vec vec, OutPatFrag out> {
+ defvar node = !cast<PatFrag>("truncstorevi"#vec.split.lane_bits);
+ def : Pat<(node vec.vt:$val, imm:$off),
+ (STORE_I64_A32 0, imm:$off, (CONST_I32 0), (out $val))>,
+ Requires<[HasAddr32]>;
+ def : Pat<(node vec.vt:$val, imm:$off),
+ (STORE_I64_A64 0, imm:$off, (CONST_I64 0), (out $val))>,
+ Requires<[HasAddr64]>;
+}
+
+defm : NarrowingStorePatOffsetOnly<I16x8, store_v8i8_trunc_v8i16>;
+defm : NarrowingStorePatOffsetOnly<I32x4, store_v4i16_trunc_v4i32>;
+
+multiclass NarrowingStorePatGlobalAddrOffOnly<Vec vec, OutPatFrag out> {
+ defvar node = !cast<PatFrag>("truncstorevi"#vec.split.lane_bits);
+ def : Pat<(node vec.vt:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (STORE_I64_A32 0, tglobaladdr:$off, (CONST_I32 0), (out $val))>,
+ Requires<[IsNotPIC, HasAddr32]>;
+ def : Pat<(node vec.vt:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (STORE_I64_A64 0, tglobaladdr:$off, (CONST_I64 0), (out $val))>,
+ Requires<[IsNotPIC, HasAddr64]>;
+}
+
+defm : NarrowingStorePatGlobalAddrOffOnly<I16x8, store_v8i8_trunc_v8i16>;
+defm : NarrowingStorePatGlobalAddrOffOnly<I32x4, store_v4i16_trunc_v4i32>;
// Bitcasts are nops
// Matching bitcast t1 to t1 causes strange errors, so avoid repeating types
@@ -897,24 +1246,96 @@ foreach t2 = !foldl(
) in
def : Pat<(t1 (bitconvert (t2 V128:$v))), (t1 V128:$v)>;
+// Extended pairwise addition
+defm "" : SIMDConvert<I16x8, I8x16, int_wasm_extadd_pairwise_signed,
+ "extadd_pairwise_i8x16_s", 0xc2>;
+defm "" : SIMDConvert<I16x8, I8x16, int_wasm_extadd_pairwise_unsigned,
+ "extadd_pairwise_i8x16_u", 0xc3>;
+defm "" : SIMDConvert<I32x4, I16x8, int_wasm_extadd_pairwise_signed,
+ "extadd_pairwise_i16x8_s", 0xa5>;
+defm "" : SIMDConvert<I32x4, I16x8, int_wasm_extadd_pairwise_unsigned,
+ "extadd_pairwise_i16x8_u", 0xa6>;
+
+
+// Prototype f64x2 conversions
+defm "" : SIMDConvert<F64x2, I32x4, int_wasm_convert_low_signed,
+ "convert_low_i32x4_s", 0x53>;
+defm "" : SIMDConvert<F64x2, I32x4, int_wasm_convert_low_unsigned,
+ "convert_low_i32x4_u", 0x54>;
+    SIMD_I<(outs),
+ "trunc_sat_zero_f64x2_s", 0x55>;
+defm "" : SIMDConvert<I32x4, F64x2, int_wasm_trunc_saturate_zero_unsigned,
+ "trunc_sat_zero_f64x2_u", 0x56>;
+defm "" : SIMDConvert<F32x4, F64x2, int_wasm_demote_zero,
+ "demote_zero_f64x2", 0x57>;
+defm "" : SIMDConvert<F64x2, F32x4, int_wasm_promote_low,
+ "promote_low_f32x4", 0x69>;
+
//===----------------------------------------------------------------------===//
// Quasi-Fused Multiply-Add and Subtract (QFMA/QFMS)
//===----------------------------------------------------------------------===//
-multiclass SIMDQFM<ValueType vec_t, string vec, bits<32> baseInst> {
- defm QFMA_#vec_t :
+multiclass SIMDQFM<Vec vec, bits<32> simdopA, bits<32> simdopS> {
+ defm QFMA_#vec :
SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c),
(outs), (ins),
- [(set (vec_t V128:$dst),
- (int_wasm_qfma (vec_t V128:$a), (vec_t V128:$b), (vec_t V128:$c)))],
- vec#".qfma\t$dst, $a, $b, $c", vec#".qfma", baseInst>;
- defm QFMS_#vec_t :
+ [(set (vec.vt V128:$dst), (int_wasm_qfma
+ (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))],
+ vec.prefix#".qfma\t$dst, $a, $b, $c", vec.prefix#".qfma", simdopA>;
+ defm QFMS_#vec :
SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c),
(outs), (ins),
- [(set (vec_t V128:$dst),
- (int_wasm_qfms (vec_t V128:$a), (vec_t V128:$b), (vec_t V128:$c)))],
- vec#".qfms\t$dst, $a, $b, $c", vec#".qfms", !add(baseInst, 1)>;
+ [(set (vec.vt V128:$dst), (int_wasm_qfms
+ (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))],
+ vec.prefix#".qfms\t$dst, $a, $b, $c", vec.prefix#".qfms", simdopS>;
}
-defm "" : SIMDQFM<v4f32, "f32x4", 252>;
-defm "" : SIMDQFM<v2f64, "f64x2", 254>;
+defm "" : SIMDQFM<F32x4, 180, 212>;
+defm "" : SIMDQFM<F64x2, 254, 255>;
+
+//===----------------------------------------------------------------------===//
+// Saturating Rounding Q-Format Multiplication
+//===----------------------------------------------------------------------===//
+
+defm Q15MULR_SAT_S :
+ SIMDBinary<I16x8, int_wasm_q15mulr_saturate_signed, "q15mulr_sat_s", 156>;
+
+//===----------------------------------------------------------------------===//
+// Experimental prefetch instructions: prefetch.t, prefetch.nt
+//===----------------------------------------------------------------------===//
+
+let mayLoad = true, UseNamedOperandTable = true in {
+defm PREFETCH_T_A32 :
+ SIMD_I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ (outs), (ins P2Align:$p2align, offset32_op:$off), [],
+ "prefetch.t\t${off}(${addr})$p2align",
+ "prefetch.t\t$off$p2align", 0xc5>;
+defm PREFETCH_T_A64 :
+ SIMD_I<(outs), (ins P2Align:$p2align, offset64_op:$off, I64:$addr),
+ (outs), (ins P2Align:$p2align, offset64_op:$off), [],
+ "prefetch.t\t${off}(${addr})$p2align",
+ "prefetch.t\t$off$p2align", 0xc5>;
+defm PREFETCH_NT_A32 :
+ SIMD_I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ (outs), (ins P2Align:$p2align, offset32_op:$off), [],
+ "prefetch.nt\t${off}(${addr})$p2align",
+ "prefetch.nt\t$off$p2align", 0xc6>;
+defm PREFETCH_NT_A64 :
+ SIMD_I<(outs), (ins P2Align:$p2align, offset64_op:$off, I64:$addr),
+ (outs), (ins P2Align:$p2align, offset64_op:$off), [],
+ "prefetch.nt\t${off}(${addr})$p2align",
+ "prefetch.nt\t$off$p2align", 0xc6>;
+} // mayLoad, UseNamedOperandTable
+
+multiclass PrefetchPatNoOffset<PatFrag kind, string inst> {
+ def : Pat<(kind I32:$addr), (!cast<NI>(inst # "_A32") 0, 0, $addr)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(kind I64:$addr), (!cast<NI>(inst # "_A64") 0, 0, $addr)>,
+ Requires<[HasAddr64]>;
+}
+
+foreach inst = [["PREFETCH_T", "int_wasm_prefetch_t"],
+ ["PREFETCH_NT", "int_wasm_prefetch_nt"]] in {
+defvar node = !cast<Intrinsic>(inst[1]);
+defm : PrefetchPatNoOffset<node, inst[0]>;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td
new file mode 100644
index 000000000000..97638c3494ae
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td
@@ -0,0 +1,64 @@
+// WebAssemblyInstrTable.td - WebAssembly Table codegen support -*- tablegen -*-
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// WebAssembly Table operand code-gen constructs.
+/// Instructions that handle tables
+//===----------------------------------------------------------------------===//
+
+
+multiclass TABLE<WebAssemblyRegClass rt> {
+ defm TABLE_GET_#rt : I<(outs rt:$res), (ins table32_op:$table),
+ (outs), (ins table32_op:$table),
+ [],
+ "table.get\t$res, $table",
+ "table.get\t$table",
+ 0x25>;
+
+ defm TABLE_SET_#rt : I<(outs), (ins table32_op:$table, rt:$val, I32:$i),
+ (outs), (ins table32_op:$table),
+ [],
+ "table.set\t$table, $val, $i",
+ "table.set\t$table",
+ 0x26>;
+
+ defm TABLE_GROW_#rt : I<(outs I32:$sz), (ins table32_op:$table, I32:$n, rt:$val),
+ (outs), (ins table32_op:$table),
+ [],
+ "table.grow\t$sz, $table, $n, $val",
+ "table.grow\t$table",
+ 0xfc0f>;
+
+ defm TABLE_FILL_#rt : I<(outs), (ins table32_op:$table, I32:$n, rt:$val, I32:$i),
+ (outs), (ins table32_op:$table),
+ [],
+ "table.fill\t$table, $n, $val, $i",
+ "table.fill\t$table",
+ 0xfc11>;
+
+}
+
+defm "" : TABLE<FUNCREF>, Requires<[HasReferenceTypes]>;
+defm "" : TABLE<EXTERNREF>, Requires<[HasReferenceTypes]>;
+
+defm TABLE_SIZE : I<(outs I32:$sz), (ins table32_op:$table),
+ (outs), (ins table32_op:$table),
+ [],
+ "table.size\t$sz, $table",
+ "table.size\t$table",
+ 0xfc10>,
+ Requires<[HasReferenceTypes]>;
+
+
+defm TABLE_COPY : I<(outs), (ins table32_op:$table1, table32_op:$table2, I32:$n, I32:$s, I32:$d),
+ (outs), (ins table32_op:$table1, table32_op:$table2),
+ [],
+ "table.copy\t$table1, $table2, $n, $s, $d",
+ "table.copy\t$table1, $table2",
+ 0xfc0e>,
+ Requires<[HasReferenceTypes]>;
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
index 346938daf1aa..e07dae65fc4a 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
@@ -15,7 +15,7 @@
#include "WebAssembly.h"
#include "WebAssemblySubtarget.h"
#include "WebAssemblyUtilities.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/WasmEHFuncInfo.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -32,15 +32,17 @@ class WebAssemblyLateEHPrepare final : public MachineFunctionPass {
}
bool runOnMachineFunction(MachineFunction &MF) override;
+ bool removeUnreachableEHPads(MachineFunction &MF);
void recordCatchRetBBs(MachineFunction &MF);
- bool addCatches(MachineFunction &MF);
+ bool hoistCatches(MachineFunction &MF);
+ bool addCatchAlls(MachineFunction &MF);
bool replaceFuncletReturns(MachineFunction &MF);
bool removeUnnecessaryUnreachables(MachineFunction &MF);
- bool addExceptionExtraction(MachineFunction &MF);
+ bool ensureSingleBBTermPads(MachineFunction &MF);
bool restoreStackPointer(MachineFunction &MF);
MachineBasicBlock *getMatchingEHPad(MachineInstr *MI);
- SmallSet<MachineBasicBlock *, 8> CatchRetBBs;
+ SmallPtrSet<MachineBasicBlock *, 8> CatchRetBBs;
public:
static char ID; // Pass identification, replacement for typeid
@@ -94,15 +96,18 @@ WebAssemblyLateEHPrepare::getMatchingEHPad(MachineInstr *MI) {
template <typename Container>
static void eraseDeadBBsAndChildren(const Container &MBBs) {
SmallVector<MachineBasicBlock *, 8> WL(MBBs.begin(), MBBs.end());
+ SmallPtrSet<MachineBasicBlock *, 8> Deleted;
while (!WL.empty()) {
MachineBasicBlock *MBB = WL.pop_back_val();
- if (!MBB->pred_empty())
+ if (Deleted.count(MBB) || !MBB->pred_empty())
continue;
- SmallVector<MachineBasicBlock *, 4> Succs(MBB->succ_begin(),
- MBB->succ_end());
+ SmallVector<MachineBasicBlock *, 4> Succs(MBB->successors());
WL.append(MBB->succ_begin(), MBB->succ_end());
for (auto *Succ : Succs)
MBB->removeSuccessor(Succ);
+ // To prevent deleting the same BB multiple times, which can happen when
+ // 'MBBs' contains both a parent and a child
+ Deleted.insert(MBB);
MBB->eraseFromParent();
}
}
@@ -118,21 +123,33 @@ bool WebAssemblyLateEHPrepare::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
if (MF.getFunction().hasPersonalityFn()) {
+ Changed |= removeUnreachableEHPads(MF);
recordCatchRetBBs(MF);
- Changed |= addCatches(MF);
+ Changed |= hoistCatches(MF);
+ Changed |= addCatchAlls(MF);
Changed |= replaceFuncletReturns(MF);
+ Changed |= ensureSingleBBTermPads(MF);
}
Changed |= removeUnnecessaryUnreachables(MF);
- if (MF.getFunction().hasPersonalityFn()) {
- Changed |= addExceptionExtraction(MF);
+ if (MF.getFunction().hasPersonalityFn())
Changed |= restoreStackPointer(MF);
- }
return Changed;
}
-// Record which BB ends with 'CATCHRET' instruction, because this will be
-// replaced with BRs later. This set of 'CATCHRET' BBs is necessary in
-// 'getMatchingEHPad' function.
+// Remove unreachable EH pads and their children. If they remain, CFG
+// stackification can be tricky.
+bool WebAssemblyLateEHPrepare::removeUnreachableEHPads(MachineFunction &MF) {
+ SmallVector<MachineBasicBlock *, 4> ToDelete;
+ for (auto &MBB : MF)
+ if (MBB.isEHPad() && MBB.pred_empty())
+ ToDelete.push_back(&MBB);
+ eraseDeadBBsAndChildren(ToDelete);
+ return !ToDelete.empty();
+}
+
+// Record which BBs end with a catchret instruction, because these will be
+// replaced with 'br's later. This set of catchret BBs is necessary in the
+// 'getMatchingEHPad' function.
void WebAssemblyLateEHPrepare::recordCatchRetBBs(MachineFunction &MF) {
CatchRetBBs.clear();
for (auto &MBB : MF) {
@@ -145,25 +162,69 @@ void WebAssemblyLateEHPrepare::recordCatchRetBBs(MachineFunction &MF) {
}
}
-// Add catch instruction to beginning of catchpads and cleanuppads.
-bool WebAssemblyLateEHPrepare::addCatches(MachineFunction &MF) {
+// Hoist catch instructions to the beginning of their matching EH pad BBs in
+// case:
+// (1) the catch instruction is not the first instruction in the EH pad.
+// ehpad:
+// some_other_instruction
+// ...
+// %exn = catch 0
+// (2) the catch instruction is in a non-EH pad BB. For example,
+// ehpad:
+// br bb0
+// bb0:
+// %exn = catch 0
+bool WebAssemblyLateEHPrepare::hoistCatches(MachineFunction &MF) {
+ bool Changed = false;
+ SmallVector<MachineInstr *, 16> Catches;
+ for (auto &MBB : MF)
+ for (auto &MI : MBB)
+ if (WebAssembly::isCatch(MI.getOpcode()))
+ Catches.push_back(&MI);
+
+ for (auto *Catch : Catches) {
+ MachineBasicBlock *EHPad = getMatchingEHPad(Catch);
+ assert(EHPad && "No matching EH pad for catch");
+ auto InsertPos = EHPad->begin();
+ // Skip EH_LABELs in the beginning of an EH pad if present. We don't use
+ // these labels at the moment, but other targets also seem to have an
+ // EH_LABEL instruction in the beginning of an EH pad.
+ while (InsertPos != EHPad->end() && InsertPos->isEHLabel())
+ InsertPos++;
+ if (InsertPos == Catch)
+ continue;
+ Changed = true;
+ EHPad->insert(InsertPos, Catch->removeFromParent());
+ }
+ return Changed;
+}
+
+// Add catch_all to beginning of cleanup pads.
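+// For example, a cleanup pad that does not begin with a catch:
+//   ehpad:
+//     some_instruction
+//     ...
+// becomes
+//   ehpad:
+//     catch_all
+//     some_instruction
+//     ...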
+bool WebAssemblyLateEHPrepare::addCatchAlls(MachineFunction &MF) {
bool Changed = false;
const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
- MachineRegisterInfo &MRI = MF.getRegInfo();
+
for (auto &MBB : MF) {
- if (MBB.isEHPad()) {
+ if (!MBB.isEHPad())
+ continue;
+ auto InsertPos = MBB.begin();
+ // Skip EH_LABELs in the beginning of an EH pad if present.
+ while (InsertPos != MBB.end() && InsertPos->isEHLabel())
+ InsertPos++;
+ // This runs after hoistCatches(), so we assume that if there is a catch, it
+ // should be the first non-EH-label instruction in an EH pad.
+ if (InsertPos == MBB.end() ||
+ !WebAssembly::isCatch(InsertPos->getOpcode())) {
Changed = true;
- auto InsertPos = MBB.begin();
- if (InsertPos->isEHLabel()) // EH pad starts with an EH label
- ++InsertPos;
- Register DstReg = MRI.createVirtualRegister(&WebAssembly::EXNREFRegClass);
- BuildMI(MBB, InsertPos, MBB.begin()->getDebugLoc(),
- TII.get(WebAssembly::CATCH), DstReg);
+ BuildMI(MBB, InsertPos, InsertPos->getDebugLoc(),
+ TII.get(WebAssembly::CATCH_ALL));
}
}
return Changed;
}
+// Replace pseudo-instructions catchret and cleanupret with br and rethrow
+// respectively.
bool WebAssemblyLateEHPrepare::replaceFuncletReturns(MachineFunction &MF) {
bool Changed = false;
const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
@@ -185,17 +246,11 @@ bool WebAssemblyLateEHPrepare::replaceFuncletReturns(MachineFunction &MF) {
Changed = true;
break;
}
- case WebAssembly::CLEANUPRET:
- case WebAssembly::RETHROW_IN_CATCH: {
- // Replace a cleanupret/rethrow_in_catch with a rethrow
- auto *EHPad = getMatchingEHPad(TI);
- auto CatchPos = EHPad->begin();
- if (CatchPos->isEHLabel()) // EH pad starts with an EH label
- ++CatchPos;
- MachineInstr *Catch = &*CatchPos;
- Register ExnReg = Catch->getOperand(0).getReg();
+ case WebAssembly::CLEANUPRET: {
+ // Replace a cleanupret with a rethrow. For C++ support, currently
+ // rethrow's immediate argument is always 0 (= the latest exception).
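+ // Conceptually:
+ //   cleanupret      ; pseudo-instruction
+ // becomes
+ //   rethrow 0       ; rethrow the latest exception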
BuildMI(MBB, TI, TI->getDebugLoc(), TII.get(WebAssembly::RETHROW))
- .addReg(ExnReg);
+ .addImm(0);
TI->eraseFromParent();
Changed = true;
break;
@@ -205,6 +260,7 @@ bool WebAssemblyLateEHPrepare::replaceFuncletReturns(MachineFunction &MF) {
return Changed;
}
+// Remove unnecessary unreachables after a throw or rethrow.
bool WebAssemblyLateEHPrepare::removeUnnecessaryUnreachables(
MachineFunction &MF) {
bool Changed = false;
@@ -220,8 +276,7 @@ bool WebAssemblyLateEHPrepare::removeUnnecessaryUnreachables(
// because throw itself is a terminator, and also delete successors if
// any.
MBB.erase(std::next(MI.getIterator()), MBB.end());
- SmallVector<MachineBasicBlock *, 8> Succs(MBB.succ_begin(),
- MBB.succ_end());
+ SmallVector<MachineBasicBlock *, 8> Succs(MBB.successors());
for (auto *Succ : Succs)
if (!Succ->isEHPad())
MBB.removeSuccessor(Succ);
@@ -232,154 +287,78 @@ bool WebAssemblyLateEHPrepare::removeUnnecessaryUnreachables(
return Changed;
}
-// Wasm uses 'br_on_exn' instruction to check the tag of an exception. It takes
-// exnref type object returned by 'catch', and branches to the destination if it
-// matches a given tag. We currently use __cpp_exception symbol to represent the
-// tag for all C++ exceptions.
+// Clang-generated terminate pads are single-BB EH pads of the form
+// termpad:
+// %exn = catch $__cpp_exception
+// call @__clang_call_terminate(%exn)
+// unreachable
+// (There can be local.sets and local.gets before the call if we didn't run
+// RegStackify.)
+// But code transformations can change or add more control flow, so the call
+// to the __clang_call_terminate() function may not be in the original EH pad
+// anymore.
+// This ensures every terminate pad is a single BB in the form illustrated
+// above.
//
-// block $l (result i32)
-// ...
-// ;; exnref $e is on the stack at this point
-// br_on_exn $l $e ;; branch to $l with $e's arguments
-// ...
-// end
-// ;; Here we expect the extracted values are on top of the wasm value stack
-// ... Handle exception using values ...
-//
-// br_on_exn takes an exnref object and branches if it matches the given tag.
-// There can be multiple br_on_exn instructions if we want to match for another
-// tag, but for now we only test for __cpp_exception tag, and if it does not
-// match, i.e., it is a foreign exception, we rethrow it.
-//
-// In the destination BB that's the target of br_on_exn, extracted exception
-// values (in C++'s case a single i32, which represents an exception pointer)
-// are placed on top of the wasm stack. Because we can't model wasm stack in
-// LLVM instruction, we use 'extract_exception' pseudo instruction to retrieve
-// it. The pseudo instruction will be deleted later.
-bool WebAssemblyLateEHPrepare::addExceptionExtraction(MachineFunction &MF) {
+// This is preparation work for the HandleEHTerminatePads pass later, which
+// duplicates terminate pads both for 'catch' and 'catch_all'. Refer to
+// WebAssemblyHandleEHTerminatePads.cpp for details.
+bool WebAssemblyLateEHPrepare::ensureSingleBBTermPads(MachineFunction &MF) {
const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- auto *EHInfo = MF.getWasmEHFuncInfo();
- SmallVector<MachineInstr *, 16> ExtractInstrs;
- SmallVector<MachineInstr *, 8> ToDelete;
- for (auto &MBB : MF) {
- for (auto &MI : MBB) {
- if (MI.getOpcode() == WebAssembly::EXTRACT_EXCEPTION_I32) {
- if (MI.getOperand(0).isDead())
- ToDelete.push_back(&MI);
- else
- ExtractInstrs.push_back(&MI);
- }
- }
- }
- bool Changed = !ToDelete.empty() || !ExtractInstrs.empty();
- for (auto *MI : ToDelete)
- MI->eraseFromParent();
- if (ExtractInstrs.empty())
- return Changed;
-
- // Find terminate pads.
- SmallSet<MachineBasicBlock *, 8> TerminatePads;
+
+ // Find calls to __clang_call_terminate()
+ SmallVector<MachineInstr *, 8> ClangCallTerminateCalls;
+ SmallPtrSet<MachineBasicBlock *, 8> TermPads;
for (auto &MBB : MF) {
for (auto &MI : MBB) {
if (MI.isCall()) {
const MachineOperand &CalleeOp = MI.getOperand(0);
if (CalleeOp.isGlobal() && CalleeOp.getGlobal()->getName() ==
- WebAssembly::ClangCallTerminateFn)
- TerminatePads.insert(getMatchingEHPad(&MI));
+ WebAssembly::ClangCallTerminateFn) {
+ MachineBasicBlock *EHPad = getMatchingEHPad(&MI);
+ assert(EHPad && "No matching EH pad for __clang_call_terminate");
+ // In case a __clang_call_terminate call is duplicated during code
+ // transformation so that one terminate pad contains multiple
+ // __clang_call_terminate calls, we only count one of them.
+ if (TermPads.insert(EHPad).second)
+ ClangCallTerminateCalls.push_back(&MI);
+ }
}
}
}
- for (auto *Extract : ExtractInstrs) {
- MachineBasicBlock *EHPad = getMatchingEHPad(Extract);
- assert(EHPad && "No matching EH pad for extract_exception");
- auto CatchPos = EHPad->begin();
- if (CatchPos->isEHLabel()) // EH pad starts with an EH label
- ++CatchPos;
- MachineInstr *Catch = &*CatchPos;
-
- if (Catch->getNextNode() != Extract)
- EHPad->insert(Catch->getNextNode(), Extract->removeFromParent());
-
- // - Before:
- // ehpad:
- // %exnref:exnref = catch
- // %exn:i32 = extract_exception
- // ... use exn ...
- //
- // - After:
- // ehpad:
- // %exnref:exnref = catch
- // br_on_exn %thenbb, $__cpp_exception, %exnref
- // br %elsebb
- // elsebb:
- // rethrow
- // thenbb:
- // %exn:i32 = extract_exception
- // ... use exn ...
- Register ExnReg = Catch->getOperand(0).getReg();
- auto *ThenMBB = MF.CreateMachineBasicBlock();
- auto *ElseMBB = MF.CreateMachineBasicBlock();
- MF.insert(std::next(MachineFunction::iterator(EHPad)), ElseMBB);
- MF.insert(std::next(MachineFunction::iterator(ElseMBB)), ThenMBB);
- ThenMBB->splice(ThenMBB->end(), EHPad, Extract, EHPad->end());
- ThenMBB->transferSuccessors(EHPad);
- EHPad->addSuccessor(ThenMBB);
- EHPad->addSuccessor(ElseMBB);
-
- DebugLoc DL = Extract->getDebugLoc();
- const char *CPPExnSymbol = MF.createExternalSymbolName("__cpp_exception");
- BuildMI(EHPad, DL, TII.get(WebAssembly::BR_ON_EXN))
- .addMBB(ThenMBB)
- .addExternalSymbol(CPPExnSymbol)
- .addReg(ExnReg);
- BuildMI(EHPad, DL, TII.get(WebAssembly::BR)).addMBB(ElseMBB);
-
- // When this is a terminate pad with __clang_call_terminate() call, we don't
- // rethrow it anymore and call __clang_call_terminate() with a nullptr
- // argument, which will call std::terminate().
- //
- // - Before:
- // ehpad:
- // %exnref:exnref = catch
- // %exn:i32 = extract_exception
- // call @__clang_call_terminate(%exn)
- // unreachable
- //
- // - After:
- // ehpad:
- // %exnref:exnref = catch
- // br_on_exn %thenbb, $__cpp_exception, %exnref
- // br %elsebb
- // elsebb:
- // call @__clang_call_terminate(0)
- // unreachable
- // thenbb:
- // %exn:i32 = extract_exception
- // call @__clang_call_terminate(%exn)
- // unreachable
- if (TerminatePads.count(EHPad)) {
- Function *ClangCallTerminateFn =
- MF.getFunction().getParent()->getFunction(
- WebAssembly::ClangCallTerminateFn);
- assert(ClangCallTerminateFn &&
- "There is no __clang_call_terminate() function");
- Register Reg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
- BuildMI(ElseMBB, DL, TII.get(WebAssembly::CONST_I32), Reg).addImm(0);
- BuildMI(ElseMBB, DL, TII.get(WebAssembly::CALL))
- .addGlobalAddress(ClangCallTerminateFn)
- .addReg(Reg);
- BuildMI(ElseMBB, DL, TII.get(WebAssembly::UNREACHABLE));
-
- } else {
- BuildMI(ElseMBB, DL, TII.get(WebAssembly::RETHROW)).addReg(ExnReg);
- if (EHInfo->hasEHPadUnwindDest(EHPad))
- ElseMBB->addSuccessor(EHInfo->getEHPadUnwindDest(EHPad));
- }
- }
+ bool Changed = false;
+ for (auto *Call : ClangCallTerminateCalls) {
+ MachineBasicBlock *EHPad = getMatchingEHPad(Call);
+ assert(EHPad && "No matching EH pad for __clang_call_terminate");
+
+ // If it is already in the form we want, skip it
+ if (Call->getParent() == EHPad &&
+ Call->getNextNode()->getOpcode() == WebAssembly::UNREACHABLE)
+ continue;
- return true;
+ // In case the __clang_call_terminate() call is not in its matching EH pad,
+ // move the call to the end of the EH pad and add an unreachable instruction
+ // after it. Delete all successors and their children, if any, because the
+ // program terminates here.
+ Changed = true;
+ // This runs after hoistCatches(), so the catch instruction should be at the top
+ MachineInstr *Catch = WebAssembly::findCatch(EHPad);
+ assert(Catch && "EH pad does not have a catch instruction");
+ // The call takes the result register of the catch instruction as its
+ // argument. There may have been some other local.sets/local.gets in between,
+ // but at this point we don't care.
+ Call->getOperand(1).setReg(Catch->getOperand(0).getReg());
+ auto InsertPos = std::next(MachineBasicBlock::iterator(Catch));
+ EHPad->insert(InsertPos, Call->removeFromParent());
+ BuildMI(*EHPad, InsertPos, Call->getDebugLoc(),
+ TII.get(WebAssembly::UNREACHABLE));
+ EHPad->erase(InsertPos, EHPad->end());
+ SmallVector<MachineBasicBlock *, 8> Succs(EHPad->successors());
+ for (auto *Succ : Succs)
+ EHPad->removeSuccessor(Succ);
+ eraseDeadBBsAndChildren(Succs);
+ }
+ return Changed;
}
// After the stack is unwound due to a thrown exception, the __stack_pointer
@@ -406,7 +385,7 @@ bool WebAssemblyLateEHPrepare::restoreStackPointer(MachineFunction &MF) {
auto InsertPos = MBB.begin();
if (InsertPos->isEHLabel()) // EH pad starts with an EH label
++InsertPos;
- if (InsertPos->getOpcode() == WebAssembly::CATCH)
+ if (WebAssembly::isCatch(InsertPos->getOpcode()))
++InsertPos;
FrameLowering->writeSPToGlobal(FrameLowering->getSPReg(MF), MF, MBB,
InsertPos, MBB.begin()->getDebugLoc());
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index 5fce4a600510..d3bbadf27478 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -140,8 +140,7 @@
/// 1) Lower
/// longjmp(buf, value)
/// into
-/// emscripten_longjmp_jmpbuf(buf, value)
-/// emscripten_longjmp_jmpbuf will be lowered to emscripten_longjmp later.
+/// emscripten_longjmp(buf, value)
///
/// In case calls to setjmp() exists
///
@@ -196,19 +195,16 @@
/// stored in saveSetjmp. testSetjmp returns a setjmp label, a unique ID to
/// each setjmp callsite. Label 0 means this longjmp buffer does not
/// correspond to one of the setjmp callsites in this function, so in this
-/// case we just chain the longjmp to the caller. (Here we call
-/// emscripten_longjmp, which is different from emscripten_longjmp_jmpbuf.
-/// emscripten_longjmp_jmpbuf takes jmp_buf as its first argument, while
-/// emscripten_longjmp takes an int. Both of them will eventually be lowered
-/// to emscripten_longjmp in s2wasm, but here we need two signatures - we
-/// can't translate an int value to a jmp_buf.)
-/// Label -1 means no longjmp occurred. Otherwise we jump to the right
-/// post-setjmp BB based on the label.
+/// case we just chain the longjmp to the caller. Label -1 means no longjmp
+/// occurred. Otherwise we jump to the right post-setjmp BB based on the
+/// label.
///
///===----------------------------------------------------------------------===//
#include "WebAssembly.h"
+#include "WebAssemblyTargetMachine.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
@@ -239,7 +235,6 @@ class WebAssemblyLowerEmscriptenEHSjLj final : public ModulePass {
Function *ResumeF = nullptr;
Function *EHTypeIDF = nullptr;
Function *EmLongjmpF = nullptr;
- Function *EmLongjmpJmpbufF = nullptr;
Function *SaveSetjmpF = nullptr;
Function *TestSetjmpF = nullptr;
@@ -314,13 +309,23 @@ static bool canThrow(const Value *V) {
// Get a global variable with the given name. If it doesn't exist, declare it,
// which will generate an import and assume that it will exist at link time.
static GlobalVariable *getGlobalVariableI32(Module &M, IRBuilder<> &IRB,
+ WebAssemblyTargetMachine &TM,
const char *Name) {
-
- auto *GV =
- dyn_cast<GlobalVariable>(M.getOrInsertGlobal(Name, IRB.getInt32Ty()));
+ auto Int32Ty = IRB.getInt32Ty();
+ auto *GV = dyn_cast<GlobalVariable>(M.getOrInsertGlobal(Name, Int32Ty));
if (!GV)
report_fatal_error(Twine("unable to create global: ") + Name);
+ // If the target supports TLS, make this variable thread-local. We can't just
+ // unconditionally make it thread-local and depend on
+ // CoalesceFeaturesAndStripAtomics to downgrade it, because stripping TLS has
+ // the side effect of disallowing the object from being linked into a
+ // shared-memory module, which we don't want to be responsible for.
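+ // For example, when the subtarget has both 'atomics' and 'bulk-memory',
+ // globals such as __THREW__ become local-exec TLS variables; otherwise they
+ // remain ordinary (non-thread-local) globals.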
+ auto *Subtarget = TM.getSubtargetImpl();
+ auto TLS = Subtarget->hasAtomics() && Subtarget->hasBulkMemory()
+ ? GlobalValue::LocalExecTLSModel
+ : GlobalValue::NotThreadLocal;
+ GV->setThreadLocalMode(TLS);
return GV;
}
@@ -338,7 +343,7 @@ static std::string getSignature(FunctionType *FTy) {
if (FTy->isVarArg())
OS << "_...";
Sig = OS.str();
- Sig.erase(remove_if(Sig, isSpace), Sig.end());
+ erase_if(Sig, isSpace);
// When s2wasm parses .s file, a comma means the end of an argument. So a
// mangled function name can contain any character but a comma.
std::replace(Sig.begin(), Sig.end(), ',', '.');
@@ -630,6 +635,40 @@ void WebAssemblyLowerEmscriptenEHSjLj::rebuildSSA(Function &F) {
}
}
+// Replace uses of longjmp with emscripten_longjmp. emscripten_longjmp takes
+// arguments of type {i32, i32} and longjmp takes {jmp_buf*, i32}, so we need a
+// ptrtoint instruction here to make the type match. jmp_buf* will eventually be
+// lowered to i32 in the wasm backend.
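+// For example, with an illustrative jmp_buf* value %buf:
+//   call void @longjmp(jmp_buf* %buf, i32 1)
+// becomes
+//   %jmpbuf = ptrtoint jmp_buf* %buf to i32
+//   call void @emscripten_longjmp(i32 %jmpbuf, i32 1)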
+static void replaceLongjmpWithEmscriptenLongjmp(Function *LongjmpF,
+ Function *EmLongjmpF) {
+ SmallVector<CallInst *, 8> ToErase;
+ LLVMContext &C = LongjmpF->getParent()->getContext();
+ IRBuilder<> IRB(C);
+
+ // For each call to longjmp, replace it with emscripten_longjmp and cast its
+ // first argument (jmp_buf*) to int
+ for (User *U : LongjmpF->users()) {
+ auto *CI = dyn_cast<CallInst>(U);
+ if (CI && CI->getCalledFunction() == LongjmpF) {
+ IRB.SetInsertPoint(CI);
+ Value *Jmpbuf =
+ IRB.CreatePtrToInt(CI->getArgOperand(0), IRB.getInt32Ty(), "jmpbuf");
+ IRB.CreateCall(EmLongjmpF, {Jmpbuf, CI->getArgOperand(1)});
+ ToErase.push_back(CI);
+ }
+ }
+ for (auto *I : ToErase)
+ I->eraseFromParent();
+
+ // If we have any remaining uses of longjmp's function pointer, replace it
+ // with (int(*)(jmp_buf*, int))emscripten_longjmp.
+ if (!LongjmpF->uses().empty()) {
+ Value *EmLongjmp =
+ IRB.CreateBitCast(EmLongjmpF, LongjmpF->getType(), "em_longjmp");
+ LongjmpF->replaceAllUsesWith(EmLongjmp);
+ }
+}
+
bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
LLVM_DEBUG(dbgs() << "********** Lower Emscripten EH & SjLj **********\n");
@@ -642,11 +681,19 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
bool LongjmpUsed = LongjmpF && !LongjmpF->use_empty();
bool DoSjLj = EnableSjLj && (SetjmpUsed || LongjmpUsed);
+ if ((EnableEH || DoSjLj) &&
+ Triple(M.getTargetTriple()).getArch() == Triple::wasm64)
+ report_fatal_error("Emscripten EH/SjLj is not supported with wasm64 yet");
+
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ assert(TPC && "Expected a TargetPassConfig");
+ auto &TM = TPC->getTM<WebAssemblyTargetMachine>();
+
// Declare (or get) global variables __THREW__, __threwValue, and
// getTempRet0/setTempRet0 function which are used in common for both
// exception handling and setjmp/longjmp handling
- ThrewGV = getGlobalVariableI32(M, IRB, "__THREW__");
- ThrewValueGV = getGlobalVariableI32(M, IRB, "__threwValue");
+ ThrewGV = getGlobalVariableI32(M, IRB, TM, "__THREW__");
+ ThrewValueGV = getGlobalVariableI32(M, IRB, TM, "__threwValue");
GetTempRet0Func = getEmscriptenFunction(
FunctionType::get(IRB.getInt32Ty(), false), "getTempRet0", &M);
SetTempRet0Func = getEmscriptenFunction(
@@ -680,22 +727,21 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
if (DoSjLj) {
Changed = true; // We have setjmp or longjmp somewhere
- if (LongjmpF) {
- // Replace all uses of longjmp with emscripten_longjmp_jmpbuf, which is
- // defined in JS code
- EmLongjmpJmpbufF = getEmscriptenFunction(LongjmpF->getFunctionType(),
- "emscripten_longjmp_jmpbuf", &M);
- LongjmpF->replaceAllUsesWith(EmLongjmpJmpbufF);
- }
+ // Register emscripten_longjmp function
+ FunctionType *FTy = FunctionType::get(
+ IRB.getVoidTy(), {IRB.getInt32Ty(), IRB.getInt32Ty()}, false);
+ EmLongjmpF = getEmscriptenFunction(FTy, "emscripten_longjmp", &M);
+
+ if (LongjmpF)
+ replaceLongjmpWithEmscriptenLongjmp(LongjmpF, EmLongjmpF);
if (SetjmpF) {
// Register saveSetjmp function
FunctionType *SetjmpFTy = SetjmpF->getFunctionType();
- FunctionType *FTy =
- FunctionType::get(Type::getInt32PtrTy(C),
- {SetjmpFTy->getParamType(0), IRB.getInt32Ty(),
- Type::getInt32PtrTy(C), IRB.getInt32Ty()},
- false);
+ FTy = FunctionType::get(Type::getInt32PtrTy(C),
+ {SetjmpFTy->getParamType(0), IRB.getInt32Ty(),
+ Type::getInt32PtrTy(C), IRB.getInt32Ty()},
+ false);
SaveSetjmpF = getEmscriptenFunction(FTy, "saveSetjmp", &M);
// Register testSetjmp function
@@ -704,10 +750,6 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
{IRB.getInt32Ty(), Type::getInt32PtrTy(C), IRB.getInt32Ty()}, false);
TestSetjmpF = getEmscriptenFunction(FTy, "testSetjmp", &M);
- FTy = FunctionType::get(IRB.getVoidTy(),
- {IRB.getInt32Ty(), IRB.getInt32Ty()}, false);
- EmLongjmpF = getEmscriptenFunction(FTy, "emscripten_longjmp", &M);
-
// Only traverse functions that use setjmp in order not to insert
// unnecessary prep / cleanup code in every function
SmallPtrSet<Function *, 8> SetjmpUsers;
@@ -769,7 +811,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) {
} else {
// This can't throw, and we don't need this invoke, just replace it with a
// call+branch
- SmallVector<Value *, 16> Args(II->arg_begin(), II->arg_end());
+ SmallVector<Value *, 16> Args(II->args());
CallInst *NewCall =
IRB.CreateCall(II->getFunctionType(), II->getCalledOperand(), Args);
NewCall->takeName(II);
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index 304dca2ebfe4..86d59ef807ab 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -38,29 +38,34 @@ cl::opt<bool>
" instruction output for test purposes only."),
cl::init(false));
+extern cl::opt<bool> EnableEmException;
+extern cl::opt<bool> EnableEmSjLj;
+
static void removeRegisterOperands(const MachineInstr *MI, MCInst &OutMI);
MCSymbol *
WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
const GlobalValue *Global = MO.getGlobal();
- auto *WasmSym = cast<MCSymbolWasm>(Printer.getSymbol(Global));
-
- if (const auto *FuncTy = dyn_cast<FunctionType>(Global->getValueType())) {
- const MachineFunction &MF = *MO.getParent()->getParent()->getParent();
- const TargetMachine &TM = MF.getTarget();
- const Function &CurrentFunc = MF.getFunction();
-
- SmallVector<MVT, 1> ResultMVTs;
- SmallVector<MVT, 4> ParamMVTs;
- const auto *const F = dyn_cast<Function>(Global);
- computeSignatureVTs(FuncTy, F, CurrentFunc, TM, ParamMVTs, ResultMVTs);
-
- auto Signature = signatureFromMVTs(ResultMVTs, ParamMVTs);
- WasmSym->setSignature(Signature.get());
- Printer.addSignature(std::move(Signature));
- WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
- }
-
+ if (!isa<Function>(Global))
+ return cast<MCSymbolWasm>(Printer.getSymbol(Global));
+
+ const auto *FuncTy = cast<FunctionType>(Global->getValueType());
+ const MachineFunction &MF = *MO.getParent()->getParent()->getParent();
+ const TargetMachine &TM = MF.getTarget();
+ const Function &CurrentFunc = MF.getFunction();
+
+ SmallVector<MVT, 1> ResultMVTs;
+ SmallVector<MVT, 4> ParamMVTs;
+ const auto *const F = dyn_cast<Function>(Global);
+ computeSignatureVTs(FuncTy, F, CurrentFunc, TM, ParamMVTs, ResultMVTs);
+ auto Signature = signatureFromMVTs(ResultMVTs, ParamMVTs);
+
+ bool InvokeDetected = false;
+ auto *WasmSym = Printer.getMCSymbolForFunction(
+ F, EnableEmException || EnableEmSjLj, Signature.get(), InvokeDetected);
+ WasmSym->setSignature(Signature.get());
+ Printer.addSignature(std::move(Signature));
+ WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
return WasmSym;
}
@@ -134,6 +139,9 @@ MCOperand WebAssemblyMCInstLower::lowerSymbolOperand(const MachineOperand &MO,
case WebAssemblyII::MO_MEMORY_BASE_REL:
Kind = MCSymbolRefExpr::VK_WASM_MBREL;
break;
+ case WebAssemblyII::MO_TLS_BASE_REL:
+ Kind = MCSymbolRefExpr::VK_WASM_TLSREL;
+ break;
case WebAssemblyII::MO_TABLE_BASE_REL:
Kind = MCSymbolRefExpr::VK_WASM_TBREL;
break;
@@ -266,6 +274,11 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI,
SmallVector<wasm::ValType, 4>());
break;
}
+ } else if (Info.OperandType == WebAssembly::OPERAND_HEAPTYPE) {
+ assert(static_cast<WebAssembly::HeapType>(MO.getImm()) !=
+ WebAssembly::HeapType::Invalid);
+ // With typed function references, this will need a case for type
+ // index operands. Otherwise, fall through.
}
}
MCOp = MCOperand::createImm(MO.getImm());
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
index a2da0ea849e0..6bfed1a7195c 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
@@ -97,7 +97,7 @@ bool WebAssemblyOptimizeLiveIntervals::runOnMachineFunction(
// values through live-range splitting and stackification, it will have to
// do.
MF.getInfo<WebAssemblyFunctionInfo>()->setFrameBaseVreg(
- SplitLIs.back()->reg);
+ SplitLIs.back()->reg());
}
SplitLIs.clear();
}
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
index a587c9d23d2b..ba1c4b7233f2 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
@@ -111,8 +111,11 @@ static bool maybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB,
case WebAssembly::V128RegClassID:
CopyLocalOpc = WebAssembly::COPY_V128;
break;
- case WebAssembly::EXNREFRegClassID:
- CopyLocalOpc = WebAssembly::COPY_EXNREF;
+ case WebAssembly::FUNCREFRegClassID:
+ CopyLocalOpc = WebAssembly::COPY_FUNCREF;
+ break;
+ case WebAssembly::EXTERNREFRegClassID:
+ CopyLocalOpc = WebAssembly::COPY_EXTERNREF;
break;
default:
llvm_unreachable("Unexpected register class for return operand");
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
index 20fe2b2b7bfc..fe127dec8aed 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
@@ -106,8 +106,8 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
continue;
LiveInterval *LI = &Liveness->getInterval(VReg);
- assert(LI->weight == 0.0f);
- LI->weight = computeWeight(MRI, MBFI, VReg);
+ assert(LI->weight() == 0.0f);
+ LI->setWeight(computeWeight(MRI, MBFI, VReg));
LLVM_DEBUG(LI->dump());
SortedIntervals.push_back(LI);
}
@@ -118,10 +118,10 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
// TODO: Investigate more intelligent sorting heuristics. For starters, we
// should try to coalesce adjacent live intervals before non-adjacent ones.
llvm::sort(SortedIntervals, [MRI](LiveInterval *LHS, LiveInterval *RHS) {
- if (MRI->isLiveIn(LHS->reg) != MRI->isLiveIn(RHS->reg))
- return MRI->isLiveIn(LHS->reg);
- if (LHS->weight != RHS->weight)
- return LHS->weight > RHS->weight;
+ if (MRI->isLiveIn(LHS->reg()) != MRI->isLiveIn(RHS->reg()))
+ return MRI->isLiveIn(LHS->reg());
+ if (LHS->weight() != RHS->weight())
+ return LHS->weight() > RHS->weight();
if (LHS->empty() || RHS->empty())
return !LHS->empty() && RHS->empty();
return *LHS < *RHS;
@@ -135,14 +135,14 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
for (size_t I = 0, E = SortedIntervals.size(); I < E; ++I) {
LiveInterval *LI = SortedIntervals[I];
- unsigned Old = LI->reg;
+ unsigned Old = LI->reg();
size_t Color = I;
const TargetRegisterClass *RC = MRI->getRegClass(Old);
// Check if it's possible to reuse any of the used colors.
if (!MRI->isLiveIn(Old))
for (unsigned C : UsedColors.set_bits()) {
- if (MRI->getRegClass(SortedIntervals[C]->reg) != RC)
+ if (MRI->getRegClass(SortedIntervals[C]->reg()) != RC)
continue;
for (LiveInterval *OtherLI : Assignments[C])
if (!OtherLI->empty() && OtherLI->overlaps(*LI))
@@ -152,7 +152,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
continue_outer:;
}
- unsigned New = SortedIntervals[Color]->reg;
+ unsigned New = SortedIntervals[Color]->reg();
SlotMapping[I] = New;
Changed |= Old != New;
UsedColors.set(Color);
@@ -160,7 +160,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
// If we reassigned the stack pointer, update the debug frame base info.
if (Old != New && MFI.isFrameBaseVirtual() && MFI.getFrameBaseVreg() == Old)
MFI.setFrameBaseVreg(New);
- LLVM_DEBUG(dbgs() << "Assigning vreg" << Register::virtReg2Index(LI->reg)
+ LLVM_DEBUG(dbgs() << "Assigning vreg" << Register::virtReg2Index(LI->reg())
<< " to vreg" << Register::virtReg2Index(New) << "\n");
}
if (!Changed)
@@ -168,7 +168,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
// Rewrite register operands.
for (size_t I = 0, E = SortedIntervals.size(); I < E; ++I) {
- unsigned Old = SortedIntervals[I]->reg;
+ unsigned Old = SortedIntervals[I]->reg();
unsigned New = SlotMapping[I];
if (Old != New)
MRI->replaceRegWith(Old, New);
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index 1d4e2e3a8f9e..d474b9a2c1ee 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -123,7 +123,7 @@ static void convertImplicitDefToConstZero(MachineInstr *MI,
} else if (RegClass == &WebAssembly::V128RegClass) {
// TODO: Replace this with v128.const 0 once that is supported in V8
Register TempReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
- MI->setDesc(TII->get(WebAssembly::SPLAT_v4i32));
+ MI->setDesc(TII->get(WebAssembly::SPLAT_I32x4));
MI->addOperand(MachineOperand::CreateReg(TempReg, false));
MachineInstr *Const = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(WebAssembly::CONST_I32), TempReg)
@@ -342,7 +342,7 @@ static bool isSafeToMove(const MachineOperand *Def, const MachineOperand *Use,
// instruction in which the current value is used, we cannot
// stackify. Stackifying in this case would require that def moving below the
// current def in the stack, which cannot be achieved, even with locals.
- for (const auto &SubsequentDef : drop_begin(DefI->defs(), 1)) {
+ for (const auto &SubsequentDef : drop_begin(DefI->defs())) {
for (const auto &PriorUse : UseI->uses()) {
if (&PriorUse == Use)
break;
@@ -359,10 +359,9 @@ static bool isSafeToMove(const MachineOperand *Def, const MachineOperand *Use,
if (NextI == Insert)
return true;
- // 'catch' and 'extract_exception' should be the first instruction of a BB and
- // cannot move.
- if (DefI->getOpcode() == WebAssembly::CATCH ||
- DefI->getOpcode() == WebAssembly::EXTRACT_EXCEPTION_I32)
+ // 'catch' and 'catch_all' should be the first instruction of a BB and cannot
+ // move.
+ if (WebAssembly::isCatch(DefI->getOpcode()))
return false;
// Check for register dependencies.
@@ -595,7 +594,7 @@ static MachineInstr *rematerializeCheapDef(
if (IsDead) {
LLVM_DEBUG(dbgs() << " - Deleting original\n");
SlotIndex Idx = LIS.getInstructionIndex(Def).getRegSlot();
- LIS.removePhysRegDefAt(WebAssembly::ARGUMENTS, Idx);
+ LIS.removePhysRegDefAt(MCRegister::from(WebAssembly::ARGUMENTS), Idx);
LIS.removeInterval(Reg);
LIS.RemoveMachineInstrFromMaps(Def);
Def.eraseFromParent();
@@ -693,7 +692,7 @@ class TreeWalkerState {
public:
explicit TreeWalkerState(MachineInstr *Insert) {
const iterator_range<mop_iterator> &Range = Insert->explicit_uses();
- if (Range.begin() != Range.end())
+ if (!Range.empty())
Worklist.push_back(reverse(Range));
}
@@ -702,11 +701,10 @@ public:
MachineOperand &pop() {
RangeTy &Range = Worklist.back();
MachineOperand &Op = *Range.begin();
- Range = drop_begin(Range, 1);
- if (Range.begin() == Range.end())
+ Range = drop_begin(Range);
+ if (Range.empty())
Worklist.pop_back();
- assert((Worklist.empty() ||
- Worklist.back().begin() != Worklist.back().end()) &&
+ assert((Worklist.empty() || !Worklist.back().empty()) &&
"Empty ranges shouldn't remain in the worklist");
return Op;
}
@@ -714,7 +712,7 @@ public:
/// Push Instr's operands onto the stack to be visited.
void pushOperands(MachineInstr *Instr) {
const iterator_range<mop_iterator> &Range(Instr->explicit_uses());
- if (Range.begin() != Range.end())
+ if (!Range.empty())
Worklist.push_back(reverse(Range));
}
@@ -733,7 +731,7 @@ public:
if (Worklist.empty())
return false;
const RangeTy &Range = Worklist.back();
- return Range.begin() != Range.end() && Range.begin()->getParent() == Instr;
+ return !Range.empty() && Range.begin()->getParent() == Instr;
}
/// Test whether the given register is present on the stack, indicating an
@@ -865,24 +863,6 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
if (WebAssembly::isArgument(DefI->getOpcode()))
continue;
- // Currently catch's return value register cannot be stackified, because
- // the wasm LLVM backend currently does not support live-in values
- // entering blocks, which is a part of multi-value proposal.
- //
- // Once we support live-in values of wasm blocks, this can be:
- // catch ; push exnref value onto stack
- // block exnref -> i32
- // br_on_exn $__cpp_exception ; pop the exnref value
- // end_block
- //
- // But because we don't support it yet, the catch instruction's dst
- // register should be assigned to a local to be propagated across
- // 'block' boundary now.
- //
- // TODO: Fix this once we support the multivalue blocks
- if (DefI->getOpcode() == WebAssembly::CATCH)
- continue;
-
MachineOperand *Def = DefI->findRegisterDefOperand(Reg);
assert(Def != nullptr);
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
index 6d3d6c723277..ba2936b492a9 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
@@ -43,7 +43,8 @@ def F64_0 : WebAssemblyReg<"%f64.0">;
def V128_0: WebAssemblyReg<"%v128">;
-def EXNREF_0 : WebAssemblyReg<"%exnref.0">;
+def FUNCREF_0 : WebAssemblyReg<"%funcref.0">;
+def EXTERNREF_0 : WebAssemblyReg<"%externref.0">;
// The value stack "register". This is an opaque entity which serves to order
// uses and defs that must remain in LIFO order.
@@ -64,4 +65,5 @@ def F32 : WebAssemblyRegClass<[f32], 32, (add F32_0)>;
def F64 : WebAssemblyRegClass<[f64], 64, (add F64_0)>;
def V128 : WebAssemblyRegClass<[v4f32, v2f64, v2i64, v4i32, v16i8, v8i16], 128,
(add V128_0)>;
-def EXNREF : WebAssemblyRegClass<[exnref], 0, (add EXNREF_0)>;
+def FUNCREF : WebAssemblyRegClass<[funcref], 0, (add FUNCREF_0)>;
+def EXTERNREF : WebAssemblyRegClass<[externref], 0, (add EXTERNREF_0)>;
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.cpp
new file mode 100644
index 000000000000..cd84e68aed14
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.cpp
@@ -0,0 +1,78 @@
+#include "WebAssemblySortRegion.h"
+#include "WebAssemblyExceptionInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+
+using namespace llvm;
+using namespace WebAssembly;
+
+namespace llvm {
+namespace WebAssembly {
+template <>
+bool ConcreteSortRegion<MachineLoop>::isLoop() const {
+ return true;
+}
+} // end namespace WebAssembly
+} // end namespace llvm
+
+const SortRegion *SortRegionInfo::getRegionFor(const MachineBasicBlock *MBB) {
+ const auto *ML = MLI.getLoopFor(MBB);
+ const auto *WE = WEI.getExceptionFor(MBB);
+ if (!ML && !WE)
+ return nullptr;
+ // We determine subregion relationship by domination of their headers, i.e.,
+ // if region A's header dominates region B's header, B is a subregion of A.
+ // WebAssemblyException contains BBs in all its subregions (loops or
+ // exceptions), but MachineLoop may not, because MachineLoop does not
+ // contain BBs that don't have a path to its header even if they are
+ // dominated by its header. So here we should use
+ // WE->contains(ML->getHeader()), but not ML->contains(WE->getHeader()).
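+ // For example, when a MachineLoop ML is nested inside a WebAssemblyException
+ // WE, WE contains ML's header, so for a BB in both regions the loop ML is
+ // returned as the smallest enclosing region.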
+ if ((ML && !WE) || (ML && WE && WE->contains(ML->getHeader()))) {
+ // If the smallest region containing MBB is a loop
+ if (LoopMap.count(ML))
+ return LoopMap[ML].get();
+ LoopMap[ML] = std::make_unique<ConcreteSortRegion<MachineLoop>>(ML);
+ return LoopMap[ML].get();
+ } else {
+ // If the smallest region containing MBB is an exception
+ if (ExceptionMap.count(WE))
+ return ExceptionMap[WE].get();
+ ExceptionMap[WE] =
+ std::make_unique<ConcreteSortRegion<WebAssemblyException>>(WE);
+ return ExceptionMap[WE].get();
+ }
+}
+
+MachineBasicBlock *SortRegionInfo::getBottom(const SortRegion *R) {
+ if (R->isLoop())
+ return getBottom(MLI.getLoopFor(R->getHeader()));
+ else
+ return getBottom(WEI.getExceptionFor(R->getHeader()));
+}
+
+MachineBasicBlock *SortRegionInfo::getBottom(const MachineLoop *ML) {
+ MachineBasicBlock *Bottom = ML->getHeader();
+ for (MachineBasicBlock *MBB : ML->blocks()) {
+ if (MBB->getNumber() > Bottom->getNumber())
+ Bottom = MBB;
+ // MachineLoop does not contain all BBs dominated by its header. BBs that
+ // don't have a path back to the loop header aren't included. But for the
+ // purpose of CFG sorting and stackification, we need a bottom BB among all
+ // BBs that are dominated by the loop header. So we check if there is any
+ // WebAssemblyException contained in this loop, and compute the bottommost
+ // BB of them all.
+ if (MBB->isEHPad()) {
+ MachineBasicBlock *ExBottom = getBottom(WEI.getExceptionFor(MBB));
+ if (ExBottom->getNumber() > Bottom->getNumber())
+ Bottom = ExBottom;
+ }
+ }
+ return Bottom;
+}
+
+MachineBasicBlock *SortRegionInfo::getBottom(const WebAssemblyException *WE) {
+ MachineBasicBlock *Bottom = WE->getHeader();
+ for (MachineBasicBlock *MBB : WE->blocks())
+ if (MBB->getNumber() > Bottom->getNumber())
+ Bottom = MBB;
+ return Bottom;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h
new file mode 100644
index 000000000000..e92bf1764185
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h
@@ -0,0 +1,91 @@
+//===-- WebAssemblySortRegion.h - WebAssembly Sort SortRegion ----*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements regions used in CFGSort and CFGStackify.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYSORTREGION_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYSORTREGION_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/iterator_range.h"
+
+namespace llvm {
+
+class MachineBasicBlock;
+class MachineLoop;
+class MachineLoopInfo;
+class WebAssemblyException;
+class WebAssemblyExceptionInfo;
+
+namespace WebAssembly {
+
+// Wrapper for loops and exceptions
+class SortRegion {
+public:
+ virtual ~SortRegion() = default;
+ virtual MachineBasicBlock *getHeader() const = 0;
+ virtual bool contains(const MachineBasicBlock *MBB) const = 0;
+ virtual unsigned getNumBlocks() const = 0;
+ using block_iterator = typename ArrayRef<MachineBasicBlock *>::const_iterator;
+ virtual iterator_range<block_iterator> blocks() const = 0;
+ virtual bool isLoop() const = 0;
+};
+
+template <typename T> class ConcreteSortRegion : public SortRegion {
+ const T *Unit;
+
+public:
+ ConcreteSortRegion(const T *Unit) : Unit(Unit) {}
+ MachineBasicBlock *getHeader() const override { return Unit->getHeader(); }
+ bool contains(const MachineBasicBlock *MBB) const override {
+ return Unit->contains(MBB);
+ }
+ unsigned getNumBlocks() const override { return Unit->getNumBlocks(); }
+ iterator_range<block_iterator> blocks() const override {
+ return Unit->blocks();
+ }
+ bool isLoop() const override { return false; }
+};
+
+// This class holds information about nested SortRegions; it is analogous to
+// what LoopInfo is for loops.
+class SortRegionInfo {
+ friend class ConcreteSortRegion<MachineLoopInfo>;
+ friend class ConcreteSortRegion<WebAssemblyException>;
+
+ const MachineLoopInfo &MLI;
+ const WebAssemblyExceptionInfo &WEI;
+ DenseMap<const MachineLoop *, std::unique_ptr<SortRegion>> LoopMap;
+ DenseMap<const WebAssemblyException *, std::unique_ptr<SortRegion>>
+ ExceptionMap;
+
+public:
+ SortRegionInfo(const MachineLoopInfo &MLI,
+ const WebAssemblyExceptionInfo &WEI)
+ : MLI(MLI), WEI(WEI) {}
+
+ // Returns the smallest loop or exception that contains MBB
+ const SortRegion *getRegionFor(const MachineBasicBlock *MBB);
+
+ // Return the "bottom" block among all blocks dominated by the region
+ // (MachineLoop or WebAssemblyException) header. This works even when the
+ // region is discontiguous.
+ MachineBasicBlock *getBottom(const SortRegion *R);
+ MachineBasicBlock *getBottom(const MachineLoop *ML);
+ MachineBasicBlock *getBottom(const WebAssemblyException *WE);
+};
+
+} // end namespace WebAssembly
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
index cacf5ab078a0..7943e1ecc8e1 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
@@ -33,7 +33,7 @@ WebAssemblySubtarget::initializeSubtargetDependencies(StringRef CPU,
if (CPU.empty())
CPU = "generic";
- ParseSubtargetFeatures(CPU, FS);
+ ParseSubtargetFeatures(CPU, /*TuneCPU*/ CPU, FS);
return *this;
}
@@ -41,9 +41,10 @@ WebAssemblySubtarget::WebAssemblySubtarget(const Triple &TT,
const std::string &CPU,
const std::string &FS,
const TargetMachine &TM)
- : WebAssemblyGenSubtargetInfo(TT, CPU, FS), TargetTriple(TT),
- FrameLowering(), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
- TSInfo(), TLInfo(TM, *this) {}
+ : WebAssemblyGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
+ TargetTriple(TT), FrameLowering(),
+ InstrInfo(initializeSubtargetDependencies(CPU, FS)), TSInfo(),
+ TLInfo(TM, *this) {}
bool WebAssemblySubtarget::enableAtomicExpand() const {
// If atomics are disabled, atomic ops are lowered instead of expanded
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
index 8b95a3ddb837..a1c872ef2135 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
@@ -105,7 +105,7 @@ public:
/// Parses features string setting specified subtarget options. Definition of
/// function is auto generated by tblgen.
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
};
} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index 7bf655c925a4..135055a43afc 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -34,13 +34,13 @@ using namespace llvm;
#define DEBUG_TYPE "wasm"
// Emscripten's asm.js-style exception handling
-static cl::opt<bool> EnableEmException(
+cl::opt<bool> EnableEmException(
"enable-emscripten-cxx-exceptions",
cl::desc("WebAssembly Emscripten-style exception handling"),
cl::init(false));
// Emscripten's asm.js-style setjmp/longjmp handling
-static cl::opt<bool> EnableEmSjLj(
+cl::opt<bool> EnableEmSjLj(
"enable-emscripten-sjlj",
cl::desc("WebAssembly Emscripten-style setjmp/longjmp handling"),
cl::init(false));
@@ -145,6 +145,11 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine(
WebAssemblyTargetMachine::~WebAssemblyTargetMachine() = default; // anchor.
+const WebAssemblySubtarget *WebAssemblyTargetMachine::getSubtargetImpl() const {
+ return getSubtargetImpl(std::string(getTargetCPU()),
+ std::string(getTargetFeatureString()));
+}
+
const WebAssemblySubtarget *
WebAssemblyTargetMachine::getSubtargetImpl(std::string CPU,
std::string FS) const {
@@ -160,12 +165,10 @@ WebAssemblyTargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
Attribute FSAttr = F.getFnAttribute("target-features");
- std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
- ? CPUAttr.getValueAsString().str()
- : TargetCPU;
- std::string FS = !FSAttr.hasAttribute(Attribute::None)
- ? FSAttr.getValueAsString().str()
- : TargetFS;
+ std::string CPU =
+ CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU;
+ std::string FS =
+ FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS;
// This needs to be done before we create a new subtarget since any
// creation will depend on the TM and the code generation flags on the
@@ -193,6 +196,7 @@ public:
FeatureBitset Features = coalesceFeatures(M);
std::string FeatureStr = getFeatureString(Features);
+ WasmTM->setTargetFeatureString(FeatureStr);
for (auto &F : M)
replaceFeatures(F, FeatureStr);
@@ -273,10 +277,9 @@ private:
bool stripThreadLocals(Module &M) {
bool Stripped = false;
for (auto &GV : M.globals()) {
- if (GV.getThreadLocalMode() !=
- GlobalValue::ThreadLocalMode::NotThreadLocal) {
+ if (GV.isThreadLocal()) {
Stripped = true;
- GV.setThreadLocalMode(GlobalValue::ThreadLocalMode::NotThreadLocal);
+ GV.setThreadLocal(false);
}
}
return Stripped;
@@ -323,10 +326,10 @@ public:
void addPreEmitPass() override;
// No reg alloc
- bool addRegAssignmentFast() override { return false; }
+ bool addRegAssignAndRewriteFast() override { return false; }
// No reg alloc
- bool addRegAssignmentOptimized() override { return false; }
+ bool addRegAssignAndRewriteOptimized() override { return false; }
};
} // end anonymous namespace
@@ -350,7 +353,7 @@ FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) {
//===----------------------------------------------------------------------===//
void WebAssemblyPassConfig::addIRPasses() {
- // Runs LowerAtomicPass if necessary
+ // Lower atomics and TLS if necessary
addPass(new CoalesceFeaturesAndStripAtomics(&getWebAssemblyTargetMachine()));
// This is a no-op if atomics are not used in the module
@@ -443,7 +446,8 @@ void WebAssemblyPassConfig::addPreEmitPass() {
// Do various transformations for exception handling.
// Every CFG-changing optimization should come before this.
- addPass(createWebAssemblyLateEHPrepare());
+ if (TM->Options.ExceptionModel == ExceptionHandling::Wasm)
+ addPass(createWebAssemblyLateEHPrepare());
// Now that we have a prologue and epilogue and all frame indices are
// rewritten, eliminate SP and FP. This allows them to be stackified,
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
index dd5b39773313..29e968bfe8eb 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
@@ -33,6 +33,7 @@ public:
~WebAssemblyTargetMachine() override;
+ const WebAssemblySubtarget *getSubtargetImpl() const;
const WebAssemblySubtarget *getSubtargetImpl(std::string CPU,
std::string FS) const;
const WebAssemblySubtarget *
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 28703a2787e0..be1cfbaef3e4 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -84,3 +84,21 @@ unsigned WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
return Cost;
}
+
+bool WebAssemblyTTIImpl::areInlineCompatible(const Function *Caller,
+ const Function *Callee) const {
+ // Allow inlining only when the Callee has a subset of the Caller's
+ // features. In principle, we should be able to inline regardless of any
+ // features because WebAssembly supports features at module granularity, not
+ // function granularity, but without this restriction it would be possible for
+ // a module to "forget" about features if all the functions that used them
+ // were inlined.
+ const TargetMachine &TM = getTLI()->getTargetMachine();
+
+ const FeatureBitset &CallerBits =
+ TM.getSubtargetImpl(*Caller)->getFeatureBits();
+ const FeatureBitset &CalleeBits =
+ TM.getSubtargetImpl(*Callee)->getFeatureBits();
+
+ return (CallerBits & CalleeBits) == CalleeBits;
+}
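For illustration only, a minimal standalone sketch of the subset test performed above; std::bitset stands in for llvm::FeatureBitset and the feature assignments are invented:

#include <bitset>
#include <cassert>

// Stand-in for llvm::FeatureBitset; 8 bits is enough for the illustration.
using Features = std::bitset<8>;

// Inlining is allowed only when every feature bit of the callee is also set
// for the caller, the same subset test as areInlineCompatible() above.
static bool isFeatureSubset(const Features &CallerBits,
                            const Features &CalleeBits) {
  return (CallerBits & CalleeBits) == CalleeBits;
}

static void exampleFeatureSubset() {
  Features Caller("00000011"); // e.g. {simd128, atomics}
  Features Callee("00000001"); // e.g. {simd128}
  assert(isFeatureSubset(Caller, Callee));  // callee uses a subset: OK
  assert(!isFeatureSubset(Callee, Caller)); // caller lacks a feature: reject
}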
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 79588a9f5669..41e358c159b4 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -67,6 +67,9 @@ public:
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
/// @}
+
+ bool areInlineCompatible(const Function *Caller,
+ const Function *Callee) const;
};
} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
index bc2bb4fd6935..f8fb57d8a461 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -15,6 +15,7 @@
#include "WebAssemblyMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/MC/MCContext.h"
using namespace llvm;
const char *const WebAssembly::ClangCallTerminateFn = "__clang_call_terminate";
@@ -96,3 +97,35 @@ const MachineOperand &WebAssembly::getCalleeOp(const MachineInstr &MI) {
llvm_unreachable("Not a call instruction");
}
}
+
+MCSymbolWasm *
+WebAssembly::getOrCreateFunctionTableSymbol(MCContext &Ctx,
+ const StringRef &Name) {
+ // FIXME: Duplicates functionality from
+ // MC/WasmObjectWriter::recordRelocation.
+ MCSymbolWasm *Sym = cast_or_null<MCSymbolWasm>(Ctx.lookupSymbol(Name));
+ if (Sym) {
+ if (!Sym->isFunctionTable())
+ Ctx.reportError(SMLoc(), "symbol is not a wasm funcref table");
+ } else {
+ Sym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(Name));
+ Sym->setFunctionTable();
+ // The default function table is synthesized by the linker.
+ Sym->setUndefined();
+ }
+ return Sym;
+}
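As a rough, self-contained sketch of the lookup-or-create pattern used above (a plain std::map instead of MCContext, and a toy symbol type; names are illustrative):

#include <map>
#include <string>

struct ToySymbol {
  bool IsFunctionTable = false;
  bool IsDefined = false;
};

// Reuse an existing symbol (diagnosing a kind mismatch), otherwise create one
// and leave it undefined so a later link step can synthesize the table.
static ToySymbol *getOrCreateTableSymbol(std::map<std::string, ToySymbol> &Ctx,
                                         const std::string &Name,
                                         bool &KindMismatch) {
  KindMismatch = false;
  auto It = Ctx.find(Name);
  if (It != Ctx.end()) {
    KindMismatch = !It->second.IsFunctionTable;
    return &It->second;
  }
  ToySymbol &Sym = Ctx[Name];
  Sym.IsFunctionTable = true;
  Sym.IsDefined = false;
  return &Sym;
}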
+
+// Find a catch instruction from an EH pad.
+MachineInstr *WebAssembly::findCatch(MachineBasicBlock *EHPad) {
+ assert(EHPad->isEHPad());
+ auto Pos = EHPad->begin();
+ // Skip any label or debug instructions. Also skip 'end' marker instructions
+ // that may exist after marker placement in CFGStackify.
+ while (Pos != EHPad->end() &&
+ (Pos->isLabel() || Pos->isDebugInstr() || isMarker(Pos->getOpcode())))
+ Pos++;
+ if (Pos != EHPad->end() && WebAssembly::isCatch(Pos->getOpcode()))
+ return &*Pos;
+ return nullptr;
+}
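A simplified, self-contained sketch of the scan findCatch performs: skip leading label/debug/marker entries, then report a catch only if it is the first remaining entry (instruction kinds are modeled as plain strings here, illustrative only):

#include <string>
#include <vector>

static const std::string *findCatchLike(const std::vector<std::string> &Block) {
  size_t I = 0;
  // Skip bookkeeping entries that may precede the catch.
  while (I < Block.size() && (Block[I] == "label" || Block[I] == "debug" ||
                              Block[I] == "marker"))
    ++I;
  if (I < Block.size() && Block[I] == "catch")
    return &Block[I];
  return nullptr; // no catch, or the catch is not where it is expected
}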
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h
index 4f0ed43a2481..41ad7869cf46 100644
--- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h
+++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h
@@ -15,10 +15,14 @@
#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYUTILITIES_H
#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYUTILITIES_H
-#include "llvm/CodeGen/MachineBasicBlock.h"
-
namespace llvm {
+class MachineBasicBlock;
+class MachineInstr;
+class MachineOperand;
+class MCContext;
+class MCSymbolWasm;
+class StringRef;
class WebAssemblyFunctionInfo;
namespace WebAssembly {
@@ -33,21 +37,19 @@ extern const char *const CxaRethrowFn;
extern const char *const StdTerminateFn;
extern const char *const PersonalityWrapperFn;
-/// Return the "bottom" block of an entity, which can be either a MachineLoop or
-/// WebAssemblyException. This differs from MachineLoop::getBottomBlock in that
-/// it works even if the entity is discontiguous.
-template <typename T> MachineBasicBlock *getBottom(const T *Unit) {
- MachineBasicBlock *Bottom = Unit->getHeader();
- for (MachineBasicBlock *MBB : Unit->blocks())
- if (MBB->getNumber() > Bottom->getNumber())
- Bottom = MBB;
- return Bottom;
-}
-
/// Returns the operand number of a callee, assuming the argument is a call
/// instruction.
const MachineOperand &getCalleeOp(const MachineInstr &MI);
+/// Returns the MC symbol for the wasm function table with the given name,
+/// creating an undefined function-table symbol if it does not exist yet.
+MCSymbolWasm *getOrCreateFunctionTableSymbol(MCContext &Ctx,
+ const StringRef &Name);
+
+/// Find a catch instruction from an EH pad. Returns null if no catch
+/// instruction is found or the catch is in an invalid location.
+MachineInstr *findCatch(MachineBasicBlock *EHPad);
+
} // end namespace WebAssembly
} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index a3014b2aba92..9d9a20183f0f 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -32,6 +32,7 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -56,37 +57,53 @@ static bool checkScale(unsigned Scale, StringRef &ErrMsg) {
namespace {
static const char OpPrecedence[] = {
- 0, // IC_OR
- 1, // IC_XOR
- 2, // IC_AND
- 3, // IC_LSHIFT
- 3, // IC_RSHIFT
- 4, // IC_PLUS
- 4, // IC_MINUS
- 5, // IC_MULTIPLY
- 5, // IC_DIVIDE
- 5, // IC_MOD
- 6, // IC_NOT
- 7, // IC_NEG
- 8, // IC_RPAREN
- 9, // IC_LPAREN
- 0, // IC_IMM
- 0 // IC_REGISTER
+ 0, // IC_OR
+ 1, // IC_XOR
+ 2, // IC_AND
+ 4, // IC_LSHIFT
+ 4, // IC_RSHIFT
+ 5, // IC_PLUS
+ 5, // IC_MINUS
+ 6, // IC_MULTIPLY
+ 6, // IC_DIVIDE
+ 6, // IC_MOD
+ 7, // IC_NOT
+ 8, // IC_NEG
+ 9, // IC_RPAREN
+ 10, // IC_LPAREN
+ 0, // IC_IMM
+ 0, // IC_REGISTER
+ 3, // IC_EQ
+ 3, // IC_NE
+ 3, // IC_LT
+ 3, // IC_LE
+ 3, // IC_GT
+ 3 // IC_GE
};
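A small illustrative check of the relative ordering encoded above (a higher value binds tighter); the enum below is a stand-in, not the parser's own table:

#include <cassert>

enum MasmPrecedence { PrecCompare = 3, PrecShift = 4, PrecAdd = 5, PrecMul = 6 };

static void examplePrecedence() {
  static_assert(PrecMul > PrecAdd && PrecAdd > PrecShift &&
                    PrecShift > PrecCompare,
                "higher value binds tighter");
  // Under this table, "1 + 2 EQ 3" folds as (1 + 2) EQ 3, and
  // "1 SHL 1 + 1" folds as 1 SHL (1 + 1), i.e. 4.
  assert((1 << (1 + 1)) == 4);
}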
class X86AsmParser : public MCTargetAsmParser {
ParseInstructionInfo *InstInfo;
bool Code16GCC;
+ unsigned ForcedDataPrefix = 0;
enum VEXEncoding {
VEXEncoding_Default,
VEXEncoding_VEX,
+ VEXEncoding_VEX2,
VEXEncoding_VEX3,
VEXEncoding_EVEX,
};
VEXEncoding ForcedVEXEncoding = VEXEncoding_Default;
+ enum DispEncoding {
+ DispEncoding_Default,
+ DispEncoding_Disp8,
+ DispEncoding_Disp32,
+ };
+
+ DispEncoding ForcedDispEncoding = DispEncoding_Default;
+
private:
SMLoc consumeToken() {
MCAsmParser &Parser = getParser();
@@ -132,7 +149,13 @@ private:
IC_RPAREN,
IC_LPAREN,
IC_IMM,
- IC_REGISTER
+ IC_REGISTER,
+ IC_EQ,
+ IC_NE,
+ IC_LT,
+ IC_LE,
+ IC_GT,
+ IC_GE
};
enum IntelOperatorKind {
@@ -142,12 +165,19 @@ private:
IOK_TYPE,
};
+ enum MasmOperatorKind {
+ MOK_INVALID = 0,
+ MOK_LENGTHOF,
+ MOK_SIZEOF,
+ MOK_TYPE,
+ };
+
class InfixCalculator {
typedef std::pair< InfixCalculatorTok, int64_t > ICToken;
SmallVector<InfixCalculatorTok, 4> InfixOperatorStack;
SmallVector<ICToken, 4> PostfixStack;
- bool isUnaryOperator(const InfixCalculatorTok Op) {
+ bool isUnaryOperator(InfixCalculatorTok Op) const {
return Op == IC_NEG || Op == IC_NOT;
}
@@ -314,6 +344,44 @@ private:
Val = Op1.second >> Op2.second;
OperandStack.push_back(std::make_pair(IC_IMM, Val));
break;
+ case IC_EQ:
+ assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Equals operation with an immediate and a register!");
+ Val = (Op1.second == Op2.second) ? -1 : 0;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_NE:
+ assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Not-equals operation with an immediate and a register!");
+ Val = (Op1.second != Op2.second) ? -1 : 0;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_LT:
+ assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Less-than operation with an immediate and a register!");
+ Val = (Op1.second < Op2.second) ? -1 : 0;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_LE:
+ assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Less-than-or-equal operation with an immediate and a "
+ "register!");
+ Val = (Op1.second <= Op2.second) ? -1 : 0;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_GT:
+ assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Greater-than operation with an immediate and a register!");
+ Val = (Op1.second > Op2.second) ? -1 : 0;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_GE:
+ assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Greater-than-or-equal operation with an immediate and a "
+ "register!");
+ Val = (Op1.second >= Op2.second) ? -1 : 0;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
}
}
}
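The IC_EQ through IC_GE cases above fold comparisons with the MASM truth convention (true is -1, all bits set; false is 0). An illustrative standalone mirror of that folding, not the parser's own code:

#include <cassert>
#include <cstdint>

enum CmpOp { CmpEQ, CmpNE, CmpLT, CmpLE, CmpGT, CmpGE };

static int64_t foldMasmCompare(CmpOp Op, int64_t L, int64_t R) {
  bool Holds = false;
  switch (Op) {
  case CmpEQ: Holds = L == R; break;
  case CmpNE: Holds = L != R; break;
  case CmpLT: Holds = L <  R; break;
  case CmpLE: Holds = L <= R; break;
  case CmpGT: Holds = L >  R; break;
  case CmpGE: Holds = L >= R; break;
  }
  return Holds ? -1 : 0; // -1 encodes true, 0 encodes false
}

static void exampleMasmCompare() {
  assert(foldMasmCompare(CmpEQ, 3, 3) == -1); // "3 EQ 3" is true
  assert(foldMasmCompare(CmpLT, 5, 3) == 0);  // "5 LT 3" is false
}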
@@ -327,6 +395,12 @@ private:
IES_OR,
IES_XOR,
IES_AND,
+ IES_EQ,
+ IES_NE,
+ IES_LT,
+ IES_LE,
+ IES_GT,
+ IES_GE,
IES_LSHIFT,
IES_RSHIFT,
IES_PLUS,
@@ -359,7 +433,7 @@ private:
bool MemExpr;
bool OffsetOperator;
SMLoc OffsetOperatorLoc;
- StringRef CurType;
+ AsmTypeInfo CurType;
bool setSymRef(const MCExpr *Val, StringRef ID, StringRef &ErrMsg) {
if (Sym) {
@@ -378,22 +452,25 @@ private:
MemExpr(false), OffsetOperator(false) {}
void addImm(int64_t imm) { Imm += imm; }
- short getBracCount() { return BracCount; }
- bool isMemExpr() { return MemExpr; }
- bool isOffsetOperator() { return OffsetOperator; }
- SMLoc getOffsetLoc() { return OffsetOperatorLoc; }
- unsigned getBaseReg() { return BaseReg; }
- unsigned getIndexReg() { return IndexReg; }
- unsigned getScale() { return Scale; }
- const MCExpr *getSym() { return Sym; }
- StringRef getSymName() { return SymName; }
- StringRef getType() { return CurType; }
+ short getBracCount() const { return BracCount; }
+ bool isMemExpr() const { return MemExpr; }
+ bool isOffsetOperator() const { return OffsetOperator; }
+ SMLoc getOffsetLoc() const { return OffsetOperatorLoc; }
+ unsigned getBaseReg() const { return BaseReg; }
+ unsigned getIndexReg() const { return IndexReg; }
+ unsigned getScale() const { return Scale; }
+ const MCExpr *getSym() const { return Sym; }
+ StringRef getSymName() const { return SymName; }
+ StringRef getType() const { return CurType.Name; }
+ unsigned getSize() const { return CurType.Size; }
+ unsigned getElementSize() const { return CurType.ElementSize; }
+ unsigned getLength() const { return CurType.Length; }
int64_t getImm() { return Imm + IC.execute(); }
- bool isValidEndState() {
+ bool isValidEndState() const {
return State == IES_RBRAC || State == IES_INTEGER;
}
- bool hadError() { return State == IES_ERROR; }
- InlineAsmIdentifierInfo &getIdentifierInfo() { return Info; }
+ bool hadError() const { return State == IES_ERROR; }
+ const InlineAsmIdentifierInfo &getIdentifierInfo() const { return Info; }
void onOr() {
IntelExprState CurrState = State;
@@ -440,6 +517,96 @@ private:
}
PrevState = CurrState;
}
+ void onEq() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_EQ;
+ IC.pushOperator(IC_EQ);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onNE() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_NE;
+ IC.pushOperator(IC_NE);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onLT() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_LT;
+ IC.pushOperator(IC_LT);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onLE() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_LE;
+ IC.pushOperator(IC_LE);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onGT() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_GT;
+ IC.pushOperator(IC_GT);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onGE() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_GE;
+ IC.pushOperator(IC_GE);
+ break;
+ }
+ PrevState = CurrState;
+ }
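A reduced model of the state discipline shared by the handlers above: a binary operator such as EQ or LT is accepted only right after a completed operand (an integer, a closing parenthesis, or a register); anything else drives the machine into an error state. Illustrative only:

#include <cassert>

enum class ExprState { Init, Operand, Operator, Error };

static ExprState onOperand(ExprState S) {
  return (S == ExprState::Init || S == ExprState::Operator) ? ExprState::Operand
                                                            : ExprState::Error;
}

static ExprState onBinaryOp(ExprState S) {
  return S == ExprState::Operand ? ExprState::Operator : ExprState::Error;
}

static void exampleExprStates() {
  ExprState S = ExprState::Init;
  S = onOperand(S);  // "3"
  S = onBinaryOp(S); // "EQ"
  S = onOperand(S);  // "4"
  assert(S == ExprState::Operand);                         // "3 EQ 4" parses
  assert(onBinaryOp(ExprState::Init) == ExprState::Error); // "EQ 4" does not
}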
void onLShift() {
IntelExprState CurrState = State;
switch (State) {
@@ -510,6 +677,12 @@ private:
case IES_OR:
case IES_XOR:
case IES_AND:
+ case IES_EQ:
+ case IES_NE:
+ case IES_LT:
+ case IES_LE:
+ case IES_GT:
+ case IES_GE:
case IES_LSHIFT:
case IES_RSHIFT:
case IES_PLUS:
@@ -565,6 +738,12 @@ private:
case IES_OR:
case IES_XOR:
case IES_AND:
+ case IES_EQ:
+ case IES_NE:
+ case IES_LT:
+ case IES_LE:
+ case IES_GT:
+ case IES_GE:
case IES_LSHIFT:
case IES_RSHIFT:
case IES_PLUS:
@@ -620,7 +799,8 @@ private:
}
bool onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName,
const InlineAsmIdentifierInfo &IDInfo,
- bool ParsingMSInlineAsm, StringRef &ErrMsg) {
+ const AsmTypeInfo &Type, bool ParsingMSInlineAsm,
+ StringRef &ErrMsg) {
// InlineAsm: Treat an enum value as an integer
if (ParsingMSInlineAsm)
if (IDInfo.isKind(InlineAsmIdentifierInfo::IK_EnumVal))
@@ -639,6 +819,7 @@ private:
case IES_NOT:
case IES_INIT:
case IES_LBRAC:
+ case IES_LPAREN:
if (setSymRef(SymRef, SymRefName, ErrMsg))
return true;
MemExpr = true;
@@ -646,6 +827,7 @@ private:
IC.pushOperand(IC_IMM);
if (ParsingMSInlineAsm)
Info = IDInfo;
+ setTypeInfo(Type);
break;
}
return false;
@@ -662,6 +844,12 @@ private:
case IES_OR:
case IES_XOR:
case IES_AND:
+ case IES_EQ:
+ case IES_NE:
+ case IES_LT:
+ case IES_LE:
+ case IES_GT:
+ case IES_GE:
case IES_LSHIFT:
case IES_RSHIFT:
case IES_DIVIDE:
@@ -744,6 +932,8 @@ private:
case IES_RPAREN:
State = IES_PLUS;
IC.pushOperator(IC_PLUS);
+ CurType.Length = 1;
+ CurType.Size = CurType.ElementSize;
break;
case IES_INIT:
case IES_CAST:
@@ -796,6 +986,12 @@ private:
case IES_OR:
case IES_XOR:
case IES_AND:
+ case IES_EQ:
+ case IES_NE:
+ case IES_LT:
+ case IES_LE:
+ case IES_GT:
+ case IES_GE:
case IES_LSHIFT:
case IES_RSHIFT:
case IES_MULTIPLY:
@@ -827,8 +1023,8 @@ private:
}
}
bool onOffset(const MCExpr *Val, SMLoc OffsetLoc, StringRef ID,
- const InlineAsmIdentifierInfo &IDInfo, bool ParsingMSInlineAsm,
- StringRef &ErrMsg) {
+ const InlineAsmIdentifierInfo &IDInfo,
+ bool ParsingMSInlineAsm, StringRef &ErrMsg) {
PrevState = State;
switch (State) {
default:
@@ -852,19 +1048,19 @@ private:
}
return false;
}
- void onCast(StringRef Type) {
+ void onCast(AsmTypeInfo Info) {
PrevState = State;
switch (State) {
default:
State = IES_ERROR;
break;
case IES_LPAREN:
- setType(Type);
+ setTypeInfo(Info);
State = IES_CAST;
break;
}
}
- void setType(StringRef Type) { CurType = Type; }
+ void setTypeInfo(AsmTypeInfo Type) { CurType = Type; }
};
bool Error(SMLoc L, const Twine &Msg, SMRange Range = None,
@@ -878,11 +1074,6 @@ private:
return Parser.Error(L, Msg, Range);
}
- std::nullptr_t ErrorOperand(SMLoc Loc, StringRef Msg, SMRange R = SMRange()) {
- Error(Loc, Msg, R);
- return nullptr;
- }
-
bool MatchRegisterByName(unsigned &RegNo, StringRef RegName, SMLoc StartLoc,
SMLoc EndLoc);
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc,
@@ -898,17 +1089,21 @@ private:
std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst);
bool VerifyAndAdjustOperands(OperandVector &OrigOperands,
OperandVector &FinalOperands);
- std::unique_ptr<X86Operand> ParseOperand();
- std::unique_ptr<X86Operand> ParseATTOperand();
- std::unique_ptr<X86Operand> ParseIntelOperand();
+ bool ParseOperand(OperandVector &Operands);
+ bool ParseATTOperand(OperandVector &Operands);
+ bool ParseIntelOperand(OperandVector &Operands);
bool ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID,
InlineAsmIdentifierInfo &Info, SMLoc &End);
bool ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End);
unsigned IdentifyIntelInlineAsmOperator(StringRef Name);
unsigned ParseIntelInlineAsmOperator(unsigned OpKind);
- std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start);
+ unsigned IdentifyMasmOperator(StringRef Name);
+ bool ParseMasmOperator(unsigned OpKind, int64_t &Val);
+ bool ParseRoundingModeOp(SMLoc Start, OperandVector &Operands);
bool ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM,
bool &ParseError, SMLoc &End);
+ bool ParseMasmNamedOperator(StringRef Name, IntelExprStateMachine &SM,
+ bool &ParseError, SMLoc &End);
void RewriteIntelExpression(IntelExprStateMachine &SM, SMLoc Start,
SMLoc End);
bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
@@ -917,20 +1112,21 @@ private:
bool IsUnevaluatedOperand, SMLoc &End,
bool IsParsingOffsetOperator = false);
- std::unique_ptr<X86Operand> ParseMemOperand(unsigned SegReg,
- const MCExpr *&Disp,
- const SMLoc &StartLoc,
- SMLoc &EndLoc);
+ bool ParseMemOperand(unsigned SegReg, const MCExpr *Disp, SMLoc StartLoc,
+ SMLoc EndLoc, OperandVector &Operands);
X86::CondCode ParseConditionCode(StringRef CCode);
bool ParseIntelMemoryOperandSize(unsigned &Size);
- std::unique_ptr<X86Operand>
- CreateMemForMSInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg,
- unsigned IndexReg, unsigned Scale, SMLoc Start,
- SMLoc End, unsigned Size, StringRef Identifier,
- const InlineAsmIdentifierInfo &Info);
-
+ bool CreateMemForMSInlineAsm(unsigned SegReg, const MCExpr *Disp,
+ unsigned BaseReg, unsigned IndexReg,
+ unsigned Scale, SMLoc Start, SMLoc End,
+ unsigned Size, StringRef Identifier,
+ const InlineAsmIdentifierInfo &Info,
+ OperandVector &Operands);
+
+ bool parseDirectiveArch();
+ bool parseDirectiveNops(SMLoc L);
bool parseDirectiveEven(SMLoc L);
bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
@@ -942,7 +1138,6 @@ private:
bool parseDirectiveFPOStackAlign(SMLoc L);
bool parseDirectiveFPOEndPrologue(SMLoc L);
bool parseDirectiveFPOEndProc(SMLoc L);
- bool parseDirectiveFPOData(SMLoc L);
/// SEH directives.
bool parseSEHRegisterNumber(unsigned RegClassID, unsigned &RegNo);
@@ -992,8 +1187,7 @@ private:
/// Parses AVX512 specific operand primitives: masked registers ({%k<NUM>}, {z})
/// and memory broadcasting ({1to<NUM>}) primitives, updating Operands vector if required.
/// return false if no parsing errors occurred, true otherwise.
- bool HandleAVX512Operand(OperandVector &Operands,
- const MCParsedAsmOperand &Op);
+ bool HandleAVX512Operand(OperandVector &Operands);
bool ParseZ(std::unique_ptr<X86Operand> &Z, const SMLoc &StartLoc);
@@ -1188,8 +1382,6 @@ bool X86AsmParser::MatchRegisterByName(unsigned &RegNo, StringRef RegName,
// FIXME: This should be done using Requires<Not64BitMode> and
// Requires<In64BitMode> so "eiz" usage in 64-bit instructions can be also
// checked.
- // FIXME: Check AH, CH, DH, BH cannot be used in an instruction requiring a
- // REX prefix.
if (RegNo == X86::RIZ || RegNo == X86::RIP ||
X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) ||
X86II::isX86_64NonExtLowByteReg(RegNo) ||
@@ -1524,16 +1716,17 @@ bool X86AsmParser::VerifyAndAdjustOperands(OperandVector &OrigOperands,
return false;
}
-std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() {
+bool X86AsmParser::ParseOperand(OperandVector &Operands) {
if (isParsingIntelSyntax())
- return ParseIntelOperand();
- return ParseATTOperand();
+ return ParseIntelOperand(Operands);
+
+ return ParseATTOperand(Operands);
}
-std::unique_ptr<X86Operand> X86AsmParser::CreateMemForMSInlineAsm(
+bool X86AsmParser::CreateMemForMSInlineAsm(
unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg,
unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier,
- const InlineAsmIdentifierInfo &Info) {
+ const InlineAsmIdentifierInfo &Info, OperandVector &Operands) {
// If we found a decl other than a VarDecl, then assume it is a FuncDecl or
// some other label reference.
if (Info.isKind(InlineAsmIdentifierInfo::IK_Label)) {
@@ -1545,8 +1738,10 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForMSInlineAsm(
}
// Create an absolute memory reference in order to match against
// instructions taking a PC relative operand.
- return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size,
- Identifier, Info.Label.Decl);
+ Operands.push_back(X86Operand::CreateMem(getPointerWidth(), Disp, Start,
+ End, Size, Identifier,
+ Info.Label.Decl));
+ return false;
}
// We either have a direct symbol reference, or an offset from a symbol. The
// parser always puts the symbol on the LHS, so look there for size
@@ -1563,17 +1758,19 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForMSInlineAsm(
// It is common for MS InlineAsm to use a global variable and one or two
// registers in a memory expression, even though it is not accessible via rip/eip.
if (IsGlobalLV && (BaseReg || IndexReg)) {
- return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End);
+ Operands.push_back(
+ X86Operand::CreateMem(getPointerWidth(), Disp, Start, End));
+ return false;
+ }
// Otherwise, we set the base register to a non-zero value
// if we don't know the actual value at this time. This is necessary to
// get the matching correct in some cases.
- } else {
- BaseReg = BaseReg ? BaseReg : 1;
- return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
- IndexReg, Scale, Start, End, Size,
- /*DefaultBaseReg=*/X86::RIP, Identifier, Decl,
- FrontendSize);
- }
+ BaseReg = BaseReg ? BaseReg : 1;
+ Operands.push_back(X86Operand::CreateMem(
+ getPointerWidth(), SegReg, Disp, BaseReg, IndexReg, Scale, Start, End,
+ Size,
+ /*DefaultBaseReg=*/X86::RIP, Identifier, Decl, FrontendSize));
+ return false;
}
// Some binary bitwise operators have a named synonym
@@ -1582,8 +1779,10 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForMSInlineAsm(
bool X86AsmParser::ParseIntelNamedOperator(StringRef Name,
IntelExprStateMachine &SM,
bool &ParseError, SMLoc &End) {
- // A named operator should be either lower or upper case, but not a mix
- if (Name.compare(Name.lower()) && Name.compare(Name.upper()))
+ // A named operator should be either lower or upper case, but not a mix...
+ // except in MASM, which uses full case-insensitivity.
+ if (Name.compare(Name.lower()) && Name.compare(Name.upper()) &&
+ !getParser().isParsingMasm())
return false;
if (Name.equals_lower("not")) {
SM.onNot();
@@ -1619,15 +1818,39 @@ bool X86AsmParser::ParseIntelNamedOperator(StringRef Name,
End = consumeToken();
return true;
}
+bool X86AsmParser::ParseMasmNamedOperator(StringRef Name,
+ IntelExprStateMachine &SM,
+ bool &ParseError, SMLoc &End) {
+ if (Name.equals_lower("eq")) {
+ SM.onEq();
+ } else if (Name.equals_lower("ne")) {
+ SM.onNE();
+ } else if (Name.equals_lower("lt")) {
+ SM.onLT();
+ } else if (Name.equals_lower("le")) {
+ SM.onLE();
+ } else if (Name.equals_lower("gt")) {
+ SM.onGT();
+ } else if (Name.equals_lower("ge")) {
+ SM.onGE();
+ } else {
+ return false;
+ }
+ End = consumeToken();
+ return true;
+}
bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
MCAsmParser &Parser = getParser();
- const AsmToken &Tok = Parser.getTok();
StringRef ErrMsg;
AsmToken::TokenKind PrevTK = AsmToken::Error;
bool Done = false;
while (!Done) {
+ // Get a fresh reference on each loop iteration in case the previous
+ // iteration moved the token storage during UnLex().
+ const AsmToken &Tok = Parser.getTok();
+
bool UpdateLocLex = true;
AsmToken::TokenKind TK = getLexer().getKind();
@@ -1636,6 +1859,9 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
if ((Done = SM.isValidEndState()))
break;
return Error(Tok.getLoc(), "unknown token in expression");
+ case AsmToken::Error:
+ return Error(getLexer().getErrLoc(), getLexer().getErr());
+ break;
case AsmToken::EndOfStatement:
Done = true;
break;
@@ -1645,18 +1871,73 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
if (ParseIntelDotOperator(SM, End))
return true;
break;
+ case AsmToken::Dot:
+ if (!Parser.isParsingMasm()) {
+ if ((Done = SM.isValidEndState()))
+ break;
+ return Error(Tok.getLoc(), "unknown token in expression");
+ }
+ // MASM allows spaces around the dot operator (e.g., "var . x")
+ Lex();
+ UpdateLocLex = false;
+ if (ParseIntelDotOperator(SM, End))
+ return true;
+ break;
+ case AsmToken::Dollar:
+ if (!Parser.isParsingMasm()) {
+ if ((Done = SM.isValidEndState()))
+ break;
+ return Error(Tok.getLoc(), "unknown token in expression");
+ }
+ LLVM_FALLTHROUGH;
+ case AsmToken::String: {
+ if (Parser.isParsingMasm()) {
+ // MASM parsers handle strings in expressions as constants.
+ SMLoc ValueLoc = Tok.getLoc();
+ int64_t Res;
+ const MCExpr *Val;
+ if (Parser.parsePrimaryExpr(Val, End, nullptr))
+ return true;
+ UpdateLocLex = false;
+ if (!Val->evaluateAsAbsolute(Res, getStreamer().getAssemblerPtr()))
+ return Error(ValueLoc, "expected absolute value");
+ if (SM.onInteger(Res, ErrMsg))
+ return Error(ValueLoc, ErrMsg);
+ break;
+ }
+ LLVM_FALLTHROUGH;
+ }
case AsmToken::At:
- case AsmToken::String:
case AsmToken::Identifier: {
SMLoc IdentLoc = Tok.getLoc();
StringRef Identifier = Tok.getString();
UpdateLocLex = false;
+ if (Parser.isParsingMasm()) {
+ size_t DotOffset = Identifier.find_first_of('.');
+ if (DotOffset != StringRef::npos) {
+ consumeToken();
+ StringRef LHS = Identifier.slice(0, DotOffset);
+ StringRef Dot = Identifier.slice(DotOffset, DotOffset + 1);
+ StringRef RHS = Identifier.slice(DotOffset + 1, StringRef::npos);
+ if (!RHS.empty()) {
+ getLexer().UnLex(AsmToken(AsmToken::Identifier, RHS));
+ }
+ getLexer().UnLex(AsmToken(AsmToken::Dot, Dot));
+ if (!LHS.empty()) {
+ getLexer().UnLex(AsmToken(AsmToken::Identifier, LHS));
+ }
+ break;
+ }
+ }
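When MASM lexes something like var.field as one identifier, the block above splits it and pushes the pieces back so they are re-read as var, '.', field (the UnLex order is reversed because the lexer behaves like a stack). A tiny illustrative split using std::string, not the parser's own code:

#include <cassert>
#include <string>
#include <vector>

static std::vector<std::string> splitDottedIdent(const std::string &Ident) {
  size_t Dot = Ident.find('.');
  if (Dot == std::string::npos)
    return {Ident};
  std::vector<std::string> Pieces;
  if (Dot != 0)
    Pieces.push_back(Ident.substr(0, Dot)); // LHS
  Pieces.push_back(".");                    // the dot itself
  if (Dot + 1 < Ident.size())
    Pieces.push_back(Ident.substr(Dot + 1)); // RHS (may contain more dots)
  return Pieces;
}

static void exampleSplit() {
  auto P = splitDottedIdent("var.field");
  assert(P.size() == 3 && P[0] == "var" && P[1] == "." && P[2] == "field");
}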
// (MASM only) <TYPE> PTR operator
if (Parser.isParsingMasm()) {
const AsmToken &NextTok = getLexer().peekTok();
if (NextTok.is(AsmToken::Identifier) &&
NextTok.getIdentifier().equals_lower("ptr")) {
- SM.onCast(Identifier);
+ AsmTypeInfo Info;
+ if (Parser.lookUpType(Identifier, Info))
+ return Error(Tok.getLoc(), "unknown type");
+ SM.onCast(Info);
// Eat type and PTR.
consumeToken();
End = consumeToken();
@@ -1681,16 +1962,15 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
if (SM.onRegister(Reg, ErrMsg))
return Error(IdentLoc, ErrMsg);
- StringRef Type;
- unsigned Offset = 0;
+ AsmFieldInfo Info;
SMLoc FieldStartLoc = SMLoc::getFromPointer(Field.data());
- if (Parser.lookUpField(Field, Type, Offset))
+ if (Parser.lookUpField(Field, Info))
return Error(FieldStartLoc, "unknown offset");
else if (SM.onPlus(ErrMsg))
return Error(getTok().getLoc(), ErrMsg);
- else if (SM.onInteger(Offset, ErrMsg))
+ else if (SM.onInteger(Info.Offset, ErrMsg))
return Error(IdentLoc, ErrMsg);
- SM.setType(Type);
+ SM.setTypeInfo(Info.Type);
End = consumeToken();
break;
@@ -1704,8 +1984,15 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
return true;
break;
}
+ if (Parser.isParsingMasm() &&
+ ParseMasmNamedOperator(Identifier, SM, ParseError, End)) {
+ if (ParseError)
+ return true;
+ break;
+ }
// Symbol reference, when parsing assembly content
InlineAsmIdentifierInfo Info;
+ AsmFieldInfo FieldInfo;
const MCExpr *Val;
if (isParsingMSInlineAsm() || Parser.isParsingMasm()) {
// MS Dot Operator expression
@@ -1722,8 +2009,9 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
if (int64_t Val = ParseIntelInlineAsmOperator(OpKind)) {
if (SM.onInteger(Val, ErrMsg))
return Error(IdentLoc, ErrMsg);
- } else
+ } else {
return true;
+ }
break;
}
// MS InlineAsm identifier
@@ -1732,13 +2020,49 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
return Error(IdentLoc, "expected identifier");
if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info, false, End))
return true;
- else if (SM.onIdentifierExpr(Val, Identifier, Info, true, ErrMsg))
+ else if (SM.onIdentifierExpr(Val, Identifier, Info, FieldInfo.Type,
+ true, ErrMsg))
return Error(IdentLoc, ErrMsg);
break;
}
- if (getParser().parsePrimaryExpr(Val, End)) {
+ if (Parser.isParsingMasm()) {
+ if (unsigned OpKind = IdentifyMasmOperator(Identifier)) {
+ int64_t Val;
+ if (ParseMasmOperator(OpKind, Val))
+ return true;
+ if (SM.onInteger(Val, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
+ break;
+ }
+ if (!getParser().lookUpType(Identifier, FieldInfo.Type)) {
+ // Field offset immediate; <TYPE>.<field specification>
+ Lex(); // eat type
+ bool EndDot = parseOptionalToken(AsmToken::Dot);
+ while (EndDot || (getTok().is(AsmToken::Identifier) &&
+ getTok().getString().startswith("."))) {
+ getParser().parseIdentifier(Identifier);
+ if (!EndDot)
+ Identifier.consume_front(".");
+ EndDot = Identifier.consume_back(".");
+ if (getParser().lookUpField(FieldInfo.Type.Name, Identifier,
+ FieldInfo)) {
+ SMLoc IDEnd =
+ SMLoc::getFromPointer(Identifier.data() + Identifier.size());
+ return Error(IdentLoc, "Unable to lookup field reference!",
+ SMRange(IdentLoc, IDEnd));
+ }
+ if (!EndDot)
+ EndDot = parseOptionalToken(AsmToken::Dot);
+ }
+ if (SM.onInteger(FieldInfo.Offset, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
+ break;
+ }
+ }
+ if (getParser().parsePrimaryExpr(Val, End, &FieldInfo.Type)) {
return Error(Tok.getLoc(), "Unexpected identifier!");
- } else if (SM.onIdentifierExpr(Val, Identifier, Info, false, ErrMsg)) {
+ } else if (SM.onIdentifierExpr(Val, Identifier, Info, FieldInfo.Type,
+ false, ErrMsg)) {
return Error(IdentLoc, ErrMsg);
}
break;
@@ -1761,8 +2085,9 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
return Error(Loc, "invalid reference to undefined symbol");
StringRef Identifier = Sym->getName();
InlineAsmIdentifierInfo Info;
- if (SM.onIdentifierExpr(Val, Identifier, Info, isParsingMSInlineAsm(),
- ErrMsg))
+ AsmTypeInfo Type;
+ if (SM.onIdentifierExpr(Val, Identifier, Info, Type,
+ isParsingMSInlineAsm(), ErrMsg))
return Error(Loc, ErrMsg);
End = consumeToken();
} else {
@@ -1904,14 +2229,13 @@ bool X86AsmParser::ParseIntelInlineAsmIdentifier(
}
// ParseRoundingModeOp - Parse AVX-512 rounding mode operand
-std::unique_ptr<X86Operand>
-X86AsmParser::ParseRoundingModeOp(SMLoc Start) {
+bool X86AsmParser::ParseRoundingModeOp(SMLoc Start, OperandVector &Operands) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
// Eat "{" and mark the current place.
const SMLoc consumedToken = consumeToken();
if (Tok.isNot(AsmToken::Identifier))
- return ErrorOperand(Tok.getLoc(), "Expected an identifier after {");
+ return Error(Tok.getLoc(), "Expected an identifier after {");
if (Tok.getIdentifier().startswith("r")){
int rndMode = StringSwitch<int>(Tok.getIdentifier())
.Case("rn", X86::STATIC_ROUNDING::TO_NEAREST_INT)
@@ -1920,67 +2244,76 @@ X86AsmParser::ParseRoundingModeOp(SMLoc Start) {
.Case("rz", X86::STATIC_ROUNDING::TO_ZERO)
.Default(-1);
if (-1 == rndMode)
- return ErrorOperand(Tok.getLoc(), "Invalid rounding mode.");
+ return Error(Tok.getLoc(), "Invalid rounding mode.");
Parser.Lex(); // Eat "r*" of r*-sae
if (!getLexer().is(AsmToken::Minus))
- return ErrorOperand(Tok.getLoc(), "Expected - at this point");
+ return Error(Tok.getLoc(), "Expected - at this point");
Parser.Lex(); // Eat "-"
Parser.Lex(); // Eat the sae
if (!getLexer().is(AsmToken::RCurly))
- return ErrorOperand(Tok.getLoc(), "Expected } at this point");
+ return Error(Tok.getLoc(), "Expected } at this point");
SMLoc End = Tok.getEndLoc();
Parser.Lex(); // Eat "}"
const MCExpr *RndModeOp =
MCConstantExpr::create(rndMode, Parser.getContext());
- return X86Operand::CreateImm(RndModeOp, Start, End);
+ Operands.push_back(X86Operand::CreateImm(RndModeOp, Start, End));
+ return false;
}
if(Tok.getIdentifier().equals("sae")){
Parser.Lex(); // Eat the sae
if (!getLexer().is(AsmToken::RCurly))
- return ErrorOperand(Tok.getLoc(), "Expected } at this point");
+ return Error(Tok.getLoc(), "Expected } at this point");
Parser.Lex(); // Eat "}"
- return X86Operand::CreateToken("{sae}", consumedToken);
+ Operands.push_back(X86Operand::CreateToken("{sae}", consumedToken));
+ return false;
}
- return ErrorOperand(Tok.getLoc(), "unknown token in expression");
+ return Error(Tok.getLoc(), "unknown token in expression");
}
/// Parse the '.' operator.
bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM,
SMLoc &End) {
const AsmToken &Tok = getTok();
- StringRef Type;
- unsigned Offset = 0;
+ AsmFieldInfo Info;
// Drop the optional '.'.
StringRef DotDispStr = Tok.getString();
if (DotDispStr.startswith("."))
DotDispStr = DotDispStr.drop_front(1);
+ StringRef TrailingDot;
// .Imm gets lexed as a real.
if (Tok.is(AsmToken::Real)) {
APInt DotDisp;
DotDispStr.getAsInteger(10, DotDisp);
- Offset = DotDisp.getZExtValue();
+ Info.Offset = DotDisp.getZExtValue();
} else if ((isParsingMSInlineAsm() || getParser().isParsingMasm()) &&
Tok.is(AsmToken::Identifier)) {
+ if (DotDispStr.endswith(".")) {
+ TrailingDot = DotDispStr.substr(DotDispStr.size() - 1);
+ DotDispStr = DotDispStr.drop_back(1);
+ }
const std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
const StringRef Base = BaseMember.first, Member = BaseMember.second;
- if (getParser().lookUpField(SM.getType(), DotDispStr, Type, Offset) &&
- getParser().lookUpField(SM.getSymName(), DotDispStr, Type, Offset) &&
- getParser().lookUpField(DotDispStr, Type, Offset) &&
+ if (getParser().lookUpField(SM.getType(), DotDispStr, Info) &&
+ getParser().lookUpField(SM.getSymName(), DotDispStr, Info) &&
+ getParser().lookUpField(DotDispStr, Info) &&
(!SemaCallback ||
- SemaCallback->LookupInlineAsmField(Base, Member, Offset)))
+ SemaCallback->LookupInlineAsmField(Base, Member, Info.Offset)))
return Error(Tok.getLoc(), "Unable to lookup field reference!");
- } else
+ } else {
return Error(Tok.getLoc(), "Unexpected token type!");
+ }
// Eat the DotExpression and update End
End = SMLoc::getFromPointer(DotDispStr.data());
const char *DotExprEndLoc = DotDispStr.data() + DotDispStr.size();
while (Tok.getLoc().getPointer() < DotExprEndLoc)
Lex();
- SM.addImm(Offset);
- SM.setType(Type);
+ if (!TrailingDot.empty())
+ getLexer().UnLex(AsmToken(AsmToken::Dot, TrailingDot));
+ SM.addImm(Info.Offset);
+ SM.setTypeInfo(Info.Type);
return false;
}
@@ -1995,7 +2328,7 @@ bool X86AsmParser::ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID,
if (!isParsingMSInlineAsm()) {
if ((getTok().isNot(AsmToken::Identifier) &&
getTok().isNot(AsmToken::String)) ||
- getParser().parsePrimaryExpr(Val, End))
+ getParser().parsePrimaryExpr(Val, End, nullptr))
return Error(Start, "unexpected token!");
} else if (ParseIntelInlineAsmIdentifier(Val, ID, Info, false, End, true)) {
return Error(Start, "unable to lookup expression");
@@ -2031,7 +2364,7 @@ unsigned X86AsmParser::ParseIntelInlineAsmOperator(unsigned OpKind) {
SMLoc Start = Tok.getLoc(), End;
StringRef Identifier = Tok.getString();
if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info,
- /*Unevaluated=*/true, End))
+ /*IsUnevaluatedOperand=*/true, End))
return 0;
if (!Info.isKind(InlineAsmIdentifierInfo::IK_Var)) {
@@ -2050,6 +2383,73 @@ unsigned X86AsmParser::ParseIntelInlineAsmOperator(unsigned OpKind) {
return CVal;
}
+// Query a candidate string for being a MASM operator. Report back its kind,
+// or MOK_INVALID if it does not evaluate to a known one.
+unsigned X86AsmParser::IdentifyMasmOperator(StringRef Name) {
+ return StringSwitch<unsigned>(Name.lower())
+ .Case("type", MOK_TYPE)
+ .Cases("size", "sizeof", MOK_SIZEOF)
+ .Cases("length", "lengthof", MOK_LENGTHOF)
+ .Default(MOK_INVALID);
+}
+
+/// Parse the 'LENGTHOF', 'SIZEOF', and 'TYPE' operators. The LENGTHOF operator
+/// returns the number of elements in an array. It returns the value 1 for
+/// non-array variables. The SIZEOF operator returns the size of a type or
+/// variable in bytes. A variable's size is the product of its LENGTH and TYPE.
+/// The TYPE operator returns the size of a variable. If the variable is an
+/// array, TYPE returns the size of a single element.
+bool X86AsmParser::ParseMasmOperator(unsigned OpKind, int64_t &Val) {
+ MCAsmParser &Parser = getParser();
+ SMLoc OpLoc = Parser.getTok().getLoc();
+ Parser.Lex(); // Eat operator.
+
+ Val = 0;
+ if (OpKind == MOK_SIZEOF || OpKind == MOK_TYPE) {
+ // Check for SIZEOF(<type>) and TYPE(<type>).
+ bool InParens = Parser.getTok().is(AsmToken::LParen);
+ const AsmToken &IDTok = InParens ? getLexer().peekTok() : Parser.getTok();
+ AsmTypeInfo Type;
+ if (IDTok.is(AsmToken::Identifier) &&
+ !Parser.lookUpType(IDTok.getIdentifier(), Type)) {
+ Val = Type.Size;
+
+ // Eat tokens.
+ if (InParens)
+ parseToken(AsmToken::LParen);
+ parseToken(AsmToken::Identifier);
+ if (InParens)
+ parseToken(AsmToken::RParen);
+ }
+ }
+
+ if (!Val) {
+ IntelExprStateMachine SM;
+ SMLoc End, Start = Parser.getTok().getLoc();
+ if (ParseIntelExpression(SM, End))
+ return true;
+
+ switch (OpKind) {
+ default:
+ llvm_unreachable("Unexpected operand kind!");
+ case MOK_SIZEOF:
+ Val = SM.getSize();
+ break;
+ case MOK_LENGTHOF:
+ Val = SM.getLength();
+ break;
+ case MOK_TYPE:
+ Val = SM.getElementSize();
+ break;
+ }
+
+ if (!Val)
+ return Error(OpLoc, "expression has unknown type", SMRange(Start, End));
+ }
+
+ return false;
+}
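A worked example of the relationships the doc comment above describes (SIZEOF equals LENGTHOF times TYPE), using an illustrative struct rather than the parser's real bookkeeping:

#include <cassert>

struct MasmVarInfo {
  unsigned Length;      // LENGTHOF: number of elements (1 for scalars)
  unsigned ElementSize; // TYPE: size of one element in bytes
  unsigned sizeInBytes() const { return Length * ElementSize; } // SIZEOF
};

static void exampleMasmOperators() {
  MasmVarInfo Arr{10, 4}; // e.g. "arr DWORD 10 DUP (?)"
  assert(Arr.Length == 10);        // LENGTHOF arr
  assert(Arr.ElementSize == 4);    // TYPE arr
  assert(Arr.sizeInBytes() == 40); // SIZEOF arr
}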
+
bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) {
Size = StringSwitch<unsigned>(getTok().getString())
.Cases("BYTE", "byte", 8)
@@ -2076,7 +2476,7 @@ bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) {
return false;
}
-std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
+bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
SMLoc Start, End;
@@ -2084,28 +2484,31 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
// Parse optional Size directive.
unsigned Size;
if (ParseIntelMemoryOperandSize(Size))
- return nullptr;
+ return true;
bool PtrInOperand = bool(Size);
Start = Tok.getLoc();
// Rounding mode operand.
if (getLexer().is(AsmToken::LCurly))
- return ParseRoundingModeOp(Start);
+ return ParseRoundingModeOp(Start, Operands);
// Register operand.
unsigned RegNo = 0;
if (Tok.is(AsmToken::Identifier) && !ParseRegister(RegNo, Start, End)) {
if (RegNo == X86::RIP)
- return ErrorOperand(Start, "rip can only be used as a base register");
+ return Error(Start, "rip can only be used as a base register");
// A Register followed by ':' is considered a segment override
- if (Tok.isNot(AsmToken::Colon))
- return !PtrInOperand ? X86Operand::CreateReg(RegNo, Start, End) :
- ErrorOperand(Start, "expected memory operand after 'ptr', "
+ if (Tok.isNot(AsmToken::Colon)) {
+ if (PtrInOperand)
+ return Error(Start, "expected memory operand after 'ptr', "
"found register operand instead");
+ Operands.push_back(X86Operand::CreateReg(RegNo, Start, End));
+ return false;
+ }
// An alleged segment override. check if we have a valid segment register
if (!X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo))
- return ErrorOperand(Start, "invalid segment register");
+ return Error(Start, "invalid segment register");
// Eat ':' and update Start location
Start = Lex().getLoc();
}
@@ -2113,7 +2516,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
// Immediates and Memory
IntelExprStateMachine SM;
if (ParseIntelExpression(SM, End))
- return nullptr;
+ return true;
if (isParsingMSInlineAsm())
RewriteIntelExpression(SM, Start, Tok.getLoc());
@@ -2130,22 +2533,27 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
// and we are parsing a segment override
if (!SM.isMemExpr() && !RegNo) {
if (isParsingMSInlineAsm() && SM.isOffsetOperator()) {
- const InlineAsmIdentifierInfo Info = SM.getIdentifierInfo();
+ const InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
if (Info.isKind(InlineAsmIdentifierInfo::IK_Var)) {
// Disp includes the address of a variable; make sure this is recorded
// for later handling.
- return X86Operand::CreateImm(Disp, Start, End, SM.getSymName(),
- Info.Var.Decl, Info.Var.IsGlobalLV);
+ Operands.push_back(X86Operand::CreateImm(Disp, Start, End,
+ SM.getSymName(), Info.Var.Decl,
+ Info.Var.IsGlobalLV));
+ return false;
}
}
- return X86Operand::CreateImm(Disp, Start, End);
+ Operands.push_back(X86Operand::CreateImm(Disp, Start, End));
+ return false;
}
StringRef ErrMsg;
unsigned BaseReg = SM.getBaseReg();
unsigned IndexReg = SM.getIndexReg();
unsigned Scale = SM.getScale();
+ if (!PtrInOperand)
+ Size = SM.getElementSize() << 3;
if (Scale == 0 && BaseReg != X86::ESP && BaseReg != X86::RSP &&
(IndexReg == X86::ESP || IndexReg == X86::RSP))
@@ -2164,7 +2572,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
if (Scale != 0 &&
X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg))
- return ErrorOperand(Start, "16-bit addresses cannot have a scale");
+ return Error(Start, "16-bit addresses cannot have a scale");
// If there was no explicit scale specified, change it to 1.
if (Scale == 0)
@@ -2180,26 +2588,33 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
if ((BaseReg || IndexReg) &&
CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, is64BitMode(),
ErrMsg))
- return ErrorOperand(Start, ErrMsg);
+ return Error(Start, ErrMsg);
if (isParsingMSInlineAsm())
return CreateMemForMSInlineAsm(RegNo, Disp, BaseReg, IndexReg, Scale, Start,
End, Size, SM.getSymName(),
- SM.getIdentifierInfo());
+ SM.getIdentifierInfo(), Operands);
// When parsing x64 MS-style assembly, all memory operands default to
// RIP-relative when interpreted as non-absolute references.
- if (Parser.isParsingMasm() && is64BitMode())
- return X86Operand::CreateMem(getPointerWidth(), RegNo, Disp, BaseReg,
- IndexReg, Scale, Start, End, Size,
- /*DefaultBaseReg=*/X86::RIP);
-
- if (!(BaseReg || IndexReg || RegNo))
- return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size);
- return X86Operand::CreateMem(getPointerWidth(), RegNo, Disp,
- BaseReg, IndexReg, Scale, Start, End, Size);
+ if (Parser.isParsingMasm() && is64BitMode()) {
+ Operands.push_back(X86Operand::CreateMem(getPointerWidth(), RegNo, Disp,
+ BaseReg, IndexReg, Scale, Start,
+ End, Size,
+ /*DefaultBaseReg=*/X86::RIP));
+ return false;
+ }
+
+ if ((BaseReg || IndexReg || RegNo))
+ Operands.push_back(X86Operand::CreateMem(getPointerWidth(), RegNo, Disp,
+ BaseReg, IndexReg, Scale, Start,
+ End, Size));
+ else
+ Operands.push_back(
+ X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size));
+ return false;
}
-std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() {
+bool X86AsmParser::ParseATTOperand(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
switch (getLexer().getKind()) {
case AsmToken::Dollar: {
@@ -2214,12 +2629,13 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() {
"expected immediate expression") ||
getParser().parseExpression(Val, End) ||
check(isa<X86MCExpr>(Val), L, "expected immediate expression"))
- return nullptr;
- return X86Operand::CreateImm(Val, Start, End);
+ return true;
+ Operands.push_back(X86Operand::CreateImm(Val, Start, End));
+ return false;
}
case AsmToken::LCurly: {
SMLoc Start = Parser.getTok().getLoc();
- return ParseRoundingModeOp(Start);
+ return ParseRoundingModeOp(Start, Operands);
}
default: {
// This a memory operand or a register. We have some parsing complications
@@ -2233,7 +2649,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() {
if (getLexer().isNot(AsmToken::LParen)) {
// No '(' so this is either a displacement expression or a register.
if (Parser.parseExpression(Expr, EndLoc))
- return nullptr;
+ return true;
if (auto *RE = dyn_cast<X86MCExpr>(Expr)) {
// Segment Register. Reset Expr and copy value to register.
Expr = nullptr;
@@ -2241,21 +2657,27 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() {
// Sanity check register.
if (Reg == X86::EIZ || Reg == X86::RIZ)
- return ErrorOperand(
+ return Error(
Loc, "%eiz and %riz can only be used as index registers",
SMRange(Loc, EndLoc));
if (Reg == X86::RIP)
- return ErrorOperand(Loc, "%rip can only be used as a base register",
- SMRange(Loc, EndLoc));
+ return Error(Loc, "%rip can only be used as a base register",
+ SMRange(Loc, EndLoc));
// Return register that are not segment prefixes immediately.
- if (!Parser.parseOptionalToken(AsmToken::Colon))
- return X86Operand::CreateReg(Reg, Loc, EndLoc);
+ if (!Parser.parseOptionalToken(AsmToken::Colon)) {
+ Operands.push_back(X86Operand::CreateReg(Reg, Loc, EndLoc));
+ return false;
+ }
if (!X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(Reg))
- return ErrorOperand(Loc, "invalid segment register");
+ return Error(Loc, "invalid segment register");
+ // Accept a '*' absolute memory reference after the segment. Place it
+ // before the full memory operand.
+ if (getLexer().is(AsmToken::Star))
+ Operands.push_back(X86Operand::CreateToken("*", consumeToken()));
}
}
// This is a Memory operand.
- return ParseMemOperand(Reg, Expr, Loc, EndLoc);
+ return ParseMemOperand(Reg, Expr, Loc, EndLoc, Operands);
}
}
}
@@ -2305,8 +2727,7 @@ bool X86AsmParser::ParseZ(std::unique_ptr<X86Operand> &Z,
}
// true on failure, false otherwise
-bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands,
- const MCParsedAsmOperand &Op) {
+bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
if (getLexer().is(AsmToken::LCurly)) {
// Eat "{" and mark the current place.
@@ -2316,21 +2737,26 @@ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands,
// Parse memory broadcasting ({1to<NUM>}).
if (getLexer().getTok().getIntVal() != 1)
return TokError("Expected 1to<NUM> at this point");
- Parser.Lex(); // Eat "1" of 1to8
- if (!getLexer().is(AsmToken::Identifier) ||
- !getLexer().getTok().getIdentifier().startswith("to"))
+ StringRef Prefix = getLexer().getTok().getString();
+ Parser.Lex(); // Eat first token of 1to8
+ if (!getLexer().is(AsmToken::Identifier))
return TokError("Expected 1to<NUM> at this point");
// Recognize only reasonable suffixes.
+ SmallVector<char, 5> BroadcastVector;
+ StringRef BroadcastString = (Prefix + getLexer().getTok().getIdentifier())
+ .toStringRef(BroadcastVector);
+ if (!BroadcastString.startswith("1to"))
+ return TokError("Expected 1to<NUM> at this point");
const char *BroadcastPrimitive =
- StringSwitch<const char*>(getLexer().getTok().getIdentifier())
- .Case("to2", "{1to2}")
- .Case("to4", "{1to4}")
- .Case("to8", "{1to8}")
- .Case("to16", "{1to16}")
- .Default(nullptr);
+ StringSwitch<const char *>(BroadcastString)
+ .Case("1to2", "{1to2}")
+ .Case("1to4", "{1to4}")
+ .Case("1to8", "{1to8}")
+ .Case("1to16", "{1to16}")
+ .Default(nullptr);
if (!BroadcastPrimitive)
return TokError("Invalid memory broadcast primitive.");
- Parser.Lex(); // Eat "toN" of 1toN
+ Parser.Lex(); // Eat trailing token of 1toN
if (!getLexer().is(AsmToken::RCurly))
return TokError("Expected } at this point");
Parser.Lex(); // Eat "}"
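The change above re-assembles the {1to<N>} broadcast token from whatever pieces the lexer produced (for example "1" followed by "to8") before matching it. A plain std::string sketch of that normalization, illustrative only:

#include <string>

// Returns the canonical broadcast token, or nullptr for an invalid primitive.
static const char *matchBroadcast(const std::string &Prefix,
                                  const std::string &Suffix) {
  std::string S = Prefix + Suffix; // e.g. "1" + "to8" -> "1to8"
  if (S == "1to2")  return "{1to2}";
  if (S == "1to4")  return "{1to4}";
  if (S == "1to8")  return "{1to8}";
  if (S == "1to16") return "{1to16}";
  return nullptr;
}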
@@ -2390,10 +2816,9 @@ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands,
/// ParseMemOperand: 'seg : disp(basereg, indexreg, scale)'. The '%ds:' prefix
/// has already been parsed if present. disp may be provided as well.
-std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
- const MCExpr *&Disp,
- const SMLoc &StartLoc,
- SMLoc &EndLoc) {
+bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp,
+ SMLoc StartLoc, SMLoc EndLoc,
+ OperandVector &Operands) {
MCAsmParser &Parser = getParser();
SMLoc Loc;
// Based on the initial passed values, we may be in any of these cases, we are
@@ -2455,7 +2880,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
// Parse immediate if we're not at a mem operand yet.
if (!isAtMemOperand()) {
if (Parser.parseTokenLoc(Loc) || Parser.parseExpression(Disp, EndLoc))
- return nullptr;
+ return true;
assert(!isa<X86MCExpr>(Disp) && "Expected non-register here.");
} else {
// Disp is implicitly zero if we haven't parsed it yet.
@@ -2468,9 +2893,12 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
if (!parseOptionalToken(AsmToken::LParen)) {
if (SegReg == 0)
- return X86Operand::CreateMem(getPointerWidth(), Disp, StartLoc, EndLoc);
- return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1,
- StartLoc, EndLoc);
+ Operands.push_back(
+ X86Operand::CreateMem(getPointerWidth(), Disp, StartLoc, EndLoc));
+ else
+ Operands.push_back(X86Operand::CreateMem(getPointerWidth(), SegReg, Disp,
+ 0, 0, 1, StartLoc, EndLoc));
+ return false;
}
// If we reached here, then eat the '(' and Process
@@ -2484,14 +2912,13 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
if (getLexer().isNot(AsmToken::Comma) && getLexer().isNot(AsmToken::RParen)) {
if (Parser.parseExpression(E, EndLoc) ||
check(!isa<X86MCExpr>(E), BaseLoc, "expected register here"))
- return nullptr;
+ return true;
// Sanity check register.
BaseReg = cast<X86MCExpr>(E)->getRegNo();
if (BaseReg == X86::EIZ || BaseReg == X86::RIZ)
- return ErrorOperand(BaseLoc,
- "eiz and riz can only be used as index registers",
- SMRange(BaseLoc, EndLoc));
+ return Error(BaseLoc, "eiz and riz can only be used as index registers",
+ SMRange(BaseLoc, EndLoc));
}
if (parseOptionalToken(AsmToken::Comma)) {
@@ -2503,14 +2930,14 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
// "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this.
if (getLexer().isNot(AsmToken::RParen)) {
if (Parser.parseTokenLoc(Loc) || Parser.parseExpression(E, EndLoc))
- return nullptr;
+ return true;
if (!isa<X86MCExpr>(E)) {
// We've parsed an unexpected Scale Value instead of an index
// register. Interpret it as an absolute.
int64_t ScaleVal;
if (!E->evaluateAsAbsolute(ScaleVal, getStreamer().getAssemblerPtr()))
- return ErrorOperand(Loc, "expected absolute expression");
+ return Error(Loc, "expected absolute expression");
if (ScaleVal != 1)
Warning(Loc, "scale factor without index register is ignored");
Scale = 1;
@@ -2518,10 +2945,10 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
IndexReg = cast<X86MCExpr>(E)->getRegNo();
if (BaseReg == X86::RIP)
- return ErrorOperand(
- Loc, "%rip as base register can not have an index register");
+ return Error(Loc,
+ "%rip as base register can not have an index register");
if (IndexReg == X86::RIP)
- return ErrorOperand(Loc, "%rip is not allowed as an index register");
+ return Error(Loc, "%rip is not allowed as an index register");
if (parseOptionalToken(AsmToken::Comma)) {
// Parse the scale amount:
@@ -2532,15 +2959,14 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
int64_t ScaleVal;
if (Parser.parseTokenLoc(Loc) ||
Parser.parseAbsoluteExpression(ScaleVal))
- return ErrorOperand(Loc, "expected scale expression");
+ return Error(Loc, "expected scale expression");
Scale = (unsigned)ScaleVal;
// Validate the scale amount.
if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
Scale != 1)
- return ErrorOperand(Loc,
- "scale factor in 16-bit address must be 1");
+ return Error(Loc, "scale factor in 16-bit address must be 1");
if (checkScale(Scale, ErrMsg))
- return ErrorOperand(Loc, ErrMsg);
+ return Error(Loc, ErrMsg);
}
}
}
@@ -2549,23 +2975,30 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
// Ok, we've eaten the memory operand, verify we have a ')' and eat it too.
if (parseToken(AsmToken::RParen, "unexpected token in memory operand"))
- return nullptr;
+ return true;
// This is to support the otherwise illegal operand (%dx) found in various
// unofficial manual examples (e.g. "out[s]?[bwl]? %al, (%dx)"), which must now
// be supported. Mark such DX variants separately; fix them only in special cases.
if (BaseReg == X86::DX && IndexReg == 0 && Scale == 1 && SegReg == 0 &&
- isa<MCConstantExpr>(Disp) && cast<MCConstantExpr>(Disp)->getValue() == 0)
- return X86Operand::CreateDXReg(BaseLoc, BaseLoc);
+ isa<MCConstantExpr>(Disp) &&
+ cast<MCConstantExpr>(Disp)->getValue() == 0) {
+ Operands.push_back(X86Operand::CreateDXReg(BaseLoc, BaseLoc));
+ return false;
+ }
if (CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, is64BitMode(),
ErrMsg))
- return ErrorOperand(BaseLoc, ErrMsg);
+ return Error(BaseLoc, ErrMsg);
if (SegReg || BaseReg || IndexReg)
- return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
- IndexReg, Scale, StartLoc, EndLoc);
- return X86Operand::CreateMem(getPointerWidth(), Disp, StartLoc, EndLoc);
+ Operands.push_back(X86Operand::CreateMem(getPointerWidth(), SegReg, Disp,
+ BaseReg, IndexReg, Scale, StartLoc,
+ EndLoc));
+ else
+ Operands.push_back(
+ X86Operand::CreateMem(getPointerWidth(), Disp, StartLoc, EndLoc));
+ return false;
}
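For reference, the AT&T memory form parsed here, seg:disp(base,index,scale), denotes the effective address base + index*scale + disp within the given segment. A small worked computation with invented values:

#include <cassert>
#include <cstdint>

static uint64_t effectiveAddress(uint64_t Base, uint64_t Index, unsigned Scale,
                                 int64_t Disp) {
  return Base + Index * Scale + Disp;
}

static void exampleEffectiveAddress() {
  // 8(%rbx,%rcx,4) with %rbx = 0x1000 and %rcx = 3:
  assert(effectiveAddress(0x1000, 3, 4, 8) == 0x1014);
}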
// Parse either a standard primary expression or a register.
@@ -2582,7 +3015,7 @@ bool X86AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
Res = X86MCExpr::create(RegNo, Parser.getContext());
return false;
}
- return Parser.parsePrimaryExpr(Res, EndLoc);
+ return Parser.parsePrimaryExpr(Res, EndLoc, nullptr);
}
bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
@@ -2592,6 +3025,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// Reset the forced VEX encoding.
ForcedVEXEncoding = VEXEncoding_Default;
+ ForcedDispEncoding = DispEncoding_Default;
// Parse pseudo prefixes.
while (1) {
@@ -2604,12 +3038,18 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
return Error(Parser.getTok().getLoc(), "Expected '}'");
Parser.Lex(); // Eat curly.
- if (Prefix == "vex" || Prefix == "vex2")
+ if (Prefix == "vex")
ForcedVEXEncoding = VEXEncoding_VEX;
+ else if (Prefix == "vex2")
+ ForcedVEXEncoding = VEXEncoding_VEX2;
else if (Prefix == "vex3")
ForcedVEXEncoding = VEXEncoding_VEX3;
else if (Prefix == "evex")
ForcedVEXEncoding = VEXEncoding_EVEX;
+ else if (Prefix == "disp8")
+ ForcedDispEncoding = DispEncoding_Disp8;
+ else if (Prefix == "disp32")
+ ForcedDispEncoding = DispEncoding_Disp32;
else
return Error(NameLoc, "unknown prefix");
@@ -2626,10 +3066,36 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
}
continue;
}
+ // Parse MASM style pseudo prefixes.
+ if (isParsingMSInlineAsm()) {
+ if (Name.equals_lower("vex"))
+ ForcedVEXEncoding = VEXEncoding_VEX;
+ else if (Name.equals_lower("vex2"))
+ ForcedVEXEncoding = VEXEncoding_VEX2;
+ else if (Name.equals_lower("vex3"))
+ ForcedVEXEncoding = VEXEncoding_VEX3;
+ else if (Name.equals_lower("evex"))
+ ForcedVEXEncoding = VEXEncoding_EVEX;
+ if (ForcedVEXEncoding != VEXEncoding_Default) {
+ if (getLexer().isNot(AsmToken::Identifier))
+ return Error(Parser.getTok().getLoc(), "Expected identifier");
+        // FIXME: The mnemonic won't match correctly if it's not in lower case.
+ Name = Parser.getTok().getString();
+ NameLoc = Parser.getTok().getLoc();
+ Parser.Lex();
+ }
+ }
break;
}
+ // Support the suffix syntax for overriding displacement size as well.
+ if (Name.consume_back(".d32")) {
+ ForcedDispEncoding = DispEncoding_Disp32;
+ } else if (Name.consume_back(".d8")) {
+ ForcedDispEncoding = DispEncoding_Disp8;
+ }
+
StringRef PatchedName = Name;
// Hack to skip "short" following Jcc.
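An illustrative sketch of the encoding pseudo prefixes and displacement suffixes parsed above, assuming llvm-mc built with these changes (AT&T syntax; the mnemonics and operands are examples only):

    {vex}  vpdpbusd %xmm2, %xmm1, %xmm0   # request the VEX form (needed for AVX-VNNI)
    {vex3} vpaddd   %xmm2, %xmm1, %xmm0   # force the 3-byte VEX prefix
    {evex} vpaddd   %xmm2, %xmm1, %xmm0   # force EVEX encoding
    {disp32} movl   %eax, (%rbx)          # force a 32-bit displacement
    movl.d8 %eax, (%rbx)                  # suffix form of {disp8}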
@@ -2797,11 +3263,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// repz repnz <insn> ; GAS errors for the use of two similar prefixes
// lock addq %rax, %rbx ; Destination operand must be of memory type
// xacquire <insn> ; xacquire must be accompanied by 'lock'
- bool isPrefix = StringSwitch<bool>(Name)
- .Cases("rex64", "data32", "data16", true)
- .Cases("xacquire", "xrelease", true)
- .Cases("acquire", "release", isParsingIntelSyntax())
- .Default(false);
+ bool IsPrefix =
+ StringSwitch<bool>(Name)
+ .Cases("cs", "ds", "es", "fs", "gs", "ss", true)
+ .Cases("rex64", "data32", "data16", "addr32", "addr16", true)
+ .Cases("xacquire", "xrelease", true)
+ .Cases("acquire", "release", isParsingIntelSyntax())
+ .Default(false);
auto isLockRepeatNtPrefix = [](StringRef N) {
return StringSwitch<bool>(N)
@@ -2856,6 +3324,22 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
return Error(NameLoc, "'data32' is not supported in 64-bit mode");
// Hack to 'data16' for the table lookup.
PatchedName = "data16";
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ StringRef Next = Parser.getTok().getString();
+ getLexer().Lex();
+ // data32 effectively changes the instruction suffix.
+ // TODO Generalize.
+ if (Next == "callw")
+ Next = "calll";
+ if (Next == "ljmpw")
+ Next = "ljmpl";
+
+ Name = Next;
+ PatchedName = Name;
+ ForcedDataPrefix = X86::Mode32Bit;
+ IsPrefix = false;
+ }
}
Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc));
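A hedged sketch of the 'data32' rewriting added above, assuming 16-bit mode input to llvm-mc:

    .code16
    data32 callw target      # now matched as 'calll': 0x66 prefix plus a rel32 call
    data32 ljmpw $0x08, $pm  # now matched as 'ljmpl', e.g. for entering 32-bit code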
@@ -2871,20 +3355,18 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// prefix juxtaposed with an operation like "lock incl 4(%rax)", because we
// just want to parse the "lock" as the first instruction and the "incl" as
// the next one.
- if (getLexer().isNot(AsmToken::EndOfStatement) && !isPrefix) {
+ if (getLexer().isNot(AsmToken::EndOfStatement) && !IsPrefix) {
// Parse '*' modifier.
if (getLexer().is(AsmToken::Star))
Operands.push_back(X86Operand::CreateToken("*", consumeToken()));
// Read the operands.
while(1) {
- if (std::unique_ptr<X86Operand> Op = ParseOperand()) {
- Operands.push_back(std::move(Op));
- if (HandleAVX512Operand(Operands, *Operands.back()))
- return true;
- } else {
- return true;
- }
+ if (ParseOperand(Operands))
+ return true;
+ if (HandleAVX512Operand(Operands))
+ return true;
+
// check for comma and eat it
if (getLexer().is(AsmToken::Comma))
Parser.Lex();
@@ -2910,7 +3392,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// Consume the EndOfStatement or the prefix separator Slash
if (getLexer().is(AsmToken::EndOfStatement) ||
- (isPrefix && getLexer().is(AsmToken::Slash)))
+ (IsPrefix && getLexer().is(AsmToken::Slash)))
Parser.Lex();
else if (CurlyAsEndOfStatement)
// Add an actual EndOfStatement before the curly brace
@@ -3064,39 +3546,6 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
return HadVerifyError;
}
- // FIXME: Hack to handle recognize s{hr,ar,hl} $1, <op>. Canonicalize to
- // "shift <op>".
- if ((Name.startswith("shr") || Name.startswith("sar") ||
- Name.startswith("shl") || Name.startswith("sal") ||
- Name.startswith("rcl") || Name.startswith("rcr") ||
- Name.startswith("rol") || Name.startswith("ror")) &&
- Operands.size() == 3) {
- if (isParsingIntelSyntax()) {
- // Intel syntax
- X86Operand &Op1 = static_cast<X86Operand &>(*Operands[2]);
- if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) &&
- cast<MCConstantExpr>(Op1.getImm())->getValue() == 1)
- Operands.pop_back();
- } else {
- X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
- if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) &&
- cast<MCConstantExpr>(Op1.getImm())->getValue() == 1)
- Operands.erase(Operands.begin() + 1);
- }
- }
-
- // Transforms "int $3" into "int3" as a size optimization. We can't write an
- // instalias with an immediate operand yet.
- if (Name == "int" && Operands.size() == 2) {
- X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
- if (Op1.isImm())
- if (auto *CE = dyn_cast<MCConstantExpr>(Op1.getImm()))
- if (CE->getValue() == 3) {
- Operands.erase(Operands.begin() + 1);
- static_cast<X86Operand &>(*Operands[0]).setTokenValue("int3");
- }
- }
-
// Transforms "xlat mem8" into "xlatb"
if ((Name == "xlat" || Name == "xlatb") && Operands.size() == 2) {
X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
@@ -3118,6 +3567,26 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
switch (Inst.getOpcode()) {
default: return false;
+ case X86::JMP_1:
+ // {disp32} forces a larger displacement as if the instruction was relaxed.
+ // NOTE: 16-bit mode uses 16-bit displacement even though it says {disp32}.
+ // This matches GNU assembler.
+ if (ForcedDispEncoding == DispEncoding_Disp32) {
+ Inst.setOpcode(is16BitMode() ? X86::JMP_2 : X86::JMP_4);
+ return true;
+ }
+
+ return false;
+ case X86::JCC_1:
+ // {disp32} forces a larger displacement as if the instruction was relaxed.
+ // NOTE: 16-bit mode uses 16-bit displacement even though it says {disp32}.
+ // This matches GNU assembler.
+ if (ForcedDispEncoding == DispEncoding_Disp32) {
+ Inst.setOpcode(is16BitMode() ? X86::JCC_2 : X86::JCC_4);
+ return true;
+ }
+
+ return false;
case X86::VMOVZPQILo2PQIrr:
case X86::VMOVAPDrr:
case X86::VMOVAPDYrr:
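The {disp32} jump handling added at the top of this hunk can be exercised as follows (expected 64-bit encodings, shown as a sketch only):

    {disp32} jmp target    # JMP_1 becomes JMP_4: e9 rel32 instead of eb rel8
    {disp32} je  target    # JCC_1 becomes JCC_4: 0f 84 rel32 instead of 74 rel8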
@@ -3176,6 +3645,122 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
Inst.setOpcode(NewOpc);
return true;
}
+ case X86::RCR8ri: case X86::RCR16ri: case X86::RCR32ri: case X86::RCR64ri:
+ case X86::RCL8ri: case X86::RCL16ri: case X86::RCL32ri: case X86::RCL64ri:
+ case X86::ROR8ri: case X86::ROR16ri: case X86::ROR32ri: case X86::ROR64ri:
+ case X86::ROL8ri: case X86::ROL16ri: case X86::ROL32ri: case X86::ROL64ri:
+ case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri: case X86::SAR64ri:
+ case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri: case X86::SHR64ri:
+ case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri: case X86::SHL64ri: {
+ // Optimize s{hr,ar,hl} $1, <op> to "shift <op>". Similar for rotate.
+ // FIXME: It would be great if we could just do this with an InstAlias.
+ if (!Inst.getOperand(2).isImm() || Inst.getOperand(2).getImm() != 1)
+ return false;
+
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::RCR8ri: NewOpc = X86::RCR8r1; break;
+ case X86::RCR16ri: NewOpc = X86::RCR16r1; break;
+ case X86::RCR32ri: NewOpc = X86::RCR32r1; break;
+ case X86::RCR64ri: NewOpc = X86::RCR64r1; break;
+ case X86::RCL8ri: NewOpc = X86::RCL8r1; break;
+ case X86::RCL16ri: NewOpc = X86::RCL16r1; break;
+ case X86::RCL32ri: NewOpc = X86::RCL32r1; break;
+ case X86::RCL64ri: NewOpc = X86::RCL64r1; break;
+ case X86::ROR8ri: NewOpc = X86::ROR8r1; break;
+ case X86::ROR16ri: NewOpc = X86::ROR16r1; break;
+ case X86::ROR32ri: NewOpc = X86::ROR32r1; break;
+ case X86::ROR64ri: NewOpc = X86::ROR64r1; break;
+ case X86::ROL8ri: NewOpc = X86::ROL8r1; break;
+ case X86::ROL16ri: NewOpc = X86::ROL16r1; break;
+ case X86::ROL32ri: NewOpc = X86::ROL32r1; break;
+ case X86::ROL64ri: NewOpc = X86::ROL64r1; break;
+ case X86::SAR8ri: NewOpc = X86::SAR8r1; break;
+ case X86::SAR16ri: NewOpc = X86::SAR16r1; break;
+ case X86::SAR32ri: NewOpc = X86::SAR32r1; break;
+ case X86::SAR64ri: NewOpc = X86::SAR64r1; break;
+ case X86::SHR8ri: NewOpc = X86::SHR8r1; break;
+ case X86::SHR16ri: NewOpc = X86::SHR16r1; break;
+ case X86::SHR32ri: NewOpc = X86::SHR32r1; break;
+ case X86::SHR64ri: NewOpc = X86::SHR64r1; break;
+ case X86::SHL8ri: NewOpc = X86::SHL8r1; break;
+ case X86::SHL16ri: NewOpc = X86::SHL16r1; break;
+ case X86::SHL32ri: NewOpc = X86::SHL32r1; break;
+ case X86::SHL64ri: NewOpc = X86::SHL64r1; break;
+ }
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(NewOpc);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::RCR8mi: case X86::RCR16mi: case X86::RCR32mi: case X86::RCR64mi:
+ case X86::RCL8mi: case X86::RCL16mi: case X86::RCL32mi: case X86::RCL64mi:
+ case X86::ROR8mi: case X86::ROR16mi: case X86::ROR32mi: case X86::ROR64mi:
+ case X86::ROL8mi: case X86::ROL16mi: case X86::ROL32mi: case X86::ROL64mi:
+ case X86::SAR8mi: case X86::SAR16mi: case X86::SAR32mi: case X86::SAR64mi:
+ case X86::SHR8mi: case X86::SHR16mi: case X86::SHR32mi: case X86::SHR64mi:
+ case X86::SHL8mi: case X86::SHL16mi: case X86::SHL32mi: case X86::SHL64mi: {
+ // Optimize s{hr,ar,hl} $1, <op> to "shift <op>". Similar for rotate.
+ // FIXME: It would be great if we could just do this with an InstAlias.
+ if (!Inst.getOperand(X86::AddrNumOperands).isImm() ||
+ Inst.getOperand(X86::AddrNumOperands).getImm() != 1)
+ return false;
+
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::RCR8mi: NewOpc = X86::RCR8m1; break;
+ case X86::RCR16mi: NewOpc = X86::RCR16m1; break;
+ case X86::RCR32mi: NewOpc = X86::RCR32m1; break;
+ case X86::RCR64mi: NewOpc = X86::RCR64m1; break;
+ case X86::RCL8mi: NewOpc = X86::RCL8m1; break;
+ case X86::RCL16mi: NewOpc = X86::RCL16m1; break;
+ case X86::RCL32mi: NewOpc = X86::RCL32m1; break;
+ case X86::RCL64mi: NewOpc = X86::RCL64m1; break;
+ case X86::ROR8mi: NewOpc = X86::ROR8m1; break;
+ case X86::ROR16mi: NewOpc = X86::ROR16m1; break;
+ case X86::ROR32mi: NewOpc = X86::ROR32m1; break;
+ case X86::ROR64mi: NewOpc = X86::ROR64m1; break;
+ case X86::ROL8mi: NewOpc = X86::ROL8m1; break;
+ case X86::ROL16mi: NewOpc = X86::ROL16m1; break;
+ case X86::ROL32mi: NewOpc = X86::ROL32m1; break;
+ case X86::ROL64mi: NewOpc = X86::ROL64m1; break;
+ case X86::SAR8mi: NewOpc = X86::SAR8m1; break;
+ case X86::SAR16mi: NewOpc = X86::SAR16m1; break;
+ case X86::SAR32mi: NewOpc = X86::SAR32m1; break;
+ case X86::SAR64mi: NewOpc = X86::SAR64m1; break;
+ case X86::SHR8mi: NewOpc = X86::SHR8m1; break;
+ case X86::SHR16mi: NewOpc = X86::SHR16m1; break;
+ case X86::SHR32mi: NewOpc = X86::SHR32m1; break;
+ case X86::SHR64mi: NewOpc = X86::SHR64m1; break;
+ case X86::SHL8mi: NewOpc = X86::SHL8m1; break;
+ case X86::SHL16mi: NewOpc = X86::SHL16m1; break;
+ case X86::SHL32mi: NewOpc = X86::SHL32m1; break;
+ case X86::SHL64mi: NewOpc = X86::SHL64m1; break;
+ }
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(NewOpc);
+ for (int i = 0; i != X86::AddrNumOperands; ++i)
+ TmpInst.addOperand(Inst.getOperand(i));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::INT: {
+ // Transforms "int $3" into "int3" as a size optimization. We can't write an
+ // InstAlias with an immediate operand yet.
+ if (!Inst.getOperand(0).isImm() || Inst.getOperand(0).getImm() != 3)
+ return false;
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::INT3);
+ Inst = TmpInst;
+ return true;
+ }
}
}
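Hedged examples of the size optimizations now performed in processInstruction (expected encodings in 64-bit mode):

    shrl $1, %eax       # SHR32ri -> SHR32r1: d1 e8, no immediate byte
    rolb $1, (%rdi)     # ROL8mi  -> ROL8m1:  d0 07
    int  $3             # INT $3  -> INT3:    cc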
@@ -3275,6 +3860,33 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) {
}
}
+ const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
+ // Check that we aren't mixing AH/BH/CH/DH with REX prefix. We only need to
+ // check this with the legacy encoding, VEX/EVEX/XOP don't use REX.
+ if ((MCID.TSFlags & X86II::EncodingMask) == 0) {
+ MCPhysReg HReg = X86::NoRegister;
+ bool UsesRex = MCID.TSFlags & X86II::REX_W;
+ unsigned NumOps = Inst.getNumOperands();
+ for (unsigned i = 0; i != NumOps; ++i) {
+ const MCOperand &MO = Inst.getOperand(i);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH)
+ HReg = Reg;
+ if (X86II::isX86_64NonExtLowByteReg(Reg) ||
+ X86II::isX86_64ExtendedReg(Reg))
+ UsesRex = true;
+ }
+
+ if (UsesRex && HReg != X86::NoRegister) {
+ StringRef RegName = X86IntelInstPrinter::getRegisterName(HReg);
+ return Error(Ops[0]->getStartLoc(),
+ "can't encode '" + RegName + "' in an instruction requiring "
+ "REX prefix");
+ }
+ }
+
return false;
}
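A sketch of what the new REX/high-byte-register check rejects versus accepts (hypothetical examples):

    movzbl %ah, %ecx    # accepted: no REX prefix required
    movzbl %ah, %r8d    # rejected: can't encode 'ah' in an instruction requiring REX prefix
    addb   %ah, %sil    # rejected: %sil forces a REX prefix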
@@ -3468,10 +4080,18 @@ unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) {
return Match_Unsupported;
if ((ForcedVEXEncoding == VEXEncoding_VEX ||
+ ForcedVEXEncoding == VEXEncoding_VEX2 ||
ForcedVEXEncoding == VEXEncoding_VEX3) &&
(MCID.TSFlags & X86II::EncodingMask) != X86II::VEX)
return Match_Unsupported;
+ // These instructions are only available with {vex}, {vex2} or {vex3} prefix
+ if (MCID.TSFlags & X86II::ExplicitVEXPrefix &&
+ (ForcedVEXEncoding != VEXEncoding_VEX &&
+ ForcedVEXEncoding != VEXEncoding_VEX2 &&
+ ForcedVEXEncoding != VEXEncoding_VEX3))
+ return Match_Unsupported;
+
// These instructions match ambiguously with their VEX encoded counterparts
// and appear first in the matching table. Reject them unless we're forcing
// EVEX encoding.
@@ -3510,19 +4130,39 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
MCInst Inst;
- // If VEX3 encoding is forced, we need to pass the USE_VEX3 flag to the
- // encoder.
- if (ForcedVEXEncoding == VEXEncoding_VEX3)
+ // If VEX/EVEX encoding is forced, we need to pass the USE_* flag to the
+ // encoder and printer.
+ if (ForcedVEXEncoding == VEXEncoding_VEX)
+ Prefixes |= X86::IP_USE_VEX;
+ else if (ForcedVEXEncoding == VEXEncoding_VEX2)
+ Prefixes |= X86::IP_USE_VEX2;
+ else if (ForcedVEXEncoding == VEXEncoding_VEX3)
Prefixes |= X86::IP_USE_VEX3;
+ else if (ForcedVEXEncoding == VEXEncoding_EVEX)
+ Prefixes |= X86::IP_USE_EVEX;
+
+ // Set encoded flags for {disp8} and {disp32}.
+ if (ForcedDispEncoding == DispEncoding_Disp8)
+ Prefixes |= X86::IP_USE_DISP8;
+ else if (ForcedDispEncoding == DispEncoding_Disp32)
+ Prefixes |= X86::IP_USE_DISP32;
if (Prefixes)
Inst.setFlags(Prefixes);
+ // In 16-bit mode, if data32 is specified, temporarily switch to 32-bit mode
+ // when matching the instruction.
+ if (ForcedDataPrefix == X86::Mode32Bit)
+ SwitchMode(X86::Mode32Bit);
// First, try a direct match.
FeatureBitset MissingFeatures;
unsigned OriginalError = MatchInstruction(Operands, Inst, ErrorInfo,
MissingFeatures, MatchingInlineAsm,
isParsingIntelSyntax());
+ if (ForcedDataPrefix == X86::Mode32Bit) {
+ SwitchMode(X86::Mode16Bit);
+ ForcedDataPrefix = 0;
+ }
switch (OriginalError) {
default: llvm_unreachable("Unexpected match result!");
case Match_Success:
@@ -3631,6 +4271,15 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
unsigned NumSuccessfulMatches =
std::count(std::begin(Match), std::end(Match), Match_Success);
if (NumSuccessfulMatches == 1) {
+ if (!MatchingInlineAsm && validateInstruction(Inst, Operands))
+ return true;
+ // Some instructions need post-processing to, for example, tweak which
+ // encoding is selected. Loop on it while changes happen so the
+ // individual transformations can chain off each other.
+ if (!MatchingInlineAsm)
+ while (processInstruction(Inst, Operands))
+ ;
+
Inst.setLoc(IDLoc);
if (!MatchingInlineAsm)
emitInstruction(Inst, Operands, Out);
@@ -3744,10 +4393,22 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
MCInst Inst;
- // If VEX3 encoding is forced, we need to pass the USE_VEX3 flag to the
- // encoder.
- if (ForcedVEXEncoding == VEXEncoding_VEX3)
+ // If VEX/EVEX encoding is forced, we need to pass the USE_* flag to the
+ // encoder and printer.
+ if (ForcedVEXEncoding == VEXEncoding_VEX)
+ Prefixes |= X86::IP_USE_VEX;
+ else if (ForcedVEXEncoding == VEXEncoding_VEX2)
+ Prefixes |= X86::IP_USE_VEX2;
+ else if (ForcedVEXEncoding == VEXEncoding_VEX3)
Prefixes |= X86::IP_USE_VEX3;
+ else if (ForcedVEXEncoding == VEXEncoding_EVEX)
+ Prefixes |= X86::IP_USE_EVEX;
+
+ // Set encoded flags for {disp8} and {disp32}.
+ if (ForcedDispEncoding == DispEncoding_Disp8)
+ Prefixes |= X86::IP_USE_DISP8;
+ else if (ForcedDispEncoding == DispEncoding_Disp32)
+ Prefixes |= X86::IP_USE_DISP32;
if (Prefixes)
Inst.setFlags(Prefixes);
@@ -3942,6 +4603,8 @@ bool X86AsmParser::OmitRegisterFromClobberLists(unsigned RegNo) {
bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
MCAsmParser &Parser = getParser();
StringRef IDVal = DirectiveID.getIdentifier();
+ if (IDVal.startswith(".arch"))
+ return parseDirectiveArch();
if (IDVal.startswith(".code"))
return ParseDirectiveCode(IDVal, DirectiveID.getLoc());
else if (IDVal.startswith(".att_syntax")) {
@@ -3966,7 +4629,9 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
"a '%' prefix in .intel_syntax");
}
return false;
- } else if (IDVal == ".even")
+ } else if (IDVal == ".nops")
+ return parseDirectiveNops(DirectiveID.getLoc());
+ else if (IDVal == ".even")
return parseDirectiveEven(DirectiveID.getLoc());
else if (IDVal == ".cv_fpo_proc")
return parseDirectiveFPOProc(DirectiveID.getLoc());
@@ -3982,20 +4647,67 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
return parseDirectiveFPOEndPrologue(DirectiveID.getLoc());
else if (IDVal == ".cv_fpo_endproc")
return parseDirectiveFPOEndProc(DirectiveID.getLoc());
- else if (IDVal == ".seh_pushreg")
+ else if (IDVal == ".seh_pushreg" ||
+ (Parser.isParsingMasm() && IDVal.equals_lower(".pushreg")))
return parseDirectiveSEHPushReg(DirectiveID.getLoc());
- else if (IDVal == ".seh_setframe")
+ else if (IDVal == ".seh_setframe" ||
+ (Parser.isParsingMasm() && IDVal.equals_lower(".setframe")))
return parseDirectiveSEHSetFrame(DirectiveID.getLoc());
- else if (IDVal == ".seh_savereg")
+ else if (IDVal == ".seh_savereg" ||
+ (Parser.isParsingMasm() && IDVal.equals_lower(".savereg")))
return parseDirectiveSEHSaveReg(DirectiveID.getLoc());
- else if (IDVal == ".seh_savexmm")
+ else if (IDVal == ".seh_savexmm" ||
+ (Parser.isParsingMasm() && IDVal.equals_lower(".savexmm128")))
return parseDirectiveSEHSaveXMM(DirectiveID.getLoc());
- else if (IDVal == ".seh_pushframe")
+ else if (IDVal == ".seh_pushframe" ||
+ (Parser.isParsingMasm() && IDVal.equals_lower(".pushframe")))
return parseDirectiveSEHPushFrame(DirectiveID.getLoc());
return true;
}
+bool X86AsmParser::parseDirectiveArch() {
+ // Ignore .arch for now.
+ getParser().parseStringToEndOfStatement();
+ return false;
+}
+
+/// parseDirectiveNops
+/// ::= .nops size[, control]
+bool X86AsmParser::parseDirectiveNops(SMLoc L) {
+ int64_t NumBytes = 0, Control = 0;
+ SMLoc NumBytesLoc, ControlLoc;
+ const MCSubtargetInfo STI = getSTI();
+ NumBytesLoc = getTok().getLoc();
+ if (getParser().checkForValidSection() ||
+ getParser().parseAbsoluteExpression(NumBytes))
+ return true;
+
+ if (parseOptionalToken(AsmToken::Comma)) {
+ ControlLoc = getTok().getLoc();
+ if (getParser().parseAbsoluteExpression(Control))
+ return true;
+ }
+ if (getParser().parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.nops' directive"))
+ return true;
+
+ if (NumBytes <= 0) {
+ Error(NumBytesLoc, "'.nops' directive with non-positive size");
+ return false;
+ }
+
+ if (Control < 0) {
+ Error(ControlLoc, "'.nops' directive with negative NOP size");
+ return false;
+ }
+
+ // Emit the NOPs.
+ getParser().getStreamer().emitNops(NumBytes, Control, L);
+
+ return false;
+}
+
/// parseDirectiveEven
/// ::= .even
bool X86AsmParser::parseDirectiveEven(SMLoc L) {
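Hypothetical usage of the '.nops' directive handled above (size in bytes, optional maximum size of a single NOP, mirroring GNU as):

    .nops 16       # emit 16 bytes of NOP instructions
    .nops 16, 4    # emit 16 bytes using NOPs no longer than 4 bytes each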
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
index a7fa1eb9a5ee..05e482a6b66e 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -1662,9 +1662,9 @@ namespace X86 {
sib = 504,
sib64 = 505
};
-}
+} // namespace X86
-}
+} // namespace llvm
static bool translateInstruction(MCInst &target,
InternalInstruction &source,
@@ -1689,7 +1689,7 @@ private:
DisassemblerMode fMode;
};
-}
+} // namespace
X86GenericDisassembler::X86GenericDisassembler(
const MCSubtargetInfo &STI,
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
index 0134b4efce72..c685d7e0db81 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
@@ -16,6 +16,7 @@
#include "X86InstComments.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Casting.h"
@@ -384,6 +385,16 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
raw_ostream &O) {
+ // Do not print the exact form of the memory operand if it references a known
+ // binary object.
+ if (SymbolizeOperands && MIA) {
+ uint64_t Target;
+ if (MIA->evaluateBranch(*MI, 0, 0, Target))
+ return;
+ if (MIA->evaluateMemoryOperandAddress(*MI, 0, 0))
+ return;
+ }
+
const MCOperand &BaseReg = MI->getOperand(Op + X86::AddrBaseReg);
const MCOperand &IndexReg = MI->getOperand(Op + X86::AddrIndexReg);
const MCOperand &DispSpec = MI->getOperand(Op + X86::AddrDisp);
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
index 51ddae61d251..f7a850571260 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
@@ -36,6 +36,7 @@ public:
raw_ostream &O);
// Autogenerated by tblgen.
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &OS);
static const char *getRegisterName(unsigned RegNo);
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index bf3b6bcb5463..95012a148d83 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -109,7 +109,7 @@ cl::opt<unsigned> X86PadMaxPrefixSize(
cl::desc("Maximum number of prefixes to use for padding"));
cl::opt<bool> X86PadForAlign(
- "x86-pad-for-align", cl::init(true), cl::Hidden,
+ "x86-pad-for-align", cl::init(false), cl::Hidden,
cl::desc("Pad previous instructions to implement align directives"));
cl::opt<bool> X86PadForBranchAlign(
@@ -207,6 +207,8 @@ public:
void finishLayout(MCAssembler const &Asm, MCAsmLayout &Layout) const override;
+ unsigned getMaximumNopSize() const override;
+
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
};
} // end anonymous namespace
@@ -955,6 +957,9 @@ void X86AsmBackend::finishLayout(MCAssembler const &Asm,
if (!X86PadForAlign && !X86PadForBranchAlign)
return;
+ // The processed regions are delimited by LabeledFragments. -g may have more
+ // MCSymbols and therefore different relaxation results. X86PadForAlign is
+ // disabled by default to eliminate the -g vs non -g difference.
DenseSet<MCFragment *> LabeledFragments;
for (const MCSymbol &S : Asm.symbols())
LabeledFragments.insert(S.getFragment(false));
@@ -1067,6 +1072,21 @@ void X86AsmBackend::finishLayout(MCAssembler const &Asm,
}
}
+unsigned X86AsmBackend::getMaximumNopSize() const {
+ if (!STI.hasFeature(X86::FeatureNOPL) && !STI.hasFeature(X86::Mode64Bit))
+ return 1;
+ if (STI.getFeatureBits()[X86::FeatureFast7ByteNOP])
+ return 7;
+ if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
+ return 15;
+ if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP])
+ return 11;
+ // FIXME: handle 32-bit mode
+ // 15-bytes is the longest single NOP instruction, but 10-bytes is
+ // commonly the longest that can be efficiently decoded.
+ return 10;
+}
+
/// Write a sequence of optimal nops to the output, covering \p Count
/// bytes.
/// \return - true on success, false on failure
@@ -1094,23 +1114,7 @@ bool X86AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
"\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00",
};
- // This CPU doesn't support long nops. If needed add more.
- // FIXME: We could generated something better than plain 0x90.
- if (!STI.hasFeature(X86::FeatureNOPL) && !STI.hasFeature(X86::Mode64Bit)) {
- for (uint64_t i = 0; i < Count; ++i)
- OS << '\x90';
- return true;
- }
-
- // 15-bytes is the longest single NOP instruction, but 10-bytes is
- // commonly the longest that can be efficiently decoded.
- uint64_t MaxNopLength = 10;
- if (STI.getFeatureBits()[X86::FeatureFast7ByteNOP])
- MaxNopLength = 7;
- else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
- MaxNopLength = 15;
- else if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP])
- MaxNopLength = 11;
+ uint64_t MaxNopLength = (uint64_t)getMaximumNopSize();
// Emit as many MaxNopLength NOPs as needed, then emit a NOP of the remaining
// length.
@@ -1237,7 +1241,7 @@ namespace CU {
UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF
};
-} // end CU namespace
+} // namespace CU
class DarwinX86AsmBackend : public X86AsmBackend {
const MCRegisterInfo &MRI;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 79f07d3c7792..4db1bfc25177 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -55,13 +55,18 @@ namespace X86 {
/// The constants to describe instr prefixes if there are
enum IPREFIXES {
IP_NO_PREFIX = 0,
- IP_HAS_OP_SIZE = 1,
- IP_HAS_AD_SIZE = 2,
- IP_HAS_REPEAT_NE = 4,
- IP_HAS_REPEAT = 8,
- IP_HAS_LOCK = 16,
- IP_HAS_NOTRACK = 32,
- IP_USE_VEX3 = 64,
+ IP_HAS_OP_SIZE = 1U << 0,
+ IP_HAS_AD_SIZE = 1U << 1,
+ IP_HAS_REPEAT_NE = 1U << 2,
+ IP_HAS_REPEAT = 1U << 3,
+ IP_HAS_LOCK = 1U << 4,
+ IP_HAS_NOTRACK = 1U << 5,
+ IP_USE_VEX = 1U << 6,
+ IP_USE_VEX2 = 1U << 7,
+ IP_USE_VEX3 = 1U << 8,
+ IP_USE_EVEX = 1U << 9,
+ IP_USE_DISP8 = 1U << 10,
+ IP_USE_DISP32 = 1U << 11,
};
enum OperandType : unsigned {
@@ -947,7 +952,11 @@ namespace X86II {
// NOTRACK prefix
NoTrackShift = EVEX_RCShift + 1,
- NOTRACK = 1ULL << NoTrackShift
+ NOTRACK = 1ULL << NoTrackShift,
+
+ // Force VEX encoding
+ ExplicitVEXShift = NoTrackShift + 1,
+ ExplicitVEXPrefix = 1ULL << ExplicitVEXShift
};
/// \returns true if the instruction with given opcode is a prefix.
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index 292dd17e2f51..fa937d381613 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -94,6 +94,12 @@ static void checkIs32(MCContext &Ctx, SMLoc Loc, X86_64RelType Type) {
"32 bit reloc applied to a field with a different size");
}
+static void checkIs64(MCContext &Ctx, SMLoc Loc, X86_64RelType Type) {
+ if (Type != RT64_64)
+ Ctx.reportError(Loc,
+ "64 bit reloc applied to a field with a different size");
+}
+
static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
MCSymbolRefExpr::VariantKind Modifier,
X86_64RelType Type, bool IsPCRel,
@@ -212,6 +218,9 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
return ELF::R_X86_64_REX_GOTPCRELX;
}
llvm_unreachable("unexpected relocation type!");
+ case MCSymbolRefExpr::VK_X86_PLTOFF:
+ checkIs64(Ctx, Loc, Type);
+ return ELF::R_X86_64_PLTOFF64;
}
}
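An illustrative use of the new VK_X86_PLTOFF handling; the typical large-code-model sequence below would get an R_X86_64_PLTOFF64 relocation on the 64-bit immediate (sketch only):

    movabsq $callee@PLTOFF, %rax   # 64-bit offset of callee's PLT entry from the GOT base
    addq    %r15, %rax             # %r15 assumed to hold the GOT base in this model
    callq   *%rax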
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
index 33d70fdb1214..d8dbbbbf2779 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -295,6 +295,10 @@ void X86InstPrinterCommon::printRoundingControl(const MCInst *MI, unsigned Op,
/// \see MCInstPrinter::printInst
void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, uint64_t Address,
unsigned OpNo, raw_ostream &O) {
+ // Do not print the numeric target address when symbolizing.
+ if (SymbolizeOperands)
+ return;
+
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isImm()) {
if (PrintBranchImmAsAddress) {
@@ -342,6 +346,21 @@ void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) {
O << "\trepne\t";
else if (Flags & X86::IP_HAS_REPEAT)
O << "\trep\t";
+
+ // These all require a pseudo prefix
+ if ((Flags & X86::IP_USE_VEX) || (TSFlags & X86II::ExplicitVEXPrefix))
+ O << "\t{vex}";
+ else if (Flags & X86::IP_USE_VEX2)
+ O << "\t{vex2}";
+ else if (Flags & X86::IP_USE_VEX3)
+ O << "\t{vex3}";
+ else if (Flags & X86::IP_USE_EVEX)
+ O << "\t{evex}";
+
+ if (Flags & X86::IP_USE_DISP8)
+ O << "\t{disp8}";
+ else if (Flags & X86::IP_USE_DISP32)
+ O << "\t{disp32}";
}
void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo,
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
index d1eb4d09851d..d5b205ad9a63 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
@@ -16,6 +16,7 @@
#include "X86InstComments.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -342,6 +343,15 @@ void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
raw_ostream &O) {
+ // Do not print the exact form of the memory operand if it references a known
+ // binary object.
+ if (SymbolizeOperands && MIA) {
+ uint64_t Target;
+ if (MIA->evaluateBranch(*MI, 0, 0, Target))
+ return;
+ if (MIA->evaluateMemoryOperandAddress(*MI, 0, 0))
+ return;
+ }
const MCOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h
index 82baf611df03..aa4d0545ea46 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h
@@ -37,6 +37,7 @@ public:
raw_ostream &O);
// Autogenerated by tblgen.
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 7dea0760a831..260253a5302d 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -93,7 +93,8 @@ private:
bool emitOpcodePrefix(int MemOperand, const MCInst &MI,
const MCSubtargetInfo &STI, raw_ostream &OS) const;
- bool emitREXPrefix(int MemOperand, const MCInst &MI, raw_ostream &OS) const;
+ bool emitREXPrefix(int MemOperand, const MCInst &MI,
+ const MCSubtargetInfo &STI, raw_ostream &OS) const;
};
} // end anonymous namespace
@@ -113,33 +114,28 @@ static void emitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) {
}
}
-/// \returns true if this signed displacement fits in a 8-bit sign-extended
-/// field.
-static bool isDisp8(int Value) { return Value == (int8_t)Value; }
-
-/// \returns true if this signed displacement fits in a 8-bit compressed
-/// dispacement field.
-static bool isCDisp8(uint64_t TSFlags, int Value, int &CValue) {
- assert(((TSFlags & X86II::EncodingMask) == X86II::EVEX) &&
- "Compressed 8-bit displacement is only valid for EVEX inst.");
+/// Determine if this immediate can fit in a disp8 or a compressed disp8 for
+/// EVEX instructions. \p ImmOffset will be set to the value to pass to the
+/// ImmOffset parameter of emitImmediate.
+static bool isDispOrCDisp8(uint64_t TSFlags, int Value, int &ImmOffset) {
+ bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX;
- unsigned CD8_Scale =
+ int CD8_Scale =
(TSFlags & X86II::CD8_Scale_Mask) >> X86II::CD8_Scale_Shift;
- if (CD8_Scale == 0) {
- CValue = Value;
- return isDisp8(Value);
- }
+ if (!HasEVEX || CD8_Scale == 0)
+ return isInt<8>(Value);
+
+ assert(isPowerOf2_32(CD8_Scale) && "Unexpected CD8 scale!");
+ if (Value & (CD8_Scale - 1)) // Unaligned offset
+ return false;
- unsigned Mask = CD8_Scale - 1;
- assert((CD8_Scale & Mask) == 0 && "Invalid memory object size.");
- if (Value & Mask) // Unaligned offset
+ int CDisp8 = Value / CD8_Scale;
+ if (!isInt<8>(CDisp8))
return false;
- Value /= (int)CD8_Scale;
- bool Ret = (Value == (int8_t)Value);
- if (Ret)
- CValue = Value;
- return Ret;
+ // ImmOffset will be added to Value in emitImmediate leaving just CDisp8.
+ ImmOffset = CDisp8 - Value;
+ return true;
}
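A worked sketch of the compressed disp8 logic above for an EVEX instruction whose CD8 scale is 64 bytes (expected behaviour, not verified against this exact build):

    vaddps 64(%rax), %zmm0, %zmm1     # 64/64 = 1 fits in int8: compressed disp8
    vaddps 68(%rax), %zmm0, %zmm1     # 68 is not a multiple of 64: falls back to disp32
    vaddps 8192(%rax), %zmm0, %zmm1   # 8192/64 = 128 does not fit in int8: disp32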
/// \returns the appropriate fixup kind to use for an immediate in an
@@ -164,17 +160,18 @@ static MCFixupKind getImmFixupKind(uint64_t TSFlags) {
/// \returns true if the specified instruction has a 16-bit memory operand.
static bool is16BitMemOperand(const MCInst &MI, unsigned Op,
const MCSubtargetInfo &STI) {
- const MCOperand &BaseReg = MI.getOperand(Op + X86::AddrBaseReg);
- const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg);
- const MCOperand &Disp = MI.getOperand(Op + X86::AddrDisp);
+ const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg);
+ const MCOperand &Index = MI.getOperand(Op + X86::AddrIndexReg);
+
+ unsigned BaseReg = Base.getReg();
+ unsigned IndexReg = Index.getReg();
- if (STI.hasFeature(X86::Mode16Bit) && BaseReg.getReg() == 0 && Disp.isImm() &&
- Disp.getImm() < 0x10000)
+ if (STI.hasFeature(X86::Mode16Bit) && BaseReg == 0 && IndexReg == 0)
return true;
- if ((BaseReg.getReg() != 0 &&
- X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) ||
- (IndexReg.getReg() != 0 &&
- X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg())))
+ if ((BaseReg != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg)) ||
+ (IndexReg != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)))
return true;
return false;
}
@@ -390,7 +387,6 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
const MCOperand &Scale = MI.getOperand(Op + X86::AddrScaleAmt);
const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg);
unsigned BaseReg = Base.getReg();
- bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX;
// Handle %rip relative addressing.
if (BaseReg == X86::RIP ||
@@ -402,16 +398,33 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
emitByte(modRMByte(0, RegOpcodeField, 5), OS);
unsigned Opcode = MI.getOpcode();
- // movq loads are handled with a special relocation form which allows the
- // linker to eliminate some loads for GOT references which end up in the
- // same linkage unit.
- unsigned FixupKind = [=]() {
+ unsigned FixupKind = [&]() {
+ // Enable relaxed relocation only for a MCSymbolRefExpr. We cannot use a
+ // relaxed relocation if an offset is present (e.g. x@GOTPCREL+4).
+ if (!(Disp.isExpr() && isa<MCSymbolRefExpr>(Disp.getExpr())))
+ return X86::reloc_riprel_4byte;
+
+ // Certain loads for GOT references can be relocated against the symbol
+ // directly if the symbol ends up in the same linkage unit.
switch (Opcode) {
default:
return X86::reloc_riprel_4byte;
case X86::MOV64rm:
+ // The movq load is a subset of reloc_riprel_4byte_relax_rex. It is a
+ // special case because COFF and Mach-O don't support ELF's more
+ // flexible R_X86_64_REX_GOTPCRELX relaxation.
assert(HasREX);
return X86::reloc_riprel_4byte_movq_load;
+ case X86::ADC32rm:
+ case X86::ADD32rm:
+ case X86::AND32rm:
+ case X86::CMP32rm:
+ case X86::MOV32rm:
+ case X86::OR32rm:
+ case X86::SBB32rm:
+ case X86::SUB32rm:
+ case X86::TEST32mr:
+ case X86::XOR32rm:
case X86::CALL64m:
case X86::JMP64m:
case X86::TAILJMPm64:
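A hedged sketch of RIP-relative GOT references that now receive relax-capable fixups; the linker may rewrite them when the symbol resolves locally (via R_X86_64_GOTPCRELX / R_X86_64_REX_GOTPCRELX):

    movq  sym@GOTPCREL(%rip), %rax   # classic relaxable GOT load (may become a lea)
    movl  sym@GOTPCREL(%rip), %eax   # 32-bit GOT load (x32); now also relax-capable
    callq *sym@GOTPCREL(%rip)        # indirect call through the GOT; relax-capable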
@@ -484,7 +497,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
RMfield = (IndexReg16 & 1) | ((7 - RMfield) << 1);
}
- if (Disp.isImm() && isDisp8(Disp.getImm())) {
+ if (Disp.isImm() && isInt<8>(Disp.getImm())) {
if (Disp.getImm() == 0 && RMfield != 6) {
// There is no displacement; just the register.
emitByte(modRMByte(0, RegOpcodeField, RMfield), OS);
@@ -498,6 +511,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// This is the [REG]+disp16 case.
emitByte(modRMByte(2, RegOpcodeField, RMfield), OS);
} else {
+ assert(IndexReg.getReg() == 0 && "Unexpected index register!");
// There is no BaseReg; this is the plain [disp16] case.
emitByte(modRMByte(0, RegOpcodeField, 6), OS);
}
@@ -507,12 +521,18 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
return;
}
- // Determine whether a SIB byte is needed.
- // If no BaseReg, issue a RIP relative instruction only if the MCE can
- // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
- // 2-7) and absolute references.
+ // Check for presence of {disp8} or {disp32} pseudo prefixes.
+ bool UseDisp8 = MI.getFlags() & X86::IP_USE_DISP8;
+ bool UseDisp32 = MI.getFlags() & X86::IP_USE_DISP32;
- if ( // The SIB byte must be used if there is an index register.
+ // We only allow no displacement if no pseudo prefix is present.
+ bool AllowNoDisp = !UseDisp8 && !UseDisp32;
+ // Disp8 is allowed unless the {disp32} prefix is present.
+ bool AllowDisp8 = !UseDisp32;
+
+ // Determine whether a SIB byte is needed.
+ if (// The SIB byte must be used if there is an index register or the
+ // encoding requires a SIB byte.
!ForceSIB && IndexReg.getReg() == 0 &&
// The SIB byte must be used if the base is ESP/RSP/R12, all of which
// encode to an R/M value of 4, which indicates that a SIB byte is
@@ -528,12 +548,12 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
return;
}
- // If the base is not EBP/ESP and there is no displacement, use simple
- // indirect register encoding, this handles addresses like [EAX]. The
- // encoding for [EBP] with no displacement means [disp32] so we handle it
- // by emitting a displacement of 0 below.
+ // If the base is not EBP/ESP/R12/R13 and there is no displacement, use
+ // simple indirect register encoding, this handles addresses like [EAX].
+ // The encoding for [EBP] or [R13] with no displacement means [disp32], so we
+ // handle it by emitting a displacement of 0 later.
if (BaseRegNo != N86::EBP) {
- if (Disp.isImm() && Disp.getImm() == 0) {
+ if (Disp.isImm() && Disp.getImm() == 0 && AllowNoDisp) {
emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), OS);
return;
}
@@ -552,24 +572,22 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
}
// Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
- if (Disp.isImm()) {
- if (!HasEVEX && isDisp8(Disp.getImm())) {
- emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), OS);
- emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups);
- return;
- }
- // Try EVEX compressed 8-bit displacement first; if failed, fall back to
- // 32-bit displacement.
- int CDisp8 = 0;
- if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
+ // Including a compressed disp8 for EVEX instructions that support it.
+ // This also handles the 0 displacement for [EBP] or [R13]. We can't use
+ // disp8 if the {disp32} pseudo prefix is present.
+ if (Disp.isImm() && AllowDisp8) {
+ int ImmOffset = 0;
+ if (isDispOrCDisp8(TSFlags, Disp.getImm(), ImmOffset)) {
emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), OS);
emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups,
- CDisp8 - Disp.getImm());
+ ImmOffset);
return;
}
}
- // Otherwise, emit the most general non-SIB encoding: [REG+disp32]
+ // Otherwise, emit the most general non-SIB encoding: [REG+disp32].
+ // Displacement may be 0 for [EBP] or [R13] case if {disp32} pseudo prefix
+ // prevented using disp8 above.
emitByte(modRMByte(2, RegOpcodeField, BaseRegNo), OS);
unsigned Opcode = MI.getOpcode();
unsigned FixupKind = Opcode == X86::MOV32rm ? X86::reloc_signed_4byte_relax
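Sketch of how the {disp8}/{disp32} pseudo prefixes interact with the ModRM displacement selection above (expected 64-bit encodings):

    movl %eax, (%rbx)             # mod=00, no displacement byte
    {disp8}  movl %eax, (%rbx)    # mod=01 with an explicit 0x00 disp8
    {disp32} movl %eax, (%rbx)    # mod=10 with a 32-bit zero displacement
    movl %eax, (%rbp)             # an RBP base always needs a displacement: disp8 of 0 by default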
@@ -585,64 +603,47 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
bool ForceDisp32 = false;
bool ForceDisp8 = false;
- int CDisp8 = 0;
int ImmOffset = 0;
if (BaseReg == 0) {
// If there is no base register, we emit the special case SIB byte with
// MOD=0, BASE=5, to JUST get the index, scale, and displacement.
+ BaseRegNo = 5;
emitByte(modRMByte(0, RegOpcodeField, 4), OS);
ForceDisp32 = true;
- } else if (!Disp.isImm()) {
- // Emit the normal disp32 encoding.
- emitByte(modRMByte(2, RegOpcodeField, 4), OS);
- ForceDisp32 = true;
- } else if (Disp.getImm() == 0 &&
- // Base reg can't be anything that ends up with '5' as the base
- // reg, it is the magic [*] nomenclature that indicates no base.
+ } else if (Disp.isImm() && Disp.getImm() == 0 && AllowNoDisp &&
+ // Base reg can't be EBP/RBP/R13 as that would end up with '5' as
+ // the base field, but that is the magic [*] nomenclature that
+ // indicates no base when mod=0. For these cases we'll emit a 0
+ // displacement instead.
BaseRegNo != N86::EBP) {
// Emit no displacement ModR/M byte
emitByte(modRMByte(0, RegOpcodeField, 4), OS);
- } else if (!HasEVEX && isDisp8(Disp.getImm())) {
- // Emit the disp8 encoding.
- emitByte(modRMByte(1, RegOpcodeField, 4), OS);
- ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
- } else if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
- // Emit the disp8 encoding.
+ } else if (Disp.isImm() && AllowDisp8 &&
+ isDispOrCDisp8(TSFlags, Disp.getImm(), ImmOffset)) {
+ // Displacement fits in a byte or matches an EVEX compressed disp8, use
+ // disp8 encoding. This also handles an EBP/R13 base with 0 displacement,
+ // unless the {disp32} pseudo prefix was used.
emitByte(modRMByte(1, RegOpcodeField, 4), OS);
- ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
- ImmOffset = CDisp8 - Disp.getImm();
+ ForceDisp8 = true;
} else {
- // Emit the normal disp32 encoding.
+ // Otherwise, emit the normal disp32 encoding.
emitByte(modRMByte(2, RegOpcodeField, 4), OS);
+ ForceDisp32 = true;
}
// Calculate what the SS field value should be...
static const unsigned SSTable[] = {~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3};
unsigned SS = SSTable[Scale.getImm()];
- if (BaseReg == 0) {
- // Handle the SIB byte for the case where there is no base, see Intel
- // Manual 2A, table 2-7. The displacement has already been output.
- unsigned IndexRegNo;
- if (IndexReg.getReg())
- IndexRegNo = getX86RegNum(IndexReg);
- else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
- IndexRegNo = 4;
- emitSIBByte(SS, IndexRegNo, 5, OS);
- } else {
- unsigned IndexRegNo;
- if (IndexReg.getReg())
- IndexRegNo = getX86RegNum(IndexReg);
- else
- IndexRegNo = 4; // For example [ESP+1*<noreg>+4]
- emitSIBByte(SS, IndexRegNo, getX86RegNum(Base), OS);
- }
+ unsigned IndexRegNo = IndexReg.getReg() ? getX86RegNum(IndexReg) : 4;
+
+ emitSIBByte(SS, IndexRegNo, BaseRegNo, OS);
// Do we need to output a displacement?
if (ForceDisp8)
emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups,
ImmOffset);
- else if (ForceDisp32 || Disp.getImm() != 0)
+ else if (ForceDisp32)
emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte),
StartByte, OS, Fixups);
}
@@ -1200,6 +1201,7 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
///
/// \returns true if REX prefix is used, otherwise returns false.
bool X86MCCodeEmitter::emitREXPrefix(int MemOperand, const MCInst &MI,
+ const MCSubtargetInfo &STI,
raw_ostream &OS) const {
uint8_t REX = [&, MemOperand]() {
uint8_t REX = 0;
@@ -1220,15 +1222,28 @@ bool X86MCCodeEmitter::emitREXPrefix(int MemOperand, const MCInst &MI,
// If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
for (unsigned i = CurOp; i != NumOps; ++i) {
const MCOperand &MO = MI.getOperand(i);
- if (!MO.isReg())
- continue;
- unsigned Reg = MO.getReg();
- if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH)
- UsesHighByteReg = true;
- if (X86II::isX86_64NonExtLowByteReg(Reg))
- // FIXME: The caller of determineREXPrefix slaps this prefix onto
- // anything that returns non-zero.
- REX |= 0x40; // REX fixed encoding prefix
+ if (MO.isReg()) {
+ unsigned Reg = MO.getReg();
+ if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH ||
+ Reg == X86::DH)
+ UsesHighByteReg = true;
+ if (X86II::isX86_64NonExtLowByteReg(Reg))
+ // FIXME: The caller of determineREXPrefix slaps this prefix onto
+ // anything that returns non-zero.
+ REX |= 0x40; // REX fixed encoding prefix
+ } else if (MO.isExpr() &&
+ STI.getTargetTriple().getEnvironment() == Triple::GNUX32) {
+ // GOTTPOFF and TLSDESC relocations require a REX prefix to allow
+ // linker optimizations: even if the instructions we see may not require
+ // any prefix, they may be replaced by instructions that do. This is
+ // handled as a special case here so that it also works for hand-written
+ // assembly without the user needing to write REX, as with GNU as.
+ const auto *Ref = dyn_cast<MCSymbolRefExpr>(MO.getExpr());
+ if (Ref && (Ref->getKind() == MCSymbolRefExpr::VK_GOTTPOFF ||
+ Ref->getKind() == MCSymbolRefExpr::VK_TLSDESC)) {
+ REX |= 0x40; // REX fixed encoding prefix
+ }
+ }
}
switch (TSFlags & X86II::FormMask) {
@@ -1351,7 +1366,7 @@ bool X86MCCodeEmitter::emitOpcodePrefix(int MemOperand, const MCInst &MI,
assert((STI.hasFeature(X86::Mode64Bit) || !(TSFlags & X86II::REX_W)) &&
"REX.W requires 64bit mode.");
bool HasREX = STI.hasFeature(X86::Mode64Bit)
- ? emitREXPrefix(MemOperand, MI, OS)
+ ? emitREXPrefix(MemOperand, MI, STI, OS)
: false;
// 0x0F escape code must be emitted just before the opcode.
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 81110ba666e9..5cf8d77519d9 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -44,8 +44,10 @@ using namespace llvm;
std::string X86_MC::ParseX86Triple(const Triple &TT) {
std::string FS;
- if (TT.getArch() == Triple::x86_64)
- FS = "+64bit-mode,-32bit-mode,-16bit-mode";
+ // SSE2 should default to enabled in 64-bit mode, but can be turned off
+ // explicitly.
+ if (TT.isArch64Bit())
+ FS = "+64bit-mode,-32bit-mode,-16bit-mode,+sse2";
else if (TT.getEnvironment() != Triple::CODE16)
FS = "-64bit-mode,+32bit-mode,-16bit-mode";
else
@@ -290,11 +292,10 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT,
if (!FS.empty())
ArchFS = (Twine(ArchFS) + "," + FS).str();
- std::string CPUName = std::string(CPU);
- if (CPUName.empty())
- CPUName = "generic";
+ if (CPU.empty())
+ CPU = "generic";
- return createX86MCSubtargetInfoImpl(TT, CPUName, ArchFS);
+ return createX86MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, ArchFS);
}
static MCInstrInfo *createX86MCInstrInfo() {
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index e8c72be1d9b6..35604cd3ec0a 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -85,11 +85,11 @@ MCAsmBackend *createX86_64AsmBackend(const Target &T,
/// Implements X86-only directives for assembly emission.
MCTargetStreamer *createX86AsmTargetStreamer(MCStreamer &S,
formatted_raw_ostream &OS,
- MCInstPrinter *InstPrint,
- bool isVerboseAsm);
+ MCInstPrinter *InstPrinter,
+ bool IsVerboseAsm);
/// Implements X86-only directives for object files.
-MCTargetStreamer *createX86ObjectTargetStreamer(MCStreamer &OS,
+MCTargetStreamer *createX86ObjectTargetStreamer(MCStreamer &S,
const MCSubtargetInfo &STI);
/// Construct an X86 Windows COFF machine code streamer which will generate
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index b67a7508fe72..b98e58d653db 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -68,7 +68,7 @@ public:
FixedValue);
}
};
-}
+} // namespace
static bool isFixupKindRIPRel(unsigned Kind) {
return Kind == X86::reloc_riprel_4byte ||
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp
index 62c1c399a606..201b22d6232d 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp
@@ -568,4 +568,4 @@ void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
}
}
-} // llvm namespace
+} // namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
index 3bebcc24fd3a..c29211246123 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -26,6 +26,7 @@ public:
: MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {}
void EmitWinEHHandlerData(SMLoc Loc) override;
+ void EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) override;
void EmitWindowsUnwindTables() override;
void EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) override;
void finishImpl() override;
@@ -37,7 +38,11 @@ void X86WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) {
// We have to emit the unwind info now, because this directive
// actually switches to the .xdata section.
if (WinEH::FrameInfo *CurFrame = getCurrentWinFrameInfo())
- EHStreamer.EmitUnwindInfo(*this, CurFrame);
+ EHStreamer.EmitUnwindInfo(*this, CurFrame, /* HandlerData = */ true);
+}
+
+void X86WinCOFFStreamer::EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) {
+ EHStreamer.EmitUnwindInfo(*this, Frame, /* HandlerData = */ false);
}
void X86WinCOFFStreamer::EmitWindowsUnwindTables() {
@@ -58,7 +63,7 @@ void X86WinCOFFStreamer::finishImpl() {
MCWinCOFFStreamer::finishImpl();
}
-}
+} // namespace
MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C,
std::unique_ptr<MCAsmBackend> &&AB,
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86.h b/contrib/llvm-project/llvm/lib/Target/X86/X86.h
index 91ba4e3d091e..e17b9ba5500b 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86.h
@@ -67,9 +67,6 @@ FunctionPass *createX86OptimizeLEAs();
/// Return a pass that transforms setcc + movzx pairs into xor + setcc.
FunctionPass *createX86FixupSetCC();
-/// Return a pass that folds conditional branch jumps.
-FunctionPass *createX86CondBrFolding();
-
/// Return a pass that avoids creating store forward block issues in the hardware.
FunctionPass *createX86AvoidStoreForwardingBlocks();
@@ -79,6 +76,10 @@ FunctionPass *createX86FlagsCopyLoweringPass();
/// Return a pass that expands WinAlloca pseudo-instructions.
FunctionPass *createX86WinAllocaExpander();
+FunctionPass *createX86TileConfigPass();
+
+FunctionPass *createX86PreTileConfigPass();
+
/// Return a pass that inserts int3 at the end of the function if it ends with a
/// CALL instruction. The pass does the same for each funclet as well. This
/// ensures that the open interval of function start and end PCs contains all
@@ -154,7 +155,6 @@ void initializeX86AvoidSFBPassPass(PassRegistry &);
void initializeX86AvoidTrailingCallPassPass(PassRegistry &);
void initializeX86CallFrameOptimizationPass(PassRegistry &);
void initializeX86CmovConverterPassPass(PassRegistry &);
-void initializeX86CondBrFoldingPassPass(PassRegistry &);
void initializeX86DomainReassignmentPass(PassRegistry &);
void initializeX86ExecutionDomainFixPass(PassRegistry &);
void initializeX86ExpandPseudoPass(PassRegistry &);
@@ -166,6 +166,9 @@ void initializeX86OptimizeLEAPassPass(PassRegistry &);
void initializeX86PartialReductionPass(PassRegistry &);
void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
+void initializeX86PreTileConfigPass(PassRegistry &);
+void initializeX86TileConfigPass(PassRegistry &);
+void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);
namespace X86AS {
enum : unsigned {
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86.td b/contrib/llvm-project/llvm/lib/Target/X86/X86.td
index dc1ff72add49..c492d686c52e 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86.td
@@ -171,6 +171,9 @@ def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true",
def FeatureVNNI : SubtargetFeature<"avx512vnni", "HasVNNI", "true",
"Enable AVX-512 Vector Neural Network Instructions",
[FeatureAVX512]>;
+def FeatureAVXVNNI : SubtargetFeature<"avxvnni", "HasAVXVNNI", "true",
+ "Support AVX_VNNI encoding",
+ [FeatureAVX2]>;
def FeatureBF16 : SubtargetFeature<"avx512bf16", "HasBF16", "true",
"Support bfloat16 floating point",
[FeatureBWI]>;
@@ -234,8 +237,8 @@ def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
"Support PRFCHW instructions">;
def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true",
"Support RDSEED instruction">;
-def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true",
- "Support LAHF and SAHF instructions">;
+def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF64", "true",
+ "Support LAHF and SAHF instructions in 64-bit mode">;
def FeatureMWAITX : SubtargetFeature<"mwaitx", "HasMWAITX", "true",
"Enable MONITORX/MWAITX timer functionality">;
def FeatureCLZERO : SubtargetFeature<"clzero", "HasCLZERO", "true",
@@ -244,11 +247,6 @@ def FeatureCLDEMOTE : SubtargetFeature<"cldemote", "HasCLDEMOTE", "true",
"Enable Cache Demote">;
def FeaturePTWRITE : SubtargetFeature<"ptwrite", "HasPTWRITE", "true",
"Support ptwrite instruction">;
-// FIXME: This feature is deprecated in 10.0 and should not be used for
-// anything, but removing it would break IR files that may contain it in a
-// target-feature attribute.
-def FeatureDeprecatedMPX : SubtargetFeature<"mpx", "DeprecatedHasMPX", "false",
- "Deprecated. Support MPX instructions">;
def FeatureAMXTILE : SubtargetFeature<"amx-tile", "HasAMXTILE", "true",
"Support AMX-TILE instructions">;
def FeatureAMXINT8 : SubtargetFeature<"amx-int8", "HasAMXINT8", "true",
@@ -284,10 +282,20 @@ def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true",
"Wait and pause enhancements">;
def FeatureENQCMD : SubtargetFeature<"enqcmd", "HasENQCMD", "true",
"Has ENQCMD instructions">;
+def FeatureKL : SubtargetFeature<"kl", "HasKL", "true",
+ "Support Key Locker kl Instructions",
+ [FeatureSSE2]>;
+def FeatureWIDEKL : SubtargetFeature<"widekl", "HasWIDEKL", "true",
+ "Support Key Locker wide Instructions",
+ [FeatureKL]>;
+def FeatureHRESET : SubtargetFeature<"hreset", "HasHRESET", "true",
+ "Has hreset instruction">;
def FeatureSERIALIZE : SubtargetFeature<"serialize", "HasSERIALIZE", "true",
"Has serialize instruction">;
def FeatureTSXLDTRK : SubtargetFeature<"tsxldtrk", "HasTSXLDTRK", "true",
"Support TSXLDTRK instructions">;
+def FeatureUINTR : SubtargetFeature<"uintr", "HasUINTR", "true",
+ "Has UINTR Instructions">;
// On some processors, instructions that implicitly take two memory operands are
// slow. In practice, this means that CALL, PUSH, and POP with memory operands
// should be avoided in favor of a MOV + register CALL/PUSH/POP.
@@ -377,6 +385,12 @@ def FeatureERMSB
"ermsb", "HasERMSB", "true",
"REP MOVS/STOS are fast">;
+// Icelake and newer processors have Fast Short REP MOV.
+def FeatureFSRM
+ : SubtargetFeature<
+ "fsrm", "HasFSRM", "true",
+ "REP MOVSB of short lengths is faster">;
+
// Bulldozer and newer processors can merge CMP/TEST (but not other
// instructions) with conditional branches.
def FeatureBranchFusion
@@ -504,12 +518,6 @@ def FeatureUseGLMDivSqrtCosts
: SubtargetFeature<"use-glm-div-sqrt-costs", "UseGLMDivSqrtCosts", "true",
"Use Goldmont specific floating point div/sqrt costs">;
-// Merge branches using three-way conditional code.
-def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch",
- "ThreewayBranchProfitable", "true",
- "Merge branches to a three-way "
- "conditional branch">;
-
// Enable use of alias analysis during code generation.
def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
"Use alias analysis during codegen">;
@@ -557,59 +565,59 @@ include "X86SchedSkylakeServer.td"
//===----------------------------------------------------------------------===//
def ProcessorFeatures {
+ // x86-64 and x86-64-v[234]
+ list<SubtargetFeature> X86_64V1Features = [
+ FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, FeatureMMX, FeatureSSE2,
+ FeatureFXSR, FeatureNOPL, Feature64Bit
+ ];
+ list<SubtargetFeature> X86_64V2Features = !listconcat(
+ X86_64V1Features,
+ [FeatureCMPXCHG16B, FeatureLAHFSAHF, FeaturePOPCNT, FeatureSSE42]);
+ list<SubtargetFeature> X86_64V3Features = !listconcat(X86_64V2Features, [
+ FeatureAVX2, FeatureBMI, FeatureBMI2, FeatureF16C, FeatureFMA, FeatureLZCNT,
+ FeatureMOVBE, FeatureXSAVE
+ ]);
+ list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [
+ FeatureBWI,
+ FeatureCDI,
+ FeatureDQI,
+ FeatureVLX,
+ ]);
+
// Nehalem
- list<SubtargetFeature> NHMInheritableFeatures = [FeatureX87,
- FeatureCMPXCHG8B,
- FeatureCMOV,
- FeatureMMX,
- FeatureSSE42,
- FeatureFXSR,
- FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeaturePOPCNT,
- FeatureLAHFSAHF,
- FeatureMacroFusion,
- FeatureInsertVZEROUPPER];
- list<SubtargetFeature> NHMSpecificFeatures = [];
- list<SubtargetFeature> NHMFeatures =
- !listconcat(NHMInheritableFeatures, NHMSpecificFeatures);
+ list<SubtargetFeature> NHMFeatures = X86_64V2Features;
+ list<SubtargetFeature> NHMTuning = [FeatureMacroFusion,
+ FeatureInsertVZEROUPPER];
// Westmere
list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
- list<SubtargetFeature> WSMSpecificFeatures = [];
- list<SubtargetFeature> WSMInheritableFeatures =
- !listconcat(NHMInheritableFeatures, WSMAdditionalFeatures);
+ list<SubtargetFeature> WSMTuning = NHMTuning;
list<SubtargetFeature> WSMFeatures =
- !listconcat(WSMInheritableFeatures, WSMSpecificFeatures);
+ !listconcat(NHMFeatures, WSMAdditionalFeatures);
// Sandybridge
list<SubtargetFeature> SNBAdditionalFeatures = [FeatureAVX,
- FeatureSlowDivide64,
FeatureXSAVE,
- FeatureXSAVEOPT,
- FeatureSlow3OpsLEA,
- FeatureFastScalarFSQRT,
- FeatureFastSHLDRotate,
- FeatureMergeToThreeWayBranch,
- FeatureFast15ByteNOP];
- list<SubtargetFeature> SNBSpecificFeatures = [FeatureSlowUAMem32,
- FeaturePOPCNTFalseDeps];
- list<SubtargetFeature> SNBInheritableFeatures =
- !listconcat(WSMInheritableFeatures, SNBAdditionalFeatures);
+ FeatureXSAVEOPT];
+ list<SubtargetFeature> SNBTuning = [FeatureMacroFusion,
+ FeatureSlow3OpsLEA,
+ FeatureSlowDivide64,
+ FeatureSlowUAMem32,
+ FeatureFastScalarFSQRT,
+ FeatureFastSHLDRotate,
+ FeatureFast15ByteNOP,
+ FeaturePOPCNTFalseDeps,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> SNBFeatures =
- !listconcat(SNBInheritableFeatures, SNBSpecificFeatures);
+ !listconcat(WSMFeatures, SNBAdditionalFeatures);
// Ivybridge
list<SubtargetFeature> IVBAdditionalFeatures = [FeatureRDRAND,
FeatureF16C,
FeatureFSGSBase];
- list<SubtargetFeature> IVBSpecificFeatures = [FeatureSlowUAMem32,
- FeaturePOPCNTFalseDeps];
- list<SubtargetFeature> IVBInheritableFeatures =
- !listconcat(SNBInheritableFeatures, IVBAdditionalFeatures);
+ list<SubtargetFeature> IVBTuning = SNBTuning;
list<SubtargetFeature> IVBFeatures =
- !listconcat(IVBInheritableFeatures, IVBSpecificFeatures);
+ !listconcat(SNBFeatures, IVBAdditionalFeatures);
// Haswell
list<SubtargetFeature> HSWAdditionalFeatures = [FeatureAVX2,
@@ -619,77 +627,89 @@ def ProcessorFeatures {
FeatureFMA,
FeatureINVPCID,
FeatureLZCNT,
- FeatureMOVBE,
- FeatureFastVariableShuffle];
- list<SubtargetFeature> HSWSpecificFeatures = [FeaturePOPCNTFalseDeps,
- FeatureLZCNTFalseDeps];
- list<SubtargetFeature> HSWInheritableFeatures =
- !listconcat(IVBInheritableFeatures, HSWAdditionalFeatures);
+ FeatureMOVBE];
+ list<SubtargetFeature> HSWTuning = [FeatureMacroFusion,
+ FeatureSlow3OpsLEA,
+ FeatureSlowDivide64,
+ FeatureFastScalarFSQRT,
+ FeatureFastSHLDRotate,
+ FeatureFast15ByteNOP,
+ FeatureFastVariableShuffle,
+ FeaturePOPCNTFalseDeps,
+ FeatureLZCNTFalseDeps,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> HSWFeatures =
- !listconcat(HSWInheritableFeatures, HSWSpecificFeatures);
+ !listconcat(IVBFeatures, HSWAdditionalFeatures);
// Broadwell
list<SubtargetFeature> BDWAdditionalFeatures = [FeatureADX,
FeatureRDSEED,
FeaturePRFCHW];
- list<SubtargetFeature> BDWSpecificFeatures = [FeaturePOPCNTFalseDeps,
- FeatureLZCNTFalseDeps];
- list<SubtargetFeature> BDWInheritableFeatures =
- !listconcat(HSWInheritableFeatures, BDWAdditionalFeatures);
+ list<SubtargetFeature> BDWTuning = HSWTuning;
list<SubtargetFeature> BDWFeatures =
- !listconcat(BDWInheritableFeatures, BDWSpecificFeatures);
+ !listconcat(HSWFeatures, BDWAdditionalFeatures);
// Skylake
list<SubtargetFeature> SKLAdditionalFeatures = [FeatureAES,
FeatureXSAVEC,
FeatureXSAVES,
FeatureCLFLUSHOPT,
- FeatureFastVectorFSQRT];
- list<SubtargetFeature> SKLSpecificFeatures = [FeatureHasFastGather,
- FeaturePOPCNTFalseDeps,
- FeatureSGX];
- list<SubtargetFeature> SKLInheritableFeatures =
- !listconcat(BDWInheritableFeatures, SKLAdditionalFeatures);
+ FeatureSGX];
+ list<SubtargetFeature> SKLTuning = [FeatureHasFastGather,
+ FeatureMacroFusion,
+ FeatureSlow3OpsLEA,
+ FeatureSlowDivide64,
+ FeatureFastScalarFSQRT,
+ FeatureFastVectorFSQRT,
+ FeatureFastSHLDRotate,
+ FeatureFast15ByteNOP,
+ FeatureFastVariableShuffle,
+ FeaturePOPCNTFalseDeps,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> SKLFeatures =
- !listconcat(SKLInheritableFeatures, SKLSpecificFeatures);
+ !listconcat(BDWFeatures, SKLAdditionalFeatures);
// Skylake-AVX512
- list<SubtargetFeature> SKXAdditionalFeatures = [FeatureAVX512,
- FeaturePrefer256Bit,
+ list<SubtargetFeature> SKXAdditionalFeatures = [FeatureAES,
+ FeatureXSAVEC,
+ FeatureXSAVES,
+ FeatureCLFLUSHOPT,
+ FeatureAVX512,
FeatureCDI,
FeatureDQI,
FeatureBWI,
FeatureVLX,
FeaturePKU,
FeatureCLWB];
- list<SubtargetFeature> SKXSpecificFeatures = [FeatureHasFastGather,
- FeaturePOPCNTFalseDeps];
- list<SubtargetFeature> SKXInheritableFeatures =
- !listconcat(SKLInheritableFeatures, SKXAdditionalFeatures);
+ list<SubtargetFeature> SKXTuning = [FeatureHasFastGather,
+ FeatureMacroFusion,
+ FeatureSlow3OpsLEA,
+ FeatureSlowDivide64,
+ FeatureFastScalarFSQRT,
+ FeatureFastVectorFSQRT,
+ FeatureFastSHLDRotate,
+ FeatureFast15ByteNOP,
+ FeatureFastVariableShuffle,
+ FeaturePrefer256Bit,
+ FeaturePOPCNTFalseDeps,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> SKXFeatures =
- !listconcat(SKXInheritableFeatures, SKXSpecificFeatures);
+ !listconcat(BDWFeatures, SKXAdditionalFeatures);
// Cascadelake
list<SubtargetFeature> CLXAdditionalFeatures = [FeatureVNNI];
- list<SubtargetFeature> CLXSpecificFeatures = [FeatureHasFastGather,
- FeaturePOPCNTFalseDeps];
- list<SubtargetFeature> CLXInheritableFeatures =
- !listconcat(SKXInheritableFeatures, CLXAdditionalFeatures);
+ list<SubtargetFeature> CLXTuning = SKXTuning;
list<SubtargetFeature> CLXFeatures =
- !listconcat(CLXInheritableFeatures, CLXSpecificFeatures);
+ !listconcat(SKXFeatures, CLXAdditionalFeatures);
// Cooperlake
list<SubtargetFeature> CPXAdditionalFeatures = [FeatureBF16];
- list<SubtargetFeature> CPXSpecificFeatures = [FeatureHasFastGather,
- FeaturePOPCNTFalseDeps];
- list<SubtargetFeature> CPXInheritableFeatures =
- !listconcat(CLXInheritableFeatures, CPXAdditionalFeatures);
+ list<SubtargetFeature> CPXTuning = SKXTuning;
list<SubtargetFeature> CPXFeatures =
- !listconcat(CPXInheritableFeatures, CPXSpecificFeatures);
+ !listconcat(CLXFeatures, CPXAdditionalFeatures);
// Cannonlake
list<SubtargetFeature> CNLAdditionalFeatures = [FeatureAVX512,
- FeaturePrefer256Bit,
FeatureCDI,
FeatureDQI,
FeatureBWI,
@@ -697,13 +717,20 @@ def ProcessorFeatures {
FeaturePKU,
FeatureVBMI,
FeatureIFMA,
- FeatureSHA,
- FeatureSGX];
- list<SubtargetFeature> CNLSpecificFeatures = [FeatureHasFastGather];
- list<SubtargetFeature> CNLInheritableFeatures =
- !listconcat(SKLInheritableFeatures, CNLAdditionalFeatures);
+ FeatureSHA];
+ list<SubtargetFeature> CNLTuning = [FeatureHasFastGather,
+ FeatureMacroFusion,
+ FeatureSlow3OpsLEA,
+ FeatureSlowDivide64,
+ FeatureFastScalarFSQRT,
+ FeatureFastVectorFSQRT,
+ FeatureFastSHLDRotate,
+ FeatureFast15ByteNOP,
+ FeatureFastVariableShuffle,
+ FeaturePrefer256Bit,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> CNLFeatures =
- !listconcat(CNLInheritableFeatures, CNLSpecificFeatures);
+ !listconcat(SKLFeatures, CNLAdditionalFeatures);
// Icelake
list<SubtargetFeature> ICLAdditionalFeatures = [FeatureBITALG,
@@ -714,72 +741,99 @@ def ProcessorFeatures {
FeatureVPOPCNTDQ,
FeatureGFNI,
FeatureCLWB,
- FeatureRDPID];
- list<SubtargetFeature> ICLSpecificFeatures = [FeatureHasFastGather];
- list<SubtargetFeature> ICLInheritableFeatures =
- !listconcat(CNLInheritableFeatures, ICLAdditionalFeatures);
+ FeatureRDPID,
+ FeatureFSRM];
+ list<SubtargetFeature> ICLTuning = CNLTuning;
list<SubtargetFeature> ICLFeatures =
- !listconcat(ICLInheritableFeatures, ICLSpecificFeatures);
+ !listconcat(CNLFeatures, ICLAdditionalFeatures);
// Icelake Server
- list<SubtargetFeature> ICXSpecificFeatures = [FeaturePCONFIG,
- FeatureWBNOINVD,
- FeatureHasFastGather];
+ list<SubtargetFeature> ICXAdditionalFeatures = [FeaturePCONFIG,
+ FeatureWBNOINVD];
+ list<SubtargetFeature> ICXTuning = CNLTuning;
list<SubtargetFeature> ICXFeatures =
- !listconcat(ICLInheritableFeatures, ICXSpecificFeatures);
+ !listconcat(ICLFeatures, ICXAdditionalFeatures);
//Tigerlake
list<SubtargetFeature> TGLAdditionalFeatures = [FeatureVP2INTERSECT,
FeatureMOVDIRI,
FeatureMOVDIR64B,
FeatureSHSTK];
- list<SubtargetFeature> TGLSpecificFeatures = [FeatureHasFastGather];
- list<SubtargetFeature> TGLInheritableFeatures =
- !listconcat(TGLAdditionalFeatures ,TGLSpecificFeatures);
+ list<SubtargetFeature> TGLTuning = CNLTuning;
list<SubtargetFeature> TGLFeatures =
- !listconcat(ICLFeatures, TGLInheritableFeatures );
+ !listconcat(ICLFeatures, TGLAdditionalFeatures );
+
+ //Sapphirerapids
+ list<SubtargetFeature> SPRAdditionalFeatures = [FeatureAMXTILE,
+ FeatureAMXINT8,
+ FeatureAMXBF16,
+ FeatureBF16,
+ FeatureSERIALIZE,
+ FeatureCLDEMOTE,
+ FeatureWAITPKG,
+ FeaturePTWRITE,
+ FeatureAVXVNNI,
+ FeatureTSXLDTRK,
+ FeatureENQCMD,
+ FeatureSHSTK,
+ FeatureVP2INTERSECT,
+ FeatureMOVDIRI,
+ FeatureMOVDIR64B,
+ FeatureUINTR];
+ list<SubtargetFeature> SPRTuning = ICXTuning;
+ list<SubtargetFeature> SPRFeatures =
+ !listconcat(ICXFeatures, SPRAdditionalFeatures);
+
+ // Alderlake
+ list<SubtargetFeature> ADLAdditionalFeatures = [FeatureAVXVNNI,
+ FeatureCLDEMOTE,
+ FeatureHRESET,
+ FeaturePTWRITE,
+ FeatureSERIALIZE,
+ FeatureWAITPKG];
+ list<SubtargetFeature> ADLTuning = SKLTuning;
+ list<SubtargetFeature> ADLFeatures =
+ !listconcat(SKLFeatures, ADLAdditionalFeatures);
// Atom
- list<SubtargetFeature> AtomInheritableFeatures = [FeatureX87,
- FeatureCMPXCHG8B,
- FeatureCMOV,
- FeatureMMX,
- FeatureSSSE3,
- FeatureFXSR,
- FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeatureMOVBE,
- FeatureSlowTwoMemOps,
- FeatureLAHFSAHF,
- FeatureInsertVZEROUPPER];
- list<SubtargetFeature> AtomSpecificFeatures = [ProcIntelAtom,
- FeatureSlowUAMem16,
- FeatureLEAForSP,
- FeatureSlowDivide32,
- FeatureSlowDivide64,
- FeatureLEAUsesAG,
- FeaturePadShortFunctions];
- list<SubtargetFeature> AtomFeatures =
- !listconcat(AtomInheritableFeatures, AtomSpecificFeatures);
+ list<SubtargetFeature> AtomFeatures = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureMMX,
+ FeatureSSSE3,
+ FeatureFXSR,
+ FeatureNOPL,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeatureMOVBE,
+ FeatureLAHFSAHF];
+ list<SubtargetFeature> AtomTuning = [ProcIntelAtom,
+ FeatureSlowUAMem16,
+ FeatureLEAForSP,
+ FeatureSlowDivide32,
+ FeatureSlowDivide64,
+ FeatureSlowTwoMemOps,
+ FeatureLEAUsesAG,
+ FeaturePadShortFunctions,
+ FeatureInsertVZEROUPPER];
// Silvermont
list<SubtargetFeature> SLMAdditionalFeatures = [FeatureSSE42,
FeaturePOPCNT,
FeaturePCLMUL,
FeaturePRFCHW,
- FeatureSlowLEA,
- FeatureSlowIncDec,
FeatureRDRAND];
- list<SubtargetFeature> SLMSpecificFeatures = [ProcIntelSLM,
- FeatureSlowDivide64,
- FeatureSlowPMULLD,
- FeatureFast7ByteNOP,
- FeaturePOPCNTFalseDeps];
- list<SubtargetFeature> SLMInheritableFeatures =
- !listconcat(AtomInheritableFeatures, SLMAdditionalFeatures);
+ list<SubtargetFeature> SLMTuning = [ProcIntelSLM,
+ FeatureSlowTwoMemOps,
+ FeatureSlowLEA,
+ FeatureSlowIncDec,
+ FeatureSlowDivide64,
+ FeatureSlowPMULLD,
+ FeatureFast7ByteNOP,
+ FeaturePOPCNTFalseDeps,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> SLMFeatures =
- !listconcat(SLMInheritableFeatures, SLMSpecificFeatures);
+ !listconcat(AtomFeatures, SLMAdditionalFeatures);
// Goldmont
list<SubtargetFeature> GLMAdditionalFeatures = [FeatureAES,
@@ -791,31 +845,33 @@ def ProcessorFeatures {
FeatureXSAVES,
FeatureCLFLUSHOPT,
FeatureFSGSBase];
- list<SubtargetFeature> GLMSpecificFeatures = [FeatureUseGLMDivSqrtCosts,
- FeaturePOPCNTFalseDeps];
- list<SubtargetFeature> GLMInheritableFeatures =
- !listconcat(SLMInheritableFeatures, GLMAdditionalFeatures);
+ list<SubtargetFeature> GLMTuning = [FeatureUseGLMDivSqrtCosts,
+ FeatureSlowTwoMemOps,
+ FeatureSlowLEA,
+ FeatureSlowIncDec,
+ FeaturePOPCNTFalseDeps,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> GLMFeatures =
- !listconcat(GLMInheritableFeatures, GLMSpecificFeatures);
+ !listconcat(SLMFeatures, GLMAdditionalFeatures);
// Goldmont Plus
list<SubtargetFeature> GLPAdditionalFeatures = [FeaturePTWRITE,
FeatureRDPID,
FeatureSGX];
- list<SubtargetFeature> GLPSpecificFeatures = [FeatureUseGLMDivSqrtCosts];
- list<SubtargetFeature> GLPInheritableFeatures =
- !listconcat(GLMInheritableFeatures, GLPAdditionalFeatures);
+ list<SubtargetFeature> GLPTuning = [FeatureUseGLMDivSqrtCosts,
+ FeatureSlowTwoMemOps,
+ FeatureSlowLEA,
+ FeatureSlowIncDec,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> GLPFeatures =
- !listconcat(GLPInheritableFeatures, GLPSpecificFeatures);
+ !listconcat(GLMFeatures, GLPAdditionalFeatures);
// Tremont
list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB,
FeatureGFNI];
- list<SubtargetFeature> TRMSpecificFeatures = [FeatureUseGLMDivSqrtCosts];
- list<SubtargetFeature> TRMInheritableFeatures =
- !listconcat(GLPInheritableFeatures, TRMAdditionalFeatures);
+ list<SubtargetFeature> TRMTuning = GLPTuning;
list<SubtargetFeature> TRMFeatures =
- !listconcat(TRMInheritableFeatures, TRMSpecificFeatures);
+ !listconcat(GLPFeatures, TRMAdditionalFeatures);
// Knights Landing
list<SubtargetFeature> KNLFeatures = [FeatureX87,
@@ -827,13 +883,10 @@ def ProcessorFeatures {
Feature64Bit,
FeatureCMPXCHG16B,
FeaturePOPCNT,
- FeatureSlowDivide64,
FeaturePCLMUL,
FeatureXSAVE,
FeatureXSAVEOPT,
FeatureLAHFSAHF,
- FeatureSlow3OpsLEA,
- FeatureSlowIncDec,
FeatureAES,
FeatureRDRAND,
FeatureF16C,
@@ -850,56 +903,56 @@ def ProcessorFeatures {
FeatureBMI,
FeatureBMI2,
FeatureFMA,
- FeaturePRFCHW,
- FeaturePreferMaskRegisters,
- FeatureSlowTwoMemOps,
- FeatureHasFastGather,
- FeatureSlowPMADDWD];
+ FeaturePRFCHW];
+ list<SubtargetFeature> KNLTuning = [FeatureSlowDivide64,
+ FeatureSlow3OpsLEA,
+ FeatureSlowIncDec,
+ FeatureSlowTwoMemOps,
+ FeaturePreferMaskRegisters,
+ FeatureHasFastGather,
+ FeatureSlowPMADDWD];
// TODO Add AVX5124FMAPS/AVX5124VNNIW features
list<SubtargetFeature> KNMFeatures =
!listconcat(KNLFeatures, [FeatureVPOPCNTDQ]);
// Barcelona
- list<SubtargetFeature> BarcelonaInheritableFeatures = [FeatureX87,
- FeatureCMPXCHG8B,
- FeatureSSE4A,
- Feature3DNowA,
- FeatureFXSR,
- FeatureNOPL,
- FeatureCMPXCHG16B,
- FeaturePRFCHW,
- FeatureLZCNT,
- FeaturePOPCNT,
- FeatureSlowSHLD,
- FeatureLAHFSAHF,
- FeatureCMOV,
- Feature64Bit,
- FeatureFastScalarShiftMasks,
- FeatureInsertVZEROUPPER];
- list<SubtargetFeature> BarcelonaFeatures = BarcelonaInheritableFeatures;
+ list<SubtargetFeature> BarcelonaFeatures = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureSSE4A,
+ Feature3DNowA,
+ FeatureFXSR,
+ FeatureNOPL,
+ FeatureCMPXCHG16B,
+ FeaturePRFCHW,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureLAHFSAHF,
+ FeatureCMOV,
+ Feature64Bit];
+ list<SubtargetFeature> BarcelonaTuning = [FeatureFastScalarShiftMasks,
+ FeatureSlowSHLD,
+ FeatureInsertVZEROUPPER];
// Bobcat
- list<SubtargetFeature> BtVer1InheritableFeatures = [FeatureX87,
- FeatureCMPXCHG8B,
- FeatureCMOV,
- FeatureMMX,
- FeatureSSSE3,
- FeatureSSE4A,
- FeatureFXSR,
- FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeaturePRFCHW,
- FeatureLZCNT,
- FeaturePOPCNT,
- FeatureSlowSHLD,
- FeatureLAHFSAHF,
- FeatureFast15ByteNOP,
- FeatureFastScalarShiftMasks,
- FeatureFastVectorShiftMasks];
- list<SubtargetFeature> BtVer1SpecificFeatures = [FeatureInsertVZEROUPPER];
- list<SubtargetFeature> BtVer1Features =
- !listconcat(BtVer1InheritableFeatures, BtVer1SpecificFeatures);
+ list<SubtargetFeature> BtVer1Features = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureMMX,
+ FeatureSSSE3,
+ FeatureSSE4A,
+ FeatureFXSR,
+ FeatureNOPL,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeaturePRFCHW,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureLAHFSAHF];
+ list<SubtargetFeature> BtVer1Tuning = [FeatureFast15ByteNOP,
+ FeatureFastScalarShiftMasks,
+ FeatureFastVectorShiftMasks,
+ FeatureSlowSHLD,
+ FeatureInsertVZEROUPPER];
// Jaguar
list<SubtargetFeature> BtVer2AdditionalFeatures = [FeatureAVX,
@@ -910,38 +963,39 @@ def ProcessorFeatures {
FeatureMOVBE,
FeatureXSAVE,
FeatureXSAVEOPT];
- list<SubtargetFeature> BtVer2SpecificFeatures = [FeatureFastLZCNT,
- FeatureFastBEXTR,
- FeatureFastHorizontalOps];
- list<SubtargetFeature> BtVer2InheritableFeatures =
- !listconcat(BtVer1InheritableFeatures, BtVer2AdditionalFeatures);
+ list<SubtargetFeature> BtVer2Tuning = [FeatureFastLZCNT,
+ FeatureFastBEXTR,
+ FeatureFastHorizontalOps,
+ FeatureFast15ByteNOP,
+ FeatureFastScalarShiftMasks,
+ FeatureFastVectorShiftMasks,
+ FeatureSlowSHLD];
list<SubtargetFeature> BtVer2Features =
- !listconcat(BtVer2InheritableFeatures, BtVer2SpecificFeatures);
+ !listconcat(BtVer1Features, BtVer2AdditionalFeatures);
// Bulldozer
- list<SubtargetFeature> BdVer1InheritableFeatures = [FeatureX87,
- FeatureCMPXCHG8B,
- FeatureCMOV,
- FeatureXOP,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeatureAES,
- FeaturePRFCHW,
- FeaturePCLMUL,
- FeatureMMX,
- FeatureFXSR,
- FeatureNOPL,
- FeatureLZCNT,
- FeaturePOPCNT,
- FeatureXSAVE,
- FeatureLWP,
- FeatureSlowSHLD,
- FeatureLAHFSAHF,
- FeatureFast11ByteNOP,
- FeatureFastScalarShiftMasks,
- FeatureBranchFusion,
- FeatureInsertVZEROUPPER];
- list<SubtargetFeature> BdVer1Features = BdVer1InheritableFeatures;
+ list<SubtargetFeature> BdVer1Features = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureXOP,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeatureAES,
+ FeaturePRFCHW,
+ FeaturePCLMUL,
+ FeatureMMX,
+ FeatureFXSR,
+ FeatureNOPL,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureXSAVE,
+ FeatureLWP,
+ FeatureLAHFSAHF];
+ list<SubtargetFeature> BdVer1Tuning = [FeatureSlowSHLD,
+ FeatureFast11ByteNOP,
+ FeatureFastScalarShiftMasks,
+ FeatureBranchFusion,
+ FeatureInsertVZEROUPPER];
// PileDriver
list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
@@ -949,16 +1003,16 @@ def ProcessorFeatures {
FeatureTBM,
FeatureFMA,
FeatureFastBEXTR];
- list<SubtargetFeature> BdVer2InheritableFeatures =
- !listconcat(BdVer1InheritableFeatures, BdVer2AdditionalFeatures);
- list<SubtargetFeature> BdVer2Features = BdVer2InheritableFeatures;
+ list<SubtargetFeature> BdVer2Tuning = BdVer1Tuning;
+ list<SubtargetFeature> BdVer2Features =
+ !listconcat(BdVer1Features, BdVer2AdditionalFeatures);
// Steamroller
list<SubtargetFeature> BdVer3AdditionalFeatures = [FeatureXSAVEOPT,
FeatureFSGSBase];
- list<SubtargetFeature> BdVer3InheritableFeatures =
- !listconcat(BdVer2InheritableFeatures, BdVer3AdditionalFeatures);
- list<SubtargetFeature> BdVer3Features = BdVer3InheritableFeatures;
+ list<SubtargetFeature> BdVer3Tuning = BdVer2Tuning;
+ list<SubtargetFeature> BdVer3Features =
+ !listconcat(BdVer2Features, BdVer3AdditionalFeatures);
// Excavator
list<SubtargetFeature> BdVer4AdditionalFeatures = [FeatureAVX2,
@@ -966,9 +1020,9 @@ def ProcessorFeatures {
FeatureMOVBE,
FeatureRDRAND,
FeatureMWAITX];
- list<SubtargetFeature> BdVer4InheritableFeatures =
- !listconcat(BdVer3InheritableFeatures, BdVer4AdditionalFeatures);
- list<SubtargetFeature> BdVer4Features = BdVer4InheritableFeatures;
+ list<SubtargetFeature> BdVer4Tuning = BdVer3Tuning;
+ list<SubtargetFeature> BdVer4Features =
+ !listconcat(BdVer3Features, BdVer4AdditionalFeatures);
// AMD Zen Processors common ISAs
@@ -987,13 +1041,8 @@ def ProcessorFeatures {
FeatureFSGSBase,
FeatureFXSR,
FeatureNOPL,
- FeatureFastLZCNT,
FeatureLAHFSAHF,
FeatureLZCNT,
- FeatureFastBEXTR,
- FeatureFast15ByteNOP,
- FeatureBranchFusion,
- FeatureFastScalarShiftMasks,
FeatureMMX,
FeatureMOVBE,
FeatureMWAITX,
@@ -1004,56 +1053,85 @@ def ProcessorFeatures {
FeatureRDSEED,
FeatureSHA,
FeatureSSE4A,
- FeatureSlowSHLD,
- FeatureInsertVZEROUPPER,
FeatureX87,
FeatureXSAVE,
FeatureXSAVEC,
FeatureXSAVEOPT,
FeatureXSAVES];
+ list<SubtargetFeature> ZNTuning = [FeatureFastLZCNT,
+ FeatureFastBEXTR,
+ FeatureFast15ByteNOP,
+ FeatureBranchFusion,
+ FeatureFastScalarShiftMasks,
+ FeatureSlowSHLD,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
FeatureRDPID,
FeatureWBNOINVD];
+ list<SubtargetFeature> ZN2Tuning = ZNTuning;
list<SubtargetFeature> ZN2Features =
!listconcat(ZNFeatures, ZN2AdditionalFeatures);
+ list<SubtargetFeature> ZN3AdditionalFeatures = [FeatureFSRM,
+ FeatureINVPCID,
+ FeaturePKU,
+ FeatureVAES,
+ FeatureVPCLMULQDQ];
+ list<SubtargetFeature> ZN3Tuning = ZNTuning;
+ list<SubtargetFeature> ZN3Features =
+ !listconcat(ZN2Features, ZN3AdditionalFeatures);
}
//===----------------------------------------------------------------------===//
// X86 processors supported.
//===----------------------------------------------------------------------===//
-class Proc<string Name, list<SubtargetFeature> Features>
- : ProcessorModel<Name, GenericModel, Features>;
+class Proc<string Name, list<SubtargetFeature> Features,
+ list<SubtargetFeature> TuneFeatures>
+ : ProcessorModel<Name, GenericModel, Features, TuneFeatures>;
+
+class ProcModel<string Name, SchedMachineModel Model,
+ list<SubtargetFeature> Features,
+ list<SubtargetFeature> TuneFeatures>
+ : ProcessorModel<Name, Model, Features, TuneFeatures>;
// NOTE: CMPXCHG8B is here for legacy compatibility so that it is only disabled
// if i386/i486 is specifically requested.
-def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16,
- FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>;
-def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16,
- FeatureInsertVZEROUPPER]>;
-def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16,
- FeatureInsertVZEROUPPER]>;
-def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16,
- FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>;
-def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16,
- FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>;
-def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16,
- FeatureCMPXCHG8B, FeatureMMX,
- FeatureInsertVZEROUPPER]>;
-
-def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- FeatureCMOV, FeatureInsertVZEROUPPER]>;
-def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- FeatureCMOV, FeatureNOPL, FeatureInsertVZEROUPPER]>;
-
-def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- FeatureMMX, FeatureCMOV, FeatureFXSR,
- FeatureNOPL, FeatureInsertVZEROUPPER]>;
+// NOTE: 64Bit is here as "generic" is the default llc CPU. The X86Subtarget
+// constructor checks that any CPU used in 64-bit mode has Feature64Bit enabled.
+// It has no effect on code generation.
+def : ProcModel<"generic", SandyBridgeModel,
+ [FeatureX87, FeatureCMPXCHG8B, Feature64Bit],
+ [FeatureSlow3OpsLEA,
+ FeatureSlowDivide64,
+ FeatureSlowIncDec,
+ FeatureMacroFusion,
+ FeatureInsertVZEROUPPER]>;
+
+def : Proc<"i386", [FeatureX87],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"i486", [FeatureX87],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"i586", [FeatureX87, FeatureCMPXCHG8B],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"pentium", [FeatureX87, FeatureCMPXCHG8B],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"pentium-mmx", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+
+def : Proc<"i686", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"pentiumpro", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV,
+ FeatureNOPL],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+
+def : Proc<"pentium2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureCMOV,
+ FeatureFXSR, FeatureNOPL],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
foreach P = ["pentium3", "pentium3m"] in {
- def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,FeatureMMX,
- FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV,
- FeatureInsertVZEROUPPER]>;
+ def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureMMX,
+ FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
}
// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
@@ -1066,35 +1144,35 @@ foreach P = ["pentium3", "pentium3m"] in {
// measure to avoid performance surprises, in case clang's default cpu
// changes slightly.
-def : ProcessorModel<"pentium-m", GenericPostRAModel,
- [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL,
- FeatureCMOV, FeatureInsertVZEROUPPER]>;
+def : ProcModel<"pentium-m", GenericPostRAModel,
+ [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2,
+ FeatureFXSR, FeatureNOPL, FeatureCMOV],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
foreach P = ["pentium4", "pentium4m"] in {
- def : ProcessorModel<P, GenericPostRAModel,
- [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL,
- FeatureCMOV, FeatureInsertVZEROUPPER]>;
+ def : ProcModel<P, GenericPostRAModel,
+ [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2,
+ FeatureFXSR, FeatureNOPL, FeatureCMOV],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
}
// Intel Quark.
-def : Proc<"lakemont", [FeatureInsertVZEROUPPER]>;
+def : Proc<"lakemont", [FeatureCMPXCHG8B],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
// Intel Core Duo.
-def : ProcessorModel<"yonah", SandyBridgeModel,
- [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL,
- FeatureCMOV, FeatureInsertVZEROUPPER]>;
+def : ProcModel<"yonah", SandyBridgeModel,
+ [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3,
+ FeatureFXSR, FeatureNOPL, FeatureCMOV],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
// NetBurst.
-def : ProcessorModel<"prescott", GenericPostRAModel,
- [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL,
- FeatureCMOV, FeatureInsertVZEROUPPER]>;
-def : ProcessorModel<"nocona", GenericPostRAModel, [
+def : ProcModel<"prescott", GenericPostRAModel,
+ [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3,
+ FeatureFXSR, FeatureNOPL, FeatureCMOV],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : ProcModel<"nocona", GenericPostRAModel, [
FeatureX87,
- FeatureSlowUAMem16,
FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
@@ -1103,13 +1181,15 @@ def : ProcessorModel<"nocona", GenericPostRAModel, [
FeatureNOPL,
Feature64Bit,
FeatureCMPXCHG16B,
+],
+[
+ FeatureSlowUAMem16,
FeatureInsertVZEROUPPER
]>;
// Intel Core 2 Solo/Duo.
-def : ProcessorModel<"core2", SandyBridgeModel, [
+def : ProcModel<"core2", SandyBridgeModel, [
FeatureX87,
- FeatureSlowUAMem16,
FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
@@ -1118,13 +1198,15 @@ def : ProcessorModel<"core2", SandyBridgeModel, [
FeatureNOPL,
Feature64Bit,
FeatureCMPXCHG16B,
- FeatureLAHFSAHF,
+ FeatureLAHFSAHF
+],
+[
FeatureMacroFusion,
+ FeatureSlowUAMem16,
FeatureInsertVZEROUPPER
]>;
-def : ProcessorModel<"penryn", SandyBridgeModel, [
+def : ProcModel<"penryn", SandyBridgeModel, [
FeatureX87,
- FeatureSlowUAMem16,
FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
@@ -1133,140 +1215,171 @@ def : ProcessorModel<"penryn", SandyBridgeModel, [
FeatureNOPL,
Feature64Bit,
FeatureCMPXCHG16B,
- FeatureLAHFSAHF,
+ FeatureLAHFSAHF
+],
+[
FeatureMacroFusion,
+ FeatureSlowUAMem16,
FeatureInsertVZEROUPPER
]>;
// Atom CPUs.
foreach P = ["bonnell", "atom"] in {
- def : ProcessorModel<P, AtomModel, ProcessorFeatures.AtomFeatures>;
+ def : ProcModel<P, AtomModel, ProcessorFeatures.AtomFeatures,
+ ProcessorFeatures.AtomTuning>;
}
foreach P = ["silvermont", "slm"] in {
- def : ProcessorModel<P, SLMModel, ProcessorFeatures.SLMFeatures>;
+ def : ProcModel<P, SLMModel, ProcessorFeatures.SLMFeatures,
+ ProcessorFeatures.SLMTuning>;
}
-def : ProcessorModel<"goldmont", SLMModel, ProcessorFeatures.GLMFeatures>;
-def : ProcessorModel<"goldmont-plus", SLMModel, ProcessorFeatures.GLPFeatures>;
-def : ProcessorModel<"tremont", SLMModel, ProcessorFeatures.TRMFeatures>;
+def : ProcModel<"goldmont", SLMModel, ProcessorFeatures.GLMFeatures,
+ ProcessorFeatures.GLMTuning>;
+def : ProcModel<"goldmont-plus", SLMModel, ProcessorFeatures.GLPFeatures,
+ ProcessorFeatures.GLPTuning>;
+def : ProcModel<"tremont", SLMModel, ProcessorFeatures.TRMFeatures,
+ ProcessorFeatures.TRMTuning>;
// "Arrandale" along with corei3 and corei5
foreach P = ["nehalem", "corei7"] in {
- def : ProcessorModel<P, SandyBridgeModel, ProcessorFeatures.NHMFeatures>;
+ def : ProcModel<P, SandyBridgeModel, ProcessorFeatures.NHMFeatures,
+ ProcessorFeatures.NHMTuning>;
}
// Westmere is the corei3/i5/i7 path from nehalem to sandybridge
-def : ProcessorModel<"westmere", SandyBridgeModel,
- ProcessorFeatures.WSMFeatures>;
+def : ProcModel<"westmere", SandyBridgeModel, ProcessorFeatures.WSMFeatures,
+ ProcessorFeatures.WSMTuning>;
foreach P = ["sandybridge", "corei7-avx"] in {
- def : ProcessorModel<P, SandyBridgeModel, ProcessorFeatures.SNBFeatures>;
+ def : ProcModel<P, SandyBridgeModel, ProcessorFeatures.SNBFeatures,
+ ProcessorFeatures.SNBTuning>;
}
foreach P = ["ivybridge", "core-avx-i"] in {
- def : ProcessorModel<P, SandyBridgeModel, ProcessorFeatures.IVBFeatures>;
+ def : ProcModel<P, SandyBridgeModel, ProcessorFeatures.IVBFeatures,
+ ProcessorFeatures.IVBTuning>;
}
foreach P = ["haswell", "core-avx2"] in {
- def : ProcessorModel<P, HaswellModel, ProcessorFeatures.HSWFeatures>;
+ def : ProcModel<P, HaswellModel, ProcessorFeatures.HSWFeatures,
+ ProcessorFeatures.HSWTuning>;
}
-def : ProcessorModel<"broadwell", BroadwellModel,
- ProcessorFeatures.BDWFeatures>;
+def : ProcModel<"broadwell", BroadwellModel, ProcessorFeatures.BDWFeatures,
+ ProcessorFeatures.BDWTuning>;
-def : ProcessorModel<"skylake", SkylakeClientModel,
- ProcessorFeatures.SKLFeatures>;
+def : ProcModel<"skylake", SkylakeClientModel, ProcessorFeatures.SKLFeatures,
+ ProcessorFeatures.SKLTuning>;
// FIXME: define KNL scheduler model
-def : ProcessorModel<"knl", HaswellModel, ProcessorFeatures.KNLFeatures>;
-def : ProcessorModel<"knm", HaswellModel, ProcessorFeatures.KNMFeatures>;
+def : ProcModel<"knl", HaswellModel, ProcessorFeatures.KNLFeatures,
+ ProcessorFeatures.KNLTuning>;
+def : ProcModel<"knm", HaswellModel, ProcessorFeatures.KNMFeatures,
+ ProcessorFeatures.KNLTuning>;
foreach P = ["skylake-avx512", "skx"] in {
- def : ProcessorModel<P, SkylakeServerModel, ProcessorFeatures.SKXFeatures>;
+ def : ProcModel<P, SkylakeServerModel, ProcessorFeatures.SKXFeatures,
+ ProcessorFeatures.SKXTuning>;
}
-def : ProcessorModel<"cascadelake", SkylakeServerModel,
- ProcessorFeatures.CLXFeatures>;
-def : ProcessorModel<"cooperlake", SkylakeServerModel,
- ProcessorFeatures.CPXFeatures>;
-def : ProcessorModel<"cannonlake", SkylakeServerModel,
- ProcessorFeatures.CNLFeatures>;
-def : ProcessorModel<"icelake-client", SkylakeServerModel,
- ProcessorFeatures.ICLFeatures>;
-def : ProcessorModel<"icelake-server", SkylakeServerModel,
- ProcessorFeatures.ICXFeatures>;
-def : ProcessorModel<"tigerlake", SkylakeServerModel,
- ProcessorFeatures.TGLFeatures>;
+def : ProcModel<"cascadelake", SkylakeServerModel,
+ ProcessorFeatures.CLXFeatures, ProcessorFeatures.CLXTuning>;
+def : ProcModel<"cooperlake", SkylakeServerModel,
+ ProcessorFeatures.CPXFeatures, ProcessorFeatures.CPXTuning>;
+def : ProcModel<"cannonlake", SkylakeServerModel,
+ ProcessorFeatures.CNLFeatures, ProcessorFeatures.CNLTuning>;
+def : ProcModel<"icelake-client", SkylakeServerModel,
+ ProcessorFeatures.ICLFeatures, ProcessorFeatures.ICLTuning>;
+def : ProcModel<"icelake-server", SkylakeServerModel,
+ ProcessorFeatures.ICXFeatures, ProcessorFeatures.ICXTuning>;
+def : ProcModel<"tigerlake", SkylakeServerModel,
+ ProcessorFeatures.TGLFeatures, ProcessorFeatures.TGLTuning>;
+def : ProcModel<"sapphirerapids", SkylakeServerModel,
+ ProcessorFeatures.SPRFeatures, ProcessorFeatures.SPRTuning>;
+def : ProcModel<"alderlake", SkylakeClientModel,
+ ProcessorFeatures.ADLFeatures, ProcessorFeatures.ADLTuning>;
// AMD CPUs.
-def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- FeatureMMX, FeatureInsertVZEROUPPER]>;
-def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- Feature3DNow, FeatureInsertVZEROUPPER]>;
-def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- Feature3DNow, FeatureInsertVZEROUPPER]>;
+def : Proc<"k6", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"k6-2", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"k6-3", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
foreach P = ["athlon", "athlon-tbird"] in {
- def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV,
- Feature3DNowA, FeatureNOPL, FeatureSlowSHLD,
- FeatureInsertVZEROUPPER]>;
+ def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, Feature3DNowA,
+ FeatureNOPL],
+ [FeatureSlowSHLD, FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
}
foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in {
- def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV,
- FeatureSSE1, Feature3DNowA, FeatureFXSR, FeatureNOPL,
- FeatureSlowSHLD, FeatureInsertVZEROUPPER]>;
+ def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV,
+ FeatureSSE1, Feature3DNowA, FeatureFXSR, FeatureNOPL],
+ [FeatureSlowSHLD, FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
}
foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
- def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- FeatureSSE2, Feature3DNowA, FeatureFXSR, FeatureNOPL,
- Feature64Bit, FeatureSlowSHLD, FeatureCMOV,
- FeatureFastScalarShiftMasks, FeatureInsertVZEROUPPER]>;
+ def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE2, Feature3DNowA,
+ FeatureFXSR, FeatureNOPL, Feature64Bit, FeatureCMOV],
+ [FeatureFastScalarShiftMasks, FeatureSlowSHLD, FeatureSlowUAMem16,
+ FeatureInsertVZEROUPPER]>;
}
foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
- def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureSSE3,
- Feature3DNowA, FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B,
- FeatureSlowSHLD, FeatureCMOV, Feature64Bit,
- FeatureFastScalarShiftMasks, FeatureInsertVZEROUPPER]>;
+ def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE3, Feature3DNowA,
+ FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureCMOV,
+ Feature64Bit],
+ [FeatureFastScalarShiftMasks, FeatureSlowSHLD, FeatureSlowUAMem16,
+ FeatureInsertVZEROUPPER]>;
}
foreach P = ["amdfam10", "barcelona"] in {
- def : Proc<P, ProcessorFeatures.BarcelonaFeatures>;
+ def : Proc<P, ProcessorFeatures.BarcelonaFeatures,
+ ProcessorFeatures.BarcelonaTuning>;
}
// Bobcat
-def : Proc<"btver1", ProcessorFeatures.BtVer1Features>;
+def : Proc<"btver1", ProcessorFeatures.BtVer1Features,
+ ProcessorFeatures.BtVer1Tuning>;
// Jaguar
-def : ProcessorModel<"btver2", BtVer2Model, ProcessorFeatures.BtVer2Features>;
+def : ProcModel<"btver2", BtVer2Model, ProcessorFeatures.BtVer2Features,
+ ProcessorFeatures.BtVer2Tuning>;
// Bulldozer
-def : ProcessorModel<"bdver1", BdVer2Model, ProcessorFeatures.BdVer1Features>;
+def : ProcModel<"bdver1", BdVer2Model, ProcessorFeatures.BdVer1Features,
+ ProcessorFeatures.BdVer1Tuning>;
// Piledriver
-def : ProcessorModel<"bdver2", BdVer2Model, ProcessorFeatures.BdVer2Features>;
+def : ProcModel<"bdver2", BdVer2Model, ProcessorFeatures.BdVer2Features,
+ ProcessorFeatures.BdVer2Tuning>;
// Steamroller
-def : Proc<"bdver3", ProcessorFeatures.BdVer3Features>;
+def : Proc<"bdver3", ProcessorFeatures.BdVer3Features,
+ ProcessorFeatures.BdVer3Tuning>;
// Excavator
-def : Proc<"bdver4", ProcessorFeatures.BdVer4Features>;
-
-def : ProcessorModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures>;
-def : ProcessorModel<"znver2", Znver2Model, ProcessorFeatures.ZN2Features>;
-
-def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- Feature3DNowA, FeatureInsertVZEROUPPER]>;
-
-def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
- FeatureInsertVZEROUPPER]>;
-def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow,
- FeatureInsertVZEROUPPER]>;
-def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow,
- FeatureInsertVZEROUPPER]>;
-def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- FeatureMMX, FeatureSSE1, FeatureFXSR,
- FeatureCMOV, FeatureInsertVZEROUPPER]>;
+def : Proc<"bdver4", ProcessorFeatures.BdVer4Features,
+ ProcessorFeatures.BdVer4Tuning>;
+
+def : ProcModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures,
+ ProcessorFeatures.ZNTuning>;
+def : ProcModel<"znver2", Znver2Model, ProcessorFeatures.ZN2Features,
+ ProcessorFeatures.ZN2Tuning>;
+def : ProcModel<"znver3", Znver2Model, ProcessorFeatures.ZN3Features,
+ ProcessorFeatures.ZN3Tuning>;
+
+def : Proc<"geode", [FeatureX87, FeatureCMPXCHG8B, Feature3DNowA],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+
+def : Proc<"winchip-c6", [FeatureX87, FeatureMMX],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"winchip2", [FeatureX87, Feature3DNow],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"c3", [FeatureX87, Feature3DNow],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"c3-2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX,
+ FeatureSSE1, FeatureFXSR, FeatureCMOV],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
// We also provide a generic 64-bit specific x86 processor model which tries to
// be good for modern chips without enabling instruction set encodings past the
@@ -1278,15 +1391,8 @@ def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
// covers a huge swath of x86 processors. If there are specific scheduling
// knobs which need to be tuned differently for AMD chips, we might consider
// forming a common base for them.
-def : ProcessorModel<"x86-64", SandyBridgeModel, [
- FeatureX87,
- FeatureCMPXCHG8B,
- FeatureCMOV,
- FeatureMMX,
- FeatureSSE2,
- FeatureFXSR,
- FeatureNOPL,
- Feature64Bit,
+def : ProcModel<"x86-64", SandyBridgeModel, ProcessorFeatures.X86_64V1Features,
+[
FeatureSlow3OpsLEA,
FeatureSlowDivide64,
FeatureSlowIncDec,
@@ -1294,6 +1400,16 @@ def : ProcessorModel<"x86-64", SandyBridgeModel, [
FeatureInsertVZEROUPPER
]>;
+// x86-64 micro-architecture levels.
+def : ProcModel<"x86-64-v2", SandyBridgeModel, ProcessorFeatures.X86_64V2Features,
+ ProcessorFeatures.SNBTuning>;
+// Close to Haswell.
+def : ProcModel<"x86-64-v3", HaswellModel, ProcessorFeatures.X86_64V3Features,
+ ProcessorFeatures.HSWTuning>;
+// Close to the AVX-512 level implemented by Xeon Scalable Processors.
+def : ProcModel<"x86-64-v4", HaswellModel, ProcessorFeatures.X86_64V4Features,
+ ProcessorFeatures.SKXTuning>;
+
//===----------------------------------------------------------------------===//
// Calling Conventions
//===----------------------------------------------------------------------===//
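
The hunks above split every CPU definition into an ISA feature list and a separate tuning list, and introduce the x86-64-v2/v3/v4 micro-architecture levels (the same levels compilers expose as -march=x86-64-v2 and so on). As a rough illustration only — not part of the diff — the following standalone C++ sketch approximates the x86-64-v3 level at run time with the GCC/Clang x86 builtins; the feature names checked are an illustrative subset of X86_64V3Features, and which names a given compiler's builtin recognizes is an assumption.

// Standalone sketch: approximate runtime check for a subset of x86-64-v3.
// Requires GCC or Clang targeting x86; not backend code from this commit.
#include <cstdio>

static bool roughlyX86_64V3() {
  __builtin_cpu_init();
  return __builtin_cpu_supports("avx2") && __builtin_cpu_supports("bmi") &&
         __builtin_cpu_supports("bmi2") && __builtin_cpu_supports("fma");
}

int main() {
  std::printf("approximately x86-64-v3 capable: %s\n",
              roughlyX86_64V3() ? "yes" : "no");
  return 0;
}

The corresponding static choice is the new -march=x86-64-v3 target, which enables the full X86_64V3Features list at compile time.
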
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.cpp
index aa03217d155d..2d434bda5530 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -404,7 +404,7 @@ void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI,
O << ']';
}
-static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
+static bool printAsmMRegister(const X86AsmPrinter &P, const MachineOperand &MO,
char Mode, raw_ostream &O) {
Register Reg = MO.getReg();
bool EmitPercent = MO.getParent()->getInlineAsmDialect() == InlineAsm::AD_ATT;
@@ -446,9 +446,9 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
return false;
}
-static bool printAsmVRegister(X86AsmPrinter &P, const MachineOperand &MO,
- char Mode, raw_ostream &O) {
- unsigned Reg = MO.getReg();
+static bool printAsmVRegister(const MachineOperand &MO, char Mode,
+ raw_ostream &O) {
+ Register Reg = MO.getReg();
bool EmitPercent = MO.getParent()->getInlineAsmDialect() == InlineAsm::AD_ATT;
unsigned Index;
@@ -560,7 +560,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
case 't': // Print V8SFmode register
case 'g': // Print V16SFmode register
if (MO.isReg())
- return printAsmVRegister(*this, MO, ExtraCode[0], O);
+ return printAsmVRegister(MO, ExtraCode[0], O);
PrintOperand(MI, OpNo, O);
return false;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.h
index eb485fa2ecef..a3b74c8ee387 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -134,9 +134,9 @@ public:
}
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- const char *ExtraCode, raw_ostream &OS) override;
+ const char *ExtraCode, raw_ostream &O) override;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
- const char *ExtraCode, raw_ostream &OS) override;
+ const char *ExtraCode, raw_ostream &O) override;
bool doInitialization(Module &M) override {
SMShadowTracker.reset(0);
@@ -145,7 +145,7 @@ public:
return AsmPrinter::doInitialization(M);
}
- bool runOnMachineFunction(MachineFunction &F) override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
void emitFunctionBodyStart() override;
void emitFunctionBodyEnd() override;
};
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
index 9f1fece1b9dd..fdc65acffe3d 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -154,7 +154,7 @@ static bool isPotentialBlockedMemCpyLd(unsigned Opcode) {
return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode);
}
-static bool isPotentialBlockedMemCpyPair(int LdOpcode, int StOpcode) {
+static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) {
switch (LdOpcode) {
case X86::MOVUPSrm:
case X86::MOVAPSrm:
@@ -206,7 +206,7 @@ static bool isPotentialBlockedMemCpyPair(int LdOpcode, int StOpcode) {
}
}
-static bool isPotentialBlockingStoreInst(int Opcode, int LoadOpcode) {
+static bool isPotentialBlockingStoreInst(unsigned Opcode, unsigned LoadOpcode) {
bool PBlock = false;
PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
index caa1f7952475..fae4e688c8b4 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -105,7 +105,7 @@ private:
void adjustCallSequence(MachineFunction &MF, const CallContext &Context);
MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
- unsigned Reg);
+ Register Reg);
enum InstClassification { Convert, Skip, Exit };
@@ -202,7 +202,7 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
Align StackAlign = TFL->getStackAlign();
int64_t Advantage = 0;
- for (auto CC : CallSeqVector) {
+ for (const auto &CC : CallSeqVector) {
// Call sites where no parameters are passed on the stack
// do not affect the cost, since there needs to be no
// stack adjustment.
@@ -265,7 +265,7 @@ bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
if (!isProfitable(MF, CallSeqVector))
return false;
- for (auto CC : CallSeqVector) {
+ for (const auto &CC : CallSeqVector) {
if (CC.UsePush) {
adjustCallSequence(MF, CC);
Changed = true;
@@ -288,13 +288,13 @@ X86CallFrameOptimization::classifyInstruction(
case X86::AND16mi8:
case X86::AND32mi8:
case X86::AND64mi8: {
- MachineOperand ImmOp = MI->getOperand(X86::AddrNumOperands);
+ const MachineOperand &ImmOp = MI->getOperand(X86::AddrNumOperands);
return ImmOp.getImm() == 0 ? Convert : Exit;
}
case X86::OR16mi8:
case X86::OR32mi8:
case X86::OR64mi8: {
- MachineOperand ImmOp = MI->getOperand(X86::AddrNumOperands);
+ const MachineOperand &ImmOp = MI->getOperand(X86::AddrNumOperands);
return ImmOp.getImm() == -1 ? Convert : Exit;
}
case X86::MOV32mi:
@@ -336,7 +336,7 @@ X86CallFrameOptimization::classifyInstruction(
if (!MO.isReg())
continue;
Register Reg = MO.getReg();
- if (!Register::isPhysicalRegister(Reg))
+ if (!Reg.isPhysical())
continue;
if (RegInfo.regsOverlap(Reg, RegInfo.getStackRegister()))
return Exit;
@@ -454,7 +454,7 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
if (!MO.isReg())
continue;
Register Reg = MO.getReg();
- if (Register::isPhysicalRegister(Reg))
+ if (Reg.isPhysical())
UsedRegs.insert(Reg);
}
}
@@ -506,7 +506,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
// replace uses.
for (int Idx = (Context.ExpectedDist >> Log2SlotSize) - 1; Idx >= 0; --Idx) {
MachineBasicBlock::iterator Store = *Context.ArgStoreVector[Idx];
- MachineOperand PushOp = Store->getOperand(X86::AddrNumOperands);
+ const MachineOperand &PushOp = Store->getOperand(X86::AddrNumOperands);
MachineBasicBlock::iterator Push = nullptr;
unsigned PushOpcode;
switch (Store->getOpcode()) {
@@ -563,8 +563,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
unsigned NumOps = DefMov->getDesc().getNumOperands();
for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
Push->addOperand(DefMov->getOperand(i));
- Push->cloneMergedMemRefs(MF, {&*DefMov, &*Store});
-
+ Push->cloneMergedMemRefs(MF, {DefMov, &*Store});
DefMov->eraseFromParent();
} else {
PushOpcode = Is64Bit ? X86::PUSH64r : X86::PUSH32r;
@@ -600,7 +599,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
}
MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
- MachineBasicBlock::iterator FrameSetup, unsigned Reg) {
+ MachineBasicBlock::iterator FrameSetup, Register Reg) {
// Do an extremely restricted form of load folding.
// ISel will often create patterns like:
// movl 4(%edi), %eax
@@ -611,7 +610,7 @@ MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
// movl %eax, (%esp)
// call
// Get rid of those with prejudice.
- if (!Register::isVirtualRegister(Reg))
+ if (!Reg.isVirtual())
return nullptr;
// Make sure this is the only use of Reg.
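
Several hunks in this file (and in X86CmovConversion.cpp further down) replace raw unsigned register numbers and the static Register::isVirtualRegister / isPhysicalRegister helpers with llvm::Register values and their isVirtual() / isPhysical() member functions. A minimal sketch of the resulting idiom, assuming the LLVM development headers are on the include path:

// Sketch of the Register idiom the hunks above migrate to; assumes
// llvm/CodeGen/Register.h from this LLVM import is available.
#include "llvm/CodeGen/Register.h"

// Register converts implicitly from the raw unsigned register number, so
// existing call sites keep working; the member predicates replace the old
// static Register::isVirtualRegister(unsigned) style helpers.
static bool touchesOnlyVirtualRegs(llvm::Register A, llvm::Register B) {
  return A.isVirtual() && B.isVirtual();
}
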
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.cpp
index 319dc9470604..53f57565d56e 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.cpp
@@ -95,15 +95,14 @@ bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
namespace {
-struct OutgoingValueHandler : public CallLowering::ValueHandler {
- OutgoingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- MachineInstrBuilder &MIB, CCAssignFn *AssignFn)
- : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
+struct X86OutgoingValueHandler : public CallLowering::OutgoingValueHandler {
+ X86OutgoingValueHandler(MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI, MachineInstrBuilder &MIB,
+ CCAssignFn *AssignFn)
+ : OutgoingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
DL(MIRBuilder.getMF().getDataLayout()),
STI(MIRBuilder.getMF().getSubtarget<X86Subtarget>()) {}
- bool isIncomingArgumentHandler() const override { return false; }
-
Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
LLT p0 = LLT::pointer(0, DL.getPointerSizeInBits(0));
@@ -135,9 +134,10 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
unsigned ValSize = VA.getValVT().getSizeInBits();
unsigned LocSize = VA.getLocVT().getSizeInBits();
if (PhysRegSize > ValSize && LocSize == ValSize) {
- assert((PhysRegSize == 128 || PhysRegSize == 80) && "We expect that to be 128 bit");
- auto MIB = MIRBuilder.buildAnyExt(LLT::scalar(PhysRegSize), ValVReg);
- ExtReg = MIB.getReg(0);
+ assert((PhysRegSize == 128 || PhysRegSize == 80) &&
+ "We expect that to be 128 bit");
+ ExtReg =
+ MIRBuilder.buildAnyExt(LLT::scalar(PhysRegSize), ValVReg).getReg(0);
} else
ExtReg = extendRegister(ValVReg, VA);
@@ -149,9 +149,9 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
MachineFunction &MF = MIRBuilder.getMF();
Register ExtReg = extendRegister(ValVReg, VA);
- auto MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore,
- VA.getLocVT().getStoreSize(),
- inferAlignFromPtrInfo(MF, MPO));
+ auto *MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore,
+ VA.getLocVT().getStoreSize(),
+ inferAlignFromPtrInfo(MF, MPO));
MIRBuilder.buildStore(ExtReg, Addr, *MMO);
}
@@ -184,9 +184,9 @@ protected:
} // end anonymous namespace
-bool X86CallLowering::lowerReturn(
- MachineIRBuilder &MIRBuilder, const Value *Val,
- ArrayRef<Register> VRegs) const {
+bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+ const Value *Val, ArrayRef<Register> VRegs,
+ FunctionLoweringInfo &FLI) const {
assert(((Val && !VRegs.empty()) || (!Val && VRegs.empty())) &&
"Return value without a vreg");
auto MIB = MIRBuilder.buildInstrNoInsert(X86::RET).addImm(0);
@@ -195,7 +195,7 @@ bool X86CallLowering::lowerReturn(
MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
MachineRegisterInfo &MRI = MF.getRegInfo();
- auto &DL = MF.getDataLayout();
+ const DataLayout &DL = MF.getDataLayout();
LLVMContext &Ctx = Val->getType()->getContext();
const X86TargetLowering &TLI = *getTLI<X86TargetLowering>();
@@ -215,7 +215,7 @@ bool X86CallLowering::lowerReturn(
return false;
}
- OutgoingValueHandler Handler(MIRBuilder, MRI, MIB, RetCC_X86);
+ X86OutgoingValueHandler Handler(MIRBuilder, MRI, MIB, RetCC_X86);
if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
return false;
}
@@ -226,14 +226,12 @@ bool X86CallLowering::lowerReturn(
namespace {
-struct IncomingValueHandler : public CallLowering::ValueHandler {
- IncomingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- CCAssignFn *AssignFn)
- : ValueHandler(MIRBuilder, MRI, AssignFn),
+struct X86IncomingValueHandler : public CallLowering::IncomingValueHandler {
+ X86IncomingValueHandler(MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI, CCAssignFn *AssignFn)
+ : IncomingValueHandler(MIRBuilder, MRI, AssignFn),
DL(MIRBuilder.getMF().getDataLayout()) {}
- bool isIncomingArgumentHandler() const override { return true; }
-
Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
auto &MFI = MIRBuilder.getMF().getFrameInfo();
@@ -248,7 +246,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
MachineFunction &MF = MIRBuilder.getMF();
- auto MMO = MF.getMachineMemOperand(
+ auto *MMO = MF.getMachineMemOperand(
MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
inferAlignFromPtrInfo(MF, MPO));
MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
@@ -298,10 +296,10 @@ protected:
const DataLayout &DL;
};
-struct FormalArgHandler : public IncomingValueHandler {
+struct FormalArgHandler : public X86IncomingValueHandler {
FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
CCAssignFn *AssignFn)
- : IncomingValueHandler(MIRBuilder, MRI, AssignFn) {}
+ : X86IncomingValueHandler(MIRBuilder, MRI, AssignFn) {}
void markPhysRegUsed(unsigned PhysReg) override {
MIRBuilder.getMRI()->addLiveIn(PhysReg);
@@ -309,10 +307,10 @@ struct FormalArgHandler : public IncomingValueHandler {
}
};
-struct CallReturnHandler : public IncomingValueHandler {
+struct CallReturnHandler : public X86IncomingValueHandler {
CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
CCAssignFn *AssignFn, MachineInstrBuilder &MIB)
- : IncomingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+ : X86IncomingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
void markPhysRegUsed(unsigned PhysReg) override {
MIB.addDef(PhysReg, RegState::Implicit);
@@ -324,9 +322,10 @@ protected:
} // end anonymous namespace
-bool X86CallLowering::lowerFormalArguments(
- MachineIRBuilder &MIRBuilder, const Function &F,
- ArrayRef<ArrayRef<Register>> VRegs) const {
+bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+ const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs,
+ FunctionLoweringInfo &FLI) const {
if (F.arg_empty())
return true;
@@ -340,8 +339,7 @@ bool X86CallLowering::lowerFormalArguments(
SmallVector<ArgInfo, 8> SplitArgs;
unsigned Idx = 0;
- for (auto &Arg : F.args()) {
-
+ for (const auto &Arg : F.args()) {
// TODO: handle not simple cases.
if (Arg.hasAttribute(Attribute::ByVal) ||
Arg.hasAttribute(Attribute::InReg) ||
@@ -380,10 +378,10 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
MachineRegisterInfo &MRI = MF.getRegInfo();
- auto &DL = F.getParent()->getDataLayout();
+ const DataLayout &DL = F.getParent()->getDataLayout();
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
const TargetInstrInfo &TII = *STI.getInstrInfo();
- auto TRI = STI.getRegisterInfo();
+ const X86RegisterInfo *TRI = STI.getRegisterInfo();
// Handle only Linux C, X86_64_SysV calling conventions for now.
if (!STI.isTargetLinux() || !(Info.CallConv == CallingConv::C ||
@@ -421,7 +419,7 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
return false;
}
// Do the actual argument marshalling.
- OutgoingValueHandler Handler(MIRBuilder, MRI, MIB, CC_X86);
+ X86OutgoingValueHandler Handler(MIRBuilder, MRI, MIB, CC_X86);
if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
return false;
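
The X86CallLowering.cpp hunks above also track a GlobalISel interface change: the target handlers now derive from CallLowering::IncomingValueHandler or CallLowering::OutgoingValueHandler instead of overriding an isIncomingArgumentHandler() query. A generic illustration of that design choice (plain C++, not LLVM code):

// Illustration only: the direction of a value handler is encoded in which
// base class it derives from, rather than queried through a virtual method
// each target must override.
struct ValueHandler {
  virtual ~ValueHandler() = default;
  virtual bool isIncoming() const = 0;
};
struct IncomingValueHandler : ValueHandler {
  bool isIncoming() const override { return true; }
};
struct OutgoingValueHandler : ValueHandler {
  bool isIncoming() const override { return false; }
};

// A target-specific handler picks its direction by inheritance alone.
struct X86StyleOutgoingHandler : OutgoingValueHandler {};

int main() {
  X86StyleOutgoingHandler H;
  return H.isIncoming() ? 1 : 0;  // 0: outgoing, as selected by the base
}
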
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.h b/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.h
index b5ea7782896b..9390122d7647 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.h
@@ -29,10 +29,12 @@ public:
X86CallLowering(const X86TargetLowering &TLI);
bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
- ArrayRef<Register> VRegs) const override;
+ ArrayRef<Register> VRegs,
+ FunctionLoweringInfo &FLI) const override;
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
- ArrayRef<ArrayRef<Register>> VRegs) const override;
+ ArrayRef<ArrayRef<Register>> VRegs,
+ FunctionLoweringInfo &FLI) const override;
bool lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const override;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.cpp
index c899db60e016..c80a5d5bb332 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.cpp
@@ -330,5 +330,15 @@ static bool CC_X86_Intr(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return true;
}
+static bool CC_X86_64_Pointer(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ if (LocVT != MVT::i64) {
+ LocVT = MVT::i64;
+ LocInfo = CCValAssign::ZExt;
+ }
+ return false;
+}
+
// Provides entry points of CC_X86 and RetCC_X86.
#include "X86GenCallingConv.inc"
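
The new CC_X86_64_Pointer hook, wired up through CCIfPtr<CCCustom<"CC_X86_64_Pointer">> in X86CallingConv.td below, widens any pointer whose location type is narrower than i64, marks it zero-extended, and returns false so the ordinary i64 assignment rules still choose the register or stack slot. My reading (not stated in the diff) is that this matters when pointers are narrower than 64 bits, e.g. under the x32 ILP32 ABI. A standalone model of the behavior, with simplified enums standing in for LLVM's MVT and CCValAssign::LocInfo:

// Standalone model of the hook above; illustration, not backend code.
#include <cstdio>

enum class SimpleVT { i32, i64 };
enum class LocInfo { Full, ZExt };

static bool widenPointerLoc(SimpleVT &LocVT, LocInfo &Info) {
  if (LocVT != SimpleVT::i64) {
    LocVT = SimpleVT::i64;   // pointer travels in a full 64-bit register
    Info = LocInfo::ZExt;    // upper bits defined by zero-extension
  }
  return false;              // "not finally assigned": later rules still run
}

int main() {
  SimpleVT VT = SimpleVT::i32;  // e.g. a 32-bit pointer
  LocInfo Info = LocInfo::Full;
  widenPointerLoc(VT, Info);
  std::printf("widened to i64: %d, zero-extended: %d\n",
              VT == SimpleVT::i64, Info == LocInfo::ZExt);
  return 0;
}
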
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.td
index 802e694999b6..3735fab818ce 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.td
@@ -336,6 +336,9 @@ def RetCC_X86_64_C : CallingConv<[
// MMX vector types are always returned in XMM0.
CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>,
+ // Pointers are always returned in full 64-bit registers.
+ CCIfPtr<CCCustom<"CC_X86_64_Pointer">>,
+
CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
CCDelegateTo<RetCC_X86Common>
@@ -518,6 +521,9 @@ def CC_X86_64_C : CallingConv<[
CCIfCC<"CallingConv::Swift",
CCIfSRet<CCIfType<[i64], CCAssignToReg<[RAX]>>>>,
+ // Pointers are always passed in full 64-bit registers.
+ CCIfPtr<CCCustom<"CC_X86_64_Pointer">>,
+
// The first 6 integer arguments are passed in integer registers.
CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>,
CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
@@ -1096,7 +1102,7 @@ def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(sub CSR_64_TLS_Darwin, RBP)
// All GPRs - except r11
def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI,
- R8, R9, R10, RSP)>;
+ R8, R9, R10)>;
// All registers - except r11
def CSR_64_RT_AllRegs : CalleeSavedRegs<(add CSR_64_RT_MostRegs,
@@ -1154,17 +1160,16 @@ def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RSI, R14, R15,
def CSR_64_HHVM : CalleeSavedRegs<(add R12)>;
// The Register calling convention preserves only a few GPRs and XMM8-15
-def CSR_32_RegCall_NoSSE : CalleeSavedRegs<(add ESI, EDI, EBX, EBP, ESP)>;
+def CSR_32_RegCall_NoSSE : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>;
def CSR_32_RegCall : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE,
(sequence "XMM%u", 4, 7))>;
def CSR_Win32_CFGuard_Check_NoSSE : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE, ECX)>;
def CSR_Win32_CFGuard_Check : CalleeSavedRegs<(add CSR_32_RegCall, ECX)>;
-def CSR_Win64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, RSP,
+def CSR_Win64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP,
(sequence "R%u", 10, 15))>;
def CSR_Win64_RegCall : CalleeSavedRegs<(add CSR_Win64_RegCall_NoSSE,
(sequence "XMM%u", 8, 15))>;
-def CSR_SysV64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, RSP,
+def CSR_SysV64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP,
(sequence "R%u", 12, 15))>;
def CSR_SysV64_RegCall : CalleeSavedRegs<(add CSR_SysV64_RegCall_NoSSE,
(sequence "XMM%u", 8, 15))>;
-
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CmovConversion.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86CmovConversion.cpp
index fe5cb3ae2bf6..a2de0dc08292 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86CmovConversion.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CmovConversion.cpp
@@ -439,7 +439,7 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates(
if (!MO.isReg() || !MO.isUse())
continue;
Register Reg = MO.getReg();
- auto &RDM = RegDefMaps[Register::isVirtualRegister(Reg)];
+ auto &RDM = RegDefMaps[Reg.isVirtual()];
if (MachineInstr *DefMI = RDM.lookup(Reg)) {
OperandToDefMap[&MO] = DefMI;
DepthInfo Info = DepthMap.lookup(DefMI);
@@ -459,7 +459,7 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates(
if (!MO.isReg() || !MO.isDef())
continue;
Register Reg = MO.getReg();
- RegDefMaps[Register::isVirtualRegister(Reg)][Reg] = &MI;
+ RegDefMaps[Reg.isVirtual()][Reg] = &MI;
}
unsigned Latency = TSchedModel.computeInstrLatency(&MI);
@@ -537,7 +537,7 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates(
// This is another conservative check to avoid converting a CMOV instruction
// used with a tree-search-like algorithm, where the branch is unpredictable.
auto UIs = MRI->use_instructions(MI->defs().begin()->getReg());
- if (UIs.begin() != UIs.end() && ++UIs.begin() == UIs.end()) {
+ if (!UIs.empty() && ++UIs.begin() == UIs.end()) {
unsigned Op = UIs.begin()->getOpcode();
if (Op == X86::MOV64rm || Op == X86::MOV32rm) {
WorthOpGroup = false;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CondBrFolding.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86CondBrFolding.cpp
deleted file mode 100644
index 7ede94664bf6..000000000000
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86CondBrFolding.cpp
+++ /dev/null
@@ -1,579 +0,0 @@
-//===---- X86CondBrFolding.cpp - optimize conditional branches ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// This file defines a pass that optimizes conditional branches on x86 by taking
-// advantage of the three-way conditional code generated by compare
-// instructions.
-// Currently, it tries to hoist EQ and NE conditional branches to a dominant
-// conditional branch condition where the same EQ/NE conditional code is
-// computed. An example:
-// bb_0:
-// cmp %0, 19
-// jg bb_1
-// jmp bb_2
-// bb_1:
-// cmp %0, 40
-// jg bb_3
-// jmp bb_4
-// bb_4:
-// cmp %0, 20
-// je bb_5
-// jmp bb_6
-// Here we could combine the two compares in bb_0 and bb_4 and have the
-// following code:
-// bb_0:
-// cmp %0, 20
-// jg bb_1
-// jl bb_2
-// jmp bb_5
-// bb_1:
-// cmp %0, 40
-// jg bb_3
-// jmp bb_6
-// For the case of %0 == 20 (bb_5), we eliminate two jumps, and the control
-// height for bb_6 is also reduced. bb_4 is gone after the optimization.
-//
-// There are plenty of these code patterns, especially from switch-case
-// lowering, where we generate a compare of "pivot-1" for the inner nodes in the
-// binary search tree.
-//===----------------------------------------------------------------------===//
-
-#include "X86.h"
-#include "X86InstrInfo.h"
-#include "X86Subtarget.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/BranchProbability.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "x86-condbr-folding"
-
-STATISTIC(NumFixedCondBrs, "Number of x86 condbr folded");
-
-namespace {
-class X86CondBrFoldingPass : public MachineFunctionPass {
-public:
- X86CondBrFoldingPass() : MachineFunctionPass(ID) { }
- StringRef getPassName() const override { return "X86 CondBr Folding"; }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- MachineFunctionPass::getAnalysisUsage(AU);
- AU.addRequired<MachineBranchProbabilityInfo>();
- }
-
-public:
- static char ID;
-};
-} // namespace
-
-char X86CondBrFoldingPass::ID = 0;
-INITIALIZE_PASS(X86CondBrFoldingPass, "X86CondBrFolding", "X86CondBrFolding", false, false)
-
-FunctionPass *llvm::createX86CondBrFolding() {
- return new X86CondBrFoldingPass();
-}
-
-namespace {
-// A class that stores the auxiliary information for each MBB.
-struct TargetMBBInfo {
- MachineBasicBlock *TBB;
- MachineBasicBlock *FBB;
- MachineInstr *BrInstr;
- MachineInstr *CmpInstr;
- X86::CondCode BranchCode;
- unsigned SrcReg;
- int CmpValue;
- bool Modified;
- bool CmpBrOnly;
-};
-
-// A class that optimizes conditional branches by hoisting and merging CondCodes.
-class X86CondBrFolding {
-public:
- X86CondBrFolding(const X86InstrInfo *TII,
- const MachineBranchProbabilityInfo *MBPI,
- MachineFunction &MF)
- : TII(TII), MBPI(MBPI), MF(MF) {}
- bool optimize();
-
-private:
- const X86InstrInfo *TII;
- const MachineBranchProbabilityInfo *MBPI;
- MachineFunction &MF;
- std::vector<std::unique_ptr<TargetMBBInfo>> MBBInfos;
- SmallVector<MachineBasicBlock *, 4> RemoveList;
-
- void optimizeCondBr(MachineBasicBlock &MBB,
- SmallVectorImpl<MachineBasicBlock *> &BranchPath);
- void replaceBrDest(MachineBasicBlock *MBB, MachineBasicBlock *OrigDest,
- MachineBasicBlock *NewDest);
- void fixupModifiedCond(MachineBasicBlock *MBB);
- std::unique_ptr<TargetMBBInfo> analyzeMBB(MachineBasicBlock &MBB);
- static bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
- int &CmpValue);
- bool findPath(MachineBasicBlock *MBB,
- SmallVectorImpl<MachineBasicBlock *> &BranchPath);
- TargetMBBInfo *getMBBInfo(MachineBasicBlock *MBB) const {
- return MBBInfos[MBB->getNumber()].get();
- }
-};
-} // namespace
-
-// Find a valid path on which we can reuse the CondCode.
-// The resulting path (if true is returned) is stored in BranchPath.
-// Return value:
-// false: no valid path is found.
-// true: a valid path is found and the targetBB can be reached.
-bool X86CondBrFolding::findPath(
- MachineBasicBlock *MBB, SmallVectorImpl<MachineBasicBlock *> &BranchPath) {
- TargetMBBInfo *MBBInfo = getMBBInfo(MBB);
- assert(MBBInfo && "Expecting a candidate MBB");
- int CmpValue = MBBInfo->CmpValue;
-
- MachineBasicBlock *PredMBB = *MBB->pred_begin();
- MachineBasicBlock *SaveMBB = MBB;
- while (PredMBB) {
- TargetMBBInfo *PredMBBInfo = getMBBInfo(PredMBB);
- if (!PredMBBInfo || PredMBBInfo->SrcReg != MBBInfo->SrcReg)
- return false;
-
- assert(SaveMBB == PredMBBInfo->TBB || SaveMBB == PredMBBInfo->FBB);
- bool IsFalseBranch = (SaveMBB == PredMBBInfo->FBB);
-
- X86::CondCode CC = PredMBBInfo->BranchCode;
- assert(CC == X86::COND_L || CC == X86::COND_G || CC == X86::COND_E);
- int PredCmpValue = PredMBBInfo->CmpValue;
- bool ValueCmpTrue = ((CmpValue < PredCmpValue && CC == X86::COND_L) ||
- (CmpValue > PredCmpValue && CC == X86::COND_G) ||
- (CmpValue == PredCmpValue && CC == X86::COND_E));
- // Check if both the result of value compare and the branch target match.
- if (!(ValueCmpTrue ^ IsFalseBranch)) {
- LLVM_DEBUG(dbgs() << "Dead BB detected!\n");
- return false;
- }
-
- BranchPath.push_back(PredMBB);
- // These are the conditions on which we could combine the compares.
- if ((CmpValue == PredCmpValue) ||
- (CmpValue == PredCmpValue - 1 && CC == X86::COND_L) ||
- (CmpValue == PredCmpValue + 1 && CC == X86::COND_G))
- return true;
-
- // If PredMBB has more than one pred, or is not a pure cmp and br, we bail out.
- if (PredMBB->pred_size() != 1 || !PredMBBInfo->CmpBrOnly)
- return false;
-
- SaveMBB = PredMBB;
- PredMBB = *PredMBB->pred_begin();
- }
- return false;
-}
-
-// Fix up any PHI node in the successor of MBB.
-static void fixPHIsInSucc(MachineBasicBlock *MBB, MachineBasicBlock *OldMBB,
- MachineBasicBlock *NewMBB) {
- if (NewMBB == OldMBB)
- return;
- for (auto MI = MBB->instr_begin(), ME = MBB->instr_end();
- MI != ME && MI->isPHI(); ++MI)
- for (unsigned i = 2, e = MI->getNumOperands() + 1; i != e; i += 2) {
- MachineOperand &MO = MI->getOperand(i);
- if (MO.getMBB() == OldMBB)
- MO.setMBB(NewMBB);
- }
-}
-
-// Utility function to set branch probability for edge MBB->SuccMBB.
-static inline bool setBranchProb(MachineBasicBlock *MBB,
- MachineBasicBlock *SuccMBB,
- BranchProbability Prob) {
- auto MBBI = std::find(MBB->succ_begin(), MBB->succ_end(), SuccMBB);
- if (MBBI == MBB->succ_end())
- return false;
- MBB->setSuccProbability(MBBI, Prob);
- return true;
-}
-
-// Utility function to find the unconditional br instruction in MBB.
-static inline MachineBasicBlock::iterator
-findUncondBrI(MachineBasicBlock *MBB) {
- return std::find_if(MBB->begin(), MBB->end(), [](MachineInstr &MI) -> bool {
- return MI.getOpcode() == X86::JMP_1;
- });
-}
-
-// Replace MBB's original successor, OrigDest, with NewDest.
-// Also update the MBBInfo for MBB.
-void X86CondBrFolding::replaceBrDest(MachineBasicBlock *MBB,
- MachineBasicBlock *OrigDest,
- MachineBasicBlock *NewDest) {
- TargetMBBInfo *MBBInfo = getMBBInfo(MBB);
- MachineInstr *BrMI;
- if (MBBInfo->TBB == OrigDest) {
- BrMI = MBBInfo->BrInstr;
- MachineInstrBuilder MIB =
- BuildMI(*MBB, BrMI, MBB->findDebugLoc(BrMI), TII->get(X86::JCC_1))
- .addMBB(NewDest).addImm(MBBInfo->BranchCode);
- MBBInfo->TBB = NewDest;
- MBBInfo->BrInstr = MIB.getInstr();
- } else { // Should be the unconditional jump stmt.
- MachineBasicBlock::iterator UncondBrI = findUncondBrI(MBB);
- BuildMI(*MBB, UncondBrI, MBB->findDebugLoc(UncondBrI), TII->get(X86::JMP_1))
- .addMBB(NewDest);
- MBBInfo->FBB = NewDest;
- BrMI = &*UncondBrI;
- }
- fixPHIsInSucc(NewDest, OrigDest, MBB);
- BrMI->eraseFromParent();
- MBB->addSuccessor(NewDest);
- setBranchProb(MBB, NewDest, MBPI->getEdgeProbability(MBB, OrigDest));
- MBB->removeSuccessor(OrigDest);
-}
-
-// Change the CondCode and BrInstr according to MBBInfo.
-void X86CondBrFolding::fixupModifiedCond(MachineBasicBlock *MBB) {
- TargetMBBInfo *MBBInfo = getMBBInfo(MBB);
- if (!MBBInfo->Modified)
- return;
-
- MachineInstr *BrMI = MBBInfo->BrInstr;
- X86::CondCode CC = MBBInfo->BranchCode;
- MachineInstrBuilder MIB = BuildMI(*MBB, BrMI, MBB->findDebugLoc(BrMI),
- TII->get(X86::JCC_1))
- .addMBB(MBBInfo->TBB).addImm(CC);
- BrMI->eraseFromParent();
- MBBInfo->BrInstr = MIB.getInstr();
-
- MachineBasicBlock::iterator UncondBrI = findUncondBrI(MBB);
- BuildMI(*MBB, UncondBrI, MBB->findDebugLoc(UncondBrI), TII->get(X86::JMP_1))
- .addMBB(MBBInfo->FBB);
- MBB->erase(UncondBrI);
- MBBInfo->Modified = false;
-}
-
-//
-// Apply the transformation:
-// RootMBB -1-> ... PredMBB -3-> MBB -5-> TargetMBB
-// \-2-> \-4-> \-6-> FalseMBB
-// ==>
-// RootMBB -1-> ... PredMBB -7-> FalseMBB
-// TargetMBB <-8-/ \-2-> \-4->
-//
-// Note that PredMBB and RootMBB could be the same.
-// And in the case of a dead TargetMBB, we will not have TargetMBB or edge 8.
-//
-// There is some special handling where the RootMBB is COND_E, in which case
-// we directly short-cycle the brinstr.
-//
-void X86CondBrFolding::optimizeCondBr(
- MachineBasicBlock &MBB, SmallVectorImpl<MachineBasicBlock *> &BranchPath) {
-
- X86::CondCode CC;
- TargetMBBInfo *MBBInfo = getMBBInfo(&MBB);
- assert(MBBInfo && "Expecting a candidate MBB");
- MachineBasicBlock *TargetMBB = MBBInfo->TBB;
- BranchProbability TargetProb = MBPI->getEdgeProbability(&MBB, MBBInfo->TBB);
-
- // Forward the jump from MBB's predecessor to MBB's false target.
- MachineBasicBlock *PredMBB = BranchPath.front();
- TargetMBBInfo *PredMBBInfo = getMBBInfo(PredMBB);
- assert(PredMBBInfo && "Expecting a candidate MBB");
- if (PredMBBInfo->Modified)
- fixupModifiedCond(PredMBB);
- CC = PredMBBInfo->BranchCode;
- // Don't do this if the depth of BranchPath is 1 and PredMBB is COND_E.
- // We will short-cycle directly for this case.
- if (!(CC == X86::COND_E && BranchPath.size() == 1))
- replaceBrDest(PredMBB, &MBB, MBBInfo->FBB);
-
- MachineBasicBlock *RootMBB = BranchPath.back();
- TargetMBBInfo *RootMBBInfo = getMBBInfo(RootMBB);
- assert(RootMBBInfo && "Expecting a candidate MBB");
- if (RootMBBInfo->Modified)
- fixupModifiedCond(RootMBB);
- CC = RootMBBInfo->BranchCode;
-
- if (CC != X86::COND_E) {
- MachineBasicBlock::iterator UncondBrI = findUncondBrI(RootMBB);
- // RootMBB: Cond jump to the original not-taken MBB.
- X86::CondCode NewCC;
- switch (CC) {
- case X86::COND_L:
- NewCC = X86::COND_G;
- break;
- case X86::COND_G:
- NewCC = X86::COND_L;
- break;
- default:
- llvm_unreachable("unexpected conditional code.");
- }
- BuildMI(*RootMBB, UncondBrI, RootMBB->findDebugLoc(UncondBrI),
- TII->get(X86::JCC_1))
- .addMBB(RootMBBInfo->FBB).addImm(NewCC);
-
- // RootMBB: Jump to TargetMBB
- BuildMI(*RootMBB, UncondBrI, RootMBB->findDebugLoc(UncondBrI),
- TII->get(X86::JMP_1))
- .addMBB(TargetMBB);
- RootMBB->addSuccessor(TargetMBB);
- fixPHIsInSucc(TargetMBB, &MBB, RootMBB);
- RootMBB->erase(UncondBrI);
- } else {
- replaceBrDest(RootMBB, RootMBBInfo->TBB, TargetMBB);
- }
-
- // Fix RootMBB's CmpValue to MBB's CmpValue for TargetMBB. Don't set the Imm
- // directly; move MBB's cmp instruction here, as the opcode might be different.
- if (RootMBBInfo->CmpValue != MBBInfo->CmpValue) {
- MachineInstr *NewCmp = MBBInfo->CmpInstr;
- NewCmp->removeFromParent();
- RootMBB->insert(RootMBBInfo->CmpInstr, NewCmp);
- RootMBBInfo->CmpInstr->eraseFromParent();
- }
-
- // Fix branch Probabilities.
- auto fixBranchProb = [&](MachineBasicBlock *NextMBB) {
- BranchProbability Prob;
- for (auto &I : BranchPath) {
- MachineBasicBlock *ThisMBB = I;
- if (!ThisMBB->hasSuccessorProbabilities() ||
- !ThisMBB->isSuccessor(NextMBB))
- break;
- Prob = MBPI->getEdgeProbability(ThisMBB, NextMBB);
- if (Prob.isUnknown())
- break;
- TargetProb = Prob * TargetProb;
- Prob = Prob - TargetProb;
- setBranchProb(ThisMBB, NextMBB, Prob);
- if (ThisMBB == RootMBB) {
- setBranchProb(ThisMBB, TargetMBB, TargetProb);
- }
- ThisMBB->normalizeSuccProbs();
- if (ThisMBB == RootMBB)
- break;
- NextMBB = ThisMBB;
- }
- return true;
- };
- if (CC != X86::COND_E && !TargetProb.isUnknown())
- fixBranchProb(MBBInfo->FBB);
-
- if (CC != X86::COND_E)
- RemoveList.push_back(&MBB);
-
- // Invalidate MBBInfo just in case.
- MBBInfos[MBB.getNumber()] = nullptr;
- MBBInfos[RootMBB->getNumber()] = nullptr;
-
- LLVM_DEBUG(dbgs() << "After optimization:\nRootMBB is: " << *RootMBB << "\n");
- if (BranchPath.size() > 1)
- LLVM_DEBUG(dbgs() << "PredMBB is: " << *(BranchPath[0]) << "\n");
-}
-
-// Driver function for optimization: find the valid candidate and apply
-// the transformation.
-bool X86CondBrFolding::optimize() {
- bool Changed = false;
- LLVM_DEBUG(dbgs() << "***** X86CondBr Folding on Function: " << MF.getName()
- << " *****\n");
- // Setup data structures.
- MBBInfos.resize(MF.getNumBlockIDs());
- for (auto &MBB : MF)
- MBBInfos[MBB.getNumber()] = analyzeMBB(MBB);
-
- for (auto &MBB : MF) {
- TargetMBBInfo *MBBInfo = getMBBInfo(&MBB);
- if (!MBBInfo || !MBBInfo->CmpBrOnly)
- continue;
- if (MBB.pred_size() != 1)
- continue;
- LLVM_DEBUG(dbgs() << "Work on MBB." << MBB.getNumber()
- << " CmpValue: " << MBBInfo->CmpValue << "\n");
- SmallVector<MachineBasicBlock *, 4> BranchPath;
- if (!findPath(&MBB, BranchPath))
- continue;
-
-#ifndef NDEBUG
- LLVM_DEBUG(dbgs() << "Found one path (len=" << BranchPath.size() << "):\n");
- int Index = 1;
- LLVM_DEBUG(dbgs() << "Target MBB is: " << MBB << "\n");
- for (auto I = BranchPath.rbegin(); I != BranchPath.rend(); ++I, ++Index) {
- MachineBasicBlock *PMBB = *I;
- TargetMBBInfo *PMBBInfo = getMBBInfo(PMBB);
- LLVM_DEBUG(dbgs() << "Path MBB (" << Index << " of " << BranchPath.size()
- << ") is " << *PMBB);
- LLVM_DEBUG(dbgs() << "CC=" << PMBBInfo->BranchCode
- << " Val=" << PMBBInfo->CmpValue
- << " CmpBrOnly=" << PMBBInfo->CmpBrOnly << "\n\n");
- }
-#endif
- optimizeCondBr(MBB, BranchPath);
- Changed = true;
- }
- NumFixedCondBrs += RemoveList.size();
- for (auto MBBI : RemoveList) {
- while (!MBBI->succ_empty())
- MBBI->removeSuccessor(MBBI->succ_end() - 1);
-
- MBBI->eraseFromParent();
- }
-
- return Changed;
-}
-
-// Analyze instructions that generate CondCode and extract information.
-bool X86CondBrFolding::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
- int &CmpValue) {
- unsigned SrcRegIndex = 0;
- unsigned ValueIndex = 0;
- switch (MI.getOpcode()) {
- // TODO: handle test instructions.
- default:
- return false;
- case X86::CMP64ri32:
- case X86::CMP64ri8:
- case X86::CMP32ri:
- case X86::CMP32ri8:
- case X86::CMP16ri:
- case X86::CMP16ri8:
- case X86::CMP8ri:
- SrcRegIndex = 0;
- ValueIndex = 1;
- break;
- case X86::SUB64ri32:
- case X86::SUB64ri8:
- case X86::SUB32ri:
- case X86::SUB32ri8:
- case X86::SUB16ri:
- case X86::SUB16ri8:
- case X86::SUB8ri:
- SrcRegIndex = 1;
- ValueIndex = 2;
- break;
- }
- SrcReg = MI.getOperand(SrcRegIndex).getReg();
- if (!MI.getOperand(ValueIndex).isImm())
- return false;
- CmpValue = MI.getOperand(ValueIndex).getImm();
- return true;
-}
-
-// Analyze a candidate MBB and extract all the information needed.
-// A valid candidate will have two successors.
-// It should also have a sequence of
-// Branch_instr,
-// CondBr,
-// UnCondBr.
-// Return TargetMBBInfo if MBB is a valid candidate and nullptr otherwise.
-std::unique_ptr<TargetMBBInfo>
-X86CondBrFolding::analyzeMBB(MachineBasicBlock &MBB) {
- MachineBasicBlock *TBB;
- MachineBasicBlock *FBB;
- MachineInstr *BrInstr;
- MachineInstr *CmpInstr;
- X86::CondCode CC;
- unsigned SrcReg;
- int CmpValue;
- bool Modified;
- bool CmpBrOnly;
-
- if (MBB.succ_size() != 2)
- return nullptr;
-
- CmpBrOnly = true;
- FBB = TBB = nullptr;
- CmpInstr = nullptr;
- MachineBasicBlock::iterator I = MBB.end();
- while (I != MBB.begin()) {
- --I;
- if (I->isDebugValue())
- continue;
- if (I->getOpcode() == X86::JMP_1) {
- if (FBB)
- return nullptr;
- FBB = I->getOperand(0).getMBB();
- continue;
- }
- if (I->isBranch()) {
- if (TBB)
- return nullptr;
- CC = X86::getCondFromBranch(*I);
- switch (CC) {
- default:
- return nullptr;
- case X86::COND_E:
- case X86::COND_L:
- case X86::COND_G:
- case X86::COND_NE:
- case X86::COND_LE:
- case X86::COND_GE:
- break;
- }
- TBB = I->getOperand(0).getMBB();
- BrInstr = &*I;
- continue;
- }
- if (analyzeCompare(*I, SrcReg, CmpValue)) {
- if (CmpInstr)
- return nullptr;
- CmpInstr = &*I;
- continue;
- }
- CmpBrOnly = false;
- break;
- }
-
- if (!TBB || !FBB || !CmpInstr)
- return nullptr;
-
- // Simplify CondCode. Note this is only to simplify the findPath logic
- // and will not change the instruction here.
- switch (CC) {
- case X86::COND_NE:
- CC = X86::COND_E;
- std::swap(TBB, FBB);
- Modified = true;
- break;
- case X86::COND_LE:
- if (CmpValue == INT_MAX)
- return nullptr;
- CC = X86::COND_L;
- CmpValue += 1;
- Modified = true;
- break;
- case X86::COND_GE:
- if (CmpValue == INT_MIN)
- return nullptr;
- CC = X86::COND_G;
- CmpValue -= 1;
- Modified = true;
- break;
- default:
- Modified = false;
- break;
- }
- return std::make_unique<TargetMBBInfo>(TargetMBBInfo{
- TBB, FBB, BrInstr, CmpInstr, CC, SrcReg, CmpValue, Modified, CmpBrOnly});
-}
-
-bool X86CondBrFoldingPass::runOnMachineFunction(MachineFunction &MF) {
- const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
- if (!ST.threewayBranchProfitable())
- return false;
- const X86InstrInfo *TII = ST.getInstrInfo();
- const MachineBranchProbabilityInfo *MBPI =
- &getAnalysis<MachineBranchProbabilityInfo>();
-
- X86CondBrFolding CondBr(TII, MBPI, MF);
- return CondBr.optimize();
-}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp
index 488ee51f1d89..a2ae6345c006 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp
@@ -141,7 +141,7 @@ public:
return false;
// It's illegal to replace an instruction that implicitly defines a register
// with an instruction that doesn't, unless that register is dead.
- for (auto &MO : MI->implicit_operands())
+ for (const auto &MO : MI->implicit_operands())
if (MO.isReg() && MO.isDef() && !MO.isDead() &&
!TII->get(DstOpcode).hasImplicitDefOfPhysReg(MO.getReg()))
return false;
@@ -180,7 +180,7 @@ public:
MachineRegisterInfo *MRI) const override {
assert(isLegal(MI, TII) && "Cannot convert instruction");
MachineBasicBlock *MBB = MI->getParent();
- auto &DL = MI->getDebugLoc();
+ const DebugLoc &DL = MI->getDebugLoc();
Register Reg = MRI->createVirtualRegister(
TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo(),
@@ -220,14 +220,12 @@ public:
// Don't allow copies to/from GR8/GR16 physical registers.
// FIXME: Is there some better way to support this?
Register DstReg = MI->getOperand(0).getReg();
- if (Register::isPhysicalRegister(DstReg) &&
- (X86::GR8RegClass.contains(DstReg) ||
- X86::GR16RegClass.contains(DstReg)))
+ if (DstReg.isPhysical() && (X86::GR8RegClass.contains(DstReg) ||
+ X86::GR16RegClass.contains(DstReg)))
return false;
Register SrcReg = MI->getOperand(1).getReg();
- if (Register::isPhysicalRegister(SrcReg) &&
- (X86::GR8RegClass.contains(SrcReg) ||
- X86::GR16RegClass.contains(SrcReg)))
+ if (SrcReg.isPhysical() && (X86::GR8RegClass.contains(SrcReg) ||
+ X86::GR16RegClass.contains(SrcReg)))
return false;
return true;
@@ -237,7 +235,7 @@ public:
MachineRegisterInfo *MRI) const override {
assert(MI->getOpcode() == TargetOpcode::COPY && "Expected a COPY");
- for (auto &MO : MI->operands()) {
+ for (const auto &MO : MI->operands()) {
// Physical registers will not be converted. Assume that converting the
// COPY to the destination domain will eventually result in an actual
// instruction.
@@ -300,7 +298,7 @@ typedef DenseMap<InstrConverterBaseKeyTy, std::unique_ptr<InstrConverterBase>>
class Closure {
private:
/// Virtual registers in the closure.
- DenseSet<unsigned> Edges;
+ DenseSet<Register> Edges;
/// Instructions in the closure.
SmallVector<MachineInstr *, 8> Instrs;
@@ -332,11 +330,9 @@ public:
bool empty() const { return Edges.empty(); }
- bool insertEdge(unsigned Reg) {
- return Edges.insert(Reg).second;
- }
+ bool insertEdge(Register Reg) { return Edges.insert(Reg).second; }
- using const_edge_iterator = DenseSet<unsigned>::const_iterator;
+ using const_edge_iterator = DenseSet<Register>::const_iterator;
iterator_range<const_edge_iterator> edges() const {
return iterator_range<const_edge_iterator>(Edges.begin(), Edges.end());
}
@@ -352,7 +348,7 @@ public:
LLVM_DUMP_METHOD void dump(const MachineRegisterInfo *MRI) const {
dbgs() << "Registers: ";
bool First = true;
- for (unsigned Reg : Edges) {
+ for (Register Reg : Edges) {
if (!First)
dbgs() << ", ";
First = false;
@@ -407,10 +403,10 @@ private:
void initConverters();
/// Starting from \p Reg, expand the closure as much as possible.
- void buildClosure(Closure &, unsigned Reg);
+ void buildClosure(Closure &, Register Reg);
/// Enqueue \p Reg to be considered for addition to the closure.
- void visitRegister(Closure &, unsigned Reg, RegDomain &Domain,
+ void visitRegister(Closure &, Register Reg, RegDomain &Domain,
SmallVectorImpl<unsigned> &Worklist);
/// Reassign the closure to \p Domain.
@@ -430,13 +426,13 @@ char X86DomainReassignment::ID = 0;
} // End anonymous namespace.
-void X86DomainReassignment::visitRegister(Closure &C, unsigned Reg,
+void X86DomainReassignment::visitRegister(Closure &C, Register Reg,
RegDomain &Domain,
SmallVectorImpl<unsigned> &Worklist) {
if (EnclosedEdges.count(Reg))
return;
- if (!Register::isVirtualRegister(Reg))
+ if (!Reg.isVirtual())
return;
if (!MRI->hasOneDef(Reg))
@@ -507,7 +503,7 @@ void X86DomainReassignment::reassign(const Closure &C, RegDomain Domain) const {
// Iterate all registers in the closure, replace them with registers in the
// destination domain.
- for (unsigned Reg : C.edges()) {
+ for (Register Reg : C.edges()) {
MRI->setRegClass(Reg, getDstRC(MRI->getRegClass(Reg), Domain));
for (auto &MO : MRI->use_operands(Reg)) {
if (MO.isReg())
@@ -517,13 +513,13 @@ void X86DomainReassignment::reassign(const Closure &C, RegDomain Domain) const {
}
}
- for (auto MI : ToErase)
+ for (auto *MI : ToErase)
MI->eraseFromParent();
}
/// \returns true when \p Reg is used as part of an address calculation in \p
/// MI.
-static bool usedAsAddr(const MachineInstr &MI, unsigned Reg,
+static bool usedAsAddr(const MachineInstr &MI, Register Reg,
const TargetInstrInfo *TII) {
if (!MI.mayLoadOrStore())
return false;
@@ -537,14 +533,14 @@ static bool usedAsAddr(const MachineInstr &MI, unsigned Reg,
for (unsigned MemOpIdx = MemOpStart,
MemOpEnd = MemOpStart + X86::AddrNumOperands;
MemOpIdx < MemOpEnd; ++MemOpIdx) {
- auto &Op = MI.getOperand(MemOpIdx);
+ const MachineOperand &Op = MI.getOperand(MemOpIdx);
if (Op.isReg() && Op.getReg() == Reg)
return true;
}
return false;
}
-void X86DomainReassignment::buildClosure(Closure &C, unsigned Reg) {
+void X86DomainReassignment::buildClosure(Closure &C, Register Reg) {
SmallVector<unsigned, 4> Worklist;
RegDomain Domain = NoDomain;
visitRegister(C, Reg, Domain, Worklist);
@@ -594,7 +590,7 @@ void X86DomainReassignment::buildClosure(Closure &C, unsigned Reg) {
continue;
Register DefReg = DefOp.getReg();
- if (!Register::isVirtualRegister(DefReg)) {
+ if (!DefReg.isVirtual()) {
C.setAllIllegal();
continue;
}
@@ -753,7 +749,7 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
// Go over all virtual registers and calculate a closure.
unsigned ClosureID = 0;
for (unsigned Idx = 0; Idx < MRI->getNumVirtRegs(); ++Idx) {
- unsigned Reg = Register::index2VirtReg(Idx);
+ Register Reg = Register::index2VirtReg(Idx);
// Only the GPR domain is currently supported as the source domain.
if (!isGPR(MRI->getRegClass(Reg)))
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp
index 540ad98b6d54..97f843fa24eb 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp
@@ -85,6 +85,8 @@ public:
private:
/// Machine instruction info used throughout the class.
const X86InstrInfo *TII = nullptr;
+
+ const X86Subtarget *ST = nullptr;
};
} // end anonymous namespace
@@ -94,8 +96,8 @@ char EvexToVexInstPass::ID = 0;
bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) {
TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
- const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
- if (!ST.hasAVX512())
+ ST = &MF.getSubtarget<X86Subtarget>();
+ if (!ST->hasAVX512())
return false;
bool Changed = false;
@@ -144,10 +146,29 @@ static bool usesExtendedRegister(const MachineInstr &MI) {
}
// Do any custom cleanup needed to finalize the conversion.
-static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
+static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc,
+ const X86Subtarget *ST) {
(void)NewOpc;
unsigned Opc = MI.getOpcode();
switch (Opc) {
+ case X86::VPDPBUSDSZ256m:
+ case X86::VPDPBUSDSZ256r:
+ case X86::VPDPBUSDSZ128m:
+ case X86::VPDPBUSDSZ128r:
+ case X86::VPDPBUSDZ256m:
+ case X86::VPDPBUSDZ256r:
+ case X86::VPDPBUSDZ128m:
+ case X86::VPDPBUSDZ128r:
+ case X86::VPDPWSSDSZ256m:
+ case X86::VPDPWSSDSZ256r:
+ case X86::VPDPWSSDSZ128m:
+ case X86::VPDPWSSDSZ128r:
+ case X86::VPDPWSSDZ256m:
+ case X86::VPDPWSSDZ256r:
+ case X86::VPDPWSSDZ128m:
+ case X86::VPDPWSSDZ128r:
+ // These can only be converted to VEX if AVXVNNI is enabled.
+ return ST->hasAVXVNNI();
case X86::VALIGNDZ128rri:
case X86::VALIGNDZ128rmi:
case X86::VALIGNQZ128rri:
@@ -250,7 +271,7 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
(Desc.TSFlags & X86II::VEX_L) ? makeArrayRef(X86EvexToVex256CompressTable)
: makeArrayRef(X86EvexToVex128CompressTable);
- auto I = llvm::lower_bound(Table, MI.getOpcode());
+ const auto *I = llvm::lower_bound(Table, MI.getOpcode());
if (I == Table.end() || I->EvexOpcode != MI.getOpcode())
return false;
@@ -259,7 +280,7 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
if (usesExtendedRegister(MI))
return false;
- if (!performCustomAdjustments(MI, NewOpc))
+ if (!performCustomAdjustments(MI, NewOpc, ST))
return false;
MI.setDesc(TII->get(NewOpc));
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index c47ef4708e91..15af0fb2e888 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -334,32 +334,28 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MBB.erase(MBBI);
return true;
}
- case X86::LCMPXCHG8B_SAVE_EBX:
case X86::LCMPXCHG16B_SAVE_RBX: {
// Perform the following transformation.
// SaveRbx = pseudocmpxchg Addr, <4 opds for the address>, InArg, SaveRbx
// =>
- // [E|R]BX = InArg
+ // RBX = InArg
// actualcmpxchg Addr
- // [E|R]BX = SaveRbx
+ // RBX = SaveRbx
const MachineOperand &InArg = MBBI->getOperand(6);
Register SaveRbx = MBBI->getOperand(7).getReg();
- unsigned ActualInArg =
- Opcode == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
// Copy the input argument of the pseudo into the argument of the
// actual instruction.
- TII->copyPhysReg(MBB, MBBI, DL, ActualInArg, InArg.getReg(),
- InArg.isKill());
+ // NOTE: We don't copy the kill flag since the input might be the same reg
+ // as one of the other operands of LCMPXCHG16B.
+ TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, InArg.getReg(), false);
// Create the actual instruction.
- unsigned ActualOpc =
- Opcode == X86::LCMPXCHG8B_SAVE_EBX ? X86::LCMPXCHG8B : X86::LCMPXCHG16B;
- MachineInstr *NewInstr = BuildMI(MBB, MBBI, DL, TII->get(ActualOpc));
+ MachineInstr *NewInstr = BuildMI(MBB, MBBI, DL, TII->get(X86::LCMPXCHG16B));
// Copy the operands related to the address.
for (unsigned Idx = 1; Idx < 6; ++Idx)
NewInstr->addOperand(MBBI->getOperand(Idx));
// Finally, restore the value of RBX.
- TII->copyPhysReg(MBB, MBBI, DL, ActualInArg, SaveRbx,
+ TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, SaveRbx,
/*SrcIsKill*/ true);
// Delete the pseudo.
@@ -442,9 +438,68 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MBB.erase(MBBI);
return true;
}
+ case X86::MWAITX_SAVE_RBX: {
+ // Perform the following transformation.
+ // SaveRbx = pseudomwaitx InArg, SaveRbx
+ // =>
+ // [E|R]BX = InArg
+ // actualmwaitx
+ // [E|R]BX = SaveRbx
+ const MachineOperand &InArg = MBBI->getOperand(1);
+ // Copy the input argument of the pseudo into the argument of the
+ // actual instruction.
+ TII->copyPhysReg(MBB, MBBI, DL, X86::EBX, InArg.getReg(), InArg.isKill());
+ // Create the actual instruction.
+ BuildMI(MBB, MBBI, DL, TII->get(X86::MWAITXrrr));
+ // Finally, restore the value of RBX.
+ Register SaveRbx = MBBI->getOperand(2).getReg();
+ TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, SaveRbx, /*SrcIsKill*/ true);
+ // Delete the pseudo.
+ MBBI->eraseFromParent();
+ return true;
+ }
case TargetOpcode::ICALL_BRANCH_FUNNEL:
ExpandICallBranchFunnel(&MBB, MBBI);
return true;
+ case X86::PLDTILECFG: {
+ MI.RemoveOperand(0);
+ MI.setDesc(TII->get(X86::LDTILECFG));
+ return true;
+ }
+ case X86::PSTTILECFG: {
+ MI.RemoveOperand(MI.getNumOperands() - 1); // Remove $tmmcfg
+ MI.setDesc(TII->get(X86::STTILECFG));
+ return true;
+ }
+ case X86::PTILELOADDV: {
+ MI.RemoveOperand(8); // Remove $tmmcfg
+ for (unsigned i = 2; i > 0; --i)
+ MI.RemoveOperand(i);
+ MI.setDesc(TII->get(X86::TILELOADD));
+ return true;
+ }
+ case X86::PTDPBSSDV: {
+ MI.RemoveOperand(7); // Remove $tmmcfg
+ MI.untieRegOperand(4);
+ for (unsigned i = 3; i > 0; --i)
+ MI.RemoveOperand(i);
+ MI.setDesc(TII->get(X86::TDPBSSD));
+ MI.tieOperands(0, 1);
+ return true;
+ }
+ case X86::PTILESTOREDV: {
+ MI.RemoveOperand(8); // Remove $tmmcfg
+ for (int i = 1; i >= 0; --i)
+ MI.RemoveOperand(i);
+ MI.setDesc(TII->get(X86::TILESTORED));
+ return true;
+ }
+ case X86::PTILEZEROV: {
+ for (int i = 3; i > 0; --i) // Remove row, col, $tmmcfg
+ MI.RemoveOperand(i);
+ MI.setDesc(TII->get(X86::TILEZERO));
+ return true;
+ }
}
llvm_unreachable("Previous switch has a fallthrough?");
}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp
index b305940139c0..caf158102230 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp
@@ -779,14 +779,14 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
if (TLI.getPointerTy(DL) == MVT::i64) {
Opc = X86::MOV64rm;
RC = &X86::GR64RegClass;
-
- if (Subtarget->isPICStyleRIPRel())
- StubAM.Base.Reg = X86::RIP;
} else {
Opc = X86::MOV32rm;
RC = &X86::GR32RegClass;
}
+ if (Subtarget->isPICStyleRIPRel() || GVFlags == X86II::MO_GOTPCREL)
+ StubAM.Base.Reg = X86::RIP;
+
LoadReg = createResultReg(RC);
MachineInstrBuilder LoadMI =
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg);
@@ -1082,13 +1082,35 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
// If all else fails, try to materialize the value in a register.
if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
+ auto GetCallRegForValue = [this](const Value *V) {
+ Register Reg = getRegForValue(V);
+
+ // In 64-bit mode, we need a 64-bit register even if pointers are 32 bits.
+ if (Reg && Subtarget->isTarget64BitILP32()) {
+ Register CopyReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32rr),
+ CopyReg)
+ .addReg(Reg);
+
+ Register ExtReg = createResultReg(&X86::GR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::SUBREG_TO_REG), ExtReg)
+ .addImm(0)
+ .addReg(CopyReg)
+ .addImm(X86::sub_32bit);
+ Reg = ExtReg;
+ }
+
+ return Reg;
+ };
+
if (AM.Base.Reg == 0) {
- AM.Base.Reg = getRegForValue(V);
+ AM.Base.Reg = GetCallRegForValue(V);
return AM.Base.Reg != 0;
}
if (AM.IndexReg == 0) {
assert(AM.Scale == 1 && "Scale with no index!");
- AM.IndexReg = getRegForValue(V);
+ AM.IndexReg = GetCallRegForValue(V);
return AM.IndexReg != 0;
}
}
@@ -1231,13 +1253,15 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
if (SrcVT == MVT::i1) {
if (Outs[0].Flags.isSExt())
return false;
- SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
+ // TODO
+ SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*Op0IsKill=*/false);
SrcVT = MVT::i8;
}
unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND :
ISD::SIGN_EXTEND;
- SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op,
- SrcReg, /*TODO: Kill=*/false);
+ // TODO
+ SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, SrcReg,
+ /*Op0IsKill=*/false);
}
// Make the copy.
@@ -1431,8 +1455,8 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
ResultReg = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0),
ResultReg);
- ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true,
- X86::sub_8bit);
+ ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg,
+ /*Op0IsKill=*/true, X86::sub_8bit);
if (!ResultReg)
return false;
break;
@@ -1555,11 +1579,11 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVZX32rr8),
Result32).addReg(ResultReg);
- ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, /*Kill=*/true,
- X86::sub_16bit);
+ ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32,
+ /*Op0IsKill=*/true, X86::sub_16bit);
} else if (DstVT != MVT::i8) {
ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
- ResultReg, /*Kill=*/true);
+ ResultReg, /*Op0IsKill=*/true);
if (ResultReg == 0)
return false;
}
@@ -1601,11 +1625,11 @@ bool X86FastISel::X86SelectSExt(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVSX32rr8),
Result32).addReg(ResultReg);
- ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, /*Kill=*/true,
- X86::sub_16bit);
+ ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32,
+ /*Op0IsKill=*/true, X86::sub_16bit);
} else if (DstVT != MVT::i8) {
ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::SIGN_EXTEND,
- ResultReg, /*Kill=*/true);
+ ResultReg, /*Op0IsKill=*/true);
if (ResultReg == 0)
return false;
}
@@ -1757,7 +1781,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), OpReg)
.addReg(KOpReg);
- OpReg = fastEmitInst_extractsubreg(MVT::i8, OpReg, /*Kill=*/true,
+ OpReg = fastEmitInst_extractsubreg(MVT::i8, OpReg, /*Op0IsKill=*/true,
X86::sub_8bit);
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
@@ -1989,7 +2013,7 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
// Now reference the 8-bit subreg of the result.
ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
- /*Kill=*/true, X86::sub_8bit);
+ /*Op0IsKill=*/true, X86::sub_8bit);
}
// Copy the result out of the physreg if we haven't already.
if (!ResultReg) {
@@ -2103,7 +2127,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), CondReg)
.addReg(KCondReg, getKillRegState(CondIsKill));
- CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Kill=*/true,
+ CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Op0IsKill=*/true,
X86::sub_8bit);
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
@@ -2257,12 +2281,12 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
const TargetRegisterClass *VR128 = &X86::VR128RegClass;
Register CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
CmpRHSReg, CmpRHSIsKill, CC);
- Register AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, /*IsKill=*/false,
- LHSReg, LHSIsKill);
- Register AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, /*IsKill=*/true,
- RHSReg, RHSIsKill);
- Register OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*IsKill=*/true,
- AndReg, /*IsKill=*/true);
+ Register AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg,
+ /*Op0IsKill=*/false, LHSReg, LHSIsKill);
+ Register AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg,
+ /*Op0IsKill=*/true, RHSReg, RHSIsKill);
+ Register OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*Op0IsKill=*/true,
+ AndReg, /*Op1IsKill=*/true);
ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg).addReg(OrReg);
@@ -2321,7 +2345,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), CondReg)
.addReg(KCondReg, getKillRegState(CondIsKill));
- CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Kill=*/true,
+ CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Op0IsKill=*/true,
X86::sub_8bit);
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
@@ -2578,7 +2602,7 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
unsigned Reg;
bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg);
- RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM);
+ RV &= X86FastEmitStore(VT, Reg, /*ValIsKill=*/true, DestAM);
assert(RV && "Failed to emit load or store??");
unsigned Size = VT.getSizeInBits()/8;
@@ -2642,15 +2666,15 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!");
// Explicitly zero-extend the input to 32-bit.
InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::ZERO_EXTEND, InputReg,
- /*Kill=*/false);
+ /*Op0IsKill=*/false);
// The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr.
InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR,
- InputReg, /*Kill=*/true);
+ InputReg, /*Op0IsKill=*/true);
unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPH2PSZ128rr
: X86::VCVTPH2PSrr;
- InputReg = fastEmitInst_r(Opc, RC, InputReg, /*Kill=*/true);
+ InputReg = fastEmitInst_r(Opc, RC, InputReg, /*Op0IsKill=*/true);
// The result value is in the lower 32-bits of ResultReg.
// Emit an explicit copy from register class VR128 to register class FR32.
@@ -2706,10 +2730,9 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
// movq (%rax), %rax
// movq (%rax), %rax
// ...
- unsigned DestReg;
unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
while (Depth--) {
- DestReg = createResultReg(RC);
+ Register DestReg = createResultReg(RC);
addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), DestReg), SrcReg);
SrcReg = DestReg;
@@ -2879,8 +2902,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
const Value *RHS = II->getArgOperand(1);
// Canonicalize immediate to the RHS.
- if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
- isCommutativeIntrinsic(II))
+ if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && II->isCommutative())
std::swap(LHS, RHS);
unsigned BaseOpc, CondCode;
@@ -3693,10 +3715,10 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
default: llvm_unreachable("Unexpected value type");
case MVT::i1:
case MVT::i8:
- return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true,
+ return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Op0IsKill=*/true,
X86::sub_8bit);
case MVT::i16:
- return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Kill=*/true,
+ return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Op0IsKill=*/true,
X86::sub_16bit);
case MVT::i32:
return SrcReg;
@@ -3793,7 +3815,7 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
.addConstantPoolIndex(CPI, 0, OpFlag);
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg);
- addDirectMem(MIB, AddrReg);
+ addRegReg(MIB, AddrReg, false, PICBase, false);
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
MachinePointerInfo::getConstantPool(*FuncInfo.MF),
MachineMemOperand::MOLoad, DL.getPointerSize(), Alignment);
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupBWInsts.cpp
index 78de041329e2..f8d822aebc5b 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupBWInsts.cpp
@@ -187,8 +187,7 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) {
/// If so, return that super register in \p SuperDestReg.
bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI,
Register &SuperDestReg) const {
- auto *TRI = &TII->getRegisterInfo();
-
+ const X86RegisterInfo *TRI = &TII->getRegisterInfo();
Register OrigDestReg = OrigMI->getOperand(0).getReg();
SuperDestReg = getX86SubSuperRegister(OrigDestReg, 32);
@@ -320,7 +319,7 @@ MachineInstr *FixupBWInstPass::tryReplaceCopy(MachineInstr *MI) const {
// This is only correct if we access the same subregister index: otherwise,
// we could try to replace "movb %ah, %al" with "movl %eax, %eax".
- auto *TRI = &TII->getRegisterInfo();
+ const X86RegisterInfo *TRI = &TII->getRegisterInfo();
if (TRI->getSubRegIndex(NewSrcReg, OldSrc.getReg()) !=
TRI->getSubRegIndex(NewDestReg, OldDest.getReg()))
return nullptr;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp
index 424279038921..0054d5818a96 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp
@@ -376,7 +376,8 @@ bool FixupLEAPass::optTwoAddrLEA(MachineBasicBlock::iterator &I,
const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg);
if (Segment.getReg() != 0 || !Disp.isImm() || Scale.getImm() > 1 ||
- !TII->isSafeToClobberEFLAGS(MBB, I))
+ MBB.computeRegisterLiveness(TRI, X86::EFLAGS, I) !=
+ MachineBasicBlock::LQR_Dead)
return false;
Register DestReg = MI.getOperand(0).getReg();
@@ -449,6 +450,7 @@ bool FixupLEAPass::optTwoAddrLEA(MachineBasicBlock::iterator &I,
} else
return false;
+ MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1);
MBB.erase(I);
I = NewMI;
return true;
@@ -484,6 +486,7 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p,
LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump(););
// now to replace with an equivalent LEA...
LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump(););
+ MBB.getParent()->substituteDebugValuesForInst(*MBI, *NewMI, 1);
MBB.erase(MBI);
MachineBasicBlock::iterator J =
static_cast<MachineBasicBlock::iterator>(NewMI);
@@ -505,7 +508,8 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg);
if (Segment.getReg() != 0 || !Offset.isImm() ||
- !TII->isSafeToClobberEFLAGS(MBB, I))
+ MBB.computeRegisterLiveness(TRI, X86::EFLAGS, I, 4) !=
+ MachineBasicBlock::LQR_Dead)
return;
const Register DstR = Dst.getReg();
const Register SrcR1 = Base.getReg();
@@ -536,6 +540,7 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
LLVM_DEBUG(NewMI->dump(););
}
if (NewMI) {
+ MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1);
MBB.erase(I);
I = NewMI;
}
@@ -555,7 +560,8 @@ void FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I,
const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg);
if (!(TII->isThreeOperandsLEA(MI) || hasInefficientLEABaseReg(Base, Index)) ||
- !TII->isSafeToClobberEFLAGS(MBB, MI) ||
+ MBB.computeRegisterLiveness(TRI, X86::EFLAGS, I, 4) !=
+ MachineBasicBlock::LQR_Dead ||
Segment.getReg() != X86::NoRegister)
return;
@@ -641,6 +647,7 @@ void FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I,
}
}
+ MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1);
MBB.erase(I);
I = NewMI;
return;
@@ -666,6 +673,7 @@ void FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I,
.add(Index);
LLVM_DEBUG(NewMI->dump(););
+ MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1);
MBB.erase(I);
I = NewMI;
return;
@@ -688,6 +696,7 @@ void FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I,
.add(Base);
LLVM_DEBUG(NewMI->dump(););
+ MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1);
MBB.erase(I);
I = NewMI;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupSetCC.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupSetCC.cpp
index 09668d7c5468..269f8ce6bd7a 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupSetCC.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupSetCC.cpp
@@ -97,28 +97,31 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
if (FlagsDefMI->readsRegister(X86::EFLAGS))
continue;
- ++NumSubstZexts;
- Changed = true;
-
// On 32-bit, we need to be careful to force an ABCD register.
const TargetRegisterClass *RC = MF.getSubtarget<X86Subtarget>().is64Bit()
? &X86::GR32RegClass
: &X86::GR32_ABCDRegClass;
- Register ZeroReg = MRI->createVirtualRegister(RC);
- Register InsertReg = MRI->createVirtualRegister(RC);
+ if (!MRI->constrainRegClass(ZExt->getOperand(0).getReg(), RC)) {
+ // If we cannot constrain the register, we would need an additional copy
+ // and are better off keeping the MOVZX32rr8 we have now.
+ continue;
+ }
+
+ ++NumSubstZexts;
+ Changed = true;
// Initialize a register with 0. This must go before the eflags def
+ Register ZeroReg = MRI->createVirtualRegister(RC);
BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0),
ZeroReg);
// X86 setcc only takes an output GR8, so fake a GR32 input by inserting
// the setcc result into the low byte of the zeroed register.
BuildMI(*ZExt->getParent(), ZExt, ZExt->getDebugLoc(),
- TII->get(X86::INSERT_SUBREG), InsertReg)
+ TII->get(X86::INSERT_SUBREG), ZExt->getOperand(0).getReg())
.addReg(ZeroReg)
.addReg(MI.getOperand(0).getReg())
.addImm(X86::sub_8bit);
- MRI->replaceRegWith(ZExt->getOperand(0).getReg(), InsertReg);
ToErase.push_back(ZExt);
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
index 831695dabcd8..d43fd807a5a7 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -97,7 +97,7 @@ private:
CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
MachineBasicBlock::iterator CopyDefI);
- unsigned promoteCondToReg(MachineBasicBlock &MBB,
+ Register promoteCondToReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator TestPos,
DebugLoc TestLoc, X86::CondCode Cond);
std::pair<unsigned, bool>
@@ -739,8 +739,7 @@ CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) {
X86::CondCode Cond = X86::getCondFromSETCC(MI);
if (Cond != X86::COND_INVALID && !MI.mayStore() &&
- MI.getOperand(0).isReg() &&
- Register::isVirtualRegister(MI.getOperand(0).getReg())) {
+ MI.getOperand(0).isReg() && MI.getOperand(0).getReg().isVirtual()) {
assert(MI.getOperand(0).isDef() &&
"A non-storing SETcc should always define a register!");
CondRegs[Cond] = MI.getOperand(0).getReg();
@@ -754,7 +753,7 @@ CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
return CondRegs;
}
-unsigned X86FlagsCopyLoweringPass::promoteCondToReg(
+Register X86FlagsCopyLoweringPass::promoteCondToReg(
MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
DebugLoc TestLoc, X86::CondCode Cond) {
Register Reg = MRI->createVirtualRegister(PromoteRC);
@@ -982,5 +981,4 @@ void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB,
MIB.setMemRefs(SetCCI.memoperands());
SetCCI.eraseFromParent();
- return;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp
index db6b68659493..866f11364004 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -28,6 +28,7 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetOptions.h"
@@ -148,60 +149,6 @@ static unsigned getLEArOpcode(bool IsLP64) {
return IsLP64 ? X86::LEA64r : X86::LEA32r;
}
-/// findDeadCallerSavedReg - Return a caller-saved register that isn't live
-/// when it reaches the "return" instruction. We can then pop a stack object
-/// to this register without worrying about clobbering it.
-static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- const X86RegisterInfo *TRI,
- bool Is64Bit) {
- const MachineFunction *MF = MBB.getParent();
- if (MF->callsEHReturn())
- return 0;
-
- const TargetRegisterClass &AvailableRegs = *TRI->getGPRsForTailCall(*MF);
-
- if (MBBI == MBB.end())
- return 0;
-
- switch (MBBI->getOpcode()) {
- default: return 0;
- case TargetOpcode::PATCHABLE_RET:
- case X86::RET:
- case X86::RETL:
- case X86::RETQ:
- case X86::RETIL:
- case X86::RETIQ:
- case X86::TCRETURNdi:
- case X86::TCRETURNri:
- case X86::TCRETURNmi:
- case X86::TCRETURNdi64:
- case X86::TCRETURNri64:
- case X86::TCRETURNmi64:
- case X86::EH_RETURN:
- case X86::EH_RETURN64: {
- SmallSet<uint16_t, 8> Uses;
- for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MBBI->getOperand(i);
- if (!MO.isReg() || MO.isDef())
- continue;
- Register Reg = MO.getReg();
- if (!Reg)
- continue;
- for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
- Uses.insert(*AI);
- }
-
- for (auto CS : AvailableRegs)
- if (!Uses.count(CS) && CS != X86::RIP && CS != X86::RSP &&
- CS != X86::ESP)
- return CS;
- }
- }
-
- return 0;
-}
-
static bool isEAXLiveIn(MachineBasicBlock &MBB) {
for (MachineBasicBlock::RegisterMaskPair RegMask : MBB.liveins()) {
unsigned Reg = RegMask.PhysReg;
@@ -288,7 +235,7 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
if (isSub && !isEAXLiveIn(MBB))
Reg = Rax;
else
- Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
+ Reg = TRI->findDeadCallerSavedReg(MBB, MBBI);
unsigned MovRIOpc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
unsigned AddSubRROpc =
@@ -345,7 +292,7 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
// need to find a dead register when using pop.
unsigned Reg = isSub
? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
- : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
+ : TRI->findDeadCallerSavedReg(MBB, MBBI);
if (Reg) {
unsigned Opc = isSub
? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
@@ -490,9 +437,9 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(
}
const MachineModuleInfo &MMI = MF.getMMI();
const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
- const unsigned FramePtr = TRI->getFrameRegister(MF);
- const unsigned MachineFramePtr =
- STI.isTarget64BitILP32() ? unsigned(getX86SubSuperRegister(FramePtr, 64))
+ const Register FramePtr = TRI->getFrameRegister(MF);
+ const Register MachineFramePtr =
+ STI.isTarget64BitILP32() ? Register(getX86SubSuperRegister(FramePtr, 64))
: FramePtr;
unsigned DwarfReg = MRI->getDwarfRegNum(MachineFramePtr, true);
// Offset = space for return address + size of the frame pointer itself.
@@ -1743,7 +1690,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
assert(Personality == EHPersonality::MSVC_CXX);
Register FrameReg;
int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex;
- int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg);
+ int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg).getFixed();
// ESP is the first field, so no extra displacement is needed.
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32mr)), FrameReg,
false, EHRegOffset)
@@ -1764,8 +1711,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
if (IsWin64Prologue && IsFunclet)
Offset = getWin64EHFrameIndexRef(MF, FI, IgnoredFrameReg);
else
- Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg) +
- SEHFrameOffset;
+ Offset =
+ getFrameIndexReference(MF, FI, IgnoredFrameReg).getFixed() +
+ SEHFrameOffset;
HasWinCFI = true;
assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data");
@@ -1837,7 +1785,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
Register UsedReg;
int Offset =
- getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg);
+ getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg)
+ .getFixed();
assert(UsedReg == BasePtr);
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), UsedReg, true, Offset)
.addReg(FramePtr)
@@ -1915,7 +1864,8 @@ X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const {
const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo();
Register SPReg;
int Offset = getFrameIndexReferencePreferSP(MF, Info.PSPSymFrameIdx, SPReg,
- /*IgnoreSPUpdates*/ true);
+ /*IgnoreSPUpdates*/ true)
+ .getFixed();
assert(Offset >= 0 && SPReg == TRI->getStackRegister());
return static_cast<unsigned>(Offset);
}
@@ -1970,7 +1920,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
// standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
const bool Is64BitILP32 = STI.isTarget64BitILP32();
Register FramePtr = TRI->getFrameRegister(MF);
- unsigned MachineFramePtr =
+ Register MachineFramePtr =
Is64BitILP32 ? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr;
bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
@@ -2141,10 +2091,16 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true);
}
}
+
+  // Emit TILERELEASE if the function used AMX tile registers (TMMCFG has uses).
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (!MRI.reg_nodbg_empty(X86::TMMCFG))
+ BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
}
-int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
- Register &FrameReg) const {
+StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
+ int FI,
+ Register &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
bool IsFixed = MFI.isFixedObjectIndex(FI);
@@ -2191,7 +2147,7 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
uint64_t SEHFrameOffset = calculateSetFPREG(NumBytes);
if (FI && FI == X86FI->getFAIndex())
- return -SEHFrameOffset;
+ return StackOffset::getFixed(-SEHFrameOffset);
// FPDelta is the offset from the "traditional" FP location of the old base
// pointer followed by return address and the location required by the
@@ -2207,23 +2163,23 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
assert(HasFP && "VLAs and dynamic stack realign, but no FP?!");
if (FI < 0) {
// Skip the saved EBP.
- return Offset + SlotSize + FPDelta;
+ return StackOffset::getFixed(Offset + SlotSize + FPDelta);
} else {
assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize)));
- return Offset + StackSize;
+ return StackOffset::getFixed(Offset + StackSize);
}
} else if (TRI->needsStackRealignment(MF)) {
if (FI < 0) {
// Skip the saved EBP.
- return Offset + SlotSize + FPDelta;
+ return StackOffset::getFixed(Offset + SlotSize + FPDelta);
} else {
assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize)));
- return Offset + StackSize;
+ return StackOffset::getFixed(Offset + StackSize);
}
// FIXME: Support tail calls
} else {
if (!HasFP)
- return Offset + StackSize;
+ return StackOffset::getFixed(Offset + StackSize);
// Skip the saved EBP.
Offset += SlotSize;
@@ -2234,7 +2190,7 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
Offset -= TailCallReturnAddrDelta;
}
- return Offset + FPDelta;
+ return StackOffset::getFixed(Offset + FPDelta);
}
int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, int FI,
@@ -2245,24 +2201,27 @@ int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, int FI,
const auto it = WinEHXMMSlotInfo.find(FI);
if (it == WinEHXMMSlotInfo.end())
- return getFrameIndexReference(MF, FI, FrameReg);
+ return getFrameIndexReference(MF, FI, FrameReg).getFixed();
FrameReg = TRI->getStackRegister();
return alignDown(MFI.getMaxCallFrameSize(), getStackAlign().value()) +
it->second;
}
-int X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF,
- int FI, Register &FrameReg,
- int Adjustment) const {
+StackOffset
+X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF, int FI,
+ Register &FrameReg,
+ int Adjustment) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
FrameReg = TRI->getStackRegister();
- return MFI.getObjectOffset(FI) - getOffsetOfLocalArea() + Adjustment;
+ return StackOffset::getFixed(MFI.getObjectOffset(FI) -
+ getOffsetOfLocalArea() + Adjustment);
}
-int X86FrameLowering::getFrameIndexReferencePreferSP(
- const MachineFunction &MF, int FI, Register &FrameReg,
- bool IgnoreSPUpdates) const {
+StackOffset
+X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
+ int FI, Register &FrameReg,
+ bool IgnoreSPUpdates) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
// Does not include any dynamic realign.
@@ -2939,8 +2898,8 @@ static unsigned getHiPELiteral(
// non-meta instructions between MBBI and MBB.end().
static bool blockEndIsUnreachable(const MachineBasicBlock &MBB,
MachineBasicBlock::const_iterator MBBI) {
- return std::all_of(
- MBB.succ_begin(), MBB.succ_end(),
+ return llvm::all_of(
+ MBB.successors(),
[](const MachineBasicBlock *Succ) { return Succ->isEHPad(); }) &&
std::all_of(MBBI, MBB.end(), [](const MachineInstr &MI) {
return MI.isMetaInstruction();
@@ -3101,7 +3060,6 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL,
int Offset) const {
-
if (Offset <= 0)
return false;
@@ -3124,14 +3082,13 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
unsigned Regs[2];
unsigned FoundRegs = 0;
- auto &MRI = MBB.getParent()->getRegInfo();
- auto RegMask = Prev->getOperand(1);
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const MachineOperand &RegMask = Prev->getOperand(1);
auto &RegClass =
Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass;
// Try to find up to NumPops free registers.
for (auto Candidate : RegClass) {
-
// Poor man's liveness:
// Since we're immediately after a call, any register that is clobbered
// by the call and not defined by it can be considered dead.
@@ -3312,10 +3269,14 @@ bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
// If we may need to emit frameless compact unwind information, give
// up as this is currently broken: PR25614.
- return (MF.getFunction().hasFnAttribute(Attribute::NoUnwind) || hasFP(MF)) &&
- // The lowering of segmented stack and HiPE only support entry blocks
- // as prologue blocks: PR26107.
- // This limitation may be lifted if we fix:
+ bool CompactUnwind =
+ MF.getMMI().getContext().getObjectFileInfo()->getCompactUnwindSection() !=
+ nullptr;
+ return (MF.getFunction().hasFnAttribute(Attribute::NoUnwind) || hasFP(MF) ||
+ !CompactUnwind) &&
+ // The lowering of segmented stack and HiPE only support entry
+ // blocks as prologue blocks: PR26107. This limitation may be
+ // lifted if we fix:
// - adjustForSegmentedStacks
// - adjustForHiPEPrologue
MF.getFunction().getCallingConv() != CallingConv::HiPE &&
@@ -3350,7 +3311,7 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
}
Register UsedReg;
- int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg);
+ int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg).getFixed();
int EndOffset = -EHRegOffset - EHRegSize;
FuncInfo.EHRegNodeEndOffset = EndOffset;
@@ -3373,7 +3334,8 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
// MOV32rm SavedEBPOffset(%esi), %ebp
assert(X86FI->getHasSEHFramePtrSave());
int Offset =
- getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg);
+ getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg)
+ .getFixed();
assert(UsedReg == BasePtr);
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), FramePtr),
UsedReg, true, Offset)
@@ -3418,7 +3380,7 @@ struct X86FrameSortingObject {
// at the end of our list.
struct X86FrameSortingComparator {
inline bool operator()(const X86FrameSortingObject &A,
- const X86FrameSortingObject &B) {
+ const X86FrameSortingObject &B) const {
uint64_t DensityAScaled, DensityBScaled;
// For consistency in our comparison, all invalid objects are placed
@@ -3554,13 +3516,21 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized(
// emitPrologue if it gets called and emits CFI.
MF.setHasWinCFI(false);
+ // If we are using Windows x64 CFI, ensure that the stack is always 8 byte
+ // aligned. The format doesn't support misaligned stack adjustments.
+ if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
+ MF.getFrameInfo().ensureMaxAlignment(Align(SlotSize));
+
// If this function isn't doing Win64-style C++ EH, we don't need to do
// anything.
- const Function &F = MF.getFunction();
- if (!STI.is64Bit() || !MF.hasEHFunclets() ||
- classifyEHPersonality(F.getPersonalityFn()) != EHPersonality::MSVC_CXX)
- return;
+ if (STI.is64Bit() && MF.hasEHFunclets() &&
+ classifyEHPersonality(MF.getFunction().getPersonalityFn()) ==
+ EHPersonality::MSVC_CXX) {
+ adjustFrameForMsvcCxxEh(MF);
+ }
+}
+void X86FrameLowering::adjustFrameForMsvcCxxEh(MachineFunction &MF) const {
// Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset
// relative to RSP after the prologue. Find the offset of the last fixed
// object, so that we can allocate a slot immediately following it. If there
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h
index bb2e83205e71..26e80811af2e 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h
@@ -14,6 +14,7 @@
#define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/Support/TypeSize.h"
namespace llvm {
@@ -102,16 +103,17 @@ public:
bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
bool needsFrameIndexResolution(const MachineFunction &MF) const override;
- int getFrameIndexReference(const MachineFunction &MF, int FI,
- Register &FrameReg) const override;
+ StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const override;
int getWin64EHFrameIndexRef(const MachineFunction &MF, int FI,
Register &SPReg) const;
- int getFrameIndexReferenceSP(const MachineFunction &MF, int FI,
- Register &SPReg, int Adjustment) const;
- int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
- Register &FrameReg,
- bool IgnoreSPUpdates) const override;
+ StackOffset getFrameIndexReferenceSP(const MachineFunction &MF, int FI,
+ Register &SPReg, int Adjustment) const;
+ StackOffset
+ getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
+ Register &FrameReg,
+ bool IgnoreSPUpdates) const override;
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
@@ -222,12 +224,7 @@ private:
const DebugLoc &DL, uint64_t Offset,
uint64_t Align) const;
- /// Emit a stub to later inline the target stack probe.
- MachineInstr *emitStackProbeInlineStub(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL,
- bool InProlog) const;
+ void adjustFrameForMsvcCxxEh(MachineFunction &MF) const;
/// Aligns the stack pointer by ANDing it with -MaxAlign.
void BuildStackAlignAND(MachineBasicBlock &MBB,
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 4768c5aa543d..1df9a0d1700f 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -17,6 +17,7 @@
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ConstantRange.h"
@@ -44,6 +45,8 @@ static cl::opt<bool> EnablePromoteAnyextLoad(
"x86-promote-anyext-load", cl::init(true),
cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
+extern cl::opt<bool> IndirectBranchTracking;
+
//===----------------------------------------------------------------------===//
// Pattern Matcher Implementation
//===----------------------------------------------------------------------===//
@@ -204,7 +207,8 @@ namespace {
void Select(SDNode *N) override;
bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
- bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
+ bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
+ bool AllowSegmentRegForX32 = false);
bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
bool matchAddress(SDValue N, X86ISelAddressMode &AM);
bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
@@ -499,6 +503,8 @@ namespace {
bool tryShiftAmountMod(SDNode *N);
bool tryShrinkShlLogicImm(SDNode *N);
bool tryVPTERNLOG(SDNode *N);
+ bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentBC,
+ SDValue A, SDValue B, SDValue C, uint8_t Imm);
bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
bool tryMatchBitSelect(SDNode *N);
@@ -521,9 +527,9 @@ namespace {
// type.
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
unsigned Opcode = N->getOpcode();
- if (Opcode == X86ISD::CMPM || Opcode == X86ISD::STRICT_CMPM ||
- Opcode == ISD::SETCC || Opcode == X86ISD::CMPM_SAE ||
- Opcode == X86ISD::VFPCLASS) {
+ if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
+ Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
+ Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
// We can get 256-bit 8 element types here without VLX being enabled. When
// this happens we will use 512-bit operations and the mask will not be
// zero extended.
@@ -795,12 +801,69 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
return false;
}
+static bool isEndbrImm64(uint64_t Imm) {
+  // There may be some other prefix bytes between 0xF3 and 0x0F1EFA,
+  // e.g. 0xF3660F1EFA, 0xF3670F1EFA.
+ if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
+ return false;
+
+  uint8_t OptionalPrefixBytes[] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
+                                   0x65, 0x66, 0x67, 0xf0, 0xf2};
+  int i = 24; // The low 24 bits (0x0F1EFA) have already been matched.
+ while (i < 64) {
+ uint8_t Byte = (Imm >> i) & 0xFF;
+ if (Byte == 0xF3)
+ return true;
+ if (!llvm::is_contained(OptionalPrefixBytes, Byte))
+ return false;
+ i += 8;
+ }
+
+ return false;
+}
+
void X86DAGToDAGISel::PreprocessISelDAG() {
bool MadeChange = false;
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
E = CurDAG->allnodes_end(); I != E; ) {
SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
+    // This is for the CET (Control-flow Enforcement Technology) enhancement.
+    //
+    // ENDBR32 and ENDBR64 have specific opcodes:
+    // ENDBR32: F3 0F 1E FB
+    // ENDBR64: F3 0F 1E FA
+    // We want to make sure attackers cannot find unintended ENDBR32/64
+    // opcode matches in the binary.
+    // Here's an example:
+    // If the compiler had to generate asm for the following code:
+    //   a = 0xF30F1EFA
+    // it could, for example, generate:
+    //   mov dword ptr [a], 0xF30F1EFA
+    // In such a case, the binary would include a gadget that starts with a
+    // fake ENDBR64 opcode. Therefore, we split the generation of such a
+    // constant into multiple operations so that it does not appear in the
+    // binary.
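+    // A rough illustration of the intended result (the exact instructions
+    // depend on later instruction selection): instead of materializing
+    // 0xF30F1EFA directly we emit its complement and invert it again, e.g.
+    //   movl $0x0CF0E105, %eax   # ~0xF30F1EFA
+    //   notl %eax                # %eax == 0xF30F1EFA, no ENDBR byte pattern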
+ if (N->getOpcode() == ISD::Constant) {
+ MVT VT = N->getSimpleValueType(0);
+ int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
+ int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
+ if (Imm == EndbrImm || isEndbrImm64(Imm)) {
+ // Check that the cf-protection-branch is enabled.
+ Metadata *CFProtectionBranch =
+ MF->getMMI().getModule()->getModuleFlag("cf-protection-branch");
+ if (CFProtectionBranch || IndirectBranchTracking) {
+ SDLoc dl(N);
+ SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
+ Complement = CurDAG->getNOT(dl, Complement, VT);
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
+ ++I;
+ MadeChange = true;
+ continue;
+ }
+ }
+ }
+
// If this is a target specific AND node with no flag usages, turn it back
// into ISD::AND to enable test instruction matching.
if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
@@ -1005,6 +1068,8 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
case ISD::STRICT_FFLOOR:
case ISD::FTRUNC:
case ISD::STRICT_FTRUNC:
+ case ISD::FROUNDEVEN:
+ case ISD::STRICT_FROUNDEVEN:
case ISD::FNEARBYINT:
case ISD::STRICT_FNEARBYINT:
case ISD::FRINT:
@@ -1020,6 +1085,8 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
case ISD::FFLOOR: Imm = 0x9; break;
case ISD::STRICT_FTRUNC:
case ISD::FTRUNC: Imm = 0xB; break;
+ case ISD::STRICT_FROUNDEVEN:
+ case ISD::FROUNDEVEN: Imm = 0x8; break;
case ISD::STRICT_FNEARBYINT:
case ISD::FNEARBYINT: Imm = 0xC; break;
case ISD::STRICT_FRINT:
@@ -1032,11 +1099,11 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
{N->getValueType(0), MVT::Other},
{N->getOperand(0), N->getOperand(1),
- CurDAG->getTargetConstant(Imm, dl, MVT::i8)});
+ CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
else
Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
N->getOperand(0),
- CurDAG->getTargetConstant(Imm, dl, MVT::i8));
+ CurDAG->getTargetConstant(Imm, dl, MVT::i32));
--I;
CurDAG->ReplaceAllUsesWith(N, Res.getNode());
++I;
@@ -1547,20 +1614,26 @@ bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
}
-bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
+bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
+ bool AllowSegmentRegForX32) {
SDValue Address = N->getOperand(1);
// load gs:0 -> GS segment register.
// load fs:0 -> FS segment register.
//
- // This optimization is valid because the GNU TLS model defines that
- // gs:0 (or fs:0 on X86-64) contains its own address.
+ // This optimization is generally valid because the GNU TLS model defines that
+ // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
+ // with 32-bit registers, as we get in ILP32 mode, those registers are first
+  // zero-extended to 64 bits and then added to the base address, which gives
+ // unwanted results when the register holds a negative value.
// For more information see http://people.redhat.com/drepper/tls.pdf
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address)) {
if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
!IndirectTlsSegRefs &&
(Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
- Subtarget->isTargetFuchsia()))
+ Subtarget->isTargetFuchsia())) {
+ if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
+ return true;
switch (N->getPointerInfo().getAddrSpace()) {
case X86AS::GS:
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
@@ -1571,6 +1644,8 @@ bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
// Address space X86AS::SS is not handled here, because it is not used to
// address TLS areas.
}
+ }
+ }
return true;
}
@@ -1654,6 +1729,21 @@ bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
if (matchAddressRecursively(N, AM, 0))
return true;
+ // Post-processing: Make a second attempt to fold a load, if we now know
+ // that there will not be any other register. This is only performed for
+ // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
+ // any foldable load the first time.
+ if (Subtarget->isTarget64BitILP32() &&
+ AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
+ SDValue Save_Base_Reg = AM.Base_Reg;
+ if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
+ AM.Base_Reg = SDValue();
+ if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
+ AM.Base_Reg = Save_Base_Reg;
+ }
+ }
+
// Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
// a smaller encoding and avoids a scaled-index.
if (AM.Scale == 2 &&
@@ -2628,12 +2718,12 @@ bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
AM.Disp += GA->getOffset();
AM.SymbolFlags = GA->getTargetFlags();
- MVT VT = N.getSimpleValueType();
- if (VT == MVT::i32) {
+ if (Subtarget->is32Bit()) {
AM.Scale = 1;
AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
}
+ MVT VT = N.getSimpleValueType();
getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
return true;
}
@@ -2723,7 +2813,10 @@ bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
return false;
Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
- return CR && CR->getSignedMin().sge(-1ull << Width) &&
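+  // With no absolute symbol range available, assume the small code model
+  // keeps symbols within the low 2 GiB, so a 32-bit reference still fits.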
+ if (!CR)
+ return Width == 32 && TM.getCodeModel() == CodeModel::Small;
+
+ return CR->getSignedMin().sge(-1ull << Width) &&
CR->getSignedMax().slt(1ull << Width);
}
@@ -3117,7 +3210,7 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
// ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec.
if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
- unsigned NewOpc =
+ unsigned NewOpc =
((Opc == X86ISD::ADD) == IsOne)
? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
: SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
@@ -3373,7 +3466,7 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
// Match the shift amount as: (bitwidth - y). It should go away, too.
if (ShiftAmt.getOpcode() != ISD::SUB)
return false;
- auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
+ auto *V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
if (!V0 || V0->getZExtValue() != Bitwidth)
return false;
NBits = ShiftAmt.getOperand(1);
@@ -3926,6 +4019,129 @@ bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
return true;
}
+bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
+ SDNode *ParentBC, SDValue A, SDValue B,
+ SDValue C, uint8_t Imm) {
+ assert(A.isOperandOf(ParentA));
+ assert(B.isOperandOf(ParentBC));
+ assert(C.isOperandOf(ParentBC));
+
+ auto tryFoldLoadOrBCast =
+ [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp, SDValue &Segment) {
+ if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
+ return true;
+
+ // Not a load, check for broadcast which may be behind a bitcast.
+ if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
+ P = L.getNode();
+ L = L.getOperand(0);
+ }
+
+ if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
+ return false;
+
+ // Only 32 and 64 bit broadcasts are supported.
+ auto *MemIntr = cast<MemIntrinsicSDNode>(L);
+ unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
+ if (Size != 32 && Size != 64)
+ return false;
+
+ return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
+ };
+
+ bool FoldedLoad = false;
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (tryFoldLoadOrBCast(Root, ParentBC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+ FoldedLoad = true;
+ } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
+ Tmp4)) {
+ FoldedLoad = true;
+ std::swap(A, C);
+ // Swap bits 1/4 and 3/6.
+ uint8_t OldImm = Imm;
+ Imm = OldImm & 0xa5;
+ if (OldImm & 0x02) Imm |= 0x10;
+ if (OldImm & 0x10) Imm |= 0x02;
+ if (OldImm & 0x08) Imm |= 0x40;
+ if (OldImm & 0x40) Imm |= 0x08;
+ } else if (tryFoldLoadOrBCast(Root, ParentBC, B, Tmp0, Tmp1, Tmp2, Tmp3,
+ Tmp4)) {
+ FoldedLoad = true;
+ std::swap(B, C);
+ // Swap bits 1/2 and 5/6.
+ uint8_t OldImm = Imm;
+ Imm = OldImm & 0x99;
+ if (OldImm & 0x02) Imm |= 0x04;
+ if (OldImm & 0x04) Imm |= 0x02;
+ if (OldImm & 0x20) Imm |= 0x40;
+ if (OldImm & 0x40) Imm |= 0x20;
+ }
+
+ SDLoc DL(Root);
+
+ SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
+
+ MVT NVT = Root->getSimpleValueType(0);
+
+ MachineSDNode *MNode;
+ if (FoldedLoad) {
+ SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
+
+ unsigned Opc;
+ if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(C);
+ unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
+ assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
+
+ bool UseD = EltSize == 32;
+ if (NVT.is128BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
+ else if (NVT.is256BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
+ else if (NVT.is512BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
+ else
+ llvm_unreachable("Unexpected vector size!");
+ } else {
+ bool UseD = NVT.getVectorElementType() == MVT::i32;
+ if (NVT.is128BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
+ else if (NVT.is256BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
+ else if (NVT.is512BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
+ else
+ llvm_unreachable("Unexpected vector size!");
+ }
+
+ SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
+ MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
+
+ // Update the chain.
+ ReplaceUses(C.getValue(1), SDValue(MNode, 1));
+ // Record the mem-refs
+ CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
+ } else {
+ bool UseD = NVT.getVectorElementType() == MVT::i32;
+ unsigned Opc;
+ if (NVT.is128BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
+ else if (NVT.is256BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
+ else if (NVT.is512BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
+ else
+ llvm_unreachable("Unexpected vector size!");
+
+ MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
+ }
+
+ ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
+ CurDAG->RemoveDeadNode(Root);
+ return true;
+}
+
// Try to match two logic ops to a VPTERNLOG.
// FIXME: Handle inverted inputs?
// FIXME: Handle more complex patterns that use an operand more than once?
@@ -3941,68 +4157,65 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
return false;
- unsigned Opc1 = N->getOpcode();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- auto isLogicOp = [](unsigned Opc) {
- return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
- Opc == X86ISD::ANDNP;
+ auto getFoldableLogicOp = [](SDValue Op) {
+ // Peek through single use bitcast.
+ if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
+ Op = Op.getOperand(0);
+
+ if (!Op.hasOneUse())
+ return SDValue();
+
+ unsigned Opc = Op.getOpcode();
+ if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
+ Opc == X86ISD::ANDNP)
+ return Op;
+
+ return SDValue();
};
- SDValue A, B, C;
- unsigned Opc2;
- if (isLogicOp(N1.getOpcode()) && N1.hasOneUse()) {
- Opc2 = N1.getOpcode();
+ SDValue A, FoldableOp;
+ if ((FoldableOp = getFoldableLogicOp(N1))) {
A = N0;
- B = N1.getOperand(0);
- C = N1.getOperand(1);
- } else if (isLogicOp(N0.getOpcode()) && N0.hasOneUse()) {
- Opc2 = N0.getOpcode();
+ } else if ((FoldableOp = getFoldableLogicOp(N0))) {
A = N1;
- B = N0.getOperand(0);
- C = N0.getOperand(1);
} else
return false;
- uint64_t Imm;
- switch (Opc1) {
+ SDValue B = FoldableOp.getOperand(0);
+ SDValue C = FoldableOp.getOperand(1);
+
+ // We can build the appropriate control immediate by performing the logic
+ // operation we're matching using these constants for A, B, and C.
+ const uint8_t TernlogMagicA = 0xf0;
+ const uint8_t TernlogMagicB = 0xcc;
+ const uint8_t TernlogMagicC = 0xaa;
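+  // For example, matching (or A, (and B, C)) gives
+  //   Imm = (TernlogMagicB & TernlogMagicC) | TernlogMagicA
+  //       = 0x88 | 0xf0 = 0xf8.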
+
+ uint8_t Imm;
+ switch (FoldableOp.getOpcode()) {
default: llvm_unreachable("Unexpected opcode!");
- case ISD::AND:
- switch (Opc2) {
- default: llvm_unreachable("Unexpected opcode!");
- case ISD::AND: Imm = 0x80; break;
- case ISD::OR: Imm = 0xe0; break;
- case ISD::XOR: Imm = 0x60; break;
- case X86ISD::ANDNP: Imm = 0x20; break;
- }
- break;
- case ISD::OR:
- switch (Opc2) {
- default: llvm_unreachable("Unexpected opcode!");
- case ISD::AND: Imm = 0xf8; break;
- case ISD::OR: Imm = 0xfe; break;
- case ISD::XOR: Imm = 0xf6; break;
- case X86ISD::ANDNP: Imm = 0xf2; break;
- }
- break;
- case ISD::XOR:
- switch (Opc2) {
- default: llvm_unreachable("Unexpected opcode!");
- case ISD::AND: Imm = 0x78; break;
- case ISD::OR: Imm = 0x1e; break;
- case ISD::XOR: Imm = 0x96; break;
- case X86ISD::ANDNP: Imm = 0xd2; break;
- }
+ case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
+ case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
+ case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
+ case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
+ }
+
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case X86ISD::ANDNP:
+ if (A == N0)
+ Imm &= ~TernlogMagicA;
+ else
+ Imm = ~(Imm) & TernlogMagicA;
break;
+ case ISD::AND: Imm &= TernlogMagicA; break;
+ case ISD::OR: Imm |= TernlogMagicA; break;
+ case ISD::XOR: Imm ^= TernlogMagicA; break;
}
- SDLoc DL(N);
- SDValue New = CurDAG->getNode(X86ISD::VPTERNLOG, DL, NVT, A, B, C,
- CurDAG->getTargetConstant(Imm, DL, MVT::i8));
- ReplaceNode(N, New.getNode());
- SelectCode(New.getNode());
- return true;
+ return matchVPTERNLOG(N, N, FoldableOp.getNode(), A, B, C, Imm);
}
/// If the high bits of an 'and' operand are known zero, try setting the
@@ -4069,6 +4282,7 @@ bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
// A negative mask allows a smaller encoding. Create a new 'and' node.
SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
+ insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
ReplaceNode(And, NewAnd.getNode());
SelectCode(NewAnd.getNode());
@@ -4102,15 +4316,15 @@ VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
VPTESTM_CASE(v64i8, BZ##SUFFIX) \
VPTESTM_CASE(v32i16, WZ##SUFFIX)
- if (FoldedLoad) {
+ if (FoldedBCast) {
switch (TestVT.SimpleTy) {
- VPTESTM_FULL_CASES(rm)
+ VPTESTM_BROADCAST_CASES(rmb)
}
}
- if (FoldedBCast) {
+ if (FoldedLoad) {
switch (TestVT.SimpleTy) {
- VPTESTM_BROADCAST_CASES(rmb)
+ VPTESTM_FULL_CASES(rm)
}
}
@@ -4169,79 +4383,56 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
}
}
- // Without VLX we need to widen the load.
+ // Without VLX we need to widen the operation.
bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
- // We can only fold loads if the sources are unique.
- bool CanFoldLoads = Src0 != Src1;
+ auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
+ SDValue &Base, SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ // If we need to widen, we can't fold the load.
+ if (!Widen)
+ if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
+ return true;
- // Try to fold loads unless we need to widen.
- bool FoldedLoad = false;
- SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load;
- if (!Widen && CanFoldLoads) {
- Load = Src1;
- FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3,
- Tmp4);
- if (!FoldedLoad) {
- // And is computative.
- Load = Src0;
- FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2,
- Tmp3, Tmp4);
- if (FoldedLoad)
- std::swap(Src0, Src1);
- }
- }
+    // If we didn't fold a load, try to match a broadcast. There is no widening
+    // limitation for this, but only 32- and 64-bit types are supported.
+ if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
+ return false;
- auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
// Look through single use bitcasts.
- if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) {
- Parent = Src.getNode();
- Src = Src.getOperand(0);
+ if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
+ P = L.getNode();
+ L = L.getOperand(0);
}
- if (Src.getOpcode() == X86ISD::VBROADCAST_LOAD && Src.hasOneUse()) {
- auto *MemIntr = cast<MemIntrinsicSDNode>(Src);
- if (MemIntr->getMemoryVT().getSizeInBits() == CmpSVT.getSizeInBits())
- return Src;
- }
+ if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
+ return false;
- return SDValue();
+ auto *MemIntr = cast<MemIntrinsicSDNode>(L);
+ if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
+ return false;
+
+ return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
};
- // If we didn't fold a load, try to match broadcast. No widening limitation
- // for this. But only 32 and 64 bit types are supported.
- bool FoldedBCast = false;
- if (!FoldedLoad && CanFoldLoads &&
- (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) {
- SDNode *ParentNode = N0.getNode();
- if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
- FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
- Tmp1, Tmp2, Tmp3, Tmp4);
- }
+ // We can only fold loads if the sources are unique.
+ bool CanFoldLoads = Src0 != Src1;
- // Try the other operand.
- if (!FoldedBCast) {
- SDNode *ParentNode = N0.getNode();
- if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
- FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
- Tmp1, Tmp2, Tmp3, Tmp4);
- if (FoldedBCast)
- std::swap(Src0, Src1);
- }
+ bool FoldedLoad = false;
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (CanFoldLoads) {
+ FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
+ Tmp3, Tmp4);
+ if (!FoldedLoad) {
+ // And is commutative.
+ FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
+ Tmp2, Tmp3, Tmp4);
+ if (FoldedLoad)
+ std::swap(Src0, Src1);
}
}
- auto getMaskRC = [](MVT MaskVT) {
- switch (MaskVT.SimpleTy) {
- default: llvm_unreachable("Unexpected VT!");
- case MVT::v2i1: return X86::VK2RegClassID;
- case MVT::v4i1: return X86::VK4RegClassID;
- case MVT::v8i1: return X86::VK8RegClassID;
- case MVT::v16i1: return X86::VK16RegClassID;
- case MVT::v32i1: return X86::VK32RegClassID;
- case MVT::v64i1: return X86::VK64RegClassID;
- }
- };
+ bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
bool IsMasked = InMask.getNode() != nullptr;
@@ -4260,13 +4451,12 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
CmpVT), 0);
Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
- assert(!FoldedLoad && "Shouldn't have folded the load");
if (!FoldedBCast)
Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
if (IsMasked) {
// Widen the mask.
- unsigned RegClass = getMaskRC(MaskVT);
+ unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();
SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
dl, MaskVT, InMask, RC), 0);
@@ -4278,23 +4468,23 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
IsMasked);
MachineSDNode *CNode;
- if (FoldedLoad || FoldedBCast) {
+ if (FoldedLoad) {
SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
if (IsMasked) {
SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
- Load.getOperand(0) };
+ Src1.getOperand(0) };
CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
} else {
SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
- Load.getOperand(0) };
+ Src1.getOperand(0) };
CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
}
// Update the chain.
- ReplaceUses(Load.getValue(1), SDValue(CNode, 1));
+ ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));
// Record the mem-refs
- CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Load)->getMemOperand()});
+ CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});
} else {
if (IsMasked)
CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
@@ -4304,7 +4494,7 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
// If we widened, we need to shrink the mask VT.
if (Widen) {
- unsigned RegClass = getMaskRC(ResVT);
+ unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();
SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
dl, ResVT, SDValue(CNode, 0), RC);
@@ -4360,8 +4550,9 @@ bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
ReplaceNode(N, Ternlog.getNode());
- SelectCode(Ternlog.getNode());
- return true;
+
+ return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
+ A, B, C, 0xCA);
}
void X86DAGToDAGISel::Select(SDNode *Node) {
@@ -4377,6 +4568,95 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
switch (Opcode) {
default: break;
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = Node->getConstantOperandVal(1);
+ switch (IntNo) {
+ default: break;
+ case Intrinsic::x86_encodekey128:
+ case Intrinsic::x86_encodekey256: {
+ if (!Subtarget->hasKL())
+ break;
+
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic");
+ case Intrinsic::x86_encodekey128: Opcode = X86::ENCODEKEY128; break;
+ case Intrinsic::x86_encodekey256: Opcode = X86::ENCODEKEY256; break;
+ }
+
+ SDValue Chain = Node->getOperand(0);
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
+ SDValue());
+ if (Opcode == X86::ENCODEKEY256)
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
+ Chain.getValue(1));
+
+ MachineSDNode *Res = CurDAG->getMachineNode(
+ Opcode, dl, Node->getVTList(),
+ {Node->getOperand(2), Chain, Chain.getValue(1)});
+ ReplaceNode(Node, Res);
+ return;
+ }
+ case Intrinsic::x86_tileloadd64_internal: {
+ if (!Subtarget->hasAMXTILE())
+ break;
+ unsigned Opc = X86::PTILELOADDV;
+ // _tile_loadd_internal(row, col, buf, STRIDE)
+ SDValue Base = Node->getOperand(4);
+ SDValue Scale = getI8Imm(1, dl);
+ SDValue Index = Node->getOperand(5);
+ SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = CurDAG->getRegister(0, MVT::i16);
+ SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
+ SDValue Chain = Node->getOperand(0);
+ MachineSDNode *CNode;
+ SDValue Ops[] = {Node->getOperand(2),
+ Node->getOperand(3),
+ Base,
+ Scale,
+ Index,
+ Disp,
+ Segment,
+ CFG,
+ Chain};
+ CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
+ ReplaceNode(Node, CNode);
+ return;
+ }
+ case Intrinsic::x86_tdpbssd_internal: {
+ if (!Subtarget->hasAMXTILE())
+ break;
+ SDValue Chain = Node->getOperand(0);
+ unsigned Opc = X86::PTDPBSSDV;
+ SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
+ SDValue Ops[] = {Node->getOperand(2),
+ Node->getOperand(3),
+ Node->getOperand(4),
+ Node->getOperand(5),
+ Node->getOperand(6),
+ Node->getOperand(7),
+ CFG,
+ Chain};
+ MachineSDNode *CNode =
+ CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
+ ReplaceNode(Node, CNode);
+ return;
+ }
+ case Intrinsic::x86_tilezero_internal: {
+ if (!Subtarget->hasAMXTILE())
+ break;
+ unsigned Opc = X86::PTILEZEROV;
+ SDValue Chain = Node->getOperand(0);
+ SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
+ SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), CFG, Chain};
+ MachineSDNode *CNode =
+ CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
+ ReplaceNode(Node, CNode);
+ return;
+ }
+ }
+ break;
+ }
case ISD::INTRINSIC_VOID: {
unsigned IntNo = Node->getConstantOperandVal(1);
switch (IntNo) {
@@ -4431,6 +4711,31 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break;
}
+ case Intrinsic::x86_tilestored64_internal: {
+ unsigned Opc = X86::PTILESTOREDV;
+ // _tile_stored_internal(row, col, buf, STRIDE, c)
+ SDValue Base = Node->getOperand(4);
+ SDValue Scale = getI8Imm(1, dl);
+ SDValue Index = Node->getOperand(5);
+ SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = CurDAG->getRegister(0, MVT::i16);
+ SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
+ SDValue Chain = Node->getOperand(0);
+ MachineSDNode *CNode;
+ SDValue Ops[] = {Node->getOperand(2),
+ Node->getOperand(3),
+ Base,
+ Scale,
+ Index,
+ Disp,
+ Segment,
+ Node->getOperand(6),
+ CFG,
+ Chain};
+ CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+ ReplaceNode(Node, CNode);
+ return;
+ }
case Intrinsic::x86_tileloadd64:
case Intrinsic::x86_tileloaddt164:
case Intrinsic::x86_tilestored64: {
@@ -4511,6 +4816,19 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
return;
break;
+ case X86ISD::VPTERNLOG: {
+ uint8_t Imm = cast<ConstantSDNode>(Node->getOperand(3))->getZExtValue();
+ if (matchVPTERNLOG(Node, Node, Node, Node->getOperand(0),
+ Node->getOperand(1), Node->getOperand(2), Imm))
+ return;
+ break;
+ }
+
+ case X86ISD::ANDNP:
+ if (tryVPTERNLOG(Node))
+ return;
+ break;
+
case ISD::AND:
if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
// Try to form a masked VPTESTM. Operands can be in either order.
@@ -5609,6 +5927,62 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
CurDAG->RemoveDeadNode(Node);
return;
}
+ case X86ISD::AESENCWIDE128KL:
+ case X86ISD::AESDECWIDE128KL:
+ case X86ISD::AESENCWIDE256KL:
+ case X86ISD::AESDECWIDE256KL: {
+ if (!Subtarget->hasWIDEKL())
+ break;
+
+ unsigned Opcode;
+ switch (Node->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+ case X86ISD::AESENCWIDE128KL:
+ Opcode = X86::AESENCWIDE128KL;
+ break;
+ case X86ISD::AESDECWIDE128KL:
+ Opcode = X86::AESDECWIDE128KL;
+ break;
+ case X86ISD::AESENCWIDE256KL:
+ Opcode = X86::AESENCWIDE256KL;
+ break;
+ case X86ISD::AESDECWIDE256KL:
+ Opcode = X86::AESDECWIDE256KL;
+ break;
+ }
+
+ SDValue Chain = Node->getOperand(0);
+ SDValue Addr = Node->getOperand(1);
+
+ SDValue Base, Scale, Index, Disp, Segment;
+ if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
+ break;
+
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
+ SDValue());
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
+ Chain.getValue(1));
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
+ Chain.getValue(1));
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
+ Chain.getValue(1));
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
+ Chain.getValue(1));
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
+ Chain.getValue(1));
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
+ Chain.getValue(1));
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
+ Chain.getValue(1));
+
+ MachineSDNode *Res = CurDAG->getMachineNode(
+ Opcode, dl, Node->getVTList(),
+ {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
+ CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
+ ReplaceNode(Node, Res);
+ return;
+ }
}
SelectCode(Node);
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp
index 56690c3c555b..0dd20235aa3c 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35,6 +35,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -76,6 +77,14 @@ static cl::opt<int> ExperimentalPrefLoopAlignment(
" of the loop header PC will be 0)."),
cl::Hidden);
+static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
+ "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
+ cl::desc(
+ "Sets the preferable loop alignment for experiments (as log2 bytes) "
+ "for innermost loops only. If specified, this option overrides "
+ "alignment set by x86-experimental-pref-loop-alignment."),
+ cl::Hidden);
+
static cl::opt<bool> MulConstantOptimization(
"mul-constant-optimization", cl::init(true),
cl::desc("Replace 'mul x, Const' with more effective instructions like "
@@ -135,19 +144,24 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addBypassSlowDiv(64, 32);
}
- if (Subtarget.isTargetWindowsMSVC() ||
- Subtarget.isTargetWindowsItanium()) {
- // Setup Windows compiler runtime calls.
- setLibcallName(RTLIB::SDIV_I64, "_alldiv");
- setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
- setLibcallName(RTLIB::SREM_I64, "_allrem");
- setLibcallName(RTLIB::UREM_I64, "_aullrem");
- setLibcallName(RTLIB::MUL_I64, "_allmul");
- setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
- setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
- setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
- setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
- setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
+ // Setup Windows compiler runtime calls.
+ if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
+ static const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ const CallingConv::ID CC;
+ } LibraryCalls[] = {
+ { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
+ { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
+ { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
+ { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
+ { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
+ };
+
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ setLibcallCallingConv(LC.Op, LC.CC);
+ }
}
if (Subtarget.getTargetTriple().isOSMSVCRT()) {
@@ -193,8 +207,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasCMov()) {
setOperationAction(ISD::ABS , MVT::i16 , Custom);
setOperationAction(ISD::ABS , MVT::i32 , Custom);
+ if (Subtarget.is64Bit())
+ setOperationAction(ISD::ABS , MVT::i64 , Custom);
}
- setOperationAction(ISD::ABS , MVT::i64 , Custom);
// Funnel shifts.
for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
@@ -278,6 +293,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
+ if (Subtarget.hasSSE2()) {
+ // Custom lowering for saturating float to int conversions.
+ // We handle promotion to larger result types manually.
+ for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
+ setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
+ setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
+ }
+ if (Subtarget.is64Bit()) {
+ setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
+ setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
+ }
+ }
+
// Handle address space casts between mixed sized pointers.
setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
@@ -384,6 +412,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::f80, MVT::f16, Expand);
setTruncStoreAction(MVT::f128, MVT::f16, Expand);
+ setOperationAction(ISD::PARITY, MVT::i8, Custom);
if (Subtarget.hasPOPCNT()) {
setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
} else {
@@ -394,6 +423,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
else
setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
+
+ setOperationAction(ISD::PARITY, MVT::i16, Custom);
+ setOperationAction(ISD::PARITY, MVT::i32, Custom);
+ if (Subtarget.is64Bit())
+ setOperationAction(ISD::PARITY, MVT::i64, Custom);
}
setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
@@ -487,6 +521,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::TRAP, MVT::Other, Legal);
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
+ setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
@@ -915,9 +950,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
- setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
- setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
@@ -1081,6 +1114,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
+ setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
setOperationAction(ISD::FROUND, RoundedTy, Custom);
}
@@ -1094,6 +1129,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
+ setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
+
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
@@ -1134,6 +1171,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
+ setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
+ }
+
if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
@@ -1175,6 +1216,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::FROUNDEVEN, VT, Legal);
+ setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
setOperationAction(ISD::FROUND, VT, Custom);
@@ -1302,6 +1345,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
+ setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
@@ -1560,6 +1607,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::FROUNDEVEN, VT, Legal);
+ setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
setOperationAction(ISD::FROUND, VT, Custom);
}
@@ -1688,10 +1737,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
if (Subtarget.hasVBMI2()) {
- for (auto VT : { MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
+ for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
+ MVT::v16i16, MVT::v8i32, MVT::v4i64,
+ MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::FSHL, VT, Custom);
setOperationAction(ISD::FSHR, VT, Custom);
}
+
+ setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
+ setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
+ setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
+ setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
}
}// useAVX512Regs
@@ -1858,20 +1914,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
}
- if (Subtarget.hasVBMI2()) {
- // TODO: Make these legal even without VLX?
- for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
- MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
- setOperationAction(ISD::FSHL, VT, Custom);
- setOperationAction(ISD::FSHR, VT, Custom);
- }
- }
-
setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
}
+ if (Subtarget.hasAMXTILE()) {
+ addRegisterClass(MVT::x86amx, &X86::TILERegClass);
+ }
+
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -1901,6 +1952,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ADDCARRY, VT, Custom);
setOperationAction(ISD::SUBCARRY, VT, Custom);
setOperationAction(ISD::SETCCCARRY, VT, Custom);
+ setOperationAction(ISD::SADDO_CARRY, VT, Custom);
+ setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
}
if (!Subtarget.is64Bit()) {
@@ -1923,8 +1976,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UDIV, MVT::i128, Custom);
setOperationAction(ISD::SREM, MVT::i128, Custom);
setOperationAction(ISD::UREM, MVT::i128, Custom);
- setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
- setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
}
// On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
@@ -2456,13 +2507,23 @@ Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
// <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
return SegmentOffset(IRB, 0x10, getAddressSpace());
} else {
+ unsigned AddressSpace = getAddressSpace();
+    // In particular, users may customize the guard base register and offset.
+    unsigned Offset = getTargetMachine().Options.StackProtectorGuardOffset;
+    // If no -stack-protector-guard-offset value was given, use the default:
// %fs:0x28, unless we're using a Kernel code model, in which case
// it's %gs:0x28. gs:0x14 on i386.
- unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
- return SegmentOffset(IRB, Offset, getAddressSpace());
+ if (Offset == (unsigned)-1)
+ Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
+
+ const auto &GuardReg = getTargetMachine().Options.StackProtectorGuardReg;
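+    // These target options are typically populated from frontend flags;
+    // assuming the clang spellings, something like:
+    //   clang -mstack-protector-guard-reg=gs -mstack-protector-guard-offset=40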
+ if (GuardReg == "fs")
+ AddressSpace = X86AS::FS;
+ else if (GuardReg == "gs")
+ AddressSpace = X86AS::GS;
+ return SegmentOffset(IRB, Offset, AddressSpace);
}
}
-
return TargetLowering::getIRStackGuard(IRB);
}
@@ -2484,8 +2545,13 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const {
}
return;
}
+
+ auto GuardMode = getTargetMachine().Options.StackProtectorGuard;
+
// glibc, bionic, and Fuchsia have a special slot for the stack guard.
- if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
+ if ((GuardMode == llvm::StackProtectorGuards::TLS ||
+ GuardMode == llvm::StackProtectorGuards::None)
+ && hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
return;
TargetLowering::insertSSPDeclarations(M);
}
@@ -2531,17 +2597,6 @@ Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
return TargetLowering::getSafeStackPointerLocation(IRB);
}
-bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
- unsigned DestAS) const {
- assert(SrcAS != DestAS && "Expected different address spaces!");
-
- const TargetMachine &TM = getTargetMachine();
- if (TM.getPointerSize(SrcAS) != TM.getPointerSize(DestAS))
- return false;
-
- return SrcAS < 256 && DestAS < 256;
-}
-
//===----------------------------------------------------------------------===//
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//
@@ -3046,8 +3101,9 @@ SDValue X86TargetLowering::LowerCallResult(
// This truncation won't change the value.
DAG.getIntPtrConstant(1, dl));
- if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
+ if (VA.isExtInLoc()) {
if (VA.getValVT().isVector() &&
+ VA.getValVT().getScalarType() == MVT::i1 &&
((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
(VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
// promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
@@ -3115,7 +3171,7 @@ argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
SDValue Chain, ISD::ArgFlagsTy Flags,
SelectionDAG &DAG, const SDLoc &dl) {
- SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
+ SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
return DAG.getMemcpy(
Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
@@ -3364,8 +3420,8 @@ private:
void forwardMustTailParameters(SDValue &Chain);
- bool is64Bit() { return Subtarget.is64Bit(); }
- bool isWin64() { return Subtarget.isCallingConvWin64(CallConv); }
+ bool is64Bit() const { return Subtarget.is64Bit(); }
+ bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
X86MachineFunctionInfo *FuncInfo;
const SDLoc &DL;
@@ -3476,11 +3532,10 @@ void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
SaveXMMOps.push_back(Chain);
SaveXMMOps.push_back(ALVal);
SaveXMMOps.push_back(
- DAG.getIntPtrConstant(FuncInfo->getRegSaveFrameIndex(), DL));
+ DAG.getTargetConstant(FuncInfo->getRegSaveFrameIndex(), DL, MVT::i32));
SaveXMMOps.push_back(
- DAG.getIntPtrConstant(FuncInfo->getVarArgsFPOffset(), DL));
- SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
- LiveXMMRegs.end());
+ DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
+ llvm::append_range(SaveXMMOps, LiveXMMRegs);
MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL,
MVT::Other, SaveXMMOps));
}
@@ -3754,7 +3809,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
// same, so the size of funclets' (mostly empty) frames is dictated by
// how far this slot is from the bottom (since they allocate just enough
// space to accommodate holding this slot at the correct offset).
- int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSS=*/false);
+ int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
EHInfo->PSPSymFrameIdx = PSPSymFI;
}
}
@@ -3861,6 +3916,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CB);
bool HasNoCfCheck =
(CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
+ bool IsIndirectCall = (CI && CI->isIndirectCall());
const Module *M = MF.getMMI().getModule();
Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
@@ -4100,9 +4156,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (Subtarget.isPICStyleGOT()) {
// ELF / PIC requires GOT in the EBX register before function calls via PLT
- // GOT pointer.
+ // GOT pointer (except regcall).
if (!isTailCall) {
- RegsToPass.push_back(std::make_pair(
+ // An indirect call with the RegCall calling convention may use up all
+ // the general registers, so it is not suitable to bind EBX to the GOT
+ // address; just let the register allocator handle it.
+ if (CallConv != CallingConv::X86_RegCall)
+ RegsToPass.push_back(std::make_pair(
Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
getPointerTy(DAG.getDataLayout()))));
} else {
@@ -4269,7 +4329,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Ops.push_back(Callee);
if (isTailCall)
- Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
+ Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
// Add argument registers to the end of the list so that they are known live
// into the call.
@@ -4343,7 +4403,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
return Ret;
}
- if (HasNoCfCheck && IsCFProtectionSupported) {
+ if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
} else {
Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
@@ -4462,7 +4522,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
int FI = INT_MAX;
if (Arg.getOpcode() == ISD::CopyFromReg) {
Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
- if (!Register::isVirtualRegister(VR))
+ if (!VR.isVirtual())
return false;
MachineInstr *Def = MRI->getVRegDef(VR);
if (!Def)
@@ -4514,7 +4574,8 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
return false;
- if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
+ if (VA.getLocVT().getFixedSizeInBits() >
+ Arg.getValueSizeInBits().getFixedSize()) {
// If the argument location is wider than the argument type, check that any
// extension flags match.
if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
@@ -5022,13 +5083,47 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const {
+ Info.flags = MachineMemOperand::MONone;
+ Info.offset = 0;
const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
- if (!IntrData)
+ if (!IntrData) {
+ switch (Intrinsic) {
+ case Intrinsic::x86_aesenc128kl:
+ case Intrinsic::x86_aesdec128kl:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.ptrVal = I.getArgOperand(1);
+ Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
+ Info.align = Align(1);
+ Info.flags |= MachineMemOperand::MOLoad;
+ return true;
+ case Intrinsic::x86_aesenc256kl:
+ case Intrinsic::x86_aesdec256kl:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.ptrVal = I.getArgOperand(1);
+ Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
+ Info.align = Align(1);
+ Info.flags |= MachineMemOperand::MOLoad;
+ return true;
+ case Intrinsic::x86_aesencwide128kl:
+ case Intrinsic::x86_aesdecwide128kl:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
+ Info.align = Align(1);
+ Info.flags |= MachineMemOperand::MOLoad;
+ return true;
+ case Intrinsic::x86_aesencwide256kl:
+ case Intrinsic::x86_aesdecwide256kl:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
+ Info.align = Align(1);
+ Info.flags |= MachineMemOperand::MOLoad;
+ return true;
+ }
return false;
-
- Info.flags = MachineMemOperand::MONone;
- Info.offset = 0;
+ }
switch (IntrData->Type) {
case TRUNCATE_TO_MEM_VI8:
@@ -5098,7 +5193,7 @@ bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
ISD::LoadExtType ExtTy,
EVT NewVT) const {
assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
-
+
// "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
// relocation target a movq or addq instruction: don't let the load shrink.
SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
@@ -5271,6 +5366,7 @@ bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
// width.
if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
return false;
+
return true;
}
@@ -5414,6 +5510,14 @@ static bool isUndefOrEqual(int Val, int CmpVal) {
return ((Val == SM_SentinelUndef) || (Val == CmpVal));
}
+/// Return true if every element in Mask is the undef sentinel value or equal to
+/// the specified value.
+static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
+ return llvm::all_of(Mask, [CmpVal](int M) {
+ return (M == SM_SentinelUndef) || (M == CmpVal);
+ });
+}
+
/// Val is either the undef or zero sentinel value.
static bool isUndefOrZero(int Val) {
return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
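
The new ArrayRef overload can be restated outside of LLVM with the same sentinel convention; the vector-based signature below is an assumption made for this sketch, not the patch's API.

#include <algorithm>
#include <cassert>
#include <vector>

constexpr int SM_SentinelUndef = -1; // same sentinel convention as above

static bool isUndefOrEqual(const std::vector<int> &Mask, int CmpVal) {
  return std::all_of(Mask.begin(), Mask.end(), [CmpVal](int M) {
    return M == SM_SentinelUndef || M == CmpVal;
  });
}

int main() {
  assert(isUndefOrEqual({3, -1, 3, 3}, 3));  // undef elements are ignored
  assert(!isUndefOrEqual({3, 2, 3, 3}, 3));  // a mismatching element fails
}
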
@@ -5820,7 +5924,7 @@ static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl) {
- assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&
+ assert(Vec.getValueSizeInBits().getFixedSize() < VT.getFixedSizeInBits() &&
Vec.getValueType().getScalarType() == VT.getScalarType() &&
"Unsupported vector widening type");
SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
@@ -6184,6 +6288,22 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
return DAG.getBitcast(VT, Vec);
}
+// Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
+static unsigned getOpcode_EXTEND(unsigned Opcode) {
+ switch (Opcode) {
+ case ISD::ANY_EXTEND:
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ return ISD::ANY_EXTEND;
+ case ISD::ZERO_EXTEND:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ return ISD::ZERO_EXTEND;
+ case ISD::SIGN_EXTEND:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ return ISD::SIGN_EXTEND;
+ }
+ llvm_unreachable("Unknown opcode");
+}
+
// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
switch (Opcode) {
@@ -6200,8 +6320,8 @@ static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
llvm_unreachable("Unknown opcode");
}
-static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
- SDValue In, SelectionDAG &DAG) {
+static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue In, SelectionDAG &DAG) {
EVT InVT = In.getValueType();
assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
@@ -6253,8 +6373,10 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG, bool OneUse = false) {
return SDValue();
}
-void llvm::createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
+void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
bool Lo, bool Unary) {
+ assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
+ "Illegal vector type to unpack");
assert(Mask.empty() && "Expected an empty shuffle mask vector");
int NumElts = VT.getVectorNumElements();
int NumEltsInLane = 128 / VT.getScalarSizeInBits();
@@ -6283,7 +6405,7 @@ void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
}
/// Returns a vector_shuffle node for an unpackl operation.
-static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
+static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
SDValue V1, SDValue V2) {
SmallVector<int, 8> Mask;
createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
@@ -6291,7 +6413,7 @@ static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
}
/// Returns a vector_shuffle node for an unpackh operation.
-static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
+static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
SDValue V1, SDValue V2) {
SmallVector<int, 8> Mask;
createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
@@ -6538,15 +6660,30 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
}
// Extract constant bits from a subvector broadcast.
- if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) {
- SmallVector<APInt, 16> SubEltBits;
- if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
- UndefElts, SubEltBits, AllowWholeUndefs,
- AllowPartialUndefs)) {
- UndefElts = APInt::getSplat(NumElts, UndefElts);
- while (EltBits.size() < NumElts)
- EltBits.append(SubEltBits.begin(), SubEltBits.end());
- return true;
+ if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
+ SDValue Ptr = MemIntr->getBasePtr();
+ if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
+ Type *CstTy = Cst->getType();
+ unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
+ if (!CstTy->isVectorTy() || (SizeInBits % CstSizeInBits) != 0)
+ return false;
+ unsigned SubEltSizeInBits = CstTy->getScalarSizeInBits();
+ unsigned NumSubElts = CstSizeInBits / SubEltSizeInBits;
+ unsigned NumSubVecs = SizeInBits / CstSizeInBits;
+ APInt UndefSubElts(NumSubElts, 0);
+ SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
+ APInt(SubEltSizeInBits, 0));
+ for (unsigned i = 0; i != NumSubElts; ++i) {
+ if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
+ UndefSubElts, i))
+ return false;
+ for (unsigned j = 1; j != NumSubVecs; ++j)
+ SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
+ }
+ UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
+ UndefSubElts);
+ return CastBitData(UndefSubElts, SubEltBits);
}
}
@@ -6567,23 +6704,26 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
// Insert constant bits from a base and sub vector sources.
if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
- // TODO - support insert_subvector through bitcasts.
- if (EltSizeInBits != VT.getScalarSizeInBits())
- return false;
+ // If this bitcasts to larger elements we might lose track of undefs, so
+ // to be safe don't allow any.
+ unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
+ bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
- APInt UndefSubElts;
- SmallVector<APInt, 32> EltSubBits;
- if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
+ APInt UndefSrcElts, UndefSubElts;
+ SmallVector<APInt, 32> EltSrcBits, EltSubBits;
+ if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
UndefSubElts, EltSubBits,
- AllowWholeUndefs, AllowPartialUndefs) &&
- getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
- UndefElts, EltBits, AllowWholeUndefs,
- AllowPartialUndefs)) {
+ AllowWholeUndefs && AllowUndefs,
+ AllowPartialUndefs && AllowUndefs) &&
+ getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
+ UndefSrcElts, EltSrcBits,
+ AllowWholeUndefs && AllowUndefs,
+ AllowPartialUndefs && AllowUndefs)) {
unsigned BaseIdx = Op.getConstantOperandVal(2);
- UndefElts.insertBits(UndefSubElts, BaseIdx);
+ UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
- EltBits[BaseIdx + i] = EltSubBits[i];
- return true;
+ EltSrcBits[BaseIdx + i] = EltSubBits[i];
+ return CastBitData(UndefSrcElts, EltSrcBits);
}
}
@@ -6696,7 +6836,7 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode,
return false;
// Insert the extracted elements into the mask.
- for (APInt Elt : EltBits)
+ for (const APInt &Elt : EltBits)
RawMask.push_back(Elt.getZExtValue());
return true;
@@ -7375,44 +7515,10 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
return true;
}
case ISD::OR: {
- // Inspect each operand at the byte level. We can merge these into a
- // blend shuffle mask if for each byte at least one is masked out (zero).
- KnownBits Known0 =
- DAG.computeKnownBits(N.getOperand(0), DemandedElts, Depth + 1);
- KnownBits Known1 =
- DAG.computeKnownBits(N.getOperand(1), DemandedElts, Depth + 1);
- if (Known0.One.isNullValue() && Known1.One.isNullValue()) {
- bool IsByteMask = true;
- APInt ZeroMask = APInt::getNullValue(NumBytesPerElt);
- APInt SelectMask = APInt::getNullValue(NumBytesPerElt);
- for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) {
- unsigned LHS = Known0.Zero.extractBits(8, i * 8).getZExtValue();
- unsigned RHS = Known1.Zero.extractBits(8, i * 8).getZExtValue();
- if (LHS == 255 && RHS == 0)
- SelectMask.setBit(i);
- else if (LHS == 255 && RHS == 255)
- ZeroMask.setBit(i);
- else if (!(LHS == 0 && RHS == 255))
- IsByteMask = false;
- }
- if (IsByteMask) {
- for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) {
- for (unsigned j = 0; j != NumBytesPerElt; ++j) {
- unsigned Ofs = (SelectMask[j] ? NumSizeInBytes : 0);
- int Idx = (ZeroMask[j] ? (int)SM_SentinelZero : (i + j + Ofs));
- Mask.push_back(Idx);
- }
- }
- Ops.push_back(N.getOperand(0));
- Ops.push_back(N.getOperand(1));
- return true;
- }
- }
-
// Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
// is a valid shuffle index.
- SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
- SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
+ SDValue N0 = peekThroughBitcasts(N.getOperand(0));
+ SDValue N1 = peekThroughBitcasts(N.getOperand(1));
if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
return false;
SmallVector<int, 64> SrcMask0, SrcMask1;
@@ -7423,34 +7529,24 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
true))
return false;
- // Shuffle inputs must be the same size as the result.
- if (llvm::any_of(SrcInputs0, [VT](SDValue Op) {
- return VT.getSizeInBits() != Op.getValueSizeInBits();
- }))
- return false;
- if (llvm::any_of(SrcInputs1, [VT](SDValue Op) {
- return VT.getSizeInBits() != Op.getValueSizeInBits();
- }))
- return false;
-
size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
SmallVector<int, 64> Mask0, Mask1;
narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
- for (size_t i = 0; i != MaskSize; ++i) {
+ for (int i = 0; i != (int)MaskSize; ++i) {
if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
Mask.push_back(SM_SentinelUndef);
else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
Mask.push_back(SM_SentinelZero);
else if (Mask1[i] == SM_SentinelZero)
- Mask.push_back(Mask0[i]);
+ Mask.push_back(i);
else if (Mask0[i] == SM_SentinelZero)
- Mask.push_back(Mask1[i] + (int)(MaskSize * SrcInputs0.size()));
+ Mask.push_back(i + MaskSize);
else
return false;
}
- Ops.append(SrcInputs0.begin(), SrcInputs0.end());
- Ops.append(SrcInputs1.begin(), SrcInputs1.end());
+ Ops.push_back(N0);
+ Ops.push_back(N1);
return true;
}
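
To see what blend mask the rewritten OR case now produces, here is a hedged standalone sketch using -1/-2 for the undef/zero sentinels; mergeOrMasks is an illustrative name, not an LLVM function.

#include <cassert>
#include <vector>

// Per element, exactly one side must be known zero; the merged mask then
// picks the other side's lane i (i for N0, i + MaskSize for N1), matching
// the new code above rather than forwarding the source mask values.
static bool mergeOrMasks(const std::vector<int> &Mask0,
                         const std::vector<int> &Mask1,
                         std::vector<int> &Out) {
  int Size = (int)Mask0.size();
  Out.clear();
  for (int i = 0; i < Size; ++i) {
    if (Mask0[i] == -1 && Mask1[i] == -1)
      Out.push_back(-1);
    else if (Mask0[i] == -2 && Mask1[i] == -2)
      Out.push_back(-2);
    else if (Mask1[i] == -2)
      Out.push_back(i);          // N1 is zero here, take N0's element i
    else if (Mask0[i] == -2)
      Out.push_back(i + Size);   // N0 is zero here, take N1's element i
    else
      return false;              // both sides live: not a valid blend
  }
  return true;
}

int main() {
  std::vector<int> Out;
  assert(mergeOrMasks({0, -2, 2, -2}, {-2, 1, -2, 3}, Out));
  assert(Out == std::vector<int>({0, 5, 2, 7}));
}
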
case ISD::INSERT_SUBVECTOR: {
@@ -7482,7 +7578,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
// Subvector shuffle inputs must not be larger than the subvector.
if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
- return SubVT.getSizeInBits() < SubInput.getValueSizeInBits();
+ return SubVT.getFixedSizeInBits() <
+ SubInput.getValueSizeInBits().getFixedSize();
}))
return false;
@@ -7503,8 +7600,11 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
}
Ops.push_back(Src);
Ops.append(SubInputs.begin(), SubInputs.end());
- for (int i = 0; i != (int)NumElts; ++i)
- Mask.push_back(i);
+ if (ISD::isBuildVectorAllZeros(Src.getNode()))
+ Mask.append(NumElts, SM_SentinelZero);
+ else
+ for (int i = 0; i != (int)NumElts; ++i)
+ Mask.push_back(i);
for (int i = 0; i != (int)NumSubElts; ++i) {
int M = SubMask[i];
if (0 <= M) {
@@ -7605,19 +7705,33 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
APInt EltsLHS, EltsRHS;
getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
- // If we know input saturation won't happen we can treat this
- // as a truncation shuffle.
+ // If we know input saturation won't happen (or we don't care about
+ // particular lanes), we can treat this as a truncation shuffle.
+ bool Offset0 = false, Offset1 = false;
if (Opcode == X86ISD::PACKSS) {
- if ((!N0.isUndef() &&
+ if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
- (!N1.isUndef() &&
+ (!(N1.isUndef() || EltsRHS.isNullValue()) &&
DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
return false;
+ // We can't easily fold ASHR into a shuffle, but if it was feeding a
+ // PACKSS then it was likely being used for sign-extension for a
+ // truncation, so just peek through and adjust the mask accordingly.
+ if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
+ N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
+ Offset0 = true;
+ N0 = N0.getOperand(0);
+ }
+ if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
+ N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
+ Offset1 = true;
+ N1 = N1.getOperand(0);
+ }
} else {
APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
- if ((!N0.isUndef() &&
+ if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
!DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
- (!N1.isUndef() &&
+ (!(N1.isUndef() || EltsRHS.isNullValue()) &&
!DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
return false;
}
@@ -7629,6 +7743,13 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
Ops.push_back(N1);
createPackShuffleMask(VT, Mask, IsUnary);
+
+ if (Offset0 || Offset1) {
+ for (int &M : Mask)
+ if ((Offset0 && isInRange(M, 0, NumElts)) ||
+ (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
+ ++M;
+ }
return true;
}
case X86ISD::VTRUNC: {
@@ -7916,7 +8037,7 @@ static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
}
// Use PINSRB/PINSRW/PINSRD to create a build vector.
-static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
+static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -7931,7 +8052,7 @@ static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
bool First = true;
for (unsigned i = 0; i < NumElts; ++i) {
- bool IsNonZero = (NonZeros & (1 << i)) != 0;
+ bool IsNonZero = NonZeroMask[i];
if (!IsNonZero)
continue;
@@ -7958,7 +8079,7 @@ static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
}
/// Custom lower build_vector of v16i8.
-static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
+static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -7967,7 +8088,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
// SSE4.1 - use PINSRB to insert each byte directly.
if (Subtarget.hasSSE41())
- return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
+ return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
Subtarget);
SDLoc dl(Op);
@@ -7975,8 +8096,8 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
// Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
for (unsigned i = 0; i < 16; i += 2) {
- bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
- bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0;
+ bool ThisIsNonZero = NonZeroMask[i];
+ bool NextIsNonZero = NonZeroMask[i + 1];
if (!ThisIsNonZero && !NextIsNonZero)
continue;
@@ -8024,7 +8145,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
}
/// Custom lower build_vector of v8i16.
-static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
+static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -8032,7 +8153,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
return SDValue();
// Use PINSRW to insert each byte directly.
- return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
+ return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
Subtarget);
}
@@ -8352,8 +8473,6 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
// Handle Special Cases - all undef or undef/zero.
if (UndefMask.countPopulation() == NumElems)
return DAG.getUNDEF(VT);
-
- // FIXME: Should we return this as a BUILD_VECTOR instead?
if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
return VT.isInteger() ? DAG.getConstant(0, DL, VT)
: DAG.getConstantFP(0.0, DL, VT);
@@ -8368,7 +8487,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
assert(LDBase && "Did not find base load for merging consecutive loads");
unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
unsigned BaseSizeInBytes = BaseSizeInBits / 8;
- int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;
+ int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
+ int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
// TODO: Support offsetting the base load.
@@ -8430,7 +8550,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
// base pointer. If the vector contains zeros, then attempt to shuffle those
// elements.
if (FirstLoadedElt == 0 &&
- (LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) &&
+ (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
(IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
return SDValue();
@@ -8518,6 +8638,11 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
if (!Subtarget.hasAVX2() && ScalarSize < 32)
continue;
+ // Don't attempt a 1:N subvector broadcast - it should be caught by
+ // combineConcatVectorOps, otherwise it will cause infinite loops.
+ if (RepeatSize > ScalarSize && SubElems == 1)
+ continue;
+
bool Match = true;
SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
for (unsigned i = 0; i != NumElems && Match; ++i) {
@@ -8549,9 +8674,14 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
if (TLI.isTypeLegal(BroadcastVT)) {
if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
- unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST
- : X86ISD::VBROADCAST;
- SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad);
+ SDValue Broadcast = RepeatLoad;
+ if (RepeatSize > ScalarSize) {
+ while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
+ Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
+ } else {
+ Broadcast =
+ DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
+ }
return DAG.getBitcast(VT, Broadcast);
}
}
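
The replacement of the SUBV_BROADCAST node with repeated concatenation can be pictured with a small standalone sketch; repeatToWidth is purely illustrative and works on plain vectors instead of SDValues.

#include <cassert>
#include <vector>

// Double the repeated payload until it fills the destination width, mirroring
// the concatSubVectors loop above.
static std::vector<int> repeatToWidth(std::vector<int> Payload,
                                      size_t DstElts) {
  while (Payload.size() < DstElts) {
    std::vector<int> Doubled = Payload;
    Doubled.insert(Doubled.end(), Payload.begin(), Payload.end());
    Payload = Doubled;
  }
  return Payload;
}

int main() {
  assert(repeatToWidth({1, 2}, 8) ==
         std::vector<int>({1, 2, 1, 2, 1, 2, 1, 2}));
}
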
@@ -8622,43 +8752,6 @@ static bool isFoldableUseOfShuffle(SDNode *N) {
return false;
}
-// Check if the current node of build vector is a zero extended vector.
-// // If so, return the value extended.
-// // For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
-// // NumElt - return the number of zero extended identical values.
-// // EltType - return the type of the value include the zero extend.
-static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
- unsigned &NumElt, MVT &EltType) {
- SDValue ExtValue = Op->getOperand(0);
- unsigned NumElts = Op->getNumOperands();
- unsigned Delta = NumElts;
-
- for (unsigned i = 1; i < NumElts; i++) {
- if (Op->getOperand(i) == ExtValue) {
- Delta = i;
- break;
- }
- if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
- return SDValue();
- }
- if (!isPowerOf2_32(Delta) || Delta == 1)
- return SDValue();
-
- for (unsigned i = Delta; i < NumElts; i++) {
- if (i % Delta == 0) {
- if (Op->getOperand(i) != ExtValue)
- return SDValue();
- } else if (!(isNullConstant(Op->getOperand(i)) ||
- Op->getOperand(i).isUndef()))
- return SDValue();
- }
- unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
- unsigned ExtVTSize = EltSize * Delta;
- EltType = MVT::getIntegerVT(ExtVTSize);
- NumElt = NumElts / Delta;
- return ExtValue;
-}
-
/// Attempt to use the vbroadcast instruction to generate a splat value
/// from a splat BUILD_VECTOR which uses:
/// a. A single scalar load, or a constant.
@@ -8676,13 +8769,21 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
return SDValue();
MVT VT = BVOp->getSimpleValueType(0);
+ unsigned NumElts = VT.getVectorNumElements();
SDLoc dl(BVOp);
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Unsupported vector type for broadcast.");
+ // See if the build vector is a repeating sequence of scalars (inc. splat).
+ SDValue Ld;
BitVector UndefElements;
- SDValue Ld = BVOp->getSplatValue(&UndefElements);
+ SmallVector<SDValue, 16> Sequence;
+ if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
+ assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
+ if (Sequence.size() == 1)
+ Ld = Sequence[0];
+ }
// Attempt to use VBROADCASTM
// From this pattern:
@@ -8690,30 +8791,38 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
// b. t1 = (build_vector t0 t0)
//
// Create (VBROADCASTM v2i1 X)
- if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
- MVT EltType = VT.getScalarType();
- unsigned NumElts = VT.getVectorNumElements();
- SDValue BOperand;
- SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
- if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
- (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
- Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
- if (ZeroExtended)
- BOperand = ZeroExtended.getOperand(0);
- else
- BOperand = Ld.getOperand(0).getOperand(0);
+ if (!Sequence.empty() && Subtarget.hasCDI()) {
+ // If not a splat, are the upper sequence values zeroable?
+ unsigned SeqLen = Sequence.size();
+ bool UpperZeroOrUndef =
+ SeqLen == 1 ||
+ llvm::all_of(makeArrayRef(Sequence).drop_front(), [](SDValue V) {
+ return !V || V.isUndef() || isNullConstant(V);
+ });
+ SDValue Op0 = Sequence[0];
+ if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
+ (Op0.getOpcode() == ISD::ZERO_EXTEND &&
+ Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
+ SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
+ ? Op0.getOperand(0)
+ : Op0.getOperand(0).getOperand(0);
MVT MaskVT = BOperand.getSimpleValueType();
- if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
+ MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
+ if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
(EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
- SDValue Brdcst =
- DAG.getNode(X86ISD::VBROADCASTM, dl,
- MVT::getVectorVT(EltType, NumElts), BOperand);
- return DAG.getBitcast(VT, Brdcst);
+ MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
+ if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
+ unsigned Scale = 512 / VT.getSizeInBits();
+ BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
+ }
+ SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
+ if (BcstVT.getSizeInBits() != VT.getSizeInBits())
+ Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
+ return DAG.getBitcast(VT, Bcst);
}
}
}
- unsigned NumElts = VT.getVectorNumElements();
unsigned NumUndefElts = UndefElements.count();
if (!Ld || (NumElts - NumUndefElts) <= 1) {
APInt SplatValue, Undef;
@@ -8755,18 +8864,19 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
}
if (SplatBitSize > 64) {
// Load the vector of constants and broadcast it.
- MVT CVT = VT.getScalarType();
Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
*Ctx);
SDValue VCP = DAG.getConstantPool(VecC, PVT);
unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
+ MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
- Ld = DAG.getLoad(
- MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- Alignment);
- SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
- return DAG.getBitcast(VT, Brdcst);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {DAG.getEntryNode(), VCP};
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
+ return DAG.getMemIntrinsicNode(
+ X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
+ MachineMemOperand::MOLoad);
}
}
}
@@ -8787,6 +8897,8 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
(Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
bool IsLoad = ISD::isNormalLoad(Ld.getNode());
+ // TODO: Handle broadcasts of non-constant sequences.
+
// Make sure that all of the users of a non-constant load are from the
// BUILD_VECTOR node.
// FIXME: Is the use count needed for non-constant, non-load case?
@@ -10120,45 +10232,69 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
return VectorConstant;
- BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
- if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
- return AddSub;
- if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
- return HorizontalOp;
- if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
- return Broadcast;
- if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
- return BitOp;
-
unsigned EVTBits = EltVT.getSizeInBits();
-
- unsigned NumZero = 0;
- unsigned NumNonZero = 0;
- uint64_t NonZeros = 0;
+ APInt UndefMask = APInt::getNullValue(NumElems);
+ APInt ZeroMask = APInt::getNullValue(NumElems);
+ APInt NonZeroMask = APInt::getNullValue(NumElems);
bool IsAllConstants = true;
SmallSet<SDValue, 8> Values;
unsigned NumConstants = NumElems;
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = Op.getOperand(i);
- if (Elt.isUndef())
+ if (Elt.isUndef()) {
+ UndefMask.setBit(i);
continue;
+ }
Values.insert(Elt);
if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
IsAllConstants = false;
NumConstants--;
}
- if (X86::isZeroNode(Elt))
- NumZero++;
- else {
- assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
- NonZeros |= ((uint64_t)1 << i);
- NumNonZero++;
+ if (X86::isZeroNode(Elt)) {
+ ZeroMask.setBit(i);
+ } else {
+ NonZeroMask.setBit(i);
}
}
- // All undef vector. Return an UNDEF. All zero vectors were handled above.
- if (NumNonZero == 0)
+ // All undef vector. Return an UNDEF. All zero vectors were handled above.
+ if (NonZeroMask == 0) {
+ assert(UndefMask.isAllOnesValue() && "Fully undef mask expected");
return DAG.getUNDEF(VT);
+ }
+
+ BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
+
+ // If the upper elts of a ymm/zmm are undef/zero then we might be better off
+ // lowering to a smaller build vector and padding with undef/zero.
+ if ((VT.is256BitVector() || VT.is512BitVector()) &&
+ !isFoldableUseOfShuffle(BV)) {
+ unsigned UpperElems = NumElems / 2;
+ APInt UndefOrZeroMask = UndefMask | ZeroMask;
+ unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes();
+ if (NumUpperUndefsOrZeros >= UpperElems) {
+ if (VT.is512BitVector() &&
+ NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
+ UpperElems = NumElems - (NumElems / 4);
+ bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems;
+ MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
+ SDValue NewBV =
+ DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
+ return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
+ }
+ }
+
+ if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
+ return AddSub;
+ if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
+ return HorizontalOp;
+ if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
+ return Broadcast;
+ if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
+ return BitOp;
+
+ unsigned NumZero = ZeroMask.countPopulation();
+ unsigned NumNonZero = NonZeroMask.countPopulation();
// If we are inserting one variable into a vector of non-zero constants, try
// to avoid loading each constant element as a scalar. Load the constants as a
@@ -10222,7 +10358,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// Special case for single non-zero, non-undef, element.
if (NumNonZero == 1) {
- unsigned Idx = countTrailingZeros(NonZeros);
+ unsigned Idx = NonZeroMask.countTrailingZeros();
SDValue Item = Op.getOperand(Idx);
// If we have a constant or non-constant insertion into the low element of
@@ -10286,7 +10422,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
// Check if it's possible to issue this instead.
// shuffle (vload ptr)), undef, <1, 1, 1, 1>
- unsigned Idx = countTrailingZeros(NonZeros);
+ unsigned Idx = NonZeroMask.countTrailingZeros();
SDValue Item = Op.getOperand(Idx);
if (Op.getNode()->isOnlyUserOf(Item.getNode()))
return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
@@ -10355,7 +10491,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (EVTBits == 64) {
if (NumNonZero == 1) {
// One half is zero or undef.
- unsigned Idx = countTrailingZeros(NonZeros);
+ unsigned Idx = NonZeroMask.countTrailingZeros();
SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
Op.getOperand(Idx));
return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
@@ -10365,12 +10501,12 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// If element VT is < 32 bits, convert it to inserts into a zero vector.
if (EVTBits == 8 && NumElems == 16)
- if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
+ if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
DAG, Subtarget))
return V;
if (EVTBits == 16 && NumElems == 8)
- if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
+ if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
DAG, Subtarget))
return V;
@@ -10383,7 +10519,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (NumElems == 4 && NumZero > 0) {
SmallVector<SDValue, 8> Ops(NumElems);
for (unsigned i = 0; i < 4; ++i) {
- bool isZero = !(NonZeros & (1ULL << i));
+ bool isZero = !NonZeroMask[i];
if (isZero)
Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
else
@@ -10391,7 +10527,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
}
for (unsigned i = 0; i < 2; ++i) {
- switch ((NonZeros >> (i*2)) & 0x3) {
+ switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
default: llvm_unreachable("Unexpected NonZero count");
case 0:
Ops[i] = Ops[i*2]; // Must be a zero vector.
@@ -10408,8 +10544,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
}
}
- bool Reverse1 = (NonZeros & 0x3) == 2;
- bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
+ bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
+ bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
int MaskVec[] = {
Reverse1 ? 1 : 0,
Reverse1 ? 0 : 1,
@@ -10681,6 +10817,35 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
}
+/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
+/// from multiple lanes - this is different from isLaneCrossingShuffleMask to
+/// better support 'repeated mask + lane permute' style shuffles.
+static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
+ unsigned ScalarSizeInBits,
+ ArrayRef<int> Mask) {
+ assert(LaneSizeInBits && ScalarSizeInBits &&
+ (LaneSizeInBits % ScalarSizeInBits) == 0 &&
+ "Illegal shuffle lane size");
+ int NumElts = Mask.size();
+ int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
+ int NumLanes = NumElts / NumEltsPerLane;
+ if (NumLanes > 1) {
+ for (int i = 0; i != NumLanes; ++i) {
+ int SrcLane = -1;
+ for (int j = 0; j != NumEltsPerLane; ++j) {
+ int M = Mask[(i * NumEltsPerLane) + j];
+ if (M < 0)
+ continue;
+ int Lane = (M % NumElts) / NumEltsPerLane;
+ if (SrcLane >= 0 && SrcLane != Lane)
+ return true;
+ SrcLane = Lane;
+ }
+ }
+ }
+ return false;
+}
+
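
A standalone restatement of the multi-lane check for plain int masks (undef == -1); it mirrors the logic above but is not the LLVM helper itself.

#include <cassert>
#include <vector>

static bool isMultiLaneShuffleMask(int LaneBits, int ScalarBits,
                                   const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  int EltsPerLane = LaneBits / ScalarBits;
  int NumLanes = NumElts / EltsPerLane;
  for (int Lane = 0; Lane < NumLanes; ++Lane) {
    int SrcLane = -1;
    for (int j = 0; j < EltsPerLane; ++j) {
      int M = Mask[Lane * EltsPerLane + j];
      if (M < 0)
        continue;
      int FromLane = (M % NumElts) / EltsPerLane;
      if (SrcLane >= 0 && SrcLane != FromLane)
        return true;                     // this lane mixes two source lanes
      SrcLane = FromLane;
    }
  }
  return false;
}

int main() {
  // v8i32 with 128-bit lanes: <0,1,4,5,...> pulls lane 0 and lane 1 into the
  // first destination lane, so it is a multi-lane mask.
  assert(isMultiLaneShuffleMask(128, 32, {0, 1, 4, 5, 2, 3, 6, 7}));
  // <1,0,3,2,5,4,7,6> keeps every lane sourced from a single lane.
  assert(!isMultiLaneShuffleMask(128, 32, {1, 0, 3, 2, 5, 4, 7, 6}));
}
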
/// Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
@@ -10742,10 +10907,11 @@ is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
/// Test whether a target shuffle mask is equivalent within each sub-lane.
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
-static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
+static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
+ unsigned EltSizeInBits,
ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
- int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
+ int LaneSize = LaneSizeInBits / EltSizeInBits;
RepeatedMask.assign(LaneSize, SM_SentinelUndef);
int Size = Mask.size();
for (int i = 0; i < Size; ++i) {
@@ -10776,6 +10942,67 @@ static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
return true;
}
+/// Test whether a target shuffle mask is equivalent within each sub-lane.
+/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
+static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
+ ArrayRef<int> Mask,
+ SmallVectorImpl<int> &RepeatedMask) {
+ return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
+ Mask, RepeatedMask);
+}
+
+/// Checks whether the vector elements referenced by two shuffle masks are
+/// equivalent.
+static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
+ int Idx, int ExpectedIdx) {
+ assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
+ ExpectedIdx < MaskSize && "Out of range element index");
+ if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
+ return false;
+
+ switch (Op.getOpcode()) {
+ case ISD::BUILD_VECTOR:
+ // If the values are build vectors, we can look through them to find
+ // equivalent inputs that make the shuffles equivalent.
+ // TODO: Handle MaskSize != Op.getNumOperands()?
+ if (MaskSize == (int)Op.getNumOperands() &&
+ MaskSize == (int)ExpectedOp.getNumOperands())
+ return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
+ break;
+ case X86ISD::VBROADCAST:
+ case X86ISD::VBROADCAST_LOAD:
+ // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
+ return (Op == ExpectedOp &&
+ (int)Op.getValueType().getVectorNumElements() == MaskSize);
+ case X86ISD::HADD:
+ case X86ISD::HSUB:
+ case X86ISD::FHADD:
+ case X86ISD::FHSUB:
+ case X86ISD::PACKSS:
+ case X86ISD::PACKUS:
+ // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
+ // TODO: Handle MaskSize != NumElts?
+ // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
+ if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
+ MVT VT = Op.getSimpleValueType();
+ int NumElts = VT.getVectorNumElements();
+ if (MaskSize == NumElts) {
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumEltsPerLane = NumElts / NumLanes;
+ int NumHalfEltsPerLane = NumEltsPerLane / 2;
+ bool SameLane =
+ (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
+ bool SameElt =
+ (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
+ return SameLane && SameElt;
+ }
+ }
+ break;
+ }
+
+ return false;
+}
+
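
The HOP(X,X) case above relies on horizontal ops repeating each half-lane; a small standalone sketch of just that rule (hopSelfEquivalent is an illustrative name, not an LLVM helper).

#include <cassert>

// A horizontal op with both operands equal repeats each half-lane, so two
// result indices refer to the same value iff they are in the same 128-bit
// lane and agree modulo the half-lane width.
static bool hopSelfEquivalent(int NumElts, int NumLanes, int Idx,
                              int ExpectedIdx) {
  int EltsPerLane = NumElts / NumLanes;
  int HalfLane = EltsPerLane / 2;
  bool SameLane = (Idx / EltsPerLane) == (ExpectedIdx / EltsPerLane);
  bool SameElt = (Idx % HalfLane) == (ExpectedIdx % HalfLane);
  return SameLane && SameElt;
}

int main() {
  // v8i32 HADD(X,X): in lane 0, element 2 repeats element 0.
  assert(hopSelfEquivalent(8, 2, 0, 2));
  // Elements in different 128-bit lanes are never equivalent.
  assert(!hopSelfEquivalent(8, 2, 0, 4));
}
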
/// Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
@@ -10786,30 +11013,26 @@ static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
-static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
- ArrayRef<int> ExpectedMask) {
- if (Mask.size() != ExpectedMask.size())
- return false;
-
+static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
+ SDValue V1 = SDValue(),
+ SDValue V2 = SDValue()) {
int Size = Mask.size();
-
- // If the values are build vectors, we can look through them to find
- // equivalent inputs that make the shuffles equivalent.
- auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
- auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
+ if (Size != (int)ExpectedMask.size())
+ return false;
for (int i = 0; i < Size; ++i) {
assert(Mask[i] >= -1 && "Out of bound mask element!");
- if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
- auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
- auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
- if (!MaskBV || !ExpectedBV ||
- MaskBV->getOperand(Mask[i] % Size) !=
- ExpectedBV->getOperand(ExpectedMask[i] % Size))
+ int MaskIdx = Mask[i];
+ int ExpectedIdx = ExpectedMask[i];
+ if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
+ SDValue MaskV = MaskIdx < Size ? V1 : V2;
+ SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
+ MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
+ ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
+ if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
return false;
}
}
-
return true;
}
@@ -10822,7 +11045,7 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
///
/// SM_SentinelZero is accepted as a valid negative index but must match in
/// both.
-static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
+static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
ArrayRef<int> ExpectedMask,
SDValue V1 = SDValue(),
SDValue V2 = SDValue()) {
@@ -10836,22 +11059,23 @@ static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
return false;
- // If the values are build vectors, we can look through them to find
- // equivalent inputs that make the shuffles equivalent.
- auto *BV1 = dyn_cast_or_null<BuildVectorSDNode>(V1);
- auto *BV2 = dyn_cast_or_null<BuildVectorSDNode>(V2);
- BV1 = ((BV1 && Size != (int)BV1->getNumOperands()) ? nullptr : BV1);
- BV2 = ((BV2 && Size != (int)BV2->getNumOperands()) ? nullptr : BV2);
+ // Don't use V1/V2 if they're not the same size as the shuffle mask type.
+ if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
+ V1 = SDValue();
+ if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
+ V2 = SDValue();
for (int i = 0; i < Size; ++i) {
- if (Mask[i] == SM_SentinelUndef || Mask[i] == ExpectedMask[i])
+ int MaskIdx = Mask[i];
+ int ExpectedIdx = ExpectedMask[i];
+ if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
continue;
- if (0 <= Mask[i] && 0 <= ExpectedMask[i]) {
- auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
- auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
- if (MaskBV && ExpectedBV &&
- MaskBV->getOperand(Mask[i] % Size) ==
- ExpectedBV->getOperand(ExpectedMask[i] % Size))
+ if (0 <= MaskIdx && 0 <= ExpectedIdx) {
+ SDValue MaskV = MaskIdx < Size ? V1 : V2;
+ SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
+ MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
+ ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
+ if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
continue;
}
// TODO - handle SM_Sentinel equivalences.
@@ -10863,20 +11087,25 @@ static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
// Attempt to create a shuffle mask from a VSELECT condition mask.
static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
SDValue Cond) {
- if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+ EVT CondVT = Cond.getValueType();
+ unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
+ unsigned NumElts = CondVT.getVectorNumElements();
+
+ APInt UndefElts;
+ SmallVector<APInt, 32> EltBits;
+ if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
+ true, false))
return false;
- unsigned Size = Cond.getValueType().getVectorNumElements();
- Mask.resize(Size, SM_SentinelUndef);
+ Mask.resize(NumElts, SM_SentinelUndef);
- for (int i = 0; i != (int)Size; ++i) {
- SDValue CondElt = Cond.getOperand(i);
+ for (int i = 0; i != (int)NumElts; ++i) {
Mask[i] = i;
// Arbitrarily choose from the 2nd operand if the select condition element
// is undef.
// TODO: Can we do better by matching patterns such as even/odd?
- if (CondElt.isUndef() || isNullConstant(CondElt))
- Mask[i] += Size;
+ if (UndefElts[i] || EltBits[i].isNullValue())
+ Mask[i] += NumElts;
}
return true;
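
A standalone sketch of the condition-to-shuffle-mask conversion above: element i keeps operand 0's lane i when the condition bit is set and otherwise takes operand 1's lane i (encoded as i + NumElts). The patch also routes undef condition elements to operand 1; this sketch only models concrete 0/1 bits.

#include <cassert>
#include <vector>

static std::vector<int> maskFromCondition(const std::vector<int> &CondBits) {
  int NumElts = (int)CondBits.size();
  std::vector<int> Mask(NumElts);
  for (int i = 0; i < NumElts; ++i)
    Mask[i] = CondBits[i] ? i : i + NumElts; // false/zero -> second operand
  return Mask;
}

int main() {
  // cond = <1,0,1,0> selects <a0,b1,a2,b3>, i.e. mask <0,5,2,7>.
  assert(maskFromCondition({1, 0, 1, 0}) == std::vector<int>({0, 5, 2, 7}));
}
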
@@ -10894,8 +11123,8 @@ static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
SmallVector<int, 8> Unpckhwd;
createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
/* Unary = */ false);
- bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
- isTargetShuffleEquivalent(Mask, Unpckhwd));
+ bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) ||
+ isTargetShuffleEquivalent(VT, Mask, Unpckhwd));
return IsUnpackwdMask;
}
@@ -10912,8 +11141,8 @@ static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
for (unsigned i = 0; i != 4; ++i) {
SmallVector<int, 16> UnpackMask;
createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
- if (isTargetShuffleEquivalent(Mask, UnpackMask) ||
- isTargetShuffleEquivalent(CommutedMask, UnpackMask))
+ if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) ||
+ isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask))
return true;
}
return false;
@@ -10948,6 +11177,15 @@ static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
+ // If the mask only uses one non-undef element, then fully 'splat' it to
+ // improve later broadcast matching.
+ int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
+ assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
+
+ int FirstElt = Mask[FirstIndex];
+ if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
+ return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
+
unsigned Imm = 0;
Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
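
The 2-bit-per-element immediate encoding and the new "splat the single defined element" behaviour can be checked with a self-contained sketch (undef == -1); v4ShuffleImm is an illustrative name, not the LLVM function.

#include <cassert>
#include <cstdint>

static uint8_t v4ShuffleImm(int M0, int M1, int M2, int M3) {
  int Mask[4] = {M0, M1, M2, M3};
  int FirstElt = -1;
  bool Splat = true;
  for (int M : Mask) {
    if (M < 0)
      continue;
    if (FirstElt < 0)
      FirstElt = M;
    else if (M != FirstElt)
      Splat = false;
  }
  // Only one distinct defined element: fully splat it, as the patch does.
  if (Splat && FirstElt >= 0)
    return (uint8_t)((FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) |
                     FirstElt);
  uint8_t Imm = 0;
  Imm |= (uint8_t)((M0 < 0 ? 0 : M0) << 0); // undef defaults to identity
  Imm |= (uint8_t)((M1 < 0 ? 1 : M1) << 2);
  Imm |= (uint8_t)((M2 < 0 ? 2 : M2) << 4);
  Imm |= (uint8_t)((M3 < 0 ? 3 : M3) << 6);
  return Imm;
}

int main() {
  assert(v4ShuffleImm(-1, 2, -1, -1) == 0xAA); // splatted to <2,2,2,2>
  assert(v4ShuffleImm(3, 2, 1, 0) == 0x1B);    // full reverse
}
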
@@ -11097,7 +11335,8 @@ static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
// Attempt to match the target mask against the unpack lo/hi mask patterns.
SmallVector<int, 64> Unpckl, Unpckh;
createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
- if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
+ if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, V1,
+ (IsUnary ? V1 : V2))) {
UnpackOpcode = X86ISD::UNPCKL;
V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
@@ -11105,7 +11344,8 @@ static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
}
createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
- if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
+ if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, V1,
+ (IsUnary ? V1 : V2))) {
UnpackOpcode = X86ISD::UNPCKH;
V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
@@ -11143,14 +11383,14 @@ static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
// If a binary shuffle, commute and try again.
if (!IsUnary) {
ShuffleVectorSDNode::commuteMask(Unpckl);
- if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
+ if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) {
UnpackOpcode = X86ISD::UNPCKL;
std::swap(V1, V2);
return true;
}
ShuffleVectorSDNode::commuteMask(Unpckh);
- if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
+ if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) {
UnpackOpcode = X86ISD::UNPCKH;
std::swap(V1, V2);
return true;
@@ -11167,21 +11407,21 @@ static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
SelectionDAG &DAG) {
SmallVector<int, 8> Unpckl;
createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
- if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
+ if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
SmallVector<int, 8> Unpckh;
createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
- if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
+ if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
// Commute and try again.
ShuffleVectorSDNode::commuteMask(Unpckl);
- if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
+ if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
ShuffleVectorSDNode::commuteMask(Unpckh);
- if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
+ if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
return SDValue();
@@ -11197,9 +11437,9 @@ static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
unsigned UnpackOpcode;
- if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
+ if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
UnpackOpcode = X86ISD::UNPCKL;
- else if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
+ else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
UnpackOpcode = X86ISD::UNPCKH;
else
return SDValue();
@@ -11215,7 +11455,6 @@ static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
// source into the lower elements and zeroing the upper elements.
-// TODO: Merge with matchShuffleAsVPMOV.
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
ArrayRef<int> Mask, const APInt &Zeroable,
const X86Subtarget &Subtarget) {
@@ -11252,22 +11491,51 @@ static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
return false;
}
-static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
- int Delta) {
- int Size = (int)Mask.size();
- int Split = Size / Delta;
- int TruncatedVectorStart = SwappedOps ? Size : 0;
+// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
+// element padding to the final DstVT.
+static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG, bool ZeroUppers) {
+ MVT SrcVT = Src.getSimpleValueType();
+ MVT DstSVT = DstVT.getScalarType();
+ unsigned NumDstElts = DstVT.getVectorNumElements();
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
- // Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,...
- if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta))
- return false;
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
+ return SDValue();
- // The rest of the mask should not refer to the truncated vector's elements.
- if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart,
- TruncatedVectorStart + Size))
- return false;
+ // Perform a direct ISD::TRUNCATE if possible.
+ if (NumSrcElts == NumDstElts)
+ return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
- return true;
+ if (NumSrcElts > NumDstElts) {
+ MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
+ return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
+ }
+
+ if ((NumSrcElts * DstEltSizeInBits) >= 128) {
+ MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
+ return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
+ DstVT.getSizeInBits());
+ }
+
+ // Non-VLX targets must truncate from a 512-bit type, so we need to
+ // widen, truncate and then possibly extract the original subvector.
+ if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
+ SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
+ return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
+ }
+
+ // Fallback to a X86ISD::VTRUNC, padding if necessary.
+ MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
+ SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
+ if (DstVT != TruncVT)
+ Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
+ DstVT.getSizeInBits());
+ return Trunc;
}
// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
@@ -11283,66 +11551,99 @@ static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
// t18: v2i64 = bitcast t51
//
-// Without avx512vl, this is lowered to:
-//
-// vpmovqd %zmm0, %ymm0
-// vpshufb {{.*#+}} xmm0 =
-// xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-//
-// But when avx512vl is available, one can just use a single vpmovdw
-// instruction.
-static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- if (VT != MVT::v16i8 && VT != MVT::v8i16)
+// One can just use a single vpmovdw instruction; without avx512vl we need to
+// use the zmm variant and extract the lower subvector, padding with zeroes.
+// TODO: Merge with lowerShuffleAsVTRUNC.
+static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
+ if (!Subtarget.hasAVX512())
return SDValue();
- if (Mask.size() != VT.getVectorNumElements())
- return SDValue();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ unsigned MaxScale = 64 / EltSizeInBits;
+ for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
+ unsigned NumSrcElts = NumElts / Scale;
+ unsigned UpperElts = NumElts - NumSrcElts;
+ if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
+ !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
+ continue;
- bool SwappedOps = false;
+ SDValue Src = V1;
+ if (!Src.hasOneUse())
+ return SDValue();
- if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
- if (!ISD::isBuildVectorAllZeros(V1.getNode()))
+ Src = peekThroughOneUseBitcasts(Src);
+ if (Src.getOpcode() != ISD::TRUNCATE ||
+ Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
return SDValue();
+ Src = Src.getOperand(0);
- std::swap(V1, V2);
- SwappedOps = true;
+ // VPMOVWB is only available with avx512bw.
+ MVT SrcVT = Src.getSimpleValueType();
+ if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
+ !Subtarget.hasBWI())
+ return SDValue();
+
+ bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
+ return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
}
- // Look for:
- //
- // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
- // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
- //
- // and similar ones.
- if (V1.getOpcode() != ISD::BITCAST)
- return SDValue();
- if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+}
+
+// Attempt to match binary shuffle patterns as a truncate.
+static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unexpected VTRUNC type");
+ if (!Subtarget.hasAVX512())
return SDValue();
- SDValue Src = V1.getOperand(0).getOperand(0);
- MVT SrcVT = Src.getSimpleValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ unsigned MaxScale = 64 / EltSizeInBits;
+ for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
+ // TODO: Support non-BWI VPMOVWB truncations?
+ unsigned SrcEltBits = EltSizeInBits * Scale;
+ if (SrcEltBits < 32 && !Subtarget.hasBWI())
+ continue;
- // The vptrunc** instructions truncating 128 bit and 256 bit vectors
- // are only available with avx512vl.
- if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
- return SDValue();
+ // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>
+ // Bail if the V2 elements are undef.
+ unsigned NumHalfSrcElts = NumElts / Scale;
+ unsigned NumSrcElts = 2 * NumHalfSrcElts;
+ if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
+ isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
+ continue;
- // Down Convert Word to Byte is only available with avx512bw. The case with
- // 256-bit output doesn't contain a shuffle and is therefore not handled here.
- if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
- !Subtarget.hasBWI())
- return SDValue();
+ // The elements beyond the truncation must be undef/zero.
+ unsigned UpperElts = NumElts - NumSrcElts;
+ if (UpperElts > 0 &&
+ !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
+ continue;
+ bool UndefUppers =
+ UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
- // The first half/quarter of the mask should refer to every second/fourth
- // element of the vector truncated and bitcasted.
- if (!matchShuffleAsVPMOV(Mask, SwappedOps, 2) &&
- !matchShuffleAsVPMOV(Mask, SwappedOps, 4))
- return SDValue();
+    // As we're using both sources, we need to concat them together
+    // and truncate from the double-sized src.
+ MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
+ SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
- return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
+ MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
+ MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
+ Src = DAG.getBitcast(SrcVT, Src);
+ return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
+ }
+
+ return SDValue();
}
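// Editor's sketch (not from the upstream patch): a minimal standalone model of
// the mask test in lowerShuffleAsVTRUNC above. Mask uses -1 for undef, and
// Zeroable stands in for the set of elements known to be zero/undef;
// matchAsTruncation is an illustrative name, not an LLVM function.
#include <cstdio>
#include <vector>

// Returns the matched scale (2, 4, ...) or 0 if the mask is not a
// "concat(V1,V2) then truncate" pattern.
int matchAsTruncation(const std::vector<int> &Mask,
                      const std::vector<bool> &Zeroable, int EltBits) {
  const int NumElts = (int)Mask.size();
  for (int Scale = 2; Scale <= 64 / EltBits; Scale *= 2) {
    const int NumHalfSrcElts = NumElts / Scale;
    const int NumSrcElts = 2 * NumHalfSrcElts;

    // Leading elements must pick 0, Scale, 2*Scale, ... from concat(V1,V2).
    bool Sequential = true;
    for (int i = 0; i != NumSrcElts; ++i)
      Sequential &= (Mask[i] < 0 || Mask[i] == i * Scale);

    // If the V2-sourced half is entirely undef, a unary lowering is preferred.
    bool V2HalfUndef = true;
    for (int i = NumHalfSrcElts; i != NumSrcElts; ++i)
      V2HalfUndef &= (Mask[i] < 0);

    // Elements beyond the truncated value must be undef or known zero.
    bool TailZeroable = true;
    for (int i = NumSrcElts; i != NumElts; ++i)
      TailZeroable = TailZeroable && Zeroable[i];

    if (Sequential && !V2HalfUndef && TailZeroable)
      return Scale;
  }
  return 0;
}

int main() {
  // v8i16 mask selecting the even i16 elements of concat(V1,V2).
  std::vector<int> Mask = {0, 2, 4, 6, 8, 10, 12, 14};
  std::vector<bool> Zeroable(8, false);
  std::printf("scale = %d\n", matchAsTruncation(Mask, Zeroable, 16)); // 2
}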
/// Check whether a compaction lowering can be done by dropping even
@@ -11460,14 +11761,14 @@ static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
// Try binary shuffle.
SmallVector<int, 32> BinaryMask;
createPackShuffleMask(VT, BinaryMask, false, NumStages);
- if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2))
+ if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2))
if (MatchPACK(V1, V2, PackVT))
return true;
// Try unary shuffle.
SmallVector<int, 32> UnaryMask;
createPackShuffleMask(VT, UnaryMask, true, NumStages);
- if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1))
+ if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1))
if (MatchPACK(V1, V1, PackVT))
return true;
}
@@ -12016,23 +12317,32 @@ static SDValue lowerShuffleAsByteRotateAndPermute(
/// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
-/// blends.
-static SDValue lowerShuffleAsDecomposedShuffleBlend(
+/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
+static SDValue lowerShuffleAsDecomposedShuffleMerge(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+ int NumElts = Mask.size();
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumEltsPerLane = NumElts / NumLanes;
+
// Shuffle the input elements into the desired positions in V1 and V2 and
- // blend them together.
- SmallVector<int, 32> V1Mask(Mask.size(), -1);
- SmallVector<int, 32> V2Mask(Mask.size(), -1);
- SmallVector<int, 32> BlendMask(Mask.size(), -1);
- for (int i = 0, Size = Mask.size(); i < Size; ++i)
- if (Mask[i] >= 0 && Mask[i] < Size) {
- V1Mask[i] = Mask[i];
- BlendMask[i] = i;
- } else if (Mask[i] >= Size) {
- V2Mask[i] = Mask[i] - Size;
- BlendMask[i] = i + Size;
+ // unpack/blend them together.
+ bool IsAlternating = true;
+ SmallVector<int, 32> V1Mask(NumElts, -1);
+ SmallVector<int, 32> V2Mask(NumElts, -1);
+ SmallVector<int, 32> FinalMask(NumElts, -1);
+ for (int i = 0; i < NumElts; ++i) {
+ int M = Mask[i];
+ if (M >= 0 && M < NumElts) {
+ V1Mask[i] = M;
+ FinalMask[i] = i;
+ IsAlternating &= (i & 1) == 0;
+ } else if (M >= NumElts) {
+ V2Mask[i] = M - NumElts;
+ FinalMask[i] = i + NumElts;
+ IsAlternating &= (i & 1) == 1;
}
+ }
// Try to lower with the simpler initial blend/unpack/rotate strategies unless
// one of the input shuffles would be a no-op. We prefer to shuffle inputs as
@@ -12056,9 +12366,30 @@ static SDValue lowerShuffleAsDecomposedShuffleBlend(
return BlendPerm;
}
+ // If the final mask is an alternating blend of vXi8/vXi16, convert to an
+ // UNPCKL(SHUFFLE, SHUFFLE) pattern.
+ // TODO: It doesn't have to be alternating - but each lane mustn't have more
+ // than half the elements coming from each source.
+ if (IsAlternating && VT.getScalarSizeInBits() < 32) {
+ V1Mask.assign(NumElts, -1);
+ V2Mask.assign(NumElts, -1);
+ FinalMask.assign(NumElts, -1);
+ for (int i = 0; i != NumElts; i += NumEltsPerLane)
+ for (int j = 0; j != NumEltsPerLane; ++j) {
+ int M = Mask[i + j];
+ if (M >= 0 && M < NumElts) {
+ V1Mask[i + (j / 2)] = M;
+ FinalMask[i + j] = i + (j / 2);
+ } else if (M >= NumElts) {
+ V2Mask[i + (j / 2)] = M - NumElts;
+ FinalMask[i + j] = i + (j / 2) + NumElts;
+ }
+ }
+ }
+
V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
- return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
+ return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
}
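// Editor's sketch (not from the upstream patch): a standalone model of how
// lowerShuffleAsDecomposedShuffleMerge splits a two-input mask into two
// single-input masks plus a final blend/unpack mask (-1 = undef).
#include <cstdio>
#include <vector>

struct Decomposed {
  std::vector<int> V1Mask, V2Mask, FinalMask;
  bool IsAlternating;
};

Decomposed decompose(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  Decomposed D{std::vector<int>(NumElts, -1), std::vector<int>(NumElts, -1),
               std::vector<int>(NumElts, -1), true};
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M >= 0 && M < NumElts) {        // element comes from V1
      D.V1Mask[i] = M;
      D.FinalMask[i] = i;
      D.IsAlternating &= (i & 1) == 0;  // V1 elements land in even slots
    } else if (M >= NumElts) {          // element comes from V2
      D.V2Mask[i] = M - NumElts;
      D.FinalMask[i] = i + NumElts;
      D.IsAlternating &= (i & 1) == 1;  // V2 elements land in odd slots
    }
  }
  return D;
}

int main() {
  // v8i16-style mask alternating V1/V2 elements -> eligible for the
  // UNPCKL(SHUFFLE, SHUFFLE) rewrite above.
  Decomposed D = decompose({0, 9, 2, 11, 4, 13, 6, 15});
  std::printf("alternating = %d\n", D.IsAlternating); // 1
}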
/// Try to lower a vector shuffle as a bit rotation.
@@ -12716,8 +13047,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
InputV = ShuffleOffset(InputV);
- InputV = getExtendInVec(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND, DL,
- ExtVT, InputV, DAG);
+ InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
+ DL, ExtVT, InputV, DAG);
return DAG.getBitcast(VT, InputV);
}
@@ -13325,7 +13656,8 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
MVT SVT = VT.getScalarType();
unsigned Offset = BroadcastIdx * SVT.getStoreSize();
assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
- SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
+ SDValue NewAddr =
+ DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
// Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
// than MOVDDUP.
@@ -13498,7 +13830,7 @@ static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
// Attempt to match the insertps pattern.
- unsigned InsertPSMask;
+ unsigned InsertPSMask = 0;
if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
return SDValue();
@@ -13686,8 +14018,8 @@ static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use one of the special instruction patterns to handle two common
// blend patterns if a zero-blend above didn't work.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
- isShuffleEquivalent(V1, V2, Mask, {1, 3}))
+ if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
+ isShuffleEquivalent(Mask, {1, 3}, V1, V2))
if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
// We can either use a special instruction to load over the low double or
// to move just the low double.
@@ -13733,9 +14065,10 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// onward this has a single fast instruction with no scary immediates.
// We have to map the mask as it is actually a v4i32 shuffle instruction.
V1 = DAG.getBitcast(MVT::v4i32, V1);
- int WidenedMask[4] = {
- std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
- std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
+ int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
+ Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
+ Mask[1] < 0 ? -1 : (Mask[1] * 2),
+ Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
return DAG.getBitcast(
MVT::v2i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
@@ -13795,7 +14128,7 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
if (IsBlendSupported)
- return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask,
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
Subtarget, DAG);
// We implement this with SHUFPD which is pretty lame because it will likely
@@ -13889,6 +14222,12 @@ static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
NewMask[2] = Mask[2] < 4 ? 1 : 3;
NewMask[3] = Mask[2] < 4 ? 3 : 1;
}
+ } else if (NumV2Elements == 3) {
+    // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
+    // we can get here via other paths (e.g. repeated mask matching) where we
+    // don't want to do another round of lowerVECTOR_SHUFFLE.
+ ShuffleVectorSDNode::commuteMask(NewMask);
+ return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
}
return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
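// Editor's sketch (not from the upstream patch): the SHUFPS/PSHUFD immediate
// packs four 2-bit element selectors little-end first (for SHUFPS the low two
// select from the first operand, the high two from the second). shuffleImm8 is
// a standalone model of getV4X86ShuffleImm8ForMask, with undef mapped to 0.
#include <cstdio>

unsigned shuffleImm8(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i];    // treat undef as 0
    Imm |= (unsigned)(M & 3) << (2 * i);  // selector i in bits [2i+1:2i]
  }
  return Imm;
}

int main() {
  const int Mask[4] = {2, 2, 3, 3};
  std::printf("imm8 = 0x%02X\n", shuffleImm8(Mask)); // 0xFA
}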
@@ -13917,9 +14256,9 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Use even/odd duplicate instructions for masks that match their pattern.
if (Subtarget.hasSSE3()) {
- if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
+ if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
- if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
+ if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
}
@@ -13933,9 +14272,9 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
// in SSE1 because otherwise they are widened to v2f64 and never get here.
if (!Subtarget.hasSSE2()) {
- if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
+ if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
- if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
+ if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
}
@@ -13977,9 +14316,9 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Use low/high mov instructions. These are only valid in SSE1 because
// otherwise they are widened to v2f64 and never get here.
if (!Subtarget.hasSSE2()) {
- if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
+ if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
+ if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
}
@@ -14027,9 +14366,9 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// so prevents folding a load into this instruction or making a copy.
const int UnpackLoMask[] = {0, 0, 1, 1};
const int UnpackHiMask[] = {2, 2, 3, 3};
- if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
+ if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
Mask = UnpackLoMask;
- else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
+ else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
Mask = UnpackHiMask;
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
@@ -14087,7 +14426,7 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
if (IsBlendSupported)
- return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask,
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
Subtarget, DAG);
// Try to lower by permuting the inputs into an unpack instruction.
@@ -14696,6 +15035,11 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return ZExt;
+  // Try to lower using a truncation.
+ if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+
int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
if (NumV2Inputs == 0) {
@@ -14776,6 +15120,11 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget))
return V;
+  // Try to lower using a truncation.
+ if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
Subtarget, DAG))
@@ -14827,22 +15176,49 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
// We can always bit-blend if we have to so the fallback strategy is to
- // decompose into single-input permutes and blends.
- return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
+ // decompose into single-input permutes and blends/unpacks.
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
Mask, Subtarget, DAG);
}
+// Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
+// sub-512-bit shuffles are padded to 512 bits for the shuffle and then
+// the active subvector is extracted.
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
- MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
- MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
+ ArrayRef<int> Mask, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT MaskVT = VT.changeTypeToInteger();
+ SDValue MaskNode;
+ MVT ShuffleVT = VT;
+ if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
+ V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
+ V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
+ ShuffleVT = V1.getSimpleValueType();
- SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
+ // Adjust mask to correct indices for the second input.
+ int NumElts = VT.getVectorNumElements();
+ unsigned Scale = 512 / VT.getSizeInBits();
+ SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end());
+ for (int &M : AdjustedMask)
+ if (NumElts <= M)
+ M += (Scale - 1) * NumElts;
+ MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
+ MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
+ } else {
+ MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
+ }
+
+ SDValue Result;
if (V2.isUndef())
- return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
+ Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
+ else
+ Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
+
+ if (VT != ShuffleVT)
+ Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
- return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
+ return Result;
}
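// Editor's sketch (not from the upstream patch): a standalone model of the
// index adjustment lowerShuffleWithPERMV performs when padding both inputs to
// 512 bits for non-VLX targets. Indices that referred to the original V2 must
// be rebased so they land in the low lanes of the widened V2 operand.
#include <cstdio>
#include <vector>

std::vector<int> adjustForWidening(std::vector<int> Mask, int NumElts,
                                   int Scale /* 512 / VT.getSizeInBits() */) {
  for (int &M : Mask)
    if (M >= NumElts)              // refers to V2
      M += (Scale - 1) * NumElts;  // widened V2 starts at Scale * NumElts
  return Mask;
}

int main() {
  // v16i8 (128-bit) padded to v64i8: Scale = 4, so V2's element 0 (index 16)
  // becomes index 64 in the widened VPERMV3 index space.
  std::vector<int> M = adjustForWidening({0, 16, 1, 17}, 16, 4);
  std::printf("%d %d %d %d\n", M[0], M[1], M[2], M[3]); // 0 64 1 65
}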
/// Generic lowering of v16i8 shuffles.
@@ -14880,6 +15256,15 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return ZExt;
+  // Try to lower using a truncation.
+ if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+
+ if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+
// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
@@ -15062,9 +15447,16 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Unpack;
- // If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
- if (Subtarget.hasVBMI() && Subtarget.hasVLX())
- return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
+ // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
+ if (Subtarget.hasVBMI())
+ return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
+ DAG);
+
+ // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
+ if (Subtarget.hasXOP()) {
+ SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
+ return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
+ }
// Use PALIGNR+Permute if possible - permute might become PSHUFB but the
// PALIGNR will be cheaper than the second PSHUFB+OR.
@@ -15120,9 +15512,9 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Result;
}
- // Handle multi-input cases by blending single-input shuffles.
+ // Handle multi-input cases by blending/unpacking single-input shuffles.
if (NumV2Elements > 0)
- return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask,
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
Subtarget, DAG);
// The fallback path for single-input shuffles widens this into two v8i16
@@ -15302,7 +15694,7 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
}
/// Either split a vector in halves or decompose the shuffles and the
-/// blend.
+/// blend/unpack.
///
/// This is provided as a good fallback for many lowerings of non-single-input
/// shuffles with more than one 128-bit lane. In those cases, we want to select
@@ -15337,8 +15729,8 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
return true;
};
if (DoBothBroadcast())
- return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
- Subtarget, DAG);
+ return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
+ DAG);
// If the inputs all stem from a single 128-bit lane of each input, then we
// split them rather than blending because the split will decompose to
@@ -15354,9 +15746,9 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
- // Otherwise, just fall back to decomposed shuffles and a blend. This requires
- // that the decomposed single-input shuffles don't end up here.
- return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget,
+ // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
+ // requires that the decomposed single-input shuffles don't end up here.
+ return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
DAG);
}
@@ -15404,53 +15796,94 @@ static SDValue lowerShuffleAsLanePermuteAndPermute(
int NumElts = VT.getVectorNumElements();
int NumLanes = VT.getSizeInBits() / 128;
int NumEltsPerLane = NumElts / NumLanes;
+ bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
+
+  /// Attempts to find a sublane permute with the given size
+  /// that gets all elements into their target lanes.
+  ///
+  /// If successful, builds the cross-lane and in-lane shuffles and returns
+  /// the combined result; otherwise returns an empty SDValue.
+ auto getSublanePermute = [&](int NumSublanes) -> SDValue {
+ int NumSublanesPerLane = NumSublanes / NumLanes;
+ int NumEltsPerSublane = NumElts / NumSublanes;
+
+ SmallVector<int, 16> CrossLaneMask;
+ SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
+ // CrossLaneMask but one entry == one sublane.
+ SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
+
+ for (int i = 0; i != NumElts; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue;
- SmallVector<int, 4> SrcLaneMask(NumLanes, SM_SentinelUndef);
- SmallVector<int, 16> PermMask(NumElts, SM_SentinelUndef);
+ int SrcSublane = M / NumEltsPerSublane;
+ int DstLane = i / NumEltsPerLane;
- for (int i = 0; i != NumElts; ++i) {
- int M = Mask[i];
- if (M < 0)
- continue;
+ // We only need to get the elements into the right lane, not sublane.
+ // So search all sublanes that make up the destination lane.
+ bool Found = false;
+ int DstSubStart = DstLane * NumSublanesPerLane;
+ int DstSubEnd = DstSubStart + NumSublanesPerLane;
+ for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
+ if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
+ continue;
- // Ensure that each lane comes from a single source lane.
- int SrcLane = M / NumEltsPerLane;
- int DstLane = i / NumEltsPerLane;
- if (!isUndefOrEqual(SrcLaneMask[DstLane], SrcLane))
- return SDValue();
- SrcLaneMask[DstLane] = SrcLane;
+ Found = true;
+ CrossLaneMaskLarge[DstSublane] = SrcSublane;
+ int DstSublaneOffset = DstSublane * NumEltsPerSublane;
+ InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
+ break;
+ }
+ if (!Found)
+ return SDValue();
+ }
- PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane);
- }
+ // Fill CrossLaneMask using CrossLaneMaskLarge.
+ narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
- // Make sure we set all elements of the lane mask, to avoid undef propagation.
- SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
- for (int DstLane = 0; DstLane != NumLanes; ++DstLane) {
- int SrcLane = SrcLaneMask[DstLane];
- if (0 <= SrcLane)
- for (int j = 0; j != NumEltsPerLane; ++j) {
- LaneMask[(DstLane * NumEltsPerLane) + j] =
- (SrcLane * NumEltsPerLane) + j;
- }
- }
+ if (!CanUseSublanes) {
+ // If we're only shuffling a single lowest lane and the rest are identity
+ // then don't bother.
+ // TODO - isShuffleMaskInputInPlace could be extended to something like
+ // this.
+ int NumIdentityLanes = 0;
+ bool OnlyShuffleLowestLane = true;
+ for (int i = 0; i != NumLanes; ++i) {
+ int LaneOffset = i * NumEltsPerLane;
+ if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
+ i * NumEltsPerLane))
+ NumIdentityLanes++;
+ else if (CrossLaneMask[LaneOffset] != 0)
+ OnlyShuffleLowestLane = false;
+ }
+ if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
+ return SDValue();
+ }
- // If we're only shuffling a single lowest lane and the rest are identity
- // then don't bother.
- // TODO - isShuffleMaskInputInPlace could be extended to something like this.
- int NumIdentityLanes = 0;
- bool OnlyShuffleLowestLane = true;
- for (int i = 0; i != NumLanes; ++i) {
- if (isSequentialOrUndefInRange(PermMask, i * NumEltsPerLane, NumEltsPerLane,
- i * NumEltsPerLane))
- NumIdentityLanes++;
- else if (SrcLaneMask[i] != 0 && SrcLaneMask[i] != NumLanes)
- OnlyShuffleLowestLane = false;
- }
- if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
+ SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
+ return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
+ InLaneMask);
+ };
+
+ // First attempt a solution with full lanes.
+ if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
+ return V;
+
+ // The rest of the solutions use sublanes.
+ if (!CanUseSublanes)
+ return SDValue();
+
+ // Then attempt a solution with 64-bit sublanes (vpermq).
+ if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
+ return V;
+
+ // If that doesn't work and we have fast variable shuffle,
+ // attempt 32-bit sublanes (vpermd).
+ if (!Subtarget.hasFastVariableShuffle())
return SDValue();
- SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneMask);
- return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask);
+ return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
}
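// Editor's sketch (not from the upstream patch): a standalone model of
// getSublanePermute above. It splits the shuffle into a cross-lane step
// (whole sublanes moved, e.g. via vpermq) followed by an in-lane permute, and
// fails if some destination lane would need more distinct source sublanes
// than it has room for. splitSublanePermute is an illustrative name.
#include <cstdio>
#include <vector>

bool splitSublanePermute(const std::vector<int> &Mask, int NumLanes,
                         int NumSublanes, std::vector<int> &CrossLaneMask,
                         std::vector<int> &InLaneMask) {
  int NumElts = (int)Mask.size();
  int NumEltsPerLane = NumElts / NumLanes;
  int NumEltsPerSublane = NumElts / NumSublanes;
  int NumSublanesPerLane = NumSublanes / NumLanes;

  std::vector<int> CrossLaneMaskLarge(NumSublanes, -1); // one entry per sublane
  InLaneMask.assign(NumElts, -1);

  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int SrcSublane = M / NumEltsPerSublane;
    int DstLane = i / NumEltsPerLane;
    bool Found = false;
    // Any sublane of the destination lane may host the source sublane.
    for (int s = DstLane * NumSublanesPerLane, e = s + NumSublanesPerLane;
         s != e; ++s) {
      if (CrossLaneMaskLarge[s] != -1 && CrossLaneMaskLarge[s] != SrcSublane)
        continue;
      CrossLaneMaskLarge[s] = SrcSublane;
      InLaneMask[i] = s * NumEltsPerSublane + M % NumEltsPerSublane;
      Found = true;
      break;
    }
    if (!Found)
      return false;
  }

  // Expand the per-sublane mask to a per-element cross-lane mask.
  CrossLaneMask.assign(NumElts, -1);
  for (int s = 0; s != NumSublanes; ++s)
    if (CrossLaneMaskLarge[s] != -1)
      for (int j = 0; j != NumEltsPerSublane; ++j)
        CrossLaneMask[s * NumEltsPerSublane + j] =
            CrossLaneMaskLarge[s] * NumEltsPerSublane + j;
  return true;
}

int main() {
  // v8i32 mask that mixes both 128-bit source lanes into each destination
  // lane: a whole-lane decomposition fails, 64-bit sublanes still work.
  std::vector<int> Mask = {4, 5, 0, 1, 2, 3, 6, 7}, Cross, InLane;
  bool OK = splitSublanePermute(Mask, /*NumLanes=*/2, /*NumSublanes=*/4,
                                Cross, InLane);
  std::printf("ok=%d\n", OK); // 1
}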
/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
@@ -15563,8 +15996,8 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
if (!IsLowZero && !IsHighZero) {
// Check for patterns which can be matched with a single insert of a 128-bit
// subvector.
- bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
- if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
+ bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
+ if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
// With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
// this will likely become vinsertf128 which can't fold a 256-bit memop.
@@ -16306,7 +16739,7 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Broadcast;
// Use low duplicate instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
+ if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
@@ -16367,7 +16800,7 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have one input in place, then we can permute the other input and
// blend the result.
if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
- return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
@@ -16395,7 +16828,7 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have AVX2 then we always want to lower with a blend because at v4 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
- return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG);
// Otherwise fall back on generic lowering.
@@ -16477,7 +16910,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have one input in place, then we can permute the other input and
// blend the result.
if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
- return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
@@ -16497,7 +16930,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Result;
// Otherwise fall back on generic blend lowering.
- return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG);
}
@@ -16530,9 +16963,9 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
"Repeated masks must be half the mask width!");
// Use even/odd duplicate instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
+ if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
- if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
+ if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
if (V2.isUndef())
@@ -16586,14 +17019,13 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// since after split we get a more efficient code using vpunpcklwd and
// vpunpckhwd instrs than vblend.
if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
- if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
- Subtarget, DAG))
- return V;
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
+ DAG);
// If we have AVX2 then we always want to lower with a blend because at v8 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
- return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask,
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
Subtarget, DAG);
// Otherwise fall back on generic lowering.
@@ -16626,9 +17058,8 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// vpunpcklwd and vpunpckhwd instrs.
if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
!Subtarget.hasAVX512())
- if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask,
- Subtarget, DAG))
- return V;
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
+ DAG);
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
@@ -16713,7 +17144,7 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Result;
// Otherwise fall back on generic blend lowering.
- return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask,
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG);
}
@@ -16755,6 +17186,11 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget))
return V;
+  // Try to lower using a truncation.
+ if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
@@ -16807,9 +17243,9 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return PSHUFB;
- // AVX512BWVL can lower to VPERMW.
- if (Subtarget.hasBWI() && Subtarget.hasVLX())
- return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
+ // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
+ if (Subtarget.hasBWI())
+ return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
@@ -16865,6 +17301,11 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget))
return V;
+  // Try to lower using a truncation.
+ if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
@@ -16907,9 +17348,9 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return PSHUFB;
- // AVX512VBMIVL can lower to VPERMB.
- if (Subtarget.hasVBMI() && Subtarget.hasVLX())
- return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
+ // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
+ if (Subtarget.hasVBMI())
+ return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
@@ -17036,9 +17477,9 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
// Check for patterns which can be matched with a single insert of a 256-bit
// subvector.
- bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 0, 1, 2, 3});
+ bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
if (OnlyUsesV1 ||
- isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 8, 9, 10, 11})) {
+ isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
SDValue SubVec =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
@@ -17123,7 +17564,7 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (V2.isUndef()) {
// Use low duplicate instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
+ if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
@@ -17163,7 +17604,7 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return Blend;
- return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
}
/// Handle lowering of 16-lane 32-bit floating point shuffles.
@@ -17182,9 +17623,9 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
// Use even/odd duplicate instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
+ if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
- if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
+ if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
if (V2.isUndef())
@@ -17222,7 +17663,7 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
V1, V2, DAG, Subtarget))
return V;
- return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
}
/// Handle lowering of 8-lane 64-bit integer shuffles.
@@ -17270,12 +17711,14 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Rotate;
// Try to use PALIGNR.
- if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
- Subtarget, DAG))
- return Rotate;
+ if (Subtarget.hasBWI())
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
+ Subtarget, DAG))
+ return Rotate;
if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
return Unpck;
+
// If we have AVX512F support, we can use VEXPAND.
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
DAG, Subtarget))
@@ -17285,7 +17728,7 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return Blend;
- return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
}
/// Handle lowering of 16-lane 32-bit integer shuffles.
@@ -17362,7 +17805,7 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return Blend;
- return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
}
/// Handle lowering of 32-lane 16-bit integer shuffles.
@@ -17425,7 +17868,7 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return PSHUFB;
- return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
}
/// Handle lowering of 64-lane 8-bit integer shuffles.
@@ -17481,7 +17924,7 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// VBMI can use VPERMV/VPERMV3 byte shuffles.
if (Subtarget.hasVBMI())
- return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
@@ -17935,7 +18378,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
// Modify the new Mask to take all zeros from the all-zero vector.
// Choose indices that are blend-friendly.
bool UsedZeroVector = false;
- assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
+ assert(is_contained(WidenedMask, SM_SentinelZero) &&
"V2's non-undef elements are used?!");
for (int i = 0; i != NewNumElts; ++i)
if (WidenedMask[i] == SM_SentinelZero) {
@@ -17961,9 +18404,6 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
std::swap(V1, V2);
}
- if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
- return V;
-
// For each vector width, delegate to a specialized lowering routine.
if (VT.is128BitVector())
return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
@@ -17991,9 +18431,11 @@ static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
// Only non-legal VSELECTs reach this lowering, convert those into generic
// shuffles and re-use the shuffle lowering path for blends.
- SmallVector<int, 32> Mask;
- if (createShuffleMaskFromVSELECT(Mask, Cond))
- return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
+ if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
+ SmallVector<int, 32> Mask;
+ if (createShuffleMaskFromVSELECT(Mask, Cond))
+ return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
+ }
return SDValue();
}
@@ -18107,7 +18549,9 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec), Idx));
- SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec, Idx);
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
+ DAG.getTargetConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
}
@@ -18262,7 +18706,8 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec), Idx));
- SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec, Idx);
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
+ DAG.getTargetConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
}
@@ -18456,10 +18901,9 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
Opc = X86ISD::PINSRB;
}
- if (N1.getValueType() != MVT::i32)
- N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
- if (N2.getValueType() != MVT::i32)
- N2 = DAG.getIntPtrConstant(IdxVal, dl);
+ assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
+ N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+ N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
return DAG.getNode(Opc, dl, VT, N0, N1, N2);
}
@@ -18707,9 +19151,12 @@ SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
if (GV) {
// Create a target global address if this is a global. If possible, fold the
// offset into the global address reference. Otherwise, ADD it on later.
+ // Suppress the folding if Offset is negative: movl foo-1, %eax is not
+ // allowed because if the address of foo is 0, the ELF R_X86_64_32
+ // relocation will compute to a negative value, which is invalid.
int64_t GlobalOffset = 0;
- if (OpFlags == X86II::MO_NO_FLAG &&
- X86::isOffsetSuitableForCodeModel(Offset, M)) {
+ if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
+ X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
std::swap(GlobalOffset, Offset);
}
Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
@@ -18796,7 +19243,7 @@ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}
-// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT) {
@@ -18804,10 +19251,17 @@ LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
X86::RAX, X86II::MO_TLSGD);
}
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
+static SDValue
+LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const EVT PtrVT) {
+ return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
+ X86::EAX, X86II::MO_TLSGD);
+}
+
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
- SelectionDAG &DAG,
- const EVT PtrVT,
- bool is64Bit) {
+ SelectionDAG &DAG, const EVT PtrVT,
+ bool Is64Bit, bool Is64BitLP64) {
SDLoc dl(GA);
// Get the start address of the TLS block for this module.
@@ -18816,8 +19270,9 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
MFI->incNumLocalDynamicTLSAccesses();
SDValue Base;
- if (is64Bit) {
- Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
+ if (Is64Bit) {
+ unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
+ Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
X86II::MO_TLSLD, /*LocalDynamic=*/true);
} else {
SDValue InFlag;
@@ -18914,12 +19369,15 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
switch (model) {
case TLSModel::GeneralDynamic:
- if (Subtarget.is64Bit())
- return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
+ if (Subtarget.is64Bit()) {
+ if (Subtarget.isTarget64BitLP64())
+ return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
+ return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
+ }
return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
case TLSModel::LocalDynamic:
- return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
- Subtarget.is64Bit());
+ return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
+ Subtarget.isTarget64BitLP64());
case TLSModel::InitialExec:
case TLSModel::LocalExec:
return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
@@ -19019,7 +19477,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
else
IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
- auto &DL = DAG.getDataLayout();
+ const DataLayout &DL = DAG.getDataLayout();
SDValue Scale =
DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
@@ -19112,15 +19570,29 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
if (IsFSHR)
std::swap(Op0, Op1);
+    // With AVX512 but not VLX, we need to widen to get a 512-bit result type.
+ if (!Subtarget.hasVLX() && !VT.is512BitVector()) {
+ Op0 = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
+ Op1 = widenSubVector(Op1, false, Subtarget, DAG, DL, 512);
+ }
+
+ SDValue Funnel;
APInt APIntShiftAmt;
+ MVT ResultVT = Op0.getSimpleValueType();
if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
- return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, Op0,
- Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
+ Funnel =
+ DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, ResultVT, Op0,
+ Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
+ } else {
+ if (!Subtarget.hasVLX() && !VT.is512BitVector())
+ Amt = widenSubVector(Amt, false, Subtarget, DAG, DL, 512);
+ Funnel = DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL,
+ ResultVT, Op0, Op1, Amt);
}
-
- return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
- Op0, Op1, Amt);
+ if (!Subtarget.hasVLX() && !VT.is512BitVector())
+ Funnel = extractSubVector(Funnel, 0, DAG, DL, VT.getSizeInBits());
+ return Funnel;
}
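// Editor's sketch (not from the upstream patch): scalar semantics of the
// funnel shifts being lowered here (VSHLDV/VSHRDV are the per-element vector
// forms), assuming 16-bit elements.
#include <cstdint>
#include <cstdio>

uint16_t fshl16(uint16_t Hi, uint16_t Lo, unsigned Amt) {
  Amt %= 16;                 // shift amount is taken modulo the element width
  if (Amt == 0)
    return Hi;
  return (uint16_t)((Hi << Amt) | (Lo >> (16 - Amt)));
}

uint16_t fshr16(uint16_t Hi, uint16_t Lo, unsigned Amt) {
  Amt %= 16;
  if (Amt == 0)
    return Lo;
  return (uint16_t)((Lo >> Amt) | (Hi << (16 - Amt)));
}

int main() {
  std::printf("0x%04X 0x%04X\n", fshl16(0x1234, 0xABCD, 4),
              fshr16(0x1234, 0xABCD, 4)); // 0x234A 0x4ABC
}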
assert(
(VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
@@ -19472,7 +19944,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
}
if (VT == MVT::f128)
- return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
+ return SDValue();
SDValue ValueToStore = Src;
if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
@@ -19553,6 +20025,10 @@ static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
/// 64-bit unsigned integer to double expansion.
static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
+  // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
+  // when converting 0 while rounding toward negative infinity. The caller will
+  // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
+ assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
  // This algorithm is not obvious. Here is what we're trying to output:
/*
movq %rax, %xmm0
@@ -19566,8 +20042,6 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
#endif
*/
- bool IsStrict = Op->isStrictFPOpcode();
- unsigned OpNo = IsStrict ? 1 : 0;
SDLoc dl(Op);
LLVMContext *Context = DAG.getContext();
@@ -19589,48 +20063,30 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
// Load the 64-bit value into an XMM register.
SDValue XR1 =
- DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(OpNo));
- SDValue CLod0 =
- DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- /* Alignment = */ 16);
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
+ SDValue CLod0 = DAG.getLoad(
+ MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
SDValue Unpck1 =
getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
- SDValue CLod1 =
- DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- /* Alignment = */ 16);
+ SDValue CLod1 = DAG.getLoad(
+ MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
- SDValue Sub;
- SDValue Chain;
// TODO: Are there any fast-math-flags to propagate here?
- if (IsStrict) {
- Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
- {Op.getOperand(0), XR2F, CLod1});
- Chain = Sub.getValue(1);
- } else
- Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;
- if (!IsStrict && Subtarget.hasSSE3() &&
+ if (Subtarget.hasSSE3() &&
shouldUseHorizontalOp(true, DAG, Subtarget)) {
- // FIXME: Do we need a STRICT version of FHADD?
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
- if (IsStrict) {
- Result = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v2f64, MVT::Other},
- {Chain, Shuffle, Sub});
- Chain = Result.getValue(1);
- } else
- Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
+ Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
}
Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
DAG.getIntPtrConstant(0, dl));
- if (IsStrict)
- return DAG.getMergeValues({Result, Chain}, dl);
-
return Result;
}
@@ -19929,7 +20385,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
if (DstVT == MVT::f128)
- return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT));
+ return SDValue();
if (DstVT.isVector())
return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
@@ -19956,26 +20412,30 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
return V;
- if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
+ // The transform for i64->f64 isn't correct for 0 when rounding to negative
+ // infinity. It produces -0.0, so disable under strictfp.
+ if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict)
return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
- if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
+ if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
+ (DstVT == MVT::f32 || DstVT == MVT::f64))
return SDValue();
// Make a 64-bit buffer, and use it to build an FILD.
SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
+ Align SlotAlign(8);
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
if (SrcVT == MVT::i32) {
- SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
- SDValue Store1 =
- DAG.getStore(Chain, dl, Src, StackSlot, MPI, 8 /*Align*/);
+ SDValue OffsetSlot =
+ DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
+ SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
- OffsetSlot, MPI.getWithOffset(4), 4);
+ OffsetSlot, MPI.getWithOffset(4), SlotAlign);
std::pair<SDValue, SDValue> Tmp =
- BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, Align(8), DAG);
+ BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
if (IsStrict)
return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
@@ -19991,17 +20451,15 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
}
SDValue Store =
- DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Align(8));
+ DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
// For i64 source, we need to add the appropriate power of 2 if the input
- // was negative. This is the same as the optimization in
- // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
- // we must be careful to do the computation in x87 extended precision, not
- // in SSE. (The generic code can't know it's OK to do this, or how to.)
+ // was negative. We must be careful to do the computation in x87 extended
+ // precision, not in SSE.
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Store, StackSlot };
SDValue Fild =
DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
- Align(8), MachineMemOperand::MOLoad);
+ SlotAlign, MachineMemOperand::MOLoad);
Chain = Fild.getValue(1);
@@ -20104,8 +20562,8 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
// of a signed i64. Let Thresh be the FP equivalent of
// 0x8000000000000000ULL.
//
- // Adjust = (Value < Thresh) ? 0 : 0x80000000;
- // FltOfs = (Value < Thresh) ? 0 : 0x80000000;
+ // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
+ // FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
// FistSrc = (Value - FltOfs);
// Fist-to-mem64 FistSrc
// Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
@@ -20135,20 +20593,30 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
*DAG.getContext(), TheVT);
SDValue Cmp;
if (IsStrict) {
- Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT,
- Chain, /*IsSignaling*/ true);
+ Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
+ /*IsSignaling*/ true);
Chain = Cmp.getValue(1);
} else {
- Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT);
+ Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
}
- Adjust = DAG.getSelect(DL, MVT::i64, Cmp,
- DAG.getConstant(0, DL, MVT::i64),
- DAG.getConstant(APInt::getSignMask(64),
- DL, MVT::i64));
- SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp,
- DAG.getConstantFP(0.0, DL, TheVT),
- ThreshVal);
+ // Our preferred lowering of
+ //
+ // (Value >= Thresh) ? 0x8000000000000000ULL : 0
+ //
+ // is
+ //
+ // (Value >= Thresh) << 63
+ //
+ // but since we can get here after LegalOperations, DAGCombine might do the
+ // wrong thing if we create a select. So, directly create the preferred
+ // version.
+ SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
+ SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
+ Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
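// Editor's sketch (not from the upstream patch): a quick standalone check that
// the "(Value >= Thresh) << 63" form used above matches the select it
// replaces.
#include <cstdint>
#include <cstdio>

uint64_t adjustSelect(bool Ge) { return Ge ? 0x8000000000000000ULL : 0; }
uint64_t adjustShift(bool Ge) { return (uint64_t)Ge << 63; }

int main() {
  for (int Ge = 0; Ge <= 1; ++Ge)
    std::printf("%d: %llu %llu\n", Ge,
                (unsigned long long)adjustSelect(Ge != 0),
                (unsigned long long)adjustShift(Ge != 0));
}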
+
+ SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
+ DAG.getConstantFP(0.0, DL, TheVT));
if (IsStrict) {
Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
@@ -20607,30 +21075,29 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
+ In = DAG.getBitcast(MVT::v8i32, In);
+
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget.hasInt256()) {
static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
- In = DAG.getBitcast(MVT::v8i32, In);
In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
DAG.getIntPtrConstant(0, DL));
}
- SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+ SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
DAG.getIntPtrConstant(0, DL));
- SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
- DAG.getIntPtrConstant(2, DL));
- OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
- OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
+ SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+ DAG.getIntPtrConstant(4, DL));
static const int ShufMask[] = {0, 2, 4, 6};
return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
}
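// Editor's sketch (not from the upstream patch): why the {0,2,4,6} dword
// shuffle above implements a v4i64 -> v4i32 truncation. It assumes a
// little-endian host, where the low half of each 64-bit lane sits in the even
// 32-bit positions.
#include <cstdint>
#include <cstdio>
#include <cstring>

void trunc_v4i64_to_v4i32(const uint64_t In[4], uint32_t Out[4]) {
  uint32_t Dwords[8];
  std::memcpy(Dwords, In, sizeof(Dwords)); // bitcast v4i64 -> v8i32
  const int ShufMask[4] = {0, 2, 4, 6};
  for (int i = 0; i != 4; ++i)
    Out[i] = Dwords[ShufMask[i]];          // keep the low dword of each qword
}

int main() {
  const uint64_t In[4] = {0x1111111100000001ULL, 0x2222222200000002ULL,
                          0x3333333300000003ULL, 0x4444444400000004ULL};
  uint32_t Out[4];
  trunc_v4i64_to_v4i32(In, Out);
  std::printf("%u %u %u %u\n", Out[0], Out[1], Out[2], Out[3]); // 1 2 3 4
}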
if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
+ In = DAG.getBitcast(MVT::v32i8, In);
+
// On AVX2, v8i32 -> v8i16 becomes PSHUFB.
if (Subtarget.hasInt256()) {
- In = DAG.getBitcast(MVT::v32i8, In);
-
// The PSHUFB mask:
static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
-1, -1, -1, -1, -1, -1, -1, -1,
@@ -20639,21 +21106,17 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
In = DAG.getBitcast(MVT::v4i64, In);
- static const int ShufMask2[] = {0, 2, -1, -1};
- In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
- In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
- DAG.getIntPtrConstant(0, DL));
- return DAG.getBitcast(VT, In);
+ static const int ShufMask2[] = {0, 2, -1, -1};
+ In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16,
+ DAG.getBitcast(MVT::v16i16, In),
+ DAG.getIntPtrConstant(0, DL));
}
- SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+ SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
DAG.getIntPtrConstant(0, DL));
-
- SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
- DAG.getIntPtrConstant(4, DL));
-
- OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
- OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
+ SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
+ DAG.getIntPtrConstant(16, DL));
// The PSHUFB mask:
static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
@@ -20989,6 +21452,155 @@ SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
}
+SDValue
+X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
+ // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
+ // but making use of X86 specifics to produce better instruction sequences.
+ SDNode *Node = Op.getNode();
+ bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
+ unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
+ SDLoc dl(SDValue(Node, 0));
+ SDValue Src = Node->getOperand(0);
+
+ // There are three types involved here: SrcVT is the source floating point
+ // type, DstVT is the type of the result, and TmpVT is the result of the
+ // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
+ // DstVT).
+ EVT SrcVT = Src.getValueType();
+ EVT DstVT = Node->getValueType(0);
+ EVT TmpVT = DstVT;
+
+ // This code is only for floats and doubles. Fall back to generic code for
+ // anything else.
+ if (!isScalarFPTypeInSSEReg(SrcVT))
+ return SDValue();
+
+ unsigned SatWidth = Node->getConstantOperandVal(1);
+ unsigned DstWidth = DstVT.getScalarSizeInBits();
+ unsigned TmpWidth = TmpVT.getScalarSizeInBits();
+ assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
+ "Expected saturation width smaller than result width");
+
+ // Promote result of FP_TO_*INT to at least 32 bits.
+ if (TmpWidth < 32) {
+ TmpVT = MVT::i32;
+ TmpWidth = 32;
+ }
+
+ // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
+ // us to use a native signed conversion instead.
+ if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
+ TmpVT = MVT::i64;
+ TmpWidth = 64;
+ }
+
+ // If the saturation width is smaller than the size of the temporary result,
+ // we can always use signed conversion, which is native.
+ if (SatWidth < TmpWidth)
+ FpToIntOpcode = ISD::FP_TO_SINT;
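+  // (Any value inside the SatWidth-bit saturation range also lies inside the
+  //  signed TmpWidth-bit range, so the signed conversion yields the same
+  //  result for every input that is not saturated away.)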
+
+ // Determine minimum and maximum integer values and their corresponding
+ // floating-point values.
+ APInt MinInt, MaxInt;
+ if (IsSigned) {
+ MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth);
+ MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth);
+ } else {
+ MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth);
+ MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth);
+ }
+
+ APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
+ APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
+
+ APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
+ MinInt, IsSigned, APFloat::rmTowardZero);
+ APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
+ MaxInt, IsSigned, APFloat::rmTowardZero);
+ bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
+ && !(MaxStatus & APFloat::opStatus::opInexact);
+
+ SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
+ SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
+
+ // If the integer bounds are exactly representable as floats, emit a
+ // min+max+fptoi sequence. Otherwise use comparisons and selects.
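+  // (E.g. for an i8 result, -128 and 127 are both exactly representable in
+  //  f32/f64, so the clamp sequence is used; for an i32 result from f32 the
+  //  maximum 2147483647 is not, so the compare/select path is used.)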
+ if (AreExactFloatBounds) {
+ if (DstVT != TmpVT) {
+ // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
+ SDValue MinClamped = DAG.getNode(
+ X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
+ // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
+ SDValue BothClamped = DAG.getNode(
+ X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
+ // Convert clamped value to integer.
+ SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
+
+ // NaN will become INDVAL, with the top bit set and the rest zero.
+ // Truncation will discard the top bit, resulting in zero.
+ return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
+ }
+
+ // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
+ SDValue MinClamped = DAG.getNode(
+ X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
+ // Clamp by MaxFloat from above. NaN cannot occur.
+ SDValue BothClamped = DAG.getNode(
+ X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
+ // Convert clamped value to integer.
+ SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
+
+ if (!IsSigned) {
+ // In the unsigned case we're done, because we mapped NaN to MinFloat,
+ // which is zero.
+ return FpToInt;
+ }
+
+ // Otherwise, select zero if Src is NaN.
+ SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
+ return DAG.getSelectCC(
+ dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
+ }
+
+ SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
+ SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
+
+ // Result of direct conversion, which may be selected away.
+ SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
+
+ if (DstVT != TmpVT) {
+ // NaN will become INDVAL, with the top bit set and the rest zero.
+ // Truncation will discard the top bit, resulting in zero.
+ FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
+ }
+
+ SDValue Select = FpToInt;
+ // For signed conversions where we saturate to the same size as the
+ // result type of the fptoi instructions, INDVAL coincides with integer
+ // minimum, so we don't need to explicitly check it.
+ if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
+ // If Src ULT MinFloat, select MinInt. In particular, this also selects
+ // MinInt if Src is NaN.
+ Select = DAG.getSelectCC(
+ dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
+ }
+
+ // If Src OGT MaxFloat, select MaxInt.
+ Select = DAG.getSelectCC(
+ dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
+
+ // In the unsigned case we are done, because we mapped NaN to MinInt, which
+ // is already zero. The promoted case was already handled above.
+ if (!IsSigned || DstVT != TmpVT) {
+ return Select;
+ }
+
+ // Otherwise, select 0 if Src is NaN.
+ SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
+ return DAG.getSelectCC(
+ dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
+}
+
SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
@@ -20997,10 +21609,8 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
SDValue In = Op.getOperand(IsStrict ? 1 : 0);
MVT SVT = In.getSimpleValueType();
- if (VT == MVT::f128) {
- RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT);
- return LowerF128Call(Op, DAG, LC);
- }
+ if (VT == MVT::f128)
+ return SDValue();
assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
@@ -21014,31 +21624,12 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
-
- MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(IsStrict ? 1 : 0);
- MVT SVT = In.getSimpleValueType();
-
// It's legal except when f128 is involved
- if (SVT != MVT::f128)
+ if (In.getSimpleValueType() != MVT::f128)
return Op;
- RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, VT);
-
- // FP_ROUND node has a second operand indicating whether it is known to be
- // precise. That doesn't take part in the LibCall so we can't directly use
- // LowerF128Call.
-
- SDLoc dl(Op);
- SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
- MakeLibCallOptions CallOptions;
- std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, In, CallOptions,
- dl, Chain);
-
- if (IsStrict)
- return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
-
- return Tmp.first;
+ return SDValue();
}
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
@@ -21403,8 +21994,7 @@ static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
if (M == SrcOpMap.end()) {
VT = Src.getValueType();
// Quit if not the same type.
- if (SrcOpMap.begin() != SrcOpMap.end() &&
- VT != SrcOpMap.begin()->first.getValueType())
+ if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
return false;
unsigned NumElts = VT.getVectorNumElements();
APInt EltCount = APInt::getNullValue(NumElts);
@@ -21442,8 +22032,11 @@ static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
const X86Subtarget &Subtarget,
SelectionDAG &DAG, X86::CondCode &X86CC) {
EVT VT = V.getValueType();
- assert(Mask.getBitWidth() == VT.getScalarSizeInBits() &&
- "Element Mask vs Vector bitwidth mismatch");
+ unsigned ScalarSize = VT.getScalarSizeInBits();
+ if (Mask.getBitWidth() != ScalarSize) {
+ assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
+ return SDValue();
+ }
assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
@@ -22347,7 +22940,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
}
- if (VT.getSizeInBits() > Op.getSimpleValueType().getSizeInBits()) {
+ if (VT.getFixedSizeInBits() >
+ Op.getSimpleValueType().getFixedSizeInBits()) {
// We emitted a compare with an XMM/YMM result. Finish converting to a
// mask register using a vptestm.
EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
@@ -22522,8 +23116,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
}
// Try to use SUBUS and PCMPEQ.
- if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
- return V;
+ if (FlipSigns)
+ if (SDValue V =
+ LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
+ return V;
// We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integer, swapping operands and multiple
@@ -23318,7 +23914,7 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
MVT SVT = VT.getVectorElementType();
MVT InSVT = InVT.getVectorElementType();
- assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
+ assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
return SDValue();
@@ -23493,7 +24089,8 @@ static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
unsigned HalfOffset = Value0.getValueType().getStoreSize();
SDValue Ptr0 = Store->getBasePtr();
- SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfOffset, DL);
+ SDValue Ptr1 =
+ DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
SDValue Ch0 =
DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
Store->getOriginalAlign(),
@@ -23528,7 +24125,8 @@ static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
SmallVector<SDValue, 4> Stores;
for (unsigned i = 0; i != NumElems; ++i) {
unsigned Offset = i * ScalarSize;
- SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), Offset, DL);
+ SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
+ TypeSize::Fixed(Offset), DL);
SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
DAG.getIntPtrConstant(i, DL));
SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
@@ -23549,17 +24147,22 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
// Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
if (StoredVal.getValueType().isVector() &&
StoredVal.getValueType().getVectorElementType() == MVT::i1) {
- assert(StoredVal.getValueType().getVectorNumElements() <= 8 &&
- "Unexpected VT");
+ unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
+ assert(NumElts <= 8 && "Unexpected VT");
assert(!St->isTruncatingStore() && "Expected non-truncating store");
assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
"Expected AVX512F without AVX512DQI");
+ // We must pad with zeros to ensure we store zeroes to any unused bits.
StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
DAG.getUNDEF(MVT::v16i1), StoredVal,
DAG.getIntPtrConstant(0, dl));
StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
+ // Make sure we store zeros in the extra bits.
+ if (NumElts < 8)
+ StoredVal = DAG.getZeroExtendInReg(
+ StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getOriginalAlign(),
@@ -23815,7 +24418,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue Result;
if (!Lower) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
+ Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!");
@@ -23916,7 +24519,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MemOps.push_back(Store);
// Store fp_offset
- FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
+ FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
Store = DAG.getStore(
Op.getOperand(0), DL,
DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
@@ -23981,15 +24584,18 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
Subtarget.hasSSE1());
}
- // Insert VAARG_64 node into the DAG
- // VAARG_64 returns two values: Variable Argument Address, Chain
- SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
- DAG.getConstant(ArgMode, dl, MVT::i8),
- DAG.getConstant(Align, dl, MVT::i32)};
+ // Insert VAARG node into the DAG
+ // VAARG returns two values: Variable Argument Address, Chain
+ SDValue InstOps[] = {Chain, SrcPtr,
+ DAG.getTargetConstant(ArgSize, dl, MVT::i32),
+ DAG.getTargetConstant(ArgMode, dl, MVT::i8),
+ DAG.getTargetConstant(Align, dl, MVT::i32)};
SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
SDValue VAARG = DAG.getMemIntrinsicNode(
- X86ISD::VAARG_64, dl, VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
- /*Align=*/None, MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
+ Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
+ VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
+ /*Alignment=*/None,
+ MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
Chain = VAARG.getValue(1);
// Load the next argument and return it
@@ -24013,9 +24619,11 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
SDLoc DL(Op);
- return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(24, DL),
- Align(8), /*isVolatile*/ false, false, false,
- MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
+ return DAG.getMemcpy(
+ Chain, DL, DstPtr, SrcPtr,
+ DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
+ Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
+ false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
@@ -24462,6 +25070,12 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
+ if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
+ Src3.getValueType() != MVT::i8) {
+ Src3 = DAG.getTargetConstant(
+ cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
+ }
+
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
@@ -24480,9 +25094,18 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
{Src1, Src2, Src3});
}
- case INTR_TYPE_4OP:
- return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
- Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
+ case INTR_TYPE_4OP_IMM8: {
+ assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
+ SDValue Src4 = Op.getOperand(4);
+ if (Src4.getValueType() != MVT::i8) {
+ Src4 = DAG.getTargetConstant(
+ cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
+ }
+
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+ Src4);
+ }
case INTR_TYPE_1OP_MASK: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
@@ -24715,20 +25338,21 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case CMP_MASK_CC: {
MVT MaskVT = Op.getSimpleValueType();
SDValue CC = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
if (IntrData->Opc1 != 0) {
- SDValue Sae = Op.getOperand(4);
+ SDValue Sae = Op.getOperand(5);
if (isRoundModeSAE(Sae))
return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
- Op.getOperand(2), CC, Sae);
+ Op.getOperand(2), CC, Mask, Sae);
if (!isRoundModeCurDirection(Sae))
return SDValue();
}
//default rounding mode
return DAG.getNode(IntrData->Opc0, dl, MaskVT,
- {Op.getOperand(1), Op.getOperand(2), CC});
+ {Op.getOperand(1), Op.getOperand(2), CC, Mask});
}
case CMP_MASK_SCALAR_CC: {
SDValue Src1 = Op.getOperand(1);
@@ -24883,12 +25507,11 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1), Op.getOperand(2), RoundingMode);
}
case BEXTRI: {
- assert(IntrData->Opc0 == X86ISD::BEXTR && "Unexpected opcode");
+ assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
- // The control is a TargetConstant, but we need to convert it to a
- // ConstantSDNode.
uint64_t Imm = Op.getConstantOperandVal(2);
- SDValue Control = DAG.getConstant(Imm, dl, Op.getValueType());
+ SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
+ Op.getValueType());
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), Control);
}
@@ -25279,9 +25902,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// MMX register.
ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
- DAG.getConstant(NewIntrinsic, DL, MVT::i32),
+ DAG.getTargetConstant(NewIntrinsic, DL,
+ getPointerTy(DAG.getDataLayout())),
Op.getOperand(1), ShAmt);
-
}
}
}
@@ -25650,6 +26273,96 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
Operation.getValue(1));
}
+ case Intrinsic::x86_aesenc128kl:
+ case Intrinsic::x86_aesdec128kl:
+ case Intrinsic::x86_aesenc256kl:
+ case Intrinsic::x86_aesdec256kl: {
+ SDLoc DL(Op);
+ SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
+ SDValue Chain = Op.getOperand(0);
+ unsigned Opcode;
+
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic");
+ case Intrinsic::x86_aesenc128kl:
+ Opcode = X86ISD::AESENC128KL;
+ break;
+ case Intrinsic::x86_aesdec128kl:
+ Opcode = X86ISD::AESDEC128KL;
+ break;
+ case Intrinsic::x86_aesenc256kl:
+ Opcode = X86ISD::AESENC256KL;
+ break;
+ case Intrinsic::x86_aesdec256kl:
+ Opcode = X86ISD::AESDEC256KL;
+ break;
+ }
+
+ MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
+ MachineMemOperand *MMO = MemIntr->getMemOperand();
+ EVT MemVT = MemIntr->getMemoryVT();
+ SDValue Operation = DAG.getMemIntrinsicNode(
+ Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
+ MMO);
+ SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
+ {ZF, Operation.getValue(0), Operation.getValue(2)});
+ }
+ case Intrinsic::x86_aesencwide128kl:
+ case Intrinsic::x86_aesdecwide128kl:
+ case Intrinsic::x86_aesencwide256kl:
+ case Intrinsic::x86_aesdecwide256kl: {
+ SDLoc DL(Op);
+ SDVTList VTs = DAG.getVTList(
+ {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
+ MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
+ SDValue Chain = Op.getOperand(0);
+ unsigned Opcode;
+
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic");
+ case Intrinsic::x86_aesencwide128kl:
+ Opcode = X86ISD::AESENCWIDE128KL;
+ break;
+ case Intrinsic::x86_aesdecwide128kl:
+ Opcode = X86ISD::AESDECWIDE128KL;
+ break;
+ case Intrinsic::x86_aesencwide256kl:
+ Opcode = X86ISD::AESENCWIDE256KL;
+ break;
+ case Intrinsic::x86_aesdecwide256kl:
+ Opcode = X86ISD::AESDECWIDE256KL;
+ break;
+ }
+
+ MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
+ MachineMemOperand *MMO = MemIntr->getMemOperand();
+ EVT MemVT = MemIntr->getMemoryVT();
+ SDValue Operation = DAG.getMemIntrinsicNode(
+ Opcode, DL, VTs,
+ {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
+ Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
+ Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
+ MemVT, MMO);
+ SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
+ {ZF, Operation.getValue(1), Operation.getValue(2),
+ Operation.getValue(3), Operation.getValue(4),
+ Operation.getValue(5), Operation.getValue(6),
+ Operation.getValue(7), Operation.getValue(8),
+ Operation.getValue(9)});
+ }
+ case Intrinsic::x86_testui: {
+ SDLoc dl(Op);
+ SDValue Chain = Op.getOperand(0);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+ SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
+ SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
+ Operation.getValue(1));
+ }
}
return SDValue();
}
@@ -26020,9 +26733,8 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(2, dl, MVT::i64));
- OutChains[1] =
- DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
- /* Alignment = */ 2);
+ OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
+ MachinePointerInfo(TrmpAddr, 2), Align(2));
// Load the 'nest' parameter value into R10.
// R10 is specified in X86CallingConv.td
@@ -26034,9 +26746,8 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(12, dl, MVT::i64));
- OutChains[3] =
- DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
- /* Alignment = */ 2);
+ OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
+ MachinePointerInfo(TrmpAddr, 12), Align(2));
// Jump to the nested function.
OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
@@ -26078,7 +26789,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
for (FunctionType::param_iterator I = FTy->param_begin(),
E = FTy->param_end(); I != E; ++I, ++Idx)
if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
- auto &DL = DAG.getDataLayout();
+ const DataLayout &DL = DAG.getDataLayout();
// FIXME: should only count parameters that are lowered to integers.
InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
}
@@ -26116,22 +26827,20 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(1, dl, MVT::i32));
- OutChains[1] =
- DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
- /* Alignment = */ 1);
+ OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
+ MachinePointerInfo(TrmpAddr, 1), Align(1));
const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(5, dl, MVT::i32));
- OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
- Addr, MachinePointerInfo(TrmpAddr, 5),
- /* Alignment = */ 1);
+ OutChains[2] =
+ DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
+ MachinePointerInfo(TrmpAddr, 5), Align(1));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(6, dl, MVT::i32));
- OutChains[3] =
- DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
- /* Alignment = */ 1);
+ OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
+ MachinePointerInfo(TrmpAddr, 6), Align(1));
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
@@ -26425,50 +27134,47 @@ static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
MVT VT = Op.getSimpleValueType();
SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
unsigned Opcode = Op.getOpcode();
+ SDLoc DL(Op);
+
if (VT.getScalarType() == MVT::i1) {
- SDLoc dl(Op);
switch (Opcode) {
default: llvm_unreachable("Expected saturated arithmetic opcode");
case ISD::UADDSAT:
case ISD::SADDSAT:
// *addsat i1 X, Y --> X | Y
- return DAG.getNode(ISD::OR, dl, VT, X, Y);
+ return DAG.getNode(ISD::OR, DL, VT, X, Y);
case ISD::USUBSAT:
case ISD::SSUBSAT:
// *subsat i1 X, Y --> X & ~Y
- return DAG.getNode(ISD::AND, dl, VT, X, DAG.getNOT(dl, Y, VT));
+ return DAG.getNode(ISD::AND, DL, VT, X, DAG.getNOT(DL, Y, VT));
}
}
- if (VT.is128BitVector()) {
- // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
- *DAG.getContext(), VT);
- SDLoc DL(Op);
- if (Opcode == ISD::UADDSAT && !TLI.isOperationLegal(ISD::UMIN, VT)) {
- // uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y
- SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, Y);
- SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Add, ISD::SETUGT);
- return DAG.getSelect(DL, VT, Cmp, DAG.getAllOnesConstant(DL, VT), Add);
- }
- if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
- // usubsat X, Y --> (X >u Y) ? X - Y : 0
- SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
- SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
- return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
- }
- // Use default expansion.
- return SDValue();
+ if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
+ (VT.is256BitVector() && !Subtarget.hasInt256())) {
+ assert(Op.getSimpleValueType().isInteger() &&
+ "Only handle AVX vector integer operation");
+ return splitVectorIntBinary(Op, DAG);
}
- if (VT == MVT::v32i16 || VT == MVT::v64i8)
- return splitVectorIntBinary(Op, DAG);
+ // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT SetCCResultType =
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
- assert(Op.getSimpleValueType().is256BitVector() &&
- Op.getSimpleValueType().isInteger() &&
- "Only handle AVX 256-bit vector integer operation");
- return splitVectorIntBinary(Op, DAG);
+ if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
+ // usubsat X, Y --> (X >u Y) ? X - Y : 0
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
+ SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
+ // TODO: Move this to DAGCombiner?
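+    // If the compare result is a per-element all-ones/all-zeros sign mask of
+    // the same type, the select below simplifies to Cmp & Sub.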
+ if (SetCCResultType == VT &&
+ DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
+ return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
+ return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
+ }
+
+ // Use default expansion.
+ return SDValue();
}
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
@@ -26518,36 +27224,8 @@ static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
if (VT == MVT::v32i16 || VT == MVT::v64i8)
return splitVectorIntBinary(Op, DAG);
- SDLoc DL(Op);
- unsigned Opcode = Op.getOpcode();
- SDValue N0 = Op.getOperand(0);
- SDValue N1 = Op.getOperand(1);
-
- // For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit,
- // using the SMIN/SMAX instructions and flipping the signbit back.
- if (VT == MVT::v8i16) {
- assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&
- "Unexpected MIN/MAX opcode");
- SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
- N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign);
- N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign);
- Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX);
- SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1);
- return DAG.getNode(ISD::XOR, DL, VT, Result, Sign);
- }
-
- // Else, expand to a compare/select.
- ISD::CondCode CC;
- switch (Opcode) {
- case ISD::SMIN: CC = ISD::CondCode::SETLT; break;
- case ISD::SMAX: CC = ISD::CondCode::SETGT; break;
- case ISD::UMIN: CC = ISD::CondCode::SETULT; break;
- case ISD::UMAX: CC = ISD::CondCode::SETUGT; break;
- default: llvm_unreachable("Unknown MINMAX opcode");
- }
-
- SDValue Cond = DAG.getSetCC(DL, VT, N0, N1, CC);
- return DAG.getSelect(DL, VT, Cond, N0, N1);
+ // Default to expand.
+ return SDValue();
}
static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
@@ -26903,8 +27581,6 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
- case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
- case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
}
SDLoc dl(Op);
@@ -26921,8 +27597,8 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
Entry.Node = StackPtr;
- InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
- MPI, /* Alignment = */ 16);
+ InChain =
+ DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Ty = PointerType::get(ArgTy,0);
Entry.IsSExt = false;
@@ -27213,6 +27889,7 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
MVT VT = Amt.getSimpleValueType();
if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
(Subtarget.hasInt256() && VT == MVT::v16i16) ||
+ (Subtarget.hasVBMI2() && VT == MVT::v32i16) ||
(!Subtarget.hasAVX512() && VT == MVT::v16i8)))
return SDValue();
@@ -27790,6 +28467,12 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
return Op;
}
+ // AVX512 VBMI2 vXi16 - lower to funnel shifts.
+ if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
+ unsigned FunnelOpc = (Opcode == ISD::ROTL ? ISD::FSHL : ISD::FSHR);
+ return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
+ }
+
assert((Opcode == ISD::ROTL) && "Only ROTL supported");
// XOP has 128-bit vector variable + immediate rotates.
@@ -27816,7 +28499,8 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
return splitVectorIntBinary(Op, DAG);
assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
- ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
+ ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8 ||
+ VT == MVT::v32i16) &&
Subtarget.hasAVX2())) &&
"Only vXi32/vXi16/vXi8 vector rotates supported");
@@ -28113,8 +28797,8 @@ bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
/// a) very likely accessed only by a single thread to minimize cache traffic,
/// and b) definitely dereferenceable. Returns the new Chain result.
static SDValue emitLockedStackOp(SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- SDValue Chain, SDLoc DL) {
+ const X86Subtarget &Subtarget, SDValue Chain,
+ const SDLoc &DL) {
// Implementation notes:
// 1) LOCK prefix creates a full read/write reordering barrier for memory
// operations issued by the current processor. As such, the location
@@ -28552,18 +29236,28 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
SDValue In = Op.getOperand(0);
SDLoc DL(Op);
+ assert(VT.getScalarType() == MVT::i8 &&
+ "Only byte vector BITREVERSE supported");
+
// Split v64i8 without BWI so that we can still use the PSHUFB lowering.
if (VT == MVT::v64i8 && !Subtarget.hasBWI())
return splitVectorIntUnary(Op, DAG);
- unsigned NumElts = VT.getVectorNumElements();
- assert(VT.getScalarType() == MVT::i8 &&
- "Only byte vector BITREVERSE supported");
-
// Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
- if (VT.is256BitVector() && !Subtarget.hasInt256())
+ if (VT == MVT::v32i8 && !Subtarget.hasInt256())
return splitVectorIntUnary(Op, DAG);
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
+ if (Subtarget.hasGFNI()) {
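+    // GF2P8AFFINEQB multiplies each input byte by an 8x8 bit matrix over
+    // GF(2) and xors in the immediate (0 here); this matrix constant reverses
+    // the bit order within each byte.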
+ MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
+ SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
+ Matrix = DAG.getBitcast(VT, Matrix);
+ return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
+ DAG.getTargetConstant(0, DL, MVT::i8));
+ }
+
  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
  // two nibbles, and a PSHUFB lookup finds the bitreverse of each
  // 0-15 value (moved to the other nibble).
@@ -28595,6 +29289,58 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}
+static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ SDValue X = Op.getOperand(0);
+ MVT VT = Op.getSimpleValueType();
+
+ // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
+ if (VT == MVT::i8 ||
+ DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
+ X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
+ SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
+ DAG.getConstant(0, DL, MVT::i8));
+ // Copy the inverse of the parity flag into a register with setcc.
+ SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
+ // Extend to the original type.
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
+ }
+
+ if (VT == MVT::i64) {
+    // Xor the high and low 32 bits together using a 32-bit operation.
+ SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
+ DAG.getNode(ISD::SRL, DL, MVT::i64, X,
+ DAG.getConstant(32, DL, MVT::i8)));
+ SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
+ X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
+ }
+
+ if (VT != MVT::i16) {
+ // Xor the high and low 16-bits together using a 32-bit operation.
+ SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
+ DAG.getConstant(16, DL, MVT::i8));
+ X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
+ } else {
+ // If the input is 16-bits, we need to extend to use an i32 shift below.
+ X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
+ }
+
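+  // At this point X is an i32 whose low 16 bits have the same parity as the
+  // original input, so only the bottom two bytes matter below.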
+  // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
+ // This should allow an h-reg to be used to save a shift.
+ SDValue Hi = DAG.getNode(
+ ISD::TRUNCATE, DL, MVT::i8,
+ DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
+ SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
+ SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
+ SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
+
+ // Copy the inverse of the parity flag into a register with setcc.
+ SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
+ // Extend to the original type.
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
+}
+
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned NewOpc = 0;
@@ -28731,7 +29477,7 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
Chain =
DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
- MPI, /*Align*/ 0, MachineMemOperand::MOStore);
+ MPI, MaybeAlign(), MachineMemOperand::MOStore);
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue LdOps[] = {Chain, StackPtr};
SDValue Value =
@@ -28771,6 +29517,7 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
SDNode *N = Op.getNode();
MVT VT = N->getSimpleValueType(0);
+ unsigned Opc = Op.getOpcode();
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
@@ -28785,11 +29532,14 @@ static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
Carry, DAG.getAllOnesConstant(DL, CarryVT));
- unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
- SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
- Op.getOperand(1), Carry.getValue(1));
+ bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
+ SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
+ Op.getOperand(0), Op.getOperand(1),
+ Carry.getValue(1));
- SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
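+  // ADDCARRY/SUBCARRY report an unsigned carry/borrow, read from CF via
+  // COND_B; the SADDO_CARRY/SSUBO_CARRY nodes report signed overflow, read
+  // from OF via COND_O.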
+ bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
+ SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
+ Sum.getValue(1), DL, DAG);
if (N->getValueType(1) == MVT::i1)
SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
@@ -29165,25 +29915,6 @@ SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
return NOOP;
}
-SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
- RTLIB::Libcall Call) const {
-
- bool IsStrict = Op->isStrictFPOpcode();
- unsigned Offset = IsStrict ? 1 : 0;
- SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end());
-
- SDLoc dl(Op);
- SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
- MakeLibCallOptions CallOptions;
- std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, Call, MVT::f128, Ops,
- CallOptions, dl, Chain);
-
- if (IsStrict)
- return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
-
- return Tmp.first;
-}
-
// Custom split CVTPS2PH with wide types.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
@@ -29213,6 +29944,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
+ case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
@@ -29247,6 +29979,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::STRICT_FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
case ISD::FP_EXTEND:
case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
case ISD::FP_ROUND:
@@ -29313,6 +30047,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::UMULO: return LowerXALUO(Op, DAG);
case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
+ case ISD::SADDO_CARRY:
+ case ISD::SSUBO_CARRY:
case ISD::ADDCARRY:
case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
case ISD::ADD:
@@ -29338,35 +30074,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
}
}
-/// Places new result values for the node in Results (their number
-/// and types must exactly match those of the original return values of
-/// the node), or leaves Results empty, which indicates that the node is not
-/// to be custom lowered after all.
-void X86TargetLowering::LowerOperationWrapper(SDNode *N,
- SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG) const {
- SDValue Res = LowerOperation(SDValue(N, 0), DAG);
-
- if (!Res.getNode())
- return;
-
- // If the original node has one result, take the return value from
- // LowerOperation as is. It might not be result number 0.
- if (N->getNumValues() == 1) {
- Results.push_back(Res);
- return;
- }
-
- // If the original node has multiple results, then the return node should
- // have the same number of results.
- assert((N->getNumValues() == Res->getNumValues()) &&
- "Lowering returned the wrong number of results!");
-
- // Places new result values base on N result number.
- for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
- Results.push_back(Res.getValue(I));
-}
-
/// Replace a node with an illegal result type with a new node built out of
/// custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
@@ -29409,6 +30116,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(Chain);
return;
}
+ case X86ISD::CVTPS2PH:
+ Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
+ return;
case ISD::CTPOP: {
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
// Use a v2i64 if possible.
@@ -29477,28 +30187,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(Res);
return;
}
- case ISD::ABS: {
- assert(N->getValueType(0) == MVT::i64 &&
- "Unexpected type (!= i64) on ABS.");
- MVT HalfT = MVT::i32;
- SDValue Lo, Hi, Tmp;
- SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);
-
- Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
- DAG.getConstant(0, dl, HalfT));
- Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
- DAG.getConstant(1, dl, HalfT));
- Tmp = DAG.getNode(
- ISD::SRA, dl, HalfT, Hi,
- DAG.getShiftAmountConstant(HalfT.getSizeInBits() - 1, HalfT, dl));
- Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
- Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
- SDValue(Lo.getNode(), 1));
- Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
- Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi));
- return;
- }
// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
case X86ISD::FMINC:
case X86ISD::FMIN:
@@ -29539,10 +30227,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
- LLVM_FALLTHROUGH;
- }
- case ISD::SDIVREM:
- case ISD::UDIVREM: {
SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
Results.push_back(V);
return;
@@ -29676,7 +30360,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
assert(isTypeLegal(LoVT) && "Split VT not legal?");
- SDValue Lo = getExtendInVec(N->getOpcode(), dl, LoVT, In, DAG);
+ SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
// We need to shift the input over by half the number of elements.
unsigned NumElts = InVT.getVectorNumElements();
@@ -29686,7 +30370,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
ShufMask[i] = i + HalfNumElts;
SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
- Hi = getExtendInVec(N->getOpcode(), dl, HiVT, Hi, DAG);
+ Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
Results.push_back(Res);
@@ -30037,46 +30721,30 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
swapInH =
DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
swapInH, cpInH.getValue(1));
- // If the current function needs the base pointer, RBX,
- // we shouldn't use cmpxchg directly.
- // Indeed the lowering of that instruction will clobber
- // that register and since RBX will be a reserved register
- // the register allocator will not make sure its value will
- // be properly saved and restored around this live-range.
- const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+ // In 64-bit mode we might need the base pointer in RBX, but we can't know
+ // until later. So we keep the RBX input in a vreg and use a custom
+ // inserter.
+  // Since RBX will be a reserved register, the register allocator will not
+  // ensure that its value is properly saved and restored around this
+  // live range.
SDValue Result;
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
- Register BasePtr = TRI->getBaseRegister();
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
- if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
- (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
- // ISel prefers the LCMPXCHG64 variant.
- // If that assert breaks, that means it is not the case anymore,
- // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
- // not just EBX. This is a matter of accepting i64 input for that
- // pseudo, and restoring into the register of the right wide
- // in expand pseudo. Everything else should just work.
- assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
- "Saving only half of the RBX");
- unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
- : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
- SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
- Regs64bit ? X86::RBX : X86::EBX,
- HalfT, swapInH.getValue(1));
- SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
- RBXSave,
- /*Glue*/ RBXSave.getValue(2)};
- Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
+ if (Regs64bit) {
+ SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
+ swapInH.getValue(1)};
+ Result =
+ DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
} else {
- unsigned Opcode =
- Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
- swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
- Regs64bit ? X86::RBX : X86::EBX, swapInL,
+ swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
swapInH.getValue(1));
SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
swapInL.getValue(1)};
- Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
+ Result =
+ DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
}
+
SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
Regs64bit ? X86::RAX : X86::EAX,
HalfT, Result.getValue(1));
@@ -30321,8 +30989,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(COMI)
NODE_NAME_CASE(UCOMI)
NODE_NAME_CASE(CMPM)
+ NODE_NAME_CASE(CMPMM)
NODE_NAME_CASE(STRICT_CMPM)
- NODE_NAME_CASE(CMPM_SAE)
+ NODE_NAME_CASE(CMPMM_SAE)
NODE_NAME_CASE(SETCC)
NODE_NAME_CASE(SETCC_CARRY)
NODE_NAME_CASE(FSETCC)
@@ -30381,7 +31050,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(LCMPXCHG_DAG)
NODE_NAME_CASE(LCMPXCHG8_DAG)
NODE_NAME_CASE(LCMPXCHG16_DAG)
- NODE_NAME_CASE(LCMPXCHG8_SAVE_EBX_DAG)
NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
NODE_NAME_CASE(LADD)
NODE_NAME_CASE(LSUB)
@@ -30441,6 +31109,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(XOR)
NODE_NAME_CASE(AND)
NODE_NAME_CASE(BEXTR)
+ NODE_NAME_CASE(BEXTRI)
NODE_NAME_CASE(BZHI)
NODE_NAME_CASE(PDEP)
NODE_NAME_CASE(PEXT)
@@ -30478,7 +31147,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(VBROADCAST)
NODE_NAME_CASE(VBROADCAST_LOAD)
NODE_NAME_CASE(VBROADCASTM)
- NODE_NAME_CASE(SUBV_BROADCAST)
+ NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
NODE_NAME_CASE(VPERMILPV)
NODE_NAME_CASE(VPERMILPI)
NODE_NAME_CASE(VPERM2X128)
@@ -30500,6 +31169,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(DBPSADBW)
NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
NODE_NAME_CASE(VAARG_64)
+ NODE_NAME_CASE(VAARG_X32)
NODE_NAME_CASE(WIN_ALLOCA)
NODE_NAME_CASE(MEMBARRIER)
NODE_NAME_CASE(MFENCE)
@@ -30656,6 +31326,15 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(ENQCMD)
NODE_NAME_CASE(ENQCMDS)
NODE_NAME_CASE(VP2INTERSECT)
+ NODE_NAME_CASE(AESENC128KL)
+ NODE_NAME_CASE(AESDEC128KL)
+ NODE_NAME_CASE(AESENC256KL)
+ NODE_NAME_CASE(AESDEC256KL)
+ NODE_NAME_CASE(AESENCWIDE128KL)
+ NODE_NAME_CASE(AESDECWIDE128KL)
+ NODE_NAME_CASE(AESENCWIDE256KL)
+ NODE_NAME_CASE(AESDECWIDE256KL)
+ NODE_NAME_CASE(TESTUI)
}
return nullptr;
#undef NODE_NAME_CASE
@@ -31001,7 +31680,7 @@ static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
/// Utility function to emit xbegin specifying the start of an RTM region.
static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
const TargetInstrInfo *TII) {
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = ++MBB->getIterator();
@@ -31080,11 +31759,9 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
return sinkMBB;
}
-
-
MachineBasicBlock *
-X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
- MachineBasicBlock *MBB) const {
+X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
// Emit va_arg instruction on X86-64.
// Operands to this pseudo-instruction:
@@ -31095,9 +31772,8 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// 8 ) Align : Alignment of type
// 9 ) EFLAGS (implicit-def)
- assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
- static_assert(X86::AddrNumOperands == 5,
- "VAARG_64 assumes 5 address operands");
+ assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
+ static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
Register DestReg = MI.getOperand(0).getReg();
MachineOperand &Base = MI.getOperand(1);
@@ -31112,7 +31788,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
MachineFunction *MF = MBB->getParent();
// Memory Reference
- assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
+ assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
MachineMemOperand *OldMMO = MI.memoperands().front();
@@ -31125,9 +31801,10 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Machine Information
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
+ const TargetRegisterClass *AddrRegClass =
+ getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
// struct va_list {
// i32 gp_offset
@@ -31236,25 +31913,35 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Read the reg_save_area address.
Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
- BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
+ BuildMI(
+ offsetMBB, DL,
+ TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
+ RegSaveReg)
.add(Base)
.add(Scale)
.add(Index)
- .addDisp(Disp, 16)
+ .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
.add(Segment)
.setMemRefs(LoadOnlyMMO);
- // Zero-extend the offset
- Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
- BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
- .addImm(0)
- .addReg(OffsetReg)
- .addImm(X86::sub_32bit);
+ if (Subtarget.isTarget64BitLP64()) {
+ // Zero-extend the offset
+ Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
+ BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
+ .addImm(0)
+ .addReg(OffsetReg)
+ .addImm(X86::sub_32bit);
- // Add the offset to the reg_save_area to get the final address.
- BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
- .addReg(OffsetReg64)
- .addReg(RegSaveReg);
+ // Add the offset to the reg_save_area to get the final address.
+ BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
+ .addReg(OffsetReg64)
+ .addReg(RegSaveReg);
+ } else {
+ // Add the offset to the reg_save_area to get the final address.
+ BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
+ .addReg(OffsetReg)
+ .addReg(RegSaveReg);
+ }
// Compute the offset for the next argument
Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
@@ -31283,7 +31970,9 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Load the overflow_area address into a register.
Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
- BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
+ BuildMI(overflowMBB, DL,
+ TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
+ OverflowAddrReg)
.add(Base)
.add(Scale)
.add(Index)
@@ -31298,11 +31987,17 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
// aligned_addr = (addr + (align-1)) & ~(align-1)
- BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
+ BuildMI(
+ overflowMBB, DL,
+ TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
+ TmpReg)
.addReg(OverflowAddrReg)
.addImm(Alignment.value() - 1);
- BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
+ BuildMI(
+ overflowMBB, DL,
+ TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
+ OverflowDestReg)
.addReg(TmpReg)
.addImm(~(uint64_t)(Alignment.value() - 1));
} else {
@@ -31313,12 +32008,16 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Compute the next overflow address after this argument.
// (the overflow address should be kept 8-byte aligned)
Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
- BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
- .addReg(OverflowDestReg)
- .addImm(ArgSizeA8);
+ BuildMI(
+ overflowMBB, DL,
+ TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
+ NextAddrReg)
+ .addReg(OverflowDestReg)
+ .addImm(ArgSizeA8);
// Store the new overflow address.
- BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
+ BuildMI(overflowMBB, DL,
+ TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
.add(Base)
.add(Scale)
.add(Index)
@@ -31374,10 +32073,10 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
// Now add the instructions.
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
Register CountReg = MI.getOperand(0).getReg();
- int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
+ int RegSaveFrameIndex = MI.getOperand(1).getImm();
int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
@@ -31686,7 +32385,7 @@ MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
MachineBasicBlock *ThisMBB) const {
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
// To "insert" a SELECT_CC instruction, we actually have to insert the
// diamond control-flow pattern. The incoming instruction knows the
@@ -31841,7 +32540,7 @@ X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
const unsigned ProbeSize = getStackProbeSize(*MF);
@@ -31934,7 +32633,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
assert(MF->shouldSplitStack());
@@ -31969,7 +32668,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
const TargetRegisterClass *AddrRegClass =
getRegClassFor(getPointerTy(MF->getDataLayout()));
- unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+ Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
@@ -32069,7 +32768,7 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
MachineFunction *MF = BB->getParent();
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
assert(!isAsynchronousEHPersonality(
classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
@@ -32107,7 +32806,7 @@ X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
// inside MC, therefore without the two markers shrink-wrapping
// may push the prologue/epilogue pass them.
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
MachineFunction &MF = *BB->getParent();
// Emit CALLSEQ_START right before the instruction.
@@ -32136,7 +32835,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
// be in the normal return register.
MachineFunction *F = BB->getParent();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
assert(MI.getOperand(3).isGlobal() && "This should be a global");
@@ -32275,7 +32974,7 @@ X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
MachineBasicBlock *BB) const {
// Copy the virtual register into the R11 physical register and
// call the retpoline thunk.
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
Register CalleeVReg = MI.getOperand(0).getReg();
unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
@@ -32337,7 +33036,7 @@ X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
/// \param [in] MBB The Machine Basic Block that will be modified.
void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *MBB) const {
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -32380,7 +33079,7 @@ void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *
X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
@@ -32540,7 +33239,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *
X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *MBB) const {
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -32721,7 +33420,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *
X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -32805,7 +33504,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
MachineBasicBlock *MBB,
MachineBasicBlock *DispatchBB,
int FI) const {
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
@@ -32854,7 +33553,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
MachineBasicBlock *
X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineBasicBlock *BB) const {
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
@@ -33084,7 +33783,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
auto TMMImmToTMMReg = [](unsigned Imm) {
assert (Imm < 8 && "Illegal tmm index");
@@ -33094,8 +33793,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
default: llvm_unreachable("Unexpected instr type to insert");
case X86::TLS_addr32:
case X86::TLS_addr64:
+ case X86::TLS_addrX32:
case X86::TLS_base_addr32:
case X86::TLS_base_addr64:
+ case X86::TLS_base_addrX32:
return EmitLoweredTLSAddr(MI, BB);
case X86::INDIRECT_THUNK_CALL32:
case X86::INDIRECT_THUNK_CALL64:
@@ -33251,7 +33952,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
case X86::VAARG_64:
- return EmitVAARG64WithCustomInserter(MI, BB);
+ case X86::VAARG_X32:
+ return EmitVAARGWithCustomInserter(MI, BB);
case X86::EH_SjLj_SetJmp32:
case X86::EH_SjLj_SetJmp64:
@@ -33274,10 +33976,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return emitPatchPoint(MI, BB);
case TargetOpcode::PATCHABLE_EVENT_CALL:
- return emitXRayCustomEvent(MI, BB);
-
case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
- return emitXRayTypedEvent(MI, BB);
+ return BB;
case X86::LCMPXCHG8B: {
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
@@ -33332,14 +34032,75 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return BB;
}
- case X86::LCMPXCHG16B:
+ case X86::LCMPXCHG16B_NO_RBX: {
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ Register BasePtr = TRI->getBaseRegister();
+ if (TRI->hasBasePointer(*MF) &&
+ (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
+ if (!BB->isLiveIn(BasePtr))
+ BB->addLiveIn(BasePtr);
+ // Save RBX into a virtual register.
+ Register SaveRBX =
+ MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
+ .addReg(X86::RBX);
+ Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
+ for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
+ MIB.add(MI.getOperand(Idx));
+ MIB.add(MI.getOperand(X86::AddrNumOperands));
+ MIB.addReg(SaveRBX);
+ } else {
+ // Simple case, just copy the virtual register to RBX.
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
+ .add(MI.getOperand(X86::AddrNumOperands));
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
+ for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
+ MIB.add(MI.getOperand(Idx));
+ }
+ MI.eraseFromParent();
return BB;
- case X86::LCMPXCHG8B_SAVE_EBX:
- case X86::LCMPXCHG16B_SAVE_RBX: {
- unsigned BasePtr =
- MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
- if (!BB->isLiveIn(BasePtr))
- BB->addLiveIn(BasePtr);
+ }
+ case X86::MWAITX: {
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ Register BasePtr = TRI->getBaseRegister();
+ bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
+ // If there is no need to save the base pointer, we generate MWAITXrrr;
+ // otherwise we generate the pseudo MWAITX_SAVE_RBX.
+ if (!IsRBX || !TRI->hasBasePointer(*MF)) {
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
+ .addReg(MI.getOperand(0).getReg());
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
+ .addReg(MI.getOperand(1).getReg());
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
+ .addReg(MI.getOperand(2).getReg());
+ BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
+ MI.eraseFromParent();
+ } else {
+ if (!BB->isLiveIn(BasePtr)) {
+ BB->addLiveIn(BasePtr);
+ }
+ // Parameters can be copied into ECX and EAX but not EBX yet.
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
+ .addReg(MI.getOperand(0).getReg());
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
+ .addReg(MI.getOperand(1).getReg());
+ assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
+ // Save RBX into a virtual register.
+ Register SaveRBX =
+ MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
+ .addReg(X86::RBX);
+ // Generate mwaitx pseudo.
+ Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
+ BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
+ .addDef(Dst) // Destination tied in with SaveRBX.
+ .addReg(MI.getOperand(2).getReg()) // Input value of EBX.
+ .addUse(SaveRBX); // Save of base pointer.
+ MI.eraseFromParent();
+ }
return BB;
}
case TargetOpcode::PREALLOCATED_SETUP: {
@@ -33377,7 +34138,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::PTDPBUSD:
case X86::PTDPBUUD:
case X86::PTDPBF16PS: {
- const DebugLoc &DL = MI.getDebugLoc();
unsigned Opc;
switch (MI.getOpcode()) {
case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
@@ -33397,7 +34157,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return BB;
}
case X86::PTILEZERO: {
- const DebugLoc &DL = MI.getDebugLoc();
unsigned Imm = MI.getOperand(0).getImm();
BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
MI.eraseFromParent(); // The pseudo is gone now.
@@ -33406,7 +34165,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::PTILELOADD:
case X86::PTILELOADDT1:
case X86::PTILESTORED: {
- const DebugLoc &DL = MI.getDebugLoc();
unsigned Opc;
switch (MI.getOpcode()) {
case X86::PTILELOADD: Opc = X86::TILELOADD; break;
@@ -33607,13 +34365,11 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits Known2;
if (!!DemandedLHS) {
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
- Known.One &= Known2.One;
- Known.Zero &= Known2.Zero;
+ Known = KnownBits::commonBits(Known, Known2);
}
if (!!DemandedRHS) {
Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
- Known.One &= Known2.One;
- Known.Zero &= Known2.Zero;
+ Known = KnownBits::commonBits(Known, Known2);
}
if (Known.countMinLeadingZeros() < BitWidth)
@@ -33656,11 +34412,11 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
// Only known if known in both the LHS and RHS.
- Known.One &= Known2.One;
- Known.Zero &= Known2.Zero;
+ Known = KnownBits::commonBits(Known, Known2);
break;
}
- case X86ISD::BEXTR: {
+ case X86ISD::BEXTR:
+ case X86ISD::BEXTRI: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
@@ -33682,6 +34438,28 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
}
break;
}
+ case X86ISD::PDEP: {
+ KnownBits Known2;
+ Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ // Zeros are retained from the mask operand, but ones are not.
+ Known.One.clearAllBits();
+ // The result will have at least as many trailing zeros as the non-mask
+ // operand since bits can only map to the same or higher bit position.
+ Known.Zero.setLowBits(Known2.countMinTrailingZeros());
+ break;
+ }
+ case X86ISD::PEXT: {
+ Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ // The result has at least as many leading zeros as the number of zero bits
+ // in the mask.
+ unsigned Count = Known.Zero.countPopulation();
+ Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
+ Known.One.clearAllBits();
+ break;
+ }
+ case X86ISD::VTRUNC:
+ case X86ISD::VTRUNCS:
+ case X86ISD::VTRUNCUS:
case X86ISD::CVTSI2P:
case X86ISD::CVTUI2P:
case X86ISD::CVTP2SI:
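For readers unfamiliar with the BMI2 semantics that the new PDEP/PEXT known-bits cases rely on, here is a minimal scalar sketch (illustration only; pdep64 and pext64 are hypothetical helpers, not part of this change). pdep64 can only produce ones where the mask has ones, and pext64 packs the selected bits into the low end of the result, which is exactly what the comments above assert:

    #include <cstdint>

    // Deposit the low bits of Src into the positions where Mask is set.
    // Mask-zero positions stay zero, and source bit i lands at the position of
    // the i-th set mask bit, which is never lower than i, so trailing zeros of
    // Src are preserved or increased.
    static uint64_t pdep64(uint64_t Src, uint64_t Mask) {
      uint64_t Result = 0;
      for (uint64_t Bit = 1; Mask != 0; Bit <<= 1, Mask >>= 1) {
        if (Mask & 1) {
          if (Src & 1)
            Result |= Bit;
          Src >>= 1;
        }
      }
      return Result;
    }

    // Extract the bits of Src selected by Mask and pack them into the low end.
    // At most popcount(Mask) result bits can be set, so the result has at
    // least as many leading zeros as Mask has zero bits.
    static uint64_t pext64(uint64_t Src, uint64_t Mask) {
      uint64_t Result = 0;
      unsigned OutBit = 0;
      for (unsigned I = 0; I != 64; ++I)
        if (Mask & (UINT64_C(1) << I))
          Result |= ((Src >> I) & UINT64_C(1)) << OutBit++;
      return Result;
    }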
@@ -33698,7 +34476,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
case X86ISD::VMFPROUND:
case X86ISD::CVTPS2PH:
case X86ISD::MCVTPS2PH: {
- // Conversions - upper elements are known zero.
+ // Truncations/Conversions - upper elements are known zero.
EVT SrcVT = Op.getOperand(0).getValueType();
if (SrcVT.isVector()) {
unsigned NumSrcElts = SrcVT.getVectorNumElements();
@@ -33776,8 +34554,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
continue;
KnownBits Known2 =
DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
- Known.One &= Known2.One;
- Known.Zero &= Known2.Zero;
+ Known = KnownBits::commonBits(Known, Known2);
}
}
}
@@ -33956,11 +34733,18 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
// Match against a VZEXT_MOVL vXi32 zero-extending instruction.
- if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
- isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
- Shuffle = X86ISD::VZEXT_MOVL;
- SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
- return true;
+ if (MaskEltSize == 32 && Mask[0] == 0) {
+ if (isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
+ Shuffle = X86ISD::VZEXT_MOVL;
+ SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+ return true;
+ }
+ if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
+ Shuffle = X86ISD::VZEXT_MOVL;
+ SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+ return true;
+ }
}
// Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction.
@@ -34014,17 +34798,17 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
// instructions are no slower than UNPCKLPD but have the option to
// fold the input operand into even an unaligned memory load.
if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
- if (isTargetShuffleEquivalent(Mask, {0, 0})) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v2f64;
return true;
}
- if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v4f32;
return true;
}
- if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, V1)) {
Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v4f32;
return true;
@@ -34033,17 +34817,17 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
if (MaskVT.is256BitVector() && AllowFloatDomain) {
assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
- if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v4f64;
return true;
}
- if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v8f32;
return true;
}
- if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) {
Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v8f32;
return true;
@@ -34053,19 +34837,21 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
if (MaskVT.is512BitVector() && AllowFloatDomain) {
assert(Subtarget.hasAVX512() &&
"AVX512 required for 512-bit vector shuffles");
- if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v8f64;
return true;
}
if (isTargetShuffleEquivalent(
- Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
+ MaskVT, Mask,
+ {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) {
Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v16f32;
return true;
}
if (isTargetShuffleEquivalent(
- Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
+ MaskVT, Mask,
+ {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) {
Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v16f32;
return true;
@@ -34147,7 +34933,10 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
// Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
- if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
+ if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
+ ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
@@ -34217,30 +35006,31 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
SelectionDAG &DAG, const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
bool IsUnary) {
+ unsigned NumMaskElts = Mask.size();
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
if (MaskVT.is128BitVector()) {
- if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}) && AllowFloatDomain) {
V2 = V1;
V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
return true;
}
- if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}) && AllowFloatDomain) {
V2 = V1;
Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
return true;
}
- if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
- (AllowFloatDomain || !Subtarget.hasSSE41())) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}) &&
+ Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
std::swap(V1, V2);
Shuffle = X86ISD::MOVSD;
SrcVT = DstVT = MVT::v2f64;
return true;
}
- if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}) &&
(AllowFloatDomain || !Subtarget.hasSSE41())) {
Shuffle = X86ISD::MOVSS;
SrcVT = DstVT = MVT::v4f32;
@@ -34274,6 +35064,46 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
}
+ // Attempt to match against an OR if we're performing a blend shuffle and the
+ // non-blended source element is zero in each case.
+ if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
+ (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
+ bool IsBlend = true;
+ unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
+ unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
+ unsigned Scale1 = NumV1Elts / NumMaskElts;
+ unsigned Scale2 = NumV2Elts / NumMaskElts;
+ APInt DemandedZeroV1 = APInt::getNullValue(NumV1Elts);
+ APInt DemandedZeroV2 = APInt::getNullValue(NumV2Elts);
+ for (unsigned i = 0; i != NumMaskElts; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ if (M == SM_SentinelZero) {
+ DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
+ DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
+ continue;
+ }
+ if (M == (int)i) {
+ DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
+ continue;
+ }
+ if (M == (int)(i + NumMaskElts)) {
+ DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
+ continue;
+ }
+ IsBlend = false;
+ break;
+ }
+ if (IsBlend &&
+ DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
+ DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
+ Shuffle = ISD::OR;
+ SrcVT = DstVT = MaskVT.changeTypeToInteger();
+ return true;
+ }
+ }
+
return false;
}
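As a concrete illustration of the blend-as-OR test added above (hypothetical values, not part of the patch): for a 4-element mask {0, 5, SM_SentinelZero, 7} with Scale1 == Scale2 == 1, element 0 is taken from V1 so V2[0] must be known zero, elements 1 and 3 are taken from V2 so V1[1] and V1[3] must be known zero, and the explicit zero element demands a zero in both sources. The standalone sketch below (computeBlendDemandedZeros is an illustrative helper) reproduces that bookkeeping with plain integers:

    #include <cstdint>
    #include <vector>

    // Sentinels mirroring the meaning of SM_SentinelUndef/SM_SentinelZero.
    constexpr int kUndef = -1;
    constexpr int kZero = -2;

    // Returns true if Mask is a blend of V1/V2 elements, filling the bitmasks
    // of source elements that must be known zero for an OR to be legal.
    // Assumes Mask.size() <= 64 so a uint64_t bitmask suffices.
    static bool computeBlendDemandedZeros(const std::vector<int> &Mask,
                                          uint64_t &DemandedZeroV1,
                                          uint64_t &DemandedZeroV2) {
      const int NumElts = static_cast<int>(Mask.size());
      DemandedZeroV1 = DemandedZeroV2 = 0;
      for (int I = 0; I != NumElts; ++I) {
        int M = Mask[I];
        if (M == kUndef)
          continue;
        if (M == kZero) {
          DemandedZeroV1 |= UINT64_C(1) << I; // both sources must be zero here
          DemandedZeroV2 |= UINT64_C(1) << I;
        } else if (M == I) {
          DemandedZeroV2 |= UINT64_C(1) << I; // element taken from V1
        } else if (M == I + NumElts) {
          DemandedZeroV1 |= UINT64_C(1) << I; // element taken from V2
        } else {
          return false;                       // not a simple blend
        }
      }
      return true;
    }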
@@ -34462,6 +35292,16 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
assert((Inputs.size() == 1 || Inputs.size() == 2) &&
"Unexpected number of shuffle inputs!");
+ MVT RootVT = Root.getSimpleValueType();
+ unsigned RootSizeInBits = RootVT.getSizeInBits();
+ unsigned NumRootElts = RootVT.getVectorNumElements();
+
+ // Canonicalize shuffle input op to the requested type.
+ // TODO: Support cases where Op is smaller than VT.
+ auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
+ return DAG.getBitcast(VT, Op);
+ };
+
// Find the inputs that enter the chain. Note that multiple uses are OK
// here, we're not going to remove the operands we find.
bool UnaryShuffle = (Inputs.size() == 1);
@@ -34471,10 +35311,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
MVT VT1 = V1.getSimpleValueType();
MVT VT2 = V2.getSimpleValueType();
- MVT RootVT = Root.getSimpleValueType();
- assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
- VT2.getSizeInBits() == RootVT.getSizeInBits() &&
- "Vector size mismatch");
+ assert(VT1.getSizeInBits() == RootSizeInBits &&
+ VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch");
SDLoc DL(Root);
SDValue Res;
@@ -34482,12 +35320,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
unsigned NumBaseMaskElts = BaseMask.size();
if (NumBaseMaskElts == 1) {
assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
- return DAG.getBitcast(RootVT, V1);
+ return CanonicalizeShuffleInput(RootVT, V1);
}
bool OptForSize = DAG.shouldOptForSize();
- unsigned RootSizeInBits = RootVT.getSizeInBits();
- unsigned NumRootElts = RootVT.getVectorNumElements();
unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
(RootVT.isFloatingPoint() && Depth >= 1) ||
@@ -34508,33 +35344,14 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// we can just use the broadcast directly. This works for smaller broadcast
// elements as well, as they already repeat across each mask element.
if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
- (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0) {
- return DAG.getBitcast(RootVT, V1);
- }
-
- // Attempt to match a subvector broadcast.
- // shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0)
- if (UnaryShuffle &&
- (BaseMaskEltSizeInBits == 128 || BaseMaskEltSizeInBits == 256)) {
- SmallVector<int, 64> BroadcastMask(NumBaseMaskElts, 0);
- if (isTargetShuffleEquivalent(BaseMask, BroadcastMask)) {
- SDValue Src = Inputs[0];
- if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
- Src.getOperand(0).isUndef() &&
- Src.getOperand(1).getValueSizeInBits() == BaseMaskEltSizeInBits &&
- MayFoldLoad(Src.getOperand(1)) && isNullConstant(Src.getOperand(2))) {
- return DAG.getBitcast(RootVT, DAG.getNode(X86ISD::SUBV_BROADCAST, DL,
- Src.getValueType(),
- Src.getOperand(1)));
- }
- }
+ (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
+ V1.getValueSizeInBits() >= RootSizeInBits) {
+ return CanonicalizeShuffleInput(RootVT, V1);
}
// Handle 128/256-bit lane shuffles of 512-bit vectors.
if (RootVT.is512BitVector() &&
(NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
- MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
-
// If the upper subvectors are zeroable, then an extract+insert is more
// optimal than using X86ISD::SHUF128. The insertion is free, even if it has
// to zero the upper subvectors.
@@ -34543,12 +35360,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return SDValue(); // Nothing to do!
assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&
"Unexpected lane shuffle");
- Res = DAG.getBitcast(ShuffleVT, V1);
- unsigned SubIdx = BaseMask[0] * (8 / NumBaseMaskElts);
+ Res = CanonicalizeShuffleInput(RootVT, V1);
+ unsigned SubIdx = BaseMask[0] * (NumRootElts / NumBaseMaskElts);
bool UseZero = isAnyZero(BaseMask);
Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
- Res = widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
- return DAG.getBitcast(RootVT, Res);
+ return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
}
// Narrow shuffle mask to v4x128.
@@ -34557,8 +35373,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, Mask);
// Try to lower to vshuf64x2/vshuf32x4.
- auto MatchSHUF128 = [](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask,
- SDValue V1, SDValue V2, SelectionDAG &DAG) {
+ auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2, SelectionDAG &DAG) {
unsigned PermMask = 0;
// Ensure elements came from the same Op.
SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
@@ -34581,8 +35397,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
- DAG.getBitcast(ShuffleVT, Ops[0]),
- DAG.getBitcast(ShuffleVT, Ops[1]),
+ CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
+ CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
DAG.getTargetConstant(PermMask, DL, MVT::i8));
};
@@ -34597,6 +35413,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2));
if (!isAnyZero(Mask) && !PreferPERMQ) {
+ if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
+ return SDValue(); // Nothing to do!
+ MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG))
return DAG.getBitcast(RootVT, V);
}
@@ -34604,8 +35423,6 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Handle 128-bit lane shuffles of 256-bit vectors.
if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
- MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
-
// If the upper half is zeroable, then an extract+insert is more optimal
// than using X86ISD::VPERM2X128. The insertion is free, even if it has to
// zero the upper half.
@@ -34613,11 +35430,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
return SDValue(); // Nothing to do!
assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
- Res = DAG.getBitcast(ShuffleVT, V1);
- Res = extract128BitVector(Res, BaseMask[0] * 2, DAG, DL);
- Res = widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
- DL, 256);
- return DAG.getBitcast(RootVT, Res);
+ Res = CanonicalizeShuffleInput(RootVT, V1);
+ Res = extract128BitVector(Res, BaseMask[0] * (NumRootElts / 2), DAG, DL);
+ return widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
+ DL, 256);
}
if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
@@ -34632,12 +35448,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
unsigned PermMask = 0;
PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
-
- Res = DAG.getBitcast(ShuffleVT, V1);
- Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
- DAG.getUNDEF(ShuffleVT),
- DAG.getTargetConstant(PermMask, DL, MVT::i8));
- return DAG.getBitcast(RootVT, Res);
+ return DAG.getNode(
+ X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
+ DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
@@ -34653,13 +35466,12 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
unsigned PermMask = 0;
PermMask |= ((BaseMask[0] & 3) << 0);
PermMask |= ((BaseMask[1] & 3) << 4);
-
- Res = DAG.getNode(
- X86ISD::VPERM2X128, DL, ShuffleVT,
- DAG.getBitcast(ShuffleVT, isInRange(BaseMask[0], 0, 2) ? V1 : V2),
- DAG.getBitcast(ShuffleVT, isInRange(BaseMask[1], 0, 2) ? V1 : V2),
- DAG.getTargetConstant(PermMask, DL, MVT::i8));
- return DAG.getBitcast(RootVT, Res);
+ SDValue LHS = isInRange(BaseMask[0], 0, 2) ? V1 : V2;
+ SDValue RHS = isInRange(BaseMask[1], 0, 2) ? V1 : V2;
+ return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
+ CanonicalizeShuffleInput(RootVT, LHS),
+ CanonicalizeShuffleInput(RootVT, RHS),
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
}
}
@@ -34721,8 +35533,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if ((Subtarget.hasAVX2() ||
(Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
(!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
- SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
- if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
+ if (isUndefOrEqual(Mask, 0)) {
if (V1.getValueType() == MaskVT &&
V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
MayFoldLoad(V1.getOperand(0))) {
@@ -34735,7 +35546,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (Subtarget.hasAVX2()) {
if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
return SDValue(); // Nothing to do!
- Res = DAG.getBitcast(MaskVT, V1);
+ Res = CanonicalizeShuffleInput(MaskVT, V1);
Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
return DAG.getBitcast(RootVT, Res);
}
@@ -34750,7 +35561,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
- Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
+ Res = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
return DAG.getBitcast(RootVT, Res);
}
@@ -34762,7 +35573,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
- Res = DAG.getBitcast(ShuffleVT, V1);
+ Res = CanonicalizeShuffleInput(ShuffleVT, V1);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
@@ -34773,16 +35584,32 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// from a scalar.
// TODO: Handle other insertions here as well?
if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
- MaskEltSizeInBits == 32 && Subtarget.hasSSE41() &&
- !isTargetShuffleEquivalent(Mask, {4, 1, 2, 3})) {
- SDValue SrcV1 = V1, SrcV2 = V2;
- if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask, DAG) &&
- SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ Subtarget.hasSSE41() &&
+ !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3})) {
+ if (MaskEltSizeInBits == 32) {
+ SDValue SrcV1 = V1, SrcV2 = V2;
+ if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
+ DAG) &&
+ SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
+ return SDValue(); // Nothing to do!
+ Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
+ CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
+ CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
+ DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
+ if (MaskEltSizeInBits == 64 &&
+ isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}) &&
+ V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ V2.getScalarValueSizeInBits() <= 32) {
if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
return SDValue(); // Nothing to do!
+ PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0);
Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
- DAG.getBitcast(MVT::v4f32, SrcV1),
- DAG.getBitcast(MVT::v4f32, SrcV2),
+ CanonicalizeShuffleInput(MVT::v4f32, V1),
+ CanonicalizeShuffleInput(MVT::v4f32, V2),
DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
@@ -34796,8 +35623,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
- NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
- NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
+ NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
+ NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
return DAG.getBitcast(RootVT, Res);
}
@@ -34810,8 +35637,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
- NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
- NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
+ NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
+ NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
@@ -34828,7 +35655,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
Zeroable)) {
if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
return SDValue(); // Nothing to do!
- V1 = DAG.getBitcast(IntMaskVT, V1);
+ V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
DAG.getTargetConstant(BitLen, DL, MVT::i8),
DAG.getTargetConstant(BitIdx, DL, MVT::i8));
@@ -34838,8 +35665,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
return SDValue(); // Nothing to do!
- V1 = DAG.getBitcast(IntMaskVT, V1);
- V2 = DAG.getBitcast(IntMaskVT, V2);
+ V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
+ V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
DAG.getTargetConstant(BitLen, DL, MVT::i8),
DAG.getTargetConstant(BitIdx, DL, MVT::i8));
@@ -34858,7 +35685,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
if (Depth == 0 && Root.getOpcode() == Opc)
return SDValue(); // Nothing to do!
- V1 = DAG.getBitcast(ShuffleSrcVT, V1);
+ V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
if (ShuffleVT.getSizeInBits() < RootSizeInBits)
Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
@@ -34875,8 +35702,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return SDValue(); // Nothing to do!
ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
- V1 = DAG.getBitcast(ShuffleSrcVT, V1);
- V2 = DAG.getBitcast(ShuffleSrcVT, V2);
+ V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
+ V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
@@ -34893,49 +35720,56 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Depth threshold above which we can efficiently use variable mask shuffles.
int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
+ // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
+ // higher depth before combining them.
+ bool AllowBWIVPERMV3 = (Depth >= 2 || HasVariableMask);
bool MaskContainsZeros = isAnyZero(Mask);
if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
// If we have a single input lane-crossing shuffle then lower to VPERMV.
- if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
- ((Subtarget.hasAVX2() &&
- (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
- (Subtarget.hasAVX512() &&
- (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
- MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
- (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
- (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
- (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
- (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
- SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
- Res = DAG.getBitcast(MaskVT, V1);
- Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
- return DAG.getBitcast(RootVT, Res);
+ if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros) {
+ if (Subtarget.hasAVX2() &&
+ (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
+ SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
+ Res = CanonicalizeShuffleInput(MaskVT, V1);
+ Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
+ return DAG.getBitcast(RootVT, Res);
+ }
+ // AVX512 variants (non-VLX will pad to 512-bit shuffles).
+ if ((Subtarget.hasAVX512() &&
+ (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+ MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
+ (Subtarget.hasBWI() &&
+ (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
+ (Subtarget.hasVBMI() &&
+ (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
+ V1 = CanonicalizeShuffleInput(MaskVT, V1);
+ V2 = DAG.getUNDEF(MaskVT);
+ Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
+ return DAG.getBitcast(RootVT, Res);
+ }
}
// Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
- // vector as the second source.
+ // vector as the second source (non-VLX will pad to 512-bit shuffles).
if (UnaryShuffle && AllowVariableMask &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+ MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
+ MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
- (Subtarget.hasVLX() &&
- (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
- MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
- (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
- (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
- (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
- (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
+ (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
+ (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
+ (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
+ (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
// Adjust shuffle mask - replace SM_SentinelZero with second source index.
for (unsigned i = 0; i != NumMaskElts; ++i)
if (Mask[i] == SM_SentinelZero)
Mask[i] = NumMaskElts + i;
-
- SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
- Res = DAG.getBitcast(MaskVT, V1);
- SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
- Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
+ V1 = CanonicalizeShuffleInput(MaskVT, V1);
+ V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
+ Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
return DAG.getBitcast(RootVT, Res);
}
@@ -34946,22 +35780,21 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DAG, Subtarget))
return WideShuffle;
- // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
+ // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
+ // (non-VLX will pad to 512-bit shuffles).
if (AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
- MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
- (Subtarget.hasVLX() &&
- (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
+ MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
+ MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
- (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
- (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
- (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
- (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
- SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
- V1 = DAG.getBitcast(MaskVT, V1);
- V2 = DAG.getBitcast(MaskVT, V2);
- Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
+ (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
+ (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
+ (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
+ (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
+ V1 = CanonicalizeShuffleInput(MaskVT, V1);
+ V2 = CanonicalizeShuffleInput(MaskVT, V2);
+ Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
return DAG.getBitcast(RootVT, Res);
}
return SDValue();
@@ -34987,7 +35820,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
EltBits[i] = AllOnes;
}
SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
- Res = DAG.getBitcast(MaskVT, V1);
+ Res = CanonicalizeShuffleInput(MaskVT, V1);
unsigned AndOpcode =
MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
@@ -35007,7 +35840,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
VPermIdx.push_back(Idx);
}
SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
- Res = DAG.getBitcast(MaskVT, V1);
+ Res = CanonicalizeShuffleInput(MaskVT, V1);
Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
return DAG.getBitcast(RootVT, Res);
}
@@ -35039,8 +35872,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
VPerm2Idx.push_back(Index);
}
- V1 = DAG.getBitcast(MaskVT, V1);
- V2 = DAG.getBitcast(MaskVT, V2);
+ V1 = CanonicalizeShuffleInput(MaskVT, V1);
+ V2 = CanonicalizeShuffleInput(MaskVT, V2);
SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
@@ -35074,7 +35907,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
- Res = DAG.getBitcast(ByteVT, V1);
+ Res = CanonicalizeShuffleInput(ByteVT, V1);
SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
return DAG.getBitcast(RootVT, Res);
@@ -35104,8 +35937,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
MVT ByteVT = MVT::v16i8;
- V1 = DAG.getBitcast(ByteVT, V1);
- V2 = DAG.getBitcast(ByteVT, V2);
+ V1 = CanonicalizeShuffleInput(ByteVT, V1);
+ V2 = CanonicalizeShuffleInput(ByteVT, V2);
SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
return DAG.getBitcast(RootVT, Res);
@@ -35118,25 +35951,22 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DAG, Subtarget))
return WideShuffle;
- // If we have a dual input shuffle then lower to VPERMV3.
+ // If we have a dual input shuffle then lower to VPERMV3,
+ // (non-VLX will pad to 512-bit shuffles).
if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
- (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
- MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
- (Subtarget.hasVLX() &&
- (MaskVT == MVT::v2f64 || MaskVT == MVT::v2i64 || MaskVT == MVT::v4f64 ||
- MaskVT == MVT::v4i64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 ||
- MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
- (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
- (Subtarget.hasBWI() && Subtarget.hasVLX() &&
- (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16)) ||
- (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
- (Subtarget.hasVBMI() && Subtarget.hasVLX() &&
- (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8)))) {
- SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
- V1 = DAG.getBitcast(MaskVT, V1);
- V2 = DAG.getBitcast(MaskVT, V2);
- Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
+ (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
+ MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
+ MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
+ MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
+ MaskVT == MVT::v16i32)) ||
+ (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
+ (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
+ (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
+ (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
+ V1 = CanonicalizeShuffleInput(MaskVT, V1);
+ V2 = CanonicalizeShuffleInput(MaskVT, V2);
+ Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
return DAG.getBitcast(RootVT, Res);
}
@@ -35161,12 +35991,16 @@ static SDValue combineX86ShuffleChainWithExtract(
if (NumInputs == 0)
return SDValue();
+ EVT RootVT = Root.getValueType();
+ unsigned RootSizeInBits = RootVT.getSizeInBits();
+ assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
+
SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
SmallVector<unsigned, 4> Offsets(NumInputs, 0);
// Peek through subvectors.
// TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
- unsigned WideSizeInBits = WideInputs[0].getValueSizeInBits();
+ unsigned WideSizeInBits = RootSizeInBits;
for (unsigned i = 0; i != NumInputs; ++i) {
SDValue &Src = WideInputs[i];
unsigned &Offset = Offsets[i];
@@ -35189,8 +36023,6 @@ static SDValue combineX86ShuffleChainWithExtract(
if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
return SDValue();
- EVT RootVT = Root.getValueType();
- unsigned RootSizeInBits = RootVT.getSizeInBits();
unsigned Scale = WideSizeInBits / RootSizeInBits;
assert((WideSizeInBits % RootSizeInBits) == 0 &&
"Unexpected subvector extraction");
@@ -35250,6 +36082,149 @@ static SDValue combineX86ShuffleChainWithExtract(
return SDValue();
}
+// Canonicalize the combined shuffle mask chain with horizontal ops.
+// NOTE: This may update the Ops and Mask.
+static SDValue canonicalizeShuffleMaskWithHorizOp(
+ MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
+ unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (Mask.empty() || Ops.empty())
+ return SDValue();
+
+ SmallVector<SDValue> BC;
+ for (SDValue Op : Ops)
+ BC.push_back(peekThroughBitcasts(Op));
+
+ // All ops must be the same horizop + type.
+ SDValue BC0 = BC[0];
+ EVT VT0 = BC0.getValueType();
+ unsigned Opcode0 = BC0.getOpcode();
+ if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
+ return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
+ }))
+ return SDValue();
+
+ bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
+ Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
+ bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
+ if (!isHoriz && !isPack)
+ return SDValue();
+
+ int NumElts = VT0.getVectorNumElements();
+ int NumLanes = VT0.getSizeInBits() / 128;
+ int NumEltsPerLane = NumElts / NumLanes;
+ int NumHalfEltsPerLane = NumEltsPerLane / 2;
+
+ // See if we can remove the shuffle by reordering the HOP chain so that
+ // the HOP args are pre-shuffled.
+ // TODO: Generalize to any sized/depth chain.
+ // TODO: Add support for PACKSS/PACKUS.
+ if (isHoriz && NumEltsPerLane == 4 && VT0.is128BitVector() &&
+ shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget)) {
+ SmallVector<int> ScaledMask;
+ if (scaleShuffleElements(Mask, 4, ScaledMask)) {
+ // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
+ auto GetHOpSrc = [&](int M) {
+ if (M == SM_SentinelUndef)
+ return DAG.getUNDEF(VT0);
+ if (M == SM_SentinelZero)
+ return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
+ SDValue Src0 = BC[M / NumElts];
+ SDValue Src1 = Src0.getOperand((M % 4) >= 2);
+ if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
+ return Src1.getOperand(M % 2);
+ return SDValue();
+ };
+ SDValue M0 = GetHOpSrc(ScaledMask[0]);
+ SDValue M1 = GetHOpSrc(ScaledMask[1]);
+ SDValue M2 = GetHOpSrc(ScaledMask[2]);
+ SDValue M3 = GetHOpSrc(ScaledMask[3]);
+ if (M0 && M1 && M2 && M3) {
+ SDValue LHS = DAG.getNode(Opcode0, DL, VT0, M0, M1);
+ SDValue RHS = DAG.getNode(Opcode0, DL, VT0, M2, M3);
+ return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
+ }
+ }
+ }
+
+ if (2 < Ops.size())
+ return SDValue();
+
+ SDValue BC1 = BC[BC.size() - 1];
+ if (Mask.size() == VT0.getVectorNumElements()) {
+ // Canonicalize binary shuffles of horizontal ops that use the
+ // same sources to a unary shuffle.
+ // TODO: Try to perform this fold even if the shuffle remains.
+ if (Ops.size() == 2) {
+ auto ContainsOps = [](SDValue HOp, SDValue Op) {
+ return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
+ };
+ // Commute if all BC0's ops are contained in BC1.
+ if (ContainsOps(BC1, BC0.getOperand(0)) &&
+ ContainsOps(BC1, BC0.getOperand(1))) {
+ ShuffleVectorSDNode::commuteMask(Mask);
+ std::swap(Ops[0], Ops[1]);
+ std::swap(BC0, BC1);
+ }
+
+ // If BC1 can be represented by BC0, then convert to unary shuffle.
+ if (ContainsOps(BC0, BC1.getOperand(0)) &&
+ ContainsOps(BC0, BC1.getOperand(1))) {
+ for (int &M : Mask) {
+ if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
+ continue;
+ int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
+ M -= NumElts + (SubLane * NumHalfEltsPerLane);
+ if (BC1.getOperand(SubLane) != BC0.getOperand(0))
+ M += NumHalfEltsPerLane;
+ }
+ }
+ }
+
+ // Canonicalize unary horizontal ops to only refer to lower halves.
+ for (int i = 0; i != NumElts; ++i) {
+ int &M = Mask[i];
+ if (isUndefOrZero(M))
+ continue;
+ if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
+ (M % NumEltsPerLane) >= NumHalfEltsPerLane)
+ M -= NumHalfEltsPerLane;
+ if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
+ (M % NumEltsPerLane) >= NumHalfEltsPerLane)
+ M -= NumHalfEltsPerLane;
+ }
+ }
+
+ // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
+ // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
+ // represents the LHS/RHS inputs for the lower/upper halves.
+ unsigned EltSizeInBits = RootSizeInBits / Mask.size();
+ SmallVector<int, 16> TargetMask128, WideMask128;
+ if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
+ scaleShuffleElements(TargetMask128, 2, WideMask128)) {
+ assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
+ bool SingleOp = (Ops.size() == 1);
+ if (!isHoriz || shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
+ SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
+ SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
+ Lo = Lo.getOperand(WideMask128[0] & 1);
+ Hi = Hi.getOperand(WideMask128[1] & 1);
+ if (SingleOp) {
+ MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
+ SDValue Undef = DAG.getUNDEF(SrcVT);
+ SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
+ Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
+ Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
+ Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
+ Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
+ }
+ return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
+ }
+ }
+
+ return SDValue();
+}
+
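The mask bookkeeping in canonicalizeShuffleMaskWithHorizOp leans on the element layout of the 128-bit horizontal ops: the low half of each lane is built from pairwise sums of the first operand and the high half from the second operand. A tiny scalar model of a v4f32 FHADD (illustration only; fhadd128 is a hypothetical helper, not part of the patch) makes that layout explicit:

    #include <array>

    // Result = { X[0]+X[1], X[2]+X[3], Y[0]+Y[1], Y[2]+Y[3] }, i.e. the lower
    // half comes entirely from X and the upper half from Y, which is why a
    // shuffle that only reads lower halves can be folded into the HOP's
    // operands directly.
    static std::array<float, 4> fhadd128(const std::array<float, 4> &X,
                                         const std::array<float, 4> &Y) {
      return {X[0] + X[1], X[2] + X[3], Y[0] + Y[1], Y[2] + Y[3]};
    }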
// Attempt to constant fold all of the constant source ops.
// Returns true if the entire shuffle is folded to a constant.
// TODO: Extend this to merge multiple constant Ops and update the mask.
@@ -35341,6 +36316,14 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
return DAG.getBitcast(VT, CstOp);
}
+namespace llvm {
+ namespace X86 {
+ enum {
+ MaxShuffleCombineDepth = 8
+ };
+ }
+} // namespace llvm
+
/// Fully generic combining of x86 shuffle instructions.
///
/// This should be the last combine run over the x86 shuffle instructions. Once
@@ -35373,31 +36356,30 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
static SDValue combineX86ShufflesRecursively(
ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
- bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+ unsigned MaxDepth, bool HasVariableMask, bool AllowVariableMask,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget) {
assert(RootMask.size() > 0 &&
(RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
"Illegal shuffle root mask");
+ assert(Root.getSimpleValueType().isVector() &&
+ "Shuffles operate on vector types!");
+ unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
// Bound the depth of our recursive combine because this is ultimately
// quadratic in nature.
- const unsigned MaxRecursionDepth = 8;
- if (Depth >= MaxRecursionDepth)
+ if (Depth >= MaxDepth)
return SDValue();
// Directly rip through bitcasts to find the underlying operand.
SDValue Op = SrcOps[SrcOpIndex];
Op = peekThroughOneUseBitcasts(Op);
- MVT VT = Op.getSimpleValueType();
- if (!VT.isVector())
- return SDValue(); // Bail if we hit a non-vector.
+ EVT VT = Op.getValueType();
+ if (!VT.isVector() || !VT.isSimple())
+ return SDValue(); // Bail if we hit a non-simple non-vector.
- assert(Root.getSimpleValueType().isVector() &&
- "Shuffles operate on vector types!");
- unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
- assert(VT.getSizeInBits() == RootSizeInBits &&
- "Can only combine shuffles of the same vector register size.");
+ assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
+ "Can only combine shuffles up to the size of the root op.");
// Extract target shuffle mask and resolve sentinels and inputs.
// TODO - determine Op's demanded elts from RootMask.
@@ -35410,17 +36392,32 @@ static SDValue combineX86ShufflesRecursively(
OpZero, DAG, Depth, false))
return SDValue();
- // Shuffle inputs must be the same size as the result, bail on any larger
- // inputs and widen any smaller inputs.
- if (llvm::any_of(OpInputs, [RootSizeInBits](SDValue Op) {
- return Op.getValueSizeInBits() > RootSizeInBits;
+ // Shuffle inputs must not be larger than the shuffle result.
+ // TODO: Relax this for single input faux shuffles (trunc/extract_subvector).
+ if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
+ return OpInput.getValueSizeInBits() > VT.getSizeInBits();
}))
return SDValue();
- for (SDValue &Op : OpInputs)
- if (Op.getValueSizeInBits() < RootSizeInBits)
- Op = widenSubVector(peekThroughOneUseBitcasts(Op), false, Subtarget, DAG,
- SDLoc(Op), RootSizeInBits);
+ // If the shuffle result was smaller than the root, we need to adjust the
+ // mask indices and pad the mask with undefs.
+ if (RootSizeInBits > VT.getSizeInBits()) {
+ unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
+ unsigned OpMaskSize = OpMask.size();
+ if (OpInputs.size() > 1) {
+ unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
+ for (int &M : OpMask) {
+ if (M < 0)
+ continue;
+ int EltIdx = M % OpMaskSize;
+ int OpIdx = M / OpMaskSize;
+ M = (PaddedMaskSize * OpIdx) + EltIdx;
+ }
+ }
+ OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
+ OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
+ OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
+ }
SmallVector<int, 64> Mask;
SmallVector<SDValue, 16> Ops;
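The index rewrite added above can be checked with plain integers (hypothetical sizes; padNarrowShuffleMask is an illustrative helper, not part of the patch): with a 128-bit shuffle feeding a 256-bit root, NumSubVecs == 2 and OpMaskSize == 4, so for a two-input shuffle a mask entry M == 5 (element 1 of the second input) becomes 8 * 1 + 1 == 9 in the padded index space, and four trailing undef sentinels are appended:

    #include <vector>

    constexpr int kUndef = -1; // stands in for SM_SentinelUndef

    static std::vector<int> padNarrowShuffleMask(std::vector<int> OpMask,
                                                 unsigned NumSubVecs,
                                                 bool MultipleInputs) {
      const int OpMaskSize = static_cast<int>(OpMask.size());
      const int PaddedMaskSize = static_cast<int>(NumSubVecs) * OpMaskSize;
      if (MultipleInputs) {
        for (int &M : OpMask) {
          if (M < 0)
            continue;
          int EltIdx = M % OpMaskSize;
          int OpIdx = M / OpMaskSize;
          M = PaddedMaskSize * OpIdx + EltIdx; // re-base into the padded space
        }
      }
      // Pad the remaining (NumSubVecs - 1) subvectors with undef sentinels.
      OpMask.insert(OpMask.end(), (NumSubVecs - 1) * OpMaskSize, kUndef);
      return OpMask;
    }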
@@ -35561,10 +36558,6 @@ static SDValue combineX86ShufflesRecursively(
// Handle the all undef/zero cases early.
if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
return DAG.getUNDEF(Root.getValueType());
-
- // TODO - should we handle the mixed zero/undef case as well? Just returning
- // a zero mask will lose information on undef elements possibly reducing
- // future combine possibilities.
if (all_of(Mask, [](int Idx) { return Idx < 0; }))
return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
SDLoc(Root));
@@ -35584,7 +36577,7 @@ static SDValue combineX86ShufflesRecursively(
// shuffles to avoid constant pool bloat.
// Don't recurse if we already have more source ops than we can combine in
// the remaining recursion depth.
- if (Ops.size() < (MaxRecursionDepth - Depth)) {
+ if (Ops.size() < (MaxDepth - Depth)) {
for (int i = 0, e = Ops.size(); i < e; ++i) {
// For empty roots, we need to resolve zeroable elements before combining
// them with other shuffles.
@@ -35596,7 +36589,7 @@ static SDValue combineX86ShufflesRecursively(
SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
AllowVar = AllowVariableMask;
if (SDValue Res = combineX86ShufflesRecursively(
- Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1,
+ Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
HasVariableMask, AllowVar, DAG, Subtarget))
return Res;
}
@@ -35607,6 +36600,24 @@ static SDValue combineX86ShufflesRecursively(
Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
return Cst;
+ // Canonicalize the combined shuffle mask chain with horizontal ops.
+ // NOTE: This will update the Ops and Mask.
+ if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
+ Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
+ return DAG.getBitcast(Root.getValueType(), HOp);
+
+ // Widen any subvector shuffle inputs we've collected.
+ if (any_of(Ops, [RootSizeInBits](SDValue Op) {
+ return Op.getValueSizeInBits() < RootSizeInBits;
+ })) {
+ for (SDValue &Op : Ops)
+ if (Op.getValueSizeInBits() < RootSizeInBits)
+ Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
+ RootSizeInBits);
+ // Reresolve - we might have repeated subvector sources.
+ resolveTargetShuffleInputsAndMask(Ops, Mask);
+ }
+
// We can only combine unary and binary shuffle mask cases.
if (Ops.size() <= 2) {
// Minor canonicalization of the accumulated shuffle mask to make it easier
@@ -35614,8 +36625,10 @@ static SDValue combineX86ShufflesRecursively(
// elements, and shrink them to the half-width mask. It does this in a loop
// so it will reduce the size of the mask to the minimal width mask which
// performs an equivalent shuffle.
- SmallVector<int, 64> WidenedMask;
- while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
+ while (Mask.size() > 1) {
+ SmallVector<int, 64> WidenedMask;
+ if (!canWidenShuffleElements(Mask, WidenedMask))
+ break;
Mask = std::move(WidenedMask);
}
@@ -35642,6 +36655,7 @@ static SDValue combineX86ShufflesRecursively(
static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0,
+ X86::MaxShuffleCombineDepth,
/*HasVarMask*/ false,
/*AllowVarMask*/ true, DAG, Subtarget);
}
@@ -35875,6 +36889,54 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
return SDValue();
}
+/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
+static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
+ SelectionDAG &DAG,
+ const SDLoc &DL) {
+ assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
+
+ MVT VT = V.getSimpleValueType();
+ SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
+ SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
+ unsigned SrcOpc0 = Src0.getOpcode();
+ unsigned SrcOpc1 = Src1.getOpcode();
+ EVT SrcVT0 = Src0.getValueType();
+ EVT SrcVT1 = Src1.getValueType();
+
+ if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
+ return SDValue();
+
+ switch (SrcOpc0) {
+ case X86ISD::MOVDDUP: {
+ SDValue LHS = DAG.getBitcast(VT, Src0.getOperand(0));
+ SDValue RHS =
+ DAG.getBitcast(VT, Src1.isUndef() ? Src1 : Src1.getOperand(0));
+ SDValue Res =
+ DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, RHS, V.getOperand(2));
+ Res = DAG.getNode(SrcOpc0, DL, SrcVT0, DAG.getBitcast(SrcVT0, Res));
+ return DAG.getBitcast(VT, Res);
+ }
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI:
+ case X86ISD::VSRAI:
+ case X86ISD::PSHUFD:
+ case X86ISD::VPERMILPI:
+ if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
+ SDValue LHS = DAG.getBitcast(VT, Src0.getOperand(0));
+ SDValue RHS =
+ DAG.getBitcast(VT, Src1.isUndef() ? Src1 : Src1.getOperand(0));
+ SDValue Res =
+ DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, RHS, V.getOperand(2));
+ Res = DAG.getNode(SrcOpc0, DL, SrcVT0, DAG.getBitcast(SrcVT0, Res),
+ Src0.getOperand(1));
+ return DAG.getBitcast(VT, Res);
+ }
+ break;
+ }
+
+ return SDValue();
+}
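canonicalizeLaneShuffleWithRepeatedOps relies on ops that act independently inside each 128-bit lane commuting with a whole-lane permute. A standalone model (plain C++ over arrays of four 64-bit elements; not LLVM or intrinsic APIs, function names made up) that checks this for the MOVDDUP case, including the zeroing bits of the VPERM2X128 immediate:

#include <array>
#include <cassert>
#include <cstdint>

using V4 = std::array<uint64_t, 4>;

// VPERM2X128: each nibble of the immediate picks one 128-bit lane out of the
// concatenation {A.lane0, A.lane1, B.lane0, B.lane1}; bit 3 of the nibble
// zeroes that half of the result instead.
static V4 vperm2x128(const V4 &A, const V4 &B, unsigned Imm) {
  auto Lane = [&](unsigned Sel) -> std::array<uint64_t, 2> {
    if (Sel & 0x8)
      return {0, 0};
    const V4 &Src = (Sel & 0x2) ? B : A;
    unsigned Base = (Sel & 0x1) ? 2 : 0;
    return {Src[Base], Src[Base + 1]};
  };
  std::array<uint64_t, 2> Lo = Lane(Imm & 0xF), Hi = Lane((Imm >> 4) & 0xF);
  return {Lo[0], Lo[1], Hi[0], Hi[1]};
}

// MOVDDUP duplicates the even element inside each 128-bit lane.
static V4 movddup(const V4 &A) { return {A[0], A[0], A[2], A[2]}; }

int main() {
  V4 X = {1, 2, 3, 4}, Y = {5, 6, 7, 8};
  // Lane-local ops commute with a whole-lane permute, the property the
  // canonicalization above depends on.
  for (unsigned Imm : {0x20u, 0x31u, 0x02u, 0x81u})
    assert(vperm2x128(movddup(X), movddup(Y), Imm) ==
           movddup(vperm2x128(X, Y, Imm)));
  return 0;
}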
+
/// Try to combine x86 target specific shuffles.
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -35884,59 +36946,6 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
SmallVector<int, 4> Mask;
unsigned Opcode = N.getOpcode();
- bool IsUnary;
- SmallVector<int, 64> TargetMask;
- SmallVector<SDValue, 2> TargetOps;
- if (isTargetShuffle(Opcode))
- getTargetShuffleMask(N.getNode(), VT, true, TargetOps, TargetMask, IsUnary);
-
- // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
- // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
- // represents the LHS/RHS inputs for the lower/upper halves.
- SmallVector<int, 16> TargetMask128;
- if (!TargetMask.empty() && 0 < TargetOps.size() && TargetOps.size() <= 2 &&
- isRepeatedTargetShuffleMask(128, VT, TargetMask, TargetMask128)) {
- SmallVector<int, 16> WidenedMask128 = TargetMask128;
- while (WidenedMask128.size() > 2) {
- SmallVector<int, 16> WidenedMask;
- if (!canWidenShuffleElements(WidenedMask128, WidenedMask))
- break;
- WidenedMask128 = std::move(WidenedMask);
- }
- if (WidenedMask128.size() == 2) {
- assert(isUndefOrZeroOrInRange(WidenedMask128, 0, 4) && "Illegal shuffle");
- SDValue BC0 = peekThroughBitcasts(TargetOps.front());
- SDValue BC1 = peekThroughBitcasts(TargetOps.back());
- EVT VT0 = BC0.getValueType();
- EVT VT1 = BC1.getValueType();
- unsigned Opcode0 = BC0.getOpcode();
- unsigned Opcode1 = BC1.getOpcode();
- bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
- Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
- if (Opcode0 == Opcode1 && VT0 == VT1 &&
- (isHoriz || Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
- bool SingleOp = (TargetOps.size() == 1);
- if (!isHoriz || shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
- SDValue Lo = isInRange(WidenedMask128[0], 0, 2) ? BC0 : BC1;
- SDValue Hi = isInRange(WidenedMask128[1], 0, 2) ? BC0 : BC1;
- Lo = Lo.getOperand(WidenedMask128[0] & 1);
- Hi = Hi.getOperand(WidenedMask128[1] & 1);
- if (SingleOp) {
- MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
- SDValue Undef = DAG.getUNDEF(SrcVT);
- SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
- Lo = (WidenedMask128[0] == SM_SentinelZero ? Zero : Lo);
- Hi = (WidenedMask128[1] == SM_SentinelZero ? Zero : Hi);
- Lo = (WidenedMask128[0] == SM_SentinelUndef ? Undef : Lo);
- Hi = (WidenedMask128[1] == SM_SentinelUndef ? Undef : Hi);
- }
- SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
- return DAG.getBitcast(VT, Horiz);
- }
- }
- }
- }
-
if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
return R;
@@ -36000,6 +37009,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
DemandedMask[i] = i;
if (SDValue Res = combineX86ShufflesRecursively(
{BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
+ X86::MaxShuffleCombineDepth,
/*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getBitcast(SrcVT, Res));
@@ -36029,7 +37039,8 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
for (SDNode *User : Src->uses())
if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
Src == User->getOperand(0) &&
- User->getValueSizeInBits(0) > VT.getSizeInBits()) {
+ User->getValueSizeInBits(0).getFixedSize() >
+ VT.getFixedSizeInBits()) {
return extractSubVector(SDValue(User, 0), 0, DAG, DL,
VT.getSizeInBits());
}
@@ -36115,7 +37126,8 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
LN->isSimple()) {
unsigned Offset = ShiftAmt / 8;
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
- SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(), Offset, DL);
+ SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
+ TypeSize::Fixed(Offset), DL);
SDValue Ops[] = { LN->getChain(), Ptr };
SDValue BcastLd = DAG.getMemIntrinsicNode(
X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
@@ -36147,15 +37159,16 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
}
// vbroadcast(vector load X) -> vbroadcast_load
- if (SrcVT == MVT::v2f64 && Src.hasOneUse() &&
- ISD::isNormalLoad(Src.getNode())) {
+ if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
+ SrcVT == MVT::v4i32) &&
+ Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(Src);
// Unless the load is volatile or atomic.
if (LN->isSimple()) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
- SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
SDValue BcastLd = DAG.getMemIntrinsicNode(
- X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
LN->getPointerInfo(), LN->getOriginalAlign(),
LN->getMemOperand()->getFlags());
DCI.CombineTo(N.getNode(), BcastLd);
@@ -36242,6 +37255,27 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
}
}
+ // Pull subvector inserts into undef through VZEXT_MOVL by making it an
+ // insert into a zero vector. This helps get VZEXT_MOVL closer to
+ // scalar_to_vectors where 256/512 are canonicalized to an insert and a
+ // 128-bit scalar_to_vector. This reduces the number of isel patterns.
+ if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
+ SDValue V = peekThroughOneUseBitcasts(N0);
+
+ if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
+ isNullConstant(V.getOperand(2))) {
+ SDValue In = V.getOperand(1);
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
+ In.getValueSizeInBits() /
+ VT.getScalarSizeInBits());
+ In = DAG.getBitcast(SubVT, In);
+ SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ getZeroVector(VT, Subtarget, DAG, DL), Movl,
+ V.getOperand(2));
+ }
+ }
+
return SDValue();
}
case X86ISD::BLENDI: {
@@ -36283,32 +37317,51 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
return SDValue();
}
case X86ISD::VPERM2X128: {
- // If both 128-bit values were inserted into high halves of 256-bit values,
- // the shuffle can be reduced to a concatenation of subvectors:
- // vperm2x128 (ins ?, X, C1), (ins ?, Y, C2), 0x31 --> concat X, Y
- // Note: We are only looking for the exact high/high shuffle mask because we
- // expect to fold other similar patterns before creating this opcode.
- SDValue Ins0 = peekThroughBitcasts(N.getOperand(0));
- SDValue Ins1 = peekThroughBitcasts(N.getOperand(1));
- unsigned Imm = N.getConstantOperandVal(2);
- if (!(Imm == 0x31 &&
- Ins0.getOpcode() == ISD::INSERT_SUBVECTOR &&
- Ins1.getOpcode() == ISD::INSERT_SUBVECTOR &&
- Ins0.getValueType() == Ins1.getValueType()))
- return SDValue();
+ // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ if (LHS.getOpcode() == ISD::BITCAST &&
+ (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
+ EVT SrcVT = LHS.getOperand(0).getValueType();
+ if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
+ return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
+ DAG.getBitcast(SrcVT, LHS),
+ DAG.getBitcast(SrcVT, RHS),
+ N->getOperand(2)));
+ }
+ }
+
+ // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
+ if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
+ return Res;
- SDValue X = Ins0.getOperand(1);
- SDValue Y = Ins1.getOperand(1);
- unsigned C1 = Ins0.getConstantOperandVal(2);
- unsigned C2 = Ins1.getConstantOperandVal(2);
- MVT SrcVT = X.getSimpleValueType();
- unsigned SrcElts = SrcVT.getVectorNumElements();
- if (SrcVT != Y.getSimpleValueType() || SrcVT.getSizeInBits() != 128 ||
- C1 != SrcElts || C2 != SrcElts)
+ // Fold vperm2x128 subvector shuffle with an inner concat pattern.
+ // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
+ auto FindSubVector128 = [&](unsigned Idx) {
+ if (Idx > 3)
+ return SDValue();
+ SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
+ SmallVector<SDValue> SubOps;
+ if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2)
+ return SubOps[Idx & 1];
+ unsigned NumElts = Src.getValueType().getVectorNumElements();
+ if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Src.getOperand(1).getValueSizeInBits() == 128 &&
+ Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
+ return Src.getOperand(1);
+ }
return SDValue();
-
- return DAG.getBitcast(VT, DAG.getNode(ISD::CONCAT_VECTORS, DL,
- Ins1.getValueType(), X, Y));
+ };
+ unsigned Imm = N.getConstantOperandVal(2);
+ if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
+ if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
+ MVT SubVT = VT.getHalfNumVectorElementsVT();
+ SubLo = DAG.getBitcast(SubVT, SubLo);
+ SubHi = DAG.getBitcast(SubVT, SubHi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
+ }
+ }
+ return SDValue();
}
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
@@ -36751,10 +37804,12 @@ static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
/// Eliminate a redundant shuffle of a horizontal math op.
static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) {
+ // TODO: Can we use getTargetShuffleInputs instead?
unsigned Opcode = N->getOpcode();
if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST)
- if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
- return SDValue();
+ if (Opcode != X86ISD::UNPCKL && Opcode != X86ISD::UNPCKH)
+ if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
+ return SDValue();
// For a broadcast, peek through an extract element of index 0 to find the
// horizontal op: broadcast (ext_vec_elt HOp, 0)
@@ -36773,6 +37828,28 @@ static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) {
HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
return SDValue();
+ // unpcklo(hop(x,y),hop(z,w)) -> permute(hop(x,z)).
+ // unpckhi(hop(x,y),hop(z,w)) -> permute(hop(y,w)).
+ // Don't fold if hop(x,y) == hop(z,w).
+ if (Opcode == X86ISD::UNPCKL || Opcode == X86ISD::UNPCKH) {
+ SDValue HOp2 = N->getOperand(1);
+ if (HOp.getOpcode() != HOp2.getOpcode() || VT.getScalarSizeInBits() != 32)
+ return SDValue();
+ if (HOp == HOp2)
+ return SDValue();
+ SDLoc DL(HOp);
+ unsigned LoHi = Opcode == X86ISD::UNPCKL ? 0 : 1;
+ SDValue Res = DAG.getNode(HOp.getOpcode(), DL, VT, HOp.getOperand(LoHi),
+ HOp2.getOperand(LoHi));
+ // Use SHUFPS for the permute so this will work on SSE3 targets, shuffle
+ // combining and domain handling will simplify this later on.
+ EVT ShuffleVT = VT.changeVectorElementType(MVT::f32);
+ Res = DAG.getBitcast(ShuffleVT, Res);
+ Res = DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
+ getV4X86ShuffleImm8ForMask({0, 2, 1, 3}, DL, DAG));
+ return DAG.getBitcast(VT, Res);
+ }
+
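The unpcklo/unpckhi-of-hop rewrite above follows from a small index identity on 32-bit lanes; a standalone check for the UNPCKL case (plain C++ over a simplified 128-bit model, helper names made up):

#include <array>
#include <cassert>
#include <cstdint>

using V4 = std::array<int32_t, 4>;

// 128-bit horizontal add on 32-bit elements.
static V4 hadd(const V4 &A, const V4 &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}
// UNPCKL interleaves the low halves of both operands.
static V4 unpckl(const V4 &A, const V4 &B) { return {A[0], B[0], A[1], B[1]}; }
// Single-source permute with mask {0,2,1,3}, i.e. the SHUFP emitted above.
static V4 perm0213(const V4 &A) { return {A[0], A[2], A[1], A[3]}; }

int main() {
  V4 X = {1, 2, 3, 4}, Y = {5, 6, 7, 8};
  V4 Z = {9, 10, 11, 12}, W = {13, 14, 15, 16};
  // unpcklo(hadd(X,Y), hadd(Z,W)) == permute(hadd(X,Z)) with mask {0,2,1,3};
  // the UNPCKH case works the same way on the second operands (Y and W).
  assert(unpckl(hadd(X, Y), hadd(Z, W)) == perm0213(hadd(X, Z)));
  return 0;
}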
// 128-bit horizontal math instructions are defined to operate on adjacent
// lanes of each operand as:
// v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
@@ -36817,19 +37894,20 @@ static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) {
// shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
+
// TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
// but this should be tied to whatever horizontal op matching and shuffle
// canonicalization are producing.
if (HOp.getValueSizeInBits() == 128 &&
- (isTargetShuffleEquivalent(Mask, {0, 0}) ||
- isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
- isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
+ (isShuffleEquivalent(Mask, {0, 0}) ||
+ isShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
+ isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
return updateHOp(HOp, DAG);
if (HOp.getValueSizeInBits() == 256 &&
- (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
- isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
- isTargetShuffleEquivalent(
+ (isShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
+ isShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
+ isShuffleEquivalent(
Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
return updateHOp(HOp, DAG);
@@ -36887,6 +37965,34 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
if (SDValue HAddSub = foldShuffleOfHorizOp(N, DAG))
return HAddSub;
+
+ // Merge shuffles through binops if its likely we'll be able to merge it
+  // Merge shuffles through binops if it's likely we'll be able to merge it
+ // with other shuffles (as long as they aren't splats).
+ // shuffle(bop(shuffle(x,y),shuffle(z,w)),bop(shuffle(a,b),shuffle(c,d)))
+ // TODO: We might be able to move this to DAGCombiner::visitVECTOR_SHUFFLE.
+ if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N)) {
+ unsigned SrcOpcode = N->getOperand(0).getOpcode();
+ if (SrcOpcode == N->getOperand(1).getOpcode() && TLI.isBinOp(SrcOpcode) &&
+ N->isOnlyUserOf(N->getOperand(0).getNode()) &&
+ N->isOnlyUserOf(N->getOperand(1).getNode())) {
+ SDValue Op00 = N->getOperand(0).getOperand(0);
+ SDValue Op10 = N->getOperand(1).getOperand(0);
+ SDValue Op01 = N->getOperand(0).getOperand(1);
+ SDValue Op11 = N->getOperand(1).getOperand(1);
+ auto *SVN00 = dyn_cast<ShuffleVectorSDNode>(Op00);
+ auto *SVN10 = dyn_cast<ShuffleVectorSDNode>(Op10);
+ auto *SVN01 = dyn_cast<ShuffleVectorSDNode>(Op01);
+ auto *SVN11 = dyn_cast<ShuffleVectorSDNode>(Op11);
+ if (((SVN00 && !SVN00->isSplat()) || (SVN10 && !SVN10->isSplat())) &&
+ ((SVN01 && !SVN01->isSplat()) || (SVN11 && !SVN11->isSplat()))) {
+ SDLoc DL(N);
+ ArrayRef<int> Mask = SVN->getMask();
+ SDValue LHS = DAG.getVectorShuffle(VT, DL, Op00, Op10, Mask);
+ SDValue RHS = DAG.getVectorShuffle(VT, DL, Op01, Op11, Mask);
+ return DAG.getNode(SrcOpcode, DL, VT, LHS, RHS);
+ }
+ }
+ }
}
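Moving the outer shuffle through the binop, as done above, is valid because the shuffle only reorders lanes of a lane-wise operation; a standalone check of shuffle(bop(X,Y),bop(Z,W)) == bop(shuffle(X,Z),shuffle(Y,W)) using integer addition as the binop (plain C++, simplified two-operand shuffle):

#include <array>
#include <cassert>

using V4 = std::array<int, 4>;

// Two-operand shuffle: indices 0-3 pick from A, indices 4-7 pick from B.
static V4 shuffle(const V4 &A, const V4 &B, const std::array<int, 4> &M) {
  V4 R{};
  for (int i = 0; i != 4; ++i)
    R[i] = M[i] < 4 ? A[M[i]] : B[M[i] - 4];
  return R;
}

// Any lane-wise binop works; integer add stands in for "bop" here.
static V4 add(const V4 &A, const V4 &B) {
  V4 R{};
  for (int i = 0; i != 4; ++i)
    R[i] = A[i] + B[i];
  return R;
}

int main() {
  V4 X = {1, 2, 3, 4}, Y = {5, 6, 7, 8};
  V4 Z = {9, 10, 11, 12}, W = {13, 14, 15, 16};
  std::array<int, 4> M = {6, 1, 4, 3};
  // shuffle(bop(X,Y), bop(Z,W), M) == bop(shuffle(X,Z,M), shuffle(Y,W,M)),
  // which is what lets the combine push the outer shuffle through the binop.
  assert(shuffle(add(X, Y), add(Z, W), M) ==
         add(shuffle(X, Z, M), shuffle(Y, W, M)));
  return 0;
}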
// Attempt to combine into a vector load/broadcast.
@@ -36920,32 +38026,11 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// TODO - merge this into combineX86ShufflesRecursively.
APInt KnownUndef, KnownZero;
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
- if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, DCI))
+ if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
+ DCI))
return SDValue(N, 0);
}
- // Pull subvector inserts into undef through VZEXT_MOVL by making it an
- // insert into a zero vector. This helps get VZEXT_MOVL closer to
- // scalar_to_vectors where 256/512 are canonicalized to an insert and a
- // 128-bit scalar_to_vector. This reduces the number of isel patterns.
- if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() &&
- N->getOperand(0).hasOneUse()) {
- SDValue V = peekThroughOneUseBitcasts(N->getOperand(0));
-
- if (V.getOpcode() == ISD::INSERT_SUBVECTOR &&
- V.getOperand(0).isUndef() && isNullConstant(V.getOperand(2))) {
- SDValue In = V.getOperand(1);
- MVT SubVT =
- MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
- In.getValueSizeInBits() / VT.getScalarSizeInBits());
- In = DAG.getBitcast(SubVT, In);
- SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, SubVT, In);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
- getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
- Movl, V.getOperand(2));
- }
- }
-
return SDValue();
}
@@ -37076,7 +38161,13 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
Depth + 1))
return true;
- // TODO convert SrcUndef to KnownUndef.
+
+ // Aggressively peek through ops to get at the demanded elts.
+ if (!DemandedElts.isAllOnesValue())
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
+ Src, DemandedElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
break;
}
case X86ISD::KSHIFTL: {
@@ -37265,7 +38356,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
if (!SrcVT.isVector())
- return false;
+ break;
// Don't bother broadcasting if we just need the 0'th element.
if (DemandedElts == 1) {
if (Src.getValueType() != VT)
@@ -37318,21 +38409,62 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
ExtSizeInBits = SizeInBits / 4;
switch (Opc) {
- // Subvector broadcast.
- case X86ISD::SUBV_BROADCAST: {
+ // Scalar broadcast.
+ case X86ISD::VBROADCAST: {
SDLoc DL(Op);
SDValue Src = Op.getOperand(0);
if (Src.getValueSizeInBits() > ExtSizeInBits)
Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
- else if (Src.getValueSizeInBits() < ExtSizeInBits) {
- MVT SrcSVT = Src.getSimpleValueType().getScalarType();
- MVT SrcVT =
- MVT::getVectorVT(SrcSVT, ExtSizeInBits / SrcSVT.getSizeInBits());
- Src = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, DL, SrcVT, Src);
- }
- return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0,
+ EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
+ ExtSizeInBits / VT.getScalarSizeInBits());
+ SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
+ return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
+ TLO.DAG, DL, ExtSizeInBits));
+ }
+ case X86ISD::VBROADCAST_LOAD: {
+ SDLoc DL(Op);
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
+ EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
+ ExtSizeInBits / VT.getScalarSizeInBits());
+ SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
+ SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
+ SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
+ MemIntr->getMemOperand());
+ TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
+ Bcst.getValue(1));
+ return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
TLO.DAG, DL, ExtSizeInBits));
}
+ // Subvector broadcast.
+ case X86ISD::SUBV_BROADCAST_LOAD: {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
+ EVT MemVT = MemIntr->getMemoryVT();
+ if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
+ SDLoc DL(Op);
+ SDValue Ld =
+ TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
+ MemIntr->getBasePtr(), MemIntr->getMemOperand());
+ TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
+ Ld.getValue(1));
+ return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
+ TLO.DAG, DL, ExtSizeInBits));
+ } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
+ SDLoc DL(Op);
+ EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
+ ExtSizeInBits / VT.getScalarSizeInBits());
+ SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
+ SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
+ SDValue Bcst =
+ TLO.DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys,
+ Ops, MemVT, MemIntr->getMemOperand());
+ TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
+ Bcst.getValue(1));
+ return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
+ TLO.DAG, DL, ExtSizeInBits));
+ }
+ break;
+ }
// Byte shifts by immediate.
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
@@ -37384,7 +38516,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
case X86ISD::BLENDI:
- // Saturated Packs.
+ // Integer ops.
+ case X86ISD::AVG:
case X86ISD::PACKSS:
case X86ISD::PACKUS:
// Horizontal Ops.
@@ -37477,16 +38610,22 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// If we don't demand all elements, then attempt to combine to a simpler
// shuffle.
- // TODO: Handle other depths, but first we need to handle the fact that
- // it might combine to the same shuffle.
- if (!DemandedElts.isAllOnesValue() && Depth == 0) {
+ // We need to convert the depth to something combineX86ShufflesRecursively
+ // can handle - so pretend its Depth == 0 again, and reduce the max depth
+ // to match. This prevents combineX86ShuffleChain from returning a
+ // combined shuffle that's the same as the original root, causing an
+ // infinite loop.
+ if (!DemandedElts.isAllOnesValue()) {
+ assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
+
SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
for (int i = 0; i != NumElts; ++i)
if (DemandedElts[i])
DemandedMask[i] = i;
SDValue NewShuffle = combineX86ShufflesRecursively(
- {Op}, 0, Op, DemandedMask, {}, Depth, /*HasVarMask*/ false,
+ {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
+ /*HasVarMask*/ false,
/*AllowVarMask*/ true, TLO.DAG, Subtarget);
if (NewShuffle)
return TLO.CombineTo(Op, NewShuffle);
@@ -37589,7 +38728,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
// Low bits known zero.
Known.Zero.setLowBits(ShAmt);
- break;
+ return false;
}
case X86ISD::VSRLI: {
unsigned ShAmt = Op.getConstantOperandVal(1);
@@ -37608,7 +38747,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
// High bits known zero.
Known.Zero.setHighBits(ShAmt);
- break;
+ return false;
}
case X86ISD::VSRAI: {
SDValue Op0 = Op.getOperand(0);
@@ -37657,7 +38796,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
// High bits are known one.
if (Known.One[BitWidth - ShAmt - 1])
Known.One.setHighBits(ShAmt);
- break;
+ return false;
}
case X86ISD::PEXTRB:
case X86ISD::PEXTRW: {
@@ -37723,8 +38862,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
return true;
KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
- Known.One = KnownVec.One & KnownScl.One;
- Known.Zero = KnownVec.Zero & KnownScl.Zero;
+ Known = KnownBits::commonBits(KnownVec, KnownScl);
return false;
}
break;
@@ -37804,34 +38942,83 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
return false;
}
- case X86ISD::BEXTR: {
+ case X86ISD::BEXTR:
+ case X86ISD::BEXTRI: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
// Only bottom 16-bits of the control bits are required.
if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
// NOTE: SimplifyDemandedBits won't do this for constants.
- const APInt &Val1 = Cst1->getAPIntValue();
- APInt MaskedVal1 = Val1 & 0xFFFF;
- if (MaskedVal1 != Val1) {
+ uint64_t Val1 = Cst1->getZExtValue();
+ uint64_t MaskedVal1 = Val1 & 0xFFFF;
+ if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
SDLoc DL(Op);
return TLO.CombineTo(
Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
TLO.DAG.getConstant(MaskedVal1, DL, VT)));
}
+
+ unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
+ unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
+
+ // If the length is 0, the result is 0.
+ if (Length == 0) {
+ Known.setAllZero();
+ return false;
+ }
+
+ if ((Shift + Length) <= BitWidth) {
+ APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
+ if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
+ return true;
+
+ Known = Known.extractBits(Length, Shift);
+ Known = Known.zextOrTrunc(BitWidth);
+ return false;
+ }
+ } else {
+ assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
+ KnownBits Known1;
+ APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
+ if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
+ return true;
+
+ // If the length is 0, replace with 0.
+ KnownBits LengthBits = Known1.extractBits(8, 8);
+ if (LengthBits.isZero())
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
}
- KnownBits Known1;
- APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
- if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
+ break;
+ }
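The BEXTR/BEXTRI handling above splits the control word into an 8-bit start and an 8-bit length; a simplified software model of that decoding (plain C++, not the BMI intrinsic or an LLVM API, and out-of-range fields are only approximated), including the length == 0 case that folds to zero:

#include <cassert>
#include <cstdint>

// Software model of BEXTR: control bits [7:0] are the start bit, bits [15:8]
// the length. A zero length yields zero, which is the case folded to a
// constant above.
static uint64_t bextr64(uint64_t Src, uint64_t Control) {
  unsigned Start = Control & 0xFF;
  unsigned Length = (Control >> 8) & 0xFF;
  if (Length == 0 || Start >= 64)
    return 0;
  uint64_t Shifted = Src >> Start;
  if (Length >= 64)
    return Shifted;
  return Shifted & ((uint64_t(1) << Length) - 1);
}

int main() {
  // Extract 8 bits starting at bit 4 of 0xABCDEF -> 0xDE.
  assert(bextr64(0xABCDEF, (8u << 8) | 4u) == 0xDE);
  // Zero length always produces zero, regardless of the source.
  assert(bextr64(~0ull, 0) == 0);
  return 0;
}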
+ case X86ISD::PDEP: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros();
+ APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
+
+ // If the demanded bits has leading zeroes, we don't demand those from the
+ // mask.
+ if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
return true;
- // If the length is 0, replace with 0.
- KnownBits LengthBits = Known1.extractBits(8, 8);
- if (LengthBits.isZero())
- return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
+ // The number of possible 1s in the mask determines the number of LSBs of
+ // operand 0 used. Undemanded bits from the mask don't matter so filter
+ // them before counting.
+ KnownBits Known2;
+ uint64_t Count = (~Known.Zero & LoMask).countPopulation();
+ APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
+ if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
+ return true;
- break;
+ // Zeroes are retained from the mask, but not ones.
+ Known.One.clearAllBits();
+ // The result will have at least as many trailing zeros as the non-mask
+ // operand since bits can only map to the same or higher bit position.
+ Known.Zero.setLowBits(Known2.countMinTrailingZeros());
+ return false;
}
}
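The PDEP reasoning above rests on two bit-level facts: only popcount(mask) low bits of the source can matter, and deposited bits never move to a lower position, so trailing zeros of the source survive. A simplified software pdep (plain C++, not the BMI2 intrinsic) illustrating both:

#include <cassert>
#include <cstdint>

// Software model of PDEP: successive low bits of Src are deposited into the
// set-bit positions of Mask, lowest set bit first.
static uint64_t pdep64(uint64_t Src, uint64_t Mask) {
  uint64_t Result = 0;
  for (uint64_t Bit = 1; Mask != 0; Mask &= Mask - 1, Bit <<= 1)
    if (Src & Bit)
      Result |= Mask & (~Mask + 1); // lowest remaining set bit of the mask
  return Result;
}

int main() {
  // The mask has three set bits, so only the three low bits of Src matter.
  assert(pdep64(0b11110101, 0b100100010000) == 0b100000010000);
  assert(pdep64(0b00000101, 0b100100010000) == 0b100000010000);
  // Bits only move to the same or a higher position, so the result keeps at
  // least as many trailing zeros as Src, the fact used for Known.Zero above.
  assert(pdep64(0b100, 0b11111111) == 0b100);
  return 0;
}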
@@ -38242,6 +39429,8 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
// Convert build vector ops to MMX data in the bottom elements.
SmallVector<SDValue, 8> Ops;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
// Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
if (Splat) {
if (Splat.isUndef())
@@ -38254,14 +39443,16 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
if (NumElts == 8)
Splat = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
- DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat,
- Splat);
+ DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
+ TLI.getPointerTy(DAG.getDataLayout())),
+ Splat, Splat);
// Use PSHUFW to repeat 16-bit elements.
unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
return DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
- DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32),
+ DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
+ TLI.getPointerTy(DAG.getDataLayout())),
Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
}
Ops.append(NumElts, Splat);
@@ -38277,7 +39468,8 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
(NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
: (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
: Intrinsic::x86_mmx_punpcklbw));
- SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32);
+ SDValue Intrin = DAG.getTargetConstant(
+ IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
for (unsigned i = 0; i != NumOps; i += 2)
Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
Ops[i], Ops[i + 1]);
@@ -38291,7 +39483,7 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
// a vector/float/double that got truncated/extended/bitcast to/from a scalar
// integer. If so, replace the scalar ops with bool vector equivalents back down
// the chain.
-static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, SDLoc DL,
+static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -38344,6 +39536,10 @@ static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, SDLoc DL,
case ISD::SHL: {
// If we find a suitable source, a SHL becomes a KSHIFTL.
SDValue Src0 = V.getOperand(0);
+ if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
+ ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
+ break;
+
if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
return DAG.getNode(
@@ -38686,8 +39882,8 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
// Attempt to replace an min/max v8i16/v16i8 horizontal reduction with
// PHMINPOSUW.
-static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
// Bail without SSE41.
if (!Subtarget.hasSSE41())
return SDValue();
@@ -38760,9 +39956,8 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
}
// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
-static SDValue combineHorizontalPredicateResult(SDNode *Extract,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
// Bail without SSE2.
if (!Subtarget.hasSSE2())
return SDValue();
@@ -38876,10 +40071,8 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract,
MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
if (BinOp == ISD::XOR) {
- // parity -> (AND (CTPOP(MOVMSK X)), 1)
- SDValue Mask = DAG.getConstant(1, DL, CmpVT);
- SDValue Result = DAG.getNode(ISD::CTPOP, DL, CmpVT, Movmsk);
- Result = DAG.getNode(ISD::AND, DL, CmpVT, Result, Mask);
+ // parity -> (PARITY(MOVMSK X))
+ SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
}
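XOR-reducing a vector of 0/1 compare lanes is exactly the parity of the MOVMSK-style bitmask built from those lanes, which is why the combine can emit PARITY(MOVMSK X) directly; a scalar sanity check (plain C++):

#include <array>
#include <bitset>
#include <cassert>

int main() {
  // XOR-reducing 0/1 lanes equals the parity of the MOVMSK-style bitmask
  // formed from those lanes.
  std::array<bool, 8> Lanes = {true, false, true, true,
                               false, false, true, false};

  unsigned XorReduce = 0, Mask = 0;
  for (unsigned i = 0; i != Lanes.size(); ++i) {
    XorReduce ^= Lanes[i];
    Mask |= unsigned(Lanes[i]) << i; // MOVMSK packs one bit per lane
  }

  unsigned Parity = std::bitset<32>(Mask).count() & 1; // ISD::PARITY semantics
  assert(XorReduce == Parity);
  return 0;
}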
@@ -39067,10 +40260,12 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
// Handle extract(truncate(x)) for 0'th index.
// TODO: Treat this as a faux shuffle?
// TODO: When can we use this for general indices?
- if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() && IdxC == 0) {
+ if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
+ (SrcVT.getSizeInBits() % 128) == 0) {
Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
- Src = DAG.getBitcast(SrcVT, Src);
- return DAG.getNode(N->getOpcode(), dl, VT, Src, Idx);
+ MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
+ return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
+ Idx);
}
// Resolve the target shuffle inputs and mask.
@@ -39146,7 +40341,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
SrcOp = DAG.getBitcast(SrcVT, SrcOp);
SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
- DAG.getIntPtrConstant(SrcIdx, dl));
+ DAG.getTargetConstant(SrcIdx, dl, MVT::i8));
return DAG.getZExtOrTrunc(ExtOp, dl, VT);
}
@@ -39253,8 +40448,8 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
/// Try to convert a vector reduction sequence composed of binops and shuffles
/// into horizontal ops.
-static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
// We need at least SSE2 to anything here.
@@ -39262,8 +40457,8 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
return SDValue();
ISD::NodeType Opc;
- SDValue Rdx =
- DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD, ISD::FADD}, true);
+ SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
+ {ISD::ADD, ISD::MUL, ISD::FADD}, true);
if (!Rdx)
return SDValue();
@@ -39278,7 +40473,46 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
SDLoc DL(ExtElt);
- // vXi8 reduction - sub 128-bit vector.
+ // vXi8 mul reduction - promote to vXi16 mul reduction.
+ if (Opc == ISD::MUL) {
+ unsigned NumElts = VecVT.getVectorNumElements();
+ if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
+ return SDValue();
+ if (VecVT.getSizeInBits() >= 128) {
+ EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
+ SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
+ SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
+ Lo = DAG.getBitcast(WideVT, Lo);
+ Hi = DAG.getBitcast(WideVT, Hi);
+ Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
+ while (Rdx.getValueSizeInBits() > 128) {
+ std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
+ Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
+ }
+ } else {
+ if (VecVT == MVT::v4i8)
+ Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
+ DAG.getUNDEF(MVT::v4i8));
+ Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
+ DAG.getUNDEF(MVT::v8i8));
+ Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
+ Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
+ }
+ if (NumElts >= 8)
+ Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
+ DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
+ {4, 5, 6, 7, -1, -1, -1, -1}));
+ Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
+ DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
+ {2, 3, -1, -1, -1, -1, -1, -1}));
+ Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
+ DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
+ {1, -1, -1, -1, -1, -1, -1, -1}));
+ Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+ }
+
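The vXi8 MUL reduction above widens to i16 lanes and then repeatedly folds the upper half of the vector into the lower half; a scalar sketch of that halving reduction (plain C++), relying on the low 8 bits of an i16 product matching the i8 product:

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<uint8_t, 8> V = {3, 5, 7, 2, 9, 4, 6, 11};

  // Reference: the i8 product of all elements (wrapping modulo 256).
  uint8_t Expected = 1;
  for (uint8_t X : V)
    Expected = uint8_t(Expected * X);

  // Promote to 16-bit lanes, as the combine does via unpack + bitcast.
  std::array<uint16_t, 8> W{};
  for (size_t i = 0; i != V.size(); ++i)
    W[i] = V[i];

  // Repeatedly multiply the upper half of the vector into the lower half,
  // mirroring the shuffle-based halving above.
  for (size_t Half = V.size() / 2; Half >= 1; Half /= 2)
    for (size_t i = 0; i != Half; ++i)
      W[i] = uint16_t(W[i] * W[i + Half]);

  // The low 8 bits of the i16 result equal the i8 product.
  assert(uint8_t(W[0]) == Expected);
  return 0;
}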
+ // vXi8 add reduction - sub 128-bit vector.
if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
if (VecVT == MVT::v4i8) {
// Pad with zero.
@@ -39309,7 +40543,7 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
!isPowerOf2_32(VecVT.getVectorNumElements()))
return SDValue();
- // vXi8 reduction - sum lo/hi halves then use PSADBW.
+ // vXi8 add reduction - sum lo/hi halves then use PSADBW.
if (VT == MVT::i8) {
while (Rdx.getValueSizeInBits() > 128) {
SDValue Lo, Hi;
@@ -39415,7 +40649,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
}
// TODO - Remove this once we can handle the implicit zero-extension of
- // X86ISD::PEXTRW/X86ISD::PEXTRB in combineHorizontalPredicateResult and
+ // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
// combineBasicSADPattern.
return SDValue();
}
@@ -39447,14 +40681,15 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return SAD;
// Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
- if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
+ if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
return Cmp;
// Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
- if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
+ if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
return MinMax;
- if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget))
+  // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc.
+ if (SDValue V = combineArithReduction(N, DAG, Subtarget))
return V;
if (SDValue V = scalarizeExtEltFP(N, DAG))
@@ -39578,7 +40813,7 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
if (TValIsAllOnes && FValIsAllZeros)
return DAG.getBitcast(VT, Cond);
- if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
+ if (!TLI.isTypeLegal(CondVT))
return SDValue();
// vselect Cond, 111..., X -> or Cond, X
@@ -39901,6 +41136,36 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
}
+ // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
+ // by forcing the unselected elements to zero.
+ // TODO: Can we handle more shuffles with this?
+ if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
+ LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
+ LHS.hasOneUse() && RHS.hasOneUse()) {
+ MVT SimpleVT = VT.getSimpleVT();
+ bool LHSUnary, RHSUnary;
+ SmallVector<SDValue, 1> LHSOps, RHSOps;
+ SmallVector<int, 64> LHSMask, RHSMask, CondMask;
+ if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
+ getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask,
+ LHSUnary) &&
+ getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask,
+ RHSUnary)) {
+ int NumElts = VT.getVectorNumElements();
+ for (int i = 0; i != NumElts; ++i) {
+ if (CondMask[i] < NumElts)
+ RHSMask[i] = 0x80;
+ else
+ LHSMask[i] = 0x80;
+ }
+ LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
+ getConstVector(LHSMask, SimpleVT, DAG, DL, true));
+ RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
+ getConstVector(RHSMask, SimpleVT, DAG, DL, true));
+ return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
+ }
+ }
+
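The VSELECT-of-PSHUFB fold works because a PSHUFB index byte with bit 7 set writes zero, so zeroing each side's unselected lanes turns the blend into a plain OR. A byte-level model (plain C++, not the intrinsic; the masks and condition here are arbitrary examples):

#include <array>
#include <cassert>
#include <cstdint>

using V16 = std::array<uint8_t, 16>;

// PSHUFB per byte: if bit 7 of the index is set the result byte is zero,
// otherwise it is Src[index & 15].
static V16 pshufb(const V16 &Src, const V16 &Idx) {
  V16 R{};
  for (int i = 0; i != 16; ++i)
    R[i] = (Idx[i] & 0x80) ? 0 : Src[Idx[i] & 0x0F];
  return R;
}

int main() {
  V16 X{}, Y{}, MaskX{}, MaskY{};
  std::array<bool, 16> Cond{};
  for (int i = 0; i != 16; ++i) {
    X[i] = uint8_t(i + 1);
    Y[i] = uint8_t(100 + i);
    MaskX[i] = uint8_t(15 - i);   // arbitrary in-range indices
    MaskY[i] = uint8_t(i);
    Cond[i] = (i % 3 == 0);       // arbitrary per-lane selection
  }

  // Reference blend: vselect(Cond, pshufb(X,MaskX), pshufb(Y,MaskY)).
  V16 A = pshufb(X, MaskX), B = pshufb(Y, MaskY), Blend{};
  for (int i = 0; i != 16; ++i)
    Blend[i] = Cond[i] ? A[i] : B[i];

  // Rewrite from the combine: force each side's unselected lane index to
  // 0x80 so PSHUFB writes zero there, then OR the two results.
  V16 MX = MaskX, MY = MaskY;
  for (int i = 0; i != 16; ++i)
    (Cond[i] ? MY : MX)[i] = 0x80;
  V16 C = pshufb(X, MX), D = pshufb(Y, MY), Ored{};
  for (int i = 0; i != 16; ++i)
    Ored[i] = uint8_t(C[i] | D[i]);

  assert(Ored == Blend);
  return 0;
}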
// If we have SSE[12] support, try to form min/max nodes. SSE min/max
// instructions match the semantics of the common C idiom x<y?x:y but not
// x<=y?x:y, because of how they handle negative zero (which can be
@@ -40127,13 +41392,12 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineSelectOfTwoConstants(N, DAG))
return V;
- // Canonicalize max and min:
- // (x > y) ? x : y -> (x >= y) ? x : y
- // (x < y) ? x : y -> (x <= y) ? x : y
+ // Canonicalize min/max:
+ // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
+ // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
// This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
- // the need for an extra compare
- // against zero. e.g.
- // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
+ // the need for an extra compare against zero. e.g.
+  // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
// subl %esi, %edi
// testl %edi, %edi
// movl $0, %eax
@@ -40142,142 +41406,27 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// xorl %eax, %eax
// subl %esi, $edi
// cmovsl %eax, %edi
+ //
+ // We can also canonicalize
+ // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
+ // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
+ // This allows the use of a test instruction for the compare.
if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
Cond.hasOneUse() &&
- DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
- DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+ LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
- switch (CC) {
- default: break;
- case ISD::SETLT:
- case ISD::SETGT: {
- ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
+ if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
+ (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
+ ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
Cond.getOperand(0), Cond.getOperand(1), NewCC);
return DAG.getSelect(DL, VT, Cond, LHS, RHS);
}
- }
- }
-
- // Match VSELECTs into subs with unsigned saturation.
- if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
- // psubus is available in SSE2 for i8 and i16 vectors.
- Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
- isPowerOf2_32(VT.getVectorNumElements()) &&
- (VT.getVectorElementType() == MVT::i8 ||
- VT.getVectorElementType() == MVT::i16)) {
- ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
-
- // Check if one of the arms of the VSELECT is a zero vector. If it's on the
- // left side invert the predicate to simplify logic below.
- SDValue Other;
- if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
- Other = RHS;
- CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
- } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
- Other = LHS;
- }
-
- if (Other.getNode() && Other->getNumOperands() == 2 &&
- Other->getOperand(0) == Cond.getOperand(0)) {
- SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
- SDValue CondRHS = Cond->getOperand(1);
-
- // Look for a general sub with unsigned saturation first.
- // x >= y ? x-y : 0 --> subus x, y
- // x > y ? x-y : 0 --> subus x, y
- if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
- Other->getOpcode() == ISD::SUB && OpRHS == CondRHS)
- return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
-
- if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) {
- if (isa<BuildVectorSDNode>(CondRHS)) {
- // If the RHS is a constant we have to reverse the const
- // canonicalization.
- // x > C-1 ? x+-C : 0 --> subus x, C
- auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
- return (!Op && !Cond) ||
- (Op && Cond &&
- Cond->getAPIntValue() == (-Op->getAPIntValue() - 1));
- };
- if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
- ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT,
- /*AllowUndefs*/ true)) {
- OpRHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
- OpRHS);
- return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
- }
-
- // Another special case: If C was a sign bit, the sub has been
- // canonicalized into a xor.
- // FIXME: Would it be better to use computeKnownBits to determine
- // whether it's safe to decanonicalize the xor?
- // x s< 0 ? x^C : 0 --> subus x, C
- if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
- if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
- OpRHSConst->getAPIntValue().isSignMask()) {
- // Note that we have to rebuild the RHS constant here to ensure we
- // don't rely on particular values of undef lanes.
- OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
- return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
- }
- }
- }
- }
- }
- }
-
- // Match VSELECTs into add with unsigned saturation.
- if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
- // paddus is available in SSE2 for i8 and i16 vectors.
- Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
- isPowerOf2_32(VT.getVectorNumElements()) &&
- (VT.getVectorElementType() == MVT::i8 ||
- VT.getVectorElementType() == MVT::i16)) {
- ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
-
- SDValue CondLHS = Cond->getOperand(0);
- SDValue CondRHS = Cond->getOperand(1);
-
- // Check if one of the arms of the VSELECT is vector with all bits set.
- // If it's on the left side invert the predicate to simplify logic below.
- SDValue Other;
- if (ISD::isBuildVectorAllOnes(LHS.getNode())) {
- Other = RHS;
- CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
- } else if (ISD::isBuildVectorAllOnes(RHS.getNode())) {
- Other = LHS;
- }
-
- if (Other.getNode() && Other.getOpcode() == ISD::ADD) {
- SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
-
- // Canonicalize condition operands.
- if (CC == ISD::SETUGE) {
- std::swap(CondLHS, CondRHS);
- CC = ISD::SETULE;
- }
-
- // We can test against either of the addition operands.
- // x <= x+y ? x+y : ~0 --> addus x, y
- // x+y >= x ? x+y : ~0 --> addus x, y
- if (CC == ISD::SETULE && Other == CondRHS &&
- (OpLHS == CondLHS || OpRHS == CondLHS))
- return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
-
- if (isa<BuildVectorSDNode>(OpRHS) && isa<BuildVectorSDNode>(CondRHS) &&
- CondLHS == OpLHS) {
- // If the RHS is a constant we have to reverse the const
- // canonicalization.
- // x > ~C ? x+C : ~0 --> addus x, C
- auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
- return Cond->getAPIntValue() == ~Op->getAPIntValue();
- };
- if (CC == ISD::SETULE &&
- ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
- return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
- }
+ if (CC == ISD::SETUGT && isOneConstant(RHS)) {
+ ISD::CondCode NewCC = ISD::SETUGE;
+ Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
+ Cond.getOperand(0), Cond.getOperand(1), NewCC);
+ return DAG.getSelect(DL, VT, Cond, LHS, RHS);
}
}
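Each canonicalization above only changes the comparison at a single boundary value where both arms of the select already agree; an exhaustive 8-bit check of the four equivalences listed in the comment (plain C++):

#include <cassert>
#include <cstdint>

int main() {
  for (int V = -128; V <= 127; ++V) {
    int8_t X = int8_t(V);
    uint8_t U = uint8_t(V);
    // (x > 0) ? x : 0    ==  (x >= 0) ? x : 0
    assert(((X > 0) ? X : 0) == ((X >= 0) ? X : 0));
    // (x < -1) ? x : -1  ==  (x <= -1) ? x : -1
    assert(((X < -1) ? X : int8_t(-1)) == ((X <= -1) ? X : int8_t(-1)));
    // (x s> 1) ? x : 1   ==  (x s> 0) ? x : 1
    assert(((X > 1) ? X : int8_t(1)) == ((X > 0) ? X : int8_t(1)));
    // (x u> 1) ? x : 1   ==  (x != 0) ? x : 1
    assert(((U > 1) ? U : uint8_t(1)) == ((U != 0) ? U : uint8_t(1)));
  }
  return 0;
}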
@@ -40308,10 +41457,18 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return V;
// select(~Cond, X, Y) -> select(Cond, Y, X)
- if (CondVT.getScalarType() != MVT::i1)
+ if (CondVT.getScalarType() != MVT::i1) {
if (SDValue CondNot = IsNOT(Cond, DAG))
return DAG.getNode(N->getOpcode(), DL, VT,
DAG.getBitcast(CondVT, CondNot), RHS, LHS);
+ // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the signbit.
+ if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse() &&
+ ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) {
+ Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
+ DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
+ return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
+ }
+ }
// Try to optimize vXi1 selects if both operands are either all constants or
// bitcasts from scalar integer type. In that case we can convert the operands
@@ -41928,73 +43085,115 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue combineVectorPackWithShuffle(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
- assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
- "Unexpected pack opcode");
+ assert((X86ISD::HADD == Opcode || X86ISD::FHADD == Opcode ||
+ X86ISD::HSUB == Opcode || X86ISD::FHSUB == Opcode ||
+ X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
+ "Unexpected hadd/hsub/pack opcode");
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- unsigned NumDstElts = VT.getVectorNumElements();
+ EVT SrcVT = N0.getValueType();
- // Attempt to fold PACK(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
- // to SHUFFLE(PACK(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
+ // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
+ // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
// truncation trees that help us avoid lane crossing shuffles.
// TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
+ // TODO: We don't handle vXf64 shuffles yet.
if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N0.getConstantOperandAPInt(1) == 0 &&
- N1.getConstantOperandAPInt(1) == (NumDstElts / 2) &&
+ N1.getConstantOperandAPInt(1) == SrcVT.getVectorNumElements() &&
N0.getOperand(0) == N1.getOperand(0) && VT.is128BitVector() &&
- N0.getOperand(0).getValueType().is256BitVector()) {
+ N0.getOperand(0).getValueType().is256BitVector() &&
+ SrcVT.getScalarSizeInBits() <= 32) {
// TODO - support target/faux shuffles.
SDValue Vec = peekThroughBitcasts(N0.getOperand(0));
if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Vec)) {
- // To keep the PACK LHS/RHS coherency, we must be able to scale the unary
+ // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
// shuffle to a vXi64 width - we can probably relax this in the future.
SmallVector<int, 4> ShuffleMask;
if (SVN->getOperand(1).isUndef() &&
scaleShuffleElements(SVN->getMask(), 4, ShuffleMask)) {
SDLoc DL(N);
SDValue Lo, Hi;
+ MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
std::tie(Lo, Hi) = DAG.SplitVector(SVN->getOperand(0), DL);
Lo = DAG.getBitcast(N0.getValueType(), Lo);
Hi = DAG.getBitcast(N1.getValueType(), Hi);
SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
- Res = DAG.getBitcast(MVT::v4i32, Res);
- Res = DAG.getVectorShuffle(MVT::v4i32, DL, Res, Res, ShuffleMask);
+ Res = DAG.getBitcast(ShufVT, Res);
+ Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
return DAG.getBitcast(VT, Res);
}
}
}
- // Attempt to fold PACK(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(PACK(X,Y)).
+ // Attempt to fold HOP(SHUFFLE(X),SHUFFLE(Y)) -> SHUFFLE(HOP(X,Y)).
+ // TODO: Merge with binary shuffle folds below.
+ if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
+ int PostShuffle[4] = {0, 1, 2, 3};
+
+    // If the op is a unary shuffle that can scale to v2x64,
+ // then we can perform this as a v4x32 post shuffle.
+ auto AdjustOp = [&](SDValue V, int Offset) {
+ auto *SVN = dyn_cast<ShuffleVectorSDNode>(V);
+ SmallVector<int, 2> ScaledMask;
+ if (!SVN || !SVN->getOperand(1).isUndef() ||
+ !scaleShuffleElements(SVN->getMask(), 2, ScaledMask) ||
+ !N->isOnlyUserOf(V.getNode()))
+ return SDValue();
+ PostShuffle[Offset + 0] = ScaledMask[0] < 0 ? -1 : Offset + ScaledMask[0];
+ PostShuffle[Offset + 1] = ScaledMask[1] < 0 ? -1 : Offset + ScaledMask[1];
+ return SVN->getOperand(0);
+ };
+
+ SDValue Src0 = AdjustOp(N0, 0);
+ SDValue Src1 = AdjustOp(N1, 2);
+ if (Src0 || Src1) {
+ Src0 = Src0 ? Src0 : N0;
+ Src1 = Src1 ? Src1 : N1;
+ SDLoc DL(N);
+ MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
+ SDValue Res = DAG.getNode(Opcode, DL, VT, Src0, Src1);
+ Res = DAG.getBitcast(ShufVT, Res);
+ Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
+ return DAG.getBitcast(VT, Res);
+ }
+ }
+
+ // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
// TODO: Relax shuffle scaling to support sub-128-bit subvector shuffles.
- if (VT.is256BitVector()) {
- if (auto *SVN0 = dyn_cast<ShuffleVectorSDNode>(N0)) {
- if (auto *SVN1 = dyn_cast<ShuffleVectorSDNode>(N1)) {
- SmallVector<int, 2> ShuffleMask0, ShuffleMask1;
- if (scaleShuffleElements(SVN0->getMask(), 2, ShuffleMask0) &&
- scaleShuffleElements(SVN1->getMask(), 2, ShuffleMask1)) {
- SDValue Op00 = SVN0->getOperand(0);
- SDValue Op01 = SVN0->getOperand(1);
- SDValue Op10 = SVN1->getOperand(0);
- SDValue Op11 = SVN1->getOperand(1);
- if ((Op00 == Op11) && (Op01 == Op10)) {
- std::swap(Op10, Op11);
- ShuffleVectorSDNode::commuteMask(ShuffleMask1);
- }
- if ((Op00 == Op10) && (Op01 == Op11)) {
- SmallVector<int, 4> ShuffleMask;
- ShuffleMask.append(ShuffleMask0.begin(), ShuffleMask0.end());
- ShuffleMask.append(ShuffleMask1.begin(), ShuffleMask1.end());
- SDLoc DL(N);
- SDValue Res = DAG.getNode(Opcode, DL, VT, Op00, Op01);
- Res = DAG.getBitcast(MVT::v4i64, Res);
- Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, ShuffleMask);
- return DAG.getBitcast(VT, Res);
- }
+ if (VT.is256BitVector() && Subtarget.hasInt256()) {
+ SmallVector<int> Mask0, Mask1;
+ SmallVector<SDValue> Ops0, Ops1;
+ if (getTargetShuffleInputs(N0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
+ getTargetShuffleInputs(N1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
+ !Ops0.empty() && !Ops1.empty()) {
+ SDValue Op00 = Ops0.front(), Op01 = Ops0.back();
+ SDValue Op10 = Ops1.front(), Op11 = Ops1.back();
+ SmallVector<int, 2> ShuffleMask0, ShuffleMask1;
+ if (Op00.getValueType() == SrcVT && Op01.getValueType() == SrcVT &&
+          Op10.getValueType() == SrcVT && Op11.getValueType() == SrcVT &&
+ scaleShuffleElements(Mask0, 2, ShuffleMask0) &&
+ scaleShuffleElements(Mask1, 2, ShuffleMask1)) {
+ if ((Op00 == Op11) && (Op01 == Op10)) {
+ std::swap(Op10, Op11);
+ ShuffleVectorSDNode::commuteMask(ShuffleMask1);
+ }
+ if ((Op00 == Op10) && (Op01 == Op11)) {
+ SmallVector<int, 4> ShuffleMask;
+ ShuffleMask.append(ShuffleMask0.begin(), ShuffleMask0.end());
+ ShuffleMask.append(ShuffleMask1.begin(), ShuffleMask1.end());
+ SDLoc DL(N);
+ MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
+ SDValue Res = DAG.getNode(Opcode, DL, VT, Op00, Op01);
+ Res = DAG.getBitcast(ShufVT, Res);
+ Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
+ return DAG.getBitcast(VT, Res);
}
}
}
@@ -42077,7 +43276,7 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
}
// Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
- if (SDValue V = combineVectorPackWithShuffle(N, DAG))
+ if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
return V;
// Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
@@ -42099,6 +43298,28 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
}
}
+ // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
+ if (VT.is128BitVector()) {
+ unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ SDValue Src0, Src1;
+ if (N0.getOpcode() == ExtOpc &&
+ N0.getOperand(0).getValueType().is64BitVector() &&
+ N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
+ Src0 = N0.getOperand(0);
+ }
+ if (N1.getOpcode() == ExtOpc &&
+ N1.getOperand(0).getValueType().is64BitVector() &&
+ N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
+ Src1 = N1.getOperand(0);
+ }
+ if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
+ assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
+ Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
+ Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
+ }
+ }
+
// Attempt to combine as shuffle.
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
@@ -42107,6 +43328,20 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
+ X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
+ "Unexpected horizontal add/sub opcode");
+
+ // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
+ if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
+ return V;
+
+ return SDValue();
+}
+
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -42735,74 +43970,6 @@ static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
return SDValue();
}
-// Look for (and (ctpop X), 1) which is the IR form of __builtin_parity.
-// Turn it into series of XORs and a setnp.
-static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- EVT VT = N->getValueType(0);
-
- // We only support 64-bit and 32-bit. 64-bit requires special handling
- // unless the 64-bit popcnt instruction is legal.
- if (VT != MVT::i32 && VT != MVT::i64)
- return SDValue();
-
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLI.isTypeLegal(VT) && TLI.isOperationLegal(ISD::CTPOP, VT))
- return SDValue();
-
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
-
- // LHS needs to be a single use CTPOP.
- if (N0.getOpcode() != ISD::CTPOP || !N0.hasOneUse())
- return SDValue();
-
- // RHS needs to be 1.
- if (!isOneConstant(N1))
- return SDValue();
-
- SDLoc DL(N);
- SDValue X = N0.getOperand(0);
-
- // If this is 64-bit, its always best to xor the two 32-bit pieces together
- // even if we have popcnt.
- if (VT == MVT::i64) {
- SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
- DAG.getNode(ISD::SRL, DL, VT, X,
- DAG.getConstant(32, DL, MVT::i8)));
- SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
- X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
- // Generate a 32-bit parity idiom. This will bring us back here if we need
- // to expand it too.
- SDValue Parity = DAG.getNode(ISD::AND, DL, MVT::i32,
- DAG.getNode(ISD::CTPOP, DL, MVT::i32, X),
- DAG.getConstant(1, DL, MVT::i32));
- return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Parity);
- }
- assert(VT == MVT::i32 && "Unexpected VT!");
-
- // Xor the high and low 16-bits together using a 32-bit operation.
- SDValue Hi16 = DAG.getNode(ISD::SRL, DL, VT, X,
- DAG.getConstant(16, DL, MVT::i8));
- X = DAG.getNode(ISD::XOR, DL, VT, X, Hi16);
-
- // Finally xor the low 2 bytes together and use a 8-bit flag setting xor.
- // This should allow an h-reg to be used to save a shift.
- // FIXME: We only get an h-reg in 32-bit mode.
- SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
- DAG.getNode(ISD::SRL, DL, VT, X,
- DAG.getConstant(8, DL, MVT::i8)));
- SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
- SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
- SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
-
- // Copy the inverse of the parity flag into a register with setcc.
- SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
- // Zero extend to original type.
- return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp);
-}
-
-
// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
// Where C is a mask containing the same number of bits as the setcc and
// where the setcc will freely 0 upper bits of k-register. We can replace the
@@ -42894,10 +44061,6 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
}
}
- // This must be done before legalization has expanded the ctpop.
- if (SDValue V = combineParity(N, DAG, Subtarget))
- return V;
-
// Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
// TODO: Support multiple SrcOps.
if (VT == MVT::i1) {
@@ -42967,7 +44130,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (VT == SrcVecVT.getScalarType() &&
N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
- llvm::all_of(EltBits, [](APInt M) {
+ llvm::all_of(EltBits, [](const APInt &M) {
return M.isNullValue() || M.isAllOnesValue();
})) {
unsigned NumElts = SrcVecVT.getVectorNumElements();
@@ -42986,6 +44149,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue Shuffle = combineX86ShufflesRecursively(
{SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
+ X86::MaxShuffleCombineDepth,
/*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
N->getOperand(0).getOperand(1));
@@ -43653,14 +44817,13 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
unsigned NumElems = VT.getVectorNumElements();
EVT ScalarVT = VT.getVectorElementType();
- if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
- NumElems >= 2 && isPowerOf2_32(NumElems)))
+ if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
return SDValue();
// InScalarVT is the intermediate type in AVG pattern and it should be greater
// than the original input type (i8/i16).
EVT InScalarVT = InVT.getVectorElementType();
- if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
+ if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
return SDValue();
if (!Subtarget.hasSSE2())
@@ -43688,8 +44851,8 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
};
// Check if each element of the vector is right-shifted by one.
- auto LHS = In.getOperand(0);
- auto RHS = In.getOperand(1);
+ SDValue LHS = In.getOperand(0);
+ SDValue RHS = In.getOperand(1);
if (!IsConstVectorInRange(RHS, 1, 1))
return SDValue();
if (LHS.getOpcode() != ISD::ADD)
@@ -43705,6 +44868,29 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
};
+ auto AVGSplitter = [&](SDValue Op0, SDValue Op1) {
+ // Pad to a power-of-2 vector, split+apply and extract the original vector.
+ unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
+ EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
+ if (NumElemsPow2 != NumElems) {
+ SmallVector<SDValue, 32> Ops0(NumElemsPow2, DAG.getUNDEF(ScalarVT));
+ SmallVector<SDValue, 32> Ops1(NumElemsPow2, DAG.getUNDEF(ScalarVT));
+ for (unsigned i = 0; i != NumElems; ++i) {
+ SDValue Idx = DAG.getIntPtrConstant(i, DL);
+ Ops0[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op0, Idx);
+ Ops1[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op1, Idx);
+ }
+ Op0 = DAG.getBuildVector(Pow2VT, DL, Ops0);
+ Op1 = DAG.getBuildVector(Pow2VT, DL, Ops1);
+ }
+ SDValue Res =
+ SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, {Op0, Op1}, AVGBuilder);
+ if (NumElemsPow2 == NumElems)
+ return Res;
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+ };
+
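// Illustration only, not part of this patch (helper name is ours): AVGSplitter
// feeds X86ISD::AVG, whose per-element semantics (PAVGB/PAVGW) are a rounded-up
// average computed as if in a wider type, e.g. for i8 elements:
static inline unsigned char pavgb(unsigned char A, unsigned char B) {
  return (unsigned char)(((unsigned)A + (unsigned)B + 1) >> 1);
}
// Padding to a power-of-2 width with undef lanes is safe because the padded
// lanes are discarded again by the trailing EXTRACT_SUBVECTOR.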
// Take care of the case when one of the operands is a constant vector whose
// element is in the range [1, 256].
if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
@@ -43715,9 +44901,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
SDValue VecOnes = DAG.getConstant(1, DL, InVT);
Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
- return SplitOpsAndApply(DAG, Subtarget, DL, VT,
- { Operands[0].getOperand(0), Operands[1] },
- AVGBuilder);
+ return AVGSplitter(Operands[0].getOperand(0), Operands[1]);
}
// Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
@@ -43764,8 +44948,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
}
// The pattern is detected, emit X86ISD::AVG instruction(s).
- return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Operands[0], Operands[1]},
- AVGBuilder);
+ return AVGSplitter(Operands[0], Operands[1]);
}
return SDValue();
@@ -43798,7 +44981,8 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
unsigned HalfOffset = 16;
SDValue Ptr1 = Ld->getBasePtr();
- SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfOffset, dl);
+ SDValue Ptr2 =
+ DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
NumElems / 2);
SDValue Load1 =
@@ -43832,6 +45016,29 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
}
}
+ // If we also broadcast this as a subvector to a wider type, then just extract
+ // the lowest subvector.
+ if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
+ (RegVT.is128BitVector() || RegVT.is256BitVector())) {
+ SDValue Ptr = Ld->getBasePtr();
+ SDValue Chain = Ld->getChain();
+ for (SDNode *User : Ptr->uses()) {
+ if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
+ cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
+ cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
+ cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
+ MemVT.getSizeInBits() &&
+ !User->hasAnyUseOfValue(1) &&
+ User->getValueSizeInBits(0).getFixedSize() >
+ RegVT.getFixedSizeInBits()) {
+ SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
+ RegVT.getSizeInBits());
+ Extract = DAG.getBitcast(RegVT, Extract);
+ return DCI.CombineTo(N, Extract, SDValue(User, 1));
+ }
+ }
+ }
+
// Cast ptr32 and ptr64 pointers to the default address space before a load.
unsigned AddrSpace = Ld->getAddressSpace();
if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
@@ -43873,7 +45080,7 @@ static int getOneTrueElt(SDValue V) {
auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
if (!ConstNode)
return -1;
- if (ConstNode->getAPIntValue().isAllOnesValue()) {
+ if (ConstNode->getAPIntValue().countTrailingOnes() >= 1) {
// If we already found a one, this is too many.
if (TrueIndex >= 0)
return -1;
@@ -43889,7 +45096,8 @@ static int getOneTrueElt(SDValue V) {
/// scalar element, and the alignment for the scalar memory access.
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
SelectionDAG &DAG, SDValue &Addr,
- SDValue &Index, unsigned &Alignment) {
+ SDValue &Index, Align &Alignment,
+ unsigned &Offset) {
int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
if (TrueMaskElt < 0)
return false;
@@ -43897,14 +45105,17 @@ static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
// Get the address of the one scalar element that is specified by the mask
// using the appropriate offset from the base pointer.
EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
+ Offset = 0;
Addr = MaskedOp->getBasePtr();
if (TrueMaskElt != 0) {
- unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
- Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
+ Offset = TrueMaskElt * EltVT.getStoreSize();
+ Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
+ SDLoc(MaskedOp));
}
Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
- Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
+ Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
+ EltVT.getStoreSize());
return true;
}
@@ -43914,15 +45125,17 @@ static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
assert(ML->isUnindexed() && "Unexpected indexed masked load!");
// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
// However, some target hooks may need to be added to know when the transform
// is profitable. Endianness would also have to be considered.
SDValue Addr, VecIndex;
- unsigned Alignment;
- if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
+ Align Alignment;
+ unsigned Offset;
+ if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
return SDValue();
// Load the one scalar element that is specified by the mask using the
@@ -43930,13 +45143,25 @@ reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
SDLoc DL(ML);
EVT VT = ML->getValueType(0);
EVT EltVT = VT.getVectorElementType();
+
+ EVT CastVT = VT;
+ if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
+ EltVT = MVT::f64;
+ CastVT =
+ EVT::getVectorVT(*DAG.getContext(), EltVT, VT.getVectorNumElements());
+ }
+
SDValue Load =
- DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
+ DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
+ ML->getPointerInfo().getWithOffset(Offset),
Alignment, ML->getMemOperand()->getFlags());
+ SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
+
// Insert the loaded element into the appropriate place in the vector.
- SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
- ML->getPassThru(), Load, VecIndex);
+ SDValue Insert =
+ DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
+ Insert = DAG.getBitcast(VT, Insert);
return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
}
@@ -43999,7 +45224,8 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
return SDValue();
if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
- if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
+ if (SDValue ScalarLoad =
+ reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
return ScalarLoad;
// TODO: Do some AVX512 subsets benefit from this transform?
@@ -44036,25 +45262,35 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
- SelectionDAG &DAG) {
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
// However, some target hooks may need to be added to know when the transform
// is profitable. Endianness would also have to be considered.
SDValue Addr, VecIndex;
- unsigned Alignment;
- if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
+ Align Alignment;
+ unsigned Offset;
+ if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
return SDValue();
// Extract the one scalar element that is actually being stored.
SDLoc DL(MS);
- EVT VT = MS->getValue().getValueType();
+ SDValue Value = MS->getValue();
+ EVT VT = Value.getValueType();
EVT EltVT = VT.getVectorElementType();
- SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
- MS->getValue(), VecIndex);
+ if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
+ EltVT = MVT::f64;
+ EVT CastVT =
+ EVT::getVectorVT(*DAG.getContext(), EltVT, VT.getVectorNumElements());
+ Value = DAG.getBitcast(CastVT, Value);
+ }
+ SDValue Extract =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
// Store that element at the appropriate offset from the base pointer.
- return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
+ return DAG.getStore(MS->getChain(), DL, Extract, Addr,
+ MS->getPointerInfo().getWithOffset(Offset),
Alignment, MS->getMemOperand()->getFlags());
}
@@ -44072,7 +45308,7 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
if (Mst->isTruncatingStore())
return SDValue();
- if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
+ if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
return ScalarStore;
// If the mask value has been legalized to a non-boolean vector, try to
@@ -44133,17 +45369,21 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
StoredVal.getOperand(0).getValueType() == MVT::i8) {
- return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
+ SDValue Val = StoredVal.getOperand(0);
+ // We must store zeros to the unused bits.
+ Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
+ return DAG.getStore(St->getChain(), dl, Val,
St->getBasePtr(), St->getPointerInfo(),
St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
// Widen v2i1/v4i1 stores to v8i1.
- if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
+ if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
Subtarget.hasAVX512()) {
unsigned NumConcats = 8 / VT.getVectorNumElements();
- SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
+ // We must store zeros to the unused bits.
+ SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
Ops[0] = StoredVal;
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
@@ -44165,7 +45405,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
Hi = combinevXi1ConstantToInteger(Hi, DAG);
SDValue Ptr0 = St->getBasePtr();
- SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);
+ SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
SDValue Ch0 =
DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
@@ -44244,6 +45484,36 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
VT, St->getMemOperand(), DAG);
}
+ // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
+ if (!St->isTruncatingStore() && StoredVal.hasOneUse()) {
+ auto IsExtractedElement = [](SDValue V) {
+ if (V.getOpcode() == ISD::TRUNCATE && V.getOperand(0).hasOneUse())
+ V = V.getOperand(0);
+ unsigned Opc = V.getOpcode();
+ if (Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) {
+ if (V.getOperand(0).hasOneUse() && isNullConstant(V.getOperand(1)))
+ return V.getOperand(0);
+ }
+ return SDValue();
+ };
+ if (SDValue Extract = IsExtractedElement(StoredVal)) {
+ SDValue Trunc = peekThroughOneUseBitcasts(Extract);
+ if (Trunc.getOpcode() == X86ISD::VTRUNC) {
+ SDValue Src = Trunc.getOperand(0);
+ MVT DstVT = Trunc.getSimpleValueType();
+ MVT SrcVT = Src.getSimpleValueType();
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
+ MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
+ if (NumTruncBits == VT.getSizeInBits() &&
+ TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
+ return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
+ TruncVT, St->getMemOperand());
+ }
+ }
+ }
+ }
+
// Optimize trunc store (of multiple scalars) to shuffle and store.
// First, pack all of the elements in one place. Next, store to memory
// in fewer chunks.
@@ -44386,8 +45656,9 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
-static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
- const X86Subtarget &Subtarget, bool IsCommutative,
+static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget,
+ bool IsCommutative,
SmallVectorImpl<int> &PostShuffleMask) {
// If either operand is undef, bail out. The binop should be simplified.
if (LHS.isUndef() || RHS.isUndef())
@@ -44481,12 +45752,6 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
RMask.push_back(i);
}
- // Avoid 128-bit lane crossing if pre-AVX2 and FP (integer will split).
- if (!Subtarget.hasAVX2() && VT.isFloatingPoint() &&
- (isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), LMask) ||
- isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), RMask)))
- return false;
-
// If A and B occur in reverse order in RHS, then canonicalize by commuting
// RHS operands and shuffle mask.
if (A != C) {
@@ -44541,23 +45806,39 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
}
}
- LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
- RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
+ SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
+ SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
bool IsIdentityPostShuffle =
isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
if (IsIdentityPostShuffle)
PostShuffleMask.clear();
+ // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
+ if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
+ isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
+ return false;
+
+ // If the source nodes are already used in HorizOps then always accept this.
+ // Shuffle folding should merge these back together.
+ bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
+ return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
+ });
+ bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
+ return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
+ });
+ bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
+
// Assume a SingleSource HOP if we only shuffle one input and don't need to
// shuffle the result.
- if (!shouldUseHorizontalOp(LHS == RHS &&
+ if (!ForceHorizOp &&
+ !shouldUseHorizontalOp(NewLHS == NewRHS &&
(NumShuffles < 2 || !IsIdentityPostShuffle),
DAG, Subtarget))
return false;
- LHS = DAG.getBitcast(VT, LHS);
- RHS = DAG.getBitcast(VT, RHS);
+ LHS = DAG.getBitcast(VT, NewLHS);
+ RHS = DAG.getBitcast(VT, NewRHS);
return true;
}
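// Illustration only, not part of this patch (helper name is ours): a scalar
// model of what X86ISD::HADD produces for two v4i32 operands; the matcher
// above only has to prove that the shuffled inputs line up with this pairing:
static void haddv4i32(const int A[4], const int B[4], int R[4]) {
  R[0] = A[0] + A[1];
  R[1] = A[2] + A[3];
  R[2] = B[0] + B[1];
  R[3] = B[2] + B[3];
}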
@@ -44575,7 +45856,8 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
SmallVector<int, 8> PostShuffleMask;
if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
(Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
- isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd, PostShuffleMask)) {
+ isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsFadd,
+ PostShuffleMask)) {
SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
if (!PostShuffleMask.empty())
HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
@@ -44583,8 +45865,6 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
return HorizBinOp;
}
- // NOTE: isHorizontalBinOp may have changed LHS/RHS variables.
-
return SDValue();
}
@@ -44722,7 +46002,7 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
EVT OutSVT = OutVT.getVectorElementType();
EVT InSVT = InVT.getVectorElementType();
- if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
+ if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
(OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
NumElems >= 8))
return SDValue();
@@ -44784,8 +46064,13 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
// there's no harm in trying pack.
if (Subtarget.hasAVX512() &&
!(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
- InVT.is512BitVector()))
+ InVT.is512BitVector())) {
+ // PACK should still be worth it for 128-bit vectors if the sources were
+ // originally concatenated from subvectors.
+ SmallVector<SDValue> ConcatOps;
+ if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps))
return SDValue();
+ }
unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
@@ -44807,9 +46092,23 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
return SDValue();
- if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
+ unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
+ if (NumSignBits > MinSignBits)
return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
+ // If we have a srl that only generates signbits that we will discard in
+ // the truncation then we can use PACKSS by converting the srl to a sra.
+ // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
+ if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
+ if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
+ In, APInt::getAllOnesValue(VT.getVectorNumElements()))) {
+ if (*ShAmt == MinSignBits) {
+ SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
+ return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
+ Subtarget);
+ }
+ }
+
return SDValue();
}
@@ -46127,7 +47426,6 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
- EVT InVT = N0.getValueType();
SDLoc DL(N);
// (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
@@ -46156,23 +47454,17 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
return V;
- if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
- isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
- // Invert and sign-extend a boolean is the same as zero-extend and subtract
- // 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
- // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
- // sext (xor Bool, -1) --> sub (zext Bool), 1
- SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
- return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
- }
-
if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
return V;
- if (VT.isVector())
+ if (VT.isVector()) {
if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
return R;
+ if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
+ return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
+ }
+
if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
return NewAdd;
@@ -46191,14 +47483,23 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
if (!TLI.isTypeLegal(VT))
return SDValue();
- EVT ScalarVT = VT.getScalarType();
- if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
- return SDValue();
-
SDValue A = N->getOperand(IsStrict ? 1 : 0);
SDValue B = N->getOperand(IsStrict ? 2 : 1);
SDValue C = N->getOperand(IsStrict ? 3 : 2);
+ // If the operation allows fast-math and the target does not support FMA,
+ // split this into mul+add to avoid libcall(s).
+ SDNodeFlags Flags = N->getFlags();
+ if (!IsStrict && Flags.hasAllowReassociation() &&
+ TLI.isOperationExpand(ISD::FMA, VT)) {
+ SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
+ return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
+ }
+
+ EVT ScalarVT = VT.getScalarType();
+ if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
+ return SDValue();
+
auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
bool LegalOperations = !DCI.isBeforeLegalizeOps();
@@ -46621,7 +47922,7 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
- // Fold movmsk(not(x)) -> not(movmsk) to improve folding of movmsk results
+ // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
// with scalar comparisons.
if (SDValue NotSrc = IsNOT(Src, DAG)) {
SDLoc DL(N);
@@ -46632,6 +47933,17 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
DAG.getConstant(NotMask, DL, VT));
}
+ // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
+ // results with scalar comparisons.
+ if (Src.getOpcode() == X86ISD::PCMPGT &&
+ ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
+ SDLoc DL(N);
+ APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
+ return DAG.getNode(ISD::XOR, DL, VT,
+ DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
+ DAG.getConstant(NotMask, DL, VT));
+ }
+
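// Illustration only, not part of this patch (helper name is ours): the fold
// above uses (x > -1) == !(x < 0), i.e. PCMPGT against all-ones tests "sign
// bit clear", so its MOVMSK equals the bitwise NOT of movmsk(x) on the low
// NumElts bits -- exactly the XOR mask built above:
static unsigned notMovmsk(unsigned Msk, unsigned NumElts) {
  unsigned Low = (NumElts < 32 ? (1u << NumElts) : 0u) - 1u;
  return ~Msk & Low;
}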
// Simplify the inputs.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getAllOnesValue(NumBits));
@@ -46669,7 +47981,8 @@ static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
return DAG.getMaskedGather(Gather->getVTList(),
Gather->getMemoryVT(), DL, Ops,
Gather->getMemOperand(),
- Gather->getIndexType());
+ Gather->getIndexType(),
+ Gather->getExtensionType());
}
auto *Scatter = cast<MaskedScatterSDNode>(GorS);
SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
@@ -46677,7 +47990,8 @@ static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
return DAG.getMaskedScatter(Scatter->getVTList(),
Scatter->getMemoryVT(), DL,
Ops, Scatter->getMemOperand(),
- Scatter->getIndexType());
+ Scatter->getIndexType(),
+ Scatter->isTruncatingStore());
}
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
@@ -47672,17 +48986,18 @@ static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG,
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
bool IsAdd = N->getOpcode() == ISD::ADD;
+ auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode");
SmallVector<int, 8> PostShuffleMask;
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
VT == MVT::v8i32) &&
Subtarget.hasSSSE3() &&
- isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd, PostShuffleMask)) {
- auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL,
- ArrayRef<SDValue> Ops) {
- return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, DL,
- Ops[0].getValueType(), Ops);
+ isHorizontalBinOp(HorizOpcode, Op0, Op1, DAG, Subtarget, IsAdd,
+ PostShuffleMask)) {
+ auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
};
SDValue HorizBinOp =
SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder);
@@ -47747,12 +49062,11 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
if (!VT.isVector())
return SDValue();
- // PSUBUS is supported, starting from SSE2, but truncation for v8i32
- // is only worth it with SSSE3 (PSHUFB).
+ // PSUBUS is supported, starting from SSE2.
EVT EltVT = VT.getVectorElementType();
- if (!(Subtarget.hasSSE2() && (EltVT == MVT::i8 || EltVT == MVT::i16)) &&
- !(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) &&
- !(Subtarget.useBWIRegs() && (VT == MVT::v16i32)))
+ if (!(Subtarget.hasSSE2() &&
+ (EltVT == MVT::i8 || EltVT == MVT::i16 || VT == MVT::v8i32 ||
+ VT == MVT::v8i64 || VT == MVT::v16i32)))
return SDValue();
SDValue SubusLHS, SubusRHS;
@@ -47788,9 +49102,9 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
SDValue MinLHS = Op1.getOperand(0).getOperand(0);
SDValue MinRHS = Op1.getOperand(0).getOperand(1);
EVT TruncVT = Op1.getOperand(0).getValueType();
- if (!(Subtarget.hasSSSE3() && (TruncVT == MVT::v8i32 ||
- TruncVT == MVT::v8i64)) &&
- !(Subtarget.useBWIRegs() && (TruncVT == MVT::v16i32)))
+ if (!(Subtarget.hasSSE2() &&
+ (TruncVT == MVT::v8i32 || TruncVT == MVT::v8i64 ||
+ TruncVT == MVT::v16i32)))
return SDValue();
SDValue OpToSaturate;
if (MinLHS.getOpcode() == ISD::ZERO_EXTEND &&
@@ -47828,7 +49142,7 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
// values, or first 48 bits for 64 bit values.
KnownBits Known = DAG.computeKnownBits(SubusLHS);
unsigned NumZeros = Known.countMinLeadingZeros();
- if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
+ if (NumZeros < (VT.getScalarSizeInBits() - 16))
return SDValue();
EVT ExtType = SubusLHS.getValueType();
@@ -47928,43 +49242,47 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
SDValue Op0 = Ops[0];
bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
- // Fold subvector loads into one.
- // If needed, look through bitcasts to get to the load.
- if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
- bool Fast;
- const X86TargetLowering *TLI = Subtarget.getTargetLowering();
- if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
- *FirstLd->getMemOperand(), &Fast) &&
- Fast) {
- if (SDValue Ld =
- EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
- return Ld;
- }
- }
-
// Repeated subvectors.
- if (IsSplat) {
- // If this broadcast/subv_broadcast is inserted into both halves, use a
- // larger broadcast/subv_broadcast.
- if (Op0.getOpcode() == X86ISD::VBROADCAST ||
- Op0.getOpcode() == X86ISD::SUBV_BROADCAST)
+ if (IsSplat &&
+ (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
+ // If this broadcast is inserted into both halves, use a larger broadcast.
+ if (Op0.getOpcode() == X86ISD::VBROADCAST)
return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
- // If this broadcast_load is inserted into both halves, use a larger
- // broadcast_load. Update other uses to use an extracted subvector.
- if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
+ // If this scalar/subvector broadcast_load is inserted into both halves, use
+ // a larger broadcast_load. Update other uses to use an extracted subvector.
+ if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
+ Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
auto *MemIntr = cast<MemIntrinsicSDNode>(Op0);
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
- SDValue BcastLd = DAG.getMemIntrinsicNode(
- X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
- MemIntr->getMemOperand());
+ SDValue BcastLd = DAG.getMemIntrinsicNode(Op0.getOpcode(), DL, Tys, Ops,
+ MemIntr->getMemoryVT(),
+ MemIntr->getMemOperand());
DAG.ReplaceAllUsesOfValueWith(
Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
return BcastLd;
}
+ // If this is a simple subvector load repeated across multiple lanes, then
+ // broadcast the load. Update other uses to use an extracted subvector.
+ if (auto *Ld = dyn_cast<LoadSDNode>(Op0)) {
+ if (Ld->isSimple() && !Ld->isNonTemporal() &&
+ Ld->getExtensionType() == ISD::NON_EXTLOAD) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops,
+ Ld->getMemoryVT(), Ld->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(
+ Op0,
+ extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
+ return BcastLd;
+ }
+ }
+
// concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
(Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
@@ -48042,6 +49360,38 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
return DAG.getBitcast(VT, Res);
}
break;
+ case X86ISD::VPERMV3:
+ if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
+ MVT OpVT = Op0.getSimpleValueType();
+ int NumSrcElts = OpVT.getVectorNumElements();
+ SmallVector<int, 64> ConcatMask;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ bool IsUnary;
+ SmallVector<int, 64> SubMask;
+ SmallVector<SDValue, 2> SubOps;
+ if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
+ SubMask, IsUnary))
+ break;
+ for (int M : SubMask) {
+ if (0 <= M) {
+ M += M < NumSrcElts ? 0 : NumSrcElts;
+ M += i * NumSrcElts;
+ }
+ ConcatMask.push_back(M);
+ }
+ }
+ if (ConcatMask.size() == (NumOps * NumSrcElts)) {
+ SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
+ Ops[1].getOperand(0), DAG, DL);
+ SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
+ Ops[1].getOperand(2), DAG, DL);
+ MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
+ MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
+ SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
+ return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
+ }
+ }
+ break;
case X86ISD::VSHLI:
case X86ISD::VSRAI:
case X86ISD::VSRLI:
@@ -48074,10 +49424,33 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
Op0.getOperand(1));
}
break;
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case X86ISD::ANDNP:
+ // TODO: Add 256-bit support.
+ if (!IsSplat && VT.is512BitVector()) {
+ SmallVector<SDValue, 2> LHS, RHS;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ LHS.push_back(Ops[i].getOperand(0));
+ RHS.push_back(Ops[i].getOperand(1));
+ }
+ MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
+ SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
+ NumOps * SrcVT.getVectorNumElements());
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
+ }
+ break;
+ case X86ISD::HADD:
+ case X86ISD::HSUB:
+ case X86ISD::FHADD:
+ case X86ISD::FHSUB:
case X86ISD::PACKSS:
case X86ISD::PACKUS:
- if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
- Subtarget.hasInt256()) {
+ if (!IsSplat && VT.is256BitVector() &&
+ (VT.isFloatingPoint() || Subtarget.hasInt256())) {
SmallVector<SDValue, 2> LHS, RHS;
for (unsigned i = 0; i != NumOps; ++i) {
LHS.push_back(Ops[i].getOperand(0));
@@ -48112,6 +49485,20 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
}
}
+ // Fold subvector loads into one.
+ // If needed, look through bitcasts to get to the load.
+ if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
+ bool Fast;
+ const X86TargetLowering *TLI = Subtarget.getTargetLowering();
+ if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ *FirstLd->getMemOperand(), &Fast) &&
+ Fast) {
+ if (SDValue Ld =
+ EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
+ return Ld;
+ }
+ }
+
return SDValue();
}
@@ -48183,7 +49570,8 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
SDValue Ins = SubVec.getOperand(0);
if (isNullConstant(Ins.getOperand(2)) &&
ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
- Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
+ Ins.getOperand(1).getValueSizeInBits().getFixedSize() <=
+ SubVecVT.getFixedSizeInBits())
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
getZeroVector(OpVT, Subtarget, DAG, dl),
Ins.getOperand(1), N->getOperand(2));
@@ -48336,12 +49724,14 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
unsigned IdxVal = N->getConstantOperandVal(1);
SDValue InVecBC = peekThroughBitcasts(InVec);
EVT InVecVT = InVec.getValueType();
+ unsigned SizeInBits = VT.getSizeInBits();
+ unsigned InSizeInBits = InVecVT.getSizeInBits();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
TLI.isTypeLegal(InVecVT) &&
- InVecVT.getSizeInBits() == 256 && InVecBC.getOpcode() == ISD::AND) {
- auto isConcatenatedNot = [] (SDValue V) {
+ InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
+ auto isConcatenatedNot = [](SDValue V) {
V = peekThroughBitcasts(V);
if (!isBitwiseNot(V))
return false;
@@ -48384,53 +49774,32 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
- InVec.getOperand(1).getValueSizeInBits() <= VT.getSizeInBits()) {
+ InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
SDLoc DL(N);
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
getZeroVector(VT, Subtarget, DAG, DL),
InVec.getOperand(1), InVec.getOperand(2));
}
- // If we're extracting from a broadcast then we're better off just
- // broadcasting to the smaller type directly, assuming this is the only use.
- // As its a broadcast we don't care about the extraction index.
- if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() &&
- InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits())
- return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0));
-
- if (InVec.getOpcode() == X86ISD::VBROADCAST_LOAD && InVec.hasOneUse()) {
- auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
- if (MemIntr->getMemoryVT().getSizeInBits() <= VT.getSizeInBits()) {
- SDVTList Tys = DAG.getVTList(VT, MVT::Other);
- SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
- SDValue BcastLd =
- DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
- MemIntr->getMemoryVT(),
- MemIntr->getMemOperand());
- DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
- return BcastLd;
- }
- }
-
// If we're extracting an upper subvector from a broadcast we should just
// extract the lowest subvector instead which should allow
// SimplifyDemandedVectorElts do more simplifications.
if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
InVec.getOpcode() == X86ISD::VBROADCAST_LOAD))
- return extractSubVector(InVec, 0, DAG, SDLoc(N), VT.getSizeInBits());
+ return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
- // If we're extracting a broadcasted subvector, just use the source.
- if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST &&
- InVec.getOperand(0).getValueType() == VT)
- return InVec.getOperand(0);
+ // If we're extracting a broadcasted subvector, just use the lowest subvector.
+ if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
+ cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
+ return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
// Attempt to extract from the source of a shuffle vector.
- if ((InVecVT.getSizeInBits() % VT.getSizeInBits()) == 0 &&
+ if ((InSizeInBits % SizeInBits) == 0 &&
(IdxVal % VT.getVectorNumElements()) == 0) {
SmallVector<int, 32> ShuffleMask;
SmallVector<int, 32> ScaledMask;
SmallVector<SDValue, 2> ShuffleInputs;
- unsigned NumSubVecs = InVecVT.getSizeInBits() / VT.getSizeInBits();
+ unsigned NumSubVecs = InSizeInBits / SizeInBits;
// Decode the shuffle mask and scale it so its shuffling subvectors.
if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
@@ -48440,19 +49809,19 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
if (ScaledMask[SubVecIdx] == SM_SentinelZero)
return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
- if (Src.getValueSizeInBits() == InVecVT.getSizeInBits()) {
+ if (Src.getValueSizeInBits() == InSizeInBits) {
unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
- SDLoc(N), VT.getSizeInBits());
+ SDLoc(N), SizeInBits);
}
}
}
// If we're extracting the lowest subvector and we're the only user,
// we may be able to perform this with a smaller vector width.
+ unsigned InOpcode = InVec.getOpcode();
if (IdxVal == 0 && InVec.hasOneUse()) {
- unsigned InOpcode = InVec.getOpcode();
if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
// v2f64 CVTDQ2PD(v4i32).
if (InOpcode == ISD::SINT_TO_FP &&
@@ -48476,10 +49845,14 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
InOpcode == ISD::SIGN_EXTEND ||
InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
- VT.is128BitVector() &&
- InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
+ (SizeInBits == 128 || SizeInBits == 256) &&
+ InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
+ SDLoc DL(N);
+ SDValue Ext = InVec.getOperand(0);
+ if (Ext.getValueSizeInBits() > SizeInBits)
+ Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
- return DAG.getNode(ExtOp, SDLoc(N), VT, InVec.getOperand(0));
+ return DAG.getNode(ExtOp, DL, VT, Ext);
}
if (InOpcode == ISD::VSELECT &&
InVec.getOperand(0).getValueType().is256BitVector() &&
@@ -48491,6 +49864,25 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
}
+ if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
+ (VT.is128BitVector() || VT.is256BitVector())) {
+ SDLoc DL(N);
+ SDValue InVecSrc = InVec.getOperand(0);
+ unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
+ SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
+ return DAG.getNode(InOpcode, DL, VT, Ext);
+ }
+ }
+
+ // Always split vXi64 logical shifts where we're extracting the upper 32-bits
+ // as this is very likely to fold into a shuffle/truncation.
+ if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
+ InVecVT.getScalarSizeInBits() == 64 &&
+ InVec.getConstantOperandAPInt(1) == 32) {
+ SDLoc DL(N);
+ SDValue Ext =
+ extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
+ return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
}
return SDValue();
@@ -48574,7 +49966,7 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
// If the input is an extend_invec and the SimplifyDemandedBits call didn't
// convert it to any_extend_invec, due to the LegalOperations check, do the
// conversion directly to a vector shuffle manually. This exposes combine
- // opportunities missed by combineExtInVec not calling
+ // opportunities missed by combineEXTEND_VECTOR_INREG not calling
// combineX86ShufflesRecursively on SSE4.1 targets.
// FIXME: This is basically a hack around several other issues related to
// ANY_EXTEND_VECTOR_INREG.
@@ -48602,11 +49994,13 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
+static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
+ unsigned Opcode = N->getOpcode();
+ unsigned InOpcode = In.getOpcode();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Try to merge vector loads and extend_inreg to an extload.
@@ -48615,7 +50009,7 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
auto *Ld = cast<LoadSDNode>(In);
if (Ld->isSimple()) {
MVT SVT = In.getSimpleValueType().getVectorElementType();
- ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG
+ ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
? ISD::SEXTLOAD
: ISD::ZEXTLOAD;
EVT MemVT =
@@ -48623,8 +50017,7 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
SDValue Load =
DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), MemVT,
- Ld->getOriginalAlign(),
+ Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
return Load;
@@ -48632,9 +50025,23 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
}
}
+ // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
+ if (Opcode == InOpcode)
+ return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0));
+
+ // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
+ // -> EXTEND_VECTOR_INREG(X).
+ // TODO: Handle non-zero subvector indices.
+ if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
+ In.getOperand(0).getOpcode() == getOpcode_EXTEND(Opcode) &&
+ In.getOperand(0).getOperand(0).getValueSizeInBits() ==
+ In.getValueSizeInBits())
+ return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0).getOperand(0));
+
// Attempt to combine as a shuffle.
- // TODO: SSE41 support
- if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) {
+ // TODO: General ZERO_EXTEND_VECTOR_INREG support.
+ if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
+ (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasSSE41())) {
SDValue Op(N, 0);
if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
@@ -48755,11 +50162,15 @@ static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
}
-// Try to find a larger VBROADCAST_LOAD that we can extract from. Limit this to
-// cases where the loads have the same input chain and the output chains are
-// unused. This avoids any memory ordering issues.
-static SDValue combineVBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
+// from. Limit this to cases where the loads have the same input chain and the
+// output chains are unused. This avoids any memory ordering issues.
+static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
+ N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
+ "Unknown broadcast load type");
+
// Only do this if the chain result is unused.
if (N->hasAnyUseOfValue(1))
return SDValue();
@@ -48774,13 +50185,13 @@ static SDValue combineVBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
// Look at other users of our base pointer and try to find a wider broadcast.
// The input chain and the size of the memory VT must match.
for (SDNode *User : Ptr->uses())
- if (User != N && User->getOpcode() == X86ISD::VBROADCAST_LOAD &&
+ if (User != N && User->getOpcode() == N->getOpcode() &&
cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
MemVT.getSizeInBits() &&
!User->hasAnyUseOfValue(1) &&
- User->getValueSizeInBits(0) > VT.getSizeInBits()) {
+ User->getValueSizeInBits(0).getFixedSize() > VT.getFixedSizeInBits()) {
SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
VT.getSizeInBits());
Extract = DAG.getBitcast(VT, Extract);
@@ -48851,6 +50262,17 @@ static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0),
+ APInt::getAllOnesValue(NumBits), DCI))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
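// Illustration only, not part of this patch (helper name is ours): a scalar
// model of PDEP, showing why demanded-bits simplification applies -- result
// bits can only be set where the mask operand has a set bit.
static unsigned long long pdep64(unsigned long long Src,
                                 unsigned long long Mask) {
  unsigned long long Res = 0;
  for (; Mask; Mask &= Mask - 1, Src >>= 1)
    if (Src & 1)
      Res |= Mask & -Mask; // deposit next source bit at lowest set mask bit
  return Res;
}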
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -48887,7 +50309,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
- case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget);
+ case X86ISD::BEXTR:
+ case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
@@ -48932,13 +50355,17 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
case ISD::ANY_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
- case ISD::ZERO_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG, DCI,
- Subtarget);
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
case X86ISD::PACKSS:
case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
+ case X86ISD::HADD:
+ case X86ISD::HSUB:
+ case X86ISD::FHADD:
+ case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
case X86ISD::VSHL:
case X86ISD::VSRA:
case X86ISD::VSRL:
@@ -49015,8 +50442,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::STRICT_FP_EXTEND:
case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
- case X86ISD::VBROADCAST_LOAD: return combineVBROADCAST_LOAD(N, DAG, DCI);
+ case X86ISD::VBROADCAST_LOAD:
+ case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
+ case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
}
return SDValue();
@@ -49305,7 +50734,7 @@ static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
.Case("{@ccnl}", X86::COND_GE)
.Case("{@ccnle}", X86::COND_G)
.Case("{@ccno}", X86::COND_NO)
- .Case("{@ccnp}", X86::COND_P)
+ .Case("{@ccnp}", X86::COND_NP)
.Case("{@ccns}", X86::COND_NS)
.Case("{@cco}", X86::COND_O)
.Case("{@ccp}", X86::COND_P)
@@ -49541,8 +50970,8 @@ LowerXConstraint(EVT ConstraintVT) const {
// Lower @cc targets via setcc.
SDValue X86TargetLowering::LowerAsmOutputForConstraint(
- SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &OpInfo,
- SelectionDAG &DAG) const {
+ SDValue &Chain, SDValue &Flag, const SDLoc &DL,
+ const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
if (Cond == X86::COND_INVALID)
return SDValue();
@@ -49978,30 +51407,35 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// Not found as a standard register?
if (!Res.second) {
- // Map st(0) -> st(7) -> ST0
- if (Constraint.size() == 7 && Constraint[0] == '{' &&
- tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
- Constraint[3] == '(' &&
- (Constraint[4] >= '0' && Constraint[4] <= '7') &&
- Constraint[5] == ')' && Constraint[6] == '}') {
- // st(7) is not allocatable and thus not a member of RFP80. Return
- // singleton class in cases where we have a reference to it.
- if (Constraint[4] == '7')
- return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
- return std::make_pair(X86::FP0 + Constraint[4] - '0',
- &X86::RFP80RegClass);
- }
-
- // GCC allows "st(0)" to be called just plain "st".
- if (StringRef("{st}").equals_lower(Constraint))
- return std::make_pair(X86::FP0, &X86::RFP80RegClass);
+ // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
+ // to/from f80.
+ if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
+ // Map st(0) -> st(7) -> ST0
+ if (Constraint.size() == 7 && Constraint[0] == '{' &&
+ tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
+ Constraint[3] == '(' &&
+ (Constraint[4] >= '0' && Constraint[4] <= '7') &&
+ Constraint[5] == ')' && Constraint[6] == '}') {
+ // st(7) is not allocatable and thus not a member of RFP80. Return
+ // singleton class in cases where we have a reference to it.
+ if (Constraint[4] == '7')
+ return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
+ return std::make_pair(X86::FP0 + Constraint[4] - '0',
+ &X86::RFP80RegClass);
+ }
+
+ // GCC allows "st(0)" to be called just plain "st".
+ if (StringRef("{st}").equals_lower(Constraint))
+ return std::make_pair(X86::FP0, &X86::RFP80RegClass);
+ }
// flags -> EFLAGS
if (StringRef("{flags}").equals_lower(Constraint))
return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
// dirflag -> DF
- if (StringRef("{dirflag}").equals_lower(Constraint))
+ // Only allow for clobber.
+ if (StringRef("{dirflag}").equals_lower(Constraint) && VT == MVT::Other)
return std::make_pair(X86::DF, &X86::DFCCRRegClass);
// fpsr -> FPSW
@@ -50275,3 +51709,10 @@ X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
.getAsInteger(0, StackProbeSize);
return StackProbeSize;
}
+
+Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
+ if (ML->isInnermost() &&
+ ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
+ return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
+ return TargetLowering::getPrefLoopAlignment();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h
index 7f3dc90a2d73..76c83b7df9eb 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h
@@ -384,8 +384,10 @@ namespace llvm {
/// Vector comparison generating mask bits for fp and
/// integer signed and unsigned data types.
CMPM,
- // Vector comparison with SAE for FP values
- CMPM_SAE,
+ // Vector mask comparison generating mask bits for FP values.
+ CMPMM,
+ // Vector mask comparison with SAE for FP values.
+ CMPMM_SAE,
// Arithmetic operations with FLAGS results.
ADD,
@@ -400,6 +402,7 @@ namespace llvm {
// Bit field extract.
BEXTR,
+ BEXTRI,
// Zero High Bits Starting with Specified Bit Position.
BZHI,
@@ -502,8 +505,6 @@ namespace llvm {
VBROADCAST,
// Broadcast mask to vector.
VBROADCASTM,
- // Broadcast subvector to vector.
- SUBV_BROADCAST,
/// SSE4A Extraction and Insertion.
EXTRQI,
@@ -708,6 +709,9 @@ namespace llvm {
// For avx512-vp2intersect
VP2INTERSECT,
+ // User level interrupts - testui
+ TESTUI,
+
/// X86 strict FP compare instructions.
STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
STRICT_FCMPS,
@@ -747,11 +751,13 @@ namespace llvm {
STRICT_CVTPS2PH,
STRICT_CVTPH2PS,
+ // WARNING: Only add nodes here if they are strict FP nodes. Non-memory and
+ // non-strict FP nodes should be above FIRST_TARGET_STRICTFP_OPCODE.
+
// Compare and swap.
LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
LCMPXCHG8_DAG,
LCMPXCHG16_DAG,
- LCMPXCHG8_SAVE_EBX_DAG,
LCMPXCHG16_SAVE_RBX_DAG,
/// LOCK-prefixed arithmetic read-modify-write instructions.
@@ -768,9 +774,12 @@ namespace llvm {
// extract_vector_elt, store.
VEXTRACT_STORE,
- // scalar broadcast from memory
+ // scalar broadcast from memory.
VBROADCAST_LOAD,
+ // subvector broadcast from memory.
+ SUBV_BROADCAST_LOAD,
+
// Store FP control world into i16 memory.
FNSTCW16m,
@@ -806,9 +815,10 @@ namespace llvm {
/// specifies the type to store as.
FST,
- /// This instruction grabs the address of the next argument
+ /// These instructions grab the address of the next argument
/// from a va_list. (reads and modifies the va_list in memory)
VAARG_64,
+ VAARG_X32,
// Vector truncating store with unsigned/signed saturation
VTRUNCSTOREUS,
@@ -821,6 +831,16 @@ namespace llvm {
MGATHER,
MSCATTER,
+ // Key locker nodes that produce flags.
+ AESENC128KL,
+ AESDEC128KL,
+ AESENC256KL,
+ AESDEC256KL,
+ AESENCWIDE128KL,
+ AESDECWIDE128KL,
+ AESENCWIDE256KL,
+ AESDECWIDE256KL,
+
// WARNING: Do not add anything in the end unless you want the node to
// have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
// opcodes will be thought as target memory ops!
@@ -835,7 +855,7 @@ namespace llvm {
/// Returns true of the given offset can be
/// fit into displacement field of the instruction.
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
- bool hasSymbolicDisplacement = true);
+ bool hasSymbolicDisplacement);
/// Determines whether the callee is required to pop its
/// own arguments. Callee pop is necessary to support tail calls.
@@ -907,14 +927,6 @@ namespace llvm {
///
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
- /// Places new result values for the node in Results (their number
- /// and types must exactly match those of the original return values of
- /// the node), or leaves Results empty, which indicates that the node is not
- /// to be custom lowered after all.
- void LowerOperationWrapper(SDNode *N,
- SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG) const override;
-
/// Replace the results of node with an illegal result
/// type with new values built out of custom code.
///
@@ -1116,7 +1128,8 @@ namespace llvm {
}
/// Handle Lowering flag assembly outputs.
- SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, SDLoc DL,
+ SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag,
+ const SDLoc &DL,
const AsmOperandInfo &Constraint,
SelectionDAG &DAG) const override;
@@ -1349,8 +1362,6 @@ namespace llvm {
Align Alignment,
SelectionDAG &DAG) const;
- bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
-
/// Customize the preferred legalization strategy for certain types.
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
@@ -1397,6 +1408,8 @@ namespace llvm {
SDValue Addr, SelectionDAG &DAG)
const override;
+ Align getPrefLoopAlignment(MachineLoop *ML) const override;
+
protected:
std::pair<const TargetRegisterClass *, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI,
@@ -1488,6 +1501,7 @@ namespace llvm {
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
@@ -1514,9 +1528,6 @@ namespace llvm {
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG,
- RTLIB::Libcall Call) const;
-
SDValue
LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -1572,8 +1583,7 @@ namespace llvm {
// Utility function to emit the low-level va_arg code for X86-64.
MachineBasicBlock *
- EmitVAARG64WithCustomInserter(MachineInstr &MI,
- MachineBasicBlock *MBB) const;
+ EmitVAARGWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const;
/// Utility function to emit the xmm reg save portion of va_start.
MachineBasicBlock *
@@ -1689,7 +1699,7 @@ namespace llvm {
};
/// Generate unpacklo/unpackhi shuffle mask.
- void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
+ void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask, bool Lo,
bool Unary);
/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
index 1628f85da808..85410c54a4d2 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
@@ -28,7 +28,7 @@ using namespace llvm;
#define DEBUG_TYPE "x86-indirect-branch-tracking"
-static cl::opt<bool> IndirectBranchTracking(
+cl::opt<bool> IndirectBranchTracking(
"x86-indirect-branch-tracking", cl::init(false), cl::Hidden,
cl::desc("Enable X86 indirect branch tracking pass."));
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectThunks.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectThunks.cpp
index 828887d96129..3d96d198b409 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectThunks.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectThunks.cpp
@@ -95,7 +95,6 @@ struct LVIThunkInserter : ThunkInserter<LVIThunkInserter> {
BuildMI(&MF.front(), DebugLoc(), TII->get(X86::LFENCE));
BuildMI(&MF.front(), DebugLoc(), TII->get(X86::JMP64r)).addReg(X86::R11);
MF.front().addLiveIn(X86::R11);
- return;
}
};
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InsertPrefetch.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InsertPrefetch.cpp
index 53925bbfd72f..004e6fa5ebf4 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InsertPrefetch.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InsertPrefetch.cpp
@@ -214,10 +214,10 @@ bool X86InsertPrefetch::runOnMachineFunction(MachineFunction &MF) {
MF.CreateMachineInstr(Desc, Current->getDebugLoc(), true);
MachineInstrBuilder MIB(MF, PFetch);
- assert(X86::AddrBaseReg == 0 && X86::AddrScaleAmt == 1 &&
- X86::AddrIndexReg == 2 && X86::AddrDisp == 3 &&
- X86::AddrSegmentReg == 4 &&
- "Unexpected change in X86 operand offset order.");
+ static_assert(X86::AddrBaseReg == 0 && X86::AddrScaleAmt == 1 &&
+ X86::AddrIndexReg == 2 && X86::AddrDisp == 3 &&
+ X86::AddrSegmentReg == 4,
+ "Unexpected change in X86 operand offset order.");
// This assumes X86::AddrBaseReg = 0, {...}ScaleAmt = 1, etc.
// FIXME(mtrofin): consider adding a:
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InsertWait.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InsertWait.cpp
index a82d98d88b30..56d2709f5937 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InsertWait.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InsertWait.cpp
@@ -27,7 +27,6 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Support/Debug.h"
@@ -48,9 +47,6 @@ public:
StringRef getPassName() const override {
return "X86 insert wait instruction";
}
-
-private:
- const TargetInstrInfo *TII; // Machine instruction info.
};
} // namespace
@@ -119,7 +115,7 @@ bool WaitInsert::runOnMachineFunction(MachineFunction &MF) {
return false;
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
- TII = ST.getInstrInfo();
+ const X86InstrInfo *TII = ST.getInstrInfo();
bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
new file mode 100644
index 000000000000..c4150ed52854
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -0,0 +1,2017 @@
+//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// X86 target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetTransformInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86tti"
+
+/// Return a constant boolean vector that has true elements in all positions
+/// where the input constant data vector has an element with the sign bit set.
+static Constant *getNegativeIsTrueBoolVec(Constant *V) {
+ VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
+ V = ConstantExpr::getBitCast(V, IntTy);
+ V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
+ V);
+ return V;
+}
+
+/// Convert the x86 XMM integer vector mask to a vector of bools based on
+/// each element's most significant bit (the sign bit).
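+/// For example (illustrative), a constant mask <4 x i32> <i32 -1, i32 0,
+/// i32 -8, i32 7> corresponds to the bool vector
+/// <4 x i1> <i1 true, i1 false, i1 true, i1 false>.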
+static Value *getBoolVecFromMask(Value *Mask) {
+ // Fold Constant Mask.
+ if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
+ return getNegativeIsTrueBoolVec(ConstantMask);
+
+ // Mask was extended from a boolean vector.
+ Value *ExtMask;
+ if (PatternMatch::match(
+ Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
+ ExtMask->getType()->isIntOrIntVectorTy(1))
+ return ExtMask;
+
+ return nullptr;
+}
+
+// TODO: If the x86 backend knew how to convert a bool vector mask back to an
+// XMM register mask efficiently, we could transform all x86 masked intrinsics
+// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
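+//
+// For example (illustrative IR, value names hypothetical):
+//   %v = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %p, <8 x i32> %m)
+// where %m is a sign-extended <8 x i1> mask can be rewritten as a call to
+// @llvm.masked.load on a bitcast <8 x float>* pointer with align 1, the i1
+// mask, and a zeroinitializer pass-through vector.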
+static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
+ Value *Ptr = II.getOperand(0);
+ Value *Mask = II.getOperand(1);
+ Constant *ZeroVec = Constant::getNullValue(II.getType());
+
+ // Zero Mask - masked load instruction creates a zero vector.
+ if (isa<ConstantAggregateZero>(Mask))
+ return IC.replaceInstUsesWith(II, ZeroVec);
+
+ // The mask is constant or extended from a bool vector. Convert this x86
+ // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
+ if (Value *BoolMask = getBoolVecFromMask(Mask)) {
+ // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
+ // the LLVM intrinsic definition for the pointer argument.
+ unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
+ PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
+ Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
+
+ // The pass-through vector for an x86 masked load is a zero vector.
+ CallInst *NewMaskedLoad =
+ IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec);
+ return IC.replaceInstUsesWith(II, NewMaskedLoad);
+ }
+
+ return nullptr;
+}
+
+// TODO: If the x86 backend knew how to convert a bool vector mask back to an
+// XMM register mask efficiently, we could transform all x86 masked intrinsics
+// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
+static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
+ Value *Ptr = II.getOperand(0);
+ Value *Mask = II.getOperand(1);
+ Value *Vec = II.getOperand(2);
+
+ // Zero Mask - this masked store instruction does nothing.
+ if (isa<ConstantAggregateZero>(Mask)) {
+ IC.eraseInstFromFunction(II);
+ return true;
+ }
+
+ // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
+ // anything else at this level.
+ if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
+ return false;
+
+ // The mask is constant or extended from a bool vector. Convert this x86
+ // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
+ if (Value *BoolMask = getBoolVecFromMask(Mask)) {
+ unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
+ PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
+ Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
+
+ IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
+
+ // 'Replace uses' doesn't work for stores. Erase the original masked store.
+ IC.eraseInstFromFunction(II);
+ return true;
+ }
+
+ return false;
+}
+
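+// Fold x86 vector shift-by-immediate/shift-by-scalar intrinsics to generic IR
+// shifts when the amount is known to be in range. For example (illustrative):
+//   %r = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 3)
+// can become:
+//   %r = ashr <4 x i32> %v, <i32 3, i32 3, i32 3, i32 3>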
+static Value *simplifyX86immShift(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ bool LogicalShift = false;
+ bool ShiftLeft = false;
+ bool IsImm = false;
+
+ switch (II.getIntrinsicID()) {
+ default:
+ llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_avx2_psrai_d:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx512_psrai_q_128:
+ case Intrinsic::x86_avx512_psrai_q_256:
+ case Intrinsic::x86_avx512_psrai_d_512:
+ case Intrinsic::x86_avx512_psrai_q_512:
+ case Intrinsic::x86_avx512_psrai_w_512:
+ IsImm = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_avx2_psra_d:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx512_psra_q_128:
+ case Intrinsic::x86_avx512_psra_q_256:
+ case Intrinsic::x86_avx512_psra_d_512:
+ case Intrinsic::x86_avx512_psra_q_512:
+ case Intrinsic::x86_avx512_psra_w_512:
+ LogicalShift = false;
+ ShiftLeft = false;
+ break;
+ case Intrinsic::x86_sse2_psrli_d:
+ case Intrinsic::x86_sse2_psrli_q:
+ case Intrinsic::x86_sse2_psrli_w:
+ case Intrinsic::x86_avx2_psrli_d:
+ case Intrinsic::x86_avx2_psrli_q:
+ case Intrinsic::x86_avx2_psrli_w:
+ case Intrinsic::x86_avx512_psrli_d_512:
+ case Intrinsic::x86_avx512_psrli_q_512:
+ case Intrinsic::x86_avx512_psrli_w_512:
+ IsImm = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::x86_sse2_psrl_d:
+ case Intrinsic::x86_sse2_psrl_q:
+ case Intrinsic::x86_sse2_psrl_w:
+ case Intrinsic::x86_avx2_psrl_d:
+ case Intrinsic::x86_avx2_psrl_q:
+ case Intrinsic::x86_avx2_psrl_w:
+ case Intrinsic::x86_avx512_psrl_d_512:
+ case Intrinsic::x86_avx512_psrl_q_512:
+ case Intrinsic::x86_avx512_psrl_w_512:
+ LogicalShift = true;
+ ShiftLeft = false;
+ break;
+ case Intrinsic::x86_sse2_pslli_d:
+ case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_sse2_pslli_w:
+ case Intrinsic::x86_avx2_pslli_d:
+ case Intrinsic::x86_avx2_pslli_q:
+ case Intrinsic::x86_avx2_pslli_w:
+ case Intrinsic::x86_avx512_pslli_d_512:
+ case Intrinsic::x86_avx512_pslli_q_512:
+ case Intrinsic::x86_avx512_pslli_w_512:
+ IsImm = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::x86_sse2_psll_d:
+ case Intrinsic::x86_sse2_psll_q:
+ case Intrinsic::x86_sse2_psll_w:
+ case Intrinsic::x86_avx2_psll_d:
+ case Intrinsic::x86_avx2_psll_q:
+ case Intrinsic::x86_avx2_psll_w:
+ case Intrinsic::x86_avx512_psll_d_512:
+ case Intrinsic::x86_avx512_psll_q_512:
+ case Intrinsic::x86_avx512_psll_w_512:
+ LogicalShift = true;
+ ShiftLeft = true;
+ break;
+ }
+ assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
+
+ auto Vec = II.getArgOperand(0);
+ auto Amt = II.getArgOperand(1);
+ auto VT = cast<FixedVectorType>(Vec->getType());
+ auto SVT = VT->getElementType();
+ auto AmtVT = Amt->getType();
+ unsigned VWidth = VT->getNumElements();
+ unsigned BitWidth = SVT->getPrimitiveSizeInBits();
+
+ // If the shift amount is guaranteed to be in-range we can replace it with a
+ // generic shift. If it's guaranteed to be out of range, logical shifts combine
+ // to zero and arithmetic shifts are clamped to (BitWidth - 1).
+ if (IsImm) {
+ assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
+ KnownBits KnownAmtBits =
+ llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
+ if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
+ Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
+ Amt = Builder.CreateVectorSplat(VWidth, Amt);
+ return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
+ : Builder.CreateLShr(Vec, Amt))
+ : Builder.CreateAShr(Vec, Amt));
+ }
+ if (KnownAmtBits.getMinValue().uge(BitWidth)) {
+ if (LogicalShift)
+ return ConstantAggregateZero::get(VT);
+ Amt = ConstantInt::get(SVT, BitWidth - 1);
+ return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
+ }
+ } else {
+ // Ensure the first element has an in-range value and the rest of the
+ // elements in the bottom 64 bits are zero.
+ assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
+ cast<VectorType>(AmtVT)->getElementType() == SVT &&
+ "Unexpected shift-by-scalar type");
+ unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
+ APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
+ APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
+ KnownBits KnownLowerBits = llvm::computeKnownBits(
+ Amt, DemandedLower, II.getModule()->getDataLayout());
+ KnownBits KnownUpperBits = llvm::computeKnownBits(
+ Amt, DemandedUpper, II.getModule()->getDataLayout());
+ if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
+ (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) {
+ SmallVector<int, 16> ZeroSplat(VWidth, 0);
+ Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
+ return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
+ : Builder.CreateLShr(Vec, Amt))
+ : Builder.CreateAShr(Vec, Amt));
+ }
+ }
+
+ // Simplify if count is constant vector.
+ auto CDV = dyn_cast<ConstantDataVector>(Amt);
+ if (!CDV)
+ return nullptr;
+
+ // SSE2/AVX2 shifts use all of the first 64 bits of the 128-bit vector
+ // operand to compute the shift amount.
+ assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
+ cast<VectorType>(AmtVT)->getElementType() == SVT &&
+ "Unexpected shift-by-scalar type");
+
+ // Concatenate the sub-elements to create the 64-bit value.
+ APInt Count(64, 0);
+ for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
+ unsigned SubEltIdx = (NumSubElts - 1) - i;
+ auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
+ Count <<= BitWidth;
+ Count |= SubElt->getValue().zextOrTrunc(64);
+ }
+
+ // If shift-by-zero then just return the original value.
+ if (Count.isNullValue())
+ return Vec;
+
+ // Handle cases when Shift >= BitWidth.
+ if (Count.uge(BitWidth)) {
+ // If LogicalShift - just return zero.
+ if (LogicalShift)
+ return ConstantAggregateZero::get(VT);
+
+ // If ArithmeticShift - clamp Shift to (BitWidth - 1).
+ Count = APInt(64, BitWidth - 1);
+ }
+
+ // Get a constant vector of the same type as the first operand.
+ auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
+ auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
+
+ if (ShiftLeft)
+ return Builder.CreateShl(Vec, ShiftVec);
+
+ if (LogicalShift)
+ return Builder.CreateLShr(Vec, ShiftVec);
+
+ return Builder.CreateAShr(Vec, ShiftVec);
+}
+
+// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
+// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
+// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
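+// For example (illustrative), with all shift amounts constant and in range:
+//   %r = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
+// can become:
+//   %r = lshr <4 x i32> %v, <i32 1, i32 2, i32 3, i32 4>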
+static Value *simplifyX86varShift(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ bool LogicalShift = false;
+ bool ShiftLeft = false;
+
+ switch (II.getIntrinsicID()) {
+ default:
+ llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_avx2_psrav_d:
+ case Intrinsic::x86_avx2_psrav_d_256:
+ case Intrinsic::x86_avx512_psrav_q_128:
+ case Intrinsic::x86_avx512_psrav_q_256:
+ case Intrinsic::x86_avx512_psrav_d_512:
+ case Intrinsic::x86_avx512_psrav_q_512:
+ case Intrinsic::x86_avx512_psrav_w_128:
+ case Intrinsic::x86_avx512_psrav_w_256:
+ case Intrinsic::x86_avx512_psrav_w_512:
+ LogicalShift = false;
+ ShiftLeft = false;
+ break;
+ case Intrinsic::x86_avx2_psrlv_d:
+ case Intrinsic::x86_avx2_psrlv_d_256:
+ case Intrinsic::x86_avx2_psrlv_q:
+ case Intrinsic::x86_avx2_psrlv_q_256:
+ case Intrinsic::x86_avx512_psrlv_d_512:
+ case Intrinsic::x86_avx512_psrlv_q_512:
+ case Intrinsic::x86_avx512_psrlv_w_128:
+ case Intrinsic::x86_avx512_psrlv_w_256:
+ case Intrinsic::x86_avx512_psrlv_w_512:
+ LogicalShift = true;
+ ShiftLeft = false;
+ break;
+ case Intrinsic::x86_avx2_psllv_d:
+ case Intrinsic::x86_avx2_psllv_d_256:
+ case Intrinsic::x86_avx2_psllv_q:
+ case Intrinsic::x86_avx2_psllv_q_256:
+ case Intrinsic::x86_avx512_psllv_d_512:
+ case Intrinsic::x86_avx512_psllv_q_512:
+ case Intrinsic::x86_avx512_psllv_w_128:
+ case Intrinsic::x86_avx512_psllv_w_256:
+ case Intrinsic::x86_avx512_psllv_w_512:
+ LogicalShift = true;
+ ShiftLeft = true;
+ break;
+ }
+ assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
+
+ auto Vec = II.getArgOperand(0);
+ auto Amt = II.getArgOperand(1);
+ auto VT = cast<FixedVectorType>(II.getType());
+ auto SVT = VT->getElementType();
+ int NumElts = VT->getNumElements();
+ int BitWidth = SVT->getIntegerBitWidth();
+
+ // If the shift amount is guaranteed to be in-range we can replace it with a
+ // generic shift.
+ APInt UpperBits =
+ APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth));
+ if (llvm::MaskedValueIsZero(Amt, UpperBits,
+ II.getModule()->getDataLayout())) {
+ return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
+ : Builder.CreateLShr(Vec, Amt))
+ : Builder.CreateAShr(Vec, Amt));
+ }
+
+ // Simplify if all shift amounts are constant/undef.
+ auto *CShift = dyn_cast<Constant>(Amt);
+ if (!CShift)
+ return nullptr;
+
+ // Collect each element's shift amount.
+ // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
+ bool AnyOutOfRange = false;
+ SmallVector<int, 8> ShiftAmts;
+ for (int I = 0; I < NumElts; ++I) {
+ auto *CElt = CShift->getAggregateElement(I);
+ if (isa_and_nonnull<UndefValue>(CElt)) {
+ ShiftAmts.push_back(-1);
+ continue;
+ }
+
+ auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
+ if (!COp)
+ return nullptr;
+
+ // Handle out of range shifts.
+ // If LogicalShift - set to BitWidth (special case).
+ // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
+ APInt ShiftVal = COp->getValue();
+ if (ShiftVal.uge(BitWidth)) {
+ AnyOutOfRange = LogicalShift;
+ ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
+ continue;
+ }
+
+ ShiftAmts.push_back((int)ShiftVal.getZExtValue());
+ }
+
+ // If all elements out of range or UNDEF, return vector of zeros/undefs.
+ // ArithmeticShift should only hit this if they are all UNDEF.
+ auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
+ if (llvm::all_of(ShiftAmts, OutOfRange)) {
+ SmallVector<Constant *, 8> ConstantVec;
+ for (int Idx : ShiftAmts) {
+ if (Idx < 0) {
+ ConstantVec.push_back(UndefValue::get(SVT));
+ } else {
+ assert(LogicalShift && "Logical shift expected");
+ ConstantVec.push_back(ConstantInt::getNullValue(SVT));
+ }
+ }
+ return ConstantVector::get(ConstantVec);
+ }
+
+ // We can't handle only some out of range values with generic logical shifts.
+ if (AnyOutOfRange)
+ return nullptr;
+
+ // Build the shift amount constant vector.
+ SmallVector<Constant *, 8> ShiftVecAmts;
+ for (int Idx : ShiftAmts) {
+ if (Idx < 0)
+ ShiftVecAmts.push_back(UndefValue::get(SVT));
+ else
+ ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
+ }
+ auto ShiftVec = ConstantVector::get(ShiftVecAmts);
+
+ if (ShiftLeft)
+ return Builder.CreateShl(Vec, ShiftVec);
+
+ if (LogicalShift)
+ return Builder.CreateLShr(Vec, ShiftVec);
+
+ return Builder.CreateAShr(Vec, ShiftVec);
+}
+
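+// Fold the x86 pack intrinsics on constant inputs to clamp + shuffle + trunc.
+// For example (illustrative), a 128-bit packssdw of
+//   <4 x i32> <i32 100000, i32 -100000, i32 5, i32 -5> and zeroinitializer
+// yields <8 x i16> <i16 32767, i16 -32768, i16 5, i16 -5, i16 0, i16 0, i16 0, i16 0>.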
+static Value *simplifyX86pack(IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder, bool IsSigned) {
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+ Type *ResTy = II.getType();
+
+ // Fast all undef handling.
+ if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
+ return UndefValue::get(ResTy);
+
+ auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
+ unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
+ unsigned NumSrcElts = ArgTy->getNumElements();
+ assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
+ "Unexpected packing types");
+
+ unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
+ unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
+ unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
+ assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
+ "Unexpected packing types");
+
+ // Constant folding.
+ if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
+ return nullptr;
+
+ // Clamp Values - signed/unsigned both use signed clamp values, but they
+ // differ on the min/max values.
+ APInt MinValue, MaxValue;
+ if (IsSigned) {
+ // PACKSS: Truncate signed value with signed saturation.
+ // Source values less than dst minint are saturated to minint.
+ // Source values greater than dst maxint are saturated to maxint.
+ MinValue =
+ APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
+ MaxValue =
+ APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
+ } else {
+ // PACKUS: Truncate signed value with unsigned saturation.
+ // Source values less than zero are saturated to zero.
+ // Source values greater than dst maxuint are saturated to maxuint.
+ MinValue = APInt::getNullValue(SrcScalarSizeInBits);
+ MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
+ }
+
+ auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
+ auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
+ Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
+ Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
+ Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
+ Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
+
+ // Shuffle clamped args together at the lane level.
+ SmallVector<int, 32> PackMask;
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
+ PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
+ for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
+ PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
+ }
+ auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
+
+ // Truncate to dst size.
+ return Builder.CreateTrunc(Shuffle, ResTy);
+}
+
+static Value *simplifyX86movmsk(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Arg = II.getArgOperand(0);
+ Type *ResTy = II.getType();
+
+ // movmsk(undef) -> zero as we must ensure the upper bits are zero.
+ if (isa<UndefValue>(Arg))
+ return Constant::getNullValue(ResTy);
+
+ auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
+ // We can't easily peek through x86_mmx types.
+ if (!ArgTy)
+ return nullptr;
+
+ // Expand MOVMSK to compare/bitcast/zext:
+ // e.g. PMOVMSKB(v16i8 x):
+ // %cmp = icmp slt <16 x i8> %x, zeroinitializer
+ // %int = bitcast <16 x i1> %cmp to i16
+ // %res = zext i16 %int to i32
+ unsigned NumElts = ArgTy->getNumElements();
+ Type *IntegerVecTy = VectorType::getInteger(ArgTy);
+ Type *IntegerTy = Builder.getIntNTy(NumElts);
+
+ Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy);
+ Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy));
+ Res = Builder.CreateBitCast(Res, IntegerTy);
+ Res = Builder.CreateZExtOrTrunc(Res, ResTy);
+ return Res;
+}
+
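+// If the carry-in of an x86 addcarry intrinsic is known zero, it is just an
+// unsigned add-with-overflow. For example (illustrative, value names
+// hypothetical), llvm.x86.addcarry.32(i8 0, i32 %a, i32 %b) can be rewritten
+// using llvm.uadd.with.overflow.i32(%a, %b), placing the zero-extended
+// overflow bit in field 0 and the sum in field 1 of the result struct.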
+static Value *simplifyX86addcarry(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ Value *CarryIn = II.getArgOperand(0);
+ Value *Op1 = II.getArgOperand(1);
+ Value *Op2 = II.getArgOperand(2);
+ Type *RetTy = II.getType();
+ Type *OpTy = Op1->getType();
+ assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
+ RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
+ "Unexpected types for x86 addcarry");
+
+ // If carry-in is zero, this is just an unsigned add with overflow.
+ if (match(CarryIn, PatternMatch::m_ZeroInt())) {
+ Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
+ {Op1, Op2});
+ // The types have to be adjusted to match the x86 call types.
+ Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
+ Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
+ Builder.getInt8Ty());
+ Value *Res = UndefValue::get(RetTy);
+ Res = Builder.CreateInsertValue(Res, UAddOV, 0);
+ return Builder.CreateInsertValue(Res, UAddResult, 1);
+ }
+
+ return nullptr;
+}
+
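+// For example (illustrative), an insertps immediate of 0x60 (source lane 1,
+// destination lane 2, empty zero mask) corresponds to
+//   shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>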
+static Value *simplifyX86insertps(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
+ if (!CInt)
+ return nullptr;
+
+ auto *VecTy = cast<FixedVectorType>(II.getType());
+ assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
+
+ // The immediate permute control byte looks like this:
+ // [3:0] - zero mask for each 32-bit lane
+ // [5:4] - select one 32-bit destination lane
+ // [7:6] - select one 32-bit source lane
+
+ uint8_t Imm = CInt->getZExtValue();
+ uint8_t ZMask = Imm & 0xf;
+ uint8_t DestLane = (Imm >> 4) & 0x3;
+ uint8_t SourceLane = (Imm >> 6) & 0x3;
+
+ ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
+
+ // If all zero mask bits are set, this was just a weird way to
+ // generate a zero vector.
+ if (ZMask == 0xf)
+ return ZeroVector;
+
+ // Initialize by passing all of the first source bits through.
+ int ShuffleMask[4] = {0, 1, 2, 3};
+
+ // We may replace the second operand with the zero vector.
+ Value *V1 = II.getArgOperand(1);
+
+ if (ZMask) {
+ // If the zero mask is being used with a single input or the zero mask
+ // overrides the destination lane, this is a shuffle with the zero vector.
+ if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
+ (ZMask & (1 << DestLane))) {
+ V1 = ZeroVector;
+ // We may still move 32-bits of the first source vector from one lane
+ // to another.
+ ShuffleMask[DestLane] = SourceLane;
+ // The zero mask may override the previous insert operation.
+ for (unsigned i = 0; i < 4; ++i)
+ if ((ZMask >> i) & 0x1)
+ ShuffleMask[i] = i + 4;
+ } else {
+ // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
+ return nullptr;
+ }
+ } else {
+ // Replace the selected destination lane with the selected source lane.
+ ShuffleMask[DestLane] = SourceLane + 4;
+ }
+
+ return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
+}
+
+/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
+/// or conversion to a shuffle vector.
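+/// For example (illustrative), extrqi with Length=16 and Index=16 copies bits
+/// [31:16] of the low 64 bits of the source into bits [15:0], zeroes bits
+/// [63:16], and leaves the upper 64 bits undefined.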
+static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
+ ConstantInt *CILength, ConstantInt *CIIndex,
+ InstCombiner::BuilderTy &Builder) {
+ auto LowConstantHighUndef = [&](uint64_t Val) {
+ Type *IntTy64 = Type::getInt64Ty(II.getContext());
+ Constant *Args[] = {ConstantInt::get(IntTy64, Val),
+ UndefValue::get(IntTy64)};
+ return ConstantVector::get(Args);
+ };
+
+ // See if we're dealing with constant values.
+ Constant *C0 = dyn_cast<Constant>(Op0);
+ ConstantInt *CI0 =
+ C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
+ : nullptr;
+
+ // Attempt to constant fold.
+ if (CILength && CIIndex) {
+ // From AMD documentation: "The bit index and field length are each six
+ // bits in length other bits of the field are ignored."
+ APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
+ APInt APLength = CILength->getValue().zextOrTrunc(6);
+
+ unsigned Index = APIndex.getZExtValue();
+
+ // From AMD documentation: "a value of zero in the field length is
+ // defined as length of 64".
+ unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
+
+ // From AMD documentation: "If the sum of the bit index + length field
+ // is greater than 64, the results are undefined".
+ unsigned End = Index + Length;
+
+ // Note that both field index and field length are 8-bit quantities.
+ // Since variables 'Index' and 'Length' are unsigned values
+ // obtained from zero-extending field index and field length
+ // respectively, their sum should never wrap around.
+ if (End > 64)
+ return UndefValue::get(II.getType());
+
+ // If we are inserting whole bytes, we can convert this to a shuffle.
+ // Lowering can recognize EXTRQI shuffle masks.
+ if ((Length % 8) == 0 && (Index % 8) == 0) {
+ // Convert bit indices to byte indices.
+ Length /= 8;
+ Index /= 8;
+
+ Type *IntTy8 = Type::getInt8Ty(II.getContext());
+ auto *ShufTy = FixedVectorType::get(IntTy8, 16);
+
+ SmallVector<int, 16> ShuffleMask;
+ for (int i = 0; i != (int)Length; ++i)
+ ShuffleMask.push_back(i + Index);
+ for (int i = Length; i != 8; ++i)
+ ShuffleMask.push_back(i + 16);
+ for (int i = 8; i != 16; ++i)
+ ShuffleMask.push_back(-1);
+
+ Value *SV = Builder.CreateShuffleVector(
+ Builder.CreateBitCast(Op0, ShufTy),
+ ConstantAggregateZero::get(ShufTy), ShuffleMask);
+ return Builder.CreateBitCast(SV, II.getType());
+ }
+
+ // Constant Fold - shift Index'th bit to lowest position and mask off
+ // Length bits.
+ if (CI0) {
+ APInt Elt = CI0->getValue();
+ Elt.lshrInPlace(Index);
+ Elt = Elt.zextOrTrunc(Length);
+ return LowConstantHighUndef(Elt.getZExtValue());
+ }
+
+ // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
+ if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
+ Value *Args[] = {Op0, CILength, CIIndex};
+ Module *M = II.getModule();
+ Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
+ return Builder.CreateCall(F, Args);
+ }
+ }
+
+ // Constant Fold - extraction from zero is always {zero, undef}.
+ if (CI0 && CI0->isZero())
+ return LowConstantHighUndef(0);
+
+ return nullptr;
+}
+
+/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
+/// folding or conversion to a shuffle vector.
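+/// For example (illustrative), insertqi with Length=8 and Index=16 replaces
+/// bits [23:16] of the low 64 bits of the first operand with bits [7:0] of
+/// the second operand; the upper 64 bits of the result are undefined.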
+static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
+ APInt APLength, APInt APIndex,
+ InstCombiner::BuilderTy &Builder) {
+ // From AMD documentation: "The bit index and field length are each six bits
+ // in length other bits of the field are ignored."
+ APIndex = APIndex.zextOrTrunc(6);
+ APLength = APLength.zextOrTrunc(6);
+
+ // Attempt to constant fold.
+ unsigned Index = APIndex.getZExtValue();
+
+ // From AMD documentation: "a value of zero in the field length is
+ // defined as length of 64".
+ unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
+
+ // From AMD documentation: "If the sum of the bit index + length field
+ // is greater than 64, the results are undefined".
+ unsigned End = Index + Length;
+
+ // Note that both field index and field length are 8-bit quantities.
+ // Since variables 'Index' and 'Length' are unsigned values
+ // obtained from zero-extending field index and field length
+ // respectively, their sum should never wrap around.
+ if (End > 64)
+ return UndefValue::get(II.getType());
+
+ // If we are inserting whole bytes, we can convert this to a shuffle.
+ // Lowering can recognize INSERTQI shuffle masks.
+ if ((Length % 8) == 0 && (Index % 8) == 0) {
+ // Convert bit indices to byte indices.
+ Length /= 8;
+ Index /= 8;
+
+ Type *IntTy8 = Type::getInt8Ty(II.getContext());
+ auto *ShufTy = FixedVectorType::get(IntTy8, 16);
+
+ SmallVector<int, 16> ShuffleMask;
+ for (int i = 0; i != (int)Index; ++i)
+ ShuffleMask.push_back(i);
+ for (int i = 0; i != (int)Length; ++i)
+ ShuffleMask.push_back(i + 16);
+ for (int i = Index + Length; i != 8; ++i)
+ ShuffleMask.push_back(i);
+ for (int i = 8; i != 16; ++i)
+ ShuffleMask.push_back(-1);
+
+ Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
+ Builder.CreateBitCast(Op1, ShufTy),
+ ShuffleMask);
+ return Builder.CreateBitCast(SV, II.getType());
+ }
+
+ // See if we're dealing with constant values.
+ Constant *C0 = dyn_cast<Constant>(Op0);
+ Constant *C1 = dyn_cast<Constant>(Op1);
+ ConstantInt *CI00 =
+ C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
+ : nullptr;
+ ConstantInt *CI10 =
+ C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
+ : nullptr;
+
+ // Constant Fold - insert bottom Length bits starting at the Index'th bit.
+ if (CI00 && CI10) {
+ APInt V00 = CI00->getValue();
+ APInt V10 = CI10->getValue();
+ APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
+ V00 = V00 & ~Mask;
+ V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
+ APInt Val = V00 | V10;
+ Type *IntTy64 = Type::getInt64Ty(II.getContext());
+ Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
+ UndefValue::get(IntTy64)};
+ return ConstantVector::get(Args);
+ }
+
+ // If we were an INSERTQ call, we'll save demanded elements if we convert to
+ // INSERTQI.
+ if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
+ Type *IntTy8 = Type::getInt8Ty(II.getContext());
+ Constant *CILength = ConstantInt::get(IntTy8, Length, false);
+ Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
+
+ Value *Args[] = {Op0, Op1, CILength, CIIndex};
+ Module *M = II.getModule();
+ Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
+ return Builder.CreateCall(F, Args);
+ }
+
+ return nullptr;
+}
+
+/// Attempt to convert pshufb* to shufflevector if the mask is constant.
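+/// For example (illustrative), a control byte of 0x83 (bit 7 set) selects a
+/// zero result byte, while 0x05 selects byte 5 from within the byte's own
+/// 16-byte lane.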
+static Value *simplifyX86pshufb(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!V)
+ return nullptr;
+
+ auto *VecTy = cast<FixedVectorType>(II.getType());
+ unsigned NumElts = VecTy->getNumElements();
+ assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
+ "Unexpected number of elements in shuffle mask!");
+
+ // Construct a shuffle mask from constant integers or UNDEFs.
+ int Indexes[64];
+
+ // Each byte in the shuffle control mask forms an index to permute the
+ // corresponding byte in the destination operand.
+ for (unsigned I = 0; I < NumElts; ++I) {
+ Constant *COp = V->getAggregateElement(I);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+ return nullptr;
+
+ if (isa<UndefValue>(COp)) {
+ Indexes[I] = -1;
+ continue;
+ }
+
+ int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
+
+ // If the most significant bit (bit[7]) of each byte of the shuffle
+ // control mask is set, then zero is written in the result byte.
+ // The zero vector is in the right-hand side of the resulting
+ // shufflevector.
+
+ // The value of each index for the high 128-bit lane is the least
+ // significant 4 bits of the respective shuffle control byte.
+ Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
+ Indexes[I] = Index;
+ }
+
+ auto V1 = II.getArgOperand(0);
+ auto V2 = Constant::getNullValue(VecTy);
+ return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
+}
+
+/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
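+/// For example (illustrative), vpermilvar.ps with a constant control of
+/// <4 x i32> <i32 3, i32 2, i32 1, i32 0> is equivalent to a shufflevector
+/// with mask <3, 2, 1, 0>.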
+static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!V)
+ return nullptr;
+
+ auto *VecTy = cast<FixedVectorType>(II.getType());
+ unsigned NumElts = VecTy->getNumElements();
+ bool IsPD = VecTy->getScalarType()->isDoubleTy();
+ unsigned NumLaneElts = IsPD ? 2 : 4;
+ assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
+
+ // Construct a shuffle mask from constant integers or UNDEFs.
+ int Indexes[16];
+
+ // The intrinsics only read one or two bits, clear the rest.
+ for (unsigned I = 0; I < NumElts; ++I) {
+ Constant *COp = V->getAggregateElement(I);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+ return nullptr;
+
+ if (isa<UndefValue>(COp)) {
+ Indexes[I] = -1;
+ continue;
+ }
+
+ APInt Index = cast<ConstantInt>(COp)->getValue();
+ Index = Index.zextOrTrunc(32).getLoBits(2);
+
+ // The PD variants use bit 1 to select the per-lane element index, so
+ // shift down to convert to a generic shuffle mask index.
+ if (IsPD)
+ Index.lshrInPlace(1);
+
+ // The _256 variants are a bit trickier since the mask bits always index
+ // into the corresponding 128-bit half. In order to convert to a generic
+ // shuffle, we have to make that explicit.
+ Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
+
+ Indexes[I] = Index.getZExtValue();
+ }
+
+ auto V1 = II.getArgOperand(0);
+ return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, NumElts));
+}
+
+/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
+static Value *simplifyX86vpermv(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ auto *V = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!V)
+ return nullptr;
+
+ auto *VecTy = cast<FixedVectorType>(II.getType());
+ unsigned Size = VecTy->getNumElements();
+ assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
+ "Unexpected shuffle mask size");
+
+ // Construct a shuffle mask from constant integers or UNDEFs.
+ int Indexes[64];
+
+ for (unsigned I = 0; I < Size; ++I) {
+ Constant *COp = V->getAggregateElement(I);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+ return nullptr;
+
+ if (isa<UndefValue>(COp)) {
+ Indexes[I] = -1;
+ continue;
+ }
+
+ uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
+ Index &= Size - 1;
+ Indexes[I] = Index;
+ }
+
+ auto V1 = II.getArgOperand(0);
+ return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, Size));
+}
+
+Optional<Instruction *>
+X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
+ auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
+ unsigned DemandedWidth) {
+ APInt UndefElts(Width, 0);
+ APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
+ return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
+ };
+
+ Intrinsic::ID IID = II.getIntrinsicID();
+ switch (IID) {
+ case Intrinsic::x86_bmi_bextr_32:
+ case Intrinsic::x86_bmi_bextr_64:
+ case Intrinsic::x86_tbm_bextri_u32:
+ case Intrinsic::x86_tbm_bextri_u64:
+ // If the RHS is a constant we can try some simplifications.
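+ // For example (illustrative), a control of 0x0804 means Length=8 and
+ // Shift=4, so bextr(0x12345678, 0x0804) constant-folds to
+ // (0x12345678 >> 4) & 0xff = 0x67.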
+ if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
+ uint64_t Shift = C->getZExtValue();
+ uint64_t Length = (Shift >> 8) & 0xff;
+ Shift &= 0xff;
+ unsigned BitWidth = II.getType()->getIntegerBitWidth();
+ // If the length is 0 or the shift is out of range, replace with zero.
+ if (Length == 0 || Shift >= BitWidth) {
+ return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
+ }
+ // If the LHS is also a constant, we can completely constant fold this.
+ if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
+ uint64_t Result = InC->getZExtValue() >> Shift;
+ if (Length > BitWidth)
+ Length = BitWidth;
+ Result &= maskTrailingOnes<uint64_t>(Length);
+ return IC.replaceInstUsesWith(II,
+ ConstantInt::get(II.getType(), Result));
+ }
+ // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
+ // are only masking bits that a shift already cleared?
+ }
+ break;
+
+ case Intrinsic::x86_bmi_bzhi_32:
+ case Intrinsic::x86_bmi_bzhi_64:
+ // If the RHS is a constant we can try some simplifications.
+ if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
+ uint64_t Index = C->getZExtValue() & 0xff;
+ unsigned BitWidth = II.getType()->getIntegerBitWidth();
+ if (Index >= BitWidth) {
+ return IC.replaceInstUsesWith(II, II.getArgOperand(0));
+ }
+ if (Index == 0) {
+ return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
+ }
+ // If the LHS is also a constant, we can completely constant fold this.
+ if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
+ uint64_t Result = InC->getZExtValue();
+ Result &= maskTrailingOnes<uint64_t>(Index);
+ return IC.replaceInstUsesWith(II,
+ ConstantInt::get(II.getType(), Result));
+ }
+ // TODO should we convert this to an AND if the RHS is constant?
+ }
+ break;
+ case Intrinsic::x86_bmi_pext_32:
+ case Intrinsic::x86_bmi_pext_64:
+ if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
+ if (MaskC->isNullValue()) {
+ return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
+ }
+ if (MaskC->isAllOnesValue()) {
+ return IC.replaceInstUsesWith(II, II.getArgOperand(0));
+ }
+
+ if (MaskC->getValue().isShiftedMask()) {
+ // Any single contiguous sequence of 1s anywhere in the mask simply
+ // describes a subset of the input bits shifted to the appropriate
+ // position. Replace with the straightforward IR.
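+ // For example (illustrative), pext(x, 0x0ff0) becomes (x & 0x0ff0) >> 4.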
+ unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
+ Value *Input = II.getArgOperand(0);
+ Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
+ Value *Shifted = IC.Builder.CreateLShr(Masked,
+ ConstantInt::get(II.getType(),
+ ShiftAmount));
+ return IC.replaceInstUsesWith(II, Shifted);
+ }
+
+ if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
+ uint64_t Src = SrcC->getZExtValue();
+ uint64_t Mask = MaskC->getZExtValue();
+ uint64_t Result = 0;
+ uint64_t BitToSet = 1;
+
+ while (Mask) {
+ // Isolate lowest set bit.
+ uint64_t BitToTest = Mask & -Mask;
+ if (BitToTest & Src)
+ Result |= BitToSet;
+
+ BitToSet <<= 1;
+ // Clear lowest set bit.
+ Mask &= Mask - 1;
+ }
+
+ return IC.replaceInstUsesWith(II,
+ ConstantInt::get(II.getType(), Result));
+ }
+ }
+ break;
+ case Intrinsic::x86_bmi_pdep_32:
+ case Intrinsic::x86_bmi_pdep_64:
+ if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
+ if (MaskC->isNullValue()) {
+ return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
+ }
+ if (MaskC->isAllOnesValue()) {
+ return IC.replaceInstUsesWith(II, II.getArgOperand(0));
+ }
+ if (MaskC->getValue().isShiftedMask()) {
+ // Any single contiguous sequence of 1s anywhere in the mask simply
+ // describes a subset of the input bits shifted to the appropriate
+ // position. Replace with the straightforward IR.
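+ // For example (illustrative), pdep(x, 0x0ff0) becomes (x << 4) & 0x0ff0.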
+ unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
+ Value *Input = II.getArgOperand(0);
+ Value *Shifted = IC.Builder.CreateShl(Input,
+ ConstantInt::get(II.getType(),
+ ShiftAmount));
+ Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
+ return IC.replaceInstUsesWith(II, Masked);
+ }
+
+ if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
+ uint64_t Src = SrcC->getZExtValue();
+ uint64_t Mask = MaskC->getZExtValue();
+ uint64_t Result = 0;
+ uint64_t BitToTest = 1;
+
+ while (Mask) {
+ // Isolate lowest set bit.
+ uint64_t BitToSet = Mask & -Mask;
+ if (BitToTest & Src)
+ Result |= BitToSet;
+
+ BitToTest <<= 1;
+ // Clear lowest set bit.
+ Mask &= Mask - 1;
+ }
+
+ return IC.replaceInstUsesWith(II,
+ ConstantInt::get(II.getType(), Result));
+ }
+ }
+ break;
+
+ case Intrinsic::x86_sse_cvtss2si:
+ case Intrinsic::x86_sse_cvtss2si64:
+ case Intrinsic::x86_sse_cvttss2si:
+ case Intrinsic::x86_sse_cvttss2si64:
+ case Intrinsic::x86_sse2_cvtsd2si:
+ case Intrinsic::x86_sse2_cvtsd2si64:
+ case Intrinsic::x86_sse2_cvttsd2si:
+ case Intrinsic::x86_sse2_cvttsd2si64:
+ case Intrinsic::x86_avx512_vcvtss2si32:
+ case Intrinsic::x86_avx512_vcvtss2si64:
+ case Intrinsic::x86_avx512_vcvtss2usi32:
+ case Intrinsic::x86_avx512_vcvtss2usi64:
+ case Intrinsic::x86_avx512_vcvtsd2si32:
+ case Intrinsic::x86_avx512_vcvtsd2si64:
+ case Intrinsic::x86_avx512_vcvtsd2usi32:
+ case Intrinsic::x86_avx512_vcvtsd2usi64:
+ case Intrinsic::x86_avx512_cvttss2si:
+ case Intrinsic::x86_avx512_cvttss2si64:
+ case Intrinsic::x86_avx512_cvttss2usi:
+ case Intrinsic::x86_avx512_cvttss2usi64:
+ case Intrinsic::x86_avx512_cvttsd2si:
+ case Intrinsic::x86_avx512_cvttsd2si64:
+ case Intrinsic::x86_avx512_cvttsd2usi:
+ case Intrinsic::x86_avx512_cvttsd2usi64: {
+ // These intrinsics only demand the 0th element of their input vectors. If
+ // we can simplify the input based on that, do so now.
+ Value *Arg = II.getArgOperand(0);
+ unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
+ return IC.replaceOperand(II, 0, V);
+ }
+ break;
+ }
+
+ case Intrinsic::x86_mmx_pmovmskb:
+ case Intrinsic::x86_sse_movmsk_ps:
+ case Intrinsic::x86_sse2_movmsk_pd:
+ case Intrinsic::x86_sse2_pmovmskb_128:
+ case Intrinsic::x86_avx_movmsk_pd_256:
+ case Intrinsic::x86_avx_movmsk_ps_256:
+ case Intrinsic::x86_avx2_pmovmskb:
+ if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_sse_comieq_ss:
+ case Intrinsic::x86_sse_comige_ss:
+ case Intrinsic::x86_sse_comigt_ss:
+ case Intrinsic::x86_sse_comile_ss:
+ case Intrinsic::x86_sse_comilt_ss:
+ case Intrinsic::x86_sse_comineq_ss:
+ case Intrinsic::x86_sse_ucomieq_ss:
+ case Intrinsic::x86_sse_ucomige_ss:
+ case Intrinsic::x86_sse_ucomigt_ss:
+ case Intrinsic::x86_sse_ucomile_ss:
+ case Intrinsic::x86_sse_ucomilt_ss:
+ case Intrinsic::x86_sse_ucomineq_ss:
+ case Intrinsic::x86_sse2_comieq_sd:
+ case Intrinsic::x86_sse2_comige_sd:
+ case Intrinsic::x86_sse2_comigt_sd:
+ case Intrinsic::x86_sse2_comile_sd:
+ case Intrinsic::x86_sse2_comilt_sd:
+ case Intrinsic::x86_sse2_comineq_sd:
+ case Intrinsic::x86_sse2_ucomieq_sd:
+ case Intrinsic::x86_sse2_ucomige_sd:
+ case Intrinsic::x86_sse2_ucomigt_sd:
+ case Intrinsic::x86_sse2_ucomile_sd:
+ case Intrinsic::x86_sse2_ucomilt_sd:
+ case Intrinsic::x86_sse2_ucomineq_sd:
+ case Intrinsic::x86_avx512_vcomi_ss:
+ case Intrinsic::x86_avx512_vcomi_sd:
+ case Intrinsic::x86_avx512_mask_cmp_ss:
+ case Intrinsic::x86_avx512_mask_cmp_sd: {
+ // These intrinsics only demand the 0th element of their input vectors. If
+ // we can simplify the input based on that, do so now.
+ bool MadeChange = false;
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+ unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
+ IC.replaceOperand(II, 0, V);
+ MadeChange = true;
+ }
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
+ IC.replaceOperand(II, 1, V);
+ MadeChange = true;
+ }
+ if (MadeChange) {
+ return &II;
+ }
+ break;
+ }
+
+ case Intrinsic::x86_avx512_add_ps_512:
+ case Intrinsic::x86_avx512_div_ps_512:
+ case Intrinsic::x86_avx512_mul_ps_512:
+ case Intrinsic::x86_avx512_sub_ps_512:
+ case Intrinsic::x86_avx512_add_pd_512:
+ case Intrinsic::x86_avx512_div_pd_512:
+ case Intrinsic::x86_avx512_mul_pd_512:
+ case Intrinsic::x86_avx512_sub_pd_512:
+ // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
+ // IR operations.
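+ // For example (illustrative), llvm.x86.avx512.add.ps.512(%a, %b, i32 4)
+ // can be rewritten as fadd <16 x float> %a, %b.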
+ if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
+ if (R->getValue() == 4) {
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+
+ Value *V;
+ switch (IID) {
+ default:
+ llvm_unreachable("Case stmts out of sync!");
+ case Intrinsic::x86_avx512_add_ps_512:
+ case Intrinsic::x86_avx512_add_pd_512:
+ V = IC.Builder.CreateFAdd(Arg0, Arg1);
+ break;
+ case Intrinsic::x86_avx512_sub_ps_512:
+ case Intrinsic::x86_avx512_sub_pd_512:
+ V = IC.Builder.CreateFSub(Arg0, Arg1);
+ break;
+ case Intrinsic::x86_avx512_mul_ps_512:
+ case Intrinsic::x86_avx512_mul_pd_512:
+ V = IC.Builder.CreateFMul(Arg0, Arg1);
+ break;
+ case Intrinsic::x86_avx512_div_ps_512:
+ case Intrinsic::x86_avx512_div_pd_512:
+ V = IC.Builder.CreateFDiv(Arg0, Arg1);
+ break;
+ }
+
+ return IC.replaceInstUsesWith(II, V);
+ }
+ }
+ break;
+
+ case Intrinsic::x86_avx512_mask_add_ss_round:
+ case Intrinsic::x86_avx512_mask_div_ss_round:
+ case Intrinsic::x86_avx512_mask_mul_ss_round:
+ case Intrinsic::x86_avx512_mask_sub_ss_round:
+ case Intrinsic::x86_avx512_mask_add_sd_round:
+ case Intrinsic::x86_avx512_mask_div_sd_round:
+ case Intrinsic::x86_avx512_mask_mul_sd_round:
+ case Intrinsic::x86_avx512_mask_sub_sd_round:
+ // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
+ // IR operations.
+ if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
+ if (R->getValue() == 4) {
+ // Extract the element as scalars.
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+ Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
+ Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
+
+ Value *V;
+ switch (IID) {
+ default:
+ llvm_unreachable("Case stmts out of sync!");
+ case Intrinsic::x86_avx512_mask_add_ss_round:
+ case Intrinsic::x86_avx512_mask_add_sd_round:
+ V = IC.Builder.CreateFAdd(LHS, RHS);
+ break;
+ case Intrinsic::x86_avx512_mask_sub_ss_round:
+ case Intrinsic::x86_avx512_mask_sub_sd_round:
+ V = IC.Builder.CreateFSub(LHS, RHS);
+ break;
+ case Intrinsic::x86_avx512_mask_mul_ss_round:
+ case Intrinsic::x86_avx512_mask_mul_sd_round:
+ V = IC.Builder.CreateFMul(LHS, RHS);
+ break;
+ case Intrinsic::x86_avx512_mask_div_ss_round:
+ case Intrinsic::x86_avx512_mask_div_sd_round:
+ V = IC.Builder.CreateFDiv(LHS, RHS);
+ break;
+ }
+
+ // Handle the masking aspect of the intrinsic.
+ Value *Mask = II.getArgOperand(3);
+ auto *C = dyn_cast<ConstantInt>(Mask);
+ // We don't need a select if we know the mask bit is a 1.
+ if (!C || !C->getValue()[0]) {
+ // Cast the mask to an i1 vector and then extract the lowest element.
+ auto *MaskTy = FixedVectorType::get(
+ IC.Builder.getInt1Ty(),
+ cast<IntegerType>(Mask->getType())->getBitWidth());
+ Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
+ Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
+ // Extract the lowest element from the passthru operand.
+ Value *Passthru =
+ IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
+ V = IC.Builder.CreateSelect(Mask, V, Passthru);
+ }
+
+ // Insert the result back into the original argument 0.
+ V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
+
+ return IC.replaceInstUsesWith(II, V);
+ }
+ }
+ break;
+
+ // Constant fold ashr( <A x Bi>, Ci ).
+ // Constant fold lshr( <A x Bi>, Ci ).
+ // Constant fold shl( <A x Bi>, Ci ).
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_avx2_psrai_d:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx512_psrai_q_128:
+ case Intrinsic::x86_avx512_psrai_q_256:
+ case Intrinsic::x86_avx512_psrai_d_512:
+ case Intrinsic::x86_avx512_psrai_q_512:
+ case Intrinsic::x86_avx512_psrai_w_512:
+ case Intrinsic::x86_sse2_psrli_d:
+ case Intrinsic::x86_sse2_psrli_q:
+ case Intrinsic::x86_sse2_psrli_w:
+ case Intrinsic::x86_avx2_psrli_d:
+ case Intrinsic::x86_avx2_psrli_q:
+ case Intrinsic::x86_avx2_psrli_w:
+ case Intrinsic::x86_avx512_psrli_d_512:
+ case Intrinsic::x86_avx512_psrli_q_512:
+ case Intrinsic::x86_avx512_psrli_w_512:
+ case Intrinsic::x86_sse2_pslli_d:
+ case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_sse2_pslli_w:
+ case Intrinsic::x86_avx2_pslli_d:
+ case Intrinsic::x86_avx2_pslli_q:
+ case Intrinsic::x86_avx2_pslli_w:
+ case Intrinsic::x86_avx512_pslli_d_512:
+ case Intrinsic::x86_avx512_pslli_q_512:
+ case Intrinsic::x86_avx512_pslli_w_512:
+ if (Value *V = simplifyX86immShift(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_avx2_psra_d:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx512_psra_q_128:
+ case Intrinsic::x86_avx512_psra_q_256:
+ case Intrinsic::x86_avx512_psra_d_512:
+ case Intrinsic::x86_avx512_psra_q_512:
+ case Intrinsic::x86_avx512_psra_w_512:
+ case Intrinsic::x86_sse2_psrl_d:
+ case Intrinsic::x86_sse2_psrl_q:
+ case Intrinsic::x86_sse2_psrl_w:
+ case Intrinsic::x86_avx2_psrl_d:
+ case Intrinsic::x86_avx2_psrl_q:
+ case Intrinsic::x86_avx2_psrl_w:
+ case Intrinsic::x86_avx512_psrl_d_512:
+ case Intrinsic::x86_avx512_psrl_q_512:
+ case Intrinsic::x86_avx512_psrl_w_512:
+ case Intrinsic::x86_sse2_psll_d:
+ case Intrinsic::x86_sse2_psll_q:
+ case Intrinsic::x86_sse2_psll_w:
+ case Intrinsic::x86_avx2_psll_d:
+ case Intrinsic::x86_avx2_psll_q:
+ case Intrinsic::x86_avx2_psll_w:
+ case Intrinsic::x86_avx512_psll_d_512:
+ case Intrinsic::x86_avx512_psll_q_512:
+ case Intrinsic::x86_avx512_psll_w_512: {
+ if (Value *V = simplifyX86immShift(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+
+ // SSE2/AVX2 shifts use only the first 64 bits of the 128-bit vector
+ // operand to compute the shift amount.
+ Value *Arg1 = II.getArgOperand(1);
+ assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
+ "Unexpected packed shift size");
+ unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
+
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
+ return IC.replaceOperand(II, 1, V);
+ }
+ break;
+ }
+
+ case Intrinsic::x86_avx2_psllv_d:
+ case Intrinsic::x86_avx2_psllv_d_256:
+ case Intrinsic::x86_avx2_psllv_q:
+ case Intrinsic::x86_avx2_psllv_q_256:
+ case Intrinsic::x86_avx512_psllv_d_512:
+ case Intrinsic::x86_avx512_psllv_q_512:
+ case Intrinsic::x86_avx512_psllv_w_128:
+ case Intrinsic::x86_avx512_psllv_w_256:
+ case Intrinsic::x86_avx512_psllv_w_512:
+ case Intrinsic::x86_avx2_psrav_d:
+ case Intrinsic::x86_avx2_psrav_d_256:
+ case Intrinsic::x86_avx512_psrav_q_128:
+ case Intrinsic::x86_avx512_psrav_q_256:
+ case Intrinsic::x86_avx512_psrav_d_512:
+ case Intrinsic::x86_avx512_psrav_q_512:
+ case Intrinsic::x86_avx512_psrav_w_128:
+ case Intrinsic::x86_avx512_psrav_w_256:
+ case Intrinsic::x86_avx512_psrav_w_512:
+ case Intrinsic::x86_avx2_psrlv_d:
+ case Intrinsic::x86_avx2_psrlv_d_256:
+ case Intrinsic::x86_avx2_psrlv_q:
+ case Intrinsic::x86_avx2_psrlv_q_256:
+ case Intrinsic::x86_avx512_psrlv_d_512:
+ case Intrinsic::x86_avx512_psrlv_q_512:
+ case Intrinsic::x86_avx512_psrlv_w_128:
+ case Intrinsic::x86_avx512_psrlv_w_256:
+ case Intrinsic::x86_avx512_psrlv_w_512:
+ if (Value *V = simplifyX86varShift(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_sse2_packssdw_128:
+ case Intrinsic::x86_sse2_packsswb_128:
+ case Intrinsic::x86_avx2_packssdw:
+ case Intrinsic::x86_avx2_packsswb:
+ case Intrinsic::x86_avx512_packssdw_512:
+ case Intrinsic::x86_avx512_packsswb_512:
+ if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_sse2_packuswb_128:
+ case Intrinsic::x86_sse41_packusdw:
+ case Intrinsic::x86_avx2_packusdw:
+ case Intrinsic::x86_avx2_packuswb:
+ case Intrinsic::x86_avx512_packusdw_512:
+ case Intrinsic::x86_avx512_packuswb_512:
+ if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_pclmulqdq:
+ case Intrinsic::x86_pclmulqdq_256:
+ case Intrinsic::x86_pclmulqdq_512: {
+ if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
+ unsigned Imm = C->getZExtValue();
+
+ bool MadeChange = false;
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+ unsigned VWidth =
+ cast<FixedVectorType>(Arg0->getType())->getNumElements();
+
+ APInt UndefElts1(VWidth, 0);
+ APInt DemandedElts1 =
+ APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
+ if (Value *V =
+ IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
+ IC.replaceOperand(II, 0, V);
+ MadeChange = true;
+ }
+
+ APInt UndefElts2(VWidth, 0);
+ APInt DemandedElts2 =
+ APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
+ if (Value *V =
+ IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
+ IC.replaceOperand(II, 1, V);
+ MadeChange = true;
+ }
+
+      // If either input's demanded elements are all undef, the result is zero.
+ if (DemandedElts1.isSubsetOf(UndefElts1) ||
+ DemandedElts2.isSubsetOf(UndefElts2)) {
+ return IC.replaceInstUsesWith(II,
+ ConstantAggregateZero::get(II.getType()));
+ }
+
+ if (MadeChange) {
+ return &II;
+ }
+ }
+ break;
+ }
+
+ case Intrinsic::x86_sse41_insertps:
+ if (Value *V = simplifyX86insertps(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_sse4a_extrq: {
+ Value *Op0 = II.getArgOperand(0);
+ Value *Op1 = II.getArgOperand(1);
+ unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
+ unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
+ assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
+ Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
+ VWidth1 == 16 && "Unexpected operand sizes");
+
+ // See if we're dealing with constant values.
+ Constant *C1 = dyn_cast<Constant>(Op1);
+ ConstantInt *CILength =
+ C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
+ : nullptr;
+ ConstantInt *CIIndex =
+ C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
+ : nullptr;
+
+ // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
+ if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+
+    // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
+    // operand and the lowest 16-bits of the second.
+ bool MadeChange = false;
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
+ IC.replaceOperand(II, 0, V);
+ MadeChange = true;
+ }
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
+ IC.replaceOperand(II, 1, V);
+ MadeChange = true;
+ }
+ if (MadeChange) {
+ return &II;
+ }
+ break;
+ }
+
+ case Intrinsic::x86_sse4a_extrqi: {
+ // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
+ // bits of the lower 64-bits. The upper 64-bits are undefined.
+ Value *Op0 = II.getArgOperand(0);
+ unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
+ assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
+ "Unexpected operand size");
+
+ // See if we're dealing with constant values.
+ ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
+ ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
+
+ // Attempt to simplify to a constant or shuffle vector.
+ if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+
+ // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
+ // operand.
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
+ return IC.replaceOperand(II, 0, V);
+ }
+ break;
+ }
+
+ case Intrinsic::x86_sse4a_insertq: {
+ Value *Op0 = II.getArgOperand(0);
+ Value *Op1 = II.getArgOperand(1);
+ unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
+ assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
+ Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
+ cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
+ "Unexpected operand size");
+
+ // See if we're dealing with constant values.
+ Constant *C1 = dyn_cast<Constant>(Op1);
+ ConstantInt *CI11 =
+ C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
+ : nullptr;
+
+ // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
+ if (CI11) {
+ const APInt &V11 = CI11->getValue();
+ APInt Len = V11.zextOrTrunc(6);
+ APInt Idx = V11.lshr(8).zextOrTrunc(6);
+ if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ }
+
+ // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
+ // operand.
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
+ return IC.replaceOperand(II, 0, V);
+ }
+ break;
+ }
+
+ case Intrinsic::x86_sse4a_insertqi: {
+ // INSERTQI: Extract lowest Length bits from lower half of second source and
+ // insert over first source starting at Index bit. The upper 64-bits are
+ // undefined.
+ Value *Op0 = II.getArgOperand(0);
+ Value *Op1 = II.getArgOperand(1);
+ unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
+ unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
+ assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
+ Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
+ VWidth1 == 2 && "Unexpected operand sizes");
+
+ // See if we're dealing with constant values.
+ ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
+ ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
+
+ // Attempt to simplify to a constant or shuffle vector.
+ if (CILength && CIIndex) {
+ APInt Len = CILength->getValue().zextOrTrunc(6);
+ APInt Idx = CIIndex->getValue().zextOrTrunc(6);
+ if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ }
+
+ // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
+ // operands.
+ bool MadeChange = false;
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
+ IC.replaceOperand(II, 0, V);
+ MadeChange = true;
+ }
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
+ IC.replaceOperand(II, 1, V);
+ MadeChange = true;
+ }
+ if (MadeChange) {
+ return &II;
+ }
+ break;
+ }
+
+ case Intrinsic::x86_sse41_pblendvb:
+ case Intrinsic::x86_sse41_blendvps:
+ case Intrinsic::x86_sse41_blendvpd:
+ case Intrinsic::x86_avx_blendv_ps_256:
+ case Intrinsic::x86_avx_blendv_pd_256:
+ case Intrinsic::x86_avx2_pblendvb: {
+ // fold (blend A, A, Mask) -> A
+ Value *Op0 = II.getArgOperand(0);
+ Value *Op1 = II.getArgOperand(1);
+ Value *Mask = II.getArgOperand(2);
+ if (Op0 == Op1) {
+ return IC.replaceInstUsesWith(II, Op0);
+ }
+
+ // Zero Mask - select 1st argument.
+ if (isa<ConstantAggregateZero>(Mask)) {
+ return IC.replaceInstUsesWith(II, Op0);
+ }
+
+ // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
+ if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
+ Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
+ return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
+ }
+
+ // Convert to a vector select if we can bypass casts and find a boolean
+ // vector condition value.
+ Value *BoolVec;
+ Mask = InstCombiner::peekThroughBitcast(Mask);
+ if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
+ BoolVec->getType()->isVectorTy() &&
+ BoolVec->getType()->getScalarSizeInBits() == 1) {
+ assert(Mask->getType()->getPrimitiveSizeInBits() ==
+ II.getType()->getPrimitiveSizeInBits() &&
+ "Not expecting mask and operands with different sizes");
+
+ unsigned NumMaskElts =
+ cast<FixedVectorType>(Mask->getType())->getNumElements();
+ unsigned NumOperandElts =
+ cast<FixedVectorType>(II.getType())->getNumElements();
+ if (NumMaskElts == NumOperandElts) {
+ return SelectInst::Create(BoolVec, Op1, Op0);
+ }
+
+      // If the mask has fewer elements than the operands, each mask bit maps
+      // to multiple elements of the operands. Bitcast back and forth.
+ if (NumMaskElts < NumOperandElts) {
+ Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
+ Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
+ Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
+ return new BitCastInst(Sel, II.getType());
+ }
+ }
+
+ break;
+ }
+
+ case Intrinsic::x86_ssse3_pshuf_b_128:
+ case Intrinsic::x86_avx2_pshuf_b:
+ case Intrinsic::x86_avx512_pshuf_b_512:
+ if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_avx_vpermilvar_ps:
+ case Intrinsic::x86_avx_vpermilvar_ps_256:
+ case Intrinsic::x86_avx512_vpermilvar_ps_512:
+ case Intrinsic::x86_avx_vpermilvar_pd:
+ case Intrinsic::x86_avx_vpermilvar_pd_256:
+ case Intrinsic::x86_avx512_vpermilvar_pd_512:
+ if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_avx2_permd:
+ case Intrinsic::x86_avx2_permps:
+ case Intrinsic::x86_avx512_permvar_df_256:
+ case Intrinsic::x86_avx512_permvar_df_512:
+ case Intrinsic::x86_avx512_permvar_di_256:
+ case Intrinsic::x86_avx512_permvar_di_512:
+ case Intrinsic::x86_avx512_permvar_hi_128:
+ case Intrinsic::x86_avx512_permvar_hi_256:
+ case Intrinsic::x86_avx512_permvar_hi_512:
+ case Intrinsic::x86_avx512_permvar_qi_128:
+ case Intrinsic::x86_avx512_permvar_qi_256:
+ case Intrinsic::x86_avx512_permvar_qi_512:
+ case Intrinsic::x86_avx512_permvar_sf_512:
+ case Intrinsic::x86_avx512_permvar_si_512:
+ if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_avx_maskload_ps:
+ case Intrinsic::x86_avx_maskload_pd:
+ case Intrinsic::x86_avx_maskload_ps_256:
+ case Intrinsic::x86_avx_maskload_pd_256:
+ case Intrinsic::x86_avx2_maskload_d:
+ case Intrinsic::x86_avx2_maskload_q:
+ case Intrinsic::x86_avx2_maskload_d_256:
+ case Intrinsic::x86_avx2_maskload_q_256:
+ if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
+ return I;
+ }
+ break;
+
+ case Intrinsic::x86_sse2_maskmov_dqu:
+ case Intrinsic::x86_avx_maskstore_ps:
+ case Intrinsic::x86_avx_maskstore_pd:
+ case Intrinsic::x86_avx_maskstore_ps_256:
+ case Intrinsic::x86_avx_maskstore_pd_256:
+ case Intrinsic::x86_avx2_maskstore_d:
+ case Intrinsic::x86_avx2_maskstore_q:
+ case Intrinsic::x86_avx2_maskstore_d_256:
+ case Intrinsic::x86_avx2_maskstore_q_256:
+ if (simplifyX86MaskedStore(II, IC)) {
+ return nullptr;
+ }
+ break;
+
+ case Intrinsic::x86_addcarry_32:
+ case Intrinsic::x86_addcarry_64:
+ if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ default:
+ break;
+ }
+ return None;
+}
+
+Optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
+ InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
+ bool &KnownBitsComputed) const {
+ switch (II.getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::x86_mmx_pmovmskb:
+ case Intrinsic::x86_sse_movmsk_ps:
+ case Intrinsic::x86_sse2_movmsk_pd:
+ case Intrinsic::x86_sse2_pmovmskb_128:
+ case Intrinsic::x86_avx_movmsk_ps_256:
+ case Intrinsic::x86_avx_movmsk_pd_256:
+ case Intrinsic::x86_avx2_pmovmskb: {
+ // MOVMSK copies the vector elements' sign bits to the low bits
+ // and zeros the high bits.
+ unsigned ArgWidth;
+ if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
+ ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
+ } else {
+ auto Arg = II.getArgOperand(0);
+ auto ArgType = cast<FixedVectorType>(Arg->getType());
+ ArgWidth = ArgType->getNumElements();
+ }
+
+    // If we don't need any of the low bits then return zero;
+    // we know that DemandedMask is non-zero already.
+ APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
+ Type *VTy = II.getType();
+ if (DemandedElts.isNullValue()) {
+ return ConstantInt::getNullValue(VTy);
+ }
+
+ // We know that the upper bits are set to zero.
+ Known.Zero.setBitsFrom(ArgWidth);
+ KnownBitsComputed = true;
+ break;
+ }
+ }
+ return None;
+}
+
+Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
+ InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
+ APInt &UndefElts2, APInt &UndefElts3,
+ std::function<void(Instruction *, unsigned, APInt, APInt &)>
+ simplifyAndSetOp) const {
+ unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
+ switch (II.getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::x86_xop_vfrcz_ss:
+ case Intrinsic::x86_xop_vfrcz_sd:
+    // The instructions for these intrinsics are specified to zero the upper
+    // bits, not pass them through like other scalar intrinsics. So we
+    // shouldn't just use Arg0 if DemandedElts[0] is clear like we do for
+    // other intrinsics. Instead we should return a zero vector.
+ if (!DemandedElts[0]) {
+ IC.addToWorklist(&II);
+ return ConstantAggregateZero::get(II.getType());
+ }
+
+ // Only the lower element is used.
+ DemandedElts = 1;
+ simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
+
+ // Only the lower element is undefined. The high elements are zero.
+ UndefElts = UndefElts[0];
+ break;
+
+ // Unary scalar-as-vector operations that work column-wise.
+ case Intrinsic::x86_sse_rcp_ss:
+ case Intrinsic::x86_sse_rsqrt_ss:
+ simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
+
+ // If lowest element of a scalar op isn't used then use Arg0.
+ if (!DemandedElts[0]) {
+ IC.addToWorklist(&II);
+ return II.getArgOperand(0);
+ }
+ // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
+ // checks).
+ break;
+
+ // Binary scalar-as-vector operations that work column-wise. The high
+ // elements come from operand 0. The low element is a function of both
+ // operands.
+ case Intrinsic::x86_sse_min_ss:
+ case Intrinsic::x86_sse_max_ss:
+ case Intrinsic::x86_sse_cmp_ss:
+ case Intrinsic::x86_sse2_min_sd:
+ case Intrinsic::x86_sse2_max_sd:
+ case Intrinsic::x86_sse2_cmp_sd: {
+ simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
+
+ // If lowest element of a scalar op isn't used then use Arg0.
+ if (!DemandedElts[0]) {
+ IC.addToWorklist(&II);
+ return II.getArgOperand(0);
+ }
+
+ // Only lower element is used for operand 1.
+ DemandedElts = 1;
+ simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
+
+ // Lower element is undefined if both lower elements are undefined.
+ // Consider things like undef&0. The result is known zero, not undef.
+ if (!UndefElts2[0])
+ UndefElts.clearBit(0);
+
+ break;
+ }
+
+ // Binary scalar-as-vector operations that work column-wise. The high
+ // elements come from operand 0 and the low element comes from operand 1.
+ case Intrinsic::x86_sse41_round_ss:
+ case Intrinsic::x86_sse41_round_sd: {
+ // Don't use the low element of operand 0.
+ APInt DemandedElts2 = DemandedElts;
+ DemandedElts2.clearBit(0);
+ simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
+
+ // If lowest element of a scalar op isn't used then use Arg0.
+ if (!DemandedElts[0]) {
+ IC.addToWorklist(&II);
+ return II.getArgOperand(0);
+ }
+
+ // Only lower element is used for operand 1.
+ DemandedElts = 1;
+ simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
+
+ // Take the high undef elements from operand 0 and take the lower element
+ // from operand 1.
+ UndefElts.clearBit(0);
+ UndefElts |= UndefElts2[0];
+ break;
+ }
+
+ // Three input scalar-as-vector operations that work column-wise. The high
+ // elements come from operand 0 and the low element is a function of all
+ // three inputs.
+ case Intrinsic::x86_avx512_mask_add_ss_round:
+ case Intrinsic::x86_avx512_mask_div_ss_round:
+ case Intrinsic::x86_avx512_mask_mul_ss_round:
+ case Intrinsic::x86_avx512_mask_sub_ss_round:
+ case Intrinsic::x86_avx512_mask_max_ss_round:
+ case Intrinsic::x86_avx512_mask_min_ss_round:
+ case Intrinsic::x86_avx512_mask_add_sd_round:
+ case Intrinsic::x86_avx512_mask_div_sd_round:
+ case Intrinsic::x86_avx512_mask_mul_sd_round:
+ case Intrinsic::x86_avx512_mask_sub_sd_round:
+ case Intrinsic::x86_avx512_mask_max_sd_round:
+ case Intrinsic::x86_avx512_mask_min_sd_round:
+ simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
+
+ // If lowest element of a scalar op isn't used then use Arg0.
+ if (!DemandedElts[0]) {
+ IC.addToWorklist(&II);
+ return II.getArgOperand(0);
+ }
+
+    // Only the lower element is used for operands 1 and 2.
+ DemandedElts = 1;
+ simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
+ simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
+
+ // Lower element is undefined if all three lower elements are undefined.
+ // Consider things like undef&0. The result is known zero, not undef.
+ if (!UndefElts2[0] || !UndefElts3[0])
+ UndefElts.clearBit(0);
+ break;
+
+ // TODO: Add fmaddsub support?
+ case Intrinsic::x86_sse3_addsub_pd:
+ case Intrinsic::x86_sse3_addsub_ps:
+ case Intrinsic::x86_avx_addsub_pd_256:
+ case Intrinsic::x86_avx_addsub_ps_256: {
+ // If none of the even or none of the odd lanes are required, turn this
+ // into a generic FP math instruction.
+ APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
+ APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
+ bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
+ bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
+ if (IsSubOnly || IsAddOnly) {
+ assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
+ IRBuilderBase::InsertPointGuard Guard(IC.Builder);
+ IC.Builder.SetInsertPoint(&II);
+ Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
+ return IC.Builder.CreateBinOp(
+ IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
+ }
+
+ simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
+ simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
+ UndefElts &= UndefElts2;
+ break;
+ }
+
+ case Intrinsic::x86_sse2_packssdw_128:
+ case Intrinsic::x86_sse2_packsswb_128:
+ case Intrinsic::x86_sse2_packuswb_128:
+ case Intrinsic::x86_sse41_packusdw:
+ case Intrinsic::x86_avx2_packssdw:
+ case Intrinsic::x86_avx2_packsswb:
+ case Intrinsic::x86_avx2_packusdw:
+ case Intrinsic::x86_avx2_packuswb:
+ case Intrinsic::x86_avx512_packssdw_512:
+ case Intrinsic::x86_avx512_packsswb_512:
+ case Intrinsic::x86_avx512_packusdw_512:
+ case Intrinsic::x86_avx512_packuswb_512: {
+ auto *Ty0 = II.getArgOperand(0)->getType();
+ unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
+ assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
+
+ unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
+ unsigned VWidthPerLane = VWidth / NumLanes;
+ unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
+
+ // Per lane, pack the elements of the first input and then the second.
+ // e.g.
+ // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
+ // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
+ for (int OpNum = 0; OpNum != 2; ++OpNum) {
+ APInt OpDemandedElts(InnerVWidth, 0);
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ unsigned LaneIdx = Lane * VWidthPerLane;
+ for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
+ unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
+ if (DemandedElts[Idx])
+ OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
+ }
+ }
+
+ // Demand elements from the operand.
+ APInt OpUndefElts(InnerVWidth, 0);
+ simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
+
+ // Pack the operand's UNDEF elements, one lane at a time.
+ OpUndefElts = OpUndefElts.zext(VWidth);
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
+ LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
+ LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
+ UndefElts |= LaneElts;
+ }
+ }
+ break;
+ }
+
+ // PSHUFB
+ case Intrinsic::x86_ssse3_pshuf_b_128:
+ case Intrinsic::x86_avx2_pshuf_b:
+ case Intrinsic::x86_avx512_pshuf_b_512:
+ // PERMILVAR
+ case Intrinsic::x86_avx_vpermilvar_ps:
+ case Intrinsic::x86_avx_vpermilvar_ps_256:
+ case Intrinsic::x86_avx512_vpermilvar_ps_512:
+ case Intrinsic::x86_avx_vpermilvar_pd:
+ case Intrinsic::x86_avx_vpermilvar_pd_256:
+ case Intrinsic::x86_avx512_vpermilvar_pd_512:
+ // PERMV
+ case Intrinsic::x86_avx2_permd:
+ case Intrinsic::x86_avx2_permps: {
+ simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
+ break;
+ }
+
+ // SSE4A instructions leave the upper 64-bits of the 128-bit result
+ // in an undefined state.
+ case Intrinsic::x86_sse4a_extrq:
+ case Intrinsic::x86_sse4a_extrqi:
+ case Intrinsic::x86_sse4a_insertq:
+ case Intrinsic::x86_sse4a_insertqi:
+ UndefElts.setHighBits(VWidth / 2);
+ break;
+ }
+ return None;
+}
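
As background for two of the folds above: MOVMSK packs one sign bit per vector element into the low bits of its scalar result and zeroes every higher bit, which is why simplifyDemandedUseBitsIntrinsic can set Known.Zero from ArgWidth upward and fold the call to zero when none of the low bits are demanded. BLENDV picks each result lane from the second operand when the top bit of the corresponding mask lane is set, which is why a sign-extended boolean mask lets the intrinsic become a plain select. The following minimal scalar sketch in plain C++ illustrates only those two semantics; movmsk_ps_model and blendv_model are hypothetical helper names, not LLVM or intrinsic APIs.

// Illustration only: scalar models of two x86 semantics exploited above.
#include <cstdint>
#include <cstdio>
#include <cstring>

// movmskps: bit i of the result is the sign bit of element i; all higher
// bits are always zero.
static uint32_t movmsk_ps_model(const float v[4]) {
  uint32_t mask = 0;
  for (int i = 0; i < 4; ++i) {
    uint32_t bits;
    std::memcpy(&bits, &v[i], sizeof(bits)); // raw IEEE-754 bits of element i
    mask |= (bits >> 31) << i;               // copy the sign bit to bit i
  }
  return mask;
}

// pblendvb: each result byte comes from b when the top bit of the matching
// mask byte is set, otherwise from a.
static void blendv_model(const uint8_t a[16], const uint8_t b[16],
                         const uint8_t mask[16], uint8_t out[16]) {
  for (int i = 0; i < 16; ++i)
    out[i] = (mask[i] & 0x80) ? b[i] : a[i];
}

int main() {
  float v[4] = {-1.0f, 2.0f, -0.0f, 4.0f};
  std::printf("movmsk = 0x%x\n", (unsigned)movmsk_ps_model(v)); // 0x5

  uint8_t a[16], b[16], m[16], r[16];
  for (int i = 0; i < 16; ++i) {
    a[i] = 0x11;
    b[i] = 0x22;
    m[i] = (i % 2) ? 0xFF : 0x00; // sign-extended booleans: odd lanes true
  }
  blendv_model(a, b, m, r);
  std::printf("blendv lane0=0x%x lane1=0x%x\n", (unsigned)r[0],
              (unsigned)r[1]); // 0x11 0x22
  return 0;
}

The same top-bit convention is what lets the constant-mask path above build a boolean selector with getNegativeIsTrueBoolVec and hand the rest to a SelectInst.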
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAMX.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAMX.td
index e26dd5050a23..e4f3290cab9f 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAMX.td
@@ -16,17 +16,21 @@
let Predicates = [HasAMXTILE, In64BitMode] in {
let SchedRW = [WriteSystem] in {
- let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
+ let hasSideEffects = 1,
+ Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
def LDTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
"ldtilecfg\t$src",
[(int_x86_ldtilecfg addr:$src)]>, VEX, T8PS;
+ let hasSideEffects = 1 in
def STTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
"sttilecfg\t$src",
[(int_x86_sttilecfg addr:$src)]>, VEX, T8PD;
+ let mayLoad = 1 in
def TILELOADD : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
(ins sibmem:$src),
"tileloadd\t{$src, $dst|$dst, $src}", []>,
VEX, T8XD;
+ let mayLoad = 1 in
def TILELOADDT1 : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
(ins sibmem:$src),
"tileloaddt1\t{$src, $dst|$dst, $src}", []>,
@@ -34,6 +38,7 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
def TILERELEASE : I<0x49, MRM_C0, (outs), (ins),
"tilerelease", [(int_x86_tilerelease)]>, VEX, T8PS;
+ let mayStore = 1 in
def TILESTORED : I<0x4b, MRMDestMemFSIB, (outs),
(ins sibmem:$dst, TILE:$src),
"tilestored\t{$src, $dst|$dst, $src}", []>,
@@ -42,6 +47,25 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
"tilezero\t$dst", []>,
VEX, T8XD;
+  // Pseudo instruction for RA.
+ let hasSideEffects = 1, mayLoad = 1,
+ Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
+ def PLDTILECFG : PseudoI <(outs TILECFG:$cfg), (ins opaquemem:$src), []>;
+
+ let hasSideEffects = 1, mayStore = 1 in
+ def PSTTILECFG : PseudoI<(outs), (ins opaquemem:$dst, TILECFG:$cfg), []>;
+
+ def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ GR16:$src2,
+ opaquemem:$src3,
+ TILECFG:$cfg), []>;
+ def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
+ GR16:$src2, opaquemem:$src3,
+ TILE:$src4, TILECFG:$cfg), []>;
+ def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ GR16:$src2,
+ TILECFG:$cfg), []>;
+
let usesCustomInserter = 1 in {
// Pseudo instructions, using immediates instead of tile registers.
// To be translated to the actual instructions in X86ISelLowering.cpp
@@ -50,7 +74,7 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
sibmem:$src2), []>;
def PTILESTORED : PseudoI<(outs), (ins i8mem:$dst, u8imm:$src), []>;
def PTILEZERO : PseudoI<(outs), (ins u8imm:$src),
- [(int_x86_tilezero imm:$src)]>;
+ [(int_x86_tilezero timm:$src)]>;
}
} // SchedRW
} // HasAMXTILE
@@ -76,25 +100,31 @@ let Predicates = [HasAMXINT8, In64BitMode] in {
VEX_4V, T8PS;
}
+  // Pseudo instruction for RA.
+ let Constraints = "$src4 = $dst" in
+ def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ GR16:$src2, GR16:$src3, TILE:$src4,
+ TILE:$src5, TILE:$src6, TILECFG:$cfg), []>;
+
let usesCustomInserter = 1 in {
// Pseudo instructions, using immediates instead of tile registers.
// To be translated to the actual instructions in X86ISelLowering.cpp
def PTDPBSSD : PseudoI<(outs), (ins u8imm:$src1,
u8imm:$src2, u8imm:$src3),
- [(int_x86_tdpbssd imm:$src1,
- imm:$src2, imm:$src3)]>;
+ [(int_x86_tdpbssd timm:$src1,
+ timm:$src2, timm:$src3)]>;
def PTDPBSUD : PseudoI<(outs), (ins u8imm:$src1,
u8imm:$src2, u8imm:$src3),
- [(int_x86_tdpbsud imm:$src1,
- imm:$src2, imm:$src3)]>;
+ [(int_x86_tdpbsud timm:$src1,
+ timm:$src2, timm:$src3)]>;
def PTDPBUSD : PseudoI<(outs), (ins u8imm:$src1,
u8imm:$src2, u8imm:$src3),
- [(int_x86_tdpbusd imm:$src1,
- imm:$src2, imm:$src3)]>;
+ [(int_x86_tdpbusd timm:$src1,
+ timm:$src2, timm:$src3)]>;
def PTDPBUUD : PseudoI<(outs), (ins u8imm:$src1,
u8imm:$src2, u8imm:$src3),
- [(int_x86_tdpbuud imm:$src1,
- imm:$src2, imm:$src3)]>;
+ [(int_x86_tdpbuud timm:$src1,
+ timm:$src2, timm:$src3)]>;
}
}
} // HasAMXTILE
@@ -112,8 +142,8 @@ let Predicates = [HasAMXBF16, In64BitMode] in {
// To be translated to the actual instructions in X86ISelLowering.cpp
def PTDPBF16PS : PseudoI<(outs), (ins u8imm:$src1,
u8imm:$src2, u8imm:$src3),
- [(int_x86_tdpbf16ps imm:$src1,
- imm:$src2, imm:$src3)]>;
+ [(int_x86_tdpbf16ps timm:$src1,
+ timm:$src2, timm:$src3)]>;
}
}
} // HasAMXTILE, HasAMXBF16
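
As background for the AMX definitions above: the tile load/store instructions are now flagged mayLoad/mayStore/hasSideEffects so they are treated as real memory operations, and the intrinsic patterns select on timm because the tile register numbers are immediates fixed at compile time. A hedged user-level sketch shows that constraint from the C side; TileConfig and tile_dp_demo are hypothetical names, the intrinsics come from <immintrin.h> when built with -mamx-tile -mamx-int8, running it additionally needs AMX hardware plus OS enablement, and the tile-config layout below follows the Intel SDM but should be verified against the manual.

// Illustration only: a user-level AMX sketch, not part of this patch.
#include <immintrin.h>
#include <cstdint>

struct alignas(64) TileConfig {
  uint8_t  palette_id;   // byte 0
  uint8_t  start_row;    // byte 1
  uint8_t  reserved[14]; // bytes 2..15
  uint16_t colsb[16];    // bytes 16..47: bytes per row for tiles 0..15
  uint8_t  rows[16];     // bytes 48..63: row counts for tiles 0..15
};

// Accumulates an int8 dot-product matrix multiply into an int32 tile.
// The tile numbers (0, 1, 2) must be compile-time constants, which is the
// property the timm patterns above enforce on the underlying intrinsics.
void tile_dp_demo(const int8_t *a, const int8_t *b, int32_t *c, long stride) {
  TileConfig cfg = {};
  cfg.palette_id = 1;
  cfg.rows[0] = cfg.rows[1] = cfg.rows[2] = 16;
  cfg.colsb[0] = cfg.colsb[1] = cfg.colsb[2] = 64;
  _tile_loadconfig(&cfg);       // ldtilecfg

  _tile_zero(0);                // tilezero  t0 (accumulator)
  _tile_loadd(1, a, stride);    // tileloadd t1
  _tile_loadd(2, b, stride);    // tileloadd t2
  _tile_dpbssd(0, 1, 2);        // tdpbssd   t0, t1, t2
  _tile_stored(0, c, stride);   // tilestored t0
  _tile_release();              // tilerelease
}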
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td
index a3ad0b1c8dd6..0c2b278fdd7b 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -1414,11 +1414,12 @@ defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq",
avx512vl_i64_info, HasAVX512, 1>, VEX_W1X;
multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
- X86VectorVTInfo _Dst, X86VectorVTInfo _Src> {
+ SDPatternOperator OpNode,
+ X86VectorVTInfo _Dst,
+ X86VectorVTInfo _Src> {
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
- (_Dst.VT (X86SubVBroadcast
- (_Src.VT (_Src.LdFrag addr:$src))))>,
+ (_Dst.VT (OpNode addr:$src))>,
Sched<[SchedWriteShuffle.YMM.Folded]>,
AVX5128IBase, EVEX;
}
@@ -1427,13 +1428,14 @@ multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
// the unmasked patterns so that we only use the DQ instructions when masking
// is requested.
multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
- X86VectorVTInfo _Dst, X86VectorVTInfo _Src> {
+ SDPatternOperator OpNode,
+ X86VectorVTInfo _Dst,
+ X86VectorVTInfo _Src> {
let hasSideEffects = 0, mayLoad = 1 in
defm rm : AVX512_maskable_split<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
(null_frag),
- (_Dst.VT (X86SubVBroadcast
- (_Src.VT (_Src.LdFrag addr:$src))))>,
+ (_Dst.VT (OpNode addr:$src))>,
Sched<[SchedWriteShuffle.YMM.Folded]>,
AVX5128IBase, EVEX;
}
@@ -1443,225 +1445,194 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
//
defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
- v16i32_info, v4i32x_info>,
+ X86SubVBroadcastld128, v16i32_info, v4i32x_info>,
EVEX_V512, EVEX_CD8<32, CD8VT4>;
defm VBROADCASTF32X4 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4",
- v16f32_info, v4f32x_info>,
+ X86SubVBroadcastld128, v16f32_info, v4f32x_info>,
EVEX_V512, EVEX_CD8<32, CD8VT4>;
defm VBROADCASTI64X4 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti64x4",
- v8i64_info, v4i64x_info>, VEX_W,
+ X86SubVBroadcastld256, v8i64_info, v4i64x_info>, VEX_W,
EVEX_V512, EVEX_CD8<64, CD8VT4>;
defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4",
- v8f64_info, v4f64x_info>, VEX_W,
+ X86SubVBroadcastld256, v8f64_info, v4f64x_info>, VEX_W,
EVEX_V512, EVEX_CD8<64, CD8VT4>;
let Predicates = [HasAVX512] in {
-def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
+def : Pat<(v8f64 (X86SubVBroadcastld256 addr:$src)),
(VBROADCASTF64X4rm addr:$src)>;
-def : Pat<(v16i32 (X86SubVBroadcast (loadv8i32 addr:$src))),
+def : Pat<(v16f32 (X86SubVBroadcastld256 addr:$src)),
+ (VBROADCASTF64X4rm addr:$src)>;
+def : Pat<(v8i64 (X86SubVBroadcastld256 addr:$src)),
+ (VBROADCASTI64X4rm addr:$src)>;
+def : Pat<(v16i32 (X86SubVBroadcastld256 addr:$src)),
(VBROADCASTI64X4rm addr:$src)>;
-def : Pat<(v32i16 (X86SubVBroadcast (loadv16i16 addr:$src))),
+def : Pat<(v32i16 (X86SubVBroadcastld256 addr:$src)),
(VBROADCASTI64X4rm addr:$src)>;
-def : Pat<(v64i8 (X86SubVBroadcast (loadv32i8 addr:$src))),
+def : Pat<(v64i8 (X86SubVBroadcastld256 addr:$src)),
(VBROADCASTI64X4rm addr:$src)>;
-// Provide fallback in case the load node that is used in the patterns above
-// is used by additional users, which prevents the pattern selection.
-def : Pat<(v8f64 (X86SubVBroadcast (v4f64 VR256X:$src))),
- (VINSERTF64x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
- (v4f64 VR256X:$src), 1)>;
-def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
- (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
- (v8f32 VR256X:$src), 1)>;
-def : Pat<(v8i64 (X86SubVBroadcast (v4i64 VR256X:$src))),
- (VINSERTI64x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
- (v4i64 VR256X:$src), 1)>;
-def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
- (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
- (v8i32 VR256X:$src), 1)>;
-def : Pat<(v32i16 (X86SubVBroadcast (v16i16 VR256X:$src))),
- (VINSERTI64x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
- (v16i16 VR256X:$src), 1)>;
-def : Pat<(v64i8 (X86SubVBroadcast (v32i8 VR256X:$src))),
- (VINSERTI64x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
- (v32i8 VR256X:$src), 1)>;
-
-def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
+def : Pat<(v8f64 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF32X4rm addr:$src)>;
-def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+def : Pat<(v16f32 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF32X4rm addr:$src)>;
+def : Pat<(v8i64 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTI32X4rm addr:$src)>;
+def : Pat<(v16i32 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTI32X4rm addr:$src)>;
-def : Pat<(v32i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
+def : Pat<(v32i16 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTI32X4rm addr:$src)>;
-def : Pat<(v64i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
+def : Pat<(v64i8 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTI32X4rm addr:$src)>;
// Patterns for selects of bitcasted operations.
def : Pat<(vselect_mask VK16WM:$mask,
- (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+ (bc_v16f32 (v8f64 (X86SubVBroadcastld128 addr:$src))),
(v16f32 immAllZerosV)),
(VBROADCASTF32X4rmkz VK16WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK16WM:$mask,
- (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+ (bc_v16f32 (v8f64 (X86SubVBroadcastld128 addr:$src))),
VR512:$src0),
(VBROADCASTF32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK16WM:$mask,
- (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+ (bc_v16i32 (v8i64 (X86SubVBroadcastld128 addr:$src))),
(v16i32 immAllZerosV)),
(VBROADCASTI32X4rmkz VK16WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK16WM:$mask,
- (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+ (bc_v16i32 (v8i64 (X86SubVBroadcastld128 addr:$src))),
VR512:$src0),
(VBROADCASTI32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))),
+ (bc_v8f64 (v16f32 (X86SubVBroadcastld256 addr:$src))),
(v8f64 immAllZerosV)),
(VBROADCASTF64X4rmkz VK8WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))),
+ (bc_v8f64 (v16f32 (X86SubVBroadcastld256 addr:$src))),
VR512:$src0),
(VBROADCASTF64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))),
+ (bc_v8i64 (v16i32 (X86SubVBroadcastld256 addr:$src))),
(v8i64 immAllZerosV)),
(VBROADCASTI64X4rmkz VK8WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))),
+ (bc_v8i64 (v16i32 (X86SubVBroadcastld256 addr:$src))),
VR512:$src0),
(VBROADCASTI64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
}
let Predicates = [HasVLX] in {
defm VBROADCASTI32X4Z256 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
- v8i32x_info, v4i32x_info>,
+ X86SubVBroadcastld128, v8i32x_info, v4i32x_info>,
EVEX_V256, EVEX_CD8<32, CD8VT4>;
defm VBROADCASTF32X4Z256 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4",
- v8f32x_info, v4f32x_info>,
+ X86SubVBroadcastld128, v8f32x_info, v4f32x_info>,
EVEX_V256, EVEX_CD8<32, CD8VT4>;
-def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
+def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF32X4Z256rm addr:$src)>;
-def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF32X4Z256rm addr:$src)>;
+def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTI32X4Z256rm addr:$src)>;
+def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTI32X4Z256rm addr:$src)>;
-def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
+def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTI32X4Z256rm addr:$src)>;
-def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
+def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTI32X4Z256rm addr:$src)>;
// Patterns for selects of bitcasted operations.
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+ (bc_v8f32 (v4f64 (X86SubVBroadcastld128 addr:$src))),
(v8f32 immAllZerosV)),
(VBROADCASTF32X4Z256rmkz VK8WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+ (bc_v8f32 (v4f64 (X86SubVBroadcastld128 addr:$src))),
VR256X:$src0),
(VBROADCASTF32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+ (bc_v8i32 (v4i64 (X86SubVBroadcastld128 addr:$src))),
(v8i32 immAllZerosV)),
(VBROADCASTI32X4Z256rmkz VK8WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+ (bc_v8i32 (v4i64 (X86SubVBroadcastld128 addr:$src))),
VR256X:$src0),
(VBROADCASTI32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>;
-
-
-// Provide fallback in case the load node that is used in the patterns above
-// is used by additional users, which prevents the pattern selection.
-def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
- (VINSERTF32x4Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- (v2f64 VR128X:$src), 1)>;
-def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
- (VINSERTF32x4Z256rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- (v4f32 VR128X:$src), 1)>;
-def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
- (VINSERTI32x4Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- (v2i64 VR128X:$src), 1)>;
-def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
- (VINSERTI32x4Z256rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- (v4i32 VR128X:$src), 1)>;
-def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128X:$src))),
- (VINSERTI32x4Z256rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- (v8i16 VR128X:$src), 1)>;
-def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
- (VINSERTI32x4Z256rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- (v16i8 VR128X:$src), 1)>;
}
let Predicates = [HasVLX, HasDQI] in {
defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2",
- v4i64x_info, v2i64x_info>, VEX_W1X,
+ X86SubVBroadcastld128, v4i64x_info, v2i64x_info>, VEX_W1X,
EVEX_V256, EVEX_CD8<64, CD8VT2>;
defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2",
- v4f64x_info, v2f64x_info>, VEX_W1X,
+ X86SubVBroadcastld128, v4f64x_info, v2f64x_info>, VEX_W1X,
EVEX_V256, EVEX_CD8<64, CD8VT2>;
// Patterns for selects of bitcasted operations.
def : Pat<(vselect_mask VK4WM:$mask,
- (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+ (bc_v4f64 (v8f32 (X86SubVBroadcastld128 addr:$src))),
(v4f64 immAllZerosV)),
(VBROADCASTF64X2Z128rmkz VK4WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK4WM:$mask,
- (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+ (bc_v4f64 (v8f32 (X86SubVBroadcastld128 addr:$src))),
VR256X:$src0),
(VBROADCASTF64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK4WM:$mask,
- (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
+ (bc_v4i64 (v8i32 (X86SubVBroadcastld128 addr:$src))),
(v4i64 immAllZerosV)),
(VBROADCASTI64X2Z128rmkz VK4WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK4WM:$mask,
- (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
+ (bc_v4i64 (v8i32 (X86SubVBroadcastld128 addr:$src))),
VR256X:$src0),
(VBROADCASTI64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
}
let Predicates = [HasDQI] in {
defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2",
- v8i64_info, v2i64x_info>, VEX_W,
+ X86SubVBroadcastld128, v8i64_info, v2i64x_info>, VEX_W,
EVEX_V512, EVEX_CD8<64, CD8VT2>;
defm VBROADCASTI32X8 : avx512_subvec_broadcast_rm_dq<0x5b, "vbroadcasti32x8",
- v16i32_info, v8i32x_info>,
+ X86SubVBroadcastld256, v16i32_info, v8i32x_info>,
EVEX_V512, EVEX_CD8<32, CD8VT8>;
defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2",
- v8f64_info, v2f64x_info>, VEX_W,
+ X86SubVBroadcastld128, v8f64_info, v2f64x_info>, VEX_W,
EVEX_V512, EVEX_CD8<64, CD8VT2>;
defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8",
- v16f32_info, v8f32x_info>,
+ X86SubVBroadcastld256, v16f32_info, v8f32x_info>,
EVEX_V512, EVEX_CD8<32, CD8VT8>;
// Patterns for selects of bitcasted operations.
def : Pat<(vselect_mask VK16WM:$mask,
- (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))),
+ (bc_v16f32 (v8f64 (X86SubVBroadcastld256 addr:$src))),
(v16f32 immAllZerosV)),
(VBROADCASTF32X8rmkz VK16WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK16WM:$mask,
- (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))),
+ (bc_v16f32 (v8f64 (X86SubVBroadcastld256 addr:$src))),
VR512:$src0),
(VBROADCASTF32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK16WM:$mask,
- (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))),
+ (bc_v16i32 (v8i64 (X86SubVBroadcastld256 addr:$src))),
(v16i32 immAllZerosV)),
(VBROADCASTI32X8rmkz VK16WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK16WM:$mask,
- (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))),
+ (bc_v16i32 (v8i64 (X86SubVBroadcastld256 addr:$src))),
VR512:$src0),
(VBROADCASTI32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+ (bc_v8f64 (v16f32 (X86SubVBroadcastld128 addr:$src))),
(v8f64 immAllZerosV)),
(VBROADCASTF64X2rmkz VK8WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+ (bc_v8f64 (v16f32 (X86SubVBroadcastld128 addr:$src))),
VR512:$src0),
(VBROADCASTF64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
+ (bc_v8i64 (v16i32 (X86SubVBroadcastld128 addr:$src))),
(v8i64 immAllZerosV)),
(VBROADCASTI64X2rmkz VK8WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
+ (bc_v8i64 (v16i32 (X86SubVBroadcastld128 addr:$src))),
VR512:$src0),
(VBROADCASTI64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
}
@@ -2494,10 +2465,6 @@ def X86cmpm_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
(X86cmpm node:$src1, node:$src2, node:$cc), [{
return N->hasOneUse();
}]>;
-def X86cmpmSAE_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
- (X86cmpmSAE node:$src1, node:$src2, node:$cc), [{
- return N->hasOneUse();
-}]>;
def X86cmpm_imm_commute : SDNodeXForm<timm, [{
uint8_t Imm = X86::getSwappedVCMPImm(N->getZExtValue() & 0x1f);
@@ -2564,19 +2531,71 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in {
(!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask,
_.RC:$src1, addr:$src2,
(X86cmpm_imm_commute timm:$cc))>;
+
+ // Patterns for mask intrinsics.
+ def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc,
+ (_.KVT immAllOnesV)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rri") _.RC:$src1, _.RC:$src2, timm:$cc)>;
+
+ def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc, _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix#"rrik") _.KRCWM:$mask, _.RC:$src1,
+ _.RC:$src2, timm:$cc)>;
+
+ def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), timm:$cc,
+ (_.KVT immAllOnesV)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2, timm:$cc)>;
+
+ def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), timm:$cc,
+ _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask, _.RC:$src1,
+ addr:$src2, timm:$cc)>;
+
+ def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.BroadcastLdFrag addr:$src2)), timm:$cc,
+ (_.KVT immAllOnesV)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2, timm:$cc)>;
+
+ def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.BroadcastLdFrag addr:$src2)), timm:$cc,
+ _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask, _.RC:$src1,
+ addr:$src2, timm:$cc)>;
+
+ // Patterns for mask intrinsics with loads in other operand.
+ def : Pat<(X86cmpmm (_.VT (_.LdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
+ (_.KVT immAllOnesV)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
+ (X86cmpm_imm_commute timm:$cc))>;
+
+ def : Pat<(X86cmpmm (_.VT (_.LdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
+ _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2,
+ (X86cmpm_imm_commute timm:$cc))>;
+
+ def : Pat<(X86cmpmm (_.VT (_.BroadcastLdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
+ (_.KVT immAllOnesV)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2,
+ (X86cmpm_imm_commute timm:$cc))>;
+
+ def : Pat<(X86cmpmm (_.VT (_.BroadcastLdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
+ _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2,
+ (X86cmpm_imm_commute timm:$cc))>;
}
multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> {
// comparison code form (VCMP[EQ/LT/LE/...]
let Uses = [MXCSR] in
- defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
- (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ defm rrib : AVX512_maskable_custom_cmp<0xC2, MRMSrcReg, (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, {sae}, $src2, $src1",
"$src1, $src2, {sae}, $cc",
- (X86cmpmSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc),
- (X86cmpmSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- timm:$cc)>,
+ [(set _.KRC:$dst, (X86cmpmmSAE (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2), timm:$cc, (_.KVT immAllOnesV)))],
+ [(set _.KRC:$dst, (X86cmpmmSAE (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2), timm:$cc, _.KRCWM:$mask))]>,
EVEX_B, Sched<[sched]>;
}
@@ -2836,6 +2855,8 @@ def : Pat<(v16i1 (bitconvert (i16 GR16:$src))),
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit)), VK16)>;
def : Pat<(i16 (bitconvert (v16i1 VK16:$src))),
(EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK16:$src, GR32)), sub_16bit)>;
+def : Pat<(i8 (trunc (i16 (bitconvert (v16i1 VK16:$src))))),
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK16:$src, GR32)), sub_8bit)>;
def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$src, sub_8bit)), VK8)>;
@@ -2871,9 +2892,6 @@ def : Pat<(i64 (bitconvert (v64i1 VK64:$src))),
// Load/store kreg
let Predicates = [HasDQI] in {
- def : Pat<(store VK1:$src, addr:$dst),
- (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK8))>;
-
def : Pat<(v1i1 (load addr:$src)),
(COPY_TO_REGCLASS (KMOVBkm addr:$src), VK1)>;
def : Pat<(v2i1 (load addr:$src)),
@@ -2919,10 +2937,9 @@ let Predicates = [HasAVX512] in {
def : Pat<(insert_subvector (v16i1 immAllZerosV),
(v1i1 (scalar_to_vector GR8:$src)), (iPTR 0)),
- (COPY_TO_REGCLASS
- (KMOVWkr (AND32ri8
- (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit),
- (i32 1))), VK16)>;
+ (KMOVWkr (AND32ri8
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit),
+ (i32 1)))>;
}
// Mask unary operation
@@ -6487,8 +6504,8 @@ multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
avx512vl_f64_info, "PD">, VEX_W;
}
-defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86any_Fmadd,
- X86Fmadd, X86FmaddRnd>;
+defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", any_fma,
+ fma, X86FmaddRnd>;
defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86any_Fmsub,
X86Fmsub, X86FmsubRnd>;
defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub,
@@ -6578,8 +6595,8 @@ multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
avx512vl_f64_info, "PD">, VEX_W;
}
-defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86any_Fmadd,
- X86Fmadd, X86FmaddRnd>;
+defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", any_fma,
+ fma, X86FmaddRnd>;
defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86any_Fmsub,
X86Fmsub, X86FmsubRnd>;
defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub,
@@ -6670,8 +6687,8 @@ multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
avx512vl_f64_info, "PD">, VEX_W;
}
-defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86any_Fmadd,
- X86Fmadd, X86FmaddRnd>;
+defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", any_fma,
+ fma, X86FmaddRnd>;
defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86any_Fmsub,
X86Fmsub, X86FmsubRnd>;
defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub,
@@ -6773,7 +6790,7 @@ multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
}
}
-defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86any_Fmadd, X86FmaddRnd>;
+defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", any_fma, X86FmaddRnd>;
defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86any_Fmsub, X86FmsubRnd>;
defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86any_Fnmadd, X86FnmaddRnd>;
defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86any_Fnmsub, X86FnmsubRnd>;
@@ -6981,7 +6998,7 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode MaskedOp,
}
}
-defm : avx512_scalar_fma_patterns<X86any_Fmadd, X86Fmadd, X86FmaddRnd, "VFMADD",
+defm : avx512_scalar_fma_patterns<any_fma, fma, X86FmaddRnd, "VFMADD",
"SS", X86Movss, v4f32x_info, fp32imm0>;
defm : avx512_scalar_fma_patterns<X86any_Fmsub, X86Fmsub, X86FmsubRnd, "VFMSUB",
"SS", X86Movss, v4f32x_info, fp32imm0>;
@@ -6990,7 +7007,7 @@ defm : avx512_scalar_fma_patterns<X86any_Fnmadd, X86Fnmadd, X86FnmaddRnd, "VFNMA
defm : avx512_scalar_fma_patterns<X86any_Fnmsub, X86Fnmsub, X86FnmsubRnd, "VFNMSUB",
"SS", X86Movss, v4f32x_info, fp32imm0>;
-defm : avx512_scalar_fma_patterns<X86any_Fmadd, X86Fmadd, X86FmaddRnd, "VFMADD",
+defm : avx512_scalar_fma_patterns<any_fma, fma, X86FmaddRnd, "VFMADD",
"SD", X86Movsd, v2f64x_info, fp64imm0>;
defm : avx512_scalar_fma_patterns<X86any_Fmsub, X86Fmsub, X86FmsubRnd, "VFMSUB",
"SD", X86Movsd, v2f64x_info, fp64imm0>;
@@ -7523,7 +7540,7 @@ multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr,
SDNode OpNode, SDNode OpNodeRnd,
X86FoldableSchedWrite sched,
X86VectorVTInfo _src, X86VectorVTInfo _dst> {
- let Predicates = [HasAVX512] in {
+ let Predicates = [HasAVX512], ExeDomain = SSEPackedSingle in {
defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode, sched>,
avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src,
OpNodeRnd, sched>, VEX_W, EVEX_CD8<64, CD8VT1>, XD;
@@ -7534,7 +7551,7 @@ multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr,
SDNode OpNode, SDNode OpNodeSAE,
X86FoldableSchedWrite sched,
X86VectorVTInfo _src, X86VectorVTInfo _dst> {
- let Predicates = [HasAVX512] in {
+ let Predicates = [HasAVX512], ExeDomain = SSEPackedSingle in {
defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode, sched>,
avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeSAE, sched>,
EVEX_CD8<32, CD8VT1>, XS;
@@ -10433,39 +10450,6 @@ defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4", WriteFShuffle256,
defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2", WriteFShuffle256,
avx512vl_i64_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
-let Predicates = [HasAVX512] in {
-// Provide fallback in case the load node that is used in the broadcast
-// patterns above is used by additional users, which prevents the pattern
-// selection.
-def : Pat<(v8f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
- (VSHUFF64X2Zrri (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- 0)>;
-def : Pat<(v8i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
- (VSHUFI64X2Zrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- 0)>;
-
-def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
- (VSHUFF32X4Zrri (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- 0)>;
-def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
- (VSHUFI32X4Zrri (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- 0)>;
-
-def : Pat<(v32i16 (X86SubVBroadcast (v8i16 VR128X:$src))),
- (VSHUFI32X4Zrri (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- 0)>;
-
-def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
- (VSHUFI32X4Zrri (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- 0)>;
-}
-
multiclass avx512_valign<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _>{
// NOTE: EVEX2VEXOverride changed back to Unset for 256-bit at the
@@ -10895,7 +10879,7 @@ multiclass avx512_extract_elt_bw_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
def mr : AVX512Ii8<opc, MRMDestMem, (outs),
(ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(store (_.EltVT (trunc (OpNode (_.VT _.RC:$src1), imm:$src2))),
+ [(store (_.EltVT (trunc (OpNode (_.VT _.RC:$src1), timm:$src2))),
addr:$dst)]>,
EVEX, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecExtractSt]>;
}
@@ -10906,7 +10890,7 @@ multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> {
(ins _.RC:$src1, u8imm:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst,
- (X86pextrb (_.VT _.RC:$src1), imm:$src2))]>,
+ (X86pextrb (_.VT _.RC:$src1), timm:$src2))]>,
EVEX, TAPD, Sched<[WriteVecExtract]>;
defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD;
@@ -10919,7 +10903,7 @@ multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> {
(ins _.RC:$src1, u8imm:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst,
- (X86pextrw (_.VT _.RC:$src1), imm:$src2))]>,
+ (X86pextrw (_.VT _.RC:$src1), timm:$src2))]>,
EVEX, PD, Sched<[WriteVecExtract]>;
let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in
@@ -10959,12 +10943,13 @@ defm VPEXTRDZ : avx512_extract_elt_dq<"vpextrd", v4i32x_info, GR32>;
defm VPEXTRQZ : avx512_extract_elt_dq<"vpextrq", v2i64x_info, GR64>, VEX_W;
multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _, PatFrag LdFrag> {
+ X86VectorVTInfo _, PatFrag LdFrag,
+ SDPatternOperator immoperator> {
def rm : AVX512Ii8<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set _.RC:$dst,
- (_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), imm:$src3)))]>,
+ (_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), immoperator:$src3)))]>,
EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}
@@ -10975,10 +10960,10 @@ multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, GR32orGR64:$src2, u8imm:$src3),
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set _.RC:$dst,
- (OpNode _.RC:$src1, GR32orGR64:$src2, imm:$src3))]>, EVEX_4V,
+ (OpNode _.RC:$src1, GR32orGR64:$src2, timm:$src3))]>, EVEX_4V,
Sched<[WriteVecInsert]>;
- defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag>;
+ defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag, timm>;
}
}
@@ -10993,7 +10978,7 @@ multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr,
EVEX_4V, TAPD, Sched<[WriteVecInsert]>;
defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _,
- _.ScalarLdFrag>, TAPD;
+ _.ScalarLdFrag, imm>, TAPD;
}
}
@@ -11205,17 +11190,6 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, _.RC:$src3, (VPTERNLOG213_imm8 timm:$src4))>;
- // Additional patterns for matching loads in other positions.
- def : Pat<(_.VT (OpNode (bitconvert (_.LdFrag addr:$src3)),
- _.RC:$src2, _.RC:$src1, (i8 timm:$src4))),
- (!cast<Instruction>(Name#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2,
- addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(_.VT (OpNode _.RC:$src1,
- (bitconvert (_.LdFrag addr:$src3)),
- _.RC:$src2, (i8 timm:$src4))),
- (!cast<Instruction>(Name#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2,
- addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>;
-
// Additional patterns for matching zero masking with loads in other
// positions.
def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
@@ -11264,17 +11238,6 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 timm:$src4))>;
- // Additional patterns for matching broadcasts in other positions.
- def : Pat<(_.VT (OpNode (_.BroadcastLdFrag addr:$src3),
- _.RC:$src2, _.RC:$src1, (i8 timm:$src4))),
- (!cast<Instruction>(Name#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2,
- addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(_.VT (OpNode _.RC:$src1,
- (_.BroadcastLdFrag addr:$src3),
- _.RC:$src2, (i8 timm:$src4))),
- (!cast<Instruction>(Name#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2,
- addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>;
-
// Additional patterns for matching zero masking with broadcasts in other
// positions.
def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
@@ -11346,398 +11309,6 @@ defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", SchedWriteVecALU,
defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SchedWriteVecALU,
avx512vl_i64_info>, VEX_W;
-// Patterns to use VPTERNLOG for vXi16/vXi8 vectors.
-let Predicates = [HasVLX] in {
- def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2, VR128X:$src3,
- (i8 timm:$src4))),
- (VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3,
- timm:$src4)>;
- def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2,
- (loadv16i8 addr:$src3), (i8 timm:$src4))),
- (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v16i8 (X86vpternlog (loadv16i8 addr:$src3), VR128X:$src2,
- VR128X:$src1, (i8 timm:$src4))),
- (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v16i8 (X86vpternlog VR128X:$src1, (loadv16i8 addr:$src3),
- VR128X:$src2, (i8 timm:$src4))),
- (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2,
- (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
- (i8 timm:$src4))),
- (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v16i8 (X86vpternlog (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
- VR128X:$src2, VR128X:$src1, (i8 timm:$src4))),
- (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v16i8 (X86vpternlog VR128X:$src1,
- (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
- VR128X:$src2, (i8 timm:$src4))),
- (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2,
- (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
- (i8 timm:$src4))),
- (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v16i8 (X86vpternlog (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
- VR128X:$src2, VR128X:$src1, (i8 timm:$src4))),
- (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v16i8 (X86vpternlog VR128X:$src1,
- (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
- VR128X:$src2, (i8 timm:$src4))),
- (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2, VR128X:$src3,
- (i8 timm:$src4))),
- (VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3,
- timm:$src4)>;
- def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2,
- (loadv8i16 addr:$src3), (i8 timm:$src4))),
- (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v8i16 (X86vpternlog (loadv8i16 addr:$src3), VR128X:$src2,
- VR128X:$src1, (i8 timm:$src4))),
- (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v8i16 (X86vpternlog VR128X:$src1, (loadv8i16 addr:$src3),
- VR128X:$src2, (i8 timm:$src4))),
- (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2,
- (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
- (i8 timm:$src4))),
- (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v8i16 (X86vpternlog (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
- VR128X:$src2, VR128X:$src1, (i8 timm:$src4))),
- (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v8i16 (X86vpternlog VR128X:$src1,
- (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
- VR128X:$src2, (i8 timm:$src4))),
- (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2,
- (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
- (i8 timm:$src4))),
- (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v8i16 (X86vpternlog (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
- VR128X:$src2, VR128X:$src1, (i8 timm:$src4))),
- (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v8i16 (X86vpternlog VR128X:$src1,
- (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
- VR128X:$src2, (i8 timm:$src4))),
- (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v4i32 (X86vpternlog VR128X:$src1, VR128X:$src2,
- (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
- (i8 timm:$src4))),
- (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v4i32 (X86vpternlog (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
- VR128X:$src2, VR128X:$src1, (i8 timm:$src4))),
- (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v4i32 (X86vpternlog VR128X:$src1,
- (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
- VR128X:$src2, (i8 timm:$src4))),
- (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v2i64 (X86vpternlog VR128X:$src1, VR128X:$src2,
- (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
- (i8 timm:$src4))),
- (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v2i64 (X86vpternlog (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
- VR128X:$src2, VR128X:$src1, (i8 timm:$src4))),
- (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v2i64 (X86vpternlog VR128X:$src1,
- (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
- VR128X:$src2, (i8 timm:$src4))),
- (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2, VR256X:$src3,
- (i8 timm:$src4))),
- (VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3,
- timm:$src4)>;
- def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2,
- (loadv32i8 addr:$src3), (i8 timm:$src4))),
- (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v32i8 (X86vpternlog (loadv32i8 addr:$src3), VR256X:$src2,
- VR256X:$src1, (i8 timm:$src4))),
- (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v32i8 (X86vpternlog VR256X:$src1, (loadv32i8 addr:$src3),
- VR256X:$src2, (i8 timm:$src4))),
- (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2,
- (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
- (i8 timm:$src4))),
- (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v32i8 (X86vpternlog (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
- VR256X:$src2, VR256X:$src1, (i8 timm:$src4))),
- (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v32i8 (X86vpternlog VR256X:$src1,
- (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
- VR256X:$src2, (i8 timm:$src4))),
- (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2,
- (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
- (i8 timm:$src4))),
- (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v32i8 (X86vpternlog (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
- VR256X:$src2, VR256X:$src1, (i8 timm:$src4))),
- (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v32i8 (X86vpternlog VR256X:$src1,
- (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
- VR256X:$src2, (i8 timm:$src4))),
- (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2, VR256X:$src3,
- (i8 timm:$src4))),
- (VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3,
- timm:$src4)>;
- def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2,
- (loadv16i16 addr:$src3), (i8 timm:$src4))),
- (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v16i16 (X86vpternlog (loadv16i16 addr:$src3), VR256X:$src2,
- VR256X:$src1, (i8 timm:$src4))),
- (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v16i16 (X86vpternlog VR256X:$src1, (loadv16i16 addr:$src3),
- VR256X:$src2, (i8 timm:$src4))),
- (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2,
- (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
- (i8 timm:$src4))),
- (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v16i16 (X86vpternlog (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
- VR256X:$src2, VR256X:$src1, (i8 timm:$src4))),
- (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v16i16 (X86vpternlog VR256X:$src1,
- (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
- VR256X:$src2, (i8 timm:$src4))),
- (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2,
- (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
- (i8 timm:$src4))),
- (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v16i16 (X86vpternlog (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
- VR256X:$src2, VR256X:$src1, (i8 timm:$src4))),
- (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v16i16 (X86vpternlog VR256X:$src1,
- (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
- VR256X:$src2, (i8 timm:$src4))),
- (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v8i32 (X86vpternlog VR256X:$src1, VR256X:$src2,
- (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
- (i8 timm:$src4))),
- (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v8i32 (X86vpternlog (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
- VR256X:$src2, VR256X:$src1, (i8 timm:$src4))),
- (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v8i32 (X86vpternlog VR256X:$src1,
- (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
- VR256X:$src2, (i8 timm:$src4))),
- (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v4i64 (X86vpternlog VR256X:$src1, VR256X:$src2,
- (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
- (i8 timm:$src4))),
- (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v4i64 (X86vpternlog (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
- VR256X:$src2, VR256X:$src1, (i8 timm:$src4))),
- (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v4i64 (X86vpternlog VR256X:$src1,
- (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
- VR256X:$src2, (i8 timm:$src4))),
- (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-}
-
-let Predicates = [HasAVX512] in {
- def : Pat<(v64i8 (X86vpternlog VR512:$src1, VR512:$src2, VR512:$src3,
- (i8 timm:$src4))),
- (VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3,
- timm:$src4)>;
- def : Pat<(v64i8 (X86vpternlog VR512:$src1, VR512:$src2,
- (loadv64i8 addr:$src3), (i8 timm:$src4))),
- (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v64i8 (X86vpternlog (loadv64i8 addr:$src3), VR512:$src2,
- VR512:$src1, (i8 timm:$src4))),
- (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v64i8 (X86vpternlog VR512:$src1, (loadv64i8 addr:$src3),
- VR512:$src2, (i8 timm:$src4))),
- (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v64i8 (X86vpternlog VR512:$src1, VR512:$src2,
- (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
- (i8 timm:$src4))),
- (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v64i8 (X86vpternlog (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
- VR512:$src2, VR512:$src1, (i8 timm:$src4))),
- (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v64i8 (X86vpternlog VR512:$src1,
- (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
- VR512:$src2, (i8 timm:$src4))),
- (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v64i8 (X86vpternlog VR512:$src1, VR512:$src2,
- (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
- (i8 timm:$src4))),
- (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v64i8 (X86vpternlog (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
- VR512:$src2, VR512:$src1, (i8 timm:$src4))),
- (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v64i8 (X86vpternlog VR512:$src1,
- (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
- VR512:$src2, (i8 timm:$src4))),
- (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2, VR512:$src3,
- (i8 timm:$src4))),
- (VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3,
- timm:$src4)>;
- def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2,
- (loadv32i16 addr:$src3), (i8 timm:$src4))),
- (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v32i16 (X86vpternlog (loadv32i16 addr:$src3), VR512:$src2,
- VR512:$src1, (i8 timm:$src4))),
- (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v32i16 (X86vpternlog VR512:$src1, (loadv32i16 addr:$src3),
- VR512:$src2, (i8 timm:$src4))),
- (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2,
- (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
- (i8 timm:$src4))),
- (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v32i16 (X86vpternlog (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
- VR512:$src2, VR512:$src1, (i8 timm:$src4))),
- (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v32i16 (X86vpternlog VR512:$src1,
- (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
- VR512:$src2, (i8 timm:$src4))),
- (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2,
- (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
- (i8 timm:$src4))),
- (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v32i16 (X86vpternlog (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
- VR512:$src2, VR512:$src1, (i8 timm:$src4))),
- (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v32i16 (X86vpternlog VR512:$src1,
- (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
- VR512:$src2, (i8 timm:$src4))),
- (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2,
- (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
- (i8 timm:$src4))),
- (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v32i16 (X86vpternlog (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
- VR512:$src2, VR512:$src1, (i8 timm:$src4))),
- (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v32i16 (X86vpternlog VR512:$src1,
- (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
- VR512:$src2, (i8 timm:$src4))),
- (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v16i32 (X86vpternlog VR512:$src1, VR512:$src2,
- (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
- (i8 timm:$src4))),
- (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v16i32 (X86vpternlog (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
- VR512:$src2, VR512:$src1, (i8 timm:$src4))),
- (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v16i32 (X86vpternlog VR512:$src1,
- (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
- VR512:$src2, (i8 timm:$src4))),
- (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-
- def : Pat<(v8i64 (X86vpternlog VR512:$src1, VR512:$src2,
- (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
- (i8 timm:$src4))),
- (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
- timm:$src4)>;
- def : Pat<(v8i64 (X86vpternlog (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
- VR512:$src2, VR512:$src1, (i8 timm:$src4))),
- (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
- (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(v8i64 (X86vpternlog VR512:$src1,
- (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
- VR512:$src2, (i8 timm:$src4))),
- (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
- (VPTERNLOG132_imm8 timm:$src4))>;
-}
-
// Patterns to implement vnot using vpternlog instead of creating all ones
// using pcmpeq or vpternlog and then xoring with that. The value 15 is chosen
// so that the result is only dependent on src0. But we use the same source
@@ -12281,11 +11852,6 @@ defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SchedWriteVecIMul
defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SchedWriteVecIMul, 1>;
defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SchedWriteVecIMul, 1>;
-def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs),
- (X86vpmaddwd node:$lhs, node:$rhs), [{
- return N->hasOneUse();
-}]>;
-
// Patterns to match VPDPWSSD from existing instructions/intrinsics.
let Predicates = [HasVNNI] in {
def : Pat<(v16i32 (add VR512:$src1,
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td
index f7f22285bd15..e83e1e74ff52 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -1182,6 +1182,15 @@ defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m,
X86sub_flag, sub, 0, 1, 0>;
}
+// Version of XOR8rr that uses GR8_NOREX. This is used by the handling of
+// __builtin_parity where the last step xors an h-register with an l-register.
+let isCodeGenOnly = 1, hasSideEffects = 0, Constraints = "$src1 = $dst",
+ Defs = [EFLAGS], isCommutable = 1 in
+def XOR8rr_NOREX : I<0x30, MRMDestReg, (outs GR8_NOREX:$dst),
+ (ins GR8_NOREX:$src1, GR8_NOREX:$src2),
+ "xor{b}\t{$src2, $dst|$dst, $src2}", []>,
+ Sched<[WriteALU]>;
+
// Arithmetic.
defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag,
1, 0>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td
index 4df93fb2ed60..7a2facf226d8 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -73,25 +73,32 @@ let usesCustomInserter = 1, Defs = [EFLAGS] in {
def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
(outs),
(ins GR8:$al,
- i64imm:$regsavefi, i64imm:$offset,
+ i32imm:$regsavefi, i32imm:$offset,
variable_ops),
"#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset",
[(X86vastart_save_xmm_regs GR8:$al,
- imm:$regsavefi,
- imm:$offset),
+ timm:$regsavefi,
+ timm:$offset),
(implicit EFLAGS)]>;
-// The VAARG_64 pseudo-instruction takes the address of the va_list,
-// and places the address of the next argument into a register.
-let Defs = [EFLAGS] in
+// The VAARG_64 and VAARG_X32 pseudo-instructions take the address of the
+// va_list, and place the address of the next argument into a register.
+let Defs = [EFLAGS] in {
def VAARG_64 : I<0, Pseudo,
(outs GR64:$dst),
(ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align),
"#VAARG_64 $dst, $ap, $size, $mode, $align",
[(set GR64:$dst,
- (X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)),
- (implicit EFLAGS)]>;
-
+ (X86vaarg64 addr:$ap, timm:$size, timm:$mode, timm:$align)),
+ (implicit EFLAGS)]>, Requires<[In64BitMode, IsLP64]>;
+def VAARG_X32 : I<0, Pseudo,
+ (outs GR32:$dst),
+ (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align),
+ "#VAARG_X32 $dst, $ap, $size, $mode, $align",
+ [(set GR32:$dst,
+ (X86vaargx32 addr:$ap, timm:$size, timm:$mode, timm:$align)),
+ (implicit EFLAGS)]>, Requires<[In64BitMode, NotLP64]>;
+}
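As a hedged aside (not part of the change itself): the VAARG pseudos above implement the LLVM IR va_arg instruction, VAARG_64 for LP64 targets and VAARG_X32 for the x32 ABI (64-bit mode with 32-bit pointers). Variadic C++ code like the following is the usual source of that IR, though Clang often expands va_arg inline instead of emitting the IR instruction:

#include <cstdarg>

double sum_doubles(int n, ...) {
  va_list ap;
  va_start(ap, n);
  double s = 0.0;
  for (int i = 0; i < n; ++i)
    s += va_arg(ap, double); // may lower through VAARG_64 / VAARG_X32
  va_end(ap);
  return s;
}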
// When using segmented stacks these are lowered into instructions which first
// check if the current stacklet has enough free memory. If it does, memory is
@@ -467,11 +474,19 @@ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
"# TLS_addr64",
[(X86tlsaddr tls64addr:$sym)]>,
- Requires<[In64BitMode]>;
+ Requires<[In64BitMode, IsLP64]>;
def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
"# TLS_base_addr64",
[(X86tlsbaseaddr tls64baseaddr:$sym)]>,
- Requires<[In64BitMode]>;
+ Requires<[In64BitMode, IsLP64]>;
+def TLS_addrX32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+ "# TLS_addrX32",
+ [(X86tlsaddr tls32addr:$sym)]>,
+ Requires<[In64BitMode, NotLP64]>;
+def TLS_base_addrX32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+ "# TLS_base_addrX32",
+ [(X86tlsbaseaddr tls32baseaddr:$sym)]>,
+ Requires<[In64BitMode, NotLP64]>;
}
// Darwin TLS Support
@@ -809,15 +824,6 @@ let Predicates = [UseIncDec] in {
}
// Atomic compare and swap.
-multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
- SDPatternOperator frag, X86MemOperand x86memop> {
-let isCodeGenOnly = 1, usesCustomInserter = 1 in {
- def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr),
- !strconcat(mnemonic, "\t$ptr"),
- [(frag addr:$ptr)]>, TB, LOCK;
-}
-}
-
multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
string mnemonic, SDPatternOperator frag> {
let isCodeGenOnly = 1, SchedRW = [WriteCMPXCHGRMW] in {
@@ -841,8 +847,19 @@ let isCodeGenOnly = 1, SchedRW = [WriteCMPXCHGRMW] in {
}
let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
- Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW] in {
-defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", X86cas8, i64mem>;
+ Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW],
+ isCodeGenOnly = 1, usesCustomInserter = 1 in {
+def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr),
+ "cmpxchg8b\t$ptr",
+ [(X86cas8 addr:$ptr)]>, TB, LOCK;
+}
+
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
+ Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
+ isCodeGenOnly = 1, mayLoad = 1, mayStore = 1, hasSideEffects = 0 in {
+def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr),
+ "cmpxchg16b\t$ptr",
+ []>, TB, LOCK;
}
// This pseudo must be used when the frame uses RBX as
@@ -852,50 +869,64 @@ defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", X86cas8, i64mem>;
// RBX that will happen when setting the arguments for the instruction.
//
// Unlike the actual related instruction, we mark that this one
-// defines EBX (instead of using EBX).
+// defines RBX (instead of using RBX).
// The rationale is that we will define RBX during the expansion of
-// the pseudo. The argument feeding EBX is ebx_input.
+// the pseudo. The argument feeding RBX is rbx_input.
//
-// The additional argument, $ebx_save, is a temporary register used to
+// The additional argument, $rbx_save, is a temporary register used to
// save the value of RBX across the actual instruction.
//
-// To make sure the register assigned to $ebx_save does not interfere with
+// To make sure the register assigned to $rbx_save does not interfere with
// the definition of the actual instruction, we use a definition $dst which
// is tied to $rbx_save. That way, the live-range of $rbx_save spans across
// the instruction and we are sure we will have a valid register to restore
// the value of RBX.
-let Defs = [EAX, EDX, EBX, EFLAGS], Uses = [EAX, ECX, EDX],
- Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW],
- isCodeGenOnly = 1, isPseudo = 1, Constraints = "$ebx_save = $dst",
- usesCustomInserter = 1 in {
-def LCMPXCHG8B_SAVE_EBX :
- I<0, Pseudo, (outs GR32:$dst),
- (ins i64mem:$ptr, GR32:$ebx_input, GR32:$ebx_save),
- !strconcat("cmpxchg8b", "\t$ptr"),
- [(set GR32:$dst, (X86cas8save_ebx addr:$ptr, GR32:$ebx_input,
- GR32:$ebx_save))]>;
+let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX],
+ Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
+ isCodeGenOnly = 1, isPseudo = 1,
+ mayLoad = 1, mayStore = 1, hasSideEffects = 0,
+ Constraints = "$rbx_save = $dst" in {
+def LCMPXCHG16B_SAVE_RBX :
+ I<0, Pseudo, (outs GR64:$dst),
+ (ins i128mem:$ptr, GR64:$rbx_input, GR64:$rbx_save), "", []>;
}
+// Pseudo instruction that doesn't read/write RBX. Will be turned into either
+// LCMPXCHG16B_SAVE_RBX or LCMPXCHG16B via a custom inserter.
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RCX, RDX],
+ Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
+ isCodeGenOnly = 1, isPseudo = 1,
+ mayLoad = 1, mayStore = 1, hasSideEffects = 0,
+ usesCustomInserter = 1 in {
+def LCMPXCHG16B_NO_RBX :
+ I<0, Pseudo, (outs), (ins i128mem:$ptr, GR64:$rbx_input), "",
+ [(X86cas16 addr:$ptr, GR64:$rbx_input)]>;
+}
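A hedged C++ sketch of source that reaches the X86cas16 node selected by LCMPXCHG16B_NO_RBX above; it assumes GCC/Clang-style unsigned __int128 and the __sync builtins, and is illustrative only:

// 16-byte compare-and-swap; built with -mcx16 this is expected to select
// cmpxchg16b through the pseudos above (without -mcx16 it may instead call
// a libatomic helper).
unsigned __int128 cas128(unsigned __int128 *p,
                         unsigned __int128 expected,
                         unsigned __int128 desired) {
  return __sync_val_compare_and_swap(p, expected, desired);
}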
-let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
- Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW] in {
-defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
- X86cas16, i128mem>, REX_W;
+// This pseudo must be used when the frame uses RBX/EBX as
+// the base pointer.
+// cf. the comment for LCMPXCHG16B_SAVE_RBX.
+let Defs = [EBX], Uses = [ECX, EAX],
+ Predicates = [HasMWAITX], SchedRW = [WriteSystem],
+ isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst" in {
+def MWAITX_SAVE_RBX :
+ I<0, Pseudo, (outs GR64:$dst),
+ (ins GR32:$ebx_input, GR64:$rbx_save),
+ "mwaitx",
+ []>;
}
-// Same as LCMPXCHG8B_SAVE_RBX but for the 16 Bytes variant.
-let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX],
- Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
- isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst",
+// Pseudo mwaitx instruction to use for custom insertion.
+let Predicates = [HasMWAITX], SchedRW = [WriteSystem],
+ isCodeGenOnly = 1, isPseudo = 1,
usesCustomInserter = 1 in {
-def LCMPXCHG16B_SAVE_RBX :
- I<0, Pseudo, (outs GR64:$dst),
- (ins i128mem:$ptr, GR64:$rbx_input, GR64:$rbx_save),
- !strconcat("cmpxchg16b", "\t$ptr"),
- [(set GR64:$dst, (X86cas16save_rbx addr:$ptr, GR64:$rbx_input,
- GR64:$rbx_save))]>;
+def MWAITX :
+ I<0, Pseudo, (outs), (ins GR32:$ecx, GR32:$eax, GR32:$ebx),
+ "mwaitx",
+ [(int_x86_mwaitx GR32:$ecx, GR32:$eax, GR32:$ebx)]>;
}
+
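A hedged sketch of how the int_x86_mwaitx intrinsic matched by the MWAITX pseudo above is typically reached from C++: via _mm_monitorx/_mm_mwaitx from <x86intrin.h> with -mmwaitx. Treat the header and argument details as assumptions rather than something stated by this change:

#include <x86intrin.h>

void spin_wait(volatile int *flag) {
  while (*flag == 0) {
    _mm_monitorx((void *)flag, 0, 0); // arm the monitor on the flag's cache line
    if (*flag == 0)
      _mm_mwaitx(0, 0, 0);            // wait for a store to the monitored line
  }
}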
defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg", X86cas>;
// Atomic exchange and add
@@ -1182,49 +1213,49 @@ def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
return true;
}]>;
-def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
- (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
+ (TCRETURNri ptr_rc_tailcall:$dst, timm:$off)>,
Requires<[Not64BitMode, NotUseIndirectThunkCalls]>;
// FIXME: This is disabled for 32-bit PIC mode because the global base
// register which is part of the address mode may be assigned a
// callee-saved register.
-def : Pat<(X86tcret (load addr:$dst), imm:$off),
- (TCRETURNmi addr:$dst, imm:$off)>,
+def : Pat<(X86tcret (load addr:$dst), timm:$off),
+ (TCRETURNmi addr:$dst, timm:$off)>,
Requires<[Not64BitMode, IsNotPIC, NotUseIndirectThunkCalls]>;
-def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
- (TCRETURNdi tglobaladdr:$dst, imm:$off)>,
+def : Pat<(X86tcret (i32 tglobaladdr:$dst), timm:$off),
+ (TCRETURNdi tglobaladdr:$dst, timm:$off)>,
Requires<[NotLP64]>;
-def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
- (TCRETURNdi texternalsym:$dst, imm:$off)>,
+def : Pat<(X86tcret (i32 texternalsym:$dst), timm:$off),
+ (TCRETURNdi texternalsym:$dst, timm:$off)>,
Requires<[NotLP64]>;
-def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
- (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>,
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
+ (TCRETURNri64 ptr_rc_tailcall:$dst, timm:$off)>,
Requires<[In64BitMode, NotUseIndirectThunkCalls]>;
// Don't fold loads into X86tcret requiring more than 6 regs.
// There wouldn't be enough scratch registers for base+index.
-def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off),
- (TCRETURNmi64 addr:$dst, imm:$off)>,
+def : Pat<(X86tcret_6regs (load addr:$dst), timm:$off),
+ (TCRETURNmi64 addr:$dst, timm:$off)>,
Requires<[In64BitMode, NotUseIndirectThunkCalls]>;
-def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
- (INDIRECT_THUNK_TCRETURN64 ptr_rc_tailcall:$dst, imm:$off)>,
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
+ (INDIRECT_THUNK_TCRETURN64 ptr_rc_tailcall:$dst, timm:$off)>,
Requires<[In64BitMode, UseIndirectThunkCalls]>;
-def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
- (INDIRECT_THUNK_TCRETURN32 ptr_rc_tailcall:$dst, imm:$off)>,
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
+ (INDIRECT_THUNK_TCRETURN32 ptr_rc_tailcall:$dst, timm:$off)>,
Requires<[Not64BitMode, UseIndirectThunkCalls]>;
-def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
- (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>,
+def : Pat<(X86tcret (i64 tglobaladdr:$dst), timm:$off),
+ (TCRETURNdi64 tglobaladdr:$dst, timm:$off)>,
Requires<[IsLP64]>;
-def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off),
- (TCRETURNdi64 texternalsym:$dst, imm:$off)>,
+def : Pat<(X86tcret (i64 texternalsym:$dst), timm:$off),
+ (TCRETURNdi64 texternalsym:$dst, timm:$off)>,
Requires<[IsLP64]>;
// Normal calls, with various flavors of addresses.
@@ -1698,6 +1729,16 @@ def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
(EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>,
Requires<[In64BitMode]>;
+// Special pattern to catch the last step of __builtin_parity handling. Our
+// goal is to use an xor of an h-register with the corresponding l-register.
+// The above patterns would handle this on non-64-bit targets, but for 64-bit
+// we need to be more careful. We're using a NOREX instruction here in case
+// register allocation fails to keep the two registers together. So we need to
+// make sure we can't accidentally mix R8-R15 with an h-register.
+def : Pat<(X86xor_flag (i8 (trunc GR32:$src)),
+ (i8 (trunc (srl_su GR32:$src, (i8 8))))),
+ (XOR8rr_NOREX (EXTRACT_SUBREG GR32:$src, sub_8bit),
+ (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;
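A hedged illustration of the C-level construct this pattern targets; __builtin_parity's final reduction is the (trunc x) ^ (trunc (srl x, 8)) shape matched above:

int parity_of(unsigned x) {
  // The last step typically becomes an 8-bit xor of the high and low byte of
  // the partially reduced value, i.e. an h-register with an l-register, which
  // is why the NOREX xor is needed on 64-bit targets.
  return __builtin_parity(x);
}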
// (shl x, 1) ==> (add x, x)
// Note that if x is undef (immediate or otherwise), we could theoretically
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA.td
index 4dbd6bb8cd7e..f9be3a783279 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA.td
@@ -123,7 +123,7 @@ multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
// Fused Multiply-Add
let ExeDomain = SSEPackedSingle in {
defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", "PS",
- loadv4f32, loadv8f32, X86any_Fmadd, v4f32, v8f32,
+ loadv4f32, loadv8f32, any_fma, v4f32, v8f32,
SchedWriteFMA>;
defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS",
loadv4f32, loadv8f32, X86any_Fmsub, v4f32, v8f32,
@@ -138,7 +138,7 @@ let ExeDomain = SSEPackedSingle in {
let ExeDomain = SSEPackedDouble in {
defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", "PD",
- loadv2f64, loadv4f64, X86any_Fmadd, v2f64,
+ loadv2f64, loadv4f64, any_fma, v2f64,
v4f64, SchedWriteFMA>, VEX_W;
defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", "PD",
loadv2f64, loadv4f64, X86any_Fmsub, v2f64,
@@ -319,7 +319,7 @@ multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
VR128, sdmem, sched>, VEX_W;
}
-defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86any_Fmadd,
+defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", any_fma,
SchedWriteFMA.Scl>, VEX_LIG;
defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86any_Fmsub,
SchedWriteFMA.Scl>, VEX_LIG;
@@ -372,12 +372,12 @@ multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix,
}
}
-defm : scalar_fma_patterns<X86any_Fmadd, "VFMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<any_fma, "VFMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
defm : scalar_fma_patterns<X86any_Fmsub, "VFMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
defm : scalar_fma_patterns<X86any_Fnmadd, "VFNMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
defm : scalar_fma_patterns<X86any_Fnmsub, "VFNMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
-defm : scalar_fma_patterns<X86any_Fmadd, "VFMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<any_fma, "VFMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
defm : scalar_fma_patterns<X86any_Fmsub, "VFMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
defm : scalar_fma_patterns<X86any_Fnmadd, "VFNMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
defm : scalar_fma_patterns<X86any_Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
@@ -538,7 +538,7 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
let ExeDomain = SSEPackedSingle in {
// Scalar Instructions
- defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86any_Fmadd, loadf32,
+ defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, any_fma, loadf32,
SchedWriteFMA.Scl>,
fma4s_int<0x6A, "vfmaddss", ssmem, v4f32,
SchedWriteFMA.Scl>;
@@ -555,7 +555,7 @@ let ExeDomain = SSEPackedSingle in {
fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32,
SchedWriteFMA.Scl>;
// Packed Instructions
- defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86any_Fmadd, v4f32, v8f32,
+ defm VFMADDPS4 : fma4p<0x68, "vfmaddps", any_fma, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86any_Fmsub, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
@@ -571,7 +571,7 @@ let ExeDomain = SSEPackedSingle in {
let ExeDomain = SSEPackedDouble in {
// Scalar Instructions
- defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86any_Fmadd, loadf64,
+ defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, any_fma, loadf64,
SchedWriteFMA.Scl>,
fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64,
SchedWriteFMA.Scl>;
@@ -588,7 +588,7 @@ let ExeDomain = SSEPackedDouble in {
fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64,
SchedWriteFMA.Scl>;
// Packed Instructions
- defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86any_Fmadd, v2f64, v4f64,
+ defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", any_fma, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86any_Fmsub, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
@@ -629,12 +629,12 @@ multiclass scalar_fma4_patterns<SDNode Op, string Name,
}
}
-defm : scalar_fma4_patterns<X86any_Fmadd, "VFMADDSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<any_fma, "VFMADDSS4", v4f32, f32, FR32, loadf32>;
defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSS4", v4f32, f32, FR32, loadf32>;
defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSS4", v4f32, f32, FR32, loadf32>;
defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSS4", v4f32, f32, FR32, loadf32>;
-defm : scalar_fma4_patterns<X86any_Fmadd, "VFMADDSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<any_fma, "VFMADDSD4", v2f64, f64, FR64, loadf64>;
defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSD4", v2f64, f64, FR64, loadf64>;
defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSD4", v2f64, f64, FR64, loadf64>;
defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSD4", v2f64, f64, FR64, loadf64>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFPStack.td
index 67dcb8d00ea5..961b4e590365 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFPStack.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -392,13 +392,13 @@ def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">;
let SchedRW = [WriteMicrocoded] in {
let Defs = [FPSW, FPCW], mayLoad = 1 in {
-def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">;
-def FRSTORm : FPI<0xDD, MRM4m, (outs), (ins f32mem:$dst), "frstor\t$dst">;
+def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins anymem:$src), "fldenv\t$src">;
+def FRSTORm : FPI<0xDD, MRM4m, (outs), (ins anymem:$src), "frstor\t$src">;
}
let Defs = [FPSW, FPCW], Uses = [FPSW, FPCW], mayStore = 1 in {
-def FSTENVm : FPI<0xD9, MRM6m, (outs), (ins f32mem:$dst), "fnstenv\t$dst">;
-def FSAVEm : FPI<0xDD, MRM6m, (outs), (ins f32mem:$dst), "fnsave\t$dst">;
+def FSTENVm : FPI<0xD9, MRM6m, (outs), (ins anymem:$dst), "fnstenv\t$dst">;
+def FSAVEm : FPI<0xDD, MRM6m, (outs), (ins anymem:$dst), "fnsave\t$dst">;
}
let Uses = [FPSW], mayStore = 1 in
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index e16382e956c5..17fe7f0bd310 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -300,11 +300,13 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
{ X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE },
{ X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE },
{ X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE },
+ { X86::MOV64toSDrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE },
{ X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE },
{ X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
{ X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVDI2SSrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE },
{ X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE },
@@ -357,6 +359,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
{ X86::VEXTRACTI64x4Zrr, X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE },
{ X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE },
{ X86::VEXTRACTPSrr, X86::VEXTRACTPSmr, TB_FOLDED_STORE },
+ { X86::VMOV64toSDZrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOV64toSDrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
@@ -367,6 +371,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
{ X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
{ X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDI2SSZrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOVDI2SSrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
@@ -3742,18 +3748,26 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VPCONFLICTQZ128rrk, X86::VPCONFLICTQZ128rmk, 0 },
{ X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmk, 0 },
{ X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmk, 0 },
+ { X86::VPDPBUSDSYrr, X86::VPDPBUSDSYrm, 0 },
{ X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128m, 0 },
{ X86::VPDPBUSDSZ256r, X86::VPDPBUSDSZ256m, 0 },
{ X86::VPDPBUSDSZr, X86::VPDPBUSDSZm, 0 },
+ { X86::VPDPBUSDSrr, X86::VPDPBUSDSrm, 0 },
+ { X86::VPDPBUSDYrr, X86::VPDPBUSDYrm, 0 },
{ X86::VPDPBUSDZ128r, X86::VPDPBUSDZ128m, 0 },
{ X86::VPDPBUSDZ256r, X86::VPDPBUSDZ256m, 0 },
{ X86::VPDPBUSDZr, X86::VPDPBUSDZm, 0 },
+ { X86::VPDPBUSDrr, X86::VPDPBUSDrm, 0 },
+ { X86::VPDPWSSDSYrr, X86::VPDPWSSDSYrm, 0 },
{ X86::VPDPWSSDSZ128r, X86::VPDPWSSDSZ128m, 0 },
{ X86::VPDPWSSDSZ256r, X86::VPDPWSSDSZ256m, 0 },
{ X86::VPDPWSSDSZr, X86::VPDPWSSDSZm, 0 },
+ { X86::VPDPWSSDSrr, X86::VPDPWSSDSrm, 0 },
+ { X86::VPDPWSSDYrr, X86::VPDPWSSDYrm, 0 },
{ X86::VPDPWSSDZ128r, X86::VPDPWSSDZ128m, 0 },
{ X86::VPDPWSSDZ256r, X86::VPDPWSSDZ256m, 0 },
{ X86::VPDPWSSDZr, X86::VPDPWSSDZm, 0 },
+ { X86::VPDPWSSDrr, X86::VPDPWSSDrm, 0 },
{ X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 },
{ X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 },
{ X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 },
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td
index d7752e656b55..686b19fc0a6c 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td
@@ -216,6 +216,7 @@ class T8XS : T8 { Prefix OpPrefix = XS; }
class TAPS : TA { Prefix OpPrefix = PS; }
class TAPD : TA { Prefix OpPrefix = PD; }
class TAXD : TA { Prefix OpPrefix = XD; }
+class TAXS : TA { Prefix OpPrefix = XS; }
class VEX { Encoding OpEnc = EncVEX; }
class VEX_W { bit HasVEX_W = 1; }
class VEX_WIG { bit IgnoresVEX_W = 1; }
@@ -263,6 +264,9 @@ class NotMemoryFoldable { bit isMemoryFoldable = 0; }
// Prevent EVEX->VEX conversion from considering this instruction.
class NotEVEX2VEXConvertible { bit notEVEX2VEXConvertible = 1; }
+// Force the instruction to use VEX encoding.
+class ExplicitVEXPrefix { bit ExplicitVEXPrefix = 1; }
+
class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
string AsmStr, Domain d = GenericDomain>
: Instruction {
@@ -347,6 +351,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
bit isMemoryFoldable = 1; // Is it allowed to memory fold/unfold this instruction?
bit notEVEX2VEXConvertible = 0; // Prevent EVEX->VEX conversion.
+ bit ExplicitVEXPrefix = 0; // Force the instruction to use VEX encoding.
// TSFlags layout should be kept in sync with X86BaseInfo.h.
let TSFlags{6-0} = FormBits;
@@ -375,6 +380,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
let TSFlags{51-45} = CD8_Scale;
let TSFlags{52} = hasEVEX_RC;
let TSFlags{53} = hasNoTrackPrefix;
+ let TSFlags{54} = ExplicitVEXPrefix;
}
class PseudoI<dag oops, dag iops, list<dag> pattern>
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index f3f7d17d9b3c..777c5a158b4c 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -87,16 +87,16 @@ def X86multishift : SDNode<"X86ISD::MULTISHIFT",
SDTCisSameAs<1,2>]>>;
def X86pextrb : SDNode<"X86ISD::PEXTRB",
SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v16i8>,
- SDTCisPtrTy<2>]>>;
+ SDTCisVT<2, i8>]>>;
def X86pextrw : SDNode<"X86ISD::PEXTRW",
SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v8i16>,
- SDTCisPtrTy<2>]>>;
+ SDTCisVT<2, i8>]>>;
def X86pinsrb : SDNode<"X86ISD::PINSRB",
SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
- SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+ SDTCisVT<2, i32>, SDTCisVT<3, i8>]>>;
def X86pinsrw : SDNode<"X86ISD::PINSRW",
SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
- SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+ SDTCisVT<2, i32>, SDTCisVT<3, i8>]>>;
def X86insertps : SDNode<"X86ISD::INSERTPS",
SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
SDTCisVT<2, v4f32>, SDTCisVT<3, i8>]>>;
@@ -109,6 +109,8 @@ def X86vextractst : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def X86VBroadcastld : SDNode<"X86ISD::VBROADCAST_LOAD", SDTLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86SubVBroadcastld : SDNode<"X86ISD::SUBV_BROADCAST_LOAD", SDTLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisInt<0>, SDTCisInt<1>,
@@ -207,16 +209,21 @@ def X86CmpMaskCC :
SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
SDTCisVec<1>, SDTCisSameAs<2, 1>,
SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>]>;
+def X86MaskCmpMaskCC :
+ SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
+ SDTCisVec<1>, SDTCisSameAs<2, 1>,
+ SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>, SDTCisSameAs<4, 0>]>;
def X86CmpMaskCCScalar :
SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisFP<1>, SDTCisSameAs<1, 2>,
SDTCisVT<3, i8>]>;
def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>;
+def X86cmpmm : SDNode<"X86ISD::CMPMM", X86MaskCmpMaskCC>;
def X86strict_cmpm : SDNode<"X86ISD::STRICT_CMPM", X86CmpMaskCC, [SDNPHasChain]>;
def X86any_cmpm : PatFrags<(ops node:$src1, node:$src2, node:$src3),
[(X86strict_cmpm node:$src1, node:$src2, node:$src3),
(X86cmpm node:$src1, node:$src2, node:$src3)]>;
-def X86cmpmSAE : SDNode<"X86ISD::CMPM_SAE", X86CmpMaskCC>;
+def X86cmpmmSAE : SDNode<"X86ISD::CMPMM_SAE", X86MaskCmpMaskCC>;
def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>;
def X86cmpmsSAE : SDNode<"X86ISD::FSETCCM_SAE", X86CmpMaskCCScalar>;
@@ -488,10 +495,6 @@ def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS",
SDTypeProfile<1, 2, [SDTCisVT<0, v1i1>,
SDTCisFP<1>, SDTCisVT<2, i32>]>,[]>;
-def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST",
- SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisSubVecOfVec<1, 0>]>, []>;
-
def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
@@ -533,11 +536,6 @@ def X86fgetexpSAE : SDNode<"X86ISD::FGETEXP_SAE", SDTFPUnaryOp>;
def X86fgetexps : SDNode<"X86ISD::FGETEXPS", SDTFPBinOp>;
def X86fgetexpSAEs : SDNode<"X86ISD::FGETEXPS_SAE", SDTFPBinOp>;
-def X86Fmadd : SDNode<"ISD::FMA", SDTFPTernaryOp, [SDNPCommutative]>;
-def X86strict_Fmadd : SDNode<"ISD::STRICT_FMA", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>;
-def X86any_Fmadd : PatFrags<(ops node:$src1, node:$src2, node:$src3),
- [(X86strict_Fmadd node:$src1, node:$src2, node:$src3),
- (X86Fmadd node:$src1, node:$src2, node:$src3)]>;
def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp, [SDNPCommutative]>;
def X86strict_Fnmadd : SDNode<"X86ISD::STRICT_FNMADD", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>;
def X86any_Fnmadd : PatFrags<(ops node:$src1, node:$src2, node:$src3),
@@ -963,6 +961,16 @@ def X86VBroadcastld64 : PatFrag<(ops node:$src),
return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
}]>;
+def X86SubVBroadcastld128 : PatFrag<(ops node:$src),
+ (X86SubVBroadcastld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 16;
+}]>;
+
+def X86SubVBroadcastld256 : PatFrag<(ops node:$src),
+ (X86SubVBroadcastld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 32;
+}]>;
+
// Scalar SSE intrinsic fragments to match several different types of loads.
// Used by scalar SSE intrinsic instructions which have 128 bit types, but
// only load a single element.
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp
index 42c111173570..d9bab14f0c08 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -28,9 +28,9 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
@@ -947,9 +947,9 @@ unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
}
/// Return true if register is PIC base, i.e. defined by X86::MOVPC32r.
-static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
+static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
// Don't waste compile time scanning use-def chains of physregs.
- if (!Register::isVirtualRegister(BaseReg))
+ if (!BaseReg.isVirtual())
return false;
bool isPICBase = false;
for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg),
@@ -1127,7 +1127,8 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
const MachineInstr &Orig,
const TargetRegisterInfo &TRI) const {
bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
- if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) {
+ if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) !=
+ MachineBasicBlock::LQR_Dead) {
// The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
// effects.
int Value;
@@ -1205,8 +1206,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
isKill = Src.isKill();
assert(!Src.isUndef() && "Undef op doesn't need optimization");
- if (Register::isVirtualRegister(NewSrc) &&
- !MF.getRegInfo().constrainRegClass(NewSrc, RC))
+ if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC))
return false;
return true;
@@ -1214,7 +1214,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
// This is for an LEA64_32r and incoming registers are 32-bit. One way or
// another we need to add 64-bit registers to the final MI.
- if (Register::isPhysicalRegister(SrcReg)) {
+ if (SrcReg.isPhysical()) {
ImplicitOp = Src;
ImplicitOp.setImplicit();
@@ -1409,9 +1409,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
// LEA can't handle RSP.
- if (Register::isVirtualRegister(Src.getReg()) &&
- !MF.getRegInfo().constrainRegClass(Src.getReg(),
- &X86::GR64_NOSPRegClass))
+ if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
+ Src.getReg(), &X86::GR64_NOSPRegClass))
return nullptr;
NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
@@ -2567,6 +2566,10 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
case X86::VPTERNLOGQZ256rmbikz:
case X86::VPTERNLOGQZrmbikz:
return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ case X86::VPDPWSSDYrr:
+ case X86::VPDPWSSDrr:
+ case X86::VPDPWSSDSYrr:
+ case X86::VPDPWSSDSrr:
case X86::VPDPWSSDZ128r:
case X86::VPDPWSSDZ128rk:
case X86::VPDPWSSDZ128rkz:
@@ -3527,11 +3530,10 @@ X86InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
return None;
}
-static unsigned getLoadStoreRegOpcode(unsigned Reg,
+static unsigned getLoadStoreRegOpcode(Register Reg,
const TargetRegisterClass *RC,
- bool isStackAligned,
- const X86Subtarget &STI,
- bool load) {
+ bool IsStackAligned,
+ const X86Subtarget &STI, bool load) {
bool HasAVX = STI.hasAVX();
bool HasAVX512 = STI.hasAVX512();
bool HasVLX = STI.hasVLX();
@@ -3604,7 +3606,7 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
case 16: {
if (X86::VR128XRegClass.hasSubClassEq(RC)) {
// If stack is realigned we can use aligned stores.
- if (isStackAligned)
+ if (IsStackAligned)
return load ?
(HasVLX ? X86::VMOVAPSZ128rm :
HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX :
@@ -3636,7 +3638,7 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
case 32:
assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
// If stack is realigned we can use aligned stores.
- if (isStackAligned)
+ if (IsStackAligned)
return load ?
(HasVLX ? X86::VMOVAPSZ256rm :
HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX :
@@ -3655,13 +3657,80 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
case 64:
assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
- if (isStackAligned)
+ if (IsStackAligned)
return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
else
return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
}
}
+Optional<ExtAddrMode>
+X86InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
+ const TargetRegisterInfo *TRI) const {
+ const MCInstrDesc &Desc = MemI.getDesc();
+ int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
+ if (MemRefBegin < 0)
+ return None;
+
+ MemRefBegin += X86II::getOperandBias(Desc);
+
+ auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg);
+ if (!BaseOp.isReg()) // Can be an MO_FrameIndex
+ return None;
+
+ const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp);
+ // Displacement can be symbolic
+ if (!DispMO.isImm())
+ return None;
+
+ ExtAddrMode AM;
+ AM.BaseReg = BaseOp.getReg();
+ AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg();
+ AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm();
+ AM.Displacement = DispMO.getImm();
+ return AM;
+}
+
+bool X86InstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
+ const Register Reg,
+ int64_t &ImmVal) const {
+ if (MI.getOpcode() != X86::MOV32ri && MI.getOpcode() != X86::MOV64ri)
+ return false;
+ // Mov Src can be a global address.
+ if (!MI.getOperand(1).isImm() || MI.getOperand(0).getReg() != Reg)
+ return false;
+ ImmVal = MI.getOperand(1).getImm();
+ return true;
+}
+
+bool X86InstrInfo::preservesZeroValueInReg(
+ const MachineInstr *MI, const Register NullValueReg,
+ const TargetRegisterInfo *TRI) const {
+ if (!MI->modifiesRegister(NullValueReg, TRI))
+ return true;
+ switch (MI->getOpcode()) {
+  // Shift right/left of a null value onto itself is still a null value,
+  // e.g. rax = shl rax, X.
+ case X86::SHR64ri:
+ case X86::SHR32ri:
+ case X86::SHL64ri:
+ case X86::SHL32ri:
+ assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() &&
+ "expected for shift opcode!");
+ return MI->getOperand(0).getReg() == NullValueReg &&
+ MI->getOperand(1).getReg() == NullValueReg;
+ // Zero extend of a sub-reg of NullValueReg into itself does not change the
+ // null value.
+ case X86::MOV32rr:
+ return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) {
+ return TRI->isSubRegisterEq(NullValueReg, MO.getReg());
+ });
+ default:
+ return false;
+ }
+ llvm_unreachable("Should be handled above!");
+}
+
bool X86InstrInfo::getMemOperandsWithOffsetWidth(
const MachineInstr &MemOp, SmallVectorImpl<const MachineOperand *> &BaseOps,
int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
@@ -3706,19 +3775,17 @@ bool X86InstrInfo::getMemOperandsWithOffsetWidth(
return true;
}
-static unsigned getStoreRegOpcode(unsigned SrcReg,
+static unsigned getStoreRegOpcode(Register SrcReg,
const TargetRegisterClass *RC,
- bool isStackAligned,
+ bool IsStackAligned,
const X86Subtarget &STI) {
- return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, STI, false);
+ return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false);
}
-
-static unsigned getLoadRegOpcode(unsigned DestReg,
+static unsigned getLoadRegOpcode(Register DestReg,
const TargetRegisterClass *RC,
- bool isStackAligned,
- const X86Subtarget &STI) {
- return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, STI, true);
+ bool IsStackAligned, const X86Subtarget &STI) {
+ return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true);
}
void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -3729,13 +3796,31 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
const MachineFunction &MF = *MBB.getParent();
assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
"Stack slot too small for store");
- unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
- bool isAligned =
- (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
- RI.canRealignStack(MF);
- unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
- addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
- .addReg(SrcReg, getKillRegState(isKill));
+ if (RC->getID() == X86::TILERegClassID) {
+ unsigned Opc = X86::TILESTORED;
+ // tilestored %tmm, (%sp, %idx)
+ MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
+ Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+ BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
+ MachineInstr *NewMI =
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill));
+ MachineOperand &MO = NewMI->getOperand(2);
+ MO.setReg(VirtReg);
+ MO.setIsKill(true);
+ } else if (RC->getID() == X86::TILECFGRegClassID) {
+ unsigned Opc = X86::PSTTILECFG;
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill));
+ } else {
+ unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
+ bool isAligned =
+ (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
+ RI.canRealignStack(MF);
+ unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill));
+ }
}
void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
@@ -3743,13 +3828,32 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
Register DestReg, int FrameIdx,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- const MachineFunction &MF = *MBB.getParent();
- unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
- bool isAligned =
- (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
- RI.canRealignStack(MF);
- unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
- addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx);
+ if (RC->getID() == X86::TILERegClassID) {
+ unsigned Opc = X86::TILELOADD;
+ // tileloadd (%sp, %idx), %tmm
+ MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
+ Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+ MachineInstr *NewMI =
+ BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
+ NewMI = addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
+ FrameIdx);
+ MachineOperand &MO = NewMI->getOperand(3);
+ MO.setReg(VirtReg);
+ MO.setIsKill(true);
+ } else if (RC->getID() == X86::TILECFGRegClassID) {
+ unsigned Opc = X86::PLDTILECFG;
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
+ FrameIdx);
+ } else {
+ const MachineFunction &MF = *MBB.getParent();
+ unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
+ bool isAligned =
+ (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
+ RI.canRealignStack(MF);
+ unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
+ FrameIdx);
+ }
}
bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
@@ -4312,7 +4416,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
/// instructions in-between do not load or store, and have no side effects.
MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
const MachineRegisterInfo *MRI,
- unsigned &FoldAsLoadDefReg,
+ Register &FoldAsLoadDefReg,
MachineInstr *&DefMI) const {
// Check whether we can move DefMI here.
DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
@@ -4375,8 +4479,8 @@ static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
/// %k4 = K_SET1
/// to:
/// %k4 = KXNORrr %k0, %k0
-static bool Expand2AddrKreg(MachineInstrBuilder &MIB,
- const MCInstrDesc &Desc, unsigned Reg) {
+static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc,
+ Register Reg) {
assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
MIB->setDesc(Desc);
MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
@@ -4822,7 +4926,7 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance(
// If MI is marked as reading Reg, the partial register update is wanted.
const MachineOperand &MO = MI.getOperand(0);
Register Reg = MO.getReg();
- if (Register::isVirtualRegister(Reg)) {
+ if (Reg.isVirtual()) {
if (MO.readsReg() || MI.readsVirtualRegister(Reg))
return 0;
} else {
@@ -5120,18 +5224,12 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
/// high bits that are passed-through are not live.
unsigned
-X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
+X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const {
- for (unsigned i = MI.getNumExplicitDefs(), e = MI.getNumExplicitOperands();
- i != e; ++i) {
- const MachineOperand &MO = MI.getOperand(i);
- if (MO.isReg() && MO.isUndef() &&
- Register::isPhysicalRegister(MO.getReg()) &&
- hasUndefRegUpdate(MI.getOpcode(), i)) {
- OpNum = i;
- return UndefRegClearance;
- }
- }
+ const MachineOperand &MO = MI.getOperand(OpNum);
+ if (Register::isPhysicalRegister(MO.getReg()) &&
+ hasUndefRegUpdate(MI.getOpcode(), OpNum))
+ return UndefRegClearance;
return 0;
}
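
With the signature change above, getUndefRegClearance no longer searches for the undef operand itself; the caller names it. A minimal sketch, assuming LLVM headers, of how a BreakFalseDeps-style caller would use the reworked hook (the wrapper function is hypothetical, and a real pass compares the clearance against how recently the register was written):

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

using namespace llvm;

static void maybeBreakUndefDependency(const TargetInstrInfo &TII,
                                      const TargetRegisterInfo *TRI,
                                      MachineInstr &MI, unsigned OpIdx) {
  // The caller already knows which operand is the undef read, so OpIdx is
  // passed in rather than returned through a reference parameter.
  if (TII.getUndefRegClearance(MI, OpIdx, TRI))
    TII.breakPartialRegDependency(MI, OpIdx, TRI);
}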
@@ -5213,7 +5311,7 @@ static void updateOperandRegConstraints(MachineFunction &MF,
if (!MO.isReg())
continue;
Register Reg = MO.getReg();
- if (!Register::isVirtualRegister(Reg))
+ if (!Reg.isVirtual())
continue;
auto *NewRC = MRI.constrainRegClass(
@@ -5464,6 +5562,10 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
if (I != nullptr) {
unsigned Opcode = I->DstOp;
+ bool FoldedLoad =
+ isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_LOAD) || OpNum > 0;
+ bool FoldedStore =
+ isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_STORE);
MaybeAlign MinAlign =
decodeMaybeAlign((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT);
if (MinAlign && Alignment < *MinAlign)
@@ -5474,20 +5576,25 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum,
&RI, MF);
unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
- if (Size < RCSize) {
- // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
- // Check if it's safe to fold the load. If the size of the object is
- // narrower than the load width, then it's not.
- if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
- return nullptr;
+ // Check if it's safe to fold the load. If the size of the object is
+ // narrower than the load width, then it's not.
+ // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
+ if (FoldedLoad && Size < RCSize) {
// If this is a 64-bit load, but the spill slot is 32, then we can do
// a 32-bit load which is implicitly zero-extended. This likely is
// due to live interval analysis remat'ing a load from stack slot.
+ if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
+ return nullptr;
if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
return nullptr;
Opcode = X86::MOV32rm;
NarrowToMOV32rm = true;
}
+ // For stores, make sure the size of the object is equal to the size of
+ // the store. If the object is larger, the extra bits would be garbage. If
+ // the object is smaller we might overwrite another object or fault.
+ if (FoldedStore && Size != RCSize)
+ return nullptr;
}
if (isTwoAddrFold)
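
The FoldedLoad/FoldedStore flags introduced above gate two different size checks when folding a stack slot into an operand. A standalone sketch of that logic (hypothetical helper, not the LLVM API):

#include <cstdint>

enum class FoldKind { Load, Store };

// SlotSize and RegClassSize are in bytes; CanNarrow64To32 models the special
// MOV64rm -> MOV32rm rewrite performed above when a 64-bit load reads a
// 32-bit spill slot.
static bool foldSizeCheckOK(FoldKind Kind, uint64_t SlotSize,
                            uint64_t RegClassSize, bool CanNarrow64To32) {
  if (Kind == FoldKind::Load && SlotSize < RegClassSize)
    return CanNarrow64To32 && RegClassSize == 8 && SlotSize == 4;
  if (Kind == FoldKind::Store && SlotSize != RegClassSize)
    // A larger slot would leave garbage bits; a smaller one could clobber a
    // neighbouring object or fault.
    return false;
  return true;
}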
@@ -5500,7 +5607,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
// value and zero-extend the top bits. Change the destination register
// to a 32-bit one.
Register DstReg = NewMI->getOperand(0).getReg();
- if (Register::isPhysicalRegister(DstReg))
+ if (DstReg.isPhysical())
NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
else
NewMI->getOperand(0).setSubReg(X86::sub_32bit);
@@ -6357,7 +6464,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
}
if (Load)
BeforeOps.push_back(SDValue(Load, 0));
- BeforeOps.insert(BeforeOps.end(), AfterOps.begin(), AfterOps.end());
+ llvm::append_range(BeforeOps, AfterOps);
// Change CMP32ri r, 0 back to TEST32rr r, r, etc.
switch (Opc) {
default: break;
@@ -6675,6 +6782,18 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
return true;
}
+bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const {
+
+ // ENDBR instructions should not be scheduled around.
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32)
+ return true;
+
+ return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
+}
+
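
Marking ENDBR32/ENDBR64 as scheduling boundaries keeps them as the first instruction at indirect-branch targets, which CET/IBT requires. A rough, self-contained model of what a boundary means to the scheduler (a conceptual sketch, not the actual region construction code):

#include <string>
#include <vector>

// Instructions are only reordered within a region; a boundary instruction is
// modelled here as a singleton region that is never scheduled at all.
static std::vector<std::vector<std::string>>
splitIntoSchedulingRegions(const std::vector<std::string> &Block,
                           bool (*IsBoundary)(const std::string &)) {
  std::vector<std::vector<std::string>> Regions(1);
  for (const std::string &MI : Block) {
    if (IsBoundary(MI)) {
      Regions.push_back({MI}); // stays exactly where it is
      Regions.push_back({});   // scheduling resumes after the boundary
    } else {
      Regions.back().push_back(MI);
    }
  }
  return Regions;
}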
bool X86InstrInfo::
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
assert(Cond.size() == 1 && "Invalid X86 branch condition!");
@@ -6705,7 +6824,7 @@ unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
"X86-64 PIC uses RIP relative addressing");
X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
- unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
+ Register GlobalBaseReg = X86FI->getGlobalBaseReg();
if (GlobalBaseReg != 0)
return GlobalBaseReg;
@@ -8261,7 +8380,7 @@ describeMOVrrLoadedValue(const MachineInstr &MI, Register DescribedReg,
// If the described register is a sub-register of the destination register,
// then pick out the source register's corresponding sub-register.
if (unsigned SubRegIdx = TRI->getSubRegIndex(DestReg, DescribedReg)) {
- unsigned SrcSubReg = TRI->getSubReg(SrcReg, SubRegIdx);
+ Register SrcSubReg = TRI->getSubReg(SrcReg, SubRegIdx);
return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
}
@@ -8525,7 +8644,7 @@ namespace {
return false;
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
- unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
+ Register GlobalBaseReg = X86FI->getGlobalBaseReg();
// If we didn't need a GlobalBaseReg, don't insert code.
if (GlobalBaseReg == 0)
@@ -8538,7 +8657,7 @@ namespace {
MachineRegisterInfo &RegInfo = MF.getRegInfo();
const X86InstrInfo *TII = STI.getInstrInfo();
- unsigned PC;
+ Register PC;
if (STI.isPICStyleGOT())
PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
else
@@ -8608,7 +8727,7 @@ namespace {
MachineFunctionPass::getAnalysisUsage(AU);
}
};
-}
+} // namespace
char CGBR::ID = 0;
FunctionPass*
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h
index 89f2ff118c37..d7d2370c6f67 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h
@@ -317,6 +317,17 @@ public:
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const override;
+ Optional<ExtAddrMode>
+ getAddrModeFromMemoryOp(const MachineInstr &MemI,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg,
+ int64_t &ImmVal) const override;
+
+ bool preservesZeroValueInReg(const MachineInstr *MI,
+ const Register NullValueReg,
+ const TargetRegisterInfo *TRI) const override;
+
bool getMemOperandsWithOffsetWidth(
const MachineInstr &LdSt,
SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset,
@@ -409,6 +420,13 @@ public:
bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1,
int64_t &Offset2) const override;
+ /// isSchedulingBoundary - Overrides the isSchedulingBoundary from
+ /// Codegen/TargetInstrInfo.cpp to make it capable of identifying ENDBR
+ /// instructions and prevent them from being re-scheduled.
+ bool isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const override;
+
/// shouldScheduleLoadsNear - This is used by the pre-regalloc scheduler to
/// determine (in conjunction with areLoadsFromSameBasePtr) if two loads
/// should be scheduled together. On some targets if two loads are loading from
@@ -430,16 +448,6 @@ public:
/// instruction that defines the specified register class.
bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
- /// isSafeToClobberEFLAGS - Return true if it's safe insert an instruction tha
- /// would clobber the EFLAGS condition register. Note the result may be
- /// conservative. If it cannot definitely determine the safety after visiting
- /// a few instructions in each direction it assumes it's not safe.
- bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const {
- return MBB.computeRegisterLiveness(&RI, X86::EFLAGS, I, 4) ==
- MachineBasicBlock::LQR_Dead;
- }
-
/// True if MI has a condition code def, e.g. EFLAGS, that is
/// not marked dead.
bool hasLiveCondCodeDef(MachineInstr &MI) const;
@@ -462,7 +470,7 @@ public:
unsigned
getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const override;
- unsigned getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
+ unsigned getUndefRegClearance(const MachineInstr &MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const override;
void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const override;
@@ -517,7 +525,7 @@ public:
/// the machine instruction generated due to folding.
MachineInstr *optimizeLoadInstr(MachineInstr &MI,
const MachineRegisterInfo *MRI,
- unsigned &FoldAsLoadDefReg,
+ Register &FoldAsLoadDefReg,
MachineInstr *&DefMI) const override;
std::pair<unsigned, unsigned>
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td
index 3ea0ae8a8840..b006d1d9aa3a 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td
@@ -69,13 +69,8 @@ def SDTX86wrpkru : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
SDTCisVT<2, i8>]>;
-def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
-def SDTX86caspairSaveEbx8 : SDTypeProfile<1, 3,
- [SDTCisVT<0, i32>, SDTCisPtrTy<1>,
- SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
-def SDTX86caspairSaveRbx16 : SDTypeProfile<1, 3,
- [SDTCisVT<0, i64>, SDTCisPtrTy<1>,
- SDTCisVT<2, i64>, SDTCisVT<3, i64>]>;
+def SDTX86cas8pair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def SDTX86cas16pair : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i64>]>;
def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
SDTCisPtrTy<1>,
@@ -99,11 +94,11 @@ def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>,
SDTCisVT<1, iPTR>,
SDTCisVT<2, iPTR>]>;
-def SDT_X86VAARG_64 : SDTypeProfile<1, -1, [SDTCisPtrTy<0>,
- SDTCisPtrTy<1>,
- SDTCisVT<2, i32>,
- SDTCisVT<3, i8>,
- SDTCisVT<4, i32>]>;
+def SDT_X86VAARG : SDTypeProfile<1, -1, [SDTCisPtrTy<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, i32>,
+ SDTCisVT<3, i8>,
+ SDTCisVT<4, i32>]>;
def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
@@ -132,6 +127,11 @@ def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>;
def SDT_X86ENQCMD : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
SDTCisPtrTy<1>, SDTCisSameAs<1, 2>]>;
+def SDT_X86AESENCDECKL : SDTypeProfile<2, 2, [SDTCisVT<0, v2i64>,
+ SDTCisVT<1, i32>,
+ SDTCisVT<2, v2i64>,
+ SDTCisPtrTy<3>]>;
+
def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER,
[SDNPHasChain,SDNPSideEffect]>;
def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
@@ -169,20 +169,12 @@ def X86wrpkru : SDNode<"X86ISD::WRPKRU", SDTX86wrpkru,
def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>;
-def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair,
+def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86cas8pair,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>;
-def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair,
+def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86cas16pair,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>;
-def X86cas8save_ebx : SDNode<"X86ISD::LCMPXCHG8_SAVE_EBX_DAG",
- SDTX86caspairSaveEbx8,
- [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
- SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
-def X86cas16save_rbx : SDNode<"X86ISD::LCMPXCHG16_SAVE_RBX_DAG",
- SDTX86caspairSaveRbx16,
- [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
- SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
@@ -194,7 +186,11 @@ def X86vastart_save_xmm_regs :
SDT_X86VASTART_SAVE_XMM_REGS,
[SDNPHasChain, SDNPVariadic]>;
def X86vaarg64 :
- SDNode<"X86ISD::VAARG_64", SDT_X86VAARG_64,
+ SDNode<"X86ISD::VAARG_64", SDT_X86VAARG,
+ [SDNPHasChain, SDNPMayLoad, SDNPMayStore,
+ SDNPMemOperand]>;
+def X86vaargx32 :
+ SDNode<"X86ISD::VAARG_X32", SDT_X86VAARG,
[SDNPHasChain, SDNPMayLoad, SDNPMayStore,
SDNPMemOperand]>;
def X86callseq_start :
@@ -284,6 +280,7 @@ def X86lock_and : SDNode<"X86ISD::LAND", SDTLockBinaryArithWithFlags,
SDNPMemOperand]>;
def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>;
+def X86bextri : SDNode<"X86ISD::BEXTRI", SDTIntBinOp>;
def X86bzhi : SDNode<"X86ISD::BZHI", SDTIntBinOp>;
@@ -323,6 +320,22 @@ def X86enqcmd : SDNode<"X86ISD::ENQCMD", SDT_X86ENQCMD,
[SDNPHasChain, SDNPSideEffect]>;
def X86enqcmds : SDNode<"X86ISD::ENQCMDS", SDT_X86ENQCMD,
[SDNPHasChain, SDNPSideEffect]>;
+def X86testui : SDNode<"X86ISD::TESTUI",
+ SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def X86aesenc128kl : SDNode<"X86ISD::AESENC128KL", SDT_X86AESENCDECKL,
+ [SDNPHasChain, SDNPMayLoad, SDNPSideEffect,
+ SDNPMemOperand]>;
+def X86aesdec128kl : SDNode<"X86ISD::AESDEC128KL", SDT_X86AESENCDECKL,
+ [SDNPHasChain, SDNPMayLoad, SDNPSideEffect,
+ SDNPMemOperand]>;
+def X86aesenc256kl : SDNode<"X86ISD::AESENC256KL", SDT_X86AESENCDECKL,
+ [SDNPHasChain, SDNPMayLoad, SDNPSideEffect,
+ SDNPMemOperand]>;
+def X86aesdec256kl : SDNode<"X86ISD::AESDEC256KL", SDT_X86AESENCDECKL,
+ [SDNPHasChain, SDNPMayLoad, SDNPSideEffect,
+ SDNPMemOperand]>;
//===----------------------------------------------------------------------===//
// X86 Operand Definitions.
@@ -901,6 +914,8 @@ def PKU : Predicate<"Subtarget->hasPKU()">;
def HasVNNI : Predicate<"Subtarget->hasVNNI()">;
def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">;
def HasBF16 : Predicate<"Subtarget->hasBF16()">;
+def HasAVXVNNI : Predicate <"Subtarget->hasAVXVNNI()">;
+def NoVLX_Or_NoVNNI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVNNI()">;
def HasBITALG : Predicate<"Subtarget->hasBITALG()">;
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
@@ -964,11 +979,15 @@ def HasCmpxchg8b : Predicate<"Subtarget->hasCmpxchg8b()">;
def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
def HasPCONFIG : Predicate<"Subtarget->hasPCONFIG()">;
def HasENQCMD : Predicate<"Subtarget->hasENQCMD()">;
+def HasKL : Predicate<"Subtarget->hasKL()">;
+def HasWIDEKL : Predicate<"Subtarget->hasWIDEKL()">;
+def HasHRESET : Predicate<"Subtarget->hasHRESET()">;
def HasSERIALIZE : Predicate<"Subtarget->hasSERIALIZE()">;
def HasTSXLDTRK : Predicate<"Subtarget->hasTSXLDTRK()">;
def HasAMXTILE : Predicate<"Subtarget->hasAMXTILE()">;
def HasAMXBF16 : Predicate<"Subtarget->hasAMXBF16()">;
def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">;
+def HasUINTR : Predicate<"Subtarget->hasUINTR()">;
def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
AssemblerPredicate<(all_of (not Mode64Bit)), "Not 64-bit mode">;
def In64BitMode : Predicate<"Subtarget->is64Bit()">,
@@ -1016,6 +1035,7 @@ def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
def HasERMSB : Predicate<"Subtarget->hasERMSB()">;
+def HasFSRM : Predicate<"Subtarget->hasFSRM()">;
def HasMFence : Predicate<"Subtarget->hasMFence()">;
def UseIndirectThunkCalls : Predicate<"Subtarget->useIndirectThunkCalls()">;
def NotUseIndirectThunkCalls : Predicate<"!Subtarget->useIndirectThunkCalls()">;
@@ -1053,6 +1073,7 @@ def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>;
def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>;
def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>;
def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>;
+def i64timmSExt32 : TImmLeaf<i64, [{ return isInt<32>(Imm); }]>;
def i16relocImmSExt8 : PatLeaf<(i16 relocImm), [{
return isSExtAbsoluteSymbolRef(8, N);
@@ -2658,11 +2679,11 @@ let Predicates = [HasBMI2] in {
//
let Predicates = [HasTBM], Defs = [EFLAGS] in {
-multiclass tbm_ternary_imm<bits<8> opc, RegisterClass RC, string OpcodeStr,
- X86MemOperand x86memop, PatFrag ld_frag,
- SDNode OpNode, Operand immtype,
- SDPatternOperator immoperator,
- X86FoldableSchedWrite Sched> {
+multiclass tbm_bextri<bits<8> opc, RegisterClass RC, string OpcodeStr,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ SDNode OpNode, Operand immtype,
+ SDPatternOperator immoperator,
+ X86FoldableSchedWrite Sched> {
def ri : Ii32<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, immtype:$cntl),
!strconcat(OpcodeStr,
"\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
@@ -2676,12 +2697,12 @@ multiclass tbm_ternary_imm<bits<8> opc, RegisterClass RC, string OpcodeStr,
XOP, XOPA, Sched<[Sched.Folded]>;
}
-defm BEXTRI32 : tbm_ternary_imm<0x10, GR32, "bextr{l}", i32mem, loadi32,
- X86bextr, i32imm, imm, WriteBEXTR>;
+defm BEXTRI32 : tbm_bextri<0x10, GR32, "bextr{l}", i32mem, loadi32,
+ X86bextri, i32imm, timm, WriteBEXTR>;
let ImmT = Imm32S in
-defm BEXTRI64 : tbm_ternary_imm<0x10, GR64, "bextr{q}", i64mem, loadi64,
- X86bextr, i64i32imm,
- i64immSExt32, WriteBEXTR>, VEX_W;
+defm BEXTRI64 : tbm_bextri<0x10, GR64, "bextr{q}", i64mem, loadi64,
+ X86bextri, i64i32imm,
+ i64timmSExt32, WriteBEXTR>, VEX_W;
multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem,
RegisterClass RC, string OpcodeStr,
@@ -2787,8 +2808,7 @@ let SchedRW = [ WriteSystem ] in {
let Uses = [ ECX, EAX, EBX ] in {
def MWAITXrrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx",
- [(int_x86_mwaitx ECX, EAX, EBX)]>,
- TB, Requires<[ HasMWAITX ]>;
+ []>, TB, Requires<[ HasMWAITX ]>;
}
} // SchedRW
@@ -2905,6 +2925,41 @@ def : InstAlias<"clzero\t{%eax|eax}", (CLZERO32r)>, Requires<[Not64BitMode]>;
def : InstAlias<"clzero\t{%rax|rax}", (CLZERO64r)>, Requires<[In64BitMode]>;
//===----------------------------------------------------------------------===//
+// INVLPGB Instruction
+// OPCODE 0F 01 FE
+//
+let SchedRW = [WriteSystem] in {
+ let Uses = [EAX, EDX] in
+ def INVLPGB32 : I<0x01, MRM_FE, (outs), (ins),
+ "invlpgb}", []>,
+ PS, Requires<[Not64BitMode]>;
+ let Uses = [RAX, EDX] in
+ def INVLPGB64 : I<0x01, MRM_FE, (outs), (ins),
+ "invlpgb", []>,
+ PS, Requires<[In64BitMode]>;
+} // SchedRW
+
+def : InstAlias<"invlpgb\t{%eax, %edx|eax, edx}", (INVLPGB32)>, Requires<[Not64BitMode]>;
+def : InstAlias<"invlpgb\t{%rax, %edx|rax, edx}", (INVLPGB64)>, Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// TLBSYNC Instruction
+// OPCODE 0F 01 FF
+//
+let SchedRW = [WriteSystem] in {
+ def TLBSYNC : I<0x01, MRM_FF, (outs), (ins),
+ "tlbsync", []>,
+ PS, Requires<[]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// HRESET Instruction
+//
+let Uses = [EAX], SchedRW = [WriteSystem] in
+ def HRESET : Ii8<0xF0, MRM_C0, (outs), (ins i32u8imm:$imm), "hreset\t$imm", []>,
+ Requires<[HasHRESET]>, TAXS;
+
+//===----------------------------------------------------------------------===//
// SERIALIZE Instruction
//
def SERIALIZE : I<0x01, MRM_E8, (outs), (ins), "serialize",
@@ -2922,6 +2977,25 @@ let Predicates = [HasTSXLDTRK] in {
}
//===----------------------------------------------------------------------===//
+// UINTR Instructions
+//
+let Predicates = [HasUINTR, In64BitMode] in {
+ def UIRET : I<0x01, MRM_EC, (outs), (ins), "uiret",
+ []>, XS;
+ def CLUI : I<0x01, MRM_EE, (outs), (ins), "clui",
+ [(int_x86_clui)]>, XS;
+ def STUI : I<0x01, MRM_EF, (outs), (ins), "stui",
+ [(int_x86_stui)]>, XS;
+
+ def SENDUIPI : I<0xC7, MRM6r, (outs), (ins GR64:$arg), "senduipi\t$arg",
+ [(int_x86_senduipi GR64:$arg)]>, XS;
+
+ let Defs = [EFLAGS] in
+ def TESTUI : I<0x01, MRM_ED, (outs), (ins), "testui",
+ [(set EFLAGS, (X86testui))]>, XS;
+}
+
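
For reference, these UINTR definitions back a small set of user-level intrinsics. A hedged usage sketch, assuming the _clui/_stui/_testui/_senduipi names from clang/gcc's uintrintrin.h (reachable through immintrin.h, 64-bit only, compiled with -muintr):

#include <immintrin.h>

void uintr_demo(unsigned long long uipi_index) {
  _clui();               // CLUI: clear the user-interrupt flag (UIF)
  _senduipi(uipi_index); // SENDUIPI: post a user inter-processor interrupt
  _stui();               // STUI: set UIF, allowing delivery
  unsigned char uif = _testui(); // TESTUI: copy UIF into a flag/register
  (void)uif;
}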
+//===----------------------------------------------------------------------===//
// Pattern fragments to auto generate TBM instructions.
//===----------------------------------------------------------------------===//
@@ -3080,10 +3154,16 @@ include "X86InstrMPX.td"
include "X86InstrVMX.td"
include "X86InstrSVM.td"
+include "X86InstrSNP.td"
include "X86InstrTSX.td"
include "X86InstrSGX.td"
+include "X86InstrTDX.td"
+
+// Key Locker instructions
+include "X86InstrKL.td"
+
// AMX instructions
include "X86InstrAMX.td"
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrKL.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrKL.td
new file mode 100644
index 000000000000..b91e563a15f3
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrKL.td
@@ -0,0 +1,86 @@
+//===---------------------------*-tablegen-*-------------------------------===//
+//===------------- X86InstrKL.td - KL Instruction Set Extension -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel key locker
+// instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Key Locker instructions
+
+let SchedRW = [WriteSystem], Predicates = [HasKL] in {
+ let Uses = [XMM0, EAX], Defs = [EFLAGS] in {
+ def LOADIWKEY : I<0xDC, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "loadiwkey\t{$src2, $src1|$src1, $src2}",
+ [(int_x86_loadiwkey XMM0, VR128:$src1, VR128:$src2, EAX)]>, T8XS,
+ NotMemoryFoldable;
+ }
+
+ let Uses = [XMM0], Defs = [XMM0, XMM1, XMM2, XMM4, XMM5, XMM6, EFLAGS] in {
+ def ENCODEKEY128 : I<0xFA, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "encodekey128\t{$src, $dst|$dst, $src}", []>, T8XS,
+ NotMemoryFoldable;
+ }
+
+ let Uses = [XMM0, XMM1], Defs = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, EFLAGS] in {
+ def ENCODEKEY256 : I<0xFB, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "encodekey256\t{$src, $dst|$dst, $src}", []>, T8XS,
+ NotMemoryFoldable;
+ }
+
+ let Constraints = "$src1 = $dst",
+ Defs = [EFLAGS] in {
+ def AESENC128KL : I<0xDC, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
+ "aesenc128kl\t{$src2, $src1|$src1, $src2}",
+ [(set VR128:$dst, EFLAGS,
+ (X86aesenc128kl VR128:$src1, addr:$src2))]>, T8XS,
+ NotMemoryFoldable;
+
+ def AESDEC128KL : I<0xDD, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
+ "aesdec128kl\t{$src2, $src1|$src1, $src2}",
+ [(set VR128:$dst, EFLAGS,
+ (X86aesdec128kl VR128:$src1, addr:$src2))]>, T8XS,
+ NotMemoryFoldable;
+
+ def AESENC256KL : I<0xDE, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
+ "aesenc256kl\t{$src2, $src1|$src1, $src2}",
+ [(set VR128:$dst, EFLAGS,
+ (X86aesenc256kl VR128:$src1, addr:$src2))]>, T8XS,
+ NotMemoryFoldable;
+
+ def AESDEC256KL : I<0xDF, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
+ "aesdec256kl\t{$src2, $src1|$src1, $src2}",
+ [(set VR128:$dst, EFLAGS,
+ (X86aesdec256kl VR128:$src1, addr:$src2))]>, T8XS,
+ NotMemoryFoldable;
+ }
+
+} // SchedRW, Predicates
+
+let SchedRW = [WriteSystem], Predicates = [HasWIDEKL] in {
+ let Uses = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7],
+ Defs = [EFLAGS, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7],
+ mayLoad = 1 in {
+ def AESENCWIDE128KL : I<0xD8, MRM0m, (outs), (ins opaquemem:$src),
+ "aesencwide128kl\t$src", []>, T8XS,
+ NotMemoryFoldable;
+ def AESDECWIDE128KL : I<0xD8, MRM1m, (outs), (ins opaquemem:$src),
+ "aesdecwide128kl\t$src", []>, T8XS,
+ NotMemoryFoldable;
+ def AESENCWIDE256KL : I<0xD8, MRM2m, (outs), (ins opaquemem:$src),
+ "aesencwide256kl\t$src", []>, T8XS,
+ NotMemoryFoldable;
+ def AESDECWIDE256KL : I<0xD8, MRM3m, (outs), (ins opaquemem:$src),
+ "aesdecwide256kl\t$src", []>, T8XS,
+ NotMemoryFoldable;
+ }
+
+} // SchedRW, Predicates
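
The AESENC*KL/AESDEC*KL definitions above consume a wrapped key handle from memory and report success through EFLAGS. A hedged user-level sketch, assuming the _mm_aesenc128kl_u8 intrinsic from keylockerintrin.h (available via immintrin.h with -mkl); handle creation and the exact flag polarity are left out:

#include <immintrin.h>

unsigned char encrypt_block_with_handle(__m128i *out, __m128i plaintext,
                                        const void *key_handle) {
  // AESENC128KL: encrypt one block using the wrapped key handle; the
  // instruction's success/failure flag is surfaced as the return value.
  return _mm_aesenc128kl_u8(out, plaintext, key_handle);
}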
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrMMX.td
index 49940204c25a..bb3e6df3bf3e 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrMMX.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrMMX.td
@@ -472,6 +472,7 @@ defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb,
defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b,
SchedWriteVarShuffle.MMX>;
+let Predicates = [HasMMX, HasSSE1] in {
def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
(outs VR64:$dst), (ins VR64:$src1, u8imm:$src2),
"pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -485,6 +486,7 @@ def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
(int_x86_sse_pshuf_w (load_mmx addr:$src1),
timm:$src2))]>,
Sched<[SchedWriteShuffle.MMX.Folded]>;
+}
// -- Conversion Instructions
defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi,
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSNP.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSNP.td
new file mode 100644
index 000000000000..de59f3fe2750
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSNP.td
@@ -0,0 +1,47 @@
+//===-- X86InstrSNP.td - SNP Instruction Set Extension -----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the AMD Secure Nested
+// Paging (SNP) instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SNP instructions
+
+let SchedRW = [WriteSystem] in {
+// F3 0F 01 FF
+let Uses = [RAX] in
+def PSMASH: I<0x01, MRM_FF, (outs), (ins), "psmash", []>, XS,
+ Requires<[In64BitMode]>;
+
+// F2 0F 01 FF
+let Uses = [RAX] in
+def PVALIDATE64: I<0x01, MRM_FF, (outs), (ins), "pvalidate",[]>,
+ XD, Requires<[In64BitMode]>;
+
+let Uses = [EAX] in
+def PVALIDATE32: I<0x01, MRM_FF, (outs), (ins), "pvalidate",[]>,
+ XD, Requires<[Not64BitMode]>;
+
+// F2 0F 01 FE
+let Uses = [RAX] in
+def RMPUPDATE: I<0x01, MRM_FE, (outs), (ins), "rmpupdate", []>, XD,
+ Requires<[In64BitMode]>;
+
+// F3 0F 01 FE
+let Uses = [RAX] in
+def RMPADJUST: I<0x01, MRM_FE, (outs), (ins), "rmpadjust", []>, XS,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+def : InstAlias<"psmash\t{%rax|rax}", (PSMASH)>, Requires<[In64BitMode]>;
+def : InstAlias<"pvalidate\t{%rax|rax}", (PVALIDATE64)>, Requires<[In64BitMode]>;
+def : InstAlias<"pvalidate\t{%eax|eax}", (PVALIDATE32)>, Requires<[Not64BitMode]>;
+def : InstAlias<"rmpupdate\t{%rax|rax}", (RMPUPDATE)>, Requires<[In64BitMode]>;
+def : InstAlias<"rmpadjust\t{%rax|rax}", (RMPADJUST)>, Requires<[In64BitMode]>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td
index c3c9f22381f8..7cf555748c46 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td
@@ -1242,7 +1242,8 @@ def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
/// SSE 2 Only
// Convert scalar double to scalar single
-let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX] in {
+let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX],
+ ExeDomain = SSEPackedSingle in {
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
(ins FR32:$src1, FR64:$src2),
"cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
@@ -1260,7 +1261,7 @@ def : Pat<(f32 (any_fpround FR64:$src)),
(VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
Requires<[UseAVX]>;
-let isCodeGenOnly = 1 in {
+let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (any_fpround FR64:$src))]>,
@@ -1272,7 +1273,7 @@ def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
Sched<[WriteCvtSD2SS.Folded]>, SIMD_EXC;
}
-let Uses = [MXCSR], mayRaiseFPException = 1 in {
+let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = SSEPackedSingle in {
def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -1306,7 +1307,7 @@ def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
-let isCodeGenOnly = 1, hasSideEffects = 0 in {
+let isCodeGenOnly = 1, hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
(ins FR64:$src1, FR32:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
@@ -1326,7 +1327,7 @@ def : Pat<(f64 (any_fpextend FR32:$src)),
def : Pat<(any_fpextend (loadf32 addr:$src)),
(VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;
-let isCodeGenOnly = 1 in {
+let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (any_fpextend FR32:$src))]>,
@@ -1338,7 +1339,8 @@ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
Sched<[WriteCvtSS2SD.Folded]>, SIMD_EXC;
} // isCodeGenOnly = 1
-let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
+let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
+ ExeDomain = SSEPackedSingle in {
def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -3928,7 +3930,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> {
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
"vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
+ (X86pinsrw VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def rm : Ii8<0xC4, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1,
@@ -3938,7 +3940,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> {
"vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
- imm:$src3))]>,
+ timm:$src3))]>,
Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}
@@ -3948,13 +3950,13 @@ def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
(outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
"vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
- imm:$src2))]>,
+ timm:$src2))]>,
PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>;
def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
(outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
"pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
- imm:$src2))]>,
+ timm:$src2))]>,
Sched<[WriteVecExtract]>;
// Insert
@@ -5151,14 +5153,14 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
- imm:$src2))]>,
+ timm:$src2))]>,
Sched<[WriteVecExtract]>;
let hasSideEffects = 0, mayStore = 1 in
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins i8mem:$dst, VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))),
+ [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), timm:$src2))),
addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}
@@ -5182,7 +5184,7 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
(ins i16mem:$dst, VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))),
+ [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), timm:$src2))),
addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}
@@ -5272,7 +5274,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
+ (X86pinsrb VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i8mem:$src2, u8imm:$src3),
@@ -5281,7 +5283,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), imm:$src3))]>,
+ (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), timm:$src3))]>,
Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}
@@ -7014,22 +7016,19 @@ def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
let Predicates = [HasAVX, NoVLX] in {
-def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
+def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128 addr:$src)>;
-def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
+def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128 addr:$src)>;
-}
-
// NOTE: We're using FP instructions here, but execution domain fixing can
// convert to integer when profitable.
-let Predicates = [HasAVX, NoVLX] in {
-def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128 addr:$src)>;
-def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
+def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128 addr:$src)>;
-def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
+def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128 addr:$src)>;
-def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
+def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128 addr:$src)>;
}
@@ -7165,6 +7164,68 @@ defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
WriteFMaskMove64, WriteFMaskMove64Y>;
//===----------------------------------------------------------------------===//
+// AVX_VNNI
+//===----------------------------------------------------------------------===//
+let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst" in
+multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ bit IsCommutable> {
+ let isCommutable = IsCommutable in
+ def rr : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (v4i32 (OpNode VR128:$src1,
+ VR128:$src2, VR128:$src3)))]>,
+ VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+
+ def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2,
+ (loadv4i32 addr:$src3))))]>,
+ VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+
+ let isCommutable = IsCommutable in
+ def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst, (v8i32 (OpNode VR256:$src1,
+ VR256:$src2, VR256:$src3)))]>,
+ VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>;
+
+ def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, i256mem:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2,
+ (loadv8i32 addr:$src3))))]>,
+ VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>;
+}
+
+defm VPDPBUSD : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>, ExplicitVEXPrefix;
+defm VPDPBUSDS : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>, ExplicitVEXPrefix;
+defm VPDPWSSD : avx_vnni_rm<0x52, "vpdpwssd", X86Vpdpwssd, 1>, ExplicitVEXPrefix;
+defm VPDPWSSDS : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>, ExplicitVEXPrefix;
+
+def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs),
+ (X86vpmaddwd node:$lhs, node:$rhs), [{
+ return N->hasOneUse();
+}]>;
+
+let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI] in {
+ def : Pat<(v8i32 (add VR256:$src1,
+ (X86vpmaddwd_su VR256:$src2, VR256:$src3))),
+ (VPDPWSSDYrr VR256:$src1, VR256:$src2, VR256:$src3)>;
+ def : Pat<(v8i32 (add VR256:$src1,
+ (X86vpmaddwd_su VR256:$src2, (load addr:$src3)))),
+ (VPDPWSSDYrm VR256:$src1, VR256:$src2, addr:$src3)>;
+ def : Pat<(v4i32 (add VR128:$src1,
+ (X86vpmaddwd_su VR128:$src2, VR128:$src3))),
+ (VPDPWSSDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+ def : Pat<(v4i32 (add VR128:$src1,
+ (X86vpmaddwd_su VR128:$src2, (load addr:$src3)))),
+ (VPDPWSSDrm VR128:$src1, VR128:$src2, addr:$src3)>;
+}
+
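
The patterns above fold a one-use pmaddwd feeding an add into a single VPDPWSSD. A scalar reference model of that computation (sign-extended 16-bit products, pairwise-summed into 32-bit accumulators), shown for one 128-bit group:

#include <cstdint>

static void vpdpwssd_ref(int32_t acc[4], const int16_t a[8],
                         const int16_t b[8]) {
  for (int i = 0; i < 4; ++i)
    acc[i] += int32_t(a[2 * i]) * b[2 * i] +
              int32_t(a[2 * i + 1]) * b[2 * i + 1];
}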
+//===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values
//
@@ -7226,16 +7287,12 @@ let ExeDomain = SSEPackedSingle in {
let isCommutable = 1 in
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, u8imm:$src3),
- "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
- (i8 timm:$src3))))]>, VEX_4V, VEX_L,
- Sched<[WriteFShuffle256]>;
+ "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
+ VEX_4V, VEX_L, Sched<[WriteFShuffle256]>;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, u8imm:$src3),
- "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
- (i8 timm:$src3)))]>, VEX_4V, VEX_L,
- Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
+ "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
+ VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
}
// Immediate transform to help with commuting.
@@ -7243,23 +7300,27 @@ def Perm2XCommuteImm : SDNodeXForm<timm, [{
return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
}]>;
+multiclass vperm2x128_lowering<string InstrStr, ValueType VT, PatFrag memop_frag> {
+ def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
+ (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR256:$src2, timm:$imm)>;
+ def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2), (i8 timm:$imm))),
+ (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, timm:$imm)>;
+ // Pattern with load in other operand.
+ def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1, (i8 timm:$imm))),
+ (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
+ (Perm2XCommuteImm timm:$imm))>;
+}
+
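
The Perm2XCommuteImm transform used by the commuted-load pattern works because each selector field of the VPERM2F128/VPERM2I128 control byte picks one of four 128-bit halves (0,1 = first source; 2,3 = second source), so XOR-ing with 0x22 flips the source bit of both selectors. A small worked example:

#include <cassert>
#include <cstdint>

constexpr uint8_t commutePerm2x128Imm(uint8_t Imm) { return Imm ^ 0x22; }

int main() {
  // 0x20 selects {dst.lo = src1.lo, dst.hi = src2.lo}; with the two sources
  // swapped, the same shuffle is written as 0x02.
  assert(commutePerm2x128Imm(0x20) == 0x02);
  assert(commutePerm2x128Imm(0x31) == 0x13); // {src1.hi, src2.hi} commuted
  return 0;
}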
let Predicates = [HasAVX] in {
-// Pattern with load in other operand.
-def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2),
- VR256:$src1, (i8 timm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
+ defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>;
+ defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>;
}
let Predicates = [HasAVX1Only] in {
-def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
- (VPERM2F128rr VR256:$src1, VR256:$src2, timm:$imm)>;
-def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
- (loadv4i64 addr:$src2), (i8 timm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, timm:$imm)>;
-// Pattern with load in other operand.
-def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
- VR256:$src1, (i8 timm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
+ defm : vperm2x128_lowering<"VPERM2F128", v4i64, loadv4i64>;
+ defm : vperm2x128_lowering<"VPERM2F128", v8i32, loadv8i32>;
+ defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
+ defm : vperm2x128_lowering<"VPERM2F128", v32i8, loadv32i8>;
}
//===----------------------------------------------------------------------===//
@@ -7628,27 +7689,24 @@ defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
WriteFShuffle256, f256mem>, VEX_W;
//===----------------------------------------------------------------------===//
-// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
+// VPERM2I128 - Permute Integer vector Values in 128-bit chunks
//
let isCommutable = 1 in
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, u8imm:$src3),
- "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
- (i8 timm:$src3))))]>, Sched<[WriteShuffle256]>,
- VEX_4V, VEX_L;
+ "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
+ Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, u8imm:$src3),
- "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
- (i8 timm:$src3)))]>,
+ "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
-let Predicates = [HasAVX2] in
-def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
- VR256:$src1, (i8 timm:$imm))),
- (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
-
+let Predicates = [HasAVX2] in {
+ defm : vperm2x128_lowering<"VPERM2I128", v4i64, loadv4i64>;
+ defm : vperm2x128_lowering<"VPERM2I128", v8i32, loadv8i32>;
+ defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
+ defm : vperm2x128_lowering<"VPERM2I128", v32i8, loadv32i8>;
+}
//===----------------------------------------------------------------------===//
// VINSERTI128 - Insert packed integer values
@@ -7768,37 +7826,6 @@ let Predicates = [HasAVX2] in {
}
//===----------------------------------------------------------------------===//
-// SubVector Broadcasts
-// Provide fallback in case the load node that is used in the patterns above
-// is used by additional users, which prevents the pattern selection.
-
-let Predicates = [HasAVX, NoVLX] in {
-def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))),
- (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
- (v2f64 VR128:$src), 1)>;
-def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))),
- (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
- (v4f32 VR128:$src), 1)>;
-}
-
-// NOTE: We're using FP instructions here, but execution domain fixing can
-// convert to integer when profitable.
-let Predicates = [HasAVX, NoVLX] in {
-def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
- (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
- (v2i64 VR128:$src), 1)>;
-def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
- (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
- (v4i32 VR128:$src), 1)>;
-def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
- (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
- (v8i16 VR128:$src), 1)>;
-def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
- (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
- (v16i8 VR128:$src), 1)>;
-}
-
-//===----------------------------------------------------------------------===//
// Variable Bit Shifts
//
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSVM.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSVM.td
index 82c8e74156b2..d8f70b016c7b 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSVM.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSVM.td
@@ -26,37 +26,47 @@ def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB;
// 0F 01 DE
let Uses = [EAX] in
-def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", []>, TB;
+def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit", []>, TB;
// 0F 01 D8
let Uses = [EAX] in
-def VMRUN32 : I<0x01, MRM_D8, (outs), (ins), "vmrun\t{%eax|eax}", []>, TB,
+def VMRUN32 : I<0x01, MRM_D8, (outs), (ins), "vmrun", []>, TB,
Requires<[Not64BitMode]>;
let Uses = [RAX] in
-def VMRUN64 : I<0x01, MRM_D8, (outs), (ins), "vmrun\t{%rax|rax}", []>, TB,
+def VMRUN64 : I<0x01, MRM_D8, (outs), (ins), "vmrun", []>, TB,
Requires<[In64BitMode]>;
// 0F 01 DA
let Uses = [EAX] in
-def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins), "vmload\t{%eax|eax}", []>, TB,
+def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins), "vmload", []>, TB,
Requires<[Not64BitMode]>;
let Uses = [RAX] in
-def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins), "vmload\t{%rax|rax}", []>, TB,
+def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins), "vmload", []>, TB,
Requires<[In64BitMode]>;
// 0F 01 DB
let Uses = [EAX] in
-def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins), "vmsave\t{%eax|eax}", []>, TB,
+def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins), "vmsave", []>, TB,
Requires<[Not64BitMode]>;
let Uses = [RAX] in
-def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins), "vmsave\t{%rax|rax}", []>, TB,
+def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins), "vmsave", []>, TB,
Requires<[In64BitMode]>;
// 0F 01 DF
let Uses = [EAX, ECX] in
def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins),
- "invlpga\t{%eax, %ecx|eax, ecx}", []>, TB, Requires<[Not64BitMode]>;
+ "invlpga", []>, TB, Requires<[Not64BitMode]>;
let Uses = [RAX, ECX] in
def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins),
- "invlpga\t{%rax, %ecx|rax, ecx}", []>, TB, Requires<[In64BitMode]>;
+ "invlpga", []>, TB, Requires<[In64BitMode]>;
} // SchedRW
+
+def : InstAlias<"skinit\t{%eax|eax}", (SKINIT), 0>;
+def : InstAlias<"vmrun\t{%eax|eax}", (VMRUN32), 0>, Requires<[Not64BitMode]>;
+def : InstAlias<"vmrun\t{%rax|rax}", (VMRUN64), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"vmload\t{%eax|eax}", (VMLOAD32), 0>, Requires<[Not64BitMode]>;
+def : InstAlias<"vmload\t{%rax|rax}", (VMLOAD64), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"vmsave\t{%eax|eax}", (VMSAVE32), 0>, Requires<[Not64BitMode]>;
+def : InstAlias<"vmsave\t{%rax|rax}", (VMSAVE64), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"invlpga\t{%eax, %ecx|eax, ecx}", (INVLPGA32), 0>, Requires<[Not64BitMode]>;
+def : InstAlias<"invlpga\t{%rax, %ecx|rax, ecx}", (INVLPGA64), 0>, Requires<[In64BitMode]>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td
index 13659b5c456e..eb8740896e5d 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td
@@ -49,6 +49,7 @@ let Uses = [EFLAGS] in
def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", [(int_x86_int (i8 3))]>;
} // SchedRW
+def UBSAN_UD1 : PseudoI<(outs), (ins i32imm:$kind), [(ubsantrap (i32 timm:$kind))]>;
// The long form of "int $3" turns into int3 as a size optimization.
// FIXME: This doesn't work because InstAlias can't match immediate constants.
//def : InstAlias<"int\t$3", (INT3)>;
@@ -171,6 +172,17 @@ def GS_PREFIX : I<0x65, PrefixByte, (outs), (ins), "gs", []>;
} // SchedRW
//===----------------------------------------------------------------------===//
+// Address-size override prefixes.
+//
+
+let SchedRW = [WriteNop] in {
+def ADDR16_PREFIX : I<0x67, PrefixByte, (outs), (ins), "addr16", []>,
+ Requires<[In32BitMode]>;
+def ADDR32_PREFIX : I<0x67, PrefixByte, (outs), (ins), "addr32", []>,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
// Moves to and from segment registers.
//
@@ -447,7 +459,7 @@ let Defs = [EAX, EBX, ECX, EDX], Uses = [EAX, ECX] in
// Cache instructions
let SchedRW = [WriteSystem] in {
def INVD : I<0x08, RawFrm, (outs), (ins), "invd", []>, TB;
-def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [(int_x86_wbinvd)]>, TB;
+def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [(int_x86_wbinvd)]>, PS;
// wbnoinvd is like wbinvd, except without invalidation
// encoding: like wbinvd + an 0xF3 prefix
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrTDX.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrTDX.td
new file mode 100644
index 000000000000..8d7cd6082095
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrTDX.td
@@ -0,0 +1,39 @@
+//===- X86InstrTDX.td - TDX Instruction Set Extension -*- tablegen -*===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel TDX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TDX instructions
+
+// 64-bit only instructions
+let SchedRW = [WriteSystem], Predicates = [In64BitMode] in {
+// SEAMCALL - Call to SEAM VMX-root Operation Module
+def SEAMCALL : I<0x01, MRM_CF, (outs), (ins),
+ "seamcall", []>, PD;
+
+// SEAMRET - Return to Legacy VMX-root Operation
+def SEAMRET : I<0x01, MRM_CD, (outs), (ins),
+ "seamret", []>, PD;
+
+// SEAMOPS - SEAM Operations
+def SEAMOPS : I<0x01, MRM_CE, (outs), (ins),
+ "seamops", []>, PD;
+
+} // SchedRW
+
+// common instructions
+let SchedRW = [WriteSystem] in {
+// TDCALL - Call SEAM Module Functions
+def TDCALL : I<0x01, MRM_CC, (outs), (ins),
+ "tdcall", []>, PD;
+
+} // SchedRW
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InstructionSelector.cpp
index 60fb4d2ef4bf..ff531713037c 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstructionSelector.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstructionSelector.cpp
@@ -214,8 +214,8 @@ static unsigned getSubRegIndex(const TargetRegisterClass *RC) {
return SubIdx;
}
-static const TargetRegisterClass *getRegClassFromGRPhysReg(unsigned Reg) {
- assert(Register::isPhysicalRegister(Reg));
+static const TargetRegisterClass *getRegClassFromGRPhysReg(Register Reg) {
+ assert(Reg.isPhysical());
if (X86::GR64RegClass.contains(Reg))
return &X86::GR64RegClass;
if (X86::GR32RegClass.contains(Reg))
@@ -239,7 +239,7 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I,
const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
- if (Register::isPhysicalRegister(DstReg)) {
+ if (DstReg.isPhysical()) {
assert(I.isCopy() && "Generic operators do not allow physical registers");
if (DstSize > SrcSize && SrcRegBank.getID() == X86::GPRRegBankID &&
@@ -266,12 +266,12 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I,
return true;
}
- assert((!Register::isPhysicalRegister(SrcReg) || I.isCopy()) &&
+ assert((!SrcReg.isPhysical() || I.isCopy()) &&
"No phys reg on generic operators");
assert((DstSize == SrcSize ||
// Copies are a mean to setup initial types, the number of
// bits may not exactly match.
- (Register::isPhysicalRegister(SrcReg) &&
+ (SrcReg.isPhysical() &&
DstSize <= RBI.getSizeInBits(SrcReg, MRI, TRI))) &&
"Copy with different width?!");
@@ -280,7 +280,7 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I,
if (SrcRegBank.getID() == X86::GPRRegBankID &&
DstRegBank.getID() == X86::GPRRegBankID && SrcSize > DstSize &&
- Register::isPhysicalRegister(SrcReg)) {
+ SrcReg.isPhysical()) {
// Change the physical register to performe truncate.
const TargetRegisterClass *SrcRC = getRegClassFromGRPhysReg(SrcReg);
@@ -479,7 +479,7 @@ static void X86SelectAddress(const MachineInstr &I,
"unsupported type.");
if (I.getOpcode() == TargetOpcode::G_PTR_ADD) {
- if (auto COff = getConstantVRegVal(I.getOperand(2).getReg(), MRI)) {
+ if (auto COff = getConstantVRegSExtVal(I.getOperand(2).getReg(), MRI)) {
int64_t Imm = *COff;
if (isInt<32>(Imm)) { // Check for displacement overflow.
AM.Disp = static_cast<int32_t>(Imm);
@@ -780,69 +780,18 @@ bool X86InstructionSelector::selectZext(MachineInstr &I,
const LLT DstTy = MRI.getType(DstReg);
const LLT SrcTy = MRI.getType(SrcReg);
+ assert(!(SrcTy == LLT::scalar(8) && DstTy == LLT::scalar(16)) &&
+ "8=>16 Zext is handled by tablegen");
assert(!(SrcTy == LLT::scalar(8) && DstTy == LLT::scalar(32)) &&
"8=>32 Zext is handled by tablegen");
assert(!(SrcTy == LLT::scalar(16) && DstTy == LLT::scalar(32)) &&
"16=>32 Zext is handled by tablegen");
-
- const static struct ZextEntry {
- LLT SrcTy;
- LLT DstTy;
- unsigned MovOp;
- bool NeedSubregToReg;
- } OpTable[] = {
- {LLT::scalar(8), LLT::scalar(16), X86::MOVZX16rr8, false}, // i8 => i16
- {LLT::scalar(8), LLT::scalar(64), X86::MOVZX32rr8, true}, // i8 => i64
- {LLT::scalar(16), LLT::scalar(64), X86::MOVZX32rr16, true}, // i16 => i64
- {LLT::scalar(32), LLT::scalar(64), 0, true} // i32 => i64
- };
-
- auto ZextEntryIt =
- std::find_if(std::begin(OpTable), std::end(OpTable),
- [SrcTy, DstTy](const ZextEntry &El) {
- return El.DstTy == DstTy && El.SrcTy == SrcTy;
- });
-
- // Here we try to select Zext into a MOVZ and/or SUBREG_TO_REG instruction.
- if (ZextEntryIt != std::end(OpTable)) {
- const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
- const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
- const TargetRegisterClass *DstRC = getRegClass(DstTy, DstRB);
- const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcRB);
-
- if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
- !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
- LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
- << " operand\n");
- return false;
- }
-
- unsigned TransitRegTo = DstReg;
- unsigned TransitRegFrom = SrcReg;
- if (ZextEntryIt->MovOp) {
- // If we select Zext into MOVZ + SUBREG_TO_REG, we need to have
- // a transit register in between: create it here.
- if (ZextEntryIt->NeedSubregToReg) {
- TransitRegFrom = MRI.createVirtualRegister(
- getRegClass(LLT::scalar(32), DstReg, MRI));
- TransitRegTo = TransitRegFrom;
- }
-
- BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(ZextEntryIt->MovOp))
- .addDef(TransitRegTo)
- .addReg(SrcReg);
- }
- if (ZextEntryIt->NeedSubregToReg) {
- BuildMI(*I.getParent(), I, I.getDebugLoc(),
- TII.get(TargetOpcode::SUBREG_TO_REG))
- .addDef(DstReg)
- .addImm(0)
- .addReg(TransitRegFrom)
- .addImm(X86::sub_32bit);
- }
- I.eraseFromParent();
- return true;
- }
+ assert(!(SrcTy == LLT::scalar(8) && DstTy == LLT::scalar(64)) &&
+ "8=>64 Zext is handled by tablegen");
+ assert(!(SrcTy == LLT::scalar(16) && DstTy == LLT::scalar(64)) &&
+ "16=>64 Zext is handled by tablegen");
+ assert(!(SrcTy == LLT::scalar(32) && DstTy == LLT::scalar(64)) &&
+ "32=>64 Zext is handled by tablegen");
if (SrcTy != LLT::scalar(1))
return false;
@@ -859,12 +808,17 @@ bool X86InstructionSelector::selectZext(MachineInstr &I,
else
return false;
- unsigned DefReg = SrcReg;
+ Register DefReg = SrcReg;
if (DstTy != LLT::scalar(8)) {
+ Register ImpDefReg =
+ MRI.createVirtualRegister(getRegClass(DstTy, DstReg, MRI));
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::IMPLICIT_DEF), ImpDefReg);
+
DefReg = MRI.createVirtualRegister(getRegClass(DstTy, DstReg, MRI));
BuildMI(*I.getParent(), I, I.getDebugLoc(),
- TII.get(TargetOpcode::SUBREG_TO_REG), DefReg)
- .addImm(0)
+ TII.get(TargetOpcode::INSERT_SUBREG), DefReg)
+ .addReg(ImpDefReg)
.addReg(SrcReg)
.addImm(X86::sub_8bit);
}
@@ -1605,10 +1559,9 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
}}, // i64
};
- auto OpEntryIt = std::find_if(std::begin(OpTable), std::end(OpTable),
- [RegTy](const DivRemEntry &El) {
- return El.SizeInBits == RegTy.getSizeInBits();
- });
+ auto OpEntryIt = llvm::find_if(OpTable, [RegTy](const DivRemEntry &El) {
+ return El.SizeInBits == RegTy.getSizeInBits();
+ });
if (OpEntryIt == std::end(OpTable))
return false;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index a19e12766e10..95655dd4723b 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -44,8 +44,8 @@ namespace {
/// E.g. A group of interleaving access loads (Factor = 2; accessing every
/// other element)
/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
-/// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
-/// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
+/// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
+/// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
class X86InterleavedAccessGroup {
/// Reference to the wide-load instruction of an interleaved access
/// group.
@@ -211,7 +211,7 @@ void X86InterleavedAccessGroup::decompose(
VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
}
// Generate N loads of T type.
- assert(VecBaseTy->getPrimitiveSizeInBits().isByteSized() &&
+ assert(VecBaseTy->getPrimitiveSizeInBits().isKnownMultipleOf(8) &&
"VecBaseTy's size must be a multiple of 8");
const Align FirstAlignment = LI->getAlign();
const Align SubsequentAlignment = commonAlignment(
@@ -295,8 +295,7 @@ static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
if (VecElems == 16) {
for (unsigned i = 0; i < Stride; i++)
- TransposedMatrix[i] = Builder.CreateShuffleVector(
- Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
+ TransposedMatrix[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);
return;
}
@@ -577,8 +576,7 @@ void X86InterleavedAccessGroup::deinterleave8bitStride3(
// Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
for (int i = 0; i < 3; i++)
- Vec[i] = Builder.CreateShuffleVector(
- Vec[i], UndefValue::get(Vec[0]->getType()), VPShuf);
+ Vec[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);
// TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
// TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
@@ -600,10 +598,8 @@ void X86InterleavedAccessGroup::deinterleave8bitStride3(
// TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
// TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7
- Value *TempVec = Builder.CreateShuffleVector(
- Vec[1], UndefValue::get(Vec[1]->getType()), VPAlign3);
- TransposedMatrix[0] = Builder.CreateShuffleVector(
- Vec[0], UndefValue::get(Vec[1]->getType()), VPAlign2);
+ Value *TempVec = Builder.CreateShuffleVector(Vec[1], VPAlign3);
+ TransposedMatrix[0] = Builder.CreateShuffleVector(Vec[0], VPAlign2);
TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
}
@@ -660,10 +656,8 @@ void X86InterleavedAccessGroup::interleave8bitStride3(
// Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
// Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
- Vec[0] = Builder.CreateShuffleVector(
- InVec[0], UndefValue::get(InVec[0]->getType()), VPAlign2);
- Vec[1] = Builder.CreateShuffleVector(
- InVec[1], UndefValue::get(InVec[1]->getType()), VPAlign3);
+ Vec[0] = Builder.CreateShuffleVector(InVec[0], VPAlign2);
+ Vec[1] = Builder.CreateShuffleVector(InVec[1], VPAlign3);
Vec[2] = InVec[2];
// Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 1c10c07abeee..72ab3e9cf78d 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -22,7 +22,7 @@ namespace llvm {
enum IntrinsicType : uint16_t {
CVTNEPS2BF16_MASK,
GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASSS,
- INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP,
+ INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP_IMM8,
INTR_TYPE_3OP_IMM8,
CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV, BEXTRI,
CVTPD2PS_MASK,
@@ -417,12 +417,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
X86_INTRINSIC_DATA(avx512_add_pd_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
- X86_INTRINSIC_DATA(avx512_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_SAE),
- X86_INTRINSIC_DATA(avx512_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_SAE),
X86_INTRINSIC_DATA(avx512_conflict_d_128, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
X86_INTRINSIC_DATA(avx512_conflict_d_256, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
X86_INTRINSIC_DATA(avx512_conflict_d_512, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
@@ -464,6 +458,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::FADDS, X86ISD::FADDS_RND),
X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK,
X86ISD::FADDS, X86ISD::FADDS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPMM, X86ISD::CMPMM_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPMM, X86ISD::CMPMM_SAE),
X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC,
X86ISD::FSETCCM, X86ISD::FSETCCM_SAE),
X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC,
@@ -882,12 +882,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
- X86_INTRINSIC_DATA(avx512_pternlog_d_128, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_pternlog_d_256, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_pternlog_d_512, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_pternlog_q_128, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_pternlog_q_256, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_pternlog_q_512, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_d_128, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_d_256, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_d_512, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_q_128, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_q_256, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_q_512, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
@@ -1098,7 +1098,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse41_round_sd, ROUNDS, X86ISD::VRNDSCALES, 0),
X86_INTRINSIC_DATA(sse41_round_ss, ROUNDS, X86ISD::VRNDSCALES, 0),
X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0),
- X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP, X86ISD::INSERTQI, 0),
+ X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP_IMM8, X86ISD::INSERTQI, 0),
X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
@@ -1108,8 +1108,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
X86_INTRINSIC_DATA(subborrow_32, ADX, X86ISD::SBB, X86ISD::SUB),
X86_INTRINSIC_DATA(subborrow_64, ADX, X86ISD::SBB, X86ISD::SUB),
- X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTR, 0),
- X86_INTRINSIC_DATA(tbm_bextri_u64, BEXTRI, X86ISD::BEXTR, 0),
+ X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTRI, 0),
+ X86_INTRINSIC_DATA(tbm_bextri_u64, BEXTRI, X86ISD::BEXTRI, 0),
X86_INTRINSIC_DATA(vcvtps2ph_128, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
X86_INTRINSIC_DATA(vcvtps2ph_256, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
@@ -1132,10 +1132,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(vgf2p8mulb_512, INTR_TYPE_2OP,
X86ISD::GF2P8MULB, 0),
- X86_INTRINSIC_DATA(xop_vpermil2pd, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
- X86_INTRINSIC_DATA(xop_vpermil2pd_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
- X86_INTRINSIC_DATA(xop_vpermil2ps, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
- X86_INTRINSIC_DATA(xop_vpermil2ps_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
+ X86_INTRINSIC_DATA(xop_vpermil2pd, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0),
+ X86_INTRINSIC_DATA(xop_vpermil2pd_256, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0),
+ X86_INTRINSIC_DATA(xop_vpermil2ps, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0),
+ X86_INTRINSIC_DATA(xop_vpermil2ps_256, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0),
X86_INTRINSIC_DATA(xop_vpperm, INTR_TYPE_3OP, X86ISD::VPPERM, 0),
X86_INTRINSIC_DATA(xop_vpshab, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
X86_INTRINSIC_DATA(xop_vpshad, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.cpp
index 84f560f2f9ee..1b371ac2a108 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.cpp
@@ -70,6 +70,11 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
setLegalizerInfoAVX512DQ();
setLegalizerInfoAVX512BW();
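+ // There is no GlobalISel lowering for roundeven here, so widen the value to
+ // at least s32 and fall back to a libcall (roundeven/roundevenf).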
+ getActionDefinitionsBuilder(G_INTRINSIC_ROUNDEVEN)
+ .scalarize(0)
+ .minScalar(0, LLT::scalar(32))
+ .libcall();
+
setLegalizeScalarToDifferentSizeStrategy(G_PHI, 0, widen_1);
for (unsigned BinOp : {G_SUB, G_MUL, G_AND, G_OR, G_XOR})
setLegalizeScalarToDifferentSizeStrategy(BinOp, 0, widen_1);
@@ -81,25 +86,14 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
setLegalizeScalarToDifferentSizeStrategy(
G_CONSTANT, 0, widenToLargerTypesAndNarrowToLargest);
+ getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall();
+
computeTables();
verify(*STI.getInstrInfo());
}
bool X86LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI) const {
- MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
- switch (MI.getIntrinsicID()) {
- case Intrinsic::memcpy:
- case Intrinsic::memset:
- case Intrinsic::memmove:
- if (createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI) ==
- LegalizerHelper::UnableToLegalize)
- return false;
- MI.eraseFromParent();
- return true;
- default:
- break;
- }
return true;
}
@@ -161,6 +155,11 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
.legalFor({{s8, s8}, {s16, s8}, {s32, s8}})
.clampScalar(0, s8, s32)
.clampScalar(1, s8, s8);
+
+ // Comparison
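+ // x86 SETcc materializes its result in an 8-bit register, so the comparison
+ // result type is clamped to s8 below.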
+ getActionDefinitionsBuilder(G_ICMP)
+ .legalForCartesianProduct({s8}, {s8, s16, s32, p0})
+ .clampScalar(0, s8, s8);
}
// Control-flow
@@ -179,12 +178,6 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
setAction({G_ANYEXT, s128}, Legal);
getActionDefinitionsBuilder(G_SEXT_INREG).lower();
- // Comparison
- setAction({G_ICMP, s1}, Legal);
-
- for (auto Ty : {s8, s16, s32, p0})
- setAction({G_ICMP, 1, Ty}, Legal);
-
// Merge/Unmerge
for (const auto &Ty : {s16, s32, s64}) {
setAction({G_MERGE_VALUES, Ty}, Legal);
@@ -253,7 +246,9 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
.widenScalarToNextPow2(1);
// Comparison
- setAction({G_ICMP, 1, s64}, Legal);
+ getActionDefinitionsBuilder(G_ICMP)
+ .legalForCartesianProduct({s8}, {s8, s16, s32, s64, p0})
+ .clampScalar(0, s8, s8);
getActionDefinitionsBuilder(G_FCMP)
.legalForCartesianProduct({s8}, {s32, s64})
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
index 50f8b3477acc..810fee052b5a 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
@@ -42,6 +42,7 @@
#include "X86TargetMachine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
@@ -104,9 +105,9 @@ static cl::opt<bool> EmitDotVerify(
cl::init(false), cl::Hidden);
static llvm::sys::DynamicLibrary OptimizeDL;
-typedef int (*OptimizeCutT)(unsigned int *nodes, unsigned int nodes_size,
- unsigned int *edges, int *edge_values,
- int *cut_edges /* out */, unsigned int edges_size);
+typedef int (*OptimizeCutT)(unsigned int *Nodes, unsigned int NodesSize,
+ unsigned int *Edges, int *EdgeValues,
+ int *CutEdges /* out */, unsigned int EdgesSize);
static OptimizeCutT OptimizeCut = nullptr;
namespace {
@@ -148,9 +149,10 @@ public:
private:
using GraphBuilder = ImmutableGraphBuilder<MachineGadgetGraph>;
+ using Edge = MachineGadgetGraph::Edge;
+ using Node = MachineGadgetGraph::Node;
using EdgeSet = MachineGadgetGraph::EdgeSet;
using NodeSet = MachineGadgetGraph::NodeSet;
- using Gadget = std::pair<MachineInstr *, MachineInstr *>;
const X86Subtarget *STI;
const TargetInstrInfo *TII;
@@ -162,15 +164,13 @@ private:
const MachineDominanceFrontier &MDF) const;
int hardenLoadsWithPlugin(MachineFunction &MF,
std::unique_ptr<MachineGadgetGraph> Graph) const;
- int hardenLoadsWithGreedyHeuristic(
- MachineFunction &MF, std::unique_ptr<MachineGadgetGraph> Graph) const;
+ int hardenLoadsWithHeuristic(MachineFunction &MF,
+ std::unique_ptr<MachineGadgetGraph> Graph) const;
int elimMitigatedEdgesAndNodes(MachineGadgetGraph &G,
EdgeSet &ElimEdges /* in, out */,
NodeSet &ElimNodes /* in, out */) const;
std::unique_ptr<MachineGadgetGraph>
trimMitigatedEdges(std::unique_ptr<MachineGadgetGraph> Graph) const;
- void findAndCutEdges(MachineGadgetGraph &G,
- EdgeSet &CutEdges /* out */) const;
int insertFences(MachineFunction &MF, MachineGadgetGraph &G,
EdgeSet &CutEdges /* in, out */) const;
bool instrUsesRegToAccessMemory(const MachineInstr &I, unsigned Reg) const;
@@ -198,7 +198,7 @@ struct DOTGraphTraits<MachineGadgetGraph *> : DefaultDOTGraphTraits {
using ChildIteratorType = typename Traits::ChildIteratorType;
using ChildEdgeIteratorType = typename Traits::ChildEdgeIteratorType;
- DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+ DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
std::string getNodeLabel(NodeRef Node, GraphType *) {
if (Node->getValue() == MachineGadgetGraph::ArgNodeSentinel)
@@ -243,7 +243,7 @@ void X86LoadValueInjectionLoadHardeningPass::getAnalysisUsage(
AU.setPreservesCFG();
}
-static void WriteGadgetGraph(raw_ostream &OS, MachineFunction &MF,
+static void writeGadgetGraph(raw_ostream &OS, MachineFunction &MF,
MachineGadgetGraph *G) {
WriteGraph(OS, G, /*ShortNames*/ false,
"Speculative gadgets for \"" + MF.getName() + "\" function");
@@ -279,7 +279,7 @@ bool X86LoadValueInjectionLoadHardeningPass::runOnMachineFunction(
return false; // didn't find any gadgets
if (EmitDotVerify) {
- WriteGadgetGraph(outs(), MF, Graph.get());
+ writeGadgetGraph(outs(), MF, Graph.get());
return false;
}
@@ -292,7 +292,7 @@ bool X86LoadValueInjectionLoadHardeningPass::runOnMachineFunction(
raw_fd_ostream FileOut(FileName, FileError);
if (FileError)
errs() << FileError.message();
- WriteGadgetGraph(FileOut, MF, Graph.get());
+ writeGadgetGraph(FileOut, MF, Graph.get());
FileOut.close();
LLVM_DEBUG(dbgs() << "Emitting gadget graph... Done\n");
if (EmitDotOnly)
@@ -313,7 +313,7 @@ bool X86LoadValueInjectionLoadHardeningPass::runOnMachineFunction(
}
FencesInserted = hardenLoadsWithPlugin(MF, std::move(Graph));
} else { // Use the default greedy heuristic
- FencesInserted = hardenLoadsWithGreedyHeuristic(MF, std::move(Graph));
+ FencesInserted = hardenLoadsWithHeuristic(MF, std::move(Graph));
}
if (FencesInserted > 0)
@@ -367,7 +367,7 @@ X86LoadValueInjectionLoadHardeningPass::getGadgetGraph(
// Use RDF to find all the uses of `Def`
rdf::NodeSet Uses;
- RegisterRef DefReg = DFG.getPRI().normalize(Def.Addr->getRegRef(DFG));
+ RegisterRef DefReg = Def.Addr->getRegRef(DFG);
for (auto UseID : L.getAllReachedUses(DefReg, Def)) {
auto Use = DFG.addr<UseNode *>(UseID);
if (Use.Addr->getFlags() & NodeAttrs::PhiRef) { // phi node
@@ -540,17 +540,17 @@ X86LoadValueInjectionLoadHardeningPass::getGadgetGraph(
// Returns the number of remaining gadget edges that could not be eliminated
int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes(
- MachineGadgetGraph &G, MachineGadgetGraph::EdgeSet &ElimEdges /* in, out */,
- MachineGadgetGraph::NodeSet &ElimNodes /* in, out */) const {
+ MachineGadgetGraph &G, EdgeSet &ElimEdges /* in, out */,
+ NodeSet &ElimNodes /* in, out */) const {
if (G.NumFences > 0) {
// Eliminate fences and CFG edges that ingress and egress the fence, as
// they are trivially mitigated.
- for (const auto &E : G.edges()) {
- const MachineGadgetGraph::Node *Dest = E.getDest();
+ for (const Edge &E : G.edges()) {
+ const Node *Dest = E.getDest();
if (isFence(Dest->getValue())) {
ElimNodes.insert(*Dest);
ElimEdges.insert(E);
- for (const auto &DE : Dest->edges())
+ for (const Edge &DE : Dest->edges())
ElimEdges.insert(DE);
}
}
@@ -558,29 +558,28 @@ int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes(
// Find and eliminate gadget edges that have been mitigated.
int MitigatedGadgets = 0, RemainingGadgets = 0;
- MachineGadgetGraph::NodeSet ReachableNodes{G};
- for (const auto &RootN : G.nodes()) {
+ NodeSet ReachableNodes{G};
+ for (const Node &RootN : G.nodes()) {
if (llvm::none_of(RootN.edges(), MachineGadgetGraph::isGadgetEdge))
continue; // skip this node if it isn't a gadget source
// Find all of the nodes that are CFG-reachable from RootN using DFS
ReachableNodes.clear();
- std::function<void(const MachineGadgetGraph::Node *, bool)>
- FindReachableNodes =
- [&](const MachineGadgetGraph::Node *N, bool FirstNode) {
- if (!FirstNode)
- ReachableNodes.insert(*N);
- for (const auto &E : N->edges()) {
- const MachineGadgetGraph::Node *Dest = E.getDest();
- if (MachineGadgetGraph::isCFGEdge(E) &&
- !ElimEdges.contains(E) && !ReachableNodes.contains(*Dest))
- FindReachableNodes(Dest, false);
- }
- };
+ std::function<void(const Node *, bool)> FindReachableNodes =
+ [&](const Node *N, bool FirstNode) {
+ if (!FirstNode)
+ ReachableNodes.insert(*N);
+ for (const Edge &E : N->edges()) {
+ const Node *Dest = E.getDest();
+ if (MachineGadgetGraph::isCFGEdge(E) && !ElimEdges.contains(E) &&
+ !ReachableNodes.contains(*Dest))
+ FindReachableNodes(Dest, false);
+ }
+ };
FindReachableNodes(&RootN, true);
// Any gadget whose sink is unreachable has been mitigated
- for (const auto &E : RootN.edges()) {
+ for (const Edge &E : RootN.edges()) {
if (MachineGadgetGraph::isGadgetEdge(E)) {
if (ReachableNodes.contains(*E.getDest())) {
// This gadget's sink is reachable
@@ -598,8 +597,8 @@ int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes(
std::unique_ptr<MachineGadgetGraph>
X86LoadValueInjectionLoadHardeningPass::trimMitigatedEdges(
std::unique_ptr<MachineGadgetGraph> Graph) const {
- MachineGadgetGraph::NodeSet ElimNodes{*Graph};
- MachineGadgetGraph::EdgeSet ElimEdges{*Graph};
+ NodeSet ElimNodes{*Graph};
+ EdgeSet ElimEdges{*Graph};
int RemainingGadgets =
elimMitigatedEdgesAndNodes(*Graph, ElimEdges, ElimNodes);
if (ElimEdges.empty() && ElimNodes.empty()) {
@@ -630,11 +629,11 @@ int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithPlugin(
auto Edges = std::make_unique<unsigned int[]>(Graph->edges_size());
auto EdgeCuts = std::make_unique<int[]>(Graph->edges_size());
auto EdgeValues = std::make_unique<int[]>(Graph->edges_size());
- for (const auto &N : Graph->nodes()) {
+ for (const Node &N : Graph->nodes()) {
Nodes[Graph->getNodeIndex(N)] = Graph->getEdgeIndex(*N.edges_begin());
}
Nodes[Graph->nodes_size()] = Graph->edges_size(); // terminator node
- for (const auto &E : Graph->edges()) {
+ for (const Edge &E : Graph->edges()) {
Edges[Graph->getEdgeIndex(E)] = Graph->getNodeIndex(*E.getDest());
EdgeValues[Graph->getEdgeIndex(E)] = E.getValue();
}
@@ -651,74 +650,67 @@ int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithPlugin(
LLVM_DEBUG(dbgs() << "Inserting LFENCEs... Done\n");
LLVM_DEBUG(dbgs() << "Inserted " << FencesInserted << " fences\n");
- Graph = GraphBuilder::trim(*Graph, MachineGadgetGraph::NodeSet{*Graph},
- CutEdges);
+ Graph = GraphBuilder::trim(*Graph, NodeSet{*Graph}, CutEdges);
} while (true);
return FencesInserted;
}
-int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithGreedyHeuristic(
+int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithHeuristic(
MachineFunction &MF, std::unique_ptr<MachineGadgetGraph> Graph) const {
- LLVM_DEBUG(dbgs() << "Eliminating mitigated paths...\n");
- Graph = trimMitigatedEdges(std::move(Graph));
- LLVM_DEBUG(dbgs() << "Eliminating mitigated paths... Done\n");
+ // If `MF` does not have any fences, then no gadgets would have been
+ // mitigated at this point.
+ if (Graph->NumFences > 0) {
+ LLVM_DEBUG(dbgs() << "Eliminating mitigated paths...\n");
+ Graph = trimMitigatedEdges(std::move(Graph));
+ LLVM_DEBUG(dbgs() << "Eliminating mitigated paths... Done\n");
+ }
+
if (Graph->NumGadgets == 0)
return 0;
LLVM_DEBUG(dbgs() << "Cutting edges...\n");
- MachineGadgetGraph::NodeSet ElimNodes{*Graph}, GadgetSinks{*Graph};
- MachineGadgetGraph::EdgeSet ElimEdges{*Graph}, CutEdges{*Graph};
- auto IsCFGEdge = [&ElimEdges, &CutEdges](const MachineGadgetGraph::Edge &E) {
- return !ElimEdges.contains(E) && !CutEdges.contains(E) &&
- MachineGadgetGraph::isCFGEdge(E);
- };
- auto IsGadgetEdge = [&ElimEdges,
- &CutEdges](const MachineGadgetGraph::Edge &E) {
- return !ElimEdges.contains(E) && !CutEdges.contains(E) &&
- MachineGadgetGraph::isGadgetEdge(E);
- };
-
- // FIXME: this is O(E^2), we could probably do better.
- do {
- // Find the cheapest CFG edge that will eliminate a gadget (by being
- // egress from a SOURCE node or ingress to a SINK node), and cut it.
- const MachineGadgetGraph::Edge *CheapestSoFar = nullptr;
-
- // First, collect all gadget source and sink nodes.
- MachineGadgetGraph::NodeSet GadgetSources{*Graph}, GadgetSinks{*Graph};
- for (const auto &N : Graph->nodes()) {
- if (ElimNodes.contains(N))
+ EdgeSet CutEdges{*Graph};
+
+ // Begin by collecting all ingress CFG edges for each node
+ DenseMap<const Node *, SmallVector<const Edge *, 2>> IngressEdgeMap;
+ for (const Edge &E : Graph->edges())
+ if (MachineGadgetGraph::isCFGEdge(E))
+ IngressEdgeMap[E.getDest()].push_back(&E);
+
+ // For each gadget edge, make cuts that guarantee the gadget will be
+ // mitigated. A computationally efficient way to achieve this is to either:
+ // (a) cut all egress CFG edges from the gadget source, or
+ // (b) cut all ingress CFG edges to the gadget sink.
+ //
+ // Moreover, the algorithm tries not to make a cut into a loop by preferring
+ // to make a (b)-type cut if the gadget source resides at a greater loop depth
+ // than the gadget sink, or an (a)-type cut otherwise.
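+ //
+ // Illustrative example (not drawn from a real function): for a gadget edge
+ // src->sink where the un-cut egress CFG edges of src have total weight 5 and
+ // the un-cut ingress CFG edges of sink have total weight 2, the ingress
+ // edges are cut; every CFG path into the sink then crosses a cut edge, and
+ // insertFences() later places an LFENCE for each cut edge.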
+ for (const Node &N : Graph->nodes()) {
+ for (const Edge &E : N.edges()) {
+ if (!MachineGadgetGraph::isGadgetEdge(E))
continue;
- for (const auto &E : N.edges()) {
- if (IsGadgetEdge(E)) {
- GadgetSources.insert(N);
- GadgetSinks.insert(*E.getDest());
- }
- }
- }
- // Next, look for the cheapest CFG edge which, when cut, is guaranteed to
- // mitigate at least one gadget by either:
- // (a) being egress from a gadget source, or
- // (b) being ingress to a gadget sink.
- for (const auto &N : Graph->nodes()) {
- if (ElimNodes.contains(N))
- continue;
- for (const auto &E : N.edges()) {
- if (IsCFGEdge(E)) {
- if (GadgetSources.contains(N) || GadgetSinks.contains(*E.getDest())) {
- if (!CheapestSoFar || E.getValue() < CheapestSoFar->getValue())
- CheapestSoFar = &E;
- }
- }
- }
+ SmallVector<const Edge *, 2> EgressEdges;
+ SmallVector<const Edge *, 2> &IngressEdges = IngressEdgeMap[E.getDest()];
+ for (const Edge &EgressEdge : N.edges())
+ if (MachineGadgetGraph::isCFGEdge(EgressEdge))
+ EgressEdges.push_back(&EgressEdge);
+
+ int EgressCutCost = 0, IngressCutCost = 0;
+ for (const Edge *EgressEdge : EgressEdges)
+ if (!CutEdges.contains(*EgressEdge))
+ EgressCutCost += EgressEdge->getValue();
+ for (const Edge *IngressEdge : IngressEdges)
+ if (!CutEdges.contains(*IngressEdge))
+ IngressCutCost += IngressEdge->getValue();
+
+ auto &EdgesToCut =
+ IngressCutCost < EgressCutCost ? IngressEdges : EgressEdges;
+ for (const Edge *E : EdgesToCut)
+ CutEdges.insert(*E);
}
-
- assert(CheapestSoFar && "Failed to cut an edge");
- CutEdges.insert(*CheapestSoFar);
- ElimEdges.insert(*CheapestSoFar);
- } while (elimMitigatedEdgesAndNodes(*Graph, ElimEdges, ElimNodes));
+ }
LLVM_DEBUG(dbgs() << "Cutting edges... Done\n");
LLVM_DEBUG(dbgs() << "Cut " << CutEdges.count() << " edges\n");
@@ -734,8 +726,8 @@ int X86LoadValueInjectionLoadHardeningPass::insertFences(
MachineFunction &MF, MachineGadgetGraph &G,
EdgeSet &CutEdges /* in, out */) const {
int FencesInserted = 0;
- for (const auto &N : G.nodes()) {
- for (const auto &E : N.edges()) {
+ for (const Node &N : G.nodes()) {
+ for (const Edge &E : N.edges()) {
if (CutEdges.contains(E)) {
MachineInstr *MI = N.getValue(), *Prev;
MachineBasicBlock *MBB; // Insert an LFENCE in this MBB
@@ -751,7 +743,7 @@ int X86LoadValueInjectionLoadHardeningPass::insertFences(
Prev = MI->getPrevNode();
// Remove all egress CFG edges from this branch because the inserted
// LFENCE prevents gadgets from crossing the branch.
- for (const auto &E : N.edges()) {
+ for (const Edge &E : N.edges()) {
if (MachineGadgetGraph::isCFGEdge(E))
CutEdges.insert(E);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
index 6e1134a25950..7b6276c1d87e 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
@@ -72,62 +72,39 @@ bool X86LoadValueInjectionRetHardeningPass::runOnMachineFunction(
++NumFunctionsConsidered;
const X86RegisterInfo *TRI = Subtarget->getRegisterInfo();
const X86InstrInfo *TII = Subtarget->getInstrInfo();
- unsigned ClobberReg = X86::NoRegister;
- std::bitset<X86::NUM_TARGET_REGS> UnclobberableGR64s;
- UnclobberableGR64s.set(X86::RSP); // can't clobber stack pointer
- UnclobberableGR64s.set(X86::RIP); // can't clobber instruction pointer
- UnclobberableGR64s.set(X86::RAX); // used for function return
- UnclobberableGR64s.set(X86::RDX); // used for function return
-
- // We can clobber any register allowed by the function's calling convention.
- for (const MCPhysReg *PR = TRI->getCalleeSavedRegs(&MF); auto Reg = *PR; ++PR)
- UnclobberableGR64s.set(Reg);
- for (auto &Reg : X86::GR64RegClass) {
- if (!UnclobberableGR64s.test(Reg)) {
- ClobberReg = Reg;
- break;
- }
- }
-
- if (ClobberReg != X86::NoRegister) {
- LLVM_DEBUG(dbgs() << "Selected register "
- << Subtarget->getRegisterInfo()->getRegAsmName(ClobberReg)
- << " to clobber\n");
- } else {
- LLVM_DEBUG(dbgs() << "Could not find a register to clobber\n");
- }
bool Modified = false;
for (auto &MBB : MF) {
- if (MBB.empty())
- continue;
-
- MachineInstr &MI = MBB.back();
- if (MI.getOpcode() != X86::RETQ)
- continue;
-
- if (ClobberReg != X86::NoRegister) {
- MBB.erase_instr(&MI);
- BuildMI(MBB, MBB.end(), DebugLoc(), TII->get(X86::POP64r))
- .addReg(ClobberReg, RegState::Define)
- .setMIFlag(MachineInstr::FrameDestroy);
- BuildMI(MBB, MBB.end(), DebugLoc(), TII->get(X86::LFENCE));
- BuildMI(MBB, MBB.end(), DebugLoc(), TII->get(X86::JMP64r))
- .addReg(ClobberReg);
- } else {
- // In case there is no available scratch register, we can still read from
- // RSP to assert that RSP points to a valid page. The write to RSP is
- // also helpful because it verifies that the stack's write permissions
- // are intact.
- MachineInstr *Fence = BuildMI(MBB, MI, DebugLoc(), TII->get(X86::LFENCE));
- addRegOffset(BuildMI(MBB, Fence, DebugLoc(), TII->get(X86::SHL64mi)),
- X86::RSP, false, 0)
- .addImm(0)
- ->addRegisterDead(X86::EFLAGS, TRI);
+ for (auto MBBI = MBB.begin(); MBBI != MBB.end(); ++MBBI) {
+ if (MBBI->getOpcode() != X86::RETQ)
+ continue;
+
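+ // When a dead caller-saved register is available, the return is rewritten
+ // as 'pop <reg>; lfence; jmp *<reg>', so the loaded return address is
+ // fenced before it can be used speculatively.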
+ unsigned ClobberReg = TRI->findDeadCallerSavedReg(MBB, MBBI);
+ if (ClobberReg != X86::NoRegister) {
+ BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::POP64r))
+ .addReg(ClobberReg, RegState::Define)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::LFENCE));
+ BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::JMP64r))
+ .addReg(ClobberReg);
+ MBB.erase(MBBI);
+ } else {
+ // In case there is no available scratch register, we can still read
+ // from RSP to assert that RSP points to a valid page. The write to RSP
+ // is also helpful because it verifies that the stack's write
+ // permissions are intact.
+ MachineInstr *Fence =
+ BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::LFENCE));
+ addRegOffset(BuildMI(MBB, Fence, DebugLoc(), TII->get(X86::SHL64mi)),
+ X86::RSP, false, 0)
+ .addImm(0)
+ ->addRegisterDead(X86::EFLAGS, TRI);
+ }
+
+ ++NumFences;
+ Modified = true;
+ break;
}
-
- ++NumFences;
- Modified = true;
}
if (Modified)
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86LowerAMXType.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86LowerAMXType.cpp
new file mode 100644
index 000000000000..85166decd8cd
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -0,0 +1,351 @@
+//===- X86LowerAMXType.cpp - Lower AMX type for load/store ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Pass to transform <256 x i32> load/store
+/// On X86, <256 x i32> is bitcast to x86_amx, and the AMX instruction set
+/// only provides simple operations on x86_amx; basic elementwise operations
+/// are not supported by AMX. Since x86_amx is bitcast from vector <256 x i32>
+/// and only AMX intrinsics can operate on the type, we need to transform
+/// load/store of <256 x i32> into AMX load/store intrinsics. If the bitcast
+/// cannot be combined with its load/store, we transform the bitcast into an
+/// AMX load/store plus a <256 x i32> store/load.
+//
+//===----------------------------------------------------------------------===//
+//
+#include "X86.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "lower-amx-type"
+
+static AllocaInst *CreateAllocaInst(IRBuilder<> &Builder, BasicBlock *BB) {
+ Function &F = *BB->getParent();
+ Module *M = BB->getModule();
+ const DataLayout &DL = M->getDataLayout();
+
+ Type *V256I32Ty = VectorType::get(Builder.getInt32Ty(), 256, false);
+ LLVMContext &Ctx = Builder.getContext();
+ auto AllocaAlignment = DL.getPrefTypeAlign(Type::getX86_AMXTy(Ctx));
+ unsigned AllocaAS = DL.getAllocaAddrSpace();
+ AllocaInst *AllocaRes =
+ new AllocaInst(V256I32Ty, AllocaAS, "", &F.getEntryBlock().front());
+ AllocaRes->setAlignment(AllocaAlignment);
+ return AllocaRes;
+}
+
+static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
+ Value *Row = nullptr, *Col = nullptr;
+ switch (II->getIntrinsicID()) {
+ default:
+ llvm_unreachable("Expect amx intrinsics");
+ case Intrinsic::x86_tileloadd64_internal:
+ case Intrinsic::x86_tilestored64_internal: {
+ Row = II->getArgOperand(0);
+ Col = II->getArgOperand(1);
+ break;
+ }
+ // a * b + c
+ // The shape depends on which operand.
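+ // For tdpbssd (c = a * b + c), the accumulator passed as operand 3 has shape
+ // (arg0, arg1), the a operand (operand 4) has shape (arg0, arg2), and the
+ // b operand (operand 5) has shape (arg2, arg1).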
+ case Intrinsic::x86_tdpbssd_internal: {
+ switch (OpNo) {
+ case 3:
+ Row = II->getArgOperand(0);
+ Col = II->getArgOperand(1);
+ break;
+ case 4:
+ Row = II->getArgOperand(0);
+ Col = II->getArgOperand(2);
+ break;
+ case 5:
+ Row = II->getArgOperand(2);
+ Col = II->getArgOperand(1);
+ break;
+ }
+ break;
+ }
+ }
+
+ return std::make_pair(Row, Col);
+}
+
+// %src = load <256 x i32>, <256 x i32>* %addr, align 64
+// %2 = bitcast <256 x i32> %src to x86_amx
+// -->
+// %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
+// i8* %addr, i64 %stride64)
+static void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
+ Value *Row = nullptr, *Col = nullptr;
+ Use &U = *(Bitcast->use_begin());
+ unsigned OpNo = U.getOperandNo();
+ auto *II = cast<IntrinsicInst>(U.getUser());
+ std::tie(Row, Col) = getShape(II, OpNo);
+ IRBuilder<> Builder(Bitcast);
+ // Use the maximum column as stride.
+ Value *Stride = Builder.getInt64(64);
+ Value *I8Ptr =
+ Builder.CreateBitCast(LD->getOperand(0), Builder.getInt8PtrTy());
+ std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};
+
+ Value *NewInst =
+ Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, None, Args);
+ Bitcast->replaceAllUsesWith(NewInst);
+}
+
+// %src = call x86_amx @llvm.x86.tileloadd64.internal(%row, %col, %addr,
+// %stride);
+// %13 = bitcast x86_amx %src to <256 x i32>
+// store <256 x i32> %13, <256 x i32>* %addr, align 64
+// -->
+// call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
+// %stride64, %13)
+static void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST) {
+
+ Value *Tile = Bitcast->getOperand(0);
+ auto *II = cast<IntrinsicInst>(Tile);
+ // The tile is the output of an AMX intrinsic; the intrinsic's first operand
+ // is the row and its second operand is the column.
+ Value *Row = II->getOperand(0);
+ Value *Col = II->getOperand(1);
+ IRBuilder<> Builder(ST);
+ // Use the maximum column as stride. It must match the stride used for the
+ // load.
+ Value *Stride = Builder.getInt64(64);
+ Value *I8Ptr =
+ Builder.CreateBitCast(ST->getOperand(1), Builder.getInt8PtrTy());
+ std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Tile};
+ Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
+ if (Bitcast->hasOneUse())
+ return;
+ // %13 = bitcast x86_amx %src to <256 x i32>
+ // store <256 x i32> %13, <256 x i32>* %addr, align 64
+ // %add = <256 x i32> %13, <256 x i32> %src2
+ // -->
+ // %13 = bitcast x86_amx %src to <256 x i32>
+ // call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
+ // %stride64, %13)
+ // %14 = load <256 x i32>, %addr
+ // %add = <256 x i32> %14, <256 x i32> %src2
+ Value *Vec = Builder.CreateLoad(Bitcast->getType(), ST->getOperand(1));
+ Bitcast->replaceAllUsesWith(Vec);
+}
+
+// Transform the bitcast into <store, load> instructions.
+static bool transformBitcast(BitCastInst *Bitcast) {
+ IRBuilder<> Builder(Bitcast);
+ AllocaInst *AllocaAddr;
+ Value *I8Ptr, *Stride;
+ auto *Src = Bitcast->getOperand(0);
+
+ auto Prepare = [&]() {
+ AllocaAddr = CreateAllocaInst(Builder, Bitcast->getParent());
+ I8Ptr = Builder.CreateBitCast(AllocaAddr, Builder.getInt8PtrTy());
+ Stride = Builder.getInt64(64);
+ };
+
+ if (Bitcast->getType()->isX86_AMXTy()) {
+ // %2 = bitcast <256 x i32> %src to x86_amx
+ // -->
+ // %addr = alloca <256 x i32>, align 64
+ // store <256 x i32> %src, <256 x i32>* %addr, align 64
+ // %addr2 = bitcast <256 x i32>* to i8*
+ // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
+ // i8* %addr2,
+ // i64 64)
+ Use &U = *(Bitcast->use_begin());
+ unsigned OpNo = U.getOperandNo();
+ auto *II = dyn_cast<IntrinsicInst>(U.getUser());
+ if (!II)
+ return false; // May be a bitcast from x86_amx to <256 x i32>.
+ Prepare();
+ Builder.CreateStore(Src, AllocaAddr);
+ // TODO: we can pick a constant operand for the shape.
+ Value *Row = nullptr, *Col = nullptr;
+ std::tie(Row, Col) = getShape(II, OpNo);
+ std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};
+ Value *NewInst = Builder.CreateIntrinsic(
+ Intrinsic::x86_tileloadd64_internal, None, Args);
+ Bitcast->replaceAllUsesWith(NewInst);
+ } else {
+ // %2 = bitcast x86_amx %src to <256 x i32>
+ // -->
+ // %addr = alloca <256 x i32>, align 64
+ // %addr2 = bitcast <256 x i32>* to i8*
+ // call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col,
+ // i8* %addr2, i64 %stride)
+ // %2 = load <256 x i32>, <256 x i32>* %addr, align 64
+ auto *II = dyn_cast<IntrinsicInst>(Src);
+ if (!II)
+ return false; // May be a bitcast from <256 x i32> to x86_amx.
+ Prepare();
+ Value *Row = II->getOperand(0);
+ Value *Col = II->getOperand(1);
+ std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Src};
+ Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
+ Value *NewInst = Builder.CreateLoad(Bitcast->getType(), AllocaAddr);
+ Bitcast->replaceAllUsesWith(NewInst);
+ }
+
+ return true;
+}
+
+namespace {
+class X86LowerAMXType {
+ Function &Func;
+
+public:
+ X86LowerAMXType(Function &F) : Func(F) {}
+ bool visit();
+};
+
+bool X86LowerAMXType::visit() {
+ SmallVector<Instruction *, 8> DeadInsts;
+
+ for (BasicBlock *BB : post_order(&Func)) {
+ for (BasicBlock::reverse_iterator II = BB->rbegin(), IE = BB->rend();
+ II != IE;) {
+ Instruction &Inst = *II++;
+ auto *Bitcast = dyn_cast<BitCastInst>(&Inst);
+ if (!Bitcast)
+ continue;
+
+ Value *Src = Bitcast->getOperand(0);
+ if (Bitcast->getType()->isX86_AMXTy()) {
+ if (Bitcast->user_empty()) {
+ DeadInsts.push_back(Bitcast);
+ continue;
+ }
+ LoadInst *LD = dyn_cast<LoadInst>(Src);
+ if (!LD) {
+ if (transformBitcast(Bitcast))
+ DeadInsts.push_back(Bitcast);
+ continue;
+ }
+ // If the load has multiple users, duplicate it as an AMX tile load.
+ // %src = load <256 x i32>, <256 x i32>* %addr, align 64
+ // %2 = bitcast <256 x i32> %src to x86_amx
+ // %add = add <256 x i32> %src, <256 x i32> %src2
+ // -->
+ // %src = load <256 x i32>, <256 x i32>* %addr, align 64
+ // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
+ // i8* %addr, i64 %stride64)
+ // %add = add <256 x i32> %src, <256 x i32> %src2
+
+ // If the load has a single user, it will be eliminated in DAG ISel.
+ // %src = load <256 x i32>, <256 x i32>* %addr, align 64
+ // %2 = bitcast <256 x i32> %src to x86_amx
+ // -->
+ // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
+ // i8* %addr, i64 %stride64)
+ combineLoadBitcast(LD, Bitcast);
+ DeadInsts.push_back(Bitcast);
+ if (LD->hasOneUse())
+ DeadInsts.push_back(LD);
+ } else if (Src->getType()->isX86_AMXTy()) {
+ if (Bitcast->user_empty()) {
+ DeadInsts.push_back(Bitcast);
+ continue;
+ }
+ StoreInst *ST = nullptr;
+ for (auto UI = Bitcast->use_begin(), UE = Bitcast->use_end();
+ UI != UE;) {
+ Value *I = (UI++)->getUser();
+ ST = dyn_cast<StoreInst>(I);
+ if (ST)
+ break;
+ }
+ if (!ST) {
+ if (transformBitcast(Bitcast))
+ DeadInsts.push_back(Bitcast);
+ continue;
+ }
+ // If the bitcast (%13) has one use, combine the bitcast and the store into
+ // an AMX store.
+ // %src = call x86_amx @llvm.x86.tileloadd64.internal(%row, %col, %addr,
+ // %stride);
+ // %13 = bitcast x86_amx %src to <256 x i32>
+ // store <256 x i32> %13, <256 x i32>* %addr, align 64
+ // -->
+ // call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
+ // %stride64, %13)
+ //
+ // If the bitcast (%13) has multiple uses, transform it as below.
+ // %13 = bitcast x86_amx %src to <256 x i32>
+ // store <256 x i32> %13, <256 x i32>* %addr, align 64
+ // %add = <256 x i32> %13, <256 x i32> %src2
+ // -->
+ // %13 = bitcast x86_amx %src to <256 x i32>
+ // call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
+ // %stride64, %13)
+ // %14 = load <256 x i32>, %addr
+ // %add = <256 x i32> %14, <256 x i32> %src2
+ //
+ combineBitcastStore(Bitcast, ST);
+ // Delete user first.
+ DeadInsts.push_back(ST);
+ DeadInsts.push_back(Bitcast);
+ }
+ }
+ }
+
+ bool C = !DeadInsts.empty();
+
+ for (auto *Inst : DeadInsts)
+ Inst->eraseFromParent();
+
+ return C;
+}
+} // anonymous namespace
+
+namespace {
+
+class X86LowerAMXTypeLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ X86LowerAMXTypeLegacyPass() : FunctionPass(ID) {
+ initializeX86LowerAMXTypeLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ X86LowerAMXType LAT(F);
+ bool C = LAT.visit();
+ return C;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+};
+
+} // anonymous namespace
+
+static const char PassName[] = "Lower AMX type for load/store";
+char X86LowerAMXTypeLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
+ false)
+INITIALIZE_PASS_END(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
+ false)
+
+FunctionPass *llvm::createX86LowerAMXTypePass() {
+ return new X86LowerAMXTypeLegacyPass();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp
index 9ce2a4637e2e..89fa3ae3a3f4 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -977,20 +977,24 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
const MachineInstr &MI) {
NoAutoPaddingScope NoPadScope(*OutStreamer);
- bool Is64Bits = MI.getOpcode() == X86::TLS_addr64 ||
- MI.getOpcode() == X86::TLS_base_addr64;
+ bool Is64Bits = MI.getOpcode() != X86::TLS_addr32 &&
+ MI.getOpcode() != X86::TLS_base_addr32;
+ bool Is64BitsLP64 = MI.getOpcode() == X86::TLS_addr64 ||
+ MI.getOpcode() == X86::TLS_base_addr64;
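+ // Only the LP64 general-dynamic sequence gets the data16 padding emitted
+ // below; that padding is what lets the linker relax the TLS sequence, and
+ // the X32 variants use the shorter unpadded form.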
MCContext &Ctx = OutStreamer->getContext();
MCSymbolRefExpr::VariantKind SRVK;
switch (MI.getOpcode()) {
case X86::TLS_addr32:
case X86::TLS_addr64:
+ case X86::TLS_addrX32:
SRVK = MCSymbolRefExpr::VK_TLSGD;
break;
case X86::TLS_base_addr32:
SRVK = MCSymbolRefExpr::VK_TLSLDM;
break;
case X86::TLS_base_addr64:
+ case X86::TLS_base_addrX32:
SRVK = MCSymbolRefExpr::VK_TLSLD;
break;
default:
@@ -1010,7 +1014,7 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
if (Is64Bits) {
bool NeedsPadding = SRVK == MCSymbolRefExpr::VK_TLSGD;
- if (NeedsPadding)
+ if (NeedsPadding && Is64BitsLP64)
EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
EmitAndCountInstruction(MCInstBuilder(X86::LEA64r)
.addReg(X86::RDI)
@@ -1079,29 +1083,30 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
}
}
-/// Return the longest nop which can be efficiently decoded for the given
-/// target cpu. 15-bytes is the longest single NOP instruction, but some
-/// platforms can't decode the longest forms efficiently.
-static unsigned maxLongNopLength(const X86Subtarget *Subtarget) {
- if (Subtarget->getFeatureBits()[X86::ProcIntelSLM])
- return 7;
- if (Subtarget->getFeatureBits()[X86::FeatureFast15ByteNOP])
- return 15;
- if (Subtarget->getFeatureBits()[X86::FeatureFast11ByteNOP])
- return 11;
- if (Subtarget->getFeatureBits()[X86::FeatureNOPL] || Subtarget->is64Bit())
- return 10;
- if (Subtarget->is32Bit())
- return 2;
- return 1;
-}
-
/// Emit the largest nop instruction smaller than or equal to \p NumBytes
/// bytes. Return the size of nop emitted.
static unsigned emitNop(MCStreamer &OS, unsigned NumBytes,
const X86Subtarget *Subtarget) {
+ // Determine the longest nop which can be efficiently decoded for the given
+ // target cpu. 15-bytes is the longest single NOP instruction, but some
+ // platforms can't decode the longest forms efficiently.
+ unsigned MaxNopLength = 1;
+ if (Subtarget->is64Bit()) {
+ // FIXME: We can use NOOPL on 32-bit targets with FeatureNOPL, but the
+ // IndexReg/BaseReg below need to be updated.
+ if (Subtarget->hasFeature(X86::FeatureFast7ByteNOP))
+ MaxNopLength = 7;
+ else if (Subtarget->hasFeature(X86::FeatureFast15ByteNOP))
+ MaxNopLength = 15;
+ else if (Subtarget->hasFeature(X86::FeatureFast11ByteNOP))
+ MaxNopLength = 11;
+ else
+ MaxNopLength = 10;
+ } if (Subtarget->is32Bit())
+ MaxNopLength = 2;
+
// Cap a single nop emission at the profitable value for the target
- NumBytes = std::min(NumBytes, maxLongNopLength(Subtarget));
+ NumBytes = std::min(NumBytes, MaxNopLength);
unsigned NopSize;
unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg;
@@ -1329,7 +1334,7 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
MCInst MCI;
MCI.setOpcode(Opcode);
- for (auto &MO : make_range(MI.operands_begin() + 2, MI.operands_end()))
+ for (auto &MO : drop_begin(MI.operands(), 2))
if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
MCI.addOperand(MaybeOperand.getValue());
@@ -1705,7 +1710,7 @@ void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
unsigned OpCode = MI.getOperand(0).getImm();
MCInst Ret;
Ret.setOpcode(OpCode);
- for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end()))
+ for (auto &MO : drop_begin(MI.operands()))
if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
Ret.addOperand(MaybeOperand.getValue());
OutStreamer->emitInstruction(Ret, getSubtargetInfo());
@@ -1744,7 +1749,7 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
// Before emitting the instruction, add a comment to indicate that this is
// indeed a tail call.
OutStreamer->AddComment("TAILCALL");
- for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end()))
+ for (auto &MO : drop_begin(MI.operands()))
if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
TC.addOperand(MaybeOperand.getValue());
OutStreamer->emitInstruction(TC, getSubtargetInfo());
@@ -1779,10 +1784,7 @@ static const Constant *getConstantFromPool(const MachineInstr &MI,
if (ConstantEntry.isMachineConstantPoolEntry())
return nullptr;
- const Constant *C = ConstantEntry.Val.ConstVal;
- assert((!C || ConstantEntry.getType() == C->getType()) &&
- "Expected a constant of the same type!");
- return C;
+ return ConstantEntry.Val.ConstVal;
}
static std::string getShuffleComment(const MachineInstr *MI, unsigned SrcOp1Idx,
@@ -2444,8 +2446,10 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
case X86::TLS_addr32:
case X86::TLS_addr64:
+ case X86::TLS_addrX32:
case X86::TLS_base_addr32:
case X86::TLS_base_addr64:
+ case X86::TLS_base_addrX32:
return LowerTlsAddr(MCInstLowering, *MI);
case X86::MOVPC32r: {
@@ -2594,6 +2598,15 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
}
return;
}
+ case X86::UBSAN_UD1:
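+ // Lower to 'ud1l imm(%eax), %eax': the UBSan check kind from operand 0 is
+ // encoded in the displacement so a trap handler can recover it.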
+ EmitAndCountInstruction(MCInstBuilder(X86::UD1Lm)
+ .addReg(X86::EAX)
+ .addReg(X86::EAX)
+ .addImm(1)
+ .addReg(X86::NoRegister)
+ .addImm(MI->getOperand(0).getImm())
+ .addReg(X86::NoRegister));
+ return;
}
MCInst TmpInst;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86PartialReduction.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86PartialReduction.cpp
index 8784a3df1773..babd923e7496 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86PartialReduction.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86PartialReduction.cpp
@@ -392,8 +392,7 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) {
break;
// Push incoming values to the worklist.
- for (Value *InV : PN->incoming_values())
- Worklist.push_back(InV);
+ append_range(Worklist, PN->incoming_values());
continue;
}
@@ -402,8 +401,7 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) {
if (BO->getOpcode() == Instruction::Add) {
// Simple case. Single use, just push its operands to the worklist.
if (BO->hasNUses(BO == Root ? 2 : 1)) {
- for (Value *Op : BO->operands())
- Worklist.push_back(Op);
+ append_range(Worklist, BO->operands());
continue;
}
@@ -426,8 +424,7 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) {
continue;
// The phi forms a loop with this Add, push its operands.
- for (Value *Op : BO->operands())
- Worklist.push_back(Op);
+ append_range(Worklist, BO->operands());
}
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86PreTileConfig.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86PreTileConfig.cpp
new file mode 100644
index 000000000000..05ee6c6c8384
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -0,0 +1,265 @@
+//===-- X86PreTileConfig.cpp - Tile Register Configure --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Pass to pre-configure the shapes of AMX registers
+/// AMX registers need to be configured before use. The shape of an AMX
+/// register is encoded in the 1st and 2nd machine operands of the AMX pseudo
+/// instructions. The pldtilecfg instruction configures the tile registers and
+/// should dominate all AMX instructions. pldtilecfg produces a virtual cfg
+/// register, and that cfg register is used by all AMX instructions.
+/// This pass finds the common dominator of all AMX instructions and inserts
+/// the pldtilecfg instruction there. In addition, the cfg register produced
+/// by pldtilecfg is appended as the last operand of each AMX instruction. We
+/// use this scheme to model the def-use relationship between the AMX config
+/// instruction and the other AMX instructions. Below is an example.
+///
+/// ----B1----
+/// / \
+/// / \
+/// B2 B3
+/// %1:tile = PTILELOADDV %2:tile = PTILELOADDV
+///
+/// is transformed to
+///
+/// B1
+/// %25:tilecfg = PLDTILECFG
+/// / \
+/// / \
+/// %1:tile = PTILELOADDV %25 %2:tile = PTILELOADDV %25
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TileShapeInfo.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "tile-pre-config"
+
+namespace {
+
+class X86PreTileConfig : public MachineFunctionPass {
+ // context
+ MachineFunction *MF = nullptr;
+ const X86Subtarget *ST = nullptr;
+ const TargetRegisterInfo *TRI;
+ const TargetInstrInfo *TII;
+ MachineDominatorTree *DomTree = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+
+ MachineInstr *getTileConfigPoint();
+
+public:
+ X86PreTileConfig() : MachineFunctionPass(ID) {}
+
+ /// Return the pass name.
+ StringRef getPassName() const override {
+ return "Tile Register Pre-configure";
+ }
+
+ /// X86PreTileConfig analysis usage.
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ /// Insert the tile configuration for this function.
+ bool runOnMachineFunction(MachineFunction &mf) override;
+
+ static char ID;
+};
+
+} // end anonymous namespace
+
+char X86PreTileConfig::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig",
+ "Tile Register Configure", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig",
+ "Tile Register Configure", false, false)
+
+void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
+ const TargetInstrInfo *TII,
+ MachineRegisterInfo *MRI,
+ const X86Subtarget *ST) {
+ auto *MBB = MI->getParent();
+
+ // FIXME: AMX should assume AVX512 enabled.
+ if (ST->hasAVX512()) {
+ // Zero stack slot.
+ Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass);
+ BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORDZrr), Zmm)
+ .addReg(Zmm, RegState::Undef)
+ .addReg(Zmm, RegState::Undef);
+ addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSZmr)),
+ FrameIdx)
+ .addReg(Zmm);
+ }
+
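+ // The frame slot holds the 64-byte configuration read by ldtilecfg. It is
+ // zero-initialized above; a later tile-configuration pass is expected to
+ // store the actual palette and per-tile rows/columns into the slot.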
+ // Build the pseudo ldtilecfg.
+ Register VReg = MRI->createVirtualRegister(&X86::TILECFGRegClass);
+
+ addFrameReference(
+ BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PLDTILECFG), VReg), FrameIdx);
+
+ return VReg;
+}
+
+static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Unexpected machine instruction on tile");
+ case X86::PTILELOADDV:
+ case X86::PTDPBSSDV:
+ case X86::PTILEZEROV:
+ MachineOperand &MO1 = const_cast<MachineOperand &>(MI.getOperand(1));
+ MachineOperand &MO2 = const_cast<MachineOperand &>(MI.getOperand(2));
+ ShapeT Shape(&MO1, &MO2, MRI);
+ return Shape;
+ }
+}
+
+MachineInstr *X86PreTileConfig::getTileConfigPoint() {
+ DenseMap<Register, ShapeT> PhysShapeInfo;
+ MachineBasicBlock *MBB = nullptr;
+ DenseSet<const MachineInstr *> MIs;
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ Register VirtReg = Register::index2VirtReg(i);
+ if (MRI->reg_nodbg_empty(VirtReg))
+ continue;
+ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+ if (RC.getID() != X86::TILERegClassID)
+ continue;
+
+ // Find the common dominator of all MIs that define a tile register.
+ for (const MachineOperand &MO : MRI->def_operands(VirtReg)) {
+ if (MO.isUndef())
+ continue;
+ const auto *MI = MO.getParent();
+ // Skip PHI or IMPLICIT_DEF instructions; there must be a real input tile
+ // definition before the PHI instruction.
+ if (MI->isTransient())
+ continue;
+ if (!MBB)
+ MBB = const_cast<MachineBasicBlock *>(MI->getParent());
+ MBB = DomTree->findNearestCommonDominator(
+ MBB, const_cast<MachineBasicBlock *>(MI->getParent()));
+
+ // Collect the instructions that define shape.
+ ShapeT Shape = getShape(*MI, MRI);
+ std::array<MachineOperand *, 2> ShapeMOs = {Shape.getRow(),
+ Shape.getCol()};
+ for (auto *ShapeMO : ShapeMOs) {
+ Register ShapeReg = ShapeMO->getReg();
+ for (const MachineOperand &MO : MRI->def_operands(ShapeReg)) {
+ const auto *ShapeMI = MO.getParent();
+ MIs.insert(ShapeMI);
+ }
+ }
+ }
+ }
+ if (!MBB)
+ return nullptr;
+ // This pass runs before PHI elimination, so the function is still in SSA
+ // form.
+ assert(MRI->isSSA() && "Not SSA form in pre-tile config");
+ // Shape def should dominate tile config MBB.
+ // def s s1 s2
+ // / \ \ /
+ // / \ \ /
+ // conf s3=phi(s1,s2)
+ // |
+ // c
+ //
+ for (const auto *MI : MIs) {
+ const MachineBasicBlock *ShapeMBB = MI->getParent();
+ if (DomTree->dominates(ShapeMBB, MBB))
+ continue;
+ if (MI->isMoveImmediate())
+ continue;
+ report_fatal_error(MF->getName() + ": Failed to config tile register, "
+ "please define the shape earlier");
+ }
+
+ // ldtilecfg should be inserted after the MIs that define the shape.
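+ // Scan the block backwards for the last shape-defining instruction that is
+ // not a move-immediate (immediates are not required to dominate the config
+ // point) and insert right after it; if none is found, fall back to the
+ // first non-PHI position in the block.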
+ MachineBasicBlock::reverse_instr_iterator I, E;
+ for (I = MBB->instr_rbegin(), E = MBB->instr_rend(); I != E; ++I) {
+ auto *MI = &*I;
+ if (MIs.count(MI) && (!MI->isMoveImmediate()))
+ break;
+ }
+ MachineBasicBlock::iterator MII;
+ if (I == E)
+ MII = MBB->getFirstNonPHI();
+ else {
+ MII = MachineBasicBlock::iterator(&*I);
+ MII++;
+ }
+ return &*MII;
+}
+
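+// Rewrite each AMX pseudo so that its trailing operand becomes a use of the
+// tile-config virtual register, making the dependence on PLDTILECFG explicit.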
+static void addTileCFGUse(MachineFunction &MF, Register CFG) {
+ for (MachineBasicBlock &MBB : MF) {
+
+ // Traverse the basic block.
+ for (MachineInstr &MI : MBB) {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ default:
+ break;
+ case X86::PTILELOADDV:
+ case X86::PTILESTOREDV:
+ case X86::PTDPBSSDV:
+ case X86::PTILEZEROV:
+ unsigned NumOperands = MI.getNumOperands();
+ MI.RemoveOperand(NumOperands - 1);
+ MI.addOperand(MF, MachineOperand::CreateReg(CFG, false));
+ break;
+ }
+ }
+ }
+}
+
+bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) {
+ MF = &mf;
+ MRI = &mf.getRegInfo();
+ ST = &mf.getSubtarget<X86Subtarget>();
+ TRI = ST->getRegisterInfo();
+ TII = mf.getSubtarget().getInstrInfo();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+
+ MachineInstr *MI = getTileConfigPoint();
+ if (!MI)
+ return false;
+ unsigned Size = ST->getTileConfigSize();
+ Align Alignment = ST->getTileConfigAlignment();
+ int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false);
+ Register CFG = buildConfigMI(MI, SS, TII, MRI, ST);
+ addTileCFGUse(mf, CFG);
+ return true;
+}
+
+FunctionPass *llvm::createX86PreTileConfigPass() {
+ return new X86PreTileConfig();
+}
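The placement rule implemented by getTileConfigPoint() above reduces to folding every tile-defining block through the dominator tree and then dropping the pseudo ldtilecfg after the last shape-defining instruction in that block. Below is a minimal sketch of the dominator-folding step, written against the public MachineDominatorTree API and assuming the caller has already gathered the tile-defining instructions; it is illustrative only, not part of the patch.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

// Return a block that dominates every tile def, i.e. a legal home for the
// single ldtilecfg, or nullptr when the function defines no tiles.
static MachineBasicBlock *
findTileConfigBlock(ArrayRef<MachineInstr *> TileDefs,
                    MachineDominatorTree &DomTree) {
  MachineBasicBlock *MBB = nullptr;
  for (MachineInstr *MI : TileDefs) {
    MachineBasicBlock *DefMBB = MI->getParent();
    // The first def seeds the result; each later def pulls it up the tree.
    MBB = MBB ? DomTree.findNearestCommonDominator(MBB, DefMBB) : DefMBB;
  }
  return MBB;
}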
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.cpp
index f456728cf47b..d90b4e7bdc7e 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -18,6 +18,8 @@
#include "X86Subtarget.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -664,13 +666,6 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
return true;
}
-bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
- Register Reg, int &FrameIdx) const {
- // Since X86 defines assignCalleeSavedSpillSlots which always return true
- // this function neither used nor tested.
- llvm_unreachable("Unused function on X86. Otherwise need a test case.");
-}
-
// tryOptimizeLEAtoMOV - helper function that tries to replace a LEA instruction
// of the form 'lea (%esp), %ebx' --> 'mov %esp, %ebx'.
// TODO: In this case we should be really trying first to entirely eliminate
@@ -731,11 +726,12 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
assert((!needsStackRealignment(MF) ||
MF.getFrameInfo().isFixedObjectIndex(FrameIndex)) &&
"Return instruction can only reference SP relative frame objects");
- FIOffset = TFI->getFrameIndexReferenceSP(MF, FrameIndex, BasePtr, 0);
+ FIOffset =
+ TFI->getFrameIndexReferenceSP(MF, FrameIndex, BasePtr, 0).getFixed();
} else if (TFI->Is64Bit && (MBB.isEHFuncletEntry() || IsEHFuncletEpilogue)) {
FIOffset = TFI->getWin64EHFrameIndexRef(MF, FrameIndex, BasePtr);
} else {
- FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, BasePtr);
+ FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, BasePtr).getFixed();
}
// LOCAL_ESCAPE uses a single offset, with no register. It only works in the
@@ -790,6 +786,55 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
}
+unsigned X86RegisterInfo::findDeadCallerSavedReg(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI) const {
+ const MachineFunction *MF = MBB.getParent();
+ if (MF->callsEHReturn())
+ return 0;
+
+ const TargetRegisterClass &AvailableRegs = *getGPRsForTailCall(*MF);
+
+ if (MBBI == MBB.end())
+ return 0;
+
+ switch (MBBI->getOpcode()) {
+ default:
+ return 0;
+ case TargetOpcode::PATCHABLE_RET:
+ case X86::RET:
+ case X86::RETL:
+ case X86::RETQ:
+ case X86::RETIL:
+ case X86::RETIQ:
+ case X86::TCRETURNdi:
+ case X86::TCRETURNri:
+ case X86::TCRETURNmi:
+ case X86::TCRETURNdi64:
+ case X86::TCRETURNri64:
+ case X86::TCRETURNmi64:
+ case X86::EH_RETURN:
+ case X86::EH_RETURN64: {
+ SmallSet<uint16_t, 8> Uses;
+ for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I) {
+ MachineOperand &MO = MBBI->getOperand(I);
+ if (!MO.isReg() || MO.isDef())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI)
+ Uses.insert(*AI);
+ }
+
+ for (auto CS : AvailableRegs)
+ if (!Uses.count(CS) && CS != X86::RIP && CS != X86::RSP && CS != X86::ESP)
+ return CS;
+ }
+ }
+
+ return 0;
+}
+
Register X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const X86FrameLowering *TFI = getFrameLowering(MF);
return TFI->hasFP(MF) ? FramePtr : StackPtr;
@@ -812,3 +857,79 @@ X86RegisterInfo::getPtrSizedStackRegister(const MachineFunction &MF) const {
StackReg = getX86SubSuperRegister(StackReg, 32);
return StackReg;
}
+
+static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
+ const MachineRegisterInfo *MRI) {
+ if (VRM->hasShape(VirtReg))
+ return VRM->getShape(VirtReg);
+
+ const MachineOperand &Def = *MRI->def_begin(VirtReg);
+ MachineInstr *MI = const_cast<MachineInstr *>(Def.getParent());
+ unsigned OpCode = MI->getOpcode();
+ switch (OpCode) {
+ default:
+ llvm_unreachable("Unexpected machine instruction on tile register!");
+ break;
+  // We only collect the tile shape from instructions that define a tile.
+ case X86::PTILELOADDV:
+ case X86::PTDPBSSDV:
+ case X86::PTILEZEROV:
+ MachineOperand &MO1 = MI->getOperand(1);
+ MachineOperand &MO2 = MI->getOperand(2);
+ ShapeT Shape(&MO1, &MO2, MRI);
+ VRM->assignVirt2Shape(VirtReg, Shape);
+ return Shape;
+ }
+}
+
+bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
+ ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF,
+ const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const {
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+ bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints(
+ VirtReg, Order, Hints, MF, VRM, Matrix);
+
+ if (RC.getID() != X86::TILERegClassID)
+ return BaseImplRetVal;
+
+ ShapeT VirtShape = getTileShape(VirtReg, const_cast<VirtRegMap *>(VRM), MRI);
+ auto AddHint = [&](MCPhysReg PhysReg) {
+ Register VReg = Matrix->getOneVReg(PhysReg);
+ if (VReg == MCRegister::NoRegister) { // Not allocated yet
+ Hints.push_back(PhysReg);
+ return;
+ }
+ ShapeT PhysShape = getTileShape(VReg, const_cast<VirtRegMap *>(VRM), MRI);
+ if (PhysShape == VirtShape)
+ Hints.push_back(PhysReg);
+ };
+
+ SmallSet<MCPhysReg, 4> CopyHints;
+ CopyHints.insert(Hints.begin(), Hints.end());
+ Hints.clear();
+ for (auto Hint : CopyHints) {
+ if (RC.contains(Hint) && !MRI->isReserved(Hint))
+ AddHint(Hint);
+ }
+ for (MCPhysReg PhysReg : Order) {
+ if (!CopyHints.count(PhysReg) && RC.contains(PhysReg) &&
+ !MRI->isReserved(PhysReg))
+ AddHint(PhysReg);
+ }
+
+#define DEBUG_TYPE "tile-hint"
+ LLVM_DEBUG({
+ dbgs() << "Hints for virtual register " << format_hex(VirtReg, 8) << "\n";
+ for (auto Hint : Hints) {
+ dbgs() << "tmm" << Hint << ",";
+ }
+ dbgs() << "\n";
+ });
+#undef DEBUG_TYPE
+
+ return true;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.h
index 3435c0a10b04..7fd10ddd1a15 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -121,13 +121,16 @@ public:
bool canRealignStack(const MachineFunction &MF) const override;
- bool hasReservedSpillSlot(const MachineFunction &MF, Register Reg,
- int &FrameIdx) const override;
-
void eliminateFrameIndex(MachineBasicBlock::iterator MI,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
+ /// findDeadCallerSavedReg - Return a caller-saved register that isn't live
+ /// when it reaches the "return" instruction. We can then pop a stack object
+  /// to this register without worrying about clobbering it.
+ unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI) const;
+
// Debug information queries.
Register getFrameRegister(const MachineFunction &MF) const override;
unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const;
@@ -141,6 +144,11 @@ public:
Register getFramePtr() const { return FramePtr; }
// FIXME: Move to FrameInfok
unsigned getSlotSize() const { return SlotSize; }
+
+ bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF, const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const override;
};
} // End llvm namespace
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.td
index 8de5b94bbffa..75cbd4e1cff1 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -265,6 +265,9 @@ let SubRegIndices = [sub_ymm] in {
}
}
+// Tile config registers.
+def TMMCFG: X86Reg<"tmmcfg", 0>;
+
// Tile "registers".
def TMM0: X86Reg<"tmm0", 0>;
def TMM1: X86Reg<"tmm1", 1>;
@@ -633,6 +636,11 @@ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
// Tiles
-let isAllocatable = 0 in
-def TILE : RegisterClass<"X86", [untyped], 0,
+let CopyCost = -1 in // Don't allow copying of tile registers
+def TILE : RegisterClass<"X86", [x86amx], 8192,
(sequence "TMM%u", 0, 7)> {let Size = 8192;}
+def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> {
+ let CopyCost = -1; // Don't allow copying of tile config registers.
+ let isAllocatable = 1;
+ let Size = 512;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
index ce8d1d464da9..e76908ef4bc4 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -24,6 +24,10 @@ using namespace llvm;
#define DEBUG_TYPE "x86-selectiondag-info"
+static cl::opt<bool>
+ UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
+ cl::desc("Use fast short rep mov in memcpy lowering"));
+
bool X86SelectionDAGInfo::isBaseRegConflictPossible(
SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
// We cannot use TRI->hasBasePointer() until *after* we select all basic
@@ -306,6 +310,10 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
const X86Subtarget &Subtarget =
DAG.getMachineFunction().getSubtarget<X86Subtarget>();
+ // If enabled and available, use fast short rep mov.
+ if (UseFSRMForMemcpy && Subtarget.hasFSRM())
+ return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);
+
/// Handle constant sizes,
if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
return emitConstantSizeRepmov(
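The x86-use-fsrm-for-memcpy option introduced above is hidden and defaults to off, so the fast-short-REP-MOV lowering is strictly opt-in and additionally requires the subtarget to report FSRM. Purely as an illustration (the +fsrm attribute spelling is an assumption, not taken from this hunk), it could be exercised with an invocation along the lines of:

  llc -mtriple=x86_64-unknown-linux-gnu -mattr=+fsrm -x86-use-fsrm-for-memcpy memcpy.ll -o -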
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
index de528299654c..14a3fea240e7 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -293,55 +293,4 @@ void DecodeVPPERMMask(const Constant *C, unsigned Width,
}
}
-void DecodeVPERMVMask(const Constant *C, unsigned ElSize, unsigned Width,
- SmallVectorImpl<int> &ShuffleMask) {
- assert((Width == 128 || Width == 256 || Width == 512) &&
- C->getType()->getPrimitiveSizeInBits() >= Width &&
- "Unexpected vector size.");
- assert((ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64) &&
- "Unexpected vector element size.");
-
- // The shuffle mask requires elements the same size as the target.
- APInt UndefElts;
- SmallVector<uint64_t, 64> RawMask;
- if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
- return;
-
- unsigned NumElts = Width / ElSize;
-
- for (unsigned i = 0; i != NumElts; ++i) {
- if (UndefElts[i]) {
- ShuffleMask.push_back(SM_SentinelUndef);
- continue;
- }
- int Index = RawMask[i] & (NumElts - 1);
- ShuffleMask.push_back(Index);
- }
-}
-
-void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize, unsigned Width,
- SmallVectorImpl<int> &ShuffleMask) {
- assert((Width == 128 || Width == 256 || Width == 512) &&
- C->getType()->getPrimitiveSizeInBits() >= Width &&
- "Unexpected vector size.");
- assert((ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64) &&
- "Unexpected vector element size.");
-
- // The shuffle mask requires elements the same size as the target.
- APInt UndefElts;
- SmallVector<uint64_t, 64> RawMask;
- if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
- return;
-
- unsigned NumElts = Width / ElSize;
-
- for (unsigned i = 0; i != NumElts; ++i) {
- if (UndefElts[i]) {
- ShuffleMask.push_back(SM_SentinelUndef);
- continue;
- }
- int Index = RawMask[i] & (NumElts*2 - 1);
- ShuffleMask.push_back(Index);
- }
-}
-} // llvm namespace
+} // namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h
index 51229a69a626..77236f6aac9f 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h
@@ -38,14 +38,6 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
void DecodeVPPERMMask(const Constant *C, unsigned Width,
SmallVectorImpl<int> &ShuffleMask);
-/// Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant.
-void DecodeVPERMVMask(const Constant *C, unsigned ElSize, unsigned Width,
- SmallVectorImpl<int> &ShuffleMask);
-
-/// Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant.
-void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize, unsigned Width,
- SmallVectorImpl<int> &ShuffleMask);
-
} // llvm namespace
#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp
index 7e91c37367d2..d57871130b0c 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp
@@ -161,6 +161,7 @@ bool X86SpeculativeExecutionSideEffectSuppression::runOnMachineFunction(
// This branch requires adding an LFENCE.
if (!PrevInstIsLFENCE) {
+ assert(FirstTerminator && "Unknown terminator instruction");
BuildMI(MBB, FirstTerminator, DebugLoc(), TII->get(X86::LFENCE));
NumLFENCEsInserted++;
Modified = true;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index fe5b9a05f811..aa73d4bce65a 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -184,7 +184,7 @@ private:
MachineBasicBlock::iterator InsertPt, DebugLoc Loc);
void restoreEFLAGS(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
- unsigned OFReg);
+ Register Reg);
void mergePredStateIntoSP(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
@@ -200,8 +200,8 @@ private:
MachineInstr *
sinkPostLoadHardenedInst(MachineInstr &MI,
SmallPtrSetImpl<MachineInstr *> &HardenedInstrs);
- bool canHardenRegister(unsigned Reg);
- unsigned hardenValueInRegister(unsigned Reg, MachineBasicBlock &MBB,
+ bool canHardenRegister(Register Reg);
+ unsigned hardenValueInRegister(Register Reg, MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertPt,
DebugLoc Loc);
unsigned hardenPostLoad(MachineInstr &MI);
@@ -1520,7 +1520,7 @@ unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
/// reliably lower.
void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
- unsigned Reg) {
+ Register Reg) {
BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg);
++NumInstsInserted;
}
@@ -1842,8 +1842,7 @@ MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
// just bail. Also check that its register class is one of the ones we
// can harden.
Register UseDefReg = UseMI.getOperand(0).getReg();
- if (!Register::isVirtualRegister(UseDefReg) ||
- !canHardenRegister(UseDefReg))
+ if (!UseDefReg.isVirtual() || !canHardenRegister(UseDefReg))
return {};
SingleUseMI = &UseMI;
@@ -1865,7 +1864,7 @@ MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
return MI;
}
-bool X86SpeculativeLoadHardeningPass::canHardenRegister(unsigned Reg) {
+bool X86SpeculativeLoadHardeningPass::canHardenRegister(Register Reg) {
auto *RC = MRI->getRegClass(Reg);
int RegBytes = TRI->getRegSizeInBits(*RC) / 8;
if (RegBytes > 8)
@@ -1909,10 +1908,10 @@ bool X86SpeculativeLoadHardeningPass::canHardenRegister(unsigned Reg) {
/// The new, hardened virtual register is returned. It will have the same
/// register class as `Reg`.
unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister(
- unsigned Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
+ Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
DebugLoc Loc) {
assert(canHardenRegister(Reg) && "Cannot harden this register!");
- assert(Register::isVirtualRegister(Reg) && "Cannot harden a physical register!");
+ assert(Reg.isVirtual() && "Cannot harden a physical register!");
auto *RC = MRI->getRegClass(Reg);
int Bytes = TRI->getRegSizeInBits(*RC) / 8;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.cpp
index 975cbabb30fd..c95213c3539d 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -166,6 +166,10 @@ unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV,
return X86II::MO_DARWIN_NONLAZY_PIC_BASE;
}
+ // 32-bit ELF references GlobalAddress directly in static relocation model.
+ // We cannot use MO_GOT because EBX may not be set up.
+ if (TM.getRelocationModel() == Reloc::Static)
+ return X86II::MO_NO_FLAG;
return X86II::MO_GOT;
}
@@ -202,6 +206,9 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
(!F && M.getRtLibUseGOT())) &&
is64Bit())
return X86II::MO_GOTPCREL;
+ // Reference ExternalSymbol directly in static relocation model.
+ if (!is64Bit() && !GV && TM.getRelocationModel() == Reloc::Static)
+ return X86II::MO_NO_FLAG;
return X86II::MO_PLT;
}
@@ -227,39 +234,22 @@ bool X86Subtarget::isLegalToCallImmediateAddr() const {
return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
}
-void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
- std::string CPUName = std::string(CPU);
- if (CPUName.empty())
- CPUName = "generic";
-
- std::string FullFS = std::string(FS);
- if (In64BitMode) {
- // SSE2 should default to enabled in 64-bit mode, but can be turned off
- // explicitly.
- if (!FullFS.empty())
- FullFS = "+sse2," + FullFS;
- else
- FullFS = "+sse2";
-
- // If no CPU was specified, enable 64bit feature to satisy later check.
- if (CPUName == "generic") {
- if (!FullFS.empty())
- FullFS = "+64bit," + FullFS;
- else
- FullFS = "+64bit";
- }
- }
+void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
+ StringRef FS) {
+ if (CPU.empty())
+ CPU = "generic";
- // LAHF/SAHF are always supported in non-64-bit mode.
- if (!In64BitMode) {
- if (!FullFS.empty())
- FullFS = "+sahf," + FullFS;
- else
- FullFS = "+sahf";
- }
+ if (TuneCPU.empty())
+ TuneCPU = "i586"; // FIXME: "generic" is more modern than llc tests expect.
+
+ std::string FullFS = X86_MC::ParseX86Triple(TargetTriple);
+ assert(!FullFS.empty() && "Failed to parse X86 triple");
+
+ if (!FS.empty())
+ FullFS = (Twine(FullFS) + "," + FS).str();
// Parse features string and set the CPU.
- ParseSubtargetFeatures(CPUName, FullFS);
+ ParseSubtargetFeatures(CPU, TuneCPU, FullFS);
// All CPUs that implement SSE4.2 or SSE4A support unaligned accesses of
// 16-bytes and under that are reasonably fast. These features were
@@ -268,17 +258,6 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
if (hasSSE42() || hasSSE4A())
IsUAMem16Slow = false;
- // It's important to keep the MCSubtargetInfo feature bits in sync with
- // target data structure which is shared with MC code emitter, etc.
- if (In64BitMode)
- ToggleFeature(X86::Mode64Bit);
- else if (In32BitMode)
- ToggleFeature(X86::Mode32Bit);
- else if (In16BitMode)
- ToggleFeature(X86::Mode16Bit);
- else
- llvm_unreachable("Not 16-bit, 32-bit or 64-bit mode!");
-
LLVM_DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel
<< ", 3DNowLevel " << X863DNowLevel << ", 64bit "
<< HasX86_64 << "\n");
@@ -286,25 +265,15 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
report_fatal_error("64-bit code requested on a subtarget that doesn't "
"support it!");
- // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD and Solaris (both
- // 32 and 64 bit) and for all 64-bit targets.
+  // Stack alignment is 16 bytes on Darwin, Linux, and kFreeBSD, and for all
+ // 64-bit targets. On Solaris (32-bit), stack alignment is 4 bytes
+ // following the i386 psABI, while on Illumos it is always 16 bytes.
if (StackAlignOverride)
stackAlignment = *StackAlignOverride;
- else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() ||
- isTargetKFreeBSD() || In64BitMode)
+ else if (isTargetDarwin() || isTargetLinux() || isTargetKFreeBSD() ||
+ In64BitMode)
stackAlignment = Align(16);
- // Some CPUs have more overhead for gather. The specified overhead is relative
- // to the Load operation. "2" is the number provided by Intel architects. This
- // parameter is used for cost estimation of Gather Op and comparison with
- // other alternatives.
- // TODO: Remove the explicit hasAVX512()?, That would mean we would only
- // enable gather with a -march.
- if (hasAVX512() || (hasAVX2() && hasFastGather()))
- GatherOverhead = 2;
- if (hasAVX512())
- ScatterOverhead = 2;
-
// Consume the vector width attribute or apply any target specific limit.
if (PreferVectorWidthOverride)
PreferVectorWidth = PreferVectorWidthOverride;
@@ -315,27 +284,24 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
}
X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
+ StringRef TuneCPU,
StringRef FS) {
- initSubtargetFeatures(CPU, FS);
+ initSubtargetFeatures(CPU, TuneCPU, FS);
return *this;
}
-X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
- const X86TargetMachine &TM,
+X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU,
+ StringRef FS, const X86TargetMachine &TM,
MaybeAlign StackAlignOverride,
unsigned PreferVectorWidthOverride,
unsigned RequiredVectorWidth)
- : X86GenSubtargetInfo(TT, CPU, FS), PICStyle(PICStyles::Style::None),
- TM(TM), TargetTriple(TT), StackAlignOverride(StackAlignOverride),
+ : X86GenSubtargetInfo(TT, CPU, TuneCPU, FS),
+ PICStyle(PICStyles::Style::None), TM(TM), TargetTriple(TT),
+ StackAlignOverride(StackAlignOverride),
PreferVectorWidthOverride(PreferVectorWidthOverride),
RequiredVectorWidth(RequiredVectorWidth),
- In64BitMode(TargetTriple.getArch() == Triple::x86_64),
- In32BitMode(TargetTriple.getArch() == Triple::x86 &&
- TargetTriple.getEnvironment() != Triple::CODE16),
- In16BitMode(TargetTriple.getArch() == Triple::x86 &&
- TargetTriple.getEnvironment() == Triple::CODE16),
- InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
- FrameLowering(*this, getStackAlignment()) {
+ InstrInfo(initializeSubtargetDependencies(CPU, TuneCPU, FS)),
+ TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) {
// Determine the PICStyle based on the target selected.
if (!isPositionIndependent())
setPICStyle(PICStyles::Style::None);
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.h
index de45d357e3c2..fa2622333d60 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.h
@@ -50,7 +50,6 @@ enum class Style {
} // end namespace PICStyles
class X86Subtarget final : public X86GenSubtargetInfo {
-public:
// NOTE: Do not add anything new to this list. Coarse, CPU name based flags
// are not a good idea. We should be migrating away from these.
enum X86ProcFamilyEnum {
@@ -59,7 +58,6 @@ public:
IntelSLM
};
-protected:
enum X86SSEEnum {
NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
};
@@ -191,8 +189,8 @@ protected:
/// Processor has RDSEED instructions.
bool HasRDSEED = false;
- /// Processor has LAHF/SAHF instructions.
- bool HasLAHFSAHF = false;
+ /// Processor has LAHF/SAHF instructions in 64-bit mode.
+ bool HasLAHFSAHF64 = false;
/// Processor has MONITORX/MWAITX instructions.
bool HasMWAITX = false;
@@ -304,6 +302,9 @@ protected:
/// True if the processor has enhanced REP MOVSB/STOSB.
bool HasERMSB = false;
+ /// True if the processor has fast short REP MOV.
+ bool HasFSRM = false;
+
/// True if the short functions should be padded to prevent
/// a stall when returning too early.
bool PadShortFunctions = false;
@@ -354,6 +355,9 @@ protected:
/// Processor has AVX-512 Vector Neural Network Instructions
bool HasVNNI = false;
+ /// Processor has AVX Vector Neural Network Instructions
+ bool HasAVXVNNI = false;
+
/// Processor has AVX-512 bfloat16 floating-point extensions
bool HasBF16 = false;
@@ -366,9 +370,6 @@ protected:
/// Processor has AVX-512 vp2intersect instructions
bool HasVP2INTERSECT = false;
- /// Deprecated flag for MPX instructions.
- bool DeprecatedHasMPX = false;
-
/// Processor supports CET SHSTK - Control-Flow Enforcement Technology
/// using Shadow Stack
bool HasSHSTK = false;
@@ -397,6 +398,15 @@ protected:
/// Processor supports PCONFIG instruction
bool HasPCONFIG = false;
+  /// Processor supports Key Locker instructions
+ bool HasKL = false;
+
+  /// Processor supports wide Key Locker instructions
+ bool HasWIDEKL = false;
+
+ /// Processor supports HRESET instruction
+ bool HasHRESET = false;
+
/// Processor supports SERIALIZE instruction
bool HasSERIALIZE = false;
@@ -408,6 +418,9 @@ protected:
bool HasAMXBF16 = false;
bool HasAMXINT8 = false;
+ /// Processor supports User Level Interrupt instructions
+ bool HasUINTR = false;
+
/// Processor has a single uop BEXTR implementation.
bool HasFastBEXTR = false;
@@ -459,6 +472,8 @@ protected:
/// entry to the function and which must be maintained by every function.
Align stackAlignment = Align(4);
+ Align TileConfigAlignment = Align(4);
+
/// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
///
// FIXME: this is a known good value for Yonah. How about others?
@@ -473,9 +488,6 @@ protected:
/// Indicates target prefers AVX512 mask registers.
bool PreferMaskRegisters = false;
- /// Threeway branch is profitable in this subtarget.
- bool ThreewayBranchProfitable = false;
-
/// Use Goldmont specific floating point div/sqrt costs.
bool UseGLMDivSqrtCosts = false;
@@ -503,17 +515,13 @@ private:
unsigned RequiredVectorWidth;
/// True if compiling for 64-bit, false for 16-bit or 32-bit.
- bool In64BitMode;
+ bool In64BitMode = false;
/// True if compiling for 32-bit, false for 16-bit or 64-bit.
- bool In32BitMode;
+ bool In32BitMode = false;
/// True if compiling for 16-bit, false for 32-bit or 64-bit.
- bool In16BitMode;
-
- /// Contains the Overhead of gather\scatter instructions
- int GatherOverhead = 1024;
- int ScatterOverhead = 1024;
+ bool In16BitMode = false;
X86SelectionDAGInfo TSInfo;
// Ordering here is important. X86InstrInfo initializes X86RegisterInfo which
@@ -526,7 +534,7 @@ public:
/// This constructor initializes the data members to match that
/// of the specified triple.
///
- X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ X86Subtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS,
const X86TargetMachine &TM, MaybeAlign StackAlignOverride,
unsigned PreferVectorWidthOverride,
unsigned RequiredVectorWidth);
@@ -549,6 +557,9 @@ public:
return &getInstrInfo()->getRegisterInfo();
}
+ unsigned getTileConfigSize() const { return 64; }
+ Align getTileConfigAlignment() const { return TileConfigAlignment; }
+
/// Returns the minimum alignment known to hold of the
/// stack frame on entry to the function and which must be maintained by every
/// function for this subtarget.
@@ -560,7 +571,7 @@ public:
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
/// Methods used by Global ISel
const CallLowering *getCallLowering() const override;
@@ -571,8 +582,10 @@ public:
private:
/// Initialize the full set of dependencies so we can use an initializer
/// list for X86Subtarget.
- X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
- void initSubtargetFeatures(StringRef CPU, StringRef FS);
+ X86Subtarget &initializeSubtargetDependencies(StringRef CPU,
+ StringRef TuneCPU,
+ StringRef FS);
+ void initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
public:
/// Is this x86_64? (disregarding specific ABI / programming model)
@@ -671,7 +684,7 @@ public:
return hasSSE1() || (hasPRFCHW() && !has3DNow()) || hasPREFETCHWT1();
}
bool hasRDSEED() const { return HasRDSEED; }
- bool hasLAHFSAHF() const { return HasLAHFSAHF; }
+ bool hasLAHFSAHF() const { return HasLAHFSAHF64 || !is64Bit(); }
bool hasMWAITX() const { return HasMWAITX; }
bool hasCLZERO() const { return HasCLZERO; }
bool hasCLDEMOTE() const { return HasCLDEMOTE; }
@@ -683,8 +696,6 @@ public:
bool isPMADDWDSlow() const { return IsPMADDWDSlow; }
bool isUnalignedMem16Slow() const { return IsUAMem16Slow; }
bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
- int getGatherOverhead() const { return GatherOverhead; }
- int getScatterOverhead() const { return ScatterOverhead; }
bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b && is64Bit(); }
bool useLeaForSP() const { return UseLeaForSP; }
@@ -706,6 +717,7 @@ public:
bool hasMacroFusion() const { return HasMacroFusion; }
bool hasBranchFusion() const { return HasBranchFusion; }
bool hasERMSB() const { return HasERMSB; }
+ bool hasFSRM() const { return HasFSRM; }
bool hasSlowDivide32() const { return HasSlowDivide32; }
bool hasSlowDivide64() const { return HasSlowDivide64; }
bool padShortFunctions() const { return PadShortFunctions; }
@@ -734,15 +746,19 @@ public:
bool hasWAITPKG() const { return HasWAITPKG; }
bool hasPCONFIG() const { return HasPCONFIG; }
bool hasSGX() const { return HasSGX; }
- bool threewayBranchProfitable() const { return ThreewayBranchProfitable; }
bool hasINVPCID() const { return HasINVPCID; }
bool hasENQCMD() const { return HasENQCMD; }
+ bool hasKL() const { return HasKL; }
+ bool hasWIDEKL() const { return HasWIDEKL; }
+ bool hasHRESET() const { return HasHRESET; }
bool hasSERIALIZE() const { return HasSERIALIZE; }
bool hasTSXLDTRK() const { return HasTSXLDTRK; }
+ bool hasUINTR() const { return HasUINTR; }
bool useRetpolineIndirectCalls() const { return UseRetpolineIndirectCalls; }
bool useRetpolineIndirectBranches() const {
return UseRetpolineIndirectBranches;
}
+ bool hasAVXVNNI() const { return HasAVXVNNI; }
bool hasAMXTILE() const { return HasAMXTILE; }
bool hasAMXBF16() const { return HasAMXBF16; }
bool hasAMXINT8() const { return HasAMXINT8; }
@@ -792,8 +808,6 @@ public:
bool isXRaySupported() const override { return is64Bit(); }
- X86ProcFamilyEnum getProcFamily() const { return X86ProcFamily; }
-
/// TODO: to be removed later and replaced with suitable properties
bool isAtom() const { return X86ProcFamily == IntelAtom; }
bool isSLM() const { return X86ProcFamily == IntelSLM; }
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp
index 7344116e14af..c8f76c210a3f 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -56,17 +56,13 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
cl::desc("Enable the machine combiner pass"),
cl::init(true), cl::Hidden);
-static cl::opt<bool> EnableCondBrFoldingPass("x86-condbr-folding",
- cl::desc("Enable the conditional branch "
- "folding pass"),
- cl::init(false), cl::Hidden);
-
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
// Register the target.
RegisterTargetMachine<X86TargetMachine> X(getTheX86_32Target());
RegisterTargetMachine<X86TargetMachine> Y(getTheX86_64Target());
PassRegistry &PR = *PassRegistry::getPassRegistry();
+ initializeX86LowerAMXTypeLegacyPassPass(PR);
initializeGlobalISel(PR);
initializeWinEHStatePassPass(PR);
initializeFixupBWInstPassPass(PR);
@@ -76,6 +72,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
initializeX86FixupSetCCPassPass(PR);
initializeX86CallFrameOptimizationPass(PR);
initializeX86CmovConverterPassPass(PR);
+ initializeX86TileConfigPass(PR);
initializeX86ExpandPseudoPass(PR);
initializeX86ExecutionDomainFixPass(PR);
initializeX86DomainReassignmentPass(PR);
@@ -84,11 +81,11 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
initializeX86SpeculativeLoadHardeningPassPass(PR);
initializeX86SpeculativeExecutionSideEffectSuppressionPass(PR);
initializeX86FlagsCopyLoweringPassPass(PR);
- initializeX86CondBrFoldingPassPass(PR);
initializeX86LoadValueInjectionLoadHardeningPassPass(PR);
initializeX86LoadValueInjectionRetHardeningPassPass(PR);
initializeX86OptimizeLEAPassPass(PR);
initializeX86PartialReductionPass(PR);
+ initializePseudoProbeInserterPass(PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -239,43 +236,30 @@ X86TargetMachine::~X86TargetMachine() = default;
const X86Subtarget *
X86TargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
+ Attribute TuneAttr = F.getFnAttribute("tune-cpu");
Attribute FSAttr = F.getFnAttribute("target-features");
- StringRef CPU = !CPUAttr.hasAttribute(Attribute::None)
- ? CPUAttr.getValueAsString()
- : (StringRef)TargetCPU;
- StringRef FS = !FSAttr.hasAttribute(Attribute::None)
- ? FSAttr.getValueAsString()
- : (StringRef)TargetFS;
+ StringRef CPU =
+ CPUAttr.isValid() ? CPUAttr.getValueAsString() : (StringRef)TargetCPU;
+ StringRef TuneCPU =
+ TuneAttr.isValid() ? TuneAttr.getValueAsString() : (StringRef)CPU;
+ StringRef FS =
+ FSAttr.isValid() ? FSAttr.getValueAsString() : (StringRef)TargetFS;
SmallString<512> Key;
- Key.reserve(CPU.size() + FS.size());
- Key += CPU;
- Key += FS;
-
- // FIXME: This is related to the code below to reset the target options,
- // we need to know whether or not the soft float flag is set on the
- // function before we can generate a subtarget. We also need to use
- // it as a key for the subtarget since that can be the only difference
- // between two functions.
- bool SoftFloat =
- F.getFnAttribute("use-soft-float").getValueAsString() == "true";
- // If the soft float attribute is set on the function turn on the soft float
- // subtarget feature.
- if (SoftFloat)
- Key += FS.empty() ? "+soft-float" : ",+soft-float";
-
- // Keep track of the key width after all features are added so we can extract
- // the feature string out later.
- unsigned CPUFSWidth = Key.size();
+  // The additions here are ordered so that the strings that are definitely
+  // short are added first, so we won't exceed the SmallString's inline size.
+  // We append the much longer FS string last so that we only heap allocate
+  // at most once.
// Extract prefer-vector-width attribute.
unsigned PreferVectorWidthOverride = 0;
- if (F.hasFnAttribute("prefer-vector-width")) {
- StringRef Val = F.getFnAttribute("prefer-vector-width").getValueAsString();
+ Attribute PreferVecWidthAttr = F.getFnAttribute("prefer-vector-width");
+ if (PreferVecWidthAttr.isValid()) {
+ StringRef Val = PreferVecWidthAttr.getValueAsString();
unsigned Width;
if (!Val.getAsInteger(0, Width)) {
- Key += ",prefer-vector-width=";
+ Key += "prefer-vector-width=";
Key += Val;
PreferVectorWidthOverride = Width;
}
@@ -283,21 +267,44 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
// Extract min-legal-vector-width attribute.
unsigned RequiredVectorWidth = UINT32_MAX;
- if (F.hasFnAttribute("min-legal-vector-width")) {
- StringRef Val =
- F.getFnAttribute("min-legal-vector-width").getValueAsString();
+ Attribute MinLegalVecWidthAttr = F.getFnAttribute("min-legal-vector-width");
+ if (MinLegalVecWidthAttr.isValid()) {
+ StringRef Val = MinLegalVecWidthAttr.getValueAsString();
unsigned Width;
if (!Val.getAsInteger(0, Width)) {
- Key += ",min-legal-vector-width=";
+ Key += "min-legal-vector-width=";
Key += Val;
RequiredVectorWidth = Width;
}
}
- // Extracted here so that we make sure there is backing for the StringRef. If
- // we assigned earlier, its possible the SmallString reallocated leaving a
- // dangling StringRef.
- FS = Key.slice(CPU.size(), CPUFSWidth);
+ // Add CPU to the Key.
+ Key += CPU;
+
+ // Add tune CPU to the Key.
+ Key += "tune=";
+ Key += TuneCPU;
+
+ // Keep track of the start of the feature portion of the string.
+ unsigned FSStart = Key.size();
+
+  // FIXME: This is related to the code below that resets the target options;
+ // we need to know whether or not the soft float flag is set on the
+ // function before we can generate a subtarget. We also need to use
+ // it as a key for the subtarget since that can be the only difference
+ // between two functions.
+ bool SoftFloat =
+ F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+ // If the soft float attribute is set on the function turn on the soft float
+ // subtarget feature.
+ if (SoftFloat)
+ Key += FS.empty() ? "+soft-float" : "+soft-float,";
+
+ Key += FS;
+
+ // We may have added +soft-float to the features so move the StringRef to
+ // point to the full string in the Key.
+ FS = Key.substr(FSStart);
auto &I = SubtargetMap[Key];
if (!I) {
@@ -306,13 +313,21 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
// function that reside in TargetOptions.
resetTargetOptions(F);
I = std::make_unique<X86Subtarget>(
- TargetTriple, CPU, FS, *this,
+ TargetTriple, CPU, TuneCPU, FS, *this,
MaybeAlign(Options.StackAlignmentOverride), PreferVectorWidthOverride,
RequiredVectorWidth);
}
return I.get();
}
+bool X86TargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
+ unsigned DestAS) const {
+ assert(SrcAS != DestAS && "Expected different address spaces!");
+ if (getPointerSize(SrcAS) != getPointerSize(DestAS))
+ return false;
+ return SrcAS < 256 && DestAS < 256;
+}
+
//===----------------------------------------------------------------------===//
// X86 TTI query.
//===----------------------------------------------------------------------===//
@@ -366,6 +381,7 @@ public:
void addPreEmitPass() override;
void addPreEmitPass2() override;
void addPreSched2() override;
+ bool addPreRewrite() override;
std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};
@@ -394,6 +410,7 @@ TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
void X86PassConfig::addIRPasses() {
addPass(createAtomicExpandPass());
+ addPass(createX86LowerAMXTypePass());
TargetPassConfig::addIRPasses();
@@ -432,7 +449,7 @@ bool X86PassConfig::addInstSelector() {
}
bool X86PassConfig::addIRTranslator() {
- addPass(new IRTranslator());
+ addPass(new IRTranslator(getOptLevel()));
return false;
}
@@ -452,8 +469,6 @@ bool X86PassConfig::addGlobalInstructionSelect() {
}
bool X86PassConfig::addILPOpts() {
- if (EnableCondBrFoldingPass)
- addPass(createX86CondBrFolding());
addPass(&EarlyIfConverterID);
if (EnableMachineCombinerPass)
addPass(&MachineCombinerID);
@@ -481,7 +496,12 @@ void X86PassConfig::addPreRegAlloc() {
addPass(createX86SpeculativeLoadHardeningPass());
addPass(createX86FlagsCopyLoweringPass());
addPass(createX86WinAllocaExpander());
+
+ if (getOptLevel() != CodeGenOpt::None) {
+ addPass(createX86PreTileConfigPass());
+ }
}
+
void X86PassConfig::addMachineSSAOptimization() {
addPass(createX86DomainReassignmentPass());
TargetPassConfig::addMachineSSAOptimization();
@@ -554,6 +574,11 @@ void X86PassConfig::addPreEmitPass2() {
addPass(createX86LoadValueInjectionRetHardeningPass());
}
+bool X86PassConfig::addPreRewrite() {
+ addPass(createX86TileConfigPass());
+ return true;
+}
+
std::unique_ptr<CSEConfigBase> X86PassConfig::getCSEConfig() const {
return getStandardCSEConfigForOpt(TM->getOptLevel());
}
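The subtarget key assembled in getSubtargetImpl() above is driven entirely by function attributes, so per-function ISA, tuning, and vector-width choices each get their own cached X86Subtarget. An illustrative IR fragment (the attribute names come from the code above; the CPU name and widths are arbitrary example values):

  define void @tuned() #0 {
    ret void
  }
  attributes #0 = { "target-cpu"="x86-64" "tune-cpu"="skylake" "prefer-vector-width"="256" "min-legal-vector-width"="256" }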
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.h
index 8d98474a39c0..69d7e48b8977 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.h
@@ -54,6 +54,8 @@ public:
}
bool isJIT() const { return IsJIT; }
+
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
};
} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.cpp
index 2b48baccc01f..b88ad5a478f3 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.cpp
@@ -7,16 +7,8 @@
//===----------------------------------------------------------------------===//
#include "X86TargetObjectFile.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/BinaryFormat/COFF.h"
#include "llvm/BinaryFormat/Dwarf.h"
-#include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/IR/Mangler.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCSectionCOFF.h"
-#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.h
index acea772eb036..f4bf52c83771 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.h
@@ -36,7 +36,7 @@ namespace llvm {
MCStreamer &Streamer) const override;
};
- /// This implemenatation is used for X86 ELF targets that don't
+ /// This implementation is used for X86 ELF targets that don't
/// have a further specialization.
class X86ELFTargetObjectFile : public TargetLoweringObjectFileELF {
public:
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index cc18e55656ef..71455237fb61 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -232,16 +232,16 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
bool Op2Signed = false;
unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
- bool signedMode = Op1Signed | Op2Signed;
+ bool SignedMode = Op1Signed || Op2Signed;
unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
if (OpMinSize <= 7)
return LT.first * 3; // pmullw/sext
- if (!signedMode && OpMinSize <= 8)
+ if (!SignedMode && OpMinSize <= 8)
return LT.first * 3; // pmullw/zext
if (OpMinSize <= 15)
return LT.first * 5; // pmullw/pmulhw/pshuf
- if (!signedMode && OpMinSize <= 16)
+ if (!SignedMode && OpMinSize <= 16)
return LT.first * 5; // pmullw/pmulhw/pshuf
}
@@ -321,6 +321,11 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SHL, MVT::v64i8, 4 }, // psllw + pand.
{ ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand.
{ ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb.
+
+ { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence
+ { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence
+ { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence
+ { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -336,6 +341,11 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
{ ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
+
+ { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence
+ { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence
+ { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence
+ { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -353,6 +363,15 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
{ ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
{ ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
+
+ { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split.
+ { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split.
+ { ISD::SDIV, MVT::v4i32, 6 }, // pmuludq sequence
+ { ISD::SREM, MVT::v4i32, 8 }, // pmuludq+mul+sub sequence
+ { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split.
+ { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split.
+ { ISD::UDIV, MVT::v4i32, 5 }, // pmuludq sequence
+ { ISD::UREM, MVT::v4i32, 7 }, // pmuludq+mul+sub sequence
};
// XOP has faster vXi8 shifts.
@@ -1109,6 +1128,9 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
{TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
+
+ {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
+ {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
};
if (ST->hasBWI())
@@ -1162,6 +1184,13 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
{TTI::SK_PermuteSingleSrc, MVT::v64i8, 14},
{TTI::SK_PermuteTwoSrc, MVT::v32i16, 42},
{TTI::SK_PermuteTwoSrc, MVT::v64i8, 42},
+
+ {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
+ {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq
+ {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd
+ {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
+ {TTI::SK_Select, MVT::v8i64, 1}, // vblendmq
+ {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd
};
if (ST->hasAVX512())
@@ -1367,6 +1396,7 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
}
int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::CastContextHint CCH,
TTI::TargetCostKind CostKind,
const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -1988,7 +2018,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
// The function getSimpleVT only handles simple value types.
if (!SrcTy.isSimple() || !DstTy.isSimple())
- return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind));
+ return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind));
MVT SimpleSrcTy = SrcTy.getSimpleVT();
MVT SimpleDstTy = DstTy.getSimpleVT();
@@ -2049,15 +2079,18 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
return AdjustCost(Entry->Cost);
}
- return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
+ return AdjustCost(
+ BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}
int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ CmpInst::Predicate VecPred,
TTI::TargetCostKind CostKind,
const Instruction *I) {
// TODO: Handle other cost kinds.
if (CostKind != TTI::TCK_RecipThroughput)
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
+ I);
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
@@ -2241,7 +2274,7 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
return LT.first * (ExtraCost + Entry->Cost);
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}
unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
@@ -2255,6 +2288,9 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
// CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
// CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
// CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
+
+ // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
+ // specialized in these tables yet.
static const CostTblEntry AVX512CDCostTbl[] = {
{ ISD::CTLZ, MVT::v8i64, 1 },
{ ISD::CTLZ, MVT::v16i32, 1 },
@@ -2270,6 +2306,8 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::CTLZ, MVT::v16i8, 4 },
};
static const CostTblEntry AVX512BWCostTbl[] = {
+ { ISD::ABS, MVT::v32i16, 1 },
+ { ISD::ABS, MVT::v64i8, 1 },
{ ISD::BITREVERSE, MVT::v8i64, 5 },
{ ISD::BITREVERSE, MVT::v16i32, 5 },
{ ISD::BITREVERSE, MVT::v32i16, 5 },
@@ -2288,14 +2326,28 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::CTTZ, MVT::v64i8, 9 },
{ ISD::SADDSAT, MVT::v32i16, 1 },
{ ISD::SADDSAT, MVT::v64i8, 1 },
+ { ISD::SMAX, MVT::v32i16, 1 },
+ { ISD::SMAX, MVT::v64i8, 1 },
+ { ISD::SMIN, MVT::v32i16, 1 },
+ { ISD::SMIN, MVT::v64i8, 1 },
{ ISD::SSUBSAT, MVT::v32i16, 1 },
{ ISD::SSUBSAT, MVT::v64i8, 1 },
{ ISD::UADDSAT, MVT::v32i16, 1 },
{ ISD::UADDSAT, MVT::v64i8, 1 },
+ { ISD::UMAX, MVT::v32i16, 1 },
+ { ISD::UMAX, MVT::v64i8, 1 },
+ { ISD::UMIN, MVT::v32i16, 1 },
+ { ISD::UMIN, MVT::v64i8, 1 },
{ ISD::USUBSAT, MVT::v32i16, 1 },
{ ISD::USUBSAT, MVT::v64i8, 1 },
};
static const CostTblEntry AVX512CostTbl[] = {
+ { ISD::ABS, MVT::v8i64, 1 },
+ { ISD::ABS, MVT::v16i32, 1 },
+ { ISD::ABS, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::ABS, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::ABS, MVT::v4i64, 1 },
+ { ISD::ABS, MVT::v2i64, 1 },
{ ISD::BITREVERSE, MVT::v8i64, 36 },
{ ISD::BITREVERSE, MVT::v16i32, 24 },
{ ISD::BITREVERSE, MVT::v32i16, 10 },
@@ -2312,6 +2364,30 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::CTTZ, MVT::v16i32, 28 },
{ ISD::CTTZ, MVT::v32i16, 24 },
{ ISD::CTTZ, MVT::v64i8, 18 },
+ { ISD::SMAX, MVT::v8i64, 1 },
+ { ISD::SMAX, MVT::v16i32, 1 },
+ { ISD::SMAX, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::SMAX, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::SMAX, MVT::v4i64, 1 },
+ { ISD::SMAX, MVT::v2i64, 1 },
+ { ISD::SMIN, MVT::v8i64, 1 },
+ { ISD::SMIN, MVT::v16i32, 1 },
+ { ISD::SMIN, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::SMIN, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::SMIN, MVT::v4i64, 1 },
+ { ISD::SMIN, MVT::v2i64, 1 },
+ { ISD::UMAX, MVT::v8i64, 1 },
+ { ISD::UMAX, MVT::v16i32, 1 },
+ { ISD::UMAX, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::UMAX, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::UMAX, MVT::v4i64, 1 },
+ { ISD::UMAX, MVT::v2i64, 1 },
+ { ISD::UMIN, MVT::v8i64, 1 },
+ { ISD::UMIN, MVT::v16i32, 1 },
+ { ISD::UMIN, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::UMIN, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::UMIN, MVT::v4i64, 1 },
+ { ISD::UMIN, MVT::v2i64, 1 },
{ ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
{ ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
{ ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
@@ -2352,6 +2428,10 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::BITREVERSE, MVT::i8, 3 }
};
static const CostTblEntry AVX2CostTbl[] = {
+ { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
+ { ISD::ABS, MVT::v8i32, 1 },
+ { ISD::ABS, MVT::v16i16, 1 },
+ { ISD::ABS, MVT::v32i8, 1 },
{ ISD::BITREVERSE, MVT::v4i64, 5 },
{ ISD::BITREVERSE, MVT::v8i32, 5 },
{ ISD::BITREVERSE, MVT::v16i16, 5 },
@@ -2373,14 +2453,28 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::CTTZ, MVT::v32i8, 9 },
{ ISD::SADDSAT, MVT::v16i16, 1 },
{ ISD::SADDSAT, MVT::v32i8, 1 },
+ { ISD::SMAX, MVT::v8i32, 1 },
+ { ISD::SMAX, MVT::v16i16, 1 },
+ { ISD::SMAX, MVT::v32i8, 1 },
+ { ISD::SMIN, MVT::v8i32, 1 },
+ { ISD::SMIN, MVT::v16i16, 1 },
+ { ISD::SMIN, MVT::v32i8, 1 },
{ ISD::SSUBSAT, MVT::v16i16, 1 },
{ ISD::SSUBSAT, MVT::v32i8, 1 },
{ ISD::UADDSAT, MVT::v16i16, 1 },
{ ISD::UADDSAT, MVT::v32i8, 1 },
{ ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
+ { ISD::UMAX, MVT::v8i32, 1 },
+ { ISD::UMAX, MVT::v16i16, 1 },
+ { ISD::UMAX, MVT::v32i8, 1 },
+ { ISD::UMIN, MVT::v8i32, 1 },
+ { ISD::UMIN, MVT::v16i16, 1 },
+ { ISD::UMIN, MVT::v32i8, 1 },
{ ISD::USUBSAT, MVT::v16i16, 1 },
{ ISD::USUBSAT, MVT::v32i8, 1 },
{ ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
+ { ISD::FMAXNUM, MVT::v8f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
+ { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
{ ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
{ ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
@@ -2389,6 +2483,10 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
};
static const CostTblEntry AVX1CostTbl[] = {
+ { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
+ { ISD::ABS, MVT::v8i32, 3 },
+ { ISD::ABS, MVT::v16i16, 3 },
+ { ISD::ABS, MVT::v32i8, 3 },
{ ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
{ ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
{ ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
@@ -2410,20 +2508,32 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
{ ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
- { ISD::FMAXNUM, MVT::f32, 3 },
- { ISD::FMAXNUM, MVT::v4f32, 3 },
- { ISD::FMAXNUM, MVT::v8f32, 5 },
- { ISD::FMAXNUM, MVT::f64, 3 },
- { ISD::FMAXNUM, MVT::v2f64, 3 },
- { ISD::FMAXNUM, MVT::v4f64, 5 },
+ { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS
+ { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
+ { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ?
+ { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD
+ { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
+ { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ?
{ ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
@@ -2449,7 +2559,21 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
};
+ static const CostTblEntry SSE41CostTbl[] = {
+ { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X)
+ { ISD::SMAX, MVT::v4i32, 1 },
+ { ISD::SMAX, MVT::v16i8, 1 },
+ { ISD::SMIN, MVT::v4i32, 1 },
+ { ISD::SMIN, MVT::v16i8, 1 },
+ { ISD::UMAX, MVT::v4i32, 1 },
+ { ISD::UMAX, MVT::v8i16, 1 },
+ { ISD::UMIN, MVT::v4i32, 1 },
+ { ISD::UMIN, MVT::v8i16, 1 },
+ };
static const CostTblEntry SSSE3CostTbl[] = {
+ { ISD::ABS, MVT::v4i32, 1 },
+ { ISD::ABS, MVT::v8i16, 1 },
+ { ISD::ABS, MVT::v16i8, 1 },
{ ISD::BITREVERSE, MVT::v2i64, 5 },
{ ISD::BITREVERSE, MVT::v4i32, 5 },
{ ISD::BITREVERSE, MVT::v8i16, 5 },
@@ -2471,6 +2595,10 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::CTTZ, MVT::v16i8, 9 }
};
static const CostTblEntry SSE2CostTbl[] = {
+ { ISD::ABS, MVT::v2i64, 4 },
+ { ISD::ABS, MVT::v4i32, 3 },
+ { ISD::ABS, MVT::v8i16, 2 },
+ { ISD::ABS, MVT::v16i8, 2 },
{ ISD::BITREVERSE, MVT::v2i64, 29 },
{ ISD::BITREVERSE, MVT::v4i32, 27 },
{ ISD::BITREVERSE, MVT::v8i16, 27 },
@@ -2492,10 +2620,16 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::CTTZ, MVT::v16i8, 13 },
{ ISD::SADDSAT, MVT::v8i16, 1 },
{ ISD::SADDSAT, MVT::v16i8, 1 },
+ { ISD::SMAX, MVT::v8i16, 1 },
+ { ISD::SMIN, MVT::v8i16, 1 },
{ ISD::SSUBSAT, MVT::v8i16, 1 },
{ ISD::SSUBSAT, MVT::v16i8, 1 },
{ ISD::UADDSAT, MVT::v8i16, 1 },
{ ISD::UADDSAT, MVT::v16i8, 1 },
+ { ISD::UMAX, MVT::v8i16, 2 },
+ { ISD::UMAX, MVT::v16i8, 1 },
+ { ISD::UMIN, MVT::v8i16, 2 },
+ { ISD::UMIN, MVT::v16i8, 1 },
{ ISD::USUBSAT, MVT::v8i16, 1 },
{ ISD::USUBSAT, MVT::v16i8, 1 },
{ ISD::FMAXNUM, MVT::f64, 4 },
@@ -2534,14 +2668,18 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::CTPOP, MVT::i8, 1 },
};
static const CostTblEntry X64CostTbl[] = { // 64-bit targets
+ { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV
{ ISD::BITREVERSE, MVT::i64, 14 },
{ ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
{ ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH
{ ISD::CTPOP, MVT::i64, 10 },
{ ISD::SADDO, MVT::i64, 1 },
{ ISD::UADDO, MVT::i64, 1 },
+ { ISD::UMULO, MVT::i64, 2 }, // mulq + seto
};
static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
+ { ISD::ABS, MVT::i32, 2 }, // SUB+CMOV
+ { ISD::ABS, MVT::i16, 2 }, // SUB+CMOV
{ ISD::BITREVERSE, MVT::i32, 14 },
{ ISD::BITREVERSE, MVT::i16, 14 },
{ ISD::BITREVERSE, MVT::i8, 11 },
@@ -2560,6 +2698,9 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::UADDO, MVT::i32, 1 },
{ ISD::UADDO, MVT::i16, 1 },
{ ISD::UADDO, MVT::i8, 1 },
+ { ISD::UMULO, MVT::i32, 2 }, // mul + seto
+ { ISD::UMULO, MVT::i16, 2 },
+ { ISD::UMULO, MVT::i8, 2 },
};
Type *RetTy = ICA.getReturnType();
@@ -2569,6 +2710,9 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
switch (IID) {
default:
break;
+ case Intrinsic::abs:
+ ISD = ISD::ABS;
+ break;
case Intrinsic::bitreverse:
ISD = ISD::BITREVERSE;
break;
@@ -2592,12 +2736,24 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
case Intrinsic::sadd_sat:
ISD = ISD::SADDSAT;
break;
+ case Intrinsic::smax:
+ ISD = ISD::SMAX;
+ break;
+ case Intrinsic::smin:
+ ISD = ISD::SMIN;
+ break;
case Intrinsic::ssub_sat:
ISD = ISD::SSUBSAT;
break;
case Intrinsic::uadd_sat:
ISD = ISD::UADDSAT;
break;
+ case Intrinsic::umax:
+ ISD = ISD::UMAX;
+ break;
+ case Intrinsic::umin:
+ ISD = ISD::UMIN;
+ break;
case Intrinsic::usub_sat:
ISD = ISD::USUBSAT;
break;
@@ -2616,6 +2772,12 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
ISD = ISD::UADDO;
OpTy = RetTy->getContainedType(0);
break;
+ case Intrinsic::umul_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ // SMULO has the same costs, so don't duplicate.
+ ISD = ISD::UMULO;
+ OpTy = RetTy->getContainedType(0);
+ break;
}
if (ISD != ISD::DELETED_NODE) {
@@ -2624,89 +2786,121 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
MVT MTy = LT.second;
// Attempt to lookup cost.
+ if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() &&
+ MTy.isVector()) {
+ // With GFNI the code is very similar for all types: for vXi8 we just need
+ // a GF2P8AFFINEQB, while for other element types we also need a PSHUFB to
+ // reverse the byte order within each element.
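+ // For example, a v16i8 bitreverse costs 1 (a single GF2P8AFFINEQB), while
+ // a v4i32 bitreverse costs 2 (GF2P8AFFINEQB + PSHUFB).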
+ unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2;
+
+ // If the vector is wider than the subtarget's native integer vector width
+ // (256-bit without AVX2, or 512-bit without AVX512BW), the operation is
+ // split in half, so we need twice as many GF2P8AFFINEQB and PSHUFB
+ // instructions plus an extract and an insert.
+ if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) ||
+ (ST->hasBWI() && MTy.is512BitVector())))
+ Cost = Cost * 2 + 2;
+
+ return LT.first * Cost;
+ }
+
+ auto adjustTableCost = [](const CostTblEntry &Entry, int LegalizationCost,
+ FastMathFlags FMF) {
+ // If there are no NaNs to deal with, then these are reduced to a
+ // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
+ // assume is used in the non-fast case.
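+ // For example, fmaxnum on v4f32 with AVX1 is normally costed at 3
+ // (MAXPS + CMPUNORDPS + BLENDVPS in AVX1CostTbl), but with the no-NaNs
+ // flag it collapses to a single MAXPS, i.e. LegalizationCost * 1.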
+ if (Entry.ISD == ISD::FMAXNUM || Entry.ISD == ISD::FMINNUM) {
+ if (FMF.noNaNs())
+ return LegalizationCost * 1;
+ }
+ return LegalizationCost * (int)Entry.Cost;
+ };
+
if (ST->useGLMDivSqrtCosts())
if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->isSLM())
if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->hasCDI())
if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->hasBWI())
if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->hasAVX512())
if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->hasXOP())
if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->hasAVX2())
if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->hasSSE42())
if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->hasSSSE3())
if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->hasSSE2())
if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->hasSSE1())
if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->hasBMI()) {
if (ST->is64Bit())
if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
}
if (ST->hasLZCNT()) {
if (ST->is64Bit())
if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
}
if (ST->hasPOPCNT()) {
if (ST->is64Bit())
if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
}
// TODO - add BMI (TZCNT) scalar handling
if (ST->is64Bit())
if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
- return LT.first * Entry->Cost;
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
}
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
@@ -2714,9 +2908,6 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
int X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind) {
- if (CostKind != TTI::TCK_RecipThroughput)
- return BaseT::getIntrinsicInstrCost(ICA, CostKind);
-
if (ICA.isTypeBasedOnly())
return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
@@ -2928,8 +3119,32 @@ unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
Cost +=
BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
} else {
- unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
- Cost += (PowerOf2Ceil(NumSubVecs) - 1) * LT.first;
+ // For each 128-bit lane: if at least one index is demanded but not all
+ // indices in the lane are demanded, and the lane is not the first 128-bit
+ // lane of the legalized vector, then the lane needs an extracti128; if at
+ // least one index in the lane is demanded at all, the lane needs an
+ // inserti128.
+
+ // The following cases illustrate this. Assume we insert several elements
+ // into a v8i32 vector with AVX2:
+ // Case #1: inserting into index 1 needs vpinsrd + inserti128.
+ // Case #2: inserting into index 5 needs extracti128 + vpinsrd + inserti128.
+ // Case #3: inserting into indices 4, 5, 6, 7 needs 4*vpinsrd + inserti128.
+ unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * LT.first;
+ unsigned NumElts = LT.second.getVectorNumElements() * LT.first;
+ APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts);
+ unsigned Scale = NumElts / Num128Lanes;
+ // We iterate over each 128-bit lane and check whether it needs an
+ // extracti128/inserti128.
+ for (unsigned I = 0; I < NumElts; I += Scale) {
+ APInt Mask = WidenedDemandedElts.getBitsSet(NumElts, I, I + Scale);
+ APInt MaskedDE = Mask & WidenedDemandedElts;
+ unsigned Population = MaskedDE.countPopulation();
+ Cost += (Population > 0 && Population != Scale &&
+ I % LT.second.getVectorNumElements() != 0);
+ Cost += Population > 0;
+ }
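+ // E.g. for Case #2 above (v8i32 with AVX2, only index 5 demanded): Scale
+ // is 4, lane 0 has no demanded elements (no cost), lane 1 has one demanded
+ // element (nonzero, not Scale, and not the first lane), so we add one
+ // extracti128 and one inserti128 here; the vpinsrd itself is counted below
+ // via countPopulation.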
Cost += DemandedElts.countPopulation();
// For vXf32 cases, insertion into the 0'th index in each v4f32
@@ -2973,11 +3188,10 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
const Instruction *I) {
// TODO: Handle other cost kinds.
if (CostKind != TTI::TCK_RecipThroughput) {
- if (isa_and_nonnull<StoreInst>(I)) {
- Value *Ptr = I->getOperand(1);
+ if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
// Store instruction with index and scale costs 2 Uops.
// Check the preceding GEP to identify non-const indices.
- if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
return TTI::TCC_Basic * 2;
}
@@ -3056,7 +3270,7 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
getScalarizationOverhead(MaskTy, DemandedElts, false, true);
int ScalarCompareCost = getCmpSelInstrCost(
Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
- CostKind);
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
int ValueSplitCost =
@@ -3477,8 +3691,10 @@ int X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned) {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
// Otherwise fall back to cmp+select.
- return getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CostKind) +
- getCmpSelInstrCost(Instruction::Select, Ty, CondTy, CostKind);
+ return getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE,
+ CostKind) +
+ getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
}
int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
@@ -3707,8 +3923,10 @@ int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
return std::max(1, Cost);
}
-int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty, TTI::TargetCostKind CostKind) {
+int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -3848,7 +4066,28 @@ X86TTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
return CostKind == TTI::TCK_RecipThroughput ? 0 : 1;
}
-// Return an average cost of Gather / Scatter instruction, maybe improved later
+int X86TTIImpl::getGatherOverhead() const {
+ // Some CPUs have more overhead for gather. The specified overhead is
+ // relative to the Load operation. "2" is the number provided by Intel
+ // architects. This parameter is used for cost estimation of the gather op
+ // and for comparison with other alternatives.
+ // TODO: Remove the explicit hasAVX512()? That would mean we would only
+ // enable gather with a -march.
+ if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
+ return 2;
+
+ return 1024;
+}
+
+int X86TTIImpl::getScatterOverhead() const {
+ if (ST->hasAVX512())
+ return 2;
+
+ return 1024;
+}
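+
+// For example, getGSVectorCost below charges GSOverhead + VF * (scalar load
+// or store cost), so a v8f32 gather on an AVX512 target is costed as
+// 2 + 8 * the f32 load cost, while the 1024 returned for other subtargets
+// makes vectorized gathers/scatters prohibitively expensive.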
+
+// Return an average cost of Gather / Scatter instruction, maybe improved later.
+// FIXME: Add TargetCostKind support.
int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr,
Align Alignment, unsigned AddressSpace) {
@@ -3906,8 +4145,8 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr,
// The gather / scatter cost is given by Intel architects. It is a rough
// number since we are looking at one instruction in a time.
const int GSOverhead = (Opcode == Instruction::Load)
- ? ST->getGatherOverhead()
- : ST->getScatterOverhead();
+ ? getGatherOverhead()
+ : getScatterOverhead();
return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
MaybeAlign(Alignment), AddressSpace,
TTI::TCK_RecipThroughput);
@@ -3921,6 +4160,7 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr,
/// Alignment - Alignment for one element.
/// AddressSpace - pointer[s] address space.
///
+/// FIXME: Add TargetCostKind support.
int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
bool VariableMask, Align Alignment,
unsigned AddressSpace) {
@@ -3934,9 +4174,9 @@ int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
MaskUnpackCost =
getScalarizationOverhead(MaskTy, DemandedElts, false, true);
- int ScalarCompareCost =
- getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
- nullptr, CostKind);
+ int ScalarCompareCost = getCmpSelInstrCost(
+ Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
}
@@ -3967,9 +4207,15 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
Align Alignment,
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr) {
-
- if (CostKind != TTI::TCK_RecipThroughput)
- return 1;
+ if (CostKind != TTI::TCK_RecipThroughput) {
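+ // For non-throughput cost kinds (code size, latency), a legal masked
+ // gather/scatter is a single instruction; everything else falls back to
+ // the scalarizing base implementation.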
+ if ((Opcode == Instruction::Load &&
+ isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
+ (Opcode == Instruction::Store &&
+ isLegalMaskedScatter(SrcVTy, Align(Alignment))))
+ return 1;
+ return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
+ Alignment, CostKind, I);
+ }
assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
@@ -4129,7 +4375,7 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
// scalarize it.
if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) {
unsigned NumElts = DataVTy->getNumElements();
- if (NumElts == 1 || !isPowerOf2_32(NumElts))
+ if (NumElts == 1)
return false;
}
Type *ScalarTy = DataTy->getScalarType();
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.h
index d462e1f96ca2..17570f1c04a6 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -22,6 +22,8 @@
namespace llvm {
+class InstCombiner;
+
class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
typedef BasicTTIImplBase<X86TTIImpl> BaseT;
typedef TargetTransformInfo TTI;
@@ -60,7 +62,6 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
X86::FeatureLZCNTFalseDeps,
X86::FeatureBranchFusion,
X86::FeatureMacroFusion,
- X86::FeatureMergeToThreeWayBranch,
X86::FeaturePadShortFunctions,
X86::FeaturePOPCNTFalseDeps,
X86::FeatureSSEUnalignedMem,
@@ -129,9 +130,10 @@ public:
int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
VectorType *SubTp);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
- TTI::TargetCostKind CostKind,
+ TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ CmpInst::Predicate VecPred,
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
@@ -151,6 +153,18 @@ public:
int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
const SCEV *Ptr);
+ Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
+ IntrinsicInst &II) const;
+ Optional<Value *>
+ simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
+ APInt DemandedMask, KnownBits &Known,
+ bool &KnownBitsComputed) const;
+ Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
+ InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
+ APInt &UndefElts2, APInt &UndefElts3,
+ std::function<void(Instruction *, unsigned, APInt, APInt &)>
+ SimplifyAndSetOp) const;
+
unsigned getAtomicMemIntrinsicMaxElementSize() const;
int getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
@@ -190,8 +204,9 @@ public:
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
- int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind);
+ int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty, TTI::TargetCostKind CostKind,
+ Instruction *Inst = nullptr);
int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
Type *Ty, TTI::TargetCostKind CostKind);
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
@@ -230,6 +245,9 @@ private:
int getGSVectorCost(unsigned Opcode, Type *DataTy, const Value *Ptr,
Align Alignment, unsigned AddressSpace);
+ int getGatherOverhead() const;
+ int getScatterOverhead() const;
+
/// @}
};
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TileConfig.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86TileConfig.cpp
new file mode 100644
index 000000000000..ef010bcd38b7
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TileConfig.cpp
@@ -0,0 +1,248 @@
+//===-- X86TileConfig.cpp - Tile Register Configure -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Pass to configure the shape of AMX physical registers.
+/// AMX registers need to be configured before use. The X86PreTileConfig pass
+/// inserts the pldtilecfg instruction, but at that point we do not yet know
+/// the shape of each physical tile register, because register allocation has
+/// not been done. This pass runs after the register allocation pass. It
+/// collects the shape information of each physical tile register and stores
+/// the shapes in the stack slot that is allocated for loading the config
+/// into the tile config register.
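+/// For example, if a function only uses tile registers TMM0 and TMM1, this
+/// pass writes the palette byte, the TMM0/TMM1 row counts (byte offsets 48
+/// and 49) and their bytes-per-row (offsets 16-17 and 18-19) into that
+/// stack slot.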
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TileShapeInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "tile-config"
+
+namespace {
+
+class X86TileConfig : public MachineFunctionPass {
+ // context
+ MachineFunction *MF = nullptr;
+ const X86Subtarget *ST = nullptr;
+ const TargetRegisterInfo *TRI;
+ const TargetInstrInfo *TII;
+ MachineDominatorTree *DomTree = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ VirtRegMap *VRM = nullptr;
+ LiveIntervals *LIS = nullptr;
+
+ MachineInstr *getTileConfigPoint();
+ void tileConfig();
+
+public:
+ X86TileConfig() : MachineFunctionPass(ID) {}
+
+ /// Return the pass name.
+ StringRef getPassName() const override { return "Tile Register Configure"; }
+
+ /// X86TileConfig analysis usage.
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ /// Perform tile register configuration.
+ bool runOnMachineFunction(MachineFunction &mf) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoPHIs);
+ }
+
+ static char ID;
+};
+
+} // end anonymous namespace
+
+char X86TileConfig::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86TileConfig, "tileconfig", "Tile Register Configure",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_END(X86TileConfig, "tileconfig", "Tile Register Configure",
+ false, false)
+
+void X86TileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequired<VirtRegMap>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+static unsigned getTilePhysRegIndex(Register PhysReg) {
+ assert((PhysReg >= X86::TMM0 && PhysReg <= X86::TMM7) &&
+ "Tile register number is invalid");
+ return (PhysReg - X86::TMM0);
+}
+
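+// Store the low 8 or 16 bits of SrcReg to the stack slot FrameIdx at the
+// given byte Offset, using a sub-register reference unless SrcReg is already
+// exactly BitSize bits wide.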
+static MachineInstr *
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ Register SrcReg, unsigned BitSize, int FrameIdx, int Offset,
+ const TargetInstrInfo *TII, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) {
+
+ unsigned SubIdx = (BitSize == 8) ? X86::sub_8bit : X86::sub_16bit;
+ unsigned Opc = (BitSize == 8) ? X86::MOV8mr : X86::MOV16mr;
+ if (BitSize == TRI->getRegSizeInBits(*RC))
+ SubIdx = 0;
+ MachineInstr *NewMI =
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)), FrameIdx,
+ Offset)
+ .addReg(SrcReg, 0, SubIdx);
+ return NewMI;
+}
+
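+// Store an 8- or 16-bit immediate Imm to the stack slot FrameIdx at the given
+// byte Offset.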
+static MachineInstr *storeImmToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ int64_t Imm, unsigned BitSize,
+ int FrameIdx, int Offset,
+ const TargetInstrInfo *TII) {
+ unsigned Opc = (BitSize == 8) ? X86::MOV8mi : X86::MOV16mi;
+ return addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)),
+ FrameIdx, Offset)
+ .addImm(Imm);
+}
+
+MachineInstr *X86TileConfig::getTileConfigPoint() {
+ for (MachineBasicBlock &MBB : *MF) {
+
+ // Traverse the basic block.
+ for (MachineInstr &MI : MBB)
+ // Refer to X86PreTileConfig.cpp.
+ // We only support one tile config for now.
+ if (MI.getOpcode() == X86::PLDTILECFG)
+ return &MI;
+ }
+
+ return nullptr;
+}
+
+void X86TileConfig::tileConfig() {
+ MachineInstr *MI = getTileConfigPoint();
+ if (!MI)
+ return;
+ MachineBasicBlock *MBB = MI->getParent();
+ int SS = MI->getOperand(1).getIndex();
+ BitVector PhysRegs(TRI->getNumRegs());
+
+ // Fill in the palette first.
+ auto *NewMI = storeImmToStackSlot(*MBB, *MI, 1, 8, SS, 0, TII);
+ LIS->InsertMachineInstrInMaps(*NewMI);
+ // Fill in the shape of each tile physical register.
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ Register VirtReg = Register::index2VirtReg(i);
+ if (MRI->reg_nodbg_empty(VirtReg))
+ continue;
+ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+ if (RC.getID() != X86::TILERegClassID)
+ continue;
+ Register PhysReg = VRM->getPhys(VirtReg);
+ if (PhysRegs.test(PhysReg))
+ continue;
+ PhysRegs.set(PhysReg);
+ ShapeT Shape = VRM->getShape(VirtReg);
+ Register RowReg = Shape.getRow()->getReg();
+ Register ColReg = Shape.getCol()->getReg();
+
+ // Here is the data format for the tile config.
+ // 0 palette
+ // 1 start_row
+ // 2-15 reserved, must be zero
+ // 16-17 tile0.colsb Tile 0 bytes per row.
+ // 18-19 tile1.colsb Tile 1 bytes per row.
+ // 20-21 tile2.colsb Tile 2 bytes per row.
+ // ... (sequence continues)
+ // 30-31 tile7.colsb Tile 7 bytes per row.
+ // 32-47 reserved, must be zero
+ // 48 tile0.rows Tile 0 rows.
+ // 49 tile1.rows Tile 1 rows.
+ // 50 tile2.rows Tile 2 rows.
+ // ... (sequence continues)
+ // 55 tile7.rows Tile 7 rows.
+ // 56-63 reserved, must be zero
+ unsigned Index = getTilePhysRegIndex(PhysReg);
+ int RowOffset = 48 + Index;
+ int ColOffset = 16 + Index * 2;
+
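+ // E.g. for TMM3, Index is 3: its row count (one byte) is stored at offset
+ // 48 + 3 = 51 and its bytes-per-row (two bytes) at offsets 16 + 3 * 2 =
+ // 22-23. BitSize starts at 8 for the row byte and is bumped to 16 for the
+ // colsb field in the second iteration of the loop below.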
+ unsigned BitSize = 8;
+ for (const auto &Pair : {std::make_pair(RowReg, RowOffset),
+ std::make_pair(ColReg, ColOffset)}) {
+ int64_t Imm;
+ int ImmCount = 0;
+ // All defs must be the same value; otherwise the MIs are invalid.
+ // An immediate is preferred.
+ for (const MachineOperand &MO : MRI->def_operands(Pair.first)) {
+ const auto *Inst = MO.getParent();
+ if (Inst->isMoveImmediate()) {
+ ImmCount++;
+ Imm = Inst->getOperand(1).getImm();
+ break;
+ }
+ }
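+ // If the shape register is defined by a move-immediate, store the
+ // immediate directly; otherwise store the register itself and extend its
+ // live range down to this new use.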
+ auto StoreConfig = [&](int Offset) {
+ MachineInstr *NewMI = nullptr;
+ if (ImmCount)
+ NewMI = storeImmToStackSlot(*MBB, *MI, Imm, BitSize, SS, Offset, TII);
+ else {
+ const TargetRegisterClass *RC = MRI->getRegClass(Pair.first);
+ NewMI = storeRegToStackSlot(*MBB, *MI, Pair.first, BitSize, SS,
+ Offset, TII, RC, TRI);
+ }
+ SlotIndex SIdx = LIS->InsertMachineInstrInMaps(*NewMI);
+ if (!ImmCount) {
+ // Extend the live interval.
+ SmallVector<SlotIndex, 8> EndPoints = {SIdx.getRegSlot()};
+ LiveInterval &Int = LIS->getInterval(Pair.first);
+ LIS->extendToIndices(Int, EndPoints);
+ }
+ };
+ StoreConfig(Pair.second);
+ BitSize += 8;
+ }
+ }
+}
+
+bool X86TileConfig::runOnMachineFunction(MachineFunction &mf) {
+ MF = &mf;
+ MRI = &mf.getRegInfo();
+ ST = &mf.getSubtarget<X86Subtarget>();
+ TRI = ST->getRegisterInfo();
+ TII = mf.getSubtarget().getInstrInfo();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+ VRM = &getAnalysis<VirtRegMap>();
+ LIS = &getAnalysis<LiveIntervals>();
+
+ if (VRM->isShapeMapEmpty())
+ return false;
+
+ tileConfig();
+ return true;
+}
+
+FunctionPass *llvm::createX86TileConfigPass() { return new X86TileConfig(); }
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86WinEHState.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86WinEHState.cpp
index 8627bbbf18d2..8d8bd5e6b326 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86WinEHState.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86WinEHState.cpp
@@ -109,7 +109,7 @@ private:
/// The linked list node subobject inside of RegNode.
Value *Link = nullptr;
};
-}
+} // namespace
FunctionPass *llvm::createX86WinEHStatePass() { return new WinEHStatePass(); }
diff --git a/contrib/llvm-project/llvm/lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.h
index c7868bf4cf8e..0ea47106434c 100644
--- a/contrib/llvm-project/llvm/lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.h
@@ -27,6 +27,7 @@ public:
: MCInstPrinter(MAI, MII, MRI) {}
// Autogenerated by tblgen.
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
diff --git a/contrib/llvm-project/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index 4de252548961..b44984ff6b4c 100644
--- a/contrib/llvm-project/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -51,7 +51,7 @@ static MCRegisterInfo *createXCoreMCRegisterInfo(const Triple &TT) {
static MCSubtargetInfo *
createXCoreMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
- return createXCoreMCSubtargetInfoImpl(TT, CPU, FS);
+ return createXCoreMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS);
}
static MCAsmInfo *createXCoreMCAsmInfo(const MCRegisterInfo &MRI,
diff --git a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreISelLowering.cpp
index c32653137a10..db3dd7fb1438 100644
--- a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreISelLowering.cpp
@@ -22,7 +22,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
@@ -444,16 +443,15 @@ SDValue XCoreTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
}
if (LD->getAlignment() == 2) {
- SDValue Low =
- DAG.getExtLoad(ISD::ZEXTLOAD, DL, MVT::i32, Chain, BasePtr,
- LD->getPointerInfo(), MVT::i16,
- /* Alignment = */ 2, LD->getMemOperand()->getFlags());
+ SDValue Low = DAG.getExtLoad(ISD::ZEXTLOAD, DL, MVT::i32, Chain, BasePtr,
+ LD->getPointerInfo(), MVT::i16, Align(2),
+ LD->getMemOperand()->getFlags());
SDValue HighAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
DAG.getConstant(2, DL, MVT::i32));
SDValue High =
DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, HighAddr,
LD->getPointerInfo().getWithOffset(2), MVT::i16,
- /* Alignment = */ 2, LD->getMemOperand()->getFlags());
+ Align(2), LD->getMemOperand()->getFlags());
SDValue HighShifted = DAG.getNode(ISD::SHL, DL, MVT::i32, High,
DAG.getConstant(16, DL, MVT::i32));
SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i32, Low, HighShifted);
@@ -503,14 +501,14 @@ SDValue XCoreTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDValue Low = Value;
SDValue High = DAG.getNode(ISD::SRL, dl, MVT::i32, Value,
DAG.getConstant(16, dl, MVT::i32));
- SDValue StoreLow = DAG.getTruncStore(
- Chain, dl, Low, BasePtr, ST->getPointerInfo(), MVT::i16,
- /* Alignment = */ 2, ST->getMemOperand()->getFlags());
+ SDValue StoreLow =
+ DAG.getTruncStore(Chain, dl, Low, BasePtr, ST->getPointerInfo(),
+ MVT::i16, Align(2), ST->getMemOperand()->getFlags());
SDValue HighAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, BasePtr,
DAG.getConstant(2, dl, MVT::i32));
SDValue StoreHigh = DAG.getTruncStore(
Chain, dl, High, HighAddr, ST->getPointerInfo().getWithOffset(2),
- MVT::i16, /* Alignment = */ 2, ST->getMemOperand()->getFlags());
+ MVT::i16, Align(2), ST->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StoreLow, StoreHigh);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
index 83fc16ed98fc..6528154ab0e2 100644
--- a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -21,6 +21,7 @@
#include "llvm/IR/IntrinsicsXCore.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/ReplaceConstant.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -74,61 +75,10 @@ createLoweredInitializer(ArrayType *NewType, Constant *OriginalInitializer) {
return ConstantArray::get(NewType, Elements);
}
-static Instruction *
-createReplacementInstr(ConstantExpr *CE, Instruction *Instr) {
- IRBuilder<NoFolder> Builder(Instr);
- unsigned OpCode = CE->getOpcode();
- switch (OpCode) {
- case Instruction::GetElementPtr: {
- SmallVector<Value *,4> CEOpVec(CE->op_begin(), CE->op_end());
- ArrayRef<Value *> CEOps(CEOpVec);
- return dyn_cast<Instruction>(Builder.CreateInBoundsGEP(
- cast<GEPOperator>(CE)->getSourceElementType(), CEOps[0],
- CEOps.slice(1)));
- }
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- return dyn_cast<Instruction>(
- Builder.CreateBinOp((Instruction::BinaryOps)OpCode,
- CE->getOperand(0), CE->getOperand(1),
- CE->getName()));
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::UIToFP:
- case Instruction::SIToFP:
- case Instruction::FPTrunc:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::BitCast:
- return dyn_cast<Instruction>(
- Builder.CreateCast((Instruction::CastOps)OpCode,
- CE->getOperand(0), CE->getType(),
- CE->getName()));
- default:
- llvm_unreachable("Unhandled constant expression!\n");
- }
-}
static bool replaceConstantExprOp(ConstantExpr *CE, Pass *P) {
do {
- SmallVector<WeakTrackingVH, 8> WUsers(CE->user_begin(), CE->user_end());
+ SmallVector<WeakTrackingVH, 8> WUsers(CE->users());
llvm::sort(WUsers);
WUsers.erase(std::unique(WUsers.begin(), WUsers.end()), WUsers.end());
while (!WUsers.empty())
@@ -201,7 +151,7 @@ bool XCoreLowerThreadLocal::lowerGlobal(GlobalVariable *GV) {
GV->isExternallyInitialized());
// Update uses.
- SmallVector<User *, 16> Users(GV->user_begin(), GV->user_end());
+ SmallVector<User *, 16> Users(GV->users());
for (unsigned I = 0, E = Users.size(); I != E; ++I) {
User *U = Users[I];
Instruction *Inst = cast<Instruction>(U);
diff --git a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreSubtarget.cpp
index ffeb0862c945..4b29751c7d06 100644
--- a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreSubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreSubtarget.cpp
@@ -26,5 +26,5 @@ void XCoreSubtarget::anchor() { }
XCoreSubtarget::XCoreSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const TargetMachine &TM)
- : XCoreGenSubtargetInfo(TT, CPU, FS), InstrInfo(), FrameLowering(*this),
- TLInfo(TM, *this), TSInfo() {}
+ : XCoreGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), InstrInfo(),
+ FrameLowering(*this), TLInfo(TM, *this), TSInfo() {}
diff --git a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreSubtarget.h b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreSubtarget.h
index 68139da9d1d0..d3979b275beb 100644
--- a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreSubtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreSubtarget.h
@@ -44,7 +44,7 @@ public:
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
const XCoreInstrInfo *getInstrInfo() const override { return &InstrInfo; }
const XCoreFrameLowering *getFrameLowering() const override {
diff --git a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
index 1eea1e37c253..046cd6b5db7d 100644
--- a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -26,9 +26,7 @@
using namespace llvm;
static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
- if (!RM.hasValue())
- return Reloc::Static;
- return *RM;
+ return RM.getValueOr(Reloc::Static);
}
static CodeModel::Model